Click here to Skip to main content
15,886,873 members
Articles / Programming Languages / C++

A Spell Checking Engine

Rate me:
Please Sign up or sign in to vote.
4.88/5 (16 votes)
5 Feb 2001 265.9K   7K   108  
A free spell checking engine for use in your C++ applications. Includes the current US English dictionary
#include "stdafx.h"
#include "FPSSpellCheckerInclude.h"


void TrimRight(char *szText)
{
	ASSERT(szText);
	ASSERT(AfxIsValidString(szText));

	int iLen = lstrlen(szText);
	int iPos = 0;
	BOOL bFound = FALSE;

	while (iPos < iLen && !bFound)
	{
		if (szText[iPos] == ' ')
			szText[iPos] = '\0';
		iPos++;
	}
}

// this function uses a modified version of the Metaphone algorithm to generate
// a quasi phonetic representation of a word.
void MetaphoneEx(const char *szInput, char *szOutput, int iMaxLen)
{
	ASSERT(szInput);
	ASSERT(AfxIsValidString(szInput));
	ASSERT(szOutput);
	ASSERT(AfxIsValidString(szOutput));
	ASSERT(iMaxLen > 0);
	ASSERT(iMaxLen < 50);

	char szLastChar = '\0';
	char szThisChar = '\0';
	char szNextChar = '\0';
	char szOutChar = '\0';
	char szTemp[200];
	int iInputPos = 0;
	int iOutputPos = 0;
	int iLen = 0;

	// copy input into temp
	memset(szTemp, 0, 200);
	strcpy(szTemp, szInput);
	TrimRight(szTemp);
	CharUpper(szTemp);
	iLen = lstrlen(szTemp);

	//szOutput[0] = szTemp[0];
	//iOutputPos++;

	while (iInputPos < iLen)
	{
		szOutChar = '\0';
		szThisChar = szTemp[iInputPos];
		szNextChar = szTemp[iInputPos+1];

		//if (szThisChar != szLastChar)
		if (szThisChar != szNextChar)
		{
			switch (szThisChar)
			{
			case 'A':
				{
					szOutChar = '\0';
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'E')
					{
						szOutChar = 'E';
						iInputPos++;
					}
					break;
				}
			case 'B': 
				{
					szOutChar = 'B';
					if (szTemp[iInputPos+1] == 'R')
						szOutChar = '\0';
					break;
				}
			case 'C':
				{
					szOutChar = 'K';
					if (szTemp[iInputPos+1] == 'H')
					{
						szOutChar = 'X';
						iInputPos++;
					}
					if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'A')
					{
						szOutChar = 'X';
						iInputPos += 2;
					}
					//if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] != 'A')
					//	szOutChar = 'S';
					//if (szTemp[iInputPos+1] == 'E')
					//	szOutChar = 'S';
					if (szTemp[iInputPos+1] == 'Y')
						szOutChar = 'S';
					if (szTemp[iInputPos+1] == 'Q')
						szOutChar = 0;
					break;
				}
			case 'D':
				{
					szOutChar = 'D';
					if (szTemp[iInputPos+1] == 'G' && szTemp[iInputPos+1] == 'E')
						szOutChar = 'J';
					if (szTemp[iInputPos+1] == 'G' && szTemp[iInputPos+1] == 'Y')
						szOutChar = 'J';
					if (szTemp[iInputPos+1] == 'G' && szTemp[iInputPos+1] == 'I')
						szOutChar = 'J';
					if (iInputPos == iLen-1)
						szOutChar = '\0';
					break;
				}
			case 'E':
				{
					szOutChar = 0;
					//if (szTemp[iInputPos+1] == 'A')
					//{
					//	szOutChar = 'A';
					//	iInputPos++;
					//}
					break;
				}
			case 'F':
				{
					szOutChar = 'F';
					break;
				}
			case 'G':
				{
					szOutChar = 'J';
					if (szTemp[iInputPos+1] == 'H' && szTemp[iInputPos+2] == 'T')
					{
						szOutChar = 'T';
						iInputPos += 2;
					}
					else
					{
						if (szTemp[iInputPos+1] == 'H')
						{
							if (szTemp[iInputPos+2] != '\0' && !IsVowel(szTemp[iInputPos+2]))
								szOutChar = '\0';
						} 
						if (szTemp[iInputPos+1] == 'I' || szTemp[iInputPos+1] == 'E' || szTemp[iInputPos+1] == 'Y')
							szOutChar = 'J';
						if (iInputPos == 0 && szTemp[iInputPos+1] == 'N')
						{
							szOutChar = 'N';
							iInputPos++;
						}
						if (szTemp[iInputPos+1] == 'N')
							iInputPos++;
					}
					break;
				}
			case 'H':
				{
					szOutChar = 'H';
					if (iInputPos > 0)
					{
						if (IsVowel(szTemp[iInputPos-1]) && !IsVowel(szTemp[iInputPos+1]))
							szOutChar = '\0';
					}
					break;
				}
			case 'J':
				{
					szOutChar = 'J';
					break;
				}
			case 'K':
				{
					szOutChar = 'K';
					if (iInputPos > 0)
					{
						if (szTemp[iInputPos-1] == 'C')
							szOutChar = '\0';
					}
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'N')
					{
						szOutChar = 'N';
						iInputPos++;
					}
					if (szOutChar == 'K' && szTemp[iInputPos+1] == 'R')
						iInputPos++;
					if (szTemp[iInputPos+1] == 'C')
						iInputPos++;
					break;
				}
			case 'L':
				{
					szOutChar = 'L';
					if (szTemp[iInputPos+1] == 'C' && szTemp[iInputPos+2] == 'H')
					{
						szOutput[iOutputPos] = 'L';
						szOutChar = 'K';
						iOutputPos++;
						iInputPos+=2;
					}
					//if (szTemp[iInputPos+1] == 'Y' && szTemp[iInputPos+2] == 0)
					//{
					//	szOutChar = 0;
					//	iInputPos++;
					//}
					break;
				}
			case 'M':
				{
					szOutChar = 'M';
					break;
				}
			case 'N':
				{
					szOutChar = 'N';
					if (szTemp[iInputPos+1] == 'D')
					{
						szOutChar = 'D';
						iInputPos++;
					}
					else if (szTemp[iInputPos+1] == 'E' && szTemp[iInputPos+2] == 'N' && szTemp[iInputPos+3] == 'T')
					{
						szOutput[iOutputPos] = 'N';
						iOutputPos++;
						szOutChar = 'T';
						iInputPos += 3;
					}
					else if (szTemp[iInputPos+1] == 'T')
					{
						szOutChar = 'T';
						iInputPos++;
					}
					break;
				}
			case 'P':
				{
					szOutChar = 'P';
					if (szTemp[iInputPos+1] == 'H')
					{
						szOutChar = 'F';
						iInputPos++;
					}
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'N')
					{
						szOutChar = 'N';
						iInputPos++;
					}
					break;
				}
			case 'Q':
				{
					szOutChar = 'K';
					break;
				}
			case 'R':
				{
					szOutChar = 'R';
					//if (iInputPos == iLen-1)
					//	szOutChar = '\0';
					if (szTemp[iInputPos+1] == 'K')
					{
						szOutChar = 'K';
						iInputPos++;
					}
					if (szTemp[iInputPos+1] == 'T' && szTemp[iInputPos+2] == 0)
						szOutChar = 'T';
					break;
				}
			case 'S':
				{
					szOutChar = 'S';
					if (szTemp[iInputPos+1] == 'C' && szTemp[iInputPos+2] == 'H')
					{
						szOutput[iOutputPos] = szOutChar;
						iOutputPos++;

						szOutChar = 'K';
						iInputPos+=2;
					}
					else
					{
						if (szTemp[iInputPos+1] == 'H')
							szOutChar = 'X';
						if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'A')
							szOutChar = 'X';
						if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'O')
							szOutChar = 'X';
						// TEST 1-15-2000
						if (szTemp[iInputPos+1] == 'I')
							szOutChar = 'X';
						if (szTemp[iInputPos+1] == 'E' && szTemp[iInputPos+2] == 'I')
							szOutChar = 'X';
						if (iInputPos == iLen-1 || (iInputPos == iLen-2 && szTemp[iInputPos+1] == 'S'))
							szOutChar = '\0';
					}
					break;
				}
			case 'T':
				{
					szOutChar = 'T';
					if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'A')
						szOutChar = 'X';
					if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'O')
						szOutChar = 'X';
					if (szTemp[iInputPos+1] == 'H')
						szOutChar = '\0';
					if (szTemp[iInputPos+1] == 'C' && szTemp[iInputPos+2] == 'H')
						szOutChar = '\0';
					if (iInputPos > 0 && szTemp[iInputPos-1] == 'F')
						szOutChar = '\0';
					if (szTemp[iInputPos+1] == 'A' && szTemp[iInputPos+2] == 'B')
						szOutChar = '\0';
					//if (!IsVowel(szTemp[iInputPos+1]) && szTemp[iInputPos+1] != 'H')
					//	szOutChar = '\0';
					break;
				}
			case 'V':
				{
					szOutChar = 'F';
					break;
				}
			case 'W':
				{
					szOutChar = '\0';
					if (IsVowel(szTemp[iInputPos+1]))
						szOutChar = 'X';
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'R')
					{
						szOutChar = 'R';
						iInputPos++;
					}
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'H')
					{
						szOutChar = 'W';
						iInputPos++;
					}
					break;
				}
			case 'X':
				{
					szOutput[iOutputPos] = 'K';
					iOutputPos++;
					szOutChar = 'S';

					if (iInputPos == 0)
						szOutChar = 'S';
					break;
				}
			case 'Y':
				{
					szOutChar = '\0';
					if (IsVowel(szTemp[iInputPos+1]))
						szOutChar = 'Y';
					if (iInputPos == iLen-1)
						szOutChar = '\0';
					break;
				}
			case 'Z':
				{
					szOutChar = 'S';
					break;
				}
			}
		}

		if (szOutChar != '\0')
		{
			szOutput[iOutputPos] = szOutChar;
			iOutputPos++;
		}
		
		szLastChar = szThisChar;
		iInputPos++;
	}

	if (IsVowel(szOutput[0]))
		szOutput[0] = 'A';

	// truncate output if necessary
	TrimRight(szOutput);
	szOutput[iOutputPos] = '\0';
	szOutput[iMaxLen] = '\0';
}

// determine if a character is a volel (A,E,I,O,Y)
BOOL IsVowel(const char szChar) 
{
	switch (szChar)
	{
	case 'A':
	case 'E':
	case 'I':
	case 'O':
	case 'U':
	case 'a':
	case 'e':
	case 'i':
	case 'o':
	case 'u':
		{
			return TRUE;
			break;
		}
	}
	
	return FALSE;
}

int EditDistance(const char *szWord1, const char *szWord2) 
{
	ASSERT(szWord1);
	ASSERT(AfxIsValidString(szWord1));
	ASSERT(szWord2);
	ASSERT(AfxIsValidString(szWord2));

	int iDiffCount = 0;
	int iPos1 = 0;
	int iPos2 = 0;
	BOOL bContinue = TRUE;

	while (bContinue)
	{
		if (IsCharMatch(szWord1[iPos1], szWord2[iPos2]))
		{
			iPos1++;
			iPos2++;
		}
		else if (szWord1[iPos1] == 0)
		{
			iDiffCount++;
			iPos2++;
		}
		else if (szWord2[iPos2] == 0)
		{
			iDiffCount++;
			iPos1++;
		}
		else if (IsCharMatch(szWord1[iPos1+1], szWord2[iPos2]))
		{
			iDiffCount++;
			iPos1++;
		}
		else if (IsCharMatch(szWord1[iPos1], szWord2[iPos2+1]))
		{
			iDiffCount++;
			iPos2++;
		}
		else if (IsCharMatch(szWord1[iPos1+1], szWord2[iPos2+1]))
		{
			iDiffCount++;
			iPos1++;
			iPos2++;
		}
		else
		{
			int iOldPos2 = iPos2;
			int iOldPos1 = iPos1;
			int iTemp1 = 0;
			int iTemp2 = 0;
			int iTemp3 = 0;
			int iTemp4 = 0;
			int iTemp5 = 0;
			int iPos1_4 = 0;
			int iPos2_4 = 0;
			int iPos1_5 = 0;
			int iPos2_5 = 0;

			while (!IsCharMatch(szWord1[iPos1], szWord2[iPos2]) && szWord2[iPos2] != 0)
			{
				iPos2++;
				iTemp1++;
			}

			iPos2 = iOldPos2;
			iPos1 = iOldPos1;
			if (iTemp1 > 1)
			{
				while (!IsCharMatch(szWord1[iPos1], szWord2[iPos2]) && szWord1[iPos1] != 0)
				{
					iPos1++;
					iTemp2++;
				}
			}
			else
			{
				iTemp2 = 100;
			}

			iPos2 = iOldPos2;
			iPos1 = iOldPos1;
			if (iTemp1 > 1 && iTemp2 > 1)
			{
				while (!IsCharMatch(szWord1[iPos1], szWord2[iPos2]) && (szWord1[iPos1] != 0 && szWord2[iPos2] != 0))
				{
					iPos1++;
					iPos2++;
					iTemp3++;
				}
			}
			else
			{
				iTemp3 = 100;
			}

			iPos2 = iOldPos2;
			iPos1 = iOldPos1;
			if (iTemp1 > 1 && iTemp2 > 1 && iTemp3 > 1)
			{
				while (!IsCharMatch(szWord1[iPos1], szWord2[iPos2]) && szWord1[iPos1] != 0)
				{
					iPos1++;
					iPos2 = iOldPos2;
					while (!IsCharMatch(szWord1[iPos1], szWord2[iPos2]) && szWord2[iPos2] != 0)
						iPos2++;
				}
				iPos1_4 = iPos1;
				iPos2_4 = iPos2;
				//iTemp4 = (iPos1 - iOldPos1 - 1) + (iPos2 - iOldPos2 - 1);
				iTemp4 = (iPos1 - iOldPos1) + (iPos2 - iOldPos2) - 1;
			}
			else
			{
				iTemp4 = 100;
			}

			iPos2 = iOldPos2;
			iPos1 = iOldPos1;
			if (iTemp1 > 1 && iTemp2 > 1 && iTemp3 > 1 && iTemp4 > 1)
			{
				while (!IsCharMatch(szWord1[iPos1], szWord2[iPos2]) && szWord2[iPos2] != 0)
				{
					iPos2++;
					iPos1 = iOldPos1;
					while (!IsCharMatch(szWord1[iPos1], szWord2[iPos2]) && szWord1[iPos1] != 0)
						iPos1++;
				}
				iPos1_5 = iPos1;
				iPos2_5 = iPos2;
				iTemp5 = (iPos1 - iOldPos1) + (iPos2 - iOldPos2) - 1;
				//iTemp5 = (iPos1 - iOldPos1 - 1) + (iPos2 - iOldPos2 - 1);
			}
			else
			{
				iTemp5 = 100;
			}

			iPos2 = iOldPos2;
			iPos1 = iOldPos1;
			if (iTemp1 <= iTemp2 && iTemp1 <= iTemp3 && iTemp1 <= iTemp4 && iTemp1 <= iTemp5)
			{
				iDiffCount += iTemp1;
				iPos2 += iTemp1;
			}
			else if (iTemp2 <= iTemp3 && iTemp2 <= iTemp4 && iTemp2 <= iTemp5)
			{
				iDiffCount += iTemp2;
				iPos1 += iTemp2;
			}
			else if (iTemp3 <= iTemp4 && iTemp4 <= iTemp5)
			{
				iDiffCount += iTemp3;
				iPos1 += iTemp3;
				iPos2 += iTemp3;
			}
			else if (iTemp4 <= iTemp5)
			{
				iPos1 = iPos1_4;
				iPos2 = iPos2_4;

				if (szWord1[iPos1] != 0) iPos1++;
				if (szWord2[iPos2] != 0) iPos2++;

				iDiffCount += iTemp4;
			}
			else 
			{
				iPos1 = iPos1_5;
				iPos2 = iPos2_5;

				if (szWord1[iPos1] != 0) iPos1++;
				if (szWord2[iPos2] != 0) iPos2++;

				iDiffCount += iTemp5;
			}
		}

		if (szWord1[iPos1] == 0 && szWord2[iPos2] == 0)
			bContinue = FALSE;
	}

	return iDiffCount;
}

inline BOOL IsCharMatch(const char c1, const char c2)  
{
	if (c1 == c2)
		return TRUE;

	if (abs(c1 - c2) == 32)
		return TRUE;

	return FALSE;
}

BOOL IsNumber(LPCSTR lpszWord) 
{
	ASSERT(lpszWord);
	ASSERT(AfxIsValidString(lpszWord));

	BOOL bReturn = TRUE;
	int iLen = lstrlen(lpszWord);
	int iPos = 0;

	while (iPos < iLen && bReturn)
	{
		switch (lpszWord[iPos])
		{
		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
			{
				break;
			}
		default:
			{
				bReturn = FALSE;
				break;
			}
		}

		iPos++;
	}

	return bReturn;

}

void SortMatches(LPCSTR lpszBadWord, CStringList &Matches)
{
	ASSERT(lpszBadWord);
	ASSERT(AfxIsValidString(lpszBadWord));

	int iSize = Matches.GetCount()+1;
	FPSSPELLCHECK_WORDINFO* pArray = NULL;
	int iPos = 0;
	int iLen = lstrlen(lpszBadWord);
	POSITION Pos;
	CString strWord;

	pArray = new FPSSPELLCHECK_WORDINFO[iSize];

	// populate our array
	Pos = Matches.GetHeadPosition();
	while (Pos)
	{
		strWord = Matches.GetNext(Pos);

		pArray[iPos].strWord = strWord;
		pArray[iPos].iDiff = EditDistance(lpszBadWord, strWord.GetBuffer(strWord.GetLength()));

		iPos++;
	}

	// now, sort array
	for (int iX = 0; iX < iPos; iX++)
	{
		for (int iY = 0; iY < iPos; iY++)
		{
			if (pArray[iY].iDiff > pArray[iX].iDiff || (pArray[iY].iDiff == pArray[iX].iDiff && pArray[iY].strWord > pArray[iX].strWord))
			{
				int iTemp = pArray[iX].iDiff;
				CString strTemp = pArray[iX].strWord;

				pArray[iX].iDiff = pArray[iY].iDiff;
				pArray[iX].strWord = pArray[iY].strWord;

				pArray[iY].iDiff = iTemp;
				pArray[iY].strWord = strTemp;
			}
		}
	}

	// rebuild match list
	Matches.RemoveAll();
	for (iX = 0; iX < iPos; iX++)
		Matches.AddTail(pArray[iX].strWord);

	delete[] pArray;
}


// creates an output based on an input word, but removes
// all vowels
void RemoveVowels(const char *szInput, char *szOutput)
{
	ASSERT(szInput);
	ASSERT(AfxIsValidString(szInput));
	ASSERT(szOutput);
	ASSERT(AfxIsValidString(szOutput));

	int iInLen = lstrlen(szInput);
	int iInPos = 0;
	int iOutPos = 0;

	memset(szOutput, 0, iInLen);

	for (iInPos = 0; iInPos < iInLen; iInPos++)
	{
		if (!IsVowel(szInput[iInPos]))
		{
			szOutput[iOutPos] = szInput[iInPos];
			iOutPos++;
		}
	}
}

void RemoveVowelsAndDups(const char *szInput, char *szOutput)
{
	ASSERT(szInput);
	ASSERT(AfxIsValidString(szInput));
	ASSERT(szOutput);
	ASSERT(AfxIsValidString(szOutput));

	int iInLen = lstrlen(szInput);
	int iInPos = 0;
	int iOutPos = 0;
	char cLastChar = 0;

	memset(szOutput, 0, iInLen);

	for (iInPos = 0; iInPos < iInLen; iInPos++)
	{
		if (!IsVowel(szInput[iInPos]))
		{
			if (cLastChar != szInput[iInPos])
			{
				szOutput[iOutPos] = szInput[iInPos];
				iOutPos++;
			}
		}

		cLastChar = szInput[iInPos];
	}
}

// checks 2 words to determine if they match enough
// to quialify for the spell checking engines
// requirements
BOOL IsWordMatch(const char *szWord1, int iWord1Len, const char *szWord2, int iWord2Len) 
{
	ASSERT(szWord1);
	ASSERT(AfxIsValidString(szWord1));
	ASSERT(iWord1Len >= 0);
	ASSERT(szWord2);
	ASSERT(AfxIsValidString(szWord2));
	ASSERT(iWord2Len >= 0);

	BOOL bReturn = FALSE;

	// only attempt edit-difference algorithm when
	if (abs(iWord1Len - iWord2Len) < 4 && iWord1Len > 3)
	{
		int iDiff = EditDistance(szWord1, szWord2);
		//int iDiff = EditDistance(szWord1, szWord2);

		if (iWord1Len > 8 && iDiff <= 2)
			return TRUE;

		if (iWord1Len > 4 && iDiff <= 2)
			return TRUE;

		if (iDiff <= 1)
			return TRUE;

		return FALSE;
	}

	return bReturn;
}

BOOL ConatainsUpperCase(LPCSTR lpszWord)
{
	ASSERT(lpszWord);
	ASSERT(AfxIsValidString(lpszWord));


	BOOL bReturn = FALSE;
	int iLen = lstrlen(lpszWord);
	int iPos = 0;

	while (iPos < iLen && !bReturn)
	{
		if (lpszWord[iPos] >= 'A' && lpszWord[iPos] <= 'Z')
			bReturn = TRUE;

		iPos++;
	}


	return bReturn;
}

BOOL IsOK(int iCode)
{
	return (iCode == FPSSPELLCHECK_ERROR_NONE);
}

BOOL IsWordBreak(char cThisChar)
{
	BOOL bReturn = FALSE;

	if (!::isalnum(cThisChar))
	{
		bReturn = TRUE;

		if (cThisChar == '\'')
			bReturn = FALSE;
		if (cThisChar == '_')
			bReturn = FALSE;
		if (cThisChar == '@')
			bReturn = FALSE;
		if (cThisChar == '\\')
			bReturn = FALSE;
		if (cThisChar == '//')
			bReturn = FALSE;
		if (cThisChar == ':')
			bReturn = FALSE;
	}

	return bReturn;
}

int GetVowelCount(const char* szWord)
{
	ASSERT(szWord);
	ASSERT(AfxIsValidString(szWord));

	int iCount = 0;
	int iLen = lstrlen(szWord);

	for (int iPos = 0; iPos < iLen; iPos++)
	{
		if (IsVowel(szWord[iPos]))
			iCount++;
	}

	return iCount;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
United States United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions