/*
Copyright: 2000
Author: Matthew T Gullett
Email: gullettm@yahoo.com
Name: CFPSSpellCheckEngine
Part of: Spell Checking Engine 1 of 1
Requires: USMain.dic, [USUser.dic], [USCommon.dic]
DESCRIPTION
----------------------------------------------------------
This class is designed to implement a phonetic spell
checking engine with support for US English (and probably
UK english) It implements a simple interface for accessing
spell checking functions. It is designed to be very fast
with support for a user dictionary and a common misspelling
dictionary to improve performance and user perception.
INFO:
----------------------------------------------------------
This class is provided -as is-. No warranty as to the
function or performance of this class is provided either
written or implied.
You may freely use this code and modify it as necessary,
as long as this header is unmodified and credit is given
to the author in the application(s) in which it is
incorporated.
*/
#include "stdafx.h"
#include "FPSSpellCheckEngine.h"
#include "io.h"
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
CFPSSpellCheckEngine::CFPSSpellCheckEngine()
{
// initialize internal variables
m_iMainHandle = 0;
m_pMainBuffer = NULL;
m_pWordCache = NULL;
m_lMainLen = 0;
m_lCacheStartPos = 0;
}
CFPSSpellCheckEngine::~CFPSSpellCheckEngine()
{
// cleanup memory and file handles
if (m_iMainHandle)
_close(m_iMainHandle);
if (m_pMainBuffer)
free (m_pMainBuffer);
if (m_pWordCache)
free (m_pWordCache);
}
// use this function to specify the location of the dictionaries
void CFPSSpellCheckEngine::SetDictionaries(LPCSTR lpszMain, LPCSTR lpszUser, LPCSTR lpszCommonUse)
{
m_strMainDic = lpszMain;
m_strUserDic = lpszUser;
m_strCmnUseDic = lpszCommonUse;
}
// initialize the spell checking engine
// this function caches the user and commonly mispelled
// dictionaries, and opens the main dictionary and caches
// information on the number of words in the dictionary
// and the location of the letter breaks within the file
BOOL CFPSSpellCheckEngine::InitEngine()
{
if (m_iMainHandle || m_pMainBuffer)
return TRUE;
// cache secondary dictionaries
CacheUserDic();
CacheCommonUseWords();
// open main data dictionary
m_iMainHandle = _open(m_strMainDic, _O_BINARY);
if (!m_iMainHandle)
return FALSE;
m_pMainBuffer = (char*)malloc(FPSSPELL_DEFAULT_CACHE_SIZE);
// get file length
lseek(m_iMainHandle, 0, SEEK_END);
m_lMainLen = tell(m_iMainHandle);
lseek(m_iMainHandle, 0, SEEK_SET);
// set cache position information
m_lCacheStartPos = 0;
m_lCacheLen = _read(m_iMainHandle, m_pMainBuffer, FPSSPELL_DEFAULT_CACHE_SIZE);
// ************************************************
// create word cache
if (m_pWordCache)
free (m_pWordCache);
m_pWordCache = (char*)malloc(GetRecordCount() * FPSSPELL_MAIN_WORD_MAX_LEN);
// cache word locations within main datadictionay
CacheWordLocations();
return TRUE;
}
// this is the main function used to search for a word in
// the dictionary and returning a list of possible matches
BOOL CFPSSpellCheckEngine::FindWord(LPCSTR lpszWord, CStringList &PossibleMatches)
{
char szWordToFind[200];
// convert lpszWord to UPPERCASE and place in szWordToFind
strcpy(szWordToFind, lpszWord);
TrimRight(szWordToFind);
CharUpper(szWordToFind);
// clear possible matches
PossibleMatches.RemoveAll();
// *******************************************************
// 1. look in recent finds first
if (m_RecentFinds.Lookup(szWordToFind, m_strTempWord))
return TRUE;
// *******************************************************
// 2. Look in User Data dictionary
if (m_UserDic.Lookup(szWordToFind, m_strTempWord))
return TRUE;
// *******************************************************
// 3. look in common misspells
if (m_CmnUseDic.Lookup(szWordToFind, m_strTempWord))
{
PossibleMatches.AddTail(m_strTempWord);
return FALSE;
}
// *******************************************************
// 4. look in main dictionary
if (FindWordMain(szWordToFind))
{
m_RecentFinds.SetAt(szWordToFind, lpszWord);
return TRUE;
}
// find the possible matches for this word
FindMatchingWords(szWordToFind, PossibleMatches);
return FALSE;
}
// cache the USERDD file
void CFPSSpellCheckEngine::CacheUserDic()
{
CStdioFile fp;
CString strOldWord;
CString strNewWord;
if (fp.Open(m_strUserDic, CFile::modeRead | CFile::shareExclusive))
{
while (fp.ReadString(strOldWord))
{
strOldWord.TrimLeft();
strOldWord.TrimRight();
strNewWord = strOldWord;
strNewWord.MakeLower();
m_UserDic.SetAt(strNewWord, strOldWord);
}
fp.Close();
}
}
// cache the common mispelled dictionary
void CFPSSpellCheckEngine::CacheCommonUseWords()
{
return;
CStdioFile fp;
CString strLine;
CString strIncorrect;
CString strCorrect;
int iPos = 0;
if (fp.Open(m_strCmnUseDic, CFile::modeRead | CFile::shareExclusive))
{
while (fp.ReadString(strLine))
{
strLine.TrimLeft();
strLine.TrimRight();
iPos = strLine.Find('=');
if (iPos > 0)
{
strIncorrect = strLine.Left(iPos);
strCorrect = strLine.Mid(iPos+1);
strIncorrect.TrimLeft();
strIncorrect.TrimRight();
strIncorrect.MakeUpper();
strCorrect.TrimLeft();
strCorrect.TrimRight();
m_CmnUseDic.SetAt(strIncorrect, strCorrect);
}
}
fp.Close();
}
}
// determine if a character is a volel (A,E,I,O,Y)
BOOL CFPSSpellCheckEngine::IsVowel(char szChar)
{
switch (szChar)
{
case 'A':
case 'E':
case 'I':
case 'O':
case 'U':
case 'a':
case 'e':
case 'i':
case 'o':
case 'u':
{
return TRUE;
break;
}
}
return FALSE;
}
// determines the file offset where a record exists in the
// main dictionary
long CFPSSpellCheckEngine::GetRecordPos(long lRecord)
{
return ((lRecord - 1) * FPSSPELL_MAIN_RECORD_LEN);
}
// determine the number of records in the main dictionary
long CFPSSpellCheckEngine::GetRecordCount()
{
return (m_lMainLen / FPSSPELL_MAIN_RECORD_LEN);
}
// adds a word to the recent finds map so the next time
// a findword is done, it will find it here and ignore
// it
void CFPSSpellCheckEngine::IgnoreWord(LPCSTR lpszWord)
{
char szUpperWord[200];
// make word upper
strcpy(szUpperWord, lpszWord);
CharUpper(szUpperWord);
// add word to recent finds
m_RecentFinds.SetAt(szUpperWord, lpszWord);
}
// adds a word to the USER DD
void CFPSSpellCheckEngine::AddWord(LPCSTR lpszWord)
{
char szUpperWord[200];
strcpy(szUpperWord, lpszWord);
CharUpper(szUpperWord);
// add word to memory map of user data dictionary
m_UserDic.SetAt(szUpperWord, lpszWord);
// physically add word to user data dictionary file
CStdioFile fp;
if (fp.Open(m_strUserDic, CFile::modeRead | CFile::modeWrite | CFile::shareExclusive))
{
fp.Seek(0, CFile::end);
fp.WriteString(lpszWord);
fp.WriteString("\n");
fp.Close();
}
else
{
if (fp.Open(m_strUserDic, CFile::modeCreate | CFile::modeWrite | CFile::shareExclusive))
{
fp.Seek(0, CFile::end);
fp.WriteString(lpszWord);
fp.WriteString("\n");
fp.Close();
}
}
}
// finds a word in the main dictionary
BOOL CFPSSpellCheckEngine::FindWordMain(LPCSTR lpszWordToFind)
{
BOOL bFound = FALSE;
long lRecord;
char szTest[50];
strcpy(szTest, lpszWordToFind);
CharUpper(szTest);
if (lpszWordToFind[0] != '\0')
{
if (m_LocationCache[lpszWordToFind[0]].bPresent)
{
long lStartRecord = m_LocationCache[lpszWordToFind[0]].lFirstRecord;
long lLastRecord = m_LocationCache[lpszWordToFind[0]].lLastRecord;
lRecord = lStartRecord;
while (lRecord <= lLastRecord && !bFound)
{
GetRecord(lRecord, szTest);
if (stricmp(szTest, lpszWordToFind) == 0)
bFound = TRUE;
lRecord++;
}
}
}
return bFound;
}
// this function is called by FindWord and is used to search
// the main dictionary for possible matches. This function
// uses several methods to determine if words could match.
void CFPSSpellCheckEngine::FindMatchingWords(LPCSTR lpszWord, CStringList &Matches)
{
long lRecord;
char szMetaphone[20];
char szWord[50];
char szParsedWord[50];
int iWordSylableCount = GetSylableCount((char*)lpszWord);
int iDicSylableCount = 0;
int iWordLen = lstrlen(lpszWord);
int iRecordFirstVowelPos;
int iRecordWordLen = 0;
int iFirstVowelPos;
char cFirstVowel = GetFirstVowel((char*)lpszWord, iFirstVowelPos);
char cRecordFirstVowel = 0;
long lRecordCount = GetRecordCount();
int iMetaphoneLen = 0;
char szPhoneticWord[200];
char szPhoneticRecord[200];
char szOutWord[200];
int iVowelCount = GetVowelCount((char*)lpszWord);
char szTemp[50];
memset(szTemp, 0, 50);
m_lWordsSearched = 0;
m_lComparisonsMade = 0;
// do not perform lookup if word is blank
if (iWordLen == 0)
return;
// if no vowels are in the non-found word, use this method to look for possible matches
if (iVowelCount == 0)
{
int iMaxVowelCount = 1;
if (lpszWord[iWordLen-1] == 'R' || lpszWord[iWordLen-1] == 'S' || lpszWord[iWordLen-1] == 'D')
iMaxVowelCount = 2;
if (m_LocationCache[lpszWord[0]].bPresent)
{
long lStartRecord = m_LocationCache[lpszWord[0]].lFirstRecord;
long lLastRecord = m_LocationCache[lpszWord[0]].lLastRecord;
RemoveVowelsAndDups((char*)lpszWord, szTemp);
lRecord = lStartRecord;
while (lRecord <= lLastRecord)
{
m_lWordsSearched++;
GetRecord(lRecord, szWord, szParsedWord, iDicSylableCount, szPhoneticRecord);
iRecordWordLen = lstrlen(szWord);
if (IsWordMatch((char*)lpszWord, iWordLen, szWord, iRecordWordLen))
{
Matches.AddTail(szWord);
}
//else if (iDicSylableCount <= iMaxVowelCount)
else /*if (iDicSylableCount <= iMaxVowelCount)*/
{
RemoveVowelsAndDups(szWord, szOutWord);
if (stricmp(szOutWord, szTemp) == 0)
{
if (abs(iWordLen - iRecordWordLen) < 3)
Matches.AddTail(szWord);
}
}
lRecord++;
}
}
}
else if (iWordLen == 2)
{
char szTempRecordWord[200];
char szTempInWord[200];
long lStartRecord = m_LocationCache[lpszWord[0]].lFirstRecord;
long lLastRecord = m_LocationCache[lpszWord[0]].lLastRecord;
// make upper case of input word
strcpy(szTempInWord, lpszWord);
CharUpper(szTempInWord);
lRecord = lStartRecord;
while (lRecord <= lLastRecord)
{
m_lWordsSearched++;
GetRecord(lRecord, szWord, szParsedWord, iDicSylableCount, szPhoneticRecord);
//if (iDicSylableCount == 1 && lstrlen(szWord) <= 3)
{
strcpy(szTempRecordWord, szWord);
CharUpper(szTempRecordWord);
if (szTempRecordWord[1] == szTempInWord[1] || szTempRecordWord[2] == szTempInWord[1])
{
Matches.AddTail(szWord);
}
else if (IsWordMatch((char*)lpszWord, iWordLen, szWord, iRecordWordLen))
{
Matches.AddTail(szWord);
}
}
lRecord++;
}
}
/*
else if (iWordLen == 3)
{
char szTempRecordWord[200];
char szTempInWord[200];
char cWordChar1;
char cWordChar2;
char cWordChar3;
char cRecordChar1;
char cRecordChar2;
char cRecordChar3;
// make upper case of input word
strcpy(szTempInWord, lpszWord);
CharUpper(szTempInWord);
cWordChar1 = szTempInWord[0];
cWordChar2 = szTempInWord[1];
cWordChar3 = szTempInWord[2];
// make metaphone out of input word
MetaphoneEx((char*)lpszWord, szMetaphone);
iMetaphoneLen = lstrlen(szMetaphone);
BOOL bInclude = FALSE;
// phonetic word
PhoneticWord((char*)lpszWord, szPhoneticWord);
// find record
for (lRecord = 1; lRecord <= lRecordCount; lRecord++)
{
m_lWordsSearched++;
GetRecord(lRecord, szWord, szParsedWord, iDicSylableCount, szPhoneticRecord);
szParsedWord[5] = 0;
iRecordWordLen = lstrlen(szWord);
cRecordFirstVowel = GetFirstVowel(szWord, iRecordFirstVowelPos);
bInclude = FALSE;
if (iRecordWordLen <= 5)
{
if ((IsSylableCountsOK(iWordSylableCount, iDicSylableCount)) && (IsWordLenOK(iWordLen, iRecordWordLen)) && (cFirstVowel == cRecordFirstVowel))
{
if (stricmp(szParsedWord, szMetaphone) == 0)
{
Matches.AddTail(szWord);
bInclude = TRUE;
}
else
{
if (strcmp(szPhoneticRecord, szPhoneticWord) == 0)
{
if (szParsedWord[0] == szMetaphone[0] && szParsedWord[1] == szMetaphone[1] && szParsedWord[2] == szMetaphone[2])
{
Matches.AddTail(szWord);
bInclude = TRUE;
}
}
}
}
}
if (!bInclude && iRecordWordLen == 3)
{
strcpy(szTempRecordWord, szWord);
CharUpper(szTempRecordWord);
cRecordChar1 = szTempRecordWord[0];
cRecordChar2 = szTempRecordWord[1];
cRecordChar3 = szTempRecordWord[2];
if ((cRecordChar1 == cWordChar1 || cRecordChar1 == cWordChar2 || cRecordChar1 == cWordChar3) &&
(cRecordChar2 == cWordChar1 || cRecordChar2 == cWordChar2 || cRecordChar2 == cWordChar3) &&
(cRecordChar3 == cWordChar1 || cRecordChar3 == cWordChar2 || cRecordChar3 == cWordChar3))
{
Matches.AddTail(szWord);
}
}
}
}
*/
else
{
// make metaphone out of input word
MetaphoneEx((char*)lpszWord, szMetaphone);
iMetaphoneLen = lstrlen(szMetaphone);
// phonetic word
PhoneticWord((char*)lpszWord, szPhoneticWord);
// find record
for (lRecord = 1; lRecord <= lRecordCount; lRecord++)
{
m_lWordsSearched++;
GetRecord(lRecord, szWord, szParsedWord, iDicSylableCount, szPhoneticRecord);
szParsedWord[6] = 0;
iRecordWordLen = lstrlen(szWord);
cRecordFirstVowel = GetFirstVowel(szWord, iRecordFirstVowelPos);
if (abs(iWordLen - iRecordWordLen) < 4)
{
//if ((IsSylableCountsOK(iWordSylableCount, iDicSylableCount)) && (IsWordLenOK(iWordLen, iRecordWordLen)) && (cFirstVowel == cRecordFirstVowel || iFirstVowelPos > 2))
//if ((IsWordLenOK(iWordLen, iRecordWordLen)) && (cFirstVowel == cRecordFirstVowel || iFirstVowelPos > 2))
{
if (stricmp(szParsedWord, szMetaphone) == 0)
{
if (CheckEndings(lpszWord, iWordLen, szWord, iRecordWordLen))
Matches.AddTail(szWord);
}
else if (IsMetaphoneMatch(szParsedWord, szMetaphone))
{
if (CheckEndings(lpszWord, iWordLen, szWord, iRecordWordLen))
Matches.AddTail(szWord);
}
else if (IsWordMatch((char*)lpszWord, iWordLen, szWord, iRecordWordLen))
{
if (CheckEndings(lpszWord, iWordLen, szWord, iRecordWordLen))
Matches.AddTail(szWord);
}
else
{
/*
if (strcmp(szPhoneticRecord, szPhoneticWord) == 0)
{
if (szParsedWord[0] == szMetaphone[0] && szParsedWord[1] == szMetaphone[1] && szParsedWord[2] == szMetaphone[2])
{
if (CheckEndings(lpszWord, iWordLen, szWord, iRecordWordLen))
Matches.AddTail(szWord);
}
}
*/
}
}
}
}
}
}
// caches the file locations for the word breaks in the
// main dictionary
void CFPSSpellCheckEngine::CacheWordLocations()
{
long lRecord = 0;
char szWord[50];
char szMetaphone[50];
char szPhonetic[50];
char cLastChar = '\0';
char cThisChar = '\0';
int iDicSylableCount;
long lWordCachePos = 0;
// set default values for cache locations
for (int x = 0; x < 255; x++)
{
m_LocationCache[x].bPresent = FALSE;
m_LocationCache[x].lFirstRecord = 0;
m_LocationCache[x].lLastRecord = 0;
m_LocationCache[x].lLen = 0;
m_LocationCache[x].lStartPos = 0;
}
// look through all records and find letter breaks
for (lRecord = 1; lRecord <= GetRecordCount(); lRecord++)
{
GetRecord(lRecord, szWord, szMetaphone, iDicSylableCount, szPhonetic);
memcpy(&m_pWordCache[lWordCachePos], szWord, 18);
lWordCachePos += 18;
switch(szWord[0])
{
case 'a': cThisChar = 'A'; break;
case 'b': cThisChar = 'B'; break;
case 'c': cThisChar = 'C'; break;
case 'd': cThisChar = 'D'; break;
case 'e': cThisChar = 'E'; break;
case 'f': cThisChar = 'F'; break;
case 'g': cThisChar = 'G'; break;
case 'h': cThisChar = 'H'; break;
case 'i': cThisChar = 'I'; break;
case 'j': cThisChar = 'J'; break;
case 'k': cThisChar = 'K'; break;
case 'l': cThisChar = 'L'; break;
case 'm': cThisChar = 'M'; break;
case 'n': cThisChar = 'N'; break;
case 'o': cThisChar = 'O'; break;
case 'p': cThisChar = 'P'; break;
case 'q': cThisChar = 'Q'; break;
case 'r': cThisChar = 'R'; break;
case 's': cThisChar = 'S'; break;
case 't': cThisChar = 'T'; break;
case 'u': cThisChar = 'U'; break;
case 'v': cThisChar = 'V'; break;
case 'w': cThisChar = 'W'; break;
case 'x': cThisChar = 'X'; break;
case 'y': cThisChar = 'Y'; break;
case 'z': cThisChar = 'Z'; break;
}
if (cLastChar != cThisChar)
{
m_LocationCache[cThisChar].bPresent = TRUE;
m_LocationCache[cThisChar].lFirstRecord = lRecord;
m_LocationCache[cThisChar].lStartPos = GetRecordPos(lRecord);
if (cLastChar != '\0')
{
m_LocationCache[cLastChar].lLastRecord = lRecord-1;
m_LocationCache[cLastChar].lLen = GetRecordPos(lRecord) - m_LocationCache[cLastChar].lStartPos;
}
}
cLastChar = cThisChar;
}
}
// retrieves a dictionary record from the main dictionary
BOOL CFPSSpellCheckEngine::GetRecord(long lRecord, char *szWord, char *szMetaphon, int & iSylableCount, char* szPhonetic)
{
char szTemp[20];
char szRecord[50];
long lFilePos = GetRecordPos(lRecord);
// see if the desired record falls within the
if (lFilePos >= m_lCacheStartPos && lFilePos < m_lCacheStartPos + m_lCacheLen)
{
long lCachePos = lFilePos - m_lCacheStartPos;
memcpy(szRecord, &m_pMainBuffer[lCachePos], FPSSPELL_MAIN_RECORD_LEN_NOCRLF);
}
else
{
// if the record is not in the cache, then go cache the
// requirested record and the cache size thereafter
_lseek(m_iMainHandle, lFilePos, SEEK_SET);
m_lCacheStartPos = tell(m_iMainHandle);
m_lCacheLen = _read(m_iMainHandle, m_pMainBuffer, FPSSPELL_DEFAULT_CACHE_SIZE);
long lCachePos = lFilePos - m_lCacheStartPos;
memcpy(szRecord, &m_pMainBuffer[lCachePos], FPSSPELL_MAIN_RECORD_LEN_NOCRLF);
}
szWord[0] = 0;
szMetaphon[0] = 0;
memcpy(szWord, &szRecord[0], 18);
memcpy(szMetaphon, &szRecord[18], 6);
memcpy(szTemp, &szRecord[24], 2);
iSylableCount = atoi(szTemp);
memcpy(szPhonetic, &szRecord[26], 8);
szWord[18] = 0;
szMetaphon[6] = 0;
szPhonetic[8] = 0;
return TRUE;
}
// this function uses a modified version of the Metaphone algorithm to generate
// a quasi phonetic representation of a word.
void CFPSSpellCheckEngine::MetaphoneEx(char *szInput, char *szOutput, int iMaxLen)
{
char szLastChar = '\0';
char szThisChar = '\0';
char szOutChar = '\0';
char szTemp[200];
int iInputPos = 0;
int iOutputPos = 0;
int iLen = 0;
// copy input into temp
memset(szTemp, 0, 200);
strcpy(szTemp, szInput);
TrimRight(szTemp);
CharUpper(szTemp);
iLen = lstrlen(szTemp);
szOutput[0] = szTemp[0];
iOutputPos++;
while (iInputPos < iLen)
{
szOutChar = '\0';
szThisChar = szTemp[iInputPos];
if (szThisChar != szLastChar)
{
switch (szThisChar)
{
case 'A':
{
szOutChar = '\0';
if (iInputPos == 0 && szTemp[iInputPos+1] == 'E')
{
szOutChar = 'E';
iInputPos++;
}
break;
}
case 'B':
{
szOutChar = 'B';
if (szTemp[iInputPos+1] == 'R')
szOutChar = '\0';
break;
}
case 'C':
{
szOutChar = 'K';
if (szTemp[iInputPos+1] == 'H')
szOutChar = 'X';
if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'A')
szOutChar = 'X';
if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] != 'A')
szOutChar = 'S';
//if (szTemp[iInputPos+1] == 'E')
// szOutChar = 'S';
if (szTemp[iInputPos+1] == 'Y')
szOutChar = 'S';
break;
}
case 'D':
{
szOutChar = 'D';
if (szTemp[iInputPos+1] == 'G' && szTemp[iInputPos+1] == 'E')
szOutChar = 'J';
if (szTemp[iInputPos+1] == 'G' && szTemp[iInputPos+1] == 'Y')
szOutChar = 'J';
if (szTemp[iInputPos+1] == 'G' && szTemp[iInputPos+1] == 'I')
szOutChar = 'J';
if (iInputPos == iLen-1)
szOutChar = '\0';
break;
}
case 'E':
{
szOutChar = 0;
//if (szTemp[iInputPos+1] == 'A')
//{
// szOutChar = 'A';
// iInputPos++;
//}
break;
}
case 'F':
{
szOutChar = 'F';
break;
}
case 'G':
{
szOutChar = 'J';
if (szTemp[iInputPos+1] == 'H' && szTemp[iInputPos+2] == 'T')
{
szOutChar = 'T';
iInputPos += 2;
}
else
{
if (szTemp[iInputPos+1] == 'H')
{
if (szTemp[iInputPos+2] != '\0' && !IsVowel(szTemp[iInputPos+2]))
szOutChar = '\0';
}
if (szTemp[iInputPos+1] == 'I' || szTemp[iInputPos+1] == 'E' || szTemp[iInputPos+1] == 'Y')
szOutChar = 'J';
if (iInputPos == 0 && szTemp[iInputPos+1] == 'N')
{
szOutChar = 'N';
iInputPos++;
}
}
break;
}
case 'H':
{
szOutChar = 'H';
if (iInputPos > 0)
{
if (IsVowel(szTemp[iInputPos-1]) && !IsVowel(szTemp[iInputPos+1]))
szOutChar = '\0';
}
break;
}
case 'J':
{
szOutChar = 'J';
break;
}
case 'K':
{
szOutChar = 'K';
if (iInputPos > 0)
{
if (szTemp[iInputPos-1] == 'C')
szOutChar = '\0';
}
if (iInputPos == 0 && szTemp[iInputPos+1] == 'N')
{
szOutChar = 'N';
iInputPos++;
}
if (szOutChar == 'K' && szTemp[iInputPos+1] == 'R')
iInputPos++;
if (szTemp[iInputPos+1] == 'C')
iInputPos++;
break;
}
case 'L':
{
szOutChar = 'L';
if (szTemp[iInputPos+1] == 'Y' && szTemp[iInputPos+2] == 0)
{
szOutChar = 0;
iInputPos++;
}
break;
}
case 'M':
{
szOutChar = 'M';
break;
}
case 'N':
{
szOutChar = 'N';
if (szTemp[iInputPos+1] == 'D')
{
szOutChar = 'D';
iInputPos++;
}
break;
}
case 'P':
{
szOutChar = 'P';
if (szTemp[iInputPos+1] == 'H')
{
szOutChar = 'F';
iInputPos++;
}
if (iInputPos == 0 && szTemp[iInputPos+1] == 'N')
{
szOutChar = 'N';
iInputPos++;
}
break;
}
case 'Q':
{
szOutChar = 'K';
break;
}
case 'R':
{
szOutChar = 'R';
if (iInputPos == iLen-1)
szOutChar = '\0';
if (szTemp[iInputPos+1] == 'K')
{
szOutChar = 'K';
iInputPos++;
}
break;
}
case 'S':
{
szOutChar = 'S';
if (szTemp[iInputPos+1] == 'C' && szTemp[iInputPos+2] == 'H')
{
szOutput[iOutputPos] = szOutChar;
iOutputPos++;
szOutChar = 'K';
iInputPos+=2;
}
else
{
if (szTemp[iInputPos+1] == 'H')
szOutChar = 'X';
if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'A')
szOutChar = 'X';
if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'O')
szOutChar = 'X';
if (iInputPos == iLen-1 || (iInputPos == iLen-2 && szTemp[iInputPos+1] == 'S'))
szOutChar = '\0';
}
break;
}
case 'T':
{
szOutChar = 'T';
if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'A')
szOutChar = 'X';
if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'O')
szOutChar = 'X';
if (szTemp[iInputPos+1] == 'H')
szOutChar = '0';
if (szTemp[iInputPos+1] == 'C' && szTemp[iInputPos+2] == 'H')
szOutChar = '\0';
if (iInputPos > 0 && szTemp[iInputPos-1] == 'F')
szOutChar = '\0';
break;
}
case 'V':
{
szOutChar = 'F';
break;
}
case 'W':
{
szOutChar = '\0';
if (IsVowel(szTemp[iInputPos+1]))
szOutChar = 'X';
if (iInputPos == 0 && szTemp[iInputPos+1] == 'R')
{
szOutChar = 'R';
iInputPos++;
}
if (iInputPos == 0 && szTemp[iInputPos+1] == 'H')
{
szOutChar = 'W';
iInputPos++;
}
break;
}
case 'X':
{
szOutput[iOutputPos] = 'K';
iOutputPos++;
szOutChar = 'S';
if (iInputPos == 0)
szOutChar = 'S';
break;
}
case 'Y':
{
szOutChar = '\0';
if (IsVowel(szTemp[iInputPos+1]))
szOutChar = 'Y';
if (iInputPos == iLen-1)
szOutChar = '\0';
break;
}
case 'Z':
{
szOutChar = 'S';
break;
}
}
}
if (szOutChar != '\0')
{
szOutput[iOutputPos] = szOutChar;
iOutputPos++;
}
szLastChar = szThisChar;
iInputPos++;
}
if (IsVowel(szOutput[0]))
szOutput[0] = 'A';
// truncate output if necessary
TrimRight(szOutput);
szOutput[iOutputPos] = '\0';
szOutput[iMaxLen] = '\0';
}
void CFPSSpellCheckEngine::TrimRight(char *szText)
{
int iLen = lstrlen(szText);
int iPos = 0;
BOOL bFound = FALSE;
while (iPos < iLen && !bFound)
{
if (szText[iPos] == ' ')
szText[iPos] = '\0';
iPos++;
}
}
// this function counts the sylables in a word. It is not a truly
// accurate count, but works well for the spell checking engine
// purposes.
int CFPSSpellCheckEngine::GetSylableCount(char *szWord)
{
int iCount = 0;
int iLen = lstrlen(szWord);
for (int iPos = 0; iPos < iLen; iPos++)
{
switch (szWord[iPos])
{
case 'A':
case 'a':
case 'I':
case 'i':
case 'O':
case 'o':
case 'U':
case 'u':
{
if (iPos != 0)
{
if (!IsVowel(szWord[iPos-1]))
iCount++;
break;
}
}
case 'E':
case 'e':
{
if (iPos != 0 && iPos != iLen-1)
{
if (!IsVowel(szWord[iPos-1]))
iCount++;
break;
}
}
}
}
if (iCount == 0)
iCount = 1;
return iCount;
}
// used by the FindMatchingWords function to determine if 2 words
// could possibly match.
BOOL CFPSSpellCheckEngine::IsWordLenOK(int iWordLen, int iRecordWordLen)
{
if (iWordLen - iRecordWordLen > FPSSPELL_MAX_WORD_LEN_DIFF)
return FALSE;
if (iRecordWordLen - iWordLen > FPSSPELL_MAX_WORD_LEN_DIFF)
return FALSE;
if (iWordLen == 4 && iRecordWordLen < FPSSPELL_MAX_WORD_LEN_DIFF)
return FALSE;
return TRUE;
}
// extracts the first volel from a word and then translates it from
// 5 possible vowels (A,E,I,O,U) to 3 possible vowels (A,I,O)
char CFPSSpellCheckEngine::GetFirstVowel(char* szWord, int& iVowelPos)
{
int iPos = 1;
BOOL bFound = FALSE;
int iLen = lstrlen(szWord);
char cReturn = 0;
iVowelPos = 0;
if (IsVowel(szWord[0]))
{
cReturn = szWord[0];
if ((cReturn == 'e' || cReturn == 'E') && (szWord[1] == 'A' || szWord[1] == 'a')&& (szWord[2] == 'U' || szWord[2] == 'u'))
{
cReturn = 'U';
}
else if ((cReturn == 'e' || cReturn == 'E') && (szWord[1] == 'A' || szWord[1] == 'a'))
{
cReturn = 'A';
}
else if ((cReturn == 'a' || cReturn == 'A') && (szWord[1] == 'U' || szWord[1] == 'u'))
{
cReturn = 'O';
}
}
else
{
while (iPos < iLen && !bFound)
{
if (!IsVowel(szWord[iPos-1]) && IsVowel(szWord[iPos]))
{
iVowelPos = iPos;
cReturn = szWord[iPos];
if ((cReturn == 'e' || cReturn == 'E') && (szWord[iPos+1] == 'A' || szWord[iPos+1] == 'a') && (szWord[iPos+2] == 'U' || szWord[iPos+2] == 'u'))
{
cReturn = 'U';
}
else if ((cReturn == 'e' || cReturn == 'E') && (szWord[iPos+1] == 'A' || szWord[iPos+1] == 'a'))
{
cReturn = 'A';
}
else if ((cReturn == 'a' || cReturn == 'A') && (szWord[iPos+1] == 'U' || szWord[iPos+1] == 'u'))
{
cReturn = 'O';
}
bFound = TRUE;
}
iPos++;
}
}
switch (cReturn)
{
case 'a': cReturn = 'A';break;
case 'e': cReturn = 'E';break;
case 'i': cReturn = 'I';break;
case 'o': cReturn = 'O';break;
case 'u': cReturn = 'U';break;
case 'y': cReturn = 'Y';break;
}
switch (cReturn)
{
case 'A': cReturn = 'A';break;
case 'E': cReturn = 'I';break;
case 'I': cReturn = 'I';break;
case 'O': cReturn = 'O';break;
case 'U': cReturn = 'O';break;
case 'Y': cReturn = 'I';break;
}
return cReturn;
}
// called by FindMatchingWords to determine if the syllable
// counts of 2 words are OK for matching purposes
BOOL CFPSSpellCheckEngine::IsSylableCountsOK(int iWord, int iRecord)
{
if (iWord == iRecord)
return TRUE;
if (iWord - iRecord == 1)
return TRUE;
if (iRecord - iWord == 1)
return TRUE;
return FALSE;
}
// similar to MetaphoneEx, but instead uses a custom algorithm
// to generate a quasi-phonetic representation of a word. The
// output of this algorithm is used by FindMatchingWords to
// augment the metaphone algorithm
void CFPSSpellCheckEngine::PhoneticWord(char *szInput, char *szOutput)
{
int iInputPos = 0;
int iOutputPos = 0;
int iOutputLen = 0;
int iLen = lstrlen(szInput);
char szTemp[200];
char szTempOut[200];
char cLastChar = 0;
char cThisChar = 0;
char cNextChar = 0;
// make word upper and clear output
memset(szOutput, 0, 20);
memset(szTempOut, 0, 20);
strcpy(szTemp, szInput);
CharUpper(szTemp);
// FIRST, PARSE INPUT FOR PHONETICS
while (iInputPos < iLen && iOutputPos < 8)
{
cLastChar = cThisChar;
cThisChar = szTemp[iInputPos];
cNextChar = szTemp[iInputPos+1];
if (cThisChar != cLastChar)
{
switch (cThisChar)
{
// PROCESS ALL VOWELS FIRST
case 'A':
case 'E':
case 'I':
case 'O':
case 'U':
case 'Y':
{
if (iOutputPos == 0)
//if (!IsVowel(cLastChar))
{
szTempOut[iOutputPos] = cThisChar;
iOutputPos++;
}
break;
}
// B,D,P,F,T,V can all be followed by 'R'.
// in this condition, only use the R
case 'B':
case 'D':
{
if (cNextChar != 'R')
{
szTempOut[iOutputPos] = 'D';
iOutputPos++;
}
break;
}
case 'P':
case 'F':
{
if (cNextChar != 'R')
{
szTempOut[iOutputPos] = 'F';
iOutputPos++;
}
break;
}
case 'T':
{
if (cNextChar != 'R' && cLastChar != 'F')
{
szTempOut[iOutputPos] = 'T';
iOutputPos++;
}
break;
}
case 'V':
{
if (cNextChar != 'R')
{
szTempOut[iOutputPos] = 'F';
iOutputPos++;
}
break;
}
case 'L':
{
szTempOut[iOutputPos] = 'L';
iOutputPos++;
break;
}
case 'R':
{
szTempOut[iOutputPos] = 'R';
iOutputPos++;
break;
}
// M & N should always be mapped to same char
case 'M':
{
szTempOut[iOutputPos] = 'M';
iOutputPos++;
break;
}
case 'N':
{
if (cNextChar != 'T' && cNextChar != 'G')
{
szTempOut[iOutputPos] = 'M';
iOutputPos++;
}
break;
}
// C, K, Q, S should always be same char
case 'C':
case 'K':
case 'Q':
case 'S':
{
if (cLastChar != 'C' && cLastChar != 'K' && cLastChar != 'S')
{
szTempOut[iOutputPos] = 'C';
iOutputPos++;
}
break;
}
// G and J should always be G (with no duplicate chars)
case 'G':
case 'J':
{
if (cLastChar != 'G' && cLastChar != 'J')
{
szTempOut[iOutputPos] = 'G';
iOutputPos++;
}
break;
}
// CHARS WHICH SHOULD ALL BE SILENT, except when first
case 'W':
case 'X':
case 'Z':
case 'H':
{
if (iInputPos == 0)
{
szTempOut[iOutputPos] = cThisChar;
iOutputPos++;
break;
}
break;
}
}
}
iInputPos++;
}
// NOW, BUILD FINAL OUTPUT - THIS SHOULD REMOVE DUPLICATES
iOutputLen = iOutputPos;
iInputPos = 0;
iOutputPos = 0;
cLastChar = 0;
cThisChar = 0;
while (iInputPos < iOutputLen)
{
cLastChar = cThisChar;
cThisChar = szTempOut[iInputPos];
if (cLastChar != cThisChar)
{
szOutput[iOutputPos] = cThisChar;
iOutputPos++;
}
iInputPos++;
}
}
// this function is used to rebuild the main dictionary file
// from a word file. The word file should already be sorted
// in ascending order
void CFPSSpellCheckEngine::BuildMainDIC(LPCSTR lpszWordFile, LPCSTR lpszDICFile)
{
CWaitCursor cursor;
CStdioFile InFile;
CStdioFile OutFile;
CString strWord;
char szOutWord[200];
char szOut[50];
char szPhonetic[50];
char szTemp[20];
if (InFile.Open(lpszWordFile, CFile::modeRead | CFile::shareExclusive))
{
if (OutFile.Open(lpszDICFile, CFile::modeCreate | CFile::modeWrite | CFile::shareExclusive))
{
while (InFile.ReadString(strWord))
{
memset(szOutWord, 0, 6);
memset(szPhonetic, 0, 10);
strWord.TrimLeft();
strWord.TrimRight();
PhoneticWord(strWord.GetBuffer(strWord.GetLength()), szPhonetic);
MetaphoneEx(strWord.GetBuffer(strWord.GetLength()), szOutWord);
_itoa(GetSylableCount(strWord.GetBuffer(strWord.GetLength())), szTemp, 10);
strWord = strWord.Left(18);
memset(szOut, 0, 50);
memcpy(szOut, strWord.GetBuffer(strWord.GetLength()), strWord.GetLength());
memcpy(&szOut[18], szOutWord, 6);
memcpy(&szOut[24], szTemp, 2);
memcpy(&szOut[26], szPhonetic, 8);
OutFile.Write(szOut, 34);
OutFile.WriteString("\n");
}
OutFile.Close();
}
InFile.Close();
}
}
// this is an alternate version of the GetRecord function
// which only returns the word from the dictionary, and not
// the metaphone, length and phonetic representations
BOOL CFPSSpellCheckEngine::GetRecord(long lRecord, char *szWord)
{
long lCachePos = (lRecord - 1) * 18;
memset(szWord, 0, 19);
memcpy(szWord, &m_pWordCache[lCachePos], 18);
return TRUE;
}
// returns the number of vowels in a word
int CFPSSpellCheckEngine::GetVowelCount(char* szWord)
{
int iCount = 0;
int iLen = lstrlen(szWord);
for (int iPos = 0; iPos < iLen; iPos++)
{
if (IsVowel(szWord[iPos]))
iCount++;
}
return iCount;
}
// creates an output based on an input word, but removes
// all vowels
void CFPSSpellCheckEngine::RemoveVowels(char *szInput, char *szOutput)
{
int iInLen = lstrlen(szInput);
int iInPos = 0;
int iOutPos = 0;
memset(szOutput, 0, iInLen);
for (iInPos = 0; iInPos < iInLen; iInPos++)
{
if (!IsVowel(szInput[iInPos]))
{
szOutput[iOutPos] = szInput[iInPos];
iOutPos++;
}
}
}
// caches the words in the main dictionary
void CFPSSpellCheckEngine::CacheWords()
{
long lRecord = 0;
long lCachePos = 0;
char szWord[50];
char szMetaphone[50];
int iSylables;
char szPhonetic[50];
if (m_pWordCache)
free (m_pWordCache);
m_pWordCache = (char*)malloc(GetRecordCount() * 18);
if (!m_pWordCache)
return;
for (lRecord = 1; lRecord <= GetRecordCount(); lRecord++)
{
GetRecord(lRecord, szWord, szMetaphone, iSylables, szPhonetic);
memcpy(&m_pWordCache[lCachePos], szWord, 18);
lCachePos += 18;
}
}
// this function is called by the FindMatchingWords function
// to determine if 2 metaphone representations of a word
// match based on the spell checking engines requirements
BOOL CFPSSpellCheckEngine::IsMetaphoneMatch(char *szMeta1, char *szMeta2)
{
BOOL bReturn = FALSE;
// if the first letter does not match, but the remainder matches, OK
if (strcmp(&szMeta1[1], &szMeta2[1]) == 0)
{
// various combinations of letters which when
// first, and remainder of metaphone matches,
// match=OK
if (szMeta1[0] == 'K' && szMeta2[0] == 'C')
bReturn = TRUE;
if (szMeta1[0] == 'C' && szMeta2[0] == 'K')
bReturn = TRUE;
if (szMeta1[0] == 'P' && szMeta2[0] == 'F')
bReturn = TRUE;
if (szMeta1[0] == 'F' && szMeta2[0] == 'P')
bReturn = TRUE;
if (szMeta1[0] == 'S' && szMeta2[0] == 'C')
bReturn = TRUE;
if (szMeta1[0] == 'C' && szMeta2[0] == 'S')
bReturn = TRUE;
}
return bReturn;
}
// checks 2 words to determine if they match enough
// to quialify for the spell checking engines
// requirements
BOOL CFPSSpellCheckEngine::IsWordMatch(char *szWord1, int iWord1Len, char *szWord2, int iWord2Len)
{
BOOL bReturn = FALSE;
char szCopy2[50];
strcpy(szCopy2, szWord2);
CharUpper(szCopy2);
// check for one letter differences on words with same len more than 4 chars
if ((!bReturn) && (iWord1Len == iWord2Len) && (iWord1Len > 2))
{
int iDiffCount = 0;
int iPos = 0;
while (iPos < iWord1Len && iDiffCount < 2)
{
if (szWord1[iPos] != szCopy2[iPos])
iDiffCount++;
iPos++;
}
if (iDiffCount == 1)
bReturn = TRUE;
}
// if not same len, check for missing 'l' in 'ly' ending
if (szCopy2[iWord2Len-1] == 'Y' && szWord1[iWord1Len-2] != 'L')
{
//if (memcmp(szWord1, szCopy2, iWord2Len-2) == 0)
// bReturn = TRUE;
}
return bReturn;
}
BOOL CFPSSpellCheckEngine::SaveWordFile(LPCSTR lpszFileName)
{
CStdioFile fp;
BOOL bReturn = TRUE;
char szWord[50];
long lRecordCount = GetRecordCount();
try
{
if (fp.Open(lpszFileName, CFile::modeWrite | CFile::modeCreate | CFile::shareExclusive))
{
for (long lRecord = 1; lRecord <= lRecordCount; lRecord++)
{
GetRecord(lRecord, szWord);
fp.WriteString(CString(szWord));
fp.WriteString("\n");
}
fp.Close();
}
else
{
bReturn = FALSE;
}
}
catch(...)
{
bReturn = FALSE;
}
return bReturn;
}
BOOL CFPSSpellCheckEngine::CheckEndings(LPCSTR lpszW1, int iLen1, LPCSTR lpszW2, int iLen2)
{
BOOL bReturn = TRUE;
if (lpszW1[iLen1-1] == 'S' || lpszW1[iLen1-1] == 's')
{
if (lpszW2[iLen2-1] != 'S' && lpszW2[iLen2-1] != 's' && lpszW2[iLen2-1] != 'D' && lpszW2[iLen2-1] != 'd')
bReturn = FALSE;
}
else if (lpszW1[iLen1-1] == 'D' || lpszW1[iLen1-1] == 'd')
{
if (lpszW2[iLen2-1] != 'D' && lpszW2[iLen2-1] != 'd' && lpszW2[iLen2-1] != 'S' && lpszW2[iLen2-1] != 's')
bReturn = FALSE;
}
return bReturn;
}
void CFPSSpellCheckEngine::RemoveVowelsAndDups(char *szInput, char *szOutput)
{
int iInLen = lstrlen(szInput);
int iInPos = 0;
int iOutPos = 0;
char cLastChar = 0;
memset(szOutput, 0, iInLen);
for (iInPos = 0; iInPos < iInLen; iInPos++)
{
if (!IsVowel(szInput[iInPos]))
{
if (cLastChar != szInput[iInPos])
{
szOutput[iOutPos] = szInput[iInPos];
iOutPos++;
}
}
cLastChar = szInput[iInPos];
}
}