// Ternary DAG spellchecker
// (c) Peter Kankowski, 2008. kankowski@narod.ru
#define _CRT_SECURE_NO_DEPRECATE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "spellchecker.h"
#ifndef __cplusplus
typedef int bool;
#define true 1
#define false 0
#endif
static ST_FILE_NODE* get_left(ST_FILE_NODE* node, ST_FILE_NODE* dict) {
return dict + node->left;
}
static ST_FILE_NODE* get_right(ST_FILE_NODE* node, ST_FILE_NODE* dict) {
return dict + node->right;
}
static ST_FILE_NODE* get_middle(ST_FILE_NODE* node, ST_FILE_NODE* dict) {
return dict + node->middle;
}
static bool is_null_link(ST_FILE_NODE* node, ST_FILE_NODE* dict) {
return node == dict;
}
// Open a ternary DAG dictionary, returns NULL in case of error
ST_FILE_NODE * ST_open_dict (const char * filename)
{
FILE* f = fopen(filename, "rb");
if(!f)
return NULL;
// Get file size
fseek(f, 0, SEEK_END);
size_t filesize = ftell(f);
fseek(f, 0L, SEEK_SET);
// Read the whole file to the allocated buffer
ST_FILE_NODE* dict = (ST_FILE_NODE*)malloc(filesize);
if(dict && ( fread(dict, 1, filesize, f) != filesize ||
0 != memcmp(dict, ST_DICT_SIGNATURE, sizeof(ST_FILE_NODE)) ) ) {
free(dict);
dict = NULL;
}
fclose(f);
return dict;
}
// Close the dictionary
void ST_close_dict (ST_FILE_NODE * dict)
{
free(dict);
}
// ===========================
// Searching in ternary DAG
// ===========================
// Returns a non-zero value if the word is present in the dictionary
static int find_exact (ST_FILE_NODE * node, const char * word, ST_FILE_NODE * dict) {
while(!is_null_link(node, dict)) {
// Move to the left of right subtree
if(*word < node->ch)
node = get_left(node, dict);
else if(*word > node->ch)
node = get_right(node, dict);
// Found node with the character; begin searching for the next character
else {
if(*word == '\0')
return true;
node = get_middle(node, dict);
word++;
}
}
return false;
}
int ST_find_exact (ST_FILE_NODE * dict, const char * word) {
if(find_exact(dict + 1, word, dict))
return true;
// Convert the first letter to lower-case
if(!isupper(*word))
return false;
char lowercase[ST_MAX_WORD];
char * lc = lowercase;
*lc++ = (char)tolower(*word++);
// One-letter words should match case-sensitively
if(*word == '\0')
return false;
// Two case variations are allowed: all capital letters (SOMETHING) or the first
// capital letter (Something). Other combinations should match exactly (MHz).
if ( isupper(*word) ) { // Other letters must be capital, too
*lc++ = (char)tolower(*word++);
while(*word) {
if (!isupper(*word) && *word != '\'')
return false;
*lc++ = (char)tolower(*word++);
}
} else { // Other letters must be small
*lc++ = *word++;
while(*word) {
if (!islower(*word) && *word != '\'')
return false;
*lc++ = *word++;
}
}
*lc = '\0';
return find_exact(dict + 1, lowercase, dict);
}
// ======================
// Fuzzy matching
// ======================
struct ST_ENUMINFO {
void (* enumproc)(const char*, void*);
void* param;
char result[ST_MAX_WORD];
char variation[ST_MAX_WORD];
ST_FILE_NODE * dict;
};
typedef struct ST_ENUMINFO ST_ENUMINFO;
static void report_suggestion(ST_ENUMINFO * enuminfo) {
enuminfo->enumproc(enuminfo->result, enuminfo->param);
}
// Correct double letters (including multiple errors in one word, e.g. Misisipi)
static void match_doubles(ST_FILE_NODE* node, const char* word, char* result,
ST_ENUMINFO * enuminfo) {
while(!is_null_link(node, enuminfo->dict)) {
if(*word < node->ch)
node = get_left(node, enuminfo->dict);
else if(*word > node->ch)
node = get_right(node, enuminfo->dict);
else {
*result = *word;
if(*result == '\0') {
// Found the word
report_suggestion(enuminfo);
return;
}
if(*(word + 1) == *word) {
// When found two equal letters, try to find a single letter
match_doubles(get_middle(node, enuminfo->dict), word + 2, result + 1, enuminfo);
} else {
// Otherwise, try to find two equal letters here
match_doubles(get_middle(node, enuminfo->dict), word, result + 1, enuminfo);
}
// Also find exact match
node = get_middle(node, enuminfo->dict);
word++;
result++;
}
}
}
static void exact_match(ST_FILE_NODE* node, const char* word, char* result,
ST_ENUMINFO * enuminfo)
{
if (find_exact(node, word, enuminfo->dict))
{
strcpy(result, word);
report_suggestion(enuminfo);
}
}
// Returns true if the letters are phonetically similar or
// are located close to each other on qwerty keyboard
static bool similar_letters(char ch1, char ch2) {
static const char are_similar[26][26] = {
// a b c d e f g h i j k l m n o p q r s t u v w x y z
/* a */ {1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1},
/* b */ {0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0},
/* c */ {0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0},
/* d */ {0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0},
/* e */ {1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0},
/* f */ {0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0},
/* g */ {0,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0},
/* h */ {0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0},
/* i */ {1,0,0,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0},
/* j */ {0,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0},
/* k */ {0,0,1,0,0,0,1,0,1,1,1,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0},
/* l */ {0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0},
/* m */ {0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0},
/* n */ {0,1,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0},
/* o */ {0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0},
/* p */ {0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0},
/* q */ {1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0},
/* r */ {0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0},
/* s */ {1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1},
/* t */ {0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0},
/* u */ {0,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0},
/* v */ {0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0},
/* w */ {1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0},
/* x */ {0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1},
/* y */ {0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0},
/* z */ {1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1}
};
unsigned index1 = tolower(ch1) - 'a';
unsigned index2 = tolower(ch2) - 'a';
if (index1 >= 26 || index2 >= 26)
return false;
return are_similar[index1][index2] != 0;
}
// Correct a single typo. Enumerate all adjacent nodes and try to find a similar word
static void fuzzy_match(ST_FILE_NODE* node, const char* word, char* result,
ST_ENUMINFO * enuminfo) {
while(!is_null_link(node, enuminfo->dict)) {
if(*word == node->ch) { // If found the exact letter, continue fuzzy matching
*result = node->ch;
if(*word == '\0') {
report_suggestion(enuminfo);
return;
}
fuzzy_match(get_middle(node, enuminfo->dict), word + 1, result + 1, enuminfo);
} else {
// Match a similar letter (e.g., berru instead of berry)
if(similar_letters(*word, node->ch)) {
// Don't allow multiple errors in one word
*result = node->ch;
exact_match(get_middle(node, enuminfo->dict), word + 1, result + 1, enuminfo);
}
// Insert a letter from node (e.g., sometmes instead of sometimes).
// Also handle double letters (e.g., sory instead of sorry).
*result = node->ch;
exact_match(get_middle(node, enuminfo->dict), word, result + 1, enuminfo);
if(word[1] == node->ch) {
// Transpose letters (e.g., comrpession instead of compression)
if(node->ch != '\0') {
*result = node->ch;
enuminfo->variation[0] = *word;
strcpy(enuminfo->variation + 1, word + 2);
exact_match(get_middle(node, enuminfo->dict), enuminfo->variation, result + 1, enuminfo);
}
// Delete a letter (e.g., ghjost instead of ghost)
*result = node->ch;
exact_match(get_middle(node, enuminfo->dict), word + 2, result + 1, enuminfo);
}
// Insert a space ("cando" instead of "can do")
if(node->ch == '\0') {
*result = ' ';
exact_match(enuminfo->dict + 1, word, result + 1, enuminfo);
}
}
// Look in all adjacent nodes (left and right)
fuzzy_match(get_left(node, enuminfo->dict), word, result, enuminfo);
node = get_right(node, enuminfo->dict);
}
}
// Enumerate spelling suggestions (fuzzy matches)
void ST_enum_suggestions (ST_FILE_NODE * dict, const char * word,
void (* enumproc)(const char*, void*), void * param) {
ST_ENUMINFO enuminfo;
enuminfo.param = param;
enuminfo.enumproc = enumproc;
enuminfo.dict = dict;
fuzzy_match(dict + 1, word, enuminfo.result, &enuminfo);
}