// Ternary DAG spellchecker
// (c) Peter Kankowski, 2008. kankowski@narod.ru
#define _CRT_SECURE_NO_DEPRECATE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <limits.h>
#include "spellchecker.h"
#ifndef __cplusplus
typedef int bool;
#define true 1
#define false 0
#endif
#define NUM_OF(A) (sizeof(A)/sizeof((A)[0]))
// ==============================
// Hash table-based spell checker
// (for performance comparison)
// ==============================
// String pool for the words
char g_htstrings[1048576];
// Hash table
struct HT_ITEM {
struct HT_ITEM* next;
char* word;
};
typedef struct HT_ITEM HT_ITEM;
HT_ITEM* g_htable[131072] = {0};
// http://murmurhash.googlepages.com/MurmurHash2.cpp
static unsigned murmur2_hash(const char * key, size_t len) {
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
const unsigned int m = 0x5bd1e995;
const int r = 24;
// Initialize the hash to a 'random' value
unsigned int seed = 0x3FB0BB5F;
unsigned int h = seed ^ (unsigned)len;
// Mix 4 bytes at a time into the hash
const unsigned char * data = (const unsigned char *)key;
while(len >= 4) {
unsigned int k = *(unsigned int *)data;
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
data += 4;
len -= 4;
}
// Handle the last few bytes of the input array
switch(len) {
case 3: h ^= data[2] << 16;
case 2: h ^= data[1] << 8;
case 1: h ^= data[0];
h *= m;
};
// Do a few final mixes of the hash to ensure the last few
// bytes are well-incorporated.
h ^= h >> 13;
h *= m;
h ^= h >> 15;
return h;
}
// Hash table lookup
static int find_exact(const char* key, size_t len)
{
unsigned hash = murmur2_hash(key, len) % NUM_OF(g_htable);
for(HT_ITEM* item = g_htable[hash]; item != NULL; item = item->next)
if(0 == memcmp(item->word, key, len))
return true;
return false;
}
static int HT_find_exact (const char* word) {
if(find_exact(word, strlen(word)))
return true;
// Convert the first letter to lower-case
if(!isupper(*word))
return false;
char lowercase[ST_MAX_WORD];
char * lc = lowercase;
*lc++ = (char)tolower(*word++);
// One-letter words should match case-sensitively
if(*word == '\0')
return false;
// Two case variations are allowed: all capital letters (SOMETHING) or the first
// capital letter (Something). Other combinations should match exactly (MHz).
if ( isupper(*word) ) { // Other letters must be capital, too
*lc++ = (char)tolower(*word++);
while(*word) {
if (!isupper(*word) && *word != '\'')
return false;
*lc++ = (char)tolower(*word++);
}
} else { // Other letters must be small
*lc++ = *word++;
while(*word) {
if (!islower(*word) && *word != '\'')
return false;
*lc++ = *word++;
}
}
*lc = '\0';
return find_exact(lowercase, lc - lowercase);
}
struct HT_ENUMINFO {
void (* enumproc)(const char*, void*);
void* param;
char result[ST_MAX_WORD];
};
typedef struct HT_ENUMINFO HT_ENUMINFO;
static void report_suggestion(HT_ENUMINFO * enuminfo) {
enuminfo->enumproc(enuminfo->result, enuminfo->param);
}
// Match a similar letter (e.g., berru instead of berry).
static void HT_similar_letters (const char * word, HT_ENUMINFO * enuminfo) {
// Similar letters are phonetically similar or located close on qwerty keyboard
static const char * similar_letters[] = {
/* a */ "eiqswz",
/* b */ "ghnpv",
/* c */ "dfksvx",
/* d */ "cefrstwx",
/* e */ "adfirsuw",
/* f */ "cdegrtv",
/* g */ "bfhkrtvy",
/* h */ "bgjntuy",
/* i */ "aejklouy",
/* j */ "hikmnuy",
/* k */ "cgijlmoqu",
/* l */ "ikop",
/* m */ "jkn",
/* n */ "bhjm",
/* o */ "iklpu",
/* p */ "blo",
/* q */ "aksw",
/* r */ "defgt",
/* s */ "acdeqwxz",
/* t */ "dfghry",
/* u */ "ehijkoy",
/* v */ "bcfg",
/* w */ "adeqs",
/* x */ "cdsz",
/* y */ "ghijtu",
/* z */ "asx",
};
size_t len = strlen(word);
strcpy(enuminfo->result, word);
for(unsigned j = 0; j < len; j++) {
if( (unsigned)(word[j] - 'a') >= 26 )
continue;
const char * similar = similar_letters[word[j] - 'a'];
size_t similarlen = strlen(similar);
// Try each similar letter
for(unsigned i = 0; i < similarlen; i++) {
enuminfo->result[j] = similar[i];
if(find_exact(enuminfo->result, len))
report_suggestion(enuminfo);
}
enuminfo->result[j] = word[j];
}
}
static const char g_alphabet[] = "abcdefghijklmnopqrstuvwxyz";
// Insert a letter (e.g., sometmes instead of sometimes).
static void HT_insert (const char * word, HT_ENUMINFO * enuminfo) {
size_t len = strlen(word);
strcpy(enuminfo->result + 1, word);
for(unsigned j = 0; j <= len; j++) {
// Try each letter
for(unsigned i = 0; i < NUM_OF(g_alphabet) - 1; i++) {
enuminfo->result[j] = g_alphabet[i];
if(find_exact(enuminfo->result, len + 1))
report_suggestion(enuminfo);
}
enuminfo->result[j] = word[j];
}
}
// Transpose letters (e.g., comrpession instead of compression)
static void HT_transpose (const char * word, HT_ENUMINFO * enuminfo) {
size_t len = strlen(word);
strcpy(enuminfo->result, word);
for(unsigned j = 0; j < len - 1; j++) {
enuminfo->result[j] = word[j+1];
enuminfo->result[j+1] = word[j];
if(find_exact(enuminfo->result, len))
report_suggestion(enuminfo);
enuminfo->result[j] = word[j];
}
}
// Delete a letter (e.g., ghjost instead of ghost)
static void HT_delete (const char * word, HT_ENUMINFO * enuminfo) {
size_t len = strlen(word);
strcpy(enuminfo->result, word + 1);
for(unsigned j = 0; j < len; j++) {
if(find_exact(enuminfo->result, len - 1))
report_suggestion(enuminfo);
enuminfo->result[j] = word[j];
}
}
// Insert a space ("cando" instead of "can do")
static void HT_insertspace (const char * word, HT_ENUMINFO * enuminfo) {
size_t len = strlen(word);
for(unsigned j = 1; j < len - 1; j++) {
if(find_exact(word, j) && find_exact(word + j, len - j))
{
memcpy(enuminfo->result, word, j);
enuminfo->result[j] = ' ';
memcpy(enuminfo->result + j + 1, word + j, len - j + 1);
report_suggestion(enuminfo);
}
}
}
static void HT_enum_suggestions (const char * word,
void (* enumproc)(const char*, void*), void * param)
{
HT_ENUMINFO enuminfo;
enuminfo.param = param;
enuminfo.enumproc = enumproc;
HT_similar_letters(word, &enuminfo);
HT_insert(word, &enuminfo);
HT_transpose(word, &enuminfo);
HT_delete(word, &enuminfo);
HT_insertspace(word, &enuminfo);
}
static bool HT_load_file(void) {
// Read the dictionary in g_htstrings string pool
FILE* dic = fopen("dic.txt", "r");
if(!dic) {
puts("Open dictionary failed\n");
return false;
}
char* end = g_htstrings +
fread(g_htstrings, sizeof(char), NUM_OF(g_htstrings) - 2, dic);
*end = '\0';
*(end+1) = '\0';
fclose(dic);
char* line = g_htstrings;
while(line < end) {
// Add word to the hash table
size_t len = strcspn(line, "\r\n");
unsigned hash = murmur2_hash(line, len) % NUM_OF(g_htable);
HT_ITEM* item = (HT_ITEM*)malloc(sizeof(HT_ITEM));
if(!item) {
puts("Out of memory\n");
return false;
}
item->next = g_htable[hash];
item->word = line;
g_htable[hash] = item;
// Null-terminate the word
line += len;
*line = '\0';
// Advance to the next word
do
line++;
while(*line == '\r' || *line == '\n');
}
return true;
}
// Free hash table items
static void HT_free(void) {
for(size_t i = 0; i < NUM_OF(g_htable); i++) {
HT_ITEM* item = g_htable[i];
while(item != NULL) {
HT_ITEM* next = item->next;
free(item);
item = next;
}
}
}
// ======================
// Tests
// ======================
int g_found;
static void test_enum_proc(const char* found, void* param) {
const char * expected = (const char*)param;
if(0 == strcmp(expected, found))
{
g_found = 1;
}
}
static void test_suggestion(const char * word, const char * expected)
{
g_found = 0;
HT_enum_suggestions(word, test_enum_proc, (void*)expected);
assert(g_found);
}
// ======= Benchmarks ============
typedef unsigned __int64 TICKS;
#ifdef __GNUC__
__attribute((always_inline)) TICKS GetRDTSC(void) {
TICKS ret;
__asm__ __volatile__ (
"XORL %%eax, %%eax\n"
"CPUID\n"
"RDTSC\n"
: "=A" (ret)
:
: "%ebx", "%ecx"
);
return ret;
}
#elif _MSC_VER
TICKS inline GetRDTSC(void) {
__asm {
; Flush the pipeline
XOR eax, eax
CPUID
; Get RDTSC counter in edx:eax
RDTSC
}
}
#else
#error Unknown compiler
#endif
static void benchmark_HT_find_exact (void * param) {
param = param;
HT_find_exact("Four");
HT_find_exact("score");
HT_find_exact("and");
HT_find_exact("seven");
HT_find_exact("years");
HT_find_exact("ago");
}
static void benchmark_ST_find_exact (void * param) {
ST_FILE_NODE * dict = (ST_FILE_NODE *)param;
ST_find_exact(dict, "Four");
ST_find_exact(dict, "score");
ST_find_exact(dict, "and");
ST_find_exact(dict, "seven");
ST_find_exact(dict, "years");
ST_find_exact(dict, "ago");
}
static void benchmark_enum_proc(const char* found, void* param) {
found = found;
param = param;
}
static void benchmark_HT_fuzzy_match (void * param) {
param = param;
HT_enum_suggestions("comrpession", benchmark_enum_proc, NULL);
HT_enum_suggestions("brotjer", benchmark_enum_proc, NULL);
HT_enum_suggestions("teh", benchmark_enum_proc, NULL);
HT_enum_suggestions("silencet", benchmark_enum_proc, NULL);
}
static void benchmark_ST_fuzzy_match (void * param) {
ST_FILE_NODE * dict = (ST_FILE_NODE *)param;
ST_enum_suggestions(dict, "comrpession", benchmark_enum_proc, NULL);
ST_enum_suggestions(dict, "brotjer", benchmark_enum_proc, NULL);
ST_enum_suggestions(dict, "teh", benchmark_enum_proc, NULL);
ST_enum_suggestions(dict, "silencet", benchmark_enum_proc, NULL);
}
static void benchmark(char * name, void (*func)(void *), void * param ) {
unsigned min_time = INT_MAX;
for(int k = 0; k < 5; k++) {
TICKS start = GetRDTSC();
func(param);
TICKS time = GetRDTSC() - start;
if(time < min_time)
min_time = (int)time;
}
printf("%-20s: %7d\n", name, min_time);
}
int main() {
// Create hash table and do some tests
if (!HT_load_file())
return 1;
assert( HT_find_exact("address") );
assert( !HT_find_exact("addres") );
assert( !HT_find_exact("ddress") );
assert( !HT_find_exact("addressd") );
assert( HT_find_exact("you") );
assert( HT_find_exact("I") );
assert( HT_find_exact("Address") );
assert( HT_find_exact("ADDRESS") );
assert( !HT_find_exact("AddReSS") );
assert( !HT_find_exact("ADDReSS") );
assert( !HT_find_exact("aDDReSS") );
assert( HT_find_exact("Me") );
assert( HT_find_exact("ME") );
assert( !HT_find_exact("mE") );
//test_suggestion(dict, "address", "address");
test_suggestion("addres", "address");
test_suggestion("berru", "berry");
test_suggestion("brotjer", "brother");
test_suggestion("sometme", "sometime");
test_suggestion("roket", "rocket");
test_suggestion("sory", "sorry");
test_suggestion("comrpession", "compression");
test_suggestion("ghjost", "ghost");
test_suggestion("cando", "can do");
// Open ternary DAG dictionary
ST_FILE_NODE * dict = ST_open_dict("spellchk.dat");
if(!dict) {
puts("Open dictionary (spellchk.dat) failed\n");
return 1;
}
// Run the benchmarks
puts("Exact match");
benchmark("hash table", benchmark_HT_find_exact, NULL);
benchmark("ternary DAG", benchmark_ST_find_exact, dict);
puts("\nFuzzy match");
benchmark("hash table", benchmark_HT_fuzzy_match, NULL);
benchmark("ternary DAG", benchmark_ST_fuzzy_match, dict);
HT_free();
ST_close_dict(dict);
return 0;
}