Using Ternary DAGs for Spelling Correction

Peter Kankowski
Rate me:
4.97/5 (32 votes)
10 Jan 2009Zlib9 min read
57.7K
739
A new data structure for spellchecking and spelling correction
ternarydags_src.zip
- benchmark.c
- benchmark.exe
- benchmark.vcproj
- build.bat
- create_dictionary.c
- create_dictionary.exe
- create_dictionary.ppj
- create_dictionary.vcproj
- dic.Copyright.txt
- dic.txt
- spellchecker.c
- spellchecker.h
- spellchecker.sln
- spellchecker.vcproj
- spellchecker_test.c
- spellchecker_test.exe
- spellchk.dat
// Ternary DAG spellchecker
// (c) Peter Kankowski, 2008. kankowski@narod.ru

#define _CRT_SECURE_NO_DEPRECATE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <limits.h>
#include "spellchecker.h"

#ifndef __cplusplus
typedef int bool;
#define true 1
#define false 0
#endif

#define NUM_OF(A) (sizeof(A)/sizeof((A)[0]))

// ==============================
// Hash table-based spell checker
// (for performance comparison)
// ==============================

// String pool for the words
char g_htstrings[1048576];

// Hash table
struct HT_ITEM {
	struct HT_ITEM* next;
	char* word;
};
typedef struct HT_ITEM HT_ITEM;

HT_ITEM* g_htable[131072] = {0};


// http://murmurhash.googlepages.com/MurmurHash2.cpp
static unsigned murmur2_hash(const char * key, size_t len) {
	// 'm' and 'r' are mixing constants generated offline.
	// They're not really 'magic', they just happen to work well.

	const unsigned int m = 0x5bd1e995;
	const int r = 24;

	// Initialize the hash to a 'random' value
	unsigned int seed = 0x3FB0BB5F;
	unsigned int h = seed ^ (unsigned)len;

	// Mix 4 bytes at a time into the hash

	const unsigned char * data = (const unsigned char *)key;

	while(len >= 4) {
		unsigned int k = *(unsigned int *)data;

		k *= m; 
		k ^= k >> r; 
		k *= m; 
		
		h *= m; 
		h ^= k;

		data += 4;
		len -= 4;
	}
	
	// Handle the last few bytes of the input array

	switch(len) {
		case 3: h ^= data[2] << 16;
		case 2: h ^= data[1] << 8;
		case 1: h ^= data[0];
				h *= m;
	};

	// Do a few final mixes of the hash to ensure the last few
	// bytes are well-incorporated.

	h ^= h >> 13;
	h *= m;
	h ^= h >> 15;

	return h;
} 

// Hash table lookup
static int find_exact(const char* key, size_t len)
{
	unsigned hash = murmur2_hash(key, len) % NUM_OF(g_htable);
	for(HT_ITEM* item = g_htable[hash]; item != NULL; item = item->next)
		if(0 == memcmp(item->word, key, len))
			return true;
	return false;
}

static int HT_find_exact (const char* word) {
	if(find_exact(word, strlen(word)))
		return true;
	
	// Convert the first letter to lower-case
	if(!isupper(*word))
		return false;
	char lowercase[ST_MAX_WORD];
	char * lc = lowercase;
	*lc++ = (char)tolower(*word++);

	// One-letter words should match case-sensitively
	if(*word == '\0')
		return false;
	
	// Two case variations are allowed: all capital letters (SOMETHING) or the first
	// capital letter (Something). Other combinations should match exactly (MHz).
	if ( isupper(*word) ) { // Other letters must be capital, too
		*lc++ = (char)tolower(*word++);
		while(*word) {
			if (!isupper(*word) && *word != '\'')
				return false;
			*lc++ = (char)tolower(*word++);
		}
	} else { // Other letters must be small
		*lc++ = *word++;
		while(*word) {
			if (!islower(*word) && *word != '\'')
				return false;
			*lc++ = *word++;
		}
	}
	*lc = '\0';
	return find_exact(lowercase, lc - lowercase);
}

struct HT_ENUMINFO {
	void (* enumproc)(const char*, void*);
	void* param;
	char result[ST_MAX_WORD];
};
typedef struct HT_ENUMINFO HT_ENUMINFO;

static void report_suggestion(HT_ENUMINFO * enuminfo) {
	enuminfo->enumproc(enuminfo->result, enuminfo->param);
}

// Match a similar letter (e.g., berru instead of berry).
static void HT_similar_letters (const char * word, HT_ENUMINFO * enuminfo) {
	// Similar letters are phonetically similar or located close on qwerty keyboard
	static const char * similar_letters[] = {
		/* a */ "eiqswz",
		/* b */ "ghnpv",
		/* c */ "dfksvx",
		/* d */ "cefrstwx",
		/* e */ "adfirsuw",
		/* f */ "cdegrtv",
		/* g */ "bfhkrtvy",
		/* h */ "bgjntuy",
		/* i */ "aejklouy",
		/* j */ "hikmnuy",
		/* k */ "cgijlmoqu",
		/* l */ "ikop",
		/* m */ "jkn",
		/* n */ "bhjm",
		/* o */ "iklpu",
		/* p */ "blo",
		/* q */ "aksw",
		/* r */ "defgt",
		/* s */ "acdeqwxz",
		/* t */ "dfghry",
		/* u */ "ehijkoy",
		/* v */ "bcfg",
		/* w */ "adeqs",
		/* x */ "cdsz",
		/* y */ "ghijtu",
		/* z */ "asx",
	};
	size_t len = strlen(word);
	strcpy(enuminfo->result, word);	
	for(unsigned j = 0; j < len; j++) {
		if( (unsigned)(word[j] - 'a') >= 26 )
			continue;
		const char * similar = similar_letters[word[j] - 'a'];
		size_t similarlen = strlen(similar);
		// Try each similar letter
		for(unsigned i = 0; i < similarlen; i++) {
			enuminfo->result[j] = similar[i];
			if(find_exact(enuminfo->result, len))
				report_suggestion(enuminfo);
		}
		enuminfo->result[j] = word[j];
	}
}

static const char g_alphabet[] = "abcdefghijklmnopqrstuvwxyz";

// Insert a letter (e.g., sometmes instead of sometimes).
static void HT_insert (const char * word, HT_ENUMINFO * enuminfo) {
	size_t len = strlen(word);
	strcpy(enuminfo->result + 1, word);
	for(unsigned j = 0; j <= len; j++) {
		// Try each letter
		for(unsigned i = 0; i < NUM_OF(g_alphabet) - 1; i++) {
			enuminfo->result[j] = g_alphabet[i];
			if(find_exact(enuminfo->result, len + 1))
				report_suggestion(enuminfo);
		}
		enuminfo->result[j] = word[j];
	}
}

// Transpose letters (e.g., comrpession instead of compression)
static void HT_transpose (const char * word, HT_ENUMINFO * enuminfo) {
	size_t len = strlen(word);
	strcpy(enuminfo->result, word);
	for(unsigned j = 0; j < len - 1; j++) {
		enuminfo->result[j] = word[j+1];
		enuminfo->result[j+1] = word[j];
		if(find_exact(enuminfo->result, len))
			report_suggestion(enuminfo);
		enuminfo->result[j] = word[j];
	}
}

// Delete a letter (e.g., ghjost instead of ghost)
static void HT_delete (const char * word, HT_ENUMINFO * enuminfo) {
	size_t len = strlen(word);
	strcpy(enuminfo->result, word + 1);
	for(unsigned j = 0; j < len; j++) {
		if(find_exact(enuminfo->result, len - 1))
			report_suggestion(enuminfo);
		enuminfo->result[j] = word[j];
	}
}

// Insert a space ("cando" instead of "can do")
static void HT_insertspace (const char * word, HT_ENUMINFO * enuminfo) {
	size_t len = strlen(word);
	for(unsigned j = 1; j < len - 1; j++) {
		if(find_exact(word, j) && find_exact(word + j, len - j))
		{
			memcpy(enuminfo->result, word, j);
			enuminfo->result[j] = ' ';
			memcpy(enuminfo->result + j + 1, word + j, len - j + 1);
			report_suggestion(enuminfo);
		}
	}
}

static void HT_enum_suggestions (const char * word,
								 void (* enumproc)(const char*, void*), void * param)
{
	HT_ENUMINFO enuminfo;
	enuminfo.param = param;
	enuminfo.enumproc = enumproc;
	HT_similar_letters(word, &enuminfo);
	HT_insert(word, &enuminfo);
	HT_transpose(word, &enuminfo);
	HT_delete(word, &enuminfo);
	HT_insertspace(word, &enuminfo);
}

static bool HT_load_file(void) {
	// Read the dictionary in g_htstrings string pool
	FILE* dic = fopen("dic.txt", "r");
	if(!dic) {
		puts("Open dictionary failed\n");
		return false;
	}
	char* end = g_htstrings +
		fread(g_htstrings, sizeof(char), NUM_OF(g_htstrings) - 2, dic);
	*end = '\0';
	*(end+1) = '\0';
	fclose(dic);
	
	char* line = g_htstrings;
	while(line < end) {
		// Add word to the hash table
		size_t len = strcspn(line, "\r\n");
		unsigned hash = murmur2_hash(line, len) % NUM_OF(g_htable);
		HT_ITEM* item = (HT_ITEM*)malloc(sizeof(HT_ITEM));
		if(!item) {
			puts("Out of memory\n");
			return false;
		}
		item->next = g_htable[hash];
		item->word = line;
		g_htable[hash] = item;

		// Null-terminate the word
		line += len;
		*line = '\0';

		// Advance to the next word
		do
			line++;
		while(*line == '\r' || *line == '\n');
	}

	return true;
}

// Free hash table items
static void HT_free(void) {
	for(size_t i = 0; i < NUM_OF(g_htable); i++) {
		HT_ITEM* item = g_htable[i];
		while(item != NULL) {
			HT_ITEM* next = item->next;
			free(item);
			item = next;
		}
	}
}

// ======================
// Tests
// ======================

int g_found;

static void test_enum_proc(const char* found, void* param) {
	const char * expected = (const char*)param;
	if(0 == strcmp(expected, found))
	{
		g_found = 1;
	}
}

static void test_suggestion(const char * word, const char * expected)
{
	g_found = 0;
	HT_enum_suggestions(word, test_enum_proc, (void*)expected);
	assert(g_found);
}


// ======= Benchmarks ============

typedef unsigned __int64 TICKS;
#ifdef __GNUC__
__attribute((always_inline)) TICKS GetRDTSC(void) {
   TICKS ret;
   __asm__ __volatile__ (
      "XORL %%eax, %%eax\n"
      "CPUID\n"
      "RDTSC\n"
      : "=A" (ret)
      : 
      : "%ebx", "%ecx"
   );
   return ret;
}
#elif _MSC_VER
TICKS inline GetRDTSC(void) {
   __asm {
	  ; Flush the pipeline
      XOR eax, eax
      CPUID
	  ; Get RDTSC counter in edx:eax
	  RDTSC
   }
}
#else
#error Unknown compiler
#endif

static void benchmark_HT_find_exact (void * param) {
	param = param;
	HT_find_exact("Four");
	HT_find_exact("score");
	HT_find_exact("and");
	HT_find_exact("seven");
	HT_find_exact("years");
	HT_find_exact("ago");
}

static void benchmark_ST_find_exact (void * param) {
	ST_FILE_NODE * dict = (ST_FILE_NODE *)param;
	ST_find_exact(dict, "Four");
	ST_find_exact(dict, "score");
	ST_find_exact(dict, "and");
	ST_find_exact(dict, "seven");
	ST_find_exact(dict, "years");
	ST_find_exact(dict, "ago");
}




static void benchmark_enum_proc(const char* found, void* param) {
	found = found;
	param = param;
}

static void benchmark_HT_fuzzy_match (void * param) {
	param = param;
	HT_enum_suggestions("comrpession", benchmark_enum_proc, NULL);
	HT_enum_suggestions("brotjer", benchmark_enum_proc, NULL);
	HT_enum_suggestions("teh", benchmark_enum_proc, NULL);
	HT_enum_suggestions("silencet", benchmark_enum_proc, NULL);
}

static void benchmark_ST_fuzzy_match (void * param) {
	ST_FILE_NODE * dict = (ST_FILE_NODE *)param;
	ST_enum_suggestions(dict, "comrpession", benchmark_enum_proc, NULL);
	ST_enum_suggestions(dict, "brotjer", benchmark_enum_proc, NULL);
	ST_enum_suggestions(dict, "teh", benchmark_enum_proc, NULL);
	ST_enum_suggestions(dict, "silencet", benchmark_enum_proc, NULL);
}

static void benchmark(char * name, void (*func)(void *), void * param ) {
	unsigned min_time = INT_MAX;
	for(int k = 0; k < 5; k++) {
		TICKS start = GetRDTSC();
		func(param);
		TICKS time = GetRDTSC() - start;
		if(time < min_time)
			min_time = (int)time;
	}
	printf("%-20s: %7d\n", name, min_time);
}

int main() {
	// Create hash table and do some tests
	if (!HT_load_file())
		return 1;
	assert( HT_find_exact("address") );
	assert( !HT_find_exact("addres") );
	assert( !HT_find_exact("ddress") );
	assert( !HT_find_exact("addressd") );
	assert( HT_find_exact("you") );
	assert( HT_find_exact("I") );
	assert( HT_find_exact("Address") );
	assert( HT_find_exact("ADDRESS") );
	assert( !HT_find_exact("AddReSS") );
	assert( !HT_find_exact("ADDReSS") );
	assert( !HT_find_exact("aDDReSS") );
	assert( HT_find_exact("Me") );
	assert( HT_find_exact("ME") );
	assert( !HT_find_exact("mE") );

	//test_suggestion(dict, "address", "address");
	test_suggestion("addres", "address");
	test_suggestion("berru", "berry");
	test_suggestion("brotjer", "brother");
	test_suggestion("sometme", "sometime");
	test_suggestion("roket", "rocket");
	test_suggestion("sory", "sorry");
	test_suggestion("comrpession", "compression");
	test_suggestion("ghjost", "ghost");
	test_suggestion("cando", "can do");

	// Open ternary DAG dictionary
	ST_FILE_NODE * dict = ST_open_dict("spellchk.dat");
	if(!dict) {
		puts("Open dictionary (spellchk.dat) failed\n");
		return 1;
	}

	// Run the benchmarks
	puts("Exact match");
	benchmark("hash table", benchmark_HT_find_exact, NULL);
	benchmark("ternary DAG", benchmark_ST_find_exact, dict);

	puts("\nFuzzy match");
	benchmark("hash table", benchmark_HT_fuzzy_match, NULL);
	benchmark("ternary DAG", benchmark_ST_fuzzy_match, dict);

	HT_free();
	ST_close_dict(dict);
	return 0;
}
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.
License

This article, along with any associated source code and files, is licensed under The zlib/libpng License
Written By
Peter Kankowski
Software Developer
Czech Republic
Peter is the developer of Aba Search and Replace, a tool for replacing text in multiple files. He likes to program in C with a bit of C++, also in x86 assembly language, Python, and PHP.
Using Ternary DAGs for Spelling Correction

License

Comments and Discussions