Using Ternary DAGs for Spelling Correction

Peter Kankowski

Rate me:

4.97/5 (32 votes)

10 Jan 2009Zlib9 min read

57.7K

739

A new data structure for spellchecking and spelling correction

ternarydags_src.zip
- benchmark.c
- benchmark.exe
- benchmark.vcproj
- build.bat
- create_dictionary.c
- create_dictionary.exe
- create_dictionary.ppj
- create_dictionary.vcproj
- dic.Copyright.txt
- dic.txt
- spellchecker.c
- spellchecker.h
- spellchecker.sln
- spellchecker.vcproj
- spellchecker_test.c
- spellchecker_test.exe
- spellchk.dat

// Ternary DAG spellchecker
// (c) Peter Kankowski, 2008. kankowski@narod.ru

#define _CRT_SECURE_NO_DEPRECATE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "spellchecker.h"

#ifndef __cplusplus
typedef int bool;
#define true 1
#define false 0
#endif

static ST_FILE_NODE* get_left(ST_FILE_NODE* node, ST_FILE_NODE* dict) {
	return dict + node->left;
}
static ST_FILE_NODE* get_right(ST_FILE_NODE* node, ST_FILE_NODE* dict) {
	return dict + node->right;
}
static ST_FILE_NODE* get_middle(ST_FILE_NODE* node, ST_FILE_NODE* dict) {
	return dict + node->middle;
}
static bool is_null_link(ST_FILE_NODE* node, ST_FILE_NODE* dict) {
	return node == dict;
}

// Open a ternary DAG dictionary, returns NULL in case of error
ST_FILE_NODE * ST_open_dict (const char * filename)
{
	FILE* f = fopen(filename, "rb");
	if(!f)
		return NULL;
	
	// Get file size
	fseek(f, 0, SEEK_END);
	size_t filesize = ftell(f);
	fseek(f, 0L, SEEK_SET);
	
	// Read the whole file to the allocated buffer
	ST_FILE_NODE* dict = (ST_FILE_NODE*)malloc(filesize);
	if(dict && ( fread(dict, 1, filesize, f) != filesize ||
				 0 != memcmp(dict, ST_DICT_SIGNATURE, sizeof(ST_FILE_NODE)) ) ) {
		free(dict);
		dict = NULL;
	}
	
	fclose(f);
	return dict;
}

// Close the dictionary
void ST_close_dict (ST_FILE_NODE * dict)
{
	free(dict);
}

// ===========================
// Searching in ternary DAG
// ===========================

// Returns a non-zero value if the word is present in the dictionary
static int find_exact (ST_FILE_NODE * node, const char * word, ST_FILE_NODE * dict) {
	while(!is_null_link(node, dict)) {
		// Move to the left of right subtree
		if(*word < node->ch)
			node = get_left(node, dict);
		else if(*word > node->ch)
			node = get_right(node, dict);
		
		// Found node with the character; begin searching for the next character
		else {
			if(*word == '\0')
				return true;
			node = get_middle(node, dict);
			word++;
		}
	}
	return false;
}

int ST_find_exact (ST_FILE_NODE * dict, const char * word) {
	if(find_exact(dict + 1, word, dict))
		return true;
	
	// Convert the first letter to lower-case
	if(!isupper(*word))
		return false;
	char lowercase[ST_MAX_WORD];
	char * lc = lowercase;
	*lc++ = (char)tolower(*word++);

	// One-letter words should match case-sensitively
	if(*word == '\0')
		return false;
	
	// Two case variations are allowed: all capital letters (SOMETHING) or the first
	// capital letter (Something). Other combinations should match exactly (MHz).
	if ( isupper(*word) ) { // Other letters must be capital, too
		*lc++ = (char)tolower(*word++);
		while(*word) {
			if (!isupper(*word) && *word != '\'')
				return false;
			*lc++ = (char)tolower(*word++);
		}
	} else { // Other letters must be small
		*lc++ = *word++;
		while(*word) {
			if (!islower(*word) && *word != '\'')
				return false;
			*lc++ = *word++;
		}
	}
	*lc = '\0';
	return find_exact(dict + 1, lowercase, dict);
}


// ======================
// Fuzzy matching
// ======================

struct ST_ENUMINFO {
	void (* enumproc)(const char*, void*);
	void* param;
	char result[ST_MAX_WORD];
	char variation[ST_MAX_WORD];
	ST_FILE_NODE * dict;
};
typedef struct ST_ENUMINFO ST_ENUMINFO;

static void report_suggestion(ST_ENUMINFO * enuminfo) {
	enuminfo->enumproc(enuminfo->result, enuminfo->param);
}

// Correct double letters (including multiple errors in one word, e.g. Misisipi)
static void match_doubles(ST_FILE_NODE* node, const char* word, char* result,
						  ST_ENUMINFO * enuminfo) {
	while(!is_null_link(node, enuminfo->dict)) {
		if(*word < node->ch)
			node = get_left(node, enuminfo->dict);
		else if(*word > node->ch)
			node = get_right(node, enuminfo->dict);
		else {
			*result = *word;
			if(*result == '\0') {
				// Found the word
				report_suggestion(enuminfo);
				return;
			}
			if(*(word + 1) == *word) {
				// When found two equal letters, try to find a single letter
				match_doubles(get_middle(node, enuminfo->dict), word + 2, result + 1, enuminfo);
			} else {
				// Otherwise, try to find two equal letters here
				match_doubles(get_middle(node, enuminfo->dict), word, result + 1, enuminfo);
			}
			// Also find exact match
			node = get_middle(node, enuminfo->dict);
			word++;
			result++;
		}
	}
}

static void exact_match(ST_FILE_NODE* node, const char* word, char* result,
						ST_ENUMINFO * enuminfo)
{
	if (find_exact(node, word, enuminfo->dict))
	{
		strcpy(result, word);
		report_suggestion(enuminfo);
	}
}

// Returns true if the letters are phonetically similar or
// are located close to each other on qwerty keyboard
static bool similar_letters(char ch1, char ch2) {
	static const char are_similar[26][26] = {
			  // a b c d e f g h i j k l m n o p q r s t u v w x y z
		/* a */ {1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1},
		/* b */ {0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0},
		/* c */ {0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0},
		/* d */ {0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0},
		/* e */ {1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0},
		/* f */ {0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0},
		/* g */ {0,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0},
		/* h */ {0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0},
		/* i */ {1,0,0,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0},
		/* j */ {0,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0},
		/* k */ {0,0,1,0,0,0,1,0,1,1,1,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0},
		/* l */ {0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0},
		/* m */ {0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0},
		/* n */ {0,1,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0},
		/* o */ {0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0},
		/* p */ {0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0},
		/* q */ {1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0},
		/* r */ {0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0},
		/* s */ {1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1},
		/* t */ {0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0},
		/* u */ {0,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0},
		/* v */ {0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0},
		/* w */ {1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0},
		/* x */ {0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1},
		/* y */ {0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0},
		/* z */ {1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1}
	};
	unsigned index1 = tolower(ch1) - 'a';
	unsigned index2 = tolower(ch2) - 'a';
	if (index1 >= 26 || index2 >= 26)
		return false;
	return are_similar[index1][index2] != 0;
}

// Correct a single typo. Enumerate all adjacent nodes and try to find a similar word
static void fuzzy_match(ST_FILE_NODE* node, const char* word, char* result,
						ST_ENUMINFO * enuminfo) {
	while(!is_null_link(node, enuminfo->dict)) {
		if(*word == node->ch) { // If found the exact letter, continue fuzzy matching
			*result = node->ch;
			if(*word == '\0') {
				report_suggestion(enuminfo);
				return;
			}
			fuzzy_match(get_middle(node, enuminfo->dict), word + 1, result + 1, enuminfo);
		} else {
			// Match a similar letter (e.g., berru instead of berry)
			if(similar_letters(*word, node->ch)) {
				// Don't allow multiple errors in one word
				*result = node->ch;
				exact_match(get_middle(node, enuminfo->dict), word + 1, result + 1, enuminfo);
			}
			
			// Insert a letter from node (e.g., sometmes instead of sometimes).
			// Also handle double letters (e.g., sory instead of sorry).
			*result = node->ch;
			exact_match(get_middle(node, enuminfo->dict), word, result + 1, enuminfo);
			
			if(word[1] == node->ch) {
				// Transpose letters (e.g., comrpession instead of compression)
				if(node->ch != '\0') {
					*result = node->ch;
					enuminfo->variation[0] = *word;
					strcpy(enuminfo->variation + 1, word + 2);
					exact_match(get_middle(node, enuminfo->dict), enuminfo->variation, result + 1, enuminfo);
				}
				
				// Delete a letter (e.g., ghjost instead of ghost)
				*result = node->ch;
				exact_match(get_middle(node, enuminfo->dict), word + 2, result + 1, enuminfo);
			}
			// Insert a space ("cando" instead of "can do")
			if(node->ch == '\0') {
				*result = ' ';
				exact_match(enuminfo->dict + 1, word, result + 1, enuminfo);
			}
		}
		
		// Look in all adjacent nodes (left and right)
		fuzzy_match(get_left(node, enuminfo->dict), word, result, enuminfo);
		node = get_right(node, enuminfo->dict);
	}
}

// Enumerate spelling suggestions (fuzzy matches)
void ST_enum_suggestions (ST_FILE_NODE * dict, const char * word,
						  void (* enumproc)(const char*, void*), void * param) {
	ST_ENUMINFO enuminfo;
	enuminfo.param = param;
	enuminfo.enumproc = enumproc;
	enuminfo.dict = dict;
	fuzzy_match(dict + 1, word, enuminfo.result, &enuminfo);
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The zlib/libpng License

Written By

Peter Kankowski

Software Developer

Czech Republic

Peter is the developer of Aba Search and Replace, a tool for replacing text in multiple files. He likes to program in C with a bit of C++, also in x86 assembly language, Python, and PHP.

Using Ternary DAGs for Spelling Correction

License

Comments and Discussions