Introduction
While working on adding an English dictionary to a company website, I ran into the problem of misspelling a word while testing the application. As this is likely to be a common user error, I decided to read up on basic phonetic matching. While SQL Server implements the Soundex function, Microsoft Access (the format in which the dictionary is stored) does not.
So the task was simple. Find an algorithm on the internet that could be used to populate a Soundex field within the database, for use in phonetic comparisons.
Unfortunately, when I went looking for sample code on the internet, most of it was terribly outdated. Most of the code, written for either VBScript, or Visual Basic 6 or earlier, made heavy use of expensive functions such as MID and LEFT. These functions, to put it mildly, are not efficient when compared to accessing characters directly via a character array.
Since I was going to be processing well over 100,000 articles, I decided to write my own Soundex functions based on standardized algorithms, using a tighter, more efficient loop. The resulting code is included below.
VISUAL BASIC CODE SAMPLE
''' <summary>
''' Computes the Soundex code of <paramref name="Word"/> using the
''' conventional code length of four characters.
''' </summary>
''' <param name="Word">The word to encode.</param>
''' <returns>The result of the two-argument overload called with a length of 4.</returns>
Public Shared Function Compute(ByVal Word As String) As String
    ' One leading letter followed by three digits.
    Const DefaultLength As Integer = 4
    Dim Code As String = Compute(Word, DefaultLength)
    Return Code
End Function
''' <summary>
''' Computes a Soundex code of exactly <paramref name="Length"/> characters
''' for <paramref name="Word"/>.
''' </summary>
''' <remarks>
''' The first character of the word is kept as-is (upper-cased); each following
''' letter is mapped to a digit 1-6, vowels and H, W, Y map to 0 and are never
''' written. A digit is written only when it differs from the code of the letter
''' immediately before it, including the first letter. Characters that are not
''' A-Z are skipped and do not separate duplicate codes. The result is padded
''' on the right with "0" up to <paramref name="Length"/>.
''' </remarks>
''' <param name="Word">The word to encode. Nothing or an empty string yields "".</param>
''' <param name="Length">Total length of the code; values below 1 yield "".</param>
''' <returns>The Soundex code, or an empty string when there is nothing to encode.</returns>
Public Shared Function Compute(ByVal Word As String, ByVal Length As Integer) As String
    If String.IsNullOrEmpty(Word) OrElse Length < 1 Then
        Return ""
    End If

    ' Invariant casing keeps the A-Z lookup independent of the current culture.
    Dim Chars() As Char = Word.ToUpperInvariant().ToCharArray()
    Dim Buffer As New System.Text.StringBuilder(Length)
    Buffer.Append(Chars(0))

    ' Seed with the first letter's own code so that an immediately following
    ' letter with the same code is not emitted (e.g. "Lloyd" -> "L300").
    Dim PrevCode As Integer = SoundexCode(Chars(0))
    If PrevCode < 0 Then PrevCode = 0

    Dim i As Integer = 1
    ' Checking the buffer length before appending guarantees the result is
    ' never longer than Length, even when Length is 1.
    While i < Chars.Length AndAlso Buffer.Length < Length
        Dim CurrCode As Integer = SoundexCode(Chars(i))
        If CurrCode >= 0 Then
            If CurrCode <> 0 AndAlso CurrCode <> PrevCode Then
                Buffer.Append(CurrCode)
            End If
            ' Remember the code of the letter just processed; without this,
            ' adjacent letters with the same code would all be emitted.
            PrevCode = CurrCode
        End If
        i += 1
    End While

    If Buffer.Length < Length Then
        Buffer.Append("0"c, Length - Buffer.Length)
    End If

    Return Buffer.ToString()
End Function

''' <summary>
''' Maps an upper-case letter to its Soundex code (0-6), or -1 when the
''' character is not one of the letters A-Z.
''' </summary>
Private Shared Function SoundexCode(ByVal Letter As Char) As Integer
    Select Case Letter
        Case "A"c, "E"c, "I"c, "O"c, "U"c, "H"c, "W"c, "Y"c
            Return 0
        Case "B"c, "F"c, "P"c, "V"c
            Return 1
        Case "C"c, "G"c, "J"c, "K"c, "Q"c, "S"c, "X"c, "Z"c
            Return 2
        Case "D"c, "T"c
            Return 3
        Case "L"c
            Return 4
        Case "M"c, "N"c
            Return 5
        Case "R"c
            Return 6
        Case Else
            Return -1
    End Select
End Function
C SHARP CODE SAMPLE
/// <summary>
/// Computes the Soundex code of <paramref name="word"/> using the
/// conventional code length of four characters.
/// </summary>
/// <param name="word">The word to encode. <c>null</c> or empty yields an empty string.</param>
/// <returns>The four-character Soundex code, or an empty string when there is nothing to encode.</returns>
public static string Compute(string word)
{
    return Compute(word, 4);
}

/// <summary>
/// Computes a Soundex code of exactly <paramref name="length"/> characters
/// for <paramref name="word"/>.
/// </summary>
/// <remarks>
/// The first character of the word is kept as-is (upper-cased); each following
/// letter is mapped to a digit 1-6, while vowels and H, W, Y map to 0 and are
/// never written. A digit is written only when it differs from the code of the
/// letter immediately before it, including the first letter. Characters that
/// are not A-Z are skipped and do not separate duplicate codes. The result is
/// padded on the right with '0' up to <paramref name="length"/>.
/// </remarks>
/// <param name="word">The word to encode. <c>null</c> or empty yields an empty string.</param>
/// <param name="length">Total length of the code; values below 1 yield an empty string.</param>
/// <returns>The Soundex code, or an empty string when there is nothing to encode.</returns>
public static string Compute(string word, int length)
{
    if (string.IsNullOrEmpty(word) || length < 1)
    {
        return "";
    }

    // Invariant casing keeps the A-Z lookup independent of the current culture
    // (under a Turkish culture, ToUpper() would not turn 'i' into 'I').
    char[] chars = word.ToUpperInvariant().ToCharArray();
    StringBuilder buffer = new StringBuilder(length);
    buffer.Append(chars[0]);

    // Seed with the first letter's own code so that an immediately following
    // letter with the same code is not emitted (e.g. "Lloyd" -> "L300").
    char prevDigit = GetSoundexDigit(chars[0]);
    if (prevDigit == '\0')
    {
        prevDigit = '0';
    }

    // Checking the buffer length before appending guarantees the result is
    // never longer than 'length', even when 'length' is 1.
    for (int i = 1; i < chars.Length && buffer.Length < length; i++)
    {
        char currDigit = GetSoundexDigit(chars[i]);
        if (currDigit == '\0')
        {
            // Not a letter: ignore it entirely.
            continue;
        }

        if (currDigit != '0' && currDigit != prevDigit)
        {
            buffer.Append(currDigit);
        }

        prevDigit = currDigit;
    }

    if (buffer.Length < length)
    {
        buffer.Append('0', length - buffer.Length);
    }

    return buffer.ToString();
}

/// <summary>
/// Maps an upper-case letter to its Soundex digit ('0'-'6'), or returns
/// '\0' when the character is not one of the letters A-Z.
/// </summary>
private static char GetSoundexDigit(char letter)
{
    switch (letter)
    {
        case 'A':
        case 'E':
        case 'I':
        case 'O':
        case 'U':
        case 'H':
        case 'W':
        case 'Y':
            return '0';
        case 'B':
        case 'F':
        case 'P':
        case 'V':
            return '1';
        case 'C':
        case 'G':
        case 'J':
        case 'K':
        case 'Q':
        case 'S':
        case 'X':
        case 'Z':
            return '2';
        case 'D':
        case 'T':
            return '3';
        case 'L':
            return '4';
        case 'M':
        case 'N':
            return '5';
        case 'R':
            return '6';
        default:
            return '\0';
    }
}
}