Click here to Skip to main content
15,886,026 members
Articles / Database Development / SQL Server / SQL Server 2008

Configuring SQL Server Accent-Insensitivity

Rate me:
Please Sign up or sign in to vote.
5.00/5 (9 votes)
21 Feb 2012CPOL7 min read 57.8K   389   9  
This article describes the steps to re-configure SQL Server so that queries return accent-insensitive results. It includes source code (using Microsoft SQL Server 2008 R2 Shared Management Objects) to automate some of the steps in this procedure.
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace Siphon
{
	public static class Inflector
	{
		#region Inflection rules

		private static Boolean s_isDisposed;
		private static Boolean s_isInitialized;
		private static List<InflectorRule> s_plurals;
		private static List<InflectorRule> s_singulars;
		private static List<String> s_uncountables;

		private sealed class InflectorRule
		{
			private readonly Regex m_regex;
			private readonly String m_replacement;

			public InflectorRule(String regexPattern, String replacementText)
			{
				m_regex = new Regex(regexPattern, RegexOptions.IgnoreCase);
				m_replacement = replacementText;
			}

			public String Apply(String word)
			{
				if (!m_regex.IsMatch(word))
					return null;

				String replace = m_regex.Replace(word, m_replacement);
				return replace;
			}
		}

		#endregion

		#region Construction and initialization

		public static void InitializeRules()
		{
			if (!s_isDisposed && (!s_isInitialized || s_plurals == null || s_singulars == null || s_uncountables == null))
			{
				s_plurals = new List<InflectorRule>();
				s_singulars = new List<InflectorRule>();
				s_uncountables = new List<String>();

				AddPluralRule("$", "s");
				AddPluralRule("s$", "s");
				AddPluralRule("(ax|test)is$", "$1es");
				AddPluralRule("(octop|vir)us$", "$1i");
				AddPluralRule("(alias|status)$", "$1es");
				AddPluralRule("(bu)s$", "$1ses");
				AddPluralRule("(buffal|tomat)o$", "$1oes");
				AddPluralRule("([ti])um$", "$1a");
				AddPluralRule("sis$", "ses");
				AddPluralRule("(?:([^f])fe|([lr])f)$", "$1$2ves");
				AddPluralRule("(hive)$", "$1s");
				AddPluralRule("([^aeiouy]|qu)y$", "$1ies");
				AddPluralRule("(x|ch|ss|sh)$", "$1es");
				AddPluralRule("(matr|vert|ind)ix|ex$", "$1ices");
				AddPluralRule("([m|l])ouse$", "$1ice");
				AddPluralRule("^(ox)$", "$1en");
				AddPluralRule("(quiz)$", "$1zes");

				AddSingularRule("s$", "");
				AddSingularRule("ss$", "ss");
				AddSingularRule("(n)ews$", "$1ews");
				AddSingularRule("([ti])a$", "$1um");
				AddSingularRule("((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1$2sis");
				AddSingularRule("(^analy)ses$", "$1sis");
				AddSingularRule("([^f])ves$", "$1fe");
				AddSingularRule("(hive)s$", "$1");
				AddSingularRule("(tive)s$", "$1");
				AddSingularRule("([lr])ves$", "$1f");
				AddSingularRule("([^aeiouy]|qu)ies$", "$1y");
				AddSingularRule("(s)eries$", "$1eries");
				AddSingularRule("(m)ovies$", "$1ovie");
				AddSingularRule("(x|ch|ss|sh)es$", "$1");
				AddSingularRule("([m|l])ice$", "$1ouse");
				AddSingularRule("(bus)es$", "$1");
				AddSingularRule("(o)es$", "$1");
				AddSingularRule("(shoe)s$", "$1");
				AddSingularRule("(cris|ax|test)es$", "$1is");
				AddSingularRule("(octop|vir)i$", "$1us");
				AddSingularRule("(alias|status)$", "$1");
				AddSingularRule("(alias|status)es$", "$1");
				AddSingularRule("^(ox)en", "$1");
				AddSingularRule("(vert|ind)ices$", "$1ex");
				AddSingularRule("(matr)ices$", "$1ix");
				AddSingularRule("(quiz)zes$", "$1");

				AddIrregularRule("person", "people");
				AddIrregularRule("man", "men");
				AddIrregularRule("child", "children");
				AddIrregularRule("sex", "sexes");
				AddIrregularRule("index", "indexes");
				AddIrregularRule("tax", "taxes");
				AddIrregularRule("move", "moves");
				AddIrregularRule("tooth", "teeth");

				AddUnknownCountRule("equipment");
				AddUnknownCountRule("information");
				AddUnknownCountRule("rice");
				AddUnknownCountRule("money");
				AddUnknownCountRule("species");
				AddUnknownCountRule("series");
				AddUnknownCountRule("fish");
				AddUnknownCountRule("sheep");

				s_isInitialized = true;
			}
		}

		private static void AddPluralRule(String rule, String replacement)
		{
			s_plurals.Add(new InflectorRule(rule, replacement));
		}

		private static void AddSingularRule(String rule, String replacement)
		{
			s_singulars.Add(new InflectorRule(rule, replacement));
		}

		private static void AddIrregularRule(String singular, String plural)
		{
			AddPluralRule("(" + singular[0] + ")" + singular.Substring(1) + "$",
			              "$1" + plural.Substring(1));

			AddSingularRule("(" + plural[0] + ")" + plural.Substring(1) + "$",
			                "$1" + singular.Substring(1));
		}

		private static void AddUnknownCountRule(String word)
		{
			s_uncountables.Add(word.ToLower());
		}

		public static void Initialize()
		{
			s_isDisposed = false;
			s_isInitialized = false;
			InitializeRules();
		}

		public static void Dispose()
		{
			s_isInitialized = false;
			s_isDisposed = true;
			s_plurals = null;
			s_singulars = null;
			s_uncountables = null;
		}

		#endregion

		#region Converting between singular and plural forms

		public static String MakePlural(String word)
		{
			InitializeRules();
			return ApplyRules(s_plurals, word);
		}

		public static String MakeSingular(String word)
		{
			try
			{
				InitializeRules();
				return ApplyRules(s_singulars, word);
			}
			catch (ArgumentNullException ex)
			{
				String message = String.Format("Failed to singularize the word \"{0}\"", word ?? "(null)");
				throw new SiphonException(message, ex);
			}
		}

		private static String ApplyRules(IList<InflectorRule> rules, String word)
		{
			// Set the preconditions for this function.

			if (rules == null)
				throw new ArgumentNullException("rules");

			if (word == null)
				throw new ArgumentNullException("word");

			// Apply the inflection rules to the word.

			String result = word;
			if (s_uncountables == null || !s_uncountables.Contains(word.ToLower()))
			{
				for (Int32 i = rules.Count - 1; i >= 0; i--)
				{
					String temp = rules[i].Apply(word);
					if (temp != null)
					{
						result = temp;
						break;
					}
				}
			}
			return result;
		}

		#endregion

		#region Changing case (upper, lower, title, camel, etc.)

		public static String ToTitleCase(String text)
		{
			return ToTitleCase(text, true);
		}

		public static String ToTitleCase(String text, Boolean ignoreShortWords)
		{
			List<String> ignoreWords = null;
			if (ignoreShortWords)
			{
				ignoreWords = new List<String>();
				ignoreWords.Add("a");
				ignoreWords.Add("is");
				ignoreWords.Add("was");
				ignoreWords.Add("the");
			}

			String[] tokens = text.Split(' ');
			StringBuilder sb = new StringBuilder(text.Length);
			foreach (String token in tokens)
			{
				if (ignoreShortWords
				    && token != tokens[0]
				    && ignoreWords.Contains(token.ToLower()))
				{
					sb.Append(token.ToLower());
				}
				else
				{
					sb.Append(MakeInitialUppercase(token));
				}

				sb.Append(" ");
			}

			return sb.ToString().Trim();
		}

		public static String ToHumanCase(String lowercaseAndUnderscoredWord)
		{
			return MakeInitialUppercase(Regex.Replace(lowercaseAndUnderscoredWord, @"_", " "));
		}

		public static String ToPascalCase(String lowercaseAndUnderscoredWord)
		{
			return ToPascalCase(lowercaseAndUnderscoredWord, true);
		}

		public static String ToPascalCase(String lowercaseAndUnderscoredWord, Boolean removeUnderscores)
		{
			String pattern = "(?:^|_)(.)";
			if (!removeUnderscores)
				pattern = "(?:^)(.)";

			return Regex.Replace(lowercaseAndUnderscoredWord
			                     , pattern
			                     , delegate(Match match) { return match.Groups[1].Value.ToUpper(); });
		}

		public static String ToCamelCase(String lowercaseAndUnderscoredWord)
		{
			return MakeInitialLowercase(ToPascalCase(lowercaseAndUnderscoredWord));
		}

		public static String MakeInitialUppercase(String text)
		{
			if (text == null)
				throw new ArgumentNullException("text");

			return text.Substring(0, 1).ToUpper()
			       + text.Substring(1).ToLower();
		}

		public static String MakeInitialLowercase(String text)
		{
			if (text == null)
				throw new ArgumentNullException("text");

			return text.Substring(0, 1).ToLower() + text.Substring(1);
		}

		#endregion

		#region Soundex

		/*
		 * The algorithm for this function is:
		 * Copyright (c) 2006 Creativyst, Inc.
		 * 
		 * For more information go to:
		 * http://www.creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
		 * 
		 * censusMode
		 *    0 = Disabled
		 *    1 = Properly calculated SoundEx codes found in all census years
		 *    2 = Improperly calculated SoundEx codes found in SOME of the censuses performed in 1880, 1900, and 1910
		 */

		public static String Soundex(String word, Int32 resultLength, Int32 censusMode)
		{
			String temp;
			Int32 soundexLength = 10;

			// Perform sanity checks 

			if (censusMode > 0)
				resultLength = 4;

			if (resultLength > 0)
				soundexLength = resultLength;

			if (soundexLength > 10)
				soundexLength = 10;

			if (soundexLength < 4)
				soundexLength = 4;

			// Remove leading and trailing whitespace

			if (word == null)
				return String.Empty;

			word = word.Trim();

			if (String.IsNullOrEmpty(word))
				return String.Empty;

			// Company names may include digits (e.g. 3M) - it is important to
			// preserve these digits in the resulting SoundEx code

			word = word.Replace("0", "Zero");
			word = word.Replace("1", "One");
			word = word.Replace("2", "Two");
			word = word.Replace("3", "Three");
			word = word.Replace("4", "Four");
			word = word.Replace("5", "Five");
			word = word.Replace("6", "Six");
			word = word.Replace("7", "Seven");
			word = word.Replace("8", "Eight");
			word = word.Replace("9", "Nine");

			// Clean and tidy

			word = word.ToUpper();

			String sound = word;

			// Replace non-chars with spaces; if there is nothing but 
			// whitespace left in the resulting string then abandon ship

			sound = sound.Replace(@"[^A-Z]", " ");
			sound = sound.Trim();

			if (String.IsNullOrEmpty(sound))
				return String.Empty;

			if (censusMode == 0)
			{
				sound = sound.Replace(@"DG", "G"); // Change DG to G
				sound = sound.Replace(@"GH", "H"); // Change GH to H
				sound = sound.Replace(@"GN", "N"); // Change GN to N
				sound = sound.Replace(@"KN", "N"); // Change KN to N
				sound = sound.Replace(@"PH", "F"); // Change PH to F
				sound = sound.Replace(@"MP([STZ])", "M$1"); // Change MPST|MPZ to MST|MZ
				sound = sound.Replace(@"^PS", "S"); // Change leading PS to S
				sound = sound.Replace(@"^PF", "F"); // Change leading PF to F
				sound = sound.Replace(@"MB", "M"); // Change MB to M
				sound = sound.Replace(@"TCH", "CH"); // Change TCH to CH
			}

			// The above improvements may change the first letter

			Char firstLetter = sound[0];

			// In properly done census, SoundEx for H and W will be squezeed out 
			// before performing the test for adjacent digits (this differs from 
			// how 'real' vowels are handled)

			if (firstLetter == 'H' || firstLetter == 'W')
			{
				temp = sound.Substring(1);
				sound = "-";
				sound += temp;
			}

			if (censusMode == 1)
				sound = sound.Replace(@"[HW]", ".");

			// Classic SoundEx

			sound = sound.Replace(@"[AEIOUYHW]", "0");
			sound = sound.Replace(@"[BPFV]", "1");
			sound = sound.Replace(@"[CSGJKQXZ]", "2");
			sound = sound.Replace(@"[DT]", "3");
			sound = sound.Replace(@"[L]", "4");
			sound = sound.Replace(@"[MN]", "5");
			sound = sound.Replace(@"[R]", "6");

			// Properly done census: squeeze out H and W before doing adjacent 
			// digit removal.

			if (censusMode == 1)
				sound = sound.Replace(@"\.", String.Empty);

			// Remove extra equal adjacent digits

			Int32 soundLength = sound.Length;
			Char lastChar = '\0';
			temp = String.Empty;

			for (Int32 i = 0; i < soundLength; i++)
			{
				Char currentChar = sound[i];
				if (currentChar == lastChar)
					temp += " ";
				else
				{
					temp += currentChar;
					lastChar = currentChar;
				}
			}

			sound = temp;

			// Drop the first letter code
			sound = sound.Substring(1);

			// Remove spaces
			sound = sound.Replace(@"\s", String.Empty);

			// Remove zeros
			sound = sound.Replace(@"0", String.Empty);

			// Right-pad with zeros
			sound += "0000000000";

			// Add the first letter of the word
			sound = firstLetter + sound;

			// And, finally, size to taste
			sound = sound.Substring(0, soundexLength);

			return sound;
		}

		#endregion

		#region Helpers

		public static String SeparateWordsWithUnderscores(String text)
		{
			return
				Regex.Replace(
					Regex.Replace(
						Regex.Replace(text, @"([A-Z]+)([A-Z][a-z])", "$1_$2")
						, @"([a-z\d])([A-Z])"
						, "$1_$2")
					, @"[-\s]"
					, "_");
		}

		#endregion
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Chief Technology Officer Shift iQ
Canada Canada
I have been building software systems for more than 20 years, working for organizations that range from small non-profit associations in my local community to global Fortune 500 enterprises.

I specialize in the design and implementation of online database solutions. My work-related research interests include software design patterns and information architecture.

Comments and Discussions