Click here to Skip to main content
15,892,517 members
Articles / Programming Languages / C#

Aho-Corasick string matching in C#

Rate me:
Please Sign up or sign in to vote.
4.88/5 (39 votes)
3 Dec 2005CPOL5 min read 262.8K   7.8K   122  
A C# implementation of the very efficient Aho-Corasick keyword matching algorithm with multiple keywords support.
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

using EeekSoft.Text;

namespace AhoCorasickSearch
{
	/// <summary>
	/// Search using string.IndexOf (for comparsion)
	/// </summary>
	class IndexOfSearch : IStringSearchAlgorithm
	{
		#region IStringSearchAlgorithm Members

		string[] _keywords;

		public string[] Keywords
		{
			get { return _keywords; }
			set { _keywords=value; }
		}

		public StringSearchResult[] FindAll(string text)
		{
			throw new NotImplementedException();
		}

		public StringSearchResult FindFirst(string text)
		{
			foreach(string kwd in _keywords)
			{
				int i=text.IndexOf(kwd);
				if (i!=-1) return new StringSearchResult(i,kwd);
			}
			return StringSearchResult.Empty;
		}

		public bool ContainsAny(string text)
		{
			foreach(string kwd in _keywords)
			{
				int i=text.IndexOf(kwd);
				if (i!=-1) return true;
			}
			return false;		
		}

		#endregion
	}


	/// <summary>
	/// Search using regular expressions (for comparsion)
	/// </summary>
	class RegexSearch : IStringSearchAlgorithm
	{
		#region IStringSearchAlgorithm Members

		string[] _keywords;
		Regex _reg;

		public string[] Keywords
		{
			get { return _keywords; }
			set 
			{
				_keywords=value; 
				_reg=new Regex("("+string.Join("|",value)+")",RegexOptions.None);
			}
		}

		public StringSearchResult[] FindAll(string text)
		{
			throw new NotImplementedException();
		}

		public StringSearchResult FindFirst(string text)
		{
			throw new NotImplementedException();
		}

		public bool ContainsAny(string text)
		{
			return _reg.Match(text).Success;
		}

		#endregion
	}


	/// <summary>
	/// Testing application
	/// </summary>
	class TestApp
	{
		/// <summary>
		/// Minimal and maximal word length
		/// </summary>
		const int MaxWordLength = 10;
		const int MinWordLength = 3;
		
		/// <summary>
		/// Allowed letters in word
		/// </summary>
		const string AllowedLetters="abcdefghijklmnopqrstuvwxyz";

		#region Utility methods

		static Random rnd=new Random(12345);

		/// <summary>
		/// Generate random word
		/// </summary>
		public static string GetRandomWord()
		{
			StringBuilder sb=new StringBuilder();
      for(int i=0; i<rnd.Next(MaxWordLength-MinWordLength)+MinWordLength; i++)
				sb.Append(AllowedLetters[rnd.Next(AllowedLetters.Length)]);
			return sb.ToString();
		}


		/// <summary>
		/// Generate list of random keywords
		/// </summary>
		public static string[] GetRandomKeywords(int count)
		{
			string[] ret=new string[count];
			for(int i=0; i<count; i++)
				ret[i]=GetRandomWord();
			return ret;
		}


		/// <summary>
		/// Generate random text
		/// </summary>
		public static string GetRandomText(int count)
		{
			StringBuilder sb=new StringBuilder();
			while(sb.Length<count) { sb.Append(GetRandomWord()); sb.Append(" "); }
			return sb.ToString();
		}

		#endregion

		/// <summary>
		/// The main entry point for the application.
		/// </summary>
		[STAThread]
		static void Main(string[] args)
		{
			// set language to english
			// (because in czech "ch" is one letter and "abch".IndexOf("bc") returns -1)
			System.Threading.Thread.CurrentThread.CurrentCulture=System.Globalization.CultureInfo.CreateSpecificCulture("en");

			// algorithms
			IStringSearchAlgorithm[] algorithms=new IStringSearchAlgorithm[3];
			algorithms[0]=new StringSearch();
			algorithms[1]=new IndexOfSearch();
			algorithms[2]=new RegexSearch();

			double[] results=new double[3];
			for(int j=0; j<100; j++)
			{
				// generate random keywords and random text
				string[] keywords=GetRandomKeywords(80);
				string text=GetRandomText(2000);

				// insert keyword into text
				text=text.Insert(rnd.Next(text.Length),keywords[rnd.Next(keywords.Length)]);
				
				// for each algorithm...
				double first=0;
				for(int algIndex=0; algIndex<algorithms.Length; algIndex++)
				{
					IStringSearchAlgorithm alg=algorithms[algIndex];
					alg.Keywords=keywords;

					// search for keywords and measure performance
					HiPerfTimer tmr=new HiPerfTimer();
					tmr.Start();
					for(int i=0; i<10; i++)
					{
						if (!alg.ContainsAny(text)) 
						{ Console.WriteLine(" Invalid result!"); return; }
					}
					tmr.Stop();

					// write results
					if (first==0) first=tmr.Duration;
					results[algIndex]+=tmr.Duration;
					Console.WriteLine("{0}: {1:0.00000}ms  ({2:0.00}%)",alg.GetType().Name.PadRight(15),tmr.Duration,
						100.0*tmr.Duration/first);
				}
				Console.WriteLine();
			}


			// Write summary results for all test
			Console.WriteLine("\n===SUMMARY===");
			double finalFirst=results[0];
			for(int j=0; j<algorithms.Length; j++)
			{
				IStringSearchAlgorithm alg=algorithms[j];
				Console.WriteLine("{0}: {1:0.00000}ms  ({2:0.00}%)",alg.GetType().Name.PadRight(15),results[j],
					100.0*results[j]/finalFirst);
			}

			Console.WriteLine();
		}
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Czech Republic Czech Republic
I live in Prague, the capital city of Czech republic (most of the time Smile | :) ). I've been very interested in functional programming recently and I have a passion for the new Microsoft F# language. I'm writing a book about Functional Programming in the Real World that shows the ideas using examples in C# 3.0 and F#.

I've been Microsoft MVP (for C#) since 2004 and I'm one of the most active members of the F# community. I'm a computer science student at Charles University of Prague. My hobbies include photography, fractals and of course many things related to computers (except fixing them). My favorite book writers are Terry Pratchett and Philip K Dick and I like paintings by M. C. Escher.

PS: My favorite codeproject icon is Sheep | [baah] .

Comments and Discussions