Click here to Skip to main content
Click here to Skip to main content
Add your own
alternative version
Go to top

Aho-Corasick string matching in C#

, 3 Dec 2005
A C# implementation of the very efficient Aho-Corasick keyword matching algorithm with multiple keywords support.
demo.zip
EeekSoft.Text.Test
App.ico
bin
comparsion.xls
EeekSoft.Text.Test.csproj.user
EeekSoft.Text.Test.csproj.vspscc
mssccprj.scc
vssver2.scc
EeekSoft.Text
ahocorasick
graph10kb.gif
graph1kb.gif
tree1.gif
tree2.gif
vssver2.scc
comparsion.xls
EeekSoft.Text.csproj.user
EeekSoft.Text.csproj.vspscc
mssccprj.scc
vssver2.scc
source.zip
graph10kb.gif
graph1kb.gif
tree1.gif
tree2.gif
vssver2.scc
bin
comparsion.xls
EeekSoft.Text.csproj.user
EeekSoft.Text.csproj.vspscc
mssccprj.scc
vssver2.scc
EeekSoft.Text (.Net 2)
EeekSoft.Text.dll
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

using EeekSoft.Text;

namespace AhoCorasickSearch
{
	/// <summary>
	/// Search using string.IndexOf (for comparsion)
	/// </summary>
	class IndexOfSearch : IStringSearchAlgorithm
	{
		#region IStringSearchAlgorithm Members

		string[] _keywords;

		public string[] Keywords
		{
			get { return _keywords; }
			set { _keywords=value; }
		}

		public StringSearchResult[] FindAll(string text)
		{
			throw new NotImplementedException();
		}

		public StringSearchResult FindFirst(string text)
		{
			foreach(string kwd in _keywords)
			{
				int i=text.IndexOf(kwd);
				if (i!=-1) return new StringSearchResult(i,kwd);
			}
			return StringSearchResult.Empty;
		}

		public bool ContainsAny(string text)
		{
			foreach(string kwd in _keywords)
			{
				int i=text.IndexOf(kwd);
				if (i!=-1) return true;
			}
			return false;		
		}

		#endregion
	}


	/// <summary>
	/// Search using regular expressions (for comparsion)
	/// </summary>
	class RegexSearch : IStringSearchAlgorithm
	{
		#region IStringSearchAlgorithm Members

		string[] _keywords;
		Regex _reg;

		public string[] Keywords
		{
			get { return _keywords; }
			set 
			{
				_keywords=value; 
				_reg=new Regex("("+string.Join("|",value)+")",RegexOptions.None);
			}
		}

		public StringSearchResult[] FindAll(string text)
		{
			throw new NotImplementedException();
		}

		public StringSearchResult FindFirst(string text)
		{
			throw new NotImplementedException();
		}

		public bool ContainsAny(string text)
		{
			return _reg.Match(text).Success;
		}

		#endregion
	}


	/// <summary>
	/// Testing application
	/// </summary>
	class TestApp
	{
		/// <summary>
		/// Minimal and maximal word length
		/// </summary>
		const int MaxWordLength = 10;
		const int MinWordLength = 3;
		
		/// <summary>
		/// Allowed letters in word
		/// </summary>
		const string AllowedLetters="abcdefghijklmnopqrstuvwxyz";

		#region Utility methods

		static Random rnd=new Random(12345);

		/// <summary>
		/// Generate random word
		/// </summary>
		public static string GetRandomWord()
		{
			StringBuilder sb=new StringBuilder();
      for(int i=0; i<rnd.Next(MaxWordLength-MinWordLength)+MinWordLength; i++)
				sb.Append(AllowedLetters[rnd.Next(AllowedLetters.Length)]);
			return sb.ToString();
		}


		/// <summary>
		/// Generate list of random keywords
		/// </summary>
		public static string[] GetRandomKeywords(int count)
		{
			string[] ret=new string[count];
			for(int i=0; i<count; i++)
				ret[i]=GetRandomWord();
			return ret;
		}


		/// <summary>
		/// Generate random text
		/// </summary>
		public static string GetRandomText(int count)
		{
			StringBuilder sb=new StringBuilder();
			while(sb.Length<count) { sb.Append(GetRandomWord()); sb.Append(" "); }
			return sb.ToString();
		}

		#endregion

		/// <summary>
		/// The main entry point for the application.
		/// </summary>
		[STAThread]
		static void Main(string[] args)
		{
			// set language to english
			// (because in czech "ch" is one letter and "abch".IndexOf("bc") returns -1)
			System.Threading.Thread.CurrentThread.CurrentCulture=System.Globalization.CultureInfo.CreateSpecificCulture("en");

			// algorithms
			IStringSearchAlgorithm[] algorithms=new IStringSearchAlgorithm[3];
			algorithms[0]=new StringSearch();
			algorithms[1]=new IndexOfSearch();
			algorithms[2]=new RegexSearch();

			double[] results=new double[3];
			for(int j=0; j<100; j++)
			{
				// generate random keywords and random text
				string[] keywords=GetRandomKeywords(80);
				string text=GetRandomText(2000);

				// insert keyword into text
				text=text.Insert(rnd.Next(text.Length),keywords[rnd.Next(keywords.Length)]);
				
				// for each algorithm...
				double first=0;
				for(int algIndex=0; algIndex<algorithms.Length; algIndex++)
				{
					IStringSearchAlgorithm alg=algorithms[algIndex];
					alg.Keywords=keywords;

					// search for keywords and measure performance
					HiPerfTimer tmr=new HiPerfTimer();
					tmr.Start();
					for(int i=0; i<10; i++)
					{
						if (!alg.ContainsAny(text)) 
						{ Console.WriteLine(" Invalid result!"); return; }
					}
					tmr.Stop();

					// write results
					if (first==0) first=tmr.Duration;
					results[algIndex]+=tmr.Duration;
					Console.WriteLine("{0}: {1:0.00000}ms  ({2:0.00}%)",alg.GetType().Name.PadRight(15),tmr.Duration,
						100.0*tmr.Duration/first);
				}
				Console.WriteLine();
			}


			// Write summary results for all test
			Console.WriteLine("\n===SUMMARY===");
			double finalFirst=results[0];
			for(int j=0; j<algorithms.Length; j++)
			{
				IStringSearchAlgorithm alg=algorithms[j];
				Console.WriteLine("{0}: {1:0.00000}ms  ({2:0.00}%)",alg.GetType().Name.PadRight(15),results[j],
					100.0*results[j]/finalFirst);
			}

			Console.WriteLine();
		}
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Share

About the Author

Tomas Petricek

Czech Republic Czech Republic
I live in Prague, the capital city of Czech republic (most of the time Smile | :) ). I've been very interested in functional programming recently and I have a passion for the new Microsoft F# language. I'm writing a book about Functional Programming in the Real World that shows the ideas using examples in C# 3.0 and F#.
 
I've been Microsoft MVP (for C#) since 2004 and I'm one of the most active members of the F# community. I'm a computer science student at Charles University of Prague. My hobbies include photography, fractals and of course many things related to computers (except fixing them). My favorite book writers are Terry Pratchett and Philip K Dick and I like paintings by M. C. Escher.
 
PS: My favorite codeproject icon is Sheep | [baah] .

| Advertise | Privacy | Mobile
Web02 | 2.8.140916.1 | Last Updated 3 Dec 2005
Article Copyright 2005 by Tomas Petricek
Everything else Copyright © CodeProject, 1999-2014
Terms of Service
Layout: fixed | fluid