|
/* Aho-Corasick text search algorithm implementation
*
* For more information visit
* - http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
*/
using System;
using System.Collections.Generic;
namespace Devsense.Text
{
/// <summary>
/// Interface containing all methods to be implemented
/// by string search algorithm
/// </summary>
public interface IStringSearchAlgorithm
{
#region Methods & Properties
/// <summary>
/// List of keywords to search for
/// </summary>
string[] Keywords { get; }
/// <summary>
/// Searches passed text and returns all occurrences of any keyword
/// </summary>
/// <param name="text">Text to search</param>
/// <returns>Array of occurrences</returns>
StringSearchResult[] FindAll(string text);
/// <summary>
/// Searches passed text and returns first occurrence of any keyword
/// </summary>
/// <param name="text">Text to search</param>
/// <returns>First occurrence of any keyword (or StringSearchResult.Empty if text doesn't contain any keyword)</returns>
StringSearchResult FindFirst(string text);
/// <summary>
/// Searches passed text and returns true if text contains any keyword
/// </summary>
/// <param name="text">Text to search</param>
/// <returns>True when text contains any keyword</returns>
bool ContainsAny(string text);
#endregion
}
/// <summary>
/// Structure containing results of search
/// (keyword and position in original text)
/// </summary>
public struct StringSearchResult
{
#region Members
private int _index;
private string _keyword;
/// <summary>
/// Initialize string search result
/// </summary>
/// <param name="index">Index in text</param>
/// <param name="keyword">Found keyword</param>
public StringSearchResult(int index, string keyword)
{
_index = index; _keyword = keyword;
}
/// <summary>
/// Returns index of found keyword in original text
/// </summary>
public int Index
{
get { return _index; }
}
/// <summary>
/// Returns keyword found by this result
/// </summary>
public string Keyword
{
get { return _keyword; }
}
/// <summary>
/// Returns empty search result
/// </summary>
public static StringSearchResult Empty
{
get { return new StringSearchResult(-1, ""); }
}
#endregion
}
/// <summary>
/// Class for searching string for one or multiple
/// keywords using efficient Aho-Corasick search algorithm
/// </summary>
public class StringSearch : IStringSearchAlgorithm
{
#region Objects
/// <summary>
/// Tree node representing character and its
/// transition and failure function
/// </summary>
class TreeNode
{
#region Constructor & Methods
/// <summary>
/// Initialize tree node with specified character
/// </summary>
/// <param name="parent">Parent node</param>
/// <param name="c">Character</param>
public TreeNode(TreeNode parent, char c)
{
_char = c;
_parent = parent;
_transitions = new Dictionary<char,TreeNode>();
_results = new List<string>();
}
/// <summary>
/// Adds pattern ending in this node
/// </summary>
/// <param name="result">Pattern</param>
public void AddResult(string result)
{
if (_results.Contains(result)) return;
_results.Add(result);
}
/// <summary>
/// Adds transition node
/// </summary>
/// <param name="node">Node</param>
public void AddTransition(TreeNode node)
{
_transitions.Add(node.Char, node);
}
/// <summary>
/// Returns transition to specified character (if exists)
/// </summary>
/// <param name="c">Character</param>
/// <returns>Returns TreeNode or null</returns>
public TreeNode GetTransition(char c)
{
TreeNode result;
_transitions.TryGetValue(c, out result);
return result;
}
/// <summary>
/// Returns true if node contains transition to specified character
/// </summary>
/// <param name="c">Character</param>
/// <returns>True if transition exists</returns>
public bool ContainsTransition(char c)
{
return GetTransition(c) != null;
}
#endregion
#region Properties
private char _char;
private TreeNode _parent;
private TreeNode _failure;
private List<string> _results;
private Dictionary<char, TreeNode> _transitions;
/// <summary>
/// Character
/// </summary>
public char Char
{
get { return _char; }
}
/// <summary>
/// Parent tree node
/// </summary>
public TreeNode Parent
{
get { return _parent; }
}
/// <summary>
/// Failure function - descendant node
/// </summary>
public TreeNode Failure
{
get { return _failure; }
internal set { _failure = value; }
}
public IEnumerable<TreeNode> Transitions
{
get
{
foreach (var node in _transitions.Values)
yield return node;
}
}
/// <summary>
/// Returns list of patterns ending by this letter
/// </summary>
public List<string> Results
{
get { return _results; }
}
#endregion
}
#endregion
#region Local fields
/// <summary>
/// Root of keyword tree
/// </summary>
private TreeNode _root;
/// <summary>
/// Keywords to search for
/// </summary>
private string[] _keywords;
private bool _matchWholeworld;
#endregion
#region Initialization
/// <summary>
/// Initialize search algorithm (Build keyword tree)
/// </summary>
/// <param name="keywords">Keywords to search for</param>
public StringSearch(string[] keywords, bool matchWholeWorld=true)
{
_matchWholeworld = matchWholeWorld;
_keywords = keywords;
BuildTree();
}
/// <summary>
/// Initialize search algorithm with no keywords
/// (Use Keywords property)
/// </summary>
public StringSearch()
{ }
#endregion
#region Implementation
public
/// <summary>
/// Build tree from specified keywords
/// </summary>
void BuildTree()
{
// Build keyword tree and transition function
_root = new TreeNode(null, ' ');
foreach (string p in _keywords)
{
// add pattern to tree
TreeNode nd = _root;
foreach (char c in p)
{
TreeNode ndNew = null;
foreach (TreeNode trans in nd.Transitions)
if (trans.Char == c) { ndNew = trans; break; }
if (ndNew == null)
{
ndNew = new TreeNode(nd, c);
nd.AddTransition(ndNew);
}
nd = ndNew;
}
nd.AddResult(p);
}
// Find failure functions
List<TreeNode> nodes = new List<TreeNode>();
// level 1 nodes - fail to root node
foreach (TreeNode nd in _root.Transitions)
{
nd.Failure = _root;
foreach (TreeNode trans in nd.Transitions) nodes.Add(trans);
}
if (_matchWholeworld)
{
// other nodes - using BFS
while (nodes.Count != 0)
{
List<TreeNode> newNodes = new List<TreeNode>();
foreach (TreeNode nd in nodes)
{
nd.Failure = _root;
// add child nodes to BFS list
foreach (TreeNode child in nd.Transitions)
newNodes.Add(child);
}
nodes = newNodes;
}
return;
}
// other nodes - using BFS
while (nodes.Count != 0)
{
List<TreeNode> newNodes = new List<TreeNode>();
foreach (TreeNode nd in nodes)
{
TreeNode r = nd.Parent.Failure;
char c = nd.Char;
while (r != null && !r.ContainsTransition(c)) r = r.Failure;
if (r == null)
nd.Failure = _root;
else
{
nd.Failure = r.GetTransition(c);
foreach (string result in nd.Failure.Results)
nd.AddResult(result);
}
// add child nodes to BFS list
foreach (TreeNode child in nd.Transitions)
newNodes.Add(child);
}
nodes = newNodes;
}
_root.Failure = _root;
}
#endregion
#region Methods & Properties
/// <summary>
/// Keywords to search for (setting this property is slow, because
/// it requieres rebuilding of keyword tree)
/// </summary>
public string[] Keywords
{
get { return _keywords; }
}
/// <summary>
/// Searches passed text and returns all occurrences of any keyword
/// </summary>
/// <param name="text">Text to search</param>
/// <returns>Array of occurrences</returns>
public StringSearchResult[] FindAll(string text)
{
List<StringSearchResult> ret = new List<StringSearchResult>();
TreeNode ptr = _root;
int index = 0;
while (index < text.Length)
{
TreeNode trans = null;
while (trans == null)
{
trans = ptr.GetTransition(text[index]);
if (ptr == _root) break;
if (trans == null) ptr = ptr.Failure;
}
if (trans != null) ptr = trans;
foreach (string found in ptr.Results)
ret.Add(new StringSearchResult(index - found.Length + 1, found));
index++;
}
return ret.ToArray();
}
/// <summary>
/// Searches passed text and returns first occurrence of any keyword
/// </summary>
/// <param name="text">Text to search</param>
/// <returns>First occurrence of any keyword (or StringSearchResult.Empty if text doesn't contain any keyword)</returns>
public StringSearchResult FindFirst(string text)
{
TreeNode ptr = _root;
int index = 0;
while (index < text.Length)
{
TreeNode trans = null;
while (trans == null)
{
trans = ptr.GetTransition(text[index]);
if (ptr == _root) break;
if (trans == null) ptr = ptr.Failure;
}
if (trans != null) ptr = trans;
foreach (string found in ptr.Results)
return new StringSearchResult(index - found.Length + 1, found);
index++;
}
return StringSearchResult.Empty;
}
/// <summary>
/// Searches passed text and returns true if text contains any keyword
/// </summary>
/// <param name="text">Text to search</param>
/// <returns>True when text contains any keyword</returns>
public bool ContainsAny(string text)
{
TreeNode ptr = _root;
int index = 0;
while (index < text.Length)
{
TreeNode trans = null;
while (trans == null)
{
trans = ptr.GetTransition(text[index]);
if (ptr == _root) break;
if (trans == null) ptr = ptr.Failure;
}
if (trans != null) ptr = trans;
if (ptr.Results.Count > 0)
{
if (_matchWholeworld )
{
int start = Math.Max(0,index - ptr.Results[0].Length);
// in this case only one result can be present
if ((start == 0 || !char.IsLetterOrDigit(text[start])) &&
(index+1 == text.Length || !char.IsLetterOrDigit(text[index+1])))
return true;
}
else
return true;
}
index++;
}
return false;
}
#endregion
}
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.