using System;
using System.Text;
using System.Collections;
using System.Globalization;
namespace GZTW.SearchStuff
{
#region WordBreaker class
/// <summary>
/// A unicode and globalization aware word breaker.
/// With a simplistic but effective n-gram method of
/// 2 character overlap match for CJK (Chinese, Japanese, Korean)
/// and other languages that do not have any concept of word boundaries.
///
///
/// Helpful links:
///
/// Unicode in brief:
///
/// "Bluffers guide to unicode": http://www.jbrowse.com/text/unicode.shtml
///
/// "On the use of Words and N-grams for Chinese Information Retrieval"
/// http://research.microsoft.com/research/pubs/view.aspx?pubid=957
///
///
/// </summary>
public sealed class WordBreaker
{
/// <summary>
/// Used in breaking process
/// </summary>
private enum TokenTypes
{Nothing,Separator,CJK,Latin};
//Holds stop words - words not to be included in results
static Hashtable StopWords;
static WordBreaker()
{
/*
This is a simplistic example of using StopWords
stopwords are words that are so common they are not useful
when searching for text.
This list contains the most often
used words in english, however for a true internationalized application
you will of course want to load this dynamically instead or allow the
end user a method of entering as they wish
*/
#region StopWords
StopWords=new Hashtable(118);//currently there are 117 words below
StopWords.Add("about",0);
StopWords.Add("after",0);
StopWords.Add("all",0);
StopWords.Add("also",0);
StopWords.Add("an",0);
StopWords.Add("and",0);
StopWords.Add("another",0);
StopWords.Add("any",0);
StopWords.Add("are",0);
StopWords.Add("as",0);
StopWords.Add("at",0);
StopWords.Add("be",0);
StopWords.Add("because",0);
StopWords.Add("been",0);
StopWords.Add("before",0);
StopWords.Add("being",0);
StopWords.Add("between",0);
StopWords.Add("both",0);
StopWords.Add("but",0);
StopWords.Add("by",0);
StopWords.Add("came",0);
StopWords.Add("can",0);
StopWords.Add("come",0);
StopWords.Add("could",0);
StopWords.Add("did",0);
StopWords.Add("do",0);
StopWords.Add("does",0);
StopWords.Add("each",0);
StopWords.Add("else",0);
StopWords.Add("for",0);
StopWords.Add("from",0);
StopWords.Add("get",0);
StopWords.Add("got",0);
StopWords.Add("had",0);
StopWords.Add("has",0);
StopWords.Add("have",0);
StopWords.Add("he",0);
StopWords.Add("her",0);
StopWords.Add("here",0);
StopWords.Add("him",0);
StopWords.Add("himself",0);
StopWords.Add("his",0);
StopWords.Add("how",0);
StopWords.Add("if",0);
StopWords.Add("in",0);
StopWords.Add("into",0);
StopWords.Add("is",0);
StopWords.Add("it",0);
StopWords.Add("its",0);
StopWords.Add("just",0);
StopWords.Add("like",0);
StopWords.Add("make",0);
StopWords.Add("many",0);
StopWords.Add("me",0);
StopWords.Add("might",0);
StopWords.Add("more",0);
StopWords.Add("most",0);
StopWords.Add("much",0);
StopWords.Add("must",0);
StopWords.Add("my",0);
StopWords.Add("never",0);
StopWords.Add("now",0);
StopWords.Add("of",0);
StopWords.Add("on",0);
StopWords.Add("only",0);
StopWords.Add("or",0);
StopWords.Add("other",0);
StopWords.Add("our",0);
StopWords.Add("out",0);
StopWords.Add("over",0);
StopWords.Add("re",0);
StopWords.Add("said",0);
StopWords.Add("same",0);
StopWords.Add("see",0);
StopWords.Add("should",0);
StopWords.Add("since",0);
StopWords.Add("so",0);
StopWords.Add("some",0);
StopWords.Add("still",0);
StopWords.Add("such",0);
StopWords.Add("take",0);
StopWords.Add("than",0);
StopWords.Add("that",0);
StopWords.Add("the",0);
StopWords.Add("their",0);
StopWords.Add("them",0);
StopWords.Add("then",0);
StopWords.Add("there",0);
StopWords.Add("these",0);
StopWords.Add("they",0);
StopWords.Add("this",0);
StopWords.Add("those",0);
StopWords.Add("through",0);
StopWords.Add("to",0);
StopWords.Add("too",0);
StopWords.Add("under",0);
StopWords.Add("up",0);
StopWords.Add("use",0);
StopWords.Add("very",0);
StopWords.Add("want",0);
StopWords.Add("was",0);
StopWords.Add("way",0);
StopWords.Add("we",0);
StopWords.Add("well",0);
StopWords.Add("were",0);
StopWords.Add("what",0);
StopWords.Add("when",0);
StopWords.Add("where",0);
StopWords.Add("which",0);
StopWords.Add("while",0);
StopWords.Add("who",0);
StopWords.Add("will",0);
StopWords.Add("with",0);
StopWords.Add("would",0);
StopWords.Add("you",0);
StopWords.Add("your",0);
#endregion
}
/// <summary>
/// Take an array of strings and break it into
/// separate words or for languages with no concept of "words"
/// use a rolling two character window system.
///
/// </summary>
/// <param name="breakStyleCJK">true=use rolling two character style break on non Latin characters, false=identify word boundaries</param>
/// <param name="returnAsXml">true=return as XML fragment suitable for passing to stored procedures etc, false=return as comma delimited string</param>
/// <param name="text">one or more strings containing source text to be broken</param>
/// <returns>Either a fragment of XML or a comma delimited string containing a list of unique "words" found in source text</returns>
public static string Break(bool breakStyleCJK, bool returnAsXml, params string[] text)
{
//Maximum number of characters to
//include in a word.
const int MAXWORDLENGTH=255;
StringBuilder sbResults=new StringBuilder();
//Hashtable to temporarily hold parsed words
//used to easily ensure unique words only
Hashtable ht = new Hashtable();
//Stuff required for creating xml fragment on the fly in memory (string)
StringBuilder sb = new StringBuilder();
StringBuilder sbWord=new StringBuilder();
System.IO.StringWriter sr=new System.IO.StringWriter(sb,CultureInfo.InvariantCulture);
System.Xml.XmlTextWriter w = new System.Xml.XmlTextWriter(sr);
w.Formatting = System.Xml.Formatting.Indented;
w.WriteStartElement("Items");
//Loop through each of the passed in strings
foreach(string InputTextString in text)
{
if(InputTextString==null) continue;
//get all the characters in a unicode compliant manner.
//what's cool about this is it will automatically get the characters from
//right to left if appropriate based on the unicode range and handles
//glyphs that in effect take more than one character internally to represent a single glyph
//(glyph being what you see on the screen as a character and for the purposes of breaking text
// a glyph is ultimately what we're interested in)
TextElementEnumerator CurrentInputTextElement = StringInfo.GetTextElementEnumerator( InputTextString );
//start at the top
CurrentInputTextElement.Reset();
//A flag to keep track of what the last token was
//this is used to detect the transition over important boundaries in the
//source input text such as a transition from CJK to latin characters or
//from latin characters to spaces, punctuation etc.
TokenTypes LastToken=TokenTypes.Nothing;
//Used by CJK
bool IsInUnicodeLatinBlock=true;
//Process each "character" (text element,glyph whatever) in the
//current string
while (CurrentInputTextElement.MoveNext())
{
//get it as a character
char c=CurrentInputTextElement.GetTextElement()[0];
if(!breakStyleCJK)
{
#region regular tokenizer
//Is it a token we want to include?
if(char.IsLetterOrDigit(c) )
{
#region Include token
//All latin text is converted to lower case
c=char.ToLower(c,CultureInfo.InvariantCulture);
//Do we already have a word?
if(sbWord.Length>0)
{
//Maybe we need to flush this word into the word list
//if we're over the word length limit
if(sbWord.Length >= MAXWORDLENGTH)
{
//flush away...
if(!ht.ContainsKey(sbWord.ToString()))
{
ht[sbWord.ToString()]=1;
}
sbWord.Length=0;
sbWord.Append(c);
LastToken=TokenTypes.Latin;
continue;
}
}
//append character and go on to next one
sbWord.Append(c);
LastToken=TokenTypes.Latin;
continue;
#endregion
}
else
{
#region Word Boundary token
LastToken=TokenTypes.Separator;
if(sbWord.Length>0)
{
//flush away...
if(!ht.ContainsKey(sbWord.ToString()))
{
ht[sbWord.ToString()]=1;
}
sbWord.Length=0;
continue;
}
#endregion
}
#endregion
}
else
{
#region CJK Tokenizer
//Is it a basic latin or Latin-1 charater? (ascii basically)
//see: http://www.unicode.org/charts/index.html
//and here for a funky online viewer:
//http://www.fileformat.info/info/unicode/block/index.htm
//we need to know this so that regular english text
//within CJK+ text gets properly indexed as whole words
IsInUnicodeLatinBlock=false;
//simple test to see if the character is in the range of the
//basic latin or Latin-1 range in unicode
if((int)c<256) IsInUnicodeLatinBlock=true;
if(IsInUnicodeLatinBlock)
{
//Is it a token we want to include?
if(char.IsLetterOrDigit(c))
{
#region Latin Include token
//All latin text is converted to lower case
c=char.ToLower(c,CultureInfo.InvariantCulture);
//Do we already have a word?
if(sbWord.Length>0)
{
//Maybe we need to flush this word into the word list
//if we're over the word length limit or we are going from
//CJK to latin
if(LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH)
{
//flush away...
if(!ht.ContainsKey(sbWord.ToString()))
{
ht[sbWord.ToString()]=1;
}
sbWord.Length=0;
sbWord.Append(c);
LastToken=TokenTypes.Latin;
continue;
}
}
//append character and go on to next one
sbWord.Append(c);
LastToken=TokenTypes.Latin;
continue;
#endregion
}
else
{
#region Latin Word Boundary token
LastToken=TokenTypes.Separator;
if(sbWord.Length>0)
{
//flush away...
if(!ht.ContainsKey(sbWord.ToString()))
{
ht[sbWord.ToString()]=1;
}
sbWord.Length=0;
continue;
}
#endregion
}
}
else//CJK character
{
if(char.IsLetter(c))
{
#region CJK Include token
//Do we already have a word?
if(sbWord.Length>0)
{
//Maybe we need to flush this word into the word list
//if we're over the word length limit or we are going from
//latin TO CJK
if(LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
{
//flush away...
if(!ht.ContainsKey(sbWord.ToString()))
{
ht[sbWord.ToString()]=1;
}
sbWord.Length=0;
sbWord.Append(c);
LastToken=TokenTypes.CJK;
continue;
}
if(LastToken==TokenTypes.CJK)
{
//we're here because there is more than zero characters already stored
//and the last was CJK so we need append current character
//and flush the resultant 2 character n-gram
sbWord.Append(c);
System.Diagnostics.Debug.Assert(sbWord.Length==2);
if(!ht.ContainsKey(sbWord.ToString()))
{
ht[sbWord.ToString()]=1;
}
sbWord.Length=0;
sbWord.Append(c);
LastToken=TokenTypes.CJK;
continue;
}
}
//append character and go on to next one
sbWord.Append(c);
LastToken=TokenTypes.CJK;
continue;
#endregion
}
else
{
#region CJK Word Boundary token
LastToken=TokenTypes.Separator;
if(sbWord.Length>0)
{
//flush away...
if(!ht.ContainsKey(sbWord.ToString()))
{
ht[sbWord.ToString()]=1;
}
sbWord.Length=0;
continue;
}
#endregion
}
}
#endregion
}
}
//Flush out the last word
if(sbWord.Length>0)
{
//flush away...
if(!ht.ContainsKey(sbWord.ToString()))
{
ht[sbWord.ToString()]=1;
}
}
}
#region Return results
if(returnAsXml)
{
//Make a return xml fragment
//from the word list
foreach(string word in ht.Keys)
{
//Add only non stopwords
if(!StopWords.Contains(word))
{
w.WriteStartElement("i");
w.WriteAttributeString("w",word);
w.WriteEndElement();
}
}
w.WriteEndElement();
sr.Close();
return sr.ToString();
}
else
{
//Make a return string array
//from the word list
foreach(string word in ht.Keys)
{
//Add only non stopwords
if(!StopWords.Contains(word))
{
sbResults.Append(word+",");
}
}
//don't leave that comma trailing at the end...
sbResults.Length=sbResults.Length-1;
return sbResults.ToString();
}
#endregion return results
}
}//end of class
#endregion WordBreakerClass
}