Unicode compliant multilingual word breaker

Member 96
Rate me:
3.43/5 (18 votes)
21 Apr 2005CPOL9 min read
87.6K
1.5K
A simple but effective multilingual word breaker for use with text retrieval systems.
breaker_src.zip
- WordBreaker.cs
breaker_demo.zip
- App.ico
- AssemblyInfo.cs
- Form1.cs
- Form1.resx
- WordBreaker.cs
- WordBreaker.csproj
- WordBreaker.csproj.user
- WordBreaker.sln
using System;
using System.Text;
using System.Collections;
using System.Globalization;

namespace GZTW.SearchStuff
{
	#region WordBreaker class
	/// <summary>
	/// A unicode and globalization aware word breaker.
	/// With a simplistic but effective n-gram method of
	/// 2 character overlap match for CJK (Chinese, Japanese, Korean) 
	/// and other languages that do not have any concept of word boundaries.
	/// 
	/// 
	/// Helpful links:
	/// 
	/// Unicode in brief:
	/// 
	/// "Bluffers guide to unicode": http://www.jbrowse.com/text/unicode.shtml
	/// 
	/// "On the use of Words and N-grams for Chinese Information Retrieval"
	/// http://research.microsoft.com/research/pubs/view.aspx?pubid=957
	/// 
	/// 
	/// </summary>
	public sealed class WordBreaker
	{
		/// <summary>
		/// Used in breaking process
		/// </summary>
		private enum TokenTypes
		{Nothing,Separator,CJK,Latin};
		
		
		//Holds stop words - words not to be included in results
		static Hashtable StopWords;
		
		static WordBreaker()
		{
			/*  
			  This is a simplistic example of using StopWords
			stopwords are words that are so common they are not useful
			when searching for text.  
			This list contains the most often
			used words in english, however for a true internationalized application
			you will of course want to load this dynamically instead or allow the
			end user a method of entering as they wish
			  
			 */
			
			
			#region StopWords
			StopWords=new Hashtable(118);//currently there are 117 words below
			StopWords.Add("about",0);
			StopWords.Add("after",0);
			StopWords.Add("all",0);
			StopWords.Add("also",0);
			StopWords.Add("an",0);
			StopWords.Add("and",0);
			StopWords.Add("another",0);
			StopWords.Add("any",0);
			StopWords.Add("are",0);
			StopWords.Add("as",0);
			StopWords.Add("at",0);
			StopWords.Add("be",0);
			StopWords.Add("because",0);
			StopWords.Add("been",0);
			StopWords.Add("before",0);
			StopWords.Add("being",0);
			StopWords.Add("between",0);
			StopWords.Add("both",0);
			StopWords.Add("but",0);
			StopWords.Add("by",0);
			StopWords.Add("came",0);
			StopWords.Add("can",0);
			StopWords.Add("come",0);
			StopWords.Add("could",0);
			StopWords.Add("did",0);
			StopWords.Add("do",0);
			StopWords.Add("does",0);
			StopWords.Add("each",0);
			StopWords.Add("else",0);
			StopWords.Add("for",0);
			StopWords.Add("from",0);
			StopWords.Add("get",0);
			StopWords.Add("got",0);
			StopWords.Add("had",0);
			StopWords.Add("has",0);
			StopWords.Add("have",0);
			StopWords.Add("he",0);
			StopWords.Add("her",0);
			StopWords.Add("here",0);
			StopWords.Add("him",0);
			StopWords.Add("himself",0);
			StopWords.Add("his",0);
			StopWords.Add("how",0);
			StopWords.Add("if",0);
			StopWords.Add("in",0);
			StopWords.Add("into",0);
			StopWords.Add("is",0);
			StopWords.Add("it",0);
			StopWords.Add("its",0);
			StopWords.Add("just",0);
			StopWords.Add("like",0);
			StopWords.Add("make",0);
			StopWords.Add("many",0);
			StopWords.Add("me",0);
			StopWords.Add("might",0);
			StopWords.Add("more",0);
			StopWords.Add("most",0);
			StopWords.Add("much",0);
			StopWords.Add("must",0);
			StopWords.Add("my",0);
			StopWords.Add("never",0);
			StopWords.Add("now",0);
			StopWords.Add("of",0);
			StopWords.Add("on",0);
			StopWords.Add("only",0);
			StopWords.Add("or",0);
			StopWords.Add("other",0);
			StopWords.Add("our",0);
			StopWords.Add("out",0);
			StopWords.Add("over",0);
			StopWords.Add("re",0);
			StopWords.Add("said",0);
			StopWords.Add("same",0);
			StopWords.Add("see",0);
			StopWords.Add("should",0);
			StopWords.Add("since",0);
			StopWords.Add("so",0);
			StopWords.Add("some",0);
			StopWords.Add("still",0);
			StopWords.Add("such",0);
			StopWords.Add("take",0);
			StopWords.Add("than",0);
			StopWords.Add("that",0);
			StopWords.Add("the",0);
			StopWords.Add("their",0);
			StopWords.Add("them",0);
			StopWords.Add("then",0);
			StopWords.Add("there",0);
			StopWords.Add("these",0);
			StopWords.Add("they",0);
			StopWords.Add("this",0);
			StopWords.Add("those",0);
			StopWords.Add("through",0);
			StopWords.Add("to",0);
			StopWords.Add("too",0);
			StopWords.Add("under",0);
			StopWords.Add("up",0);
			StopWords.Add("use",0);
			StopWords.Add("very",0);
			StopWords.Add("want",0);
			StopWords.Add("was",0);
			StopWords.Add("way",0);
			StopWords.Add("we",0);
			StopWords.Add("well",0);
			StopWords.Add("were",0);
			StopWords.Add("what",0);
			StopWords.Add("when",0);
			StopWords.Add("where",0);
			StopWords.Add("which",0);
			StopWords.Add("while",0);
			StopWords.Add("who",0);
			StopWords.Add("will",0);
			StopWords.Add("with",0);
			StopWords.Add("would",0);
			StopWords.Add("you",0);
			StopWords.Add("your",0);
			
			
			#endregion
		}


		

		/// <summary>
		/// Take an array of strings and break it into
		/// separate words or for languages with no concept of "words"
		/// use a rolling two character window system.
		/// 
		/// </summary>
		/// <param name="breakStyleCJK">true=use rolling two character style break on non Latin characters, false=identify word boundaries</param>
		/// <param name="returnAsXml">true=return as XML fragment suitable for passing to stored procedures etc, false=return as comma delimited string</param>
		/// <param name="text">one or more strings containing source text to be broken</param>
		/// <returns>Either a fragment of XML or a comma delimited string containing a list of unique "words" found in source text</returns>
		public static string Break(bool breakStyleCJK, bool returnAsXml, params string[] text)
		{		

			//Maximum number of characters to 
			//include in a word.
			const int MAXWORDLENGTH=255;

			StringBuilder sbResults=new StringBuilder();
			
			//Hashtable to temporarily hold parsed words
			//used to easily ensure unique words only
			Hashtable ht = new Hashtable();

			//Stuff required for creating xml fragment on the fly in memory (string)
			StringBuilder sb = new StringBuilder();
			StringBuilder sbWord=new StringBuilder();
			System.IO.StringWriter sr=new System.IO.StringWriter(sb,CultureInfo.InvariantCulture);
			System.Xml.XmlTextWriter w = new System.Xml.XmlTextWriter(sr);

			w.Formatting = System.Xml.Formatting.Indented;
			w.WriteStartElement("Items");		

		
			//Loop through each of the passed in strings
			foreach(string InputTextString in text)
			{
				if(InputTextString==null) continue;
				//get all the characters in a unicode compliant manner.
				//what's cool about this is it will automatically get the characters from 
				//right to left if appropriate based on the unicode range and handles
				//glyphs that in effect take more than one character internally to represent a single glyph
				//(glyph being what you see on the screen as a character and for the purposes of breaking text
				// a glyph is ultimately what we're interested in)
				TextElementEnumerator CurrentInputTextElement = StringInfo.GetTextElementEnumerator( InputTextString );
				
				//start at the top
				CurrentInputTextElement.Reset();

				//A flag to keep track of what the last token was
				//this is used to detect the transition over important boundaries in the 
				//source input text such as a transition from CJK to latin characters or
				//from latin characters to spaces, punctuation etc.
				TokenTypes LastToken=TokenTypes.Nothing;				

				//Used by CJK
				bool IsInUnicodeLatinBlock=true;
				
				//Process each "character" (text element,glyph whatever) in the 
				//current string
				while (CurrentInputTextElement.MoveNext())  
				{	
					//get it as a character
					char c=CurrentInputTextElement.GetTextElement()[0];

					if(!breakStyleCJK)
					{	
						#region regular tokenizer
						
						//Is it a token we want to include?
						if(char.IsLetterOrDigit(c) )
						{
							#region Include token
							//All latin text is converted to lower case
							c=char.ToLower(c,CultureInfo.InvariantCulture);
							
							//Do we already have a word?
							if(sbWord.Length>0)
							{
								//Maybe we need to flush this word into the word list
								//if we're over the word length limit 
								if(sbWord.Length >= MAXWORDLENGTH)
								{
									//flush away...
									if(!ht.ContainsKey(sbWord.ToString()))
									{
										ht[sbWord.ToString()]=1;															
									}
									sbWord.Length=0;
									sbWord.Append(c);
									LastToken=TokenTypes.Latin;	
									continue;

								}
							}

							//append character and go on to next one
							sbWord.Append(c);
							LastToken=TokenTypes.Latin;	
							continue;
							#endregion
						}
						else
						{								
							#region Word Boundary token
							LastToken=TokenTypes.Separator;
							if(sbWord.Length>0)
							{
								//flush away...
								if(!ht.ContainsKey(sbWord.ToString()))
								{
									ht[sbWord.ToString()]=1;															
								}	
								sbWord.Length=0;								
									
								continue;

							}

							#endregion
						}
						#endregion
					}					
					else
					{
						#region CJK Tokenizer						

						//Is it a basic latin or Latin-1 charater? (ascii basically)
						//see: http://www.unicode.org/charts/index.html
						//and here for a funky online viewer:
						//http://www.fileformat.info/info/unicode/block/index.htm
						//we need to know this so that regular english text
						//within CJK+ text gets properly indexed as whole words
						IsInUnicodeLatinBlock=false;
						//simple test to see if the character is in the range of the 
						//basic latin  or Latin-1 range in unicode 
						if((int)c<256) IsInUnicodeLatinBlock=true;

						if(IsInUnicodeLatinBlock)
						{
							//Is it a token we want to include?
							if(char.IsLetterOrDigit(c))
							{
								#region Latin Include token
								//All latin text is converted to lower case
								c=char.ToLower(c,CultureInfo.InvariantCulture);

								//Do we already have a word?
								if(sbWord.Length>0)
								{
									//Maybe we need to flush this word into the word list
									//if we're over the word length limit or we are going from 
									//CJK to latin
									if(LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH)
									{
										//flush away...
										if(!ht.ContainsKey(sbWord.ToString()))
										{
											ht[sbWord.ToString()]=1;															
										}
										sbWord.Length=0;
										sbWord.Append(c);
										LastToken=TokenTypes.Latin;	
										continue;

									}
								}

								//append character and go on to next one
								sbWord.Append(c);
								LastToken=TokenTypes.Latin;	
								continue;
								#endregion
							}
							else
							{								
								#region Latin Word Boundary token
								LastToken=TokenTypes.Separator;
								if(sbWord.Length>0)
								{
									//flush away...
									if(!ht.ContainsKey(sbWord.ToString()))
									{
										ht[sbWord.ToString()]=1;															
									}	
									sbWord.Length=0;								
									
									continue;

								}

								#endregion
							}

						}
						else//CJK character
						{
							if(char.IsLetter(c))
							{
								#region CJK Include token
								//Do we already have a word?
								if(sbWord.Length>0)
								{
									//Maybe we need to flush this word into the word list
									//if we're over the word length limit or we are going from 
									//latin TO CJK 
									if(LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
									{
										//flush away...
										if(!ht.ContainsKey(sbWord.ToString()))
										{
											ht[sbWord.ToString()]=1;															
										}	
										sbWord.Length=0;
										sbWord.Append(c);
										LastToken=TokenTypes.CJK;	
										continue;

									}

									if(LastToken==TokenTypes.CJK)
									{
										//we're here because there is more than zero characters already stored
										//and the last was CJK so we need append current character
										//and flush the resultant 2 character n-gram 
										sbWord.Append(c);
										System.Diagnostics.Debug.Assert(sbWord.Length==2);
										if(!ht.ContainsKey(sbWord.ToString()))
										{
											ht[sbWord.ToString()]=1;															
										}
										sbWord.Length=0;
										sbWord.Append(c);
										LastToken=TokenTypes.CJK;	
										continue;

									}
								}

								//append character and go on to next one
								sbWord.Append(c);
								LastToken=TokenTypes.CJK;	
								continue;
								#endregion


							}
							else
							{								
								#region CJK Word Boundary token
								LastToken=TokenTypes.Separator;
								if(sbWord.Length>0)
								{
									//flush away...
									if(!ht.ContainsKey(sbWord.ToString()))
									{
										ht[sbWord.ToString()]=1;															
									}
									sbWord.Length=0;									
									
									continue;

								}

								#endregion
							}
							

						}



						


						#endregion
					}					
				}
				
				//Flush out the last word
				if(sbWord.Length>0)
				{
					//flush away...
					if(!ht.ContainsKey(sbWord.ToString()))
					{
						ht[sbWord.ToString()]=1;															
					}
				}
			}


			#region Return results

			if(returnAsXml)
			{
				//Make a return xml fragment
				//from the word list
				foreach(string word in ht.Keys)
				{
					//Add only non stopwords
					if(!StopWords.Contains(word))
					{
						w.WriteStartElement("i");
						w.WriteAttributeString("w",word);
						w.WriteEndElement();				
					}
				}

				w.WriteEndElement();
				sr.Close();
				return sr.ToString();
			}
			else
			{
				//Make a return string array
				//from the word list
				foreach(string word in ht.Keys)
				{
					//Add only non stopwords
					if(!StopWords.Contains(word))
					{
						sbResults.Append(word+",");
										
					}					

				}


				//don't leave that comma trailing at the end...
				sbResults.Length=sbResults.Length-1;

				return sbResults.ToString();

			}
			#endregion return results
			

		}

		

	}//end of class
	#endregion WordBreakerClass
}
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.
License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)
Written By
Member 96
Canada
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.
Unicode compliant multilingual word breaker

License

Comments and Discussions