Click here to Skip to main content
15,896,606 members
Articles / Programming Languages / C#

Word stemming for German on .NET Framework

Rate me:
Please Sign up or sign in to vote.
3.57/5 (6 votes)
22 Feb 20031 min read 80.6K   918   19  
An article on word stemming algorithm, implemented for German language on the .NET framework.
#define DEBUG_STEM

using System;

namespace StemmingLib
{
	/// <summary>
	/// Summary description for Class1.
	/// </summary>
	public class destemmer
	{

		private char[] word_buffer;
		private int STR_SIZE, R1, R2;
		private int BUFFER_SIZE;
		private const int INC = 20; // I found out this is optimal word string size

		public destemmer()
		{
			SetInitState();
		}

		private void SetInitState()
		{
			R1 = R2 = -1; STR_SIZE = 0;
			BUFFER_SIZE = INC;
			word_buffer = null;
			word_buffer = new char[BUFFER_SIZE];
		}

		private void Increment()
		{
			char[] tmp_buffer = word_buffer;
			BUFFER_SIZE += INC;
			word_buffer = new char[BUFFER_SIZE];
			tmp_buffer.CopyTo(word_buffer, 0);
			tmp_buffer = null;
		}

	
		// is char vowel
		/*
		 * The following letters are vowels: 
		 *		a   e   i   o   u   y   �   �   � 
		 */
		private bool vowel(char ch)
		{
			switch(ch)
			{
				case 'a': case 'e': case 'i': case 'o':
				case 'u': case 'y': case '�': case '�':
				case '�': return true;
				default: return false;
			}
		}

		// R1 and R2 set up in the standard way
		private int DefineR(int start)
		{
			int len = STR_SIZE;
			if(start == 0)
				start = 1;

			if((start < len) && (start > 0))
			{
				for(int i = start; i < len; ++i)
				{
					if( ( !vowel(word_buffer[i]) ) && vowel( word_buffer[(i-1)] ) )
						return ( (i - start) + start + 1);
				}
			}
			return -1;
		}

		private void SetWord(string strWord)
		{
			// First, replace � by ss
			string tmp = strWord.Replace("�", "ss");
			// adjust buffer size
			while(tmp.Length > BUFFER_SIZE)
				Increment();
			STR_SIZE = tmp.Length;
			// fill in to buffer
			tmp.CopyTo(0, word_buffer, 0, STR_SIZE);
		}

		private void CutEnd(int count){	STR_SIZE -= count; }

		private bool EndsWith(string end)
		{
			if(STR_SIZE > end.Length)
			{
				int stop = STR_SIZE - end.Length - 1;
				int j = end.Length - 1;
				for(int i = (STR_SIZE -1); i > stop; --i)
				{
					if(word_buffer[i] != end[j])
						return false;
					--j;
				}
				return true;
			}
			return false;
		}

		private void preprocess()
		{
			// put u and y between vowels into upper case
			for(int i=1; i<STR_SIZE; ++i)
			{
				if( word_buffer[i] == 'u' )
				{
					if( vowel(word_buffer[(i-1)]) && vowel(word_buffer[(i+1)]) )
						word_buffer[i] = 'U';
				}
				else if( word_buffer[i] == 'y' )
				{
					if( vowel(word_buffer[(i-1)]) && vowel(word_buffer[(i+1)]) )
						word_buffer[i] = 'Y';
				}
			}
			/*
			 * R1 and R2 are first set up in the standard way,
			 * but then R1 is adjusted 
			 * so that the region before it contains at least 3 letters
			 */
			R1 = DefineR(0);
			R2 = DefineR(R1);
			if((R1 < 3) && (R1 > -1))
				R1 = 3;
		}

		/*
		 * Search for the longest among the following suffixes, 
		 *		(a) e   em   en   ern   er   es
		 *		(b) s (preceded by a valid s-ending) 
		 * and delete if in R1.
		 * (Of course the letter of the valid s-ending is not necessarily in R1) 
		 * 
		 * (For example, �ckern -> �ck, ackers -> acker, armes -> arm) 
		 */
		private void Step1()
		{
			if(R1 < 0)
				return;
			// e   em   en   ern   er   es
			if(EndsWith("ern"))
			{
				if( (STR_SIZE - R1) >= 3 )
					CutEnd(3);
			}
			else if(EndsWith("em") || EndsWith("en") || EndsWith("er") || EndsWith("es"))
			{
				if( (STR_SIZE - R1) >= 2 )
					CutEnd(2);
			}
			else if(EndsWith("e"))
			{
				if( (STR_SIZE - R1) >= 1 )
					CutEnd(1);
			}
			// b, d, f, g, h, k, l, m, n, r or t
			else if(EndsWith("bs") || EndsWith("ds") || EndsWith("fs") ||
				EndsWith("gs") || EndsWith("hs") || EndsWith("ks") ||
				EndsWith("ls") || EndsWith("ms") || EndsWith("ns") ||
				EndsWith("rs") || EndsWith("ts") )
			{
				if( (STR_SIZE - R1) >= 1 )
					CutEnd(1);
			}
		}

		/*
		 * Search for the longest among the following suffixes, 
		 *		(a) en   er   est
		 *		(b) st (preceded by a valid st-ending, itself preceded by at least 3 letters) 
		 * and delete if in R1. 
		 * 
		 * (For example, derbsten -> derbst by step 1, and derbst -> derb by step 2, since b is a valid st-ending, and is preceded by just 3 letters)
		 */
		private void Step2()
		{
			if(R1 < 0)
				return;
			 // en   er   est
			if(EndsWith("est"))
			{
				if( (STR_SIZE - R1) >= 3 )
					CutEnd(3);
			}
			else if(EndsWith("en") || EndsWith("er") )
			{
				if( (STR_SIZE - R1) >= 2 )
					CutEnd(2);
			}
			// b, d, f, g, h, k, l, m, n or t
			else if(EndsWith("bst") || EndsWith("dst") || EndsWith("fst") ||
				EndsWith("gst") || EndsWith("hst") || EndsWith("kst") ||
				EndsWith("lst") || EndsWith("mst") || EndsWith("nst") ||
				EndsWith("tst") )
			{
				// preceded by at least 3 letters
				if(STR_SIZE > 5)
				{
					if( (STR_SIZE - R1) >= 2 )
						CutEnd(2);
				}
			}
		}

		private void Step3()
		{
			if((R2 < 0) || (R1 < 0))
				return;
			/*
			 * Search for the longest among the following suffixes, 
			 * and perform the action indicated. 
			 *		end   ung 
			 * delete if in R2 
			 * if preceded by ig, delete if in R2 and not preceded by e 
			 */
			if( EndsWith("end") || EndsWith("ung") )
			{
				if( (STR_SIZE - R2) >= 3 )
					CutEnd(3);
				if( EndsWith("ig") && (word_buffer[(STR_SIZE - 3)] != 'e') )
				{
					if( (STR_SIZE - R2) >= 2 )
						CutEnd(2);
				}
			}
			/*
			 * ig   ik   isch 
			 *		delete if in R2 and not preceded by e 
			 */
			else if( (EndsWith("ig") || EndsWith("ik")) && (word_buffer[(STR_SIZE - 3)] != 'e') )
			{
				if( (STR_SIZE - R2) >= 2 )
					CutEnd(2);
			}
			else if( EndsWith("isch") && (word_buffer[(STR_SIZE - 5)] != 'e') )
			{
				if( (STR_SIZE - R2) >= 4 )
					CutEnd(4);
			}
			/*
			 * lich   heit 
			 * delete if in R2 
			 * if preceded by er or en, delete if in R1 
			 */
			else if( EndsWith("lich") || EndsWith("heit") )
			{
				CutEnd(4);
				// if preceded by er or en, delete if in R1
				if( EndsWith("en") || EndsWith("er") )
				{
					if( (STR_SIZE - R1) >= 2 )
						CutEnd(2);
					else
						STR_SIZE += 4;
				}
				else
				{
					STR_SIZE += 4;
					if( (STR_SIZE - R2) >= 4 )
						CutEnd(4);
				}
			}
			/*
			 * keit 
			 * delete if in R2 
			 * if preceded by lich or ig, delete if in R2 
			 */
			else if(EndsWith("keit"))
			{
				if( (STR_SIZE - R2) >= 4 )
					CutEnd(4);
				// if preceded by lich or ig, delete if in R2 
				if(EndsWith("ig"))
				{
					if( (STR_SIZE - R2) >= 2)
						CutEnd(2);
				}
				else if(EndsWith("lich"))
				{
					if( (STR_SIZE - R2) >= 4)
						CutEnd(4);
				}
			}
		}

		// Turn U and Y back into lower case,
		// and remove the umlaut accent from a, o and u.
		private void Finally()
		{
			for(int i=0; i<STR_SIZE; ++i)
			{
				switch(word_buffer[i])
				{
					case '�':
						word_buffer[i] = 'a'; break;
					case 'U':
						word_buffer[i] = 'u'; break;
					case '�':
						word_buffer[i] = 'u'; break;
					case 'Y':
						word_buffer[i] = 'y'; break;
					case '�':
						word_buffer[i] = 'o'; break;
					default: break;
				}
			}
		}

		public string Stem()
		{
			preprocess();
			Step1();
			Step2();
			Step3();
			Finally();
			// return stemed word
			return new string(word_buffer, 0, STR_SIZE);
		}

		public string Stem(string word)
		{
			Word = word;
			return Stem();
		}

		public string Word
		{
			get
			{
				return new string(word_buffer, 0, STR_SIZE);
			}
			set
			{
				SetInitState();
				SetWord(value);
			}
		}

	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
Bosnia and Herzegovina Bosnia and Herzegovina
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions