Click here to Skip to main content
15,887,335 members
Articles / Programming Languages / C#

XTract: A tool for excerpting relevant text

Rate me:
Please Sign up or sign in to vote.
3.79/5 (10 votes)
22 Apr 20055 min read 36.2K   602   28  
Extracting the best fixed length excerpt from a document based on search terms.
using System;

namespace  GZTW.SearchStuff
{
	/// <summary>
	/// Rank and extract best excerpt of specified text and search terms
	/// </summary>
	public sealed class ExtractAndRank
	{
		
		#region Fields
		private string[] searchTerms;
		private string rawtext;
		private string extract="";
		private bool flattenExtract=true;
		private float ranking;		
		private int extractionThresholdRank=10;
		private int maximumCharactersToExtract=80;
		#endregion
		
		#region Properties		

		/// <summary>
		/// This is the ranking of the source text as it pertains to the 
		/// search terms
		/// 
		/// A rank of zero means either there was no match or the rank that was calculated
		/// was lower than the threshold ranking, either way, no excerpt extraction is done.
		/// 
		/// It is a percentage value on a scale of 0 to 100
		///	and is weighted:
		///	
		/// 75% of the score is the percentage of all search terms found in the text
		/// 25% of the score is the percentage of all characters in the text that are search term characters
		/// 
		///  
		/// </summary>
		public float Ranking
		{
			get
			{
				return ranking;
			}
		}

		/// <summary>
		/// Maximum characters to appear in an extraction 
		/// default is 80
		/// Minimum is 10
		/// </summary>
		public int MaximumCharactersToExtract
		{
			get
			{
				return maximumCharactersToExtract;
			}
			set
			{
				
				if(value>10)
					maximumCharactersToExtract=value;
				else
					maximumCharactersToExtract=10;

			}
		}

		/// <summary>
		/// ExtractionThresholdRank
		/// Extraction will only take place if the rank is
		/// this value or higher
		/// 
		/// default is 10, maximum is 100 minimum is 0
		/// </summary>
		public int ExtractionThresholdRank
		{
			get
			{
				return extractionThresholdRank;
			}
			set
			{
				if(value>100) 
					extractionThresholdRank=100;
				else if(value<0)
					extractionThresholdRank=0;
				else
					extractionThresholdRank=value;
			}
		}

		

		/// <summary>
		/// If true, carriage returns and line feeds will be removed from extract
		/// </summary>
		public bool FlattenExtract
		{
			get
			{
				return this.flattenExtract;
			}
			set
			{
				this.flattenExtract=value;
			}
		}

		/// <summary>
		/// Extracted text excerpt that best reflects search terms
		/// </summary>
		public string Extract
		{
			get
			{
				return extract;
			}
		}

		#endregion

		#region public methods
		/// <summary>
		/// Do the extraction and ranking
		/// </summary>
		/// <param name="rawtext"></param>
		/// <param name="searchTerms"></param>
		public void Process(string rawText, string[]searchTerms)
		{
			ranking=0;
			extract="";
			this.rawtext=rawText;
			this.searchTerms=searchTerms;
			

			ranking=score(0,this.rawtext.Length);
			if(ranking>extractionThresholdRank)
				DoExtract();
		}
		#endregion
		
		#region Calculate score
		/// <summary>
		/// Give a percentage score for a given window of
		/// text in the raw text string
		/// 75% of the score is the percentage of all search terms found in the window
		/// 25% of the score is the percentage of all characters in the search window that are search term characters
		/// 
		/// 
		/// 
		/// </summary>
		/// <param name="nStartPos"></param>
		/// <param name="nEndPos"></param>
		/// <returns>Float value of zero to one hundred</returns>
		private float score(int nStartPos, int nEndPos)
		{	
			//rewrite this as an integer based calculation

			System.Diagnostics.Debug.Assert(nStartPos< nEndPos);
			if(nStartPos<0) nStartPos=0;
			if(nEndPos>this.rawtext.Length) nEndPos=this.rawtext.Length;
			
			int nTermCharsInWindow=0;//how many of the characters in the window are matching term characters
			string SearchString=this.rawtext.Substring(nStartPos,nEndPos-nStartPos).ToLower(System.Globalization.CultureInfo.CurrentCulture);

			int nMatches=0;

			foreach(string term in searchTerms)
			{	
				string lTerm=term.ToLower(System.Globalization.CultureInfo.CurrentCulture);
				int nLocation=SearchString.IndexOf(lTerm);
				if(nLocation!=-1)
				{
					nMatches++;
					while(nLocation!=-1)
					{
						nTermCharsInWindow+=lTerm.Length;;
						nLocation=SearchString.IndexOf(lTerm,nLocation+1);
						
					}
					
				}
			}

			//If no matches then rank is automatically zero
			if(nMatches==0) 
			{
				return 0;
			}

			

			//Rank is calculated on a weighted scale
			//75% for matching all search terms
			//25% for the quantity of search terms versus other text found
			float fTermsFoundPct = 75*((float)nMatches/(float)searchTerms.GetLength(0));
			float fTermsVsTextPct=0;
			if(nTermCharsInWindow>0)
				fTermsVsTextPct = 25*((float)nTermCharsInWindow/(float) SearchString.Length);		
			
			return fTermsFoundPct+fTermsVsTextPct;

		}
		#endregion

		#region Extract best excerpt 
		/// <summary>
		/// Extract the best scoring excerpt fragments of 
		/// raw text
		/// </summary>
		private void DoExtract()
		{
			//If the whole thing is less than the max to extract
			//just save time and return the whole thing
			if(this.rawtext.Length<this.maximumCharactersToExtract)
			{
				this.extract=this.rawtext;
				return;
			}

			string BestWindow="";
			float BestScore=0;			
			float thisscore=0;
			int BestWindowStartPos=0;

			//Get the shortest search term length so 
			//we can save time iterating over the window in the extract
			//function below
			int shortestSearchTermLength=int.MaxValue;
			foreach(string s in this.searchTerms)
			{
				if(s.Length<shortestSearchTermLength)
					shortestSearchTermLength=s.Length;

			}


			//slide a window over the text and check it's score, the highest scoring window wins
			//move the length of the shortest search term so as to ensure we won't
			//miss it, but faster than moving one character at a time
			for(int x=0;x<this.rawtext.Length-maximumCharactersToExtract;x+=shortestSearchTermLength)
			{
				thisscore=score(x,x+(maximumCharactersToExtract));
				
				if(thisscore==0) continue;

				if(thisscore>BestScore)
				{
					BestScore=thisscore;
					BestWindow=this.rawtext.Substring(x,maximumCharactersToExtract);
					//Best window to get if the future score is equal
					//I.E. put the terms in the center of the window if 
					//the score is equal
					BestWindowStartPos=x+(maximumCharactersToExtract/2);
				}

				//If it's equal to the last and we're positioned over
				//the best spot (terms in center) then capture that
				if(thisscore==BestScore && x==BestWindowStartPos)
				{
					BestWindow=this.rawtext.Substring(x,maximumCharactersToExtract);

				}
			}

			if(this.flattenExtract)
				this.extract="..."+BestWindow.Trim().Replace("\r","").Replace("\n","")+"...";
			else
				this.extract="..."+BestWindow.Trim()+"...";


		}


		//========================================================================

		#endregion

	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Canada Canada
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions