|
using System;
namespace GZTW.SearchStuff
{
/// <summary>
/// Rank and extract best excerpt of specified text and search terms
/// </summary>
public sealed class ExtractAndRank
{
#region Fields
private string[] searchTerms;
private string rawtext;
private string extract="";
private bool flattenExtract=true;
private float ranking;
private int extractionThresholdRank=10;
private int maximumCharactersToExtract=80;
#endregion
#region Properties
/// <summary>
/// This is the ranking of the source text as it pertains to the
/// search terms
///
/// A rank of zero means either there was no match or the rank that was calculated
/// was lower than the threshold ranking, either way, no excerpt extraction is done.
///
/// It is a percentage value on a scale of 0 to 100
/// and is weighted:
///
/// 75% of the score is the percentage of all search terms found in the text
/// 25% of the score is the percentage of all characters in the text that are search term characters
///
///
/// </summary>
public float Ranking
{
get
{
return ranking;
}
}
/// <summary>
/// Maximum characters to appear in an extraction
/// default is 80
/// Minimum is 10
/// </summary>
public int MaximumCharactersToExtract
{
get
{
return maximumCharactersToExtract;
}
set
{
if(value>10)
maximumCharactersToExtract=value;
else
maximumCharactersToExtract=10;
}
}
/// <summary>
/// ExtractionThresholdRank
/// Extraction will only take place if the rank is
/// this value or higher
///
/// default is 10, maximum is 100 minimum is 0
/// </summary>
public int ExtractionThresholdRank
{
get
{
return extractionThresholdRank;
}
set
{
if(value>100)
extractionThresholdRank=100;
else if(value<0)
extractionThresholdRank=0;
else
extractionThresholdRank=value;
}
}
/// <summary>
/// If true, carriage returns and line feeds will be removed from extract
/// </summary>
public bool FlattenExtract
{
get
{
return this.flattenExtract;
}
set
{
this.flattenExtract=value;
}
}
/// <summary>
/// Extracted text excerpt that best reflects search terms
/// </summary>
public string Extract
{
get
{
return extract;
}
}
#endregion
#region public methods
/// <summary>
/// Do the extraction and ranking
/// </summary>
/// <param name="rawtext"></param>
/// <param name="searchTerms"></param>
public void Process(string rawText, string[]searchTerms)
{
ranking=0;
extract="";
this.rawtext=rawText;
this.searchTerms=searchTerms;
ranking=score(0,this.rawtext.Length);
if(ranking>extractionThresholdRank)
DoExtract();
}
#endregion
#region Calculate score
/// <summary>
/// Give a percentage score for a given window of
/// text in the raw text string
/// 75% of the score is the percentage of all search terms found in the window
/// 25% of the score is the percentage of all characters in the search window that are search term characters
///
///
///
/// </summary>
/// <param name="nStartPos"></param>
/// <param name="nEndPos"></param>
/// <returns>Float value of zero to one hundred</returns>
private float score(int nStartPos, int nEndPos)
{
//rewrite this as an integer based calculation
System.Diagnostics.Debug.Assert(nStartPos< nEndPos);
if(nStartPos<0) nStartPos=0;
if(nEndPos>this.rawtext.Length) nEndPos=this.rawtext.Length;
int nTermCharsInWindow=0;//how many of the characters in the window are matching term characters
string SearchString=this.rawtext.Substring(nStartPos,nEndPos-nStartPos).ToLower(System.Globalization.CultureInfo.CurrentCulture);
int nMatches=0;
foreach(string term in searchTerms)
{
string lTerm=term.ToLower(System.Globalization.CultureInfo.CurrentCulture);
int nLocation=SearchString.IndexOf(lTerm);
if(nLocation!=-1)
{
nMatches++;
while(nLocation!=-1)
{
nTermCharsInWindow+=lTerm.Length;;
nLocation=SearchString.IndexOf(lTerm,nLocation+1);
}
}
}
//If no matches then rank is automatically zero
if(nMatches==0)
{
return 0;
}
//Rank is calculated on a weighted scale
//75% for matching all search terms
//25% for the quantity of search terms versus other text found
float fTermsFoundPct = 75*((float)nMatches/(float)searchTerms.GetLength(0));
float fTermsVsTextPct=0;
if(nTermCharsInWindow>0)
fTermsVsTextPct = 25*((float)nTermCharsInWindow/(float) SearchString.Length);
return fTermsFoundPct+fTermsVsTextPct;
}
#endregion
#region Extract best excerpt
/// <summary>
/// Extract the best scoring excerpt fragments of
/// raw text
/// </summary>
private void DoExtract()
{
//If the whole thing is less than the max to extract
//just save time and return the whole thing
if(this.rawtext.Length<this.maximumCharactersToExtract)
{
this.extract=this.rawtext;
return;
}
string BestWindow="";
float BestScore=0;
float thisscore=0;
int BestWindowStartPos=0;
//Get the shortest search term length so
//we can save time iterating over the window in the extract
//function below
int shortestSearchTermLength=int.MaxValue;
foreach(string s in this.searchTerms)
{
if(s.Length<shortestSearchTermLength)
shortestSearchTermLength=s.Length;
}
//slide a window over the text and check it's score, the highest scoring window wins
//move the length of the shortest search term so as to ensure we won't
//miss it, but faster than moving one character at a time
for(int x=0;x<this.rawtext.Length-maximumCharactersToExtract;x+=shortestSearchTermLength)
{
thisscore=score(x,x+(maximumCharactersToExtract));
if(thisscore==0) continue;
if(thisscore>BestScore)
{
BestScore=thisscore;
BestWindow=this.rawtext.Substring(x,maximumCharactersToExtract);
//Best window to get if the future score is equal
//I.E. put the terms in the center of the window if
//the score is equal
BestWindowStartPos=x+(maximumCharactersToExtract/2);
}
//If it's equal to the last and we're positioned over
//the best spot (terms in center) then capture that
if(thisscore==BestScore && x==BestWindowStartPos)
{
BestWindow=this.rawtext.Substring(x,maximumCharactersToExtract);
}
}
if(this.flattenExtract)
this.extract="..."+BestWindow.Trim().Replace("\r","").Replace("\n","")+"...";
else
this.extract="..."+BestWindow.Trim()+"...";
}
//========================================================================
#endregion
}
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.