|
/*
* tf/idf implementation
* Author: Thanh Dao, thanh.dao@gmx.net or thanh.ngoc.dao@gmail.com
*/
using System;
using System.Collections;
namespace ServiceRanking
{
/// <summary>
/// Summary description for TF_IDFMeasure
/// </summary>
public class TFIDFMeasure
{
private string[] _docs;
private string[][] _ngramDoc;
private int _numDocs=0;
private int _numTerms=0;
private ArrayList _terms;
private int[][] _termFreq;
private float[][] _termWeight;
private int[] _maxTermFreq;
private int[] _docFreq;
public class TermVector
{
    /// <summary>
    /// Cosine similarity of two equal-length weight vectors:
    /// dot(v1, v2) / (|v1| * |v2|). Returns 0 when either vector is all zeros
    /// (the similarity is undefined there; 0 is the conventional fallback).
    /// </summary>
    /// <exception cref="ArgumentException">The vectors differ in length.</exception>
    public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
    {
        if (vector1.Length != vector2.Length)
            throw new ArgumentException("Vectors must have the same length.");
        float denom = (VectorLength(vector1) * VectorLength(vector2));
        if (denom == 0F)
            return 0F; // at least one zero vector — avoid division by zero
        else
            return (InnerProduct(vector1, vector2) / denom);
    }

    /// <summary>Dot product of two equal-length vectors.</summary>
    /// <exception cref="ArgumentException">The vectors differ in length.</exception>
    public static float InnerProduct(float[] vector1, float[] vector2)
    {
        if (vector1.Length != vector2.Length)
            throw new ArgumentException("Vectors must have the same length.");
        float result = 0F;
        for (int i = 0; i < vector1.Length; i++)
            result += vector1[i] * vector2[i];
        return result;
    }

    /// <summary>Euclidean (L2) norm of a vector.</summary>
    public static float VectorLength(float[] vector)
    {
        float sum = 0.0F;
        for (int i = 0; i < vector.Length; i++)
            sum = sum + (vector[i] * vector[i]);
        return (float)Math.Sqrt(sum);
    }
}
private IDictionary _termIndex=new Hashtable() ;
/// <summary>
/// Builds the complete tf-idf model for the given document set: the n-gram
/// vocabulary, per-document raw term frequencies, and tf*idf weights.
/// </summary>
/// <param name="documents">Raw document texts, one entry per document.</param>
public TFIDFMeasure(string[] documents)
{
_docs=documents;
_numDocs=documents.Length ;
// NOTE(review): constructed only for its side effect — presumably initializes
// the stop-word list consulted by the tokenizer; confirm against StopWordsHandler.
StopWordsHandler sw = new StopWordsHandler();
MyInit();
}
/// <summary>
/// Tokenizes the text into words, re-joins them with single spaces, and
/// returns the character n-grams (n = 4) of the normalized string.
/// Returns a one-element array holding the empty string when the text
/// yields no words.
/// </summary>
private string[] GetNgrams(string text)
{
    int ngramLen = 4;
    string[] words = GetWords(text);
    // BUG FIX: the original rebuilt "w1 w2 ... wn " in a loop and then called
    // text.Remove(text.Length - 1) WITHOUT using the return value — strings
    // are immutable, so the trailing space was never stripped and leaked into
    // the n-grams. string.Join produces the intended space-separated form.
    text = string.Join(" ", words);
    if (text.Length > 0)
        return TextMatching.GenerateNGrams(text, ngramLen);
    else
        return new string[1] { text };
}
/// <summary>Splits the text into word tokens via the project tokenizer.</summary>
private string[] GetWords(string text)
{
    return new Tokeniser().Partition(text);
}
/// <summary>
/// Collects the distinct n-gram terms across all documents, preserving
/// first-seen order.
/// </summary>
/// <param name="docs">Document texts to extract terms from.</param>
/// <returns>ArrayList of unique term strings.</returns>
private ArrayList GenerateTerms(string[] docs)
{
    ArrayList uniqTerms = new ArrayList();
    // PERF FIX: ArrayList.Contains is a linear scan, making the original loop
    // O(n^2) in the vocabulary size; a Hashtable gives O(1) membership tests
    // while the ArrayList still preserves insertion order for callers.
    Hashtable seen = new Hashtable();
    for (int i = 0; i < docs.Length; i++)
    {
        string[] constituents = GetNgrams(docs[i]);
        for (int j = 0; j < constituents.Length; j++)
        {
            if (!seen.Contains(constituents[j]))
            {
                seen[constituents[j]] = true;
                uniqTerms.Add(constituents[j]);
            }
        }
    }
    return uniqTerms;
}
/// <summary>
/// Stores newValue under key and returns whatever value was previously
/// stored there (null when the key was absent).
/// </summary>
private static object AddElement(IDictionary collection, object key, object newValue)
{
    object previous = collection[key];
    collection[key] = newValue;
    return previous;
}
/// <summary>
/// Looks up the vocabulary index of a term; returns -1 for unknown terms.
/// </summary>
private int GetTermIndex(string term)
{
    object index = _termIndex[term];
    return (index == null) ? -1 : (int)index;
}
/// <summary>
/// One-shot model build, called from the constructor. Order matters:
/// the vocabulary must exist before the frequency pass, and frequencies
/// must exist before the weight pass.
/// </summary>
private void MyInit()
{
// 1) vocabulary of distinct n-gram terms across all documents
_terms=GenerateTerms (_docs );
_numTerms=_terms.Count ;
// 2) allocate the parallel arrays: rows are terms, columns are documents
_maxTermFreq=new int[_numDocs] ;
_docFreq=new int[_numTerms] ;
_termFreq =new int[_numTerms][] ;
_termWeight=new float[_numTerms][] ;
for(int i=0; i < _terms.Count ; i++)
{
_termWeight[i]=new float[_numDocs] ;
_termFreq[i]=new int[_numDocs] ;
// term string -> row index, used by GetTermIndex during the frequency pass
AddElement(_termIndex, _terms[i], i);
}
// 3) raw counts, then 4) tf*idf weights derived from them
GenerateTermFrequency ();
GenerateTermWeight();
}
/// <summary>
/// Logarithm used by the idf computation. Note: Math.Log with one argument
/// is the NATURAL log (base e), not base 2 as the original comment claimed;
/// the base only scales every idf value by a constant factor.
/// </summary>
private float Log(float num)
{
return (float) Math.Log(num) ;// natural log (base e) — the old "log2" note here was wrong
}
/// <summary>
/// Fills _termFreq (raw count of each term in each document), _docFreq
/// (number of documents containing each term), and _maxTermFreq (largest
/// raw count within each document) from the documents' n-gram frequencies.
/// </summary>
private void GenerateTermFrequency()
{
for(int i=0; i < _numDocs ; i++)
{
string curDoc=_docs[i];
//IDictionary freq=GetWordFrequency(curDoc);
IDictionary freq = GetNgramFrequency(curDoc);
IDictionaryEnumerator enums=freq.GetEnumerator() ;
// NOTE(review): stays int.MinValue if a document yields no terms; a later
// GetTermFrequency call on such a document would divide by int.MinValue —
// confirm empty documents cannot reach this point.
_maxTermFreq[i]=int.MinValue ;
while (enums.MoveNext())
{
string term=(string)enums.Key;
int termFrequency=(int)enums.Value ;
// The index should always exist because the vocabulary was built from these
// same documents; if it did not, GetTermIndex returns -1 and the next line
// would throw IndexOutOfRangeException.
int termIdx=GetTermIndex(term);
_termFreq [termIdx][i]=termFrequency; //frequency in doc
_docFreq[termIdx] ++; //number of documents contain this term
if (termFrequency > _maxTermFreq[i]) _maxTermFreq[i]=termFrequency;
}
}
}
/// <summary>Fills _termWeight with the tf*idf value of every (term, document) pair.</summary>
private void GenerateTermWeight()
{
    for (int term = 0; term < _numTerms; term++)
    {
        for (int doc = 0; doc < _numDocs; doc++)
            _termWeight[term][doc] = ComputeTermWeight(term, doc);
    }
}
/// <summary>
/// Normalized term frequency: the term's raw count in the document divided
/// by the highest raw count of any term in that document.
/// </summary>
private float GetTermFrequency(int term, int doc)
{
    float raw = _termFreq[term][doc];
    float max = _maxTermFreq[doc];
    return raw / max;
}
/// <summary>
/// Inverse document frequency: log(N / df), where N is the document count
/// and df is the number of documents containing the term.
/// </summary>
private float GetInverseDocumentFrequency(int term)
{
    float df = _docFreq[term];
    return Log(_numDocs / df);
}
/// <summary>The tf * idf weight of one (term, document) pair.</summary>
private float ComputeTermWeight(int term, int doc)
{
    return GetTermFrequency(term, doc) * GetInverseDocumentFrequency(term);
}
/// <summary>
/// Column slice of the weight matrix: all term weights for one document.
/// </summary>
private float[] GetTermVector(int doc)
{
    float[] weights = new float[_numTerms];
    for (int term = 0; term < _numTerms; term++)
        weights[term] = _termWeight[term][doc];
    return weights;
}
/// <summary>
/// Cosine similarity between the tf-idf weight vectors of two documents,
/// identified by their indices in the constructor's document array.
/// </summary>
public float GetSimilarity(int doc_i, int doc_j)
{
    return TermVector.ComputeCosineSimilarity(GetTermVector(doc_i), GetTermVector(doc_j));
}
/// <summary>
/// Maps each distinct n-gram of the input to its occurrence count.
/// </summary>
private IDictionary GetNgramFrequency(string input)
{
    string[] ngrams = GetNgrams(input);
    Array.Sort(ngrams); // CountConstituent binary-searches, so the array must be sorted
    string[] distinct = GetDistinctConstituents(ngrams);
    IDictionary counts = new Hashtable();
    foreach (string ngram in distinct)
        counts[ngram] = CountConstituent(ngram, ngrams);
    return counts;
}
/// <summary>
/// Maps each distinct word of the input to its occurrence count.
/// (Currently unused: GenerateTermFrequency works on n-grams instead.)
/// </summary>
private IDictionary GetWordFrequency(string input)
{
    string[] words = GetWords(input);
    Array.Sort(words); // CountConstituent binary-searches, so the array must be sorted
    string[] distinct = GetDistinctConstituents(words);
    IDictionary counts = new Hashtable();
    foreach (string word in distinct)
        counts[word] = CountConstituent(word, words);
    return counts;
}
/// <summary>
/// Returns the distinct strings of the input in first-occurrence order.
/// A null input yields an empty array.
/// </summary>
private string[] GetDistinctConstituents(string[] input)
{
    if (input == null)
        return new string[0];
    // PERF FIX: the original used ArrayList.Contains inside the loop (a linear
    // scan, O(n^2) overall); a Hashtable gives O(1) membership tests. The
    // result is built directly with ArrayList.ToArray(typeof(string)) instead
    // of routing through Tokeniser.ArrayListToArray — same string[] result.
    // Assumes input contains no null entries (Hashtable keys cannot be null);
    // the tokenizer/n-gram sources visible here never produce nulls — confirm.
    ArrayList distinct = new ArrayList();
    Hashtable seen = new Hashtable();
    for (int i = 0; i < input.Length; i++)
    {
        if (!seen.Contains(input[i]))
        {
            seen[input[i]] = true;
            distinct.Add(input[i]);
        }
    }
    return (string[])distinct.ToArray(typeof(string));
}
/// <summary>
/// Counts how many times constituent occurs in list. REQUIRES list to be
/// sorted (both callers Array.Sort first): it binary-searches for any hit,
/// rewinds to the start of the run of equal elements, then counts forward.
/// Returns 0 when the constituent is absent (BinarySearch yields a negative
/// index, so both loops are skipped).
/// </summary>
private int CountConstituent(string constituent, string[] list)
{
int itemIdx=Array.BinarySearch(list, constituent);
if (itemIdx > 0)
// Rewind toward the first occurrence. This can overshoot by one (landing on
// the non-matching element just before the run); the counting loop below
// tolerates that because it only increments on actual matches.
while (itemIdx > 0 && list[itemIdx].Equals(constituent))
itemIdx--;
int count=0;
// Scan forward through the run; stop as soon as the NEXT element no longer
// matches (valid because the list is sorted, so the run is contiguous).
while (itemIdx < list.Length && itemIdx >= 0)
{
if (list[itemIdx].Equals(constituent)) count++;
itemIdx++;
if (itemIdx < list.Length)
if (!list[itemIdx].Equals(constituent)) break;
}
return count;
}
}
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.
I'm still alive... but I have temporarily moved on to mobile and web work (J2ME/BREW/PHP/Flash — nothing Microsoft-related). Things have been very busy and will probably stay that way, so I haven't had a chance to maintain this project or respond to messages. I hope to find time to write again, because I have many ideas for WPF and Silverlight waiting. Wish me luck.
FYI:
- MESHSimPack project (a C# library for measuring similarity among concepts of the MeSH ontology):
http://sourceforge.net/projects/meshsimpack.