Click here to Skip to main content
15,894,460 members
Articles / Programming Languages / C#

UDDI Explorer: Tool for Searching Web Services

Rate me:
Please Sign up or sign in to vote.
4.93/5 (49 votes)
20 Dec 2005 · 17 min read · 222.9K   3.2K   109
Tool for searching web service(s) and viewing their WSDL information
/*
 * tf/idf implementation 
 * Author: Thanh Dao, thanh.dao@gmx.net or thanh.ngoc.dao@gmail.com
 */
using System;
using System.Collections;


namespace ServiceRanking
{
	/// <summary>
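	/// Builds tf-idf term weights over the n-grams of a fixed set of documents
	/// and reports the cosine similarity between any two of those documents.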
	/// </summary>
	public class TFIDFMeasure
	{
		private string[] _docs;
		private string[][] _ngramDoc;
		private int _numDocs=0;
		private int _numTerms=0;
		private ArrayList _terms;
		private int[][] _termFreq;
		private float[][] _termWeight;
		private int[] _maxTermFreq;
		private int[] _docFreq;


		/// <summary>
		/// Static helpers for comparing two equally sized float vectors.
		/// </summary>
		public class TermVector
		{
			/// <summary>
			/// Computes the cosine of the angle between two vectors.
			/// </summary>
			/// <param name="vector1">First vector.</param>
			/// <param name="vector2">Second vector; must be as long as <paramref name="vector1"/>.</param>
			/// <returns>
			/// InnerProduct / (|vector1| * |vector2|), or 0 when that denominator
			/// is exactly zero (at least one vector has zero length).
			/// </returns>
			/// <exception cref="ArgumentNullException">Either vector is null.</exception>
			/// <exception cref="ArgumentException">The vectors differ in length.</exception>
			public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
			{
				RequireSameLength(vector1, vector2);

				float denom = VectorLength(vector1) * VectorLength(vector2);

				// Exact comparison is intentional: it only guards the division below.
				if (denom == 0F)
				{
					return 0F;
				}

				return InnerProduct(vector1, vector2) / denom;
			}

			/// <summary>
			/// Computes the dot product of two vectors.
			/// </summary>
			/// <param name="vector1">First vector.</param>
			/// <param name="vector2">Second vector; must be as long as <paramref name="vector1"/>.</param>
			/// <returns>The sum of the element-wise products.</returns>
			/// <exception cref="ArgumentNullException">Either vector is null.</exception>
			/// <exception cref="ArgumentException">The vectors differ in length.</exception>
			public static float InnerProduct(float[] vector1, float[] vector2)
			{
				RequireSameLength(vector1, vector2);

				float result = 0F;
				for (int i = 0; i < vector1.Length; i++)
				{
					result += vector1[i] * vector2[i];
				}

				return result;
			}

			/// <summary>
			/// Computes the Euclidean length of a vector.
			/// </summary>
			/// <param name="vector">Vector to measure.</param>
			/// <returns>The square root of the sum of the squared elements.</returns>
			/// <exception cref="ArgumentNullException"><paramref name="vector"/> is null.</exception>
			public static float VectorLength(float[] vector)
			{
				if (vector == null)
				{
					throw new ArgumentNullException("vector");
				}

				float sum = 0.0F;
				for (int i = 0; i < vector.Length; i++)
				{
					sum += vector[i] * vector[i];
				}

				return (float) Math.Sqrt(sum);
			}

			// Shared argument check. ArgumentException and ArgumentNullException
			// both derive from Exception, so callers that caught the bare
			// Exception thrown previously keep working.
			private static void RequireSameLength(float[] vector1, float[] vector2)
			{
				if (vector1 == null)
				{
					throw new ArgumentNullException("vector1");
				}

				if (vector2 == null)
				{
					throw new ArgumentNullException("vector2");
				}

				if (vector1.Length != vector2.Length)
				{
					throw new ArgumentException("Vectors must have the same length.");
				}
			}
		}

		private IDictionary _termIndex=new Hashtable() ;

		/// <summary>
		/// Builds the term table, term frequencies and term weights for a fixed
		/// set of documents.
		/// </summary>
		/// <param name="documents">
		/// Raw text of each document. A document's position in this array is the
		/// index later passed to GetSimilarity.
		/// </param>
		/// <exception cref="ArgumentNullException"><paramref name="documents"/> is null.</exception>
		public TFIDFMeasure(string[] documents)
		{
			// Fail with a descriptive exception instead of a NullReferenceException
			// on documents.Length.
			if (documents == null)
			{
				throw new ArgumentNullException("documents");
			}

			_docs = documents;
			_numDocs = documents.Length;

			// The created instance is never used here. Presumably the constructor
			// is invoked for a side effect (e.g. preparing shared stop-word state)
			// -- TODO confirm against StopWordsHandler before removing this line.
			new StopWordsHandler();

			MyInit();
		}

        /// <summary>
        /// Tokenises <paramref name="text"/>, re-joins the tokens with single
        /// spaces and hands the result to TextMatching.GenerateNGrams with an
        /// n-gram length of 4.
        /// </summary>
        /// <param name="text">Raw document text.</param>
        /// <returns>
        /// The array produced by TextMatching.GenerateNGrams, or a one-element
        /// array holding the empty string when the tokens yield no text at all.
        /// </returns>
        private string[] GetNgrams(string text)
        {
            const int ngramLen = 4;

            // string.Join places the separator only BETWEEN words. The earlier
            // code appended a space after every word and then called
            // text.Remove(...) without using its return value; strings are
            // immutable, so the trailing space was never actually removed and
            // leaked into the generated n-grams.
            string normalized = string.Join(" ", GetWords(text));

            if (normalized.Length == 0)
            {
                return new string[1] { string.Empty };
            }

            return TextMatching.GenerateNGrams(normalized, ngramLen);
        }

        /// <summary>
        /// Splits <paramref name="text"/> into words using a fresh Tokeniser.
        /// </summary>
        /// <param name="text">Raw document text.</param>
        /// <returns>Whatever Tokeniser.Partition returns for the text.</returns>
        private string[] GetWords(string text)
        {
            return new Tokeniser().Partition(text);
        }


        /// <summary>
        /// Collects every distinct n-gram that occurs in any of the documents.
        /// </summary>
        /// <param name="docs">Raw text of each document.</param>
        /// <returns>
        /// The distinct terms, in order of first appearance across the documents.
        /// </returns>
        private ArrayList GenerateTerms(string[] docs)
        {
            ArrayList uniqTerms = new ArrayList();

            // Membership is tracked in a hash table so each lookup is O(1);
            // ArrayList.Contains inside the loop made this quadratic in the
            // number of terms.
            Hashtable seen = new Hashtable();
            bool nullSeen = false; // Hashtable keys cannot be null

            for (int i = 0; i < docs.Length; i++)
            {
                string[] constituents = GetNgrams(docs[i]);

                for (int j = 0; j < constituents.Length; j++)
                {
                    string term = constituents[j];

                    if (term == null)
                    {
                        if (nullSeen)
                        {
                            continue;
                        }
                        nullSeen = true;
                    }
                    else
                    {
                        if (seen.ContainsKey(term))
                        {
                            continue;
                        }
                        seen.Add(term, null);
                    }

                    uniqTerms.Add(term);
                }
            }

            return uniqTerms;
        }
		


		/// <summary>
		/// Stores <paramref name="newValue"/> under <paramref name="key"/> and
		/// hands back whatever the dictionary's indexer returned for that key
		/// beforehand.
		/// </summary>
		/// <param name="collection">Dictionary to update.</param>
		/// <param name="key">Key to write.</param>
		/// <param name="newValue">Value to associate with the key.</param>
		/// <returns>The value read from the indexer before the write.</returns>
		private static object AddElement(IDictionary collection, object key, object newValue)
		{
			object previous = collection[key];

			collection[key] = newValue;

			return previous;
		}

		/// <summary>
		/// Looks up the row index assigned to <paramref name="term"/> in the
		/// term index table.
		/// </summary>
		/// <param name="term">Term to look up.</param>
		/// <returns>The stored index, or -1 when the table has no entry for the term.</returns>
		private int GetTermIndex(string term)
		{
			object boxedIndex = _termIndex[term];

			return (boxedIndex != null) ? (int) boxedIndex : -1;
		}

		/// <summary>
		/// Builds the term list, allocates the per-term and per-document tables,
		/// registers each term's row index, and then fills in the frequencies
		/// and weights.
		/// </summary>
		private void MyInit()
		{
			_terms = GenerateTerms(_docs);
			_numTerms = _terms.Count;

			_maxTermFreq = new int[_numDocs];
			_docFreq = new int[_numTerms];
			_termFreq = new int[_numTerms][];
			_termWeight = new float[_numTerms][];

			// One row per term, one column per document.
			int row = 0;
			foreach (object term in _terms)
			{
				_termWeight[row] = new float[_numDocs];
				_termFreq[row] = new int[_numDocs];

				AddElement(_termIndex, term, row);
				row++;
			}

			GenerateTermFrequency();
			GenerateTermWeight();
		}
		
		/// <summary>
		/// Returns the natural (base-e) logarithm of <paramref name="num"/>,
		/// narrowed to float. Math.Log with one argument is base e, not base 2.
		/// </summary>
		/// <param name="num">Value to take the logarithm of.</param>
		/// <returns>Math.Log(num) cast to float.</returns>
		private float Log(float num)
		{
			double natural = Math.Log(num);

			return (float) natural;
		}

		/// <summary>
		/// Fills the term-frequency table, the per-term document counts and the
		/// per-document maximum term frequency from the n-gram counts of each
		/// document.
		/// </summary>
		private void GenerateTermFrequency()
		{
			for (int doc = 0; doc < _numDocs; doc++)
			{
				IDictionary counts = GetNgramFrequency(_docs[doc]);

				// Stays at int.MinValue when the document yields no terms.
				_maxTermFreq[doc] = int.MinValue;

				foreach (DictionaryEntry entry in counts)
				{
					string term = (string) entry.Key;
					int occurrences = (int) entry.Value;
					int row = GetTermIndex(term);

					_termFreq[row][doc] = occurrences; // frequency in this document
					_docFreq[row]++;                   // one more document contains the term

					_maxTermFreq[doc] = Math.Max(_maxTermFreq[doc], occurrences);
				}
			}
		}
		

		/// <summary>
		/// Computes the weight of every (term, document) pair and stores it in
		/// the term-weight table.
		/// </summary>
		private void GenerateTermWeight()
		{
			int term = 0;
			while (term < _numTerms)
			{
				for (int doc = 0; doc < _numDocs; doc++)
				{
					_termWeight[term][doc] = ComputeTermWeight(term, doc);
				}

				term++;
			}
		}

		/// <summary>
		/// Returns the frequency of a term in a document divided by that
		/// document's maximum term frequency.
		/// </summary>
		/// <param name="term">Row index of the term.</param>
		/// <param name="doc">Index of the document.</param>
		/// <returns>The stored term count over the stored per-document maximum, as a float.</returns>
		private float GetTermFrequency(int term, int doc)
		{
			float occurrences = _termFreq[term][doc];
			float highest = _maxTermFreq[doc];

			return occurrences / highest;
		}

		/// <summary>
		/// Returns Log(number of documents / number of documents containing the term).
		/// </summary>
		/// <param name="term">Row index of the term.</param>
		/// <returns>The logarithm of the ratio, computed by the Log helper.</returns>
		private float GetInverseDocumentFrequency(int term)
		{
			float totalDocs = _numDocs;
			float docsWithTerm = _docFreq[term];

			return Log(totalDocs / docsWithTerm);
		}

		/// <summary>
		/// Returns the tf-idf weight of a term in a document: its term frequency
		/// multiplied by its inverse document frequency.
		/// </summary>
		/// <param name="term">Row index of the term.</param>
		/// <param name="doc">Index of the document.</param>
		/// <returns>GetTermFrequency(term, doc) * GetInverseDocumentFrequency(term).</returns>
		private float ComputeTermWeight(int term, int doc)
		{
			return GetTermFrequency(term, doc) * GetInverseDocumentFrequency(term);
		}
		
		/// <summary>
		/// Copies one document's column of the term-weight table into a new array.
		/// </summary>
		/// <param name="doc">Index of the document.</param>
		/// <returns>An array with one weight per term, in term-row order.</returns>
		private  float[] GetTermVector(int doc)
		{
			float[] weights = new float[_numTerms];

			int term = 0;
			while (term < weights.Length)
			{
				weights[term] = _termWeight[term][doc];
				term++;
			}

			return weights;
		}

		/// <summary>
		/// Returns the cosine similarity between the term-weight vectors of two
		/// documents.
		/// </summary>
		/// <param name="doc_i">Index of the first document.</param>
		/// <param name="doc_j">Index of the second document.</param>
		/// <returns>The value computed by TermVector.ComputeCosineSimilarity.</returns>
		public float GetSimilarity(int doc_i, int doc_j)
		{
			float[] first = GetTermVector(doc_i);
			float[] second = GetTermVector(doc_j);
			float similarity = TermVector.ComputeCosineSimilarity(first, second);

			return similarity;
		}


        /// <summary>
        /// Counts how many times each n-gram occurs in <paramref name="input"/>.
        /// </summary>
        /// <param name="input">Raw document text.</param>
        /// <returns>
        /// A table keyed by n-gram (string) whose values are boxed int counts.
        /// </returns>
        private IDictionary GetNgramFrequency(string input)
        {
            string[] ngrams = GetNgrams(input);

            // A single counting pass replaces the earlier sort / de-duplicate /
            // binary-search pipeline. That pipeline was quadratic in the number
            // of n-grams, and its counts relied on the default (culture-sensitive)
            // sort keeping ordinally equal strings next to each other.
            IDictionary result = new Hashtable();
            if (ngrams == null)
            {
                return result;
            }

            for (int i = 0; i < ngrams.Length; i++)
            {
                string ngram = ngrams[i];
                if (ngram == null)
                {
                    continue; // a null cannot be used as a Hashtable key
                }

                object soFar = result[ngram];
                result[ngram] = (soFar == null) ? 1 : (int) soFar + 1;
            }

            return result;
        }

		/// <summary>
		/// Counts how many times each word occurs in <paramref name="input"/>.
		/// </summary>
		/// <param name="input">Raw document text.</param>
		/// <returns>
		/// A table keyed by word (string) whose values are boxed int counts.
		/// </returns>
		private IDictionary GetWordFrequency(string input)
		{
			string[] words = GetWords(input);

			// A single counting pass replaces the earlier sort / de-duplicate /
			// binary-search pipeline. That pipeline was quadratic in the number
			// of words, and its counts relied on the default (culture-sensitive)
			// sort keeping ordinally equal strings next to each other.
			IDictionary result = new Hashtable();
			if (words == null)
			{
				return result;
			}

			for (int i = 0; i < words.Length; i++)
			{
				string word = words[i];
				if (word == null)
				{
					continue; // a null cannot be used as a Hashtable key
				}

				object soFar = result[word];
				result[word] = (soFar == null) ? 1 : (int) soFar + 1;
			}

			return result;
		}
				
		/// <summary>
		/// Removes duplicates from <paramref name="input"/>, keeping the first
		/// occurrence of each value in its original position order.
		/// </summary>
		/// <param name="input">Strings to de-duplicate; may be null.</param>
		/// <returns>
		/// An empty array when <paramref name="input"/> is null; otherwise the
		/// distinct values converted by Tokeniser.ArrayListToArray.
		/// </returns>
		private string[] GetDistinctConstituents(string[] input)
		{
			if (input == null)
			{
				return new string[0];
			}

			ArrayList list = new ArrayList();

			// Membership is tracked in a hash table so each lookup is O(1);
			// ArrayList.Contains inside the loop made this quadratic.
			Hashtable seen = new Hashtable();
			bool nullSeen = false; // Hashtable keys cannot be null

			for (int i = 0; i < input.Length; i++)
			{
				string item = input[i];

				if (item == null)
				{
					if (nullSeen)
					{
						continue;
					}
					nullSeen = true;
				}
				else
				{
					if (seen.ContainsKey(item))
					{
						continue;
					}
					seen.Add(item, null);
				}

				list.Add(item);
			}

			return Tokeniser.ArrayListToArray(list);
		}
		

		
		/// <summary>
		/// Counts the elements of <paramref name="list"/> that are ordinally
		/// equal to <paramref name="constituent"/>.
		/// </summary>
		/// <param name="constituent">Value to count.</param>
		/// <param name="list">Strings to search; does not need to be sorted.</param>
		/// <returns>The number of matching elements; 0 when <paramref name="list"/> is null.</returns>
		private int CountConstituent(string constituent, string[] list)
		{
			if (list == null)
			{
				return 0;
			}

			// A plain scan is used on purpose. The earlier version located one
			// match with Array.BinarySearch (default, culture-sensitive ordering)
			// and then walked neighbouring elements using ordinal Equals. Strings
			// that the culture treats as equal but that differ ordinally can sit
			// between two real matches, which made that walk stop early and
			// under-count; a null element also made it throw.
			int count = 0;
			for (int i = 0; i < list.Length; i++)
			{
				// Static string.Equals is ordinal and tolerates nulls on either side.
				if (string.Equals(list[i], constituent))
				{
					count++;
				}
			}

			return count;
		}
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.


Written By
Software Developer
Vietnam Vietnam
I'm still alive... but I have temporarily moved on to mobile and web work (J2ME/BREW/PHP/Flash... something not M$). Things have been very busy and probably will continue to be, so I don't have a chance to maintain my articles or respond. I hope to have time to write again, because many ideas for WPF and Silverlight are waiting. Wish me luck :)

FYI:
- MESHSimPack project(c# library for measuring similarity among concepts of the MESH ontology):
http://sourceforge.net/projects/meshsimpack.

Comments and Discussions