Click here to Skip to main content
15,881,882 members
Please Sign up or sign in to vote.
1.86/5 (4 votes)
See more:
This is similarity-matching code that computes the similarity between authors' paper titles and the titles of clusters (queries), but it runs very slowly. Kindly help me optimize it; only the Main() function needs to be changed. Please help.
C#
using System;
using System.IO;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Linq;
namespace VectorSpaceModel
{
    /// <summary>
    /// Vector-space-model matcher: reads documents from a titles file, then scores every
    /// document against each query line using TF-IDF weights and cosine similarity.
    /// </summary>
    class Program
    {
        // Word tokenizer used to build the vocabulary; constructed once instead of per call.
        static readonly Regex wordPattern = new Regex("\\w+", RegexOptions.IgnoreCase);

        static Dictionary<string, double[]> DTVector = new Dictionary<string, double[]>(); //Document term vectors keyed by "Query" or the document number
        static List<string> wordlist = new List<string>(); //List of terms found in documents
        // NOTE(review): the generic type arguments of the next two dictionaries were lost in the
        // posted source ("Dictionary<double,>"). sortedList is restored from how it is used
        // (document key -> similarity). sortedList1 is never used in this class, so its value
        // type is an assumption - confirm or remove it.
        static Dictionary<double, string> sortedList1 = new Dictionary<double, string>(); //Documents ranked by VSM with angle value
        static Dictionary<string, double> sortedList = new Dictionary<string, double>();
        static string[] docs = new string[1]; //Slot 0 holds the current query; documents start at index 1

        /// <summary>
        /// Loads the documents, then for every query line computes the similarity of each
        /// document and appends the non-zero matches, best first, to the output file.
        /// </summary>
        /// <param name="args">
        /// Optional overrides: [0] titles file, [1] queries file, [2] output file.
        /// Any argument that is missing falls back to the original hard-coded path.
        /// </param>
        static void Main(string[] args)
        {
            string fileName2 = @"F:\fyp\fnlfyp\vsm2\OSIM\vsm2\output_titles.txt";
            string fileName3 = @"F:\fyp\fnlfyp\vsm2\OSIM\vsm2\queries.txt";
            string fileName4 = @"F:\fyp\fnlfyp\vsm2\OSIM\vsm2\output_vsm.txt";

            if (args != null)
            {
                if (args.Length > 0) { fileName2 = args[0]; }
                if (args.Length > 1) { fileName3 = args[1]; }
                if (args.Length > 2) { fileName4 = args[2]; }
            }

            // Size the document array from the file instead of a fixed 37406 slots, so a
            // shorter file leaves no null entries and a longer one does not overflow.
            docs = loadDocuments(fileName2);

            int num = 0;

            // The output file is opened once (append mode, as before) rather than per query.
            using (StreamReader sr1 = new StreamReader(fileName3))
            using (StreamWriter writer = new StreamWriter(fileName4, true))
            {
                string line2;
                while ((line2 = sr1.ReadLine()) != null)
                {
                    docs[0] = line2;
                    num++;

                    // NOTE(review): the vocabulary and the document vectors are rebuilt for every
                    // query although only docs[0] changes; precomputing the document side once
                    // would be the next optimisation step.
                    createWordList();
                    createVector();
                    classify();

                    writer.WriteLine(num);
                    writer.WriteLine(line2);

                    // Dictionary enumeration order is unspecified, so rank explicitly by score.
                    foreach (KeyValuePair<string, double> x in sortedList.OrderByDescending(entry => entry.Value))
                    {
                        Console.WriteLine("Doc{1} -> {0}", x.Value, x.Key);
                        writer.WriteLine("Doc{1} -> {0}", x.Value, x.Key);
                    }
                    writer.WriteLine("");
                    writer.Flush(); // keep finished queries on disk even if a later one fails

                    DTVector.Clear();
                    wordlist.Clear();
                    sortedList.Clear();
                }
            }
        }

        /// <summary>
        /// Reads the titles file, one "author=titles" record per line, and returns the titles
        /// in an array whose slot 0 is left empty for the query.
        /// </summary>
        /// <param name="path">Path of the titles file.</param>
        /// <returns>Array of length (number of lines + 1); index i holds the text of line i.</returns>
        private static string[] loadDocuments(string path)
        {
            List<string> loaded = new List<string>();
            loaded.Add(null); // placeholder for the query slot

            using (StreamReader reader = new StreamReader(path))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    int lineNumber = loaded.Count;
                    Console.WriteLine(lineNumber);

                    string[] parts = line.Split('=');
                    if (parts.Length < 2)
                    {
                        // Keep the slot so document numbers still match line numbers.
                        Console.Error.WriteLine("Line {0} of {1} has no '=' separator; treating it as an empty document.", lineNumber, path);
                        loaded.Add(string.Empty);
                    }
                    else
                    {
                        loaded.Add(parts[1]);
                    }
                }
            }

            return loaded.ToArray();
        }

        /// <summary>
        /// Adds every distinct word of the query and of all documents to <c>wordlist</c>.
        /// </summary>
        public static void createWordList()
        {
            // One shared set gives constant-time membership tests across all documents.
            HashSet<string> known = new HashSet<string>(wordlist);
            foreach (string doc in docs)
            {
                addWords(wordlist, known, doc);
            }
        }

        /// <summary>
        /// Appends the words of <paramref name="query"/> that are not yet in
        /// <paramref name="wordlist"/>, keeping first-seen order.
        /// </summary>
        /// <param name="wordlist">List to extend; it is modified in place.</param>
        /// <param name="query">Text to tokenize; null or empty adds nothing.</param>
        /// <returns>The same list instance that was passed in.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="wordlist"/> is null while <paramref name="query"/> has content.
        /// </exception>
        public static List<string> getWordList(List<string> wordlist, string query)
        {
            if (string.IsNullOrEmpty(query))
            {
                return wordlist;
            }
            if (wordlist == null)
            {
                throw new ArgumentNullException("wordlist");
            }

            addWords(wordlist, new HashSet<string>(wordlist), query);
            return wordlist;
        }

        /// <summary>
        /// Appends to <paramref name="target"/> each word of <paramref name="text"/> that
        /// <paramref name="known"/> does not contain yet, and records it in <paramref name="known"/>.
        /// </summary>
        private static void addWords(List<string> target, HashSet<string> known, string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return;
            }

            foreach (Match match in wordPattern.Matches(text))
            {
                if (known.Add(match.Value))
                {
                    target.Add(match.Value);
                }
            }
        }

        /// <summary>
        /// Builds one TF-IDF vector per entry of <c>docs</c> (index 0 is stored under the key
        /// "Query", the others under their index) and stores them in <c>DTVector</c>.
        /// </summary>
        public static void createVector()
        {
            int termCount = wordlist.Count;

            // IDF depends only on the term, so compute it once per term instead of once per
            // (document, term) pair; each getIDF call scans the whole corpus.
            double[] idf = new double[termCount];
            Dictionary<string, int> termIndex = new Dictionary<string, int>(termCount);
            for (int i = 0; i < termCount; i++)
            {
                idf[i] = getIDF(wordlist[i]);
                termIndex[wordlist[i]] = i;
            }

            for (int j = 0; j < docs.Length; j++)
            {
                double[] vector = new double[termCount];

                // Tokenize each document once; terms absent from it keep the default weight 0.
                foreach (KeyValuePair<string, double> tokenCount in countTerms(docs[j]))
                {
                    int position;
                    if (termIndex.TryGetValue(tokenCount.Key, out position))
                    {
                        vector[position] = tokenCount.Value * idf[position];
                    }
                }

                DTVector.Add(j == 0 ? "Query" : j.ToString(), vector);
            }
        }

        /// <summary>
        /// Scores every document vector in <c>DTVector</c> against the "Query" vector and
        /// records the non-zero similarities in <c>sortedList</c>.
        /// </summary>
        public static void classify()
        {
            double[] queryvector;
            if (!DTVector.TryGetValue("Query", out queryvector))
            {
                return; // createVector has not run; nothing to score
            }

            foreach (KeyValuePair<string, double[]> entry in DTVector)
            {
                if (entry.Key == "Query")
                {
                    continue;
                }

                double similarity = cosinetheta(queryvector, entry.Value);
                if (similarity != 0)
                {
                    sortedList.Add(entry.Key, similarity);
                }
            }
        }

        /// <summary>Dot product of two vectors.</summary>
        /// <returns>The dot product, or 0 when the vectors differ in length.</returns>
        public static double dotproduct(double[] v1, double[] v2)
        {
            double product = 0.0;
            if (v1.Length == v2.Length)
            {
                for (int i = 0; i < v1.Length; i++)
                {
                    product += v1[i] * v2[i];
                }
            }
            return product;
        }

        /// <summary>Euclidean length (L2 norm) of <paramref name="vector"/>.</summary>
        public static double vectorlength(double[] vector)
        {
            double sumOfSquares = 0.0;
            foreach (double component in vector)
            {
                sumOfSquares += component * component;
            }

            return Math.Sqrt(sumOfSquares);
        }

        /// <summary>
        /// Counts how many whitespace-separated tokens of <paramref name="document"/> equal
        /// <paramref name="term"/> exactly (case-sensitive).
        /// </summary>
        /// <returns>The number of matching tokens; 0 for a null document.</returns>
        private static double getTF(string document, string term)
        {
            if (document == null)
            {
                return 0;
            }

            // NOTE(review): tokens are split on whitespace here while the vocabulary is built
            // from \w+ matches, so a token with attached punctuation ("data,") never matches
            // its vocabulary word. Left unchanged to keep the scores as they were.
            double count = 0;
            foreach (string token in Regex.Split(document, "\\s"))
            {
                if (token == term)
                {
                    count++;
                }
            }
            return count;
        }

        /// <summary>
        /// Occurrence count of every whitespace-separated token in <paramref name="document"/>;
        /// the single-pass equivalent of calling getTF for each term.
        /// </summary>
        /// <returns>Token-to-count map; empty for a null document.</returns>
        private static Dictionary<string, double> countTerms(string document)
        {
            Dictionary<string, double> counts = new Dictionary<string, double>();
            if (document == null)
            {
                return counts;
            }

            foreach (string token in Regex.Split(document, "\\s"))
            {
                double seen;
                counts.TryGetValue(token, out seen); // seen stays 0 for a new token
                counts[token] = seen + 1;
            }
            return counts;
        }

        /// <summary>
        /// Inverse document frequency of <paramref name="term"/> over the documents at
        /// indices 1 and above (the query at index 0 is excluded).
        /// </summary>
        /// <returns>log(D / df), or 0 when no document contains the term.</returns>
        private static double getIDF(string term)
        {
            double df = 0.0;
            //count the documents containing the term, skipping the query at index 0
            for (int i = 1; i < docs.Length; i++)
            {
                // NOTE(review): this is a substring test, so "art" also counts documents that
                // only contain "party". Left unchanged to keep the scores as they were.
                if (docs[i] != null && docs[i].Contains(term))
                {
                    df++;
                }
            }

            //Get document count
            double D = docs.Length - 1; //excluding the query

            double IDF = 0.0;

            if (df > 0)
            {
                IDF = Math.Log(D / df);
            }

            return IDF;
        }

        /// <summary>Cosine of the angle between two vectors.</summary>
        /// <returns>
        /// dot(v1, v2) / (|v1| * |v2|), or 0 when either vector has zero length.
        /// </returns>
        public static double cosinetheta(double[] v1, double[] v2)
        {
            double lengthV1 = vectorlength(v1);
            double lengthV2 = vectorlength(v2);

            // Guard both lengths: with only v1 checked, an all-zero document vector produced
            // 0/0 = NaN, and NaN passes a "!= 0" filter.
            if (lengthV1 == 0 || lengthV2 == 0)
            {
                return 0;
            }

            return dotproduct(v1, v2) / (lengthV1 * lengthV2);
        }
    }
}
Posted
Updated 18-May-14 20:35pm
v2

1 solution

Seriously? You slap a pile of messy rubbish on a website and you expect us to sort it out for you? When you can't even be bothered to get rid of redundant code so we can see what is there? Or comment your code to make it simple for us, or even give us what we would need to run it?


We do not do your homework: it is set for a reason. It is there so that you think about what you have been told, and try to understand it. It is also there so that your tutor can identify areas where you are weak, and focus more attention on remedial action.

Try it yourself, or learn the Magic Words: "Do you want fries with that?"
 
Share this answer
 
Comments
Kornfeld Eliyahu Peter 19-May-14 2:50am    
Grab a cup of coffee! You seem to be a bit angry this morning...
OriginalGriff 19-May-14 3:02am    
You're probably right...:laugh:
Telstra 19-May-14 3:21am    
I think @ayesha04 is new here. You should not reply with an answer like this; after such an experience she may never come back to this website.
Richard MacCutchan 19-May-14 3:43am    
Looks like exactly the right answer to me. There are too many people posting rubbish like this and expecting someone else to do their work for them.
Telstra 19-May-14 3:57am    
You are right, Richard. Our site is for that only. But if you look at the guidelines for submitting a solution, they clearly say, "Let's work to help developers, not make them feel stupid." Not everyone in this world has the same level of understanding.
Thanks

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900