Click here to Skip to main content
13,139,569 members (53,967 online)
Click here to Skip to main content

Stats

13.8K views
7 bookmarked
Posted 26 Apr 2012

Implementation of XML Information Retrieval by LINQ

, 31 May 2012
We describe every concept in an XML corpus by its full set of words; based on the tags, the term frequencies and inverse document frequencies of the words in each document are used.
AjaxControlToolkitDll
AjaxControlToolkit.dll
IRProj
IRProj
IRProj.csproj.user
Properties
IRProj.suo
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Xml.Linq;
using System.Text.RegularExpressions;
using System.IO;

namespace IRProj
{
    /// <summary>
    /// Code-behind for the IR statistics page. On every page load it reads
    /// <c>~/IR.xml</c>, tokenizes the text of each child element of the root,
    /// drops stop words, and binds corpus-frequency and document-frequency
    /// tables to the page's list views and labels.
    /// </summary>
    public partial class IRForm : System.Web.UI.Page
    {
        /// <summary>Number of entries shown in the "most frequent" lists.</summary>
        private const int TopListSize = 50;

        /// <summary>
        /// Builds the word statistics from IR.xml and binds them to the page controls.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            XElement xIr = XElement.Load(Server.MapPath("~\\IR.xml"));

            // NOTE(review): this counts every descendant element, while the loop
            // below treats only the direct children (Elements()) as documents.
            // The two agree only when documents have no nested tags - confirm
            // against the actual shape of IR.xml before changing.
            lblTotalDocs.Text = Convert.ToString(xIr.Descendants().Count());

            // Total number of non-stop-word tokens seen in the whole corpus.
            int totalWords = 0;
            // word -> number of occurrences across all documents.
            Dictionary<string, int> corpusFreq = new Dictionary<string, int>();
            // word -> number of distinct documents (by ID) containing it.
            Dictionary<string, int> docFreq = new Dictionary<string, int>();
            // word -> set of document IDs already counted for that word.
            Dictionary<string, HashSet<string>> docsWord = new Dictionary<string, HashSet<string>>();

            // \w+ matches runs of word characters: Unicode letters, digits and
            // the underscore (broader than POSIX [:alnum:]).
            Regex regex = new Regex("\\w+");

            foreach (XElement el in xIr.Elements())
            {
                // Strip apostrophes so "don't" tokenizes as "dont" rather than
                // "don" + "t". Work on a copy; the XML tree itself is not modified.
                string text = el.Value.Replace("'", "");
                string docId = el.Attribute("ID").Value;

                foreach (Match match in regex.Matches(text))
                {
                    // Culture-invariant lower-casing keeps dictionary keys stable
                    // regardless of the server's current culture.
                    string word = match.Value.ToLowerInvariant();
                    if (StopWordsHandler.IsStopword(word))
                    {
                        continue;
                    }

                    totalWords++;

                    int occurrences;
                    corpusFreq.TryGetValue(word, out occurrences);
                    corpusFreq[word] = occurrences + 1;

                    HashSet<string> docsWithWord;
                    if (!docsWord.TryGetValue(word, out docsWithWord))
                    {
                        docsWithWord = new HashSet<string>();
                        docsWord.Add(word, docsWithWord);
                    }

                    // HashSet.Add returns true only the first time this document
                    // is seen for the word, so each document is counted once.
                    if (docsWithWord.Add(docId))
                    {
                        int documents;
                        docFreq.TryGetValue(word, out documents);
                        docFreq[word] = documents + 1;
                    }
                }
            }

            lblTotalWords.Text = Convert.ToString(totalWords);
            lblDistinctWords.Text = Convert.ToString(corpusFreq.Count);

            lvWords.DataSource = corpusFreq;
            lvWords.DataBind();

            lvDocFreq.DataSource = docFreq;
            lvDocFreq.DataBind();

            // Sort once and materialize; the rankings are read several times below.
            List<KeyValuePair<string, int>> corpusFreqSorted =
                corpusFreq.OrderByDescending(p => p.Value).ToList();
            lvMostFreq.DataSource = corpusFreqSorted.Take(TopListSize).ToList();
            lvMostFreq.DataBind();

            List<KeyValuePair<string, int>> docFreqSorted =
                docFreq.OrderByDescending(p => p.Value).ToList();
            lvMostFreqDocs.DataSource = docFreqSorted.Take(TopListSize).ToList();
            lvMostFreqDocs.DataBind();

            // The "Fifth" labels show the word at rank 50 (the control names are
            // declared in the designer file and are kept as they are).
            lblFifthFreq.Text = KeyAtRank(corpusFreqSorted, 50);
            lblHundredthFreq.Text = KeyAtRank(corpusFreqSorted, 100);
            lbl2HundredthFreq.Text = KeyAtRank(corpusFreqSorted, 200);

            lblFifthFreqCol.Text = KeyAtRank(docFreqSorted, 50);
            lblHundredthFreqCol.Text = KeyAtRank(docFreqSorted, 100);
            lbl2HundredthFreqCol.Text = KeyAtRank(docFreqSorted, 200);

            // Words that appear in exactly one document, with their share of the
            // vocabulary as an integer percentage (0 when the vocabulary is empty).
            int numWordsOccurOnly = docFreq.Count(p => p.Value == 1);
            int percentOfVocabulary = corpusFreq.Count == 0
                ? 0
                : (numWordsOccurOnly * 100) / corpusFreq.Count;
            lblOneDocWords.Text = Convert.ToString(numWordsOccurOnly) + "((" + percentOfVocabulary + " %))";
        }

        /// <summary>
        /// Returns the key at the given 1-based rank of a ranked list. When the
        /// list is shorter than <paramref name="rank"/>, the last key is returned
        /// (same result as <c>Take(rank).Last()</c>); when the list is empty, an
        /// empty string is returned instead of throwing.
        /// </summary>
        /// <param name="ranked">Entries ordered from highest to lowest frequency.</param>
        /// <param name="rank">1-based rank to look up.</param>
        /// <returns>The word at that rank, the last word, or an empty string.</returns>
        private static string KeyAtRank(IList<KeyValuePair<string, int>> ranked, int rank)
        {
            if (ranked.Count == 0)
            {
                return string.Empty;
            }

            return ranked[Math.Min(rank, ranked.Count) - 1].Key;
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Share

About the Author

farzaneh ansari
Web Developer at PascalSystem, located in Isfahan, Iran
Iran (Islamic Republic of) Iran (Islamic Republic of)
I graduated from Esfahan University (BS in Applied Mathematics)
Master of Computer Science at Saarland University

You may also be interested in...

Permalink | Advertise | Privacy | Terms of Use | Mobile
Web04 | 2.8.170915.1 | Last Updated 31 May 2012
Article Copyright 2012 by farzaneh ansari
Everything else Copyright © CodeProject, 1999-2017
Layout: fixed | fluid