Click here to Skip to main content
Click here to Skip to main content
Add your own
alternative version

Implementation of XML Information Retrieval by LINQ

, 31 May 2012 CPOL
We describe all concepts of an XML corpus by its full set of words. Based on the tags, term frequencies, inverse document frequencies, and the words from the given document are used.
AjaxControlToolkitDll.zip
AjaxControlToolkitDll
AjaxControlToolkit.dll
IRProj.zip
IRProj
IRProj
IRProj.csproj.user
Properties
IRProj.suo
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Xml.Linq;
using System.Text.RegularExpressions;
using System.IO;

namespace IRProj
{
    /// <summary>
    /// Code-behind for the IR statistics page. Loads <c>IR.xml</c>, tokenizes the text of
    /// each top-level element (treated as one document identified by its <c>ID</c>
    /// attribute) and binds corpus-frequency and document-frequency tables to the page.
    /// </summary>
    public partial class IRForm : System.Web.UI.Page
    {
        // Matches runs of "word" characters. Note that \w covers Unicode letters, digits
        // and the underscore, not only 0-9, A-Z and a-z.
        private static readonly Regex WordRegex = new Regex("\\w+");

        /// <summary>
        /// Builds the word statistics for the corpus and binds them to the page controls.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            XElement xIr = XElement.Load(Server.MapPath("~\\IR.xml"));

            // NOTE(review): this counts every descendant element, while the loop below only
            // visits direct children; the two differ if documents contain nested elements.
            lblTotalDocs.Text = Convert.ToString(xIr.Descendants().Count());

            // word -> number of occurrences in the whole corpus
            Dictionary<string, int> corpusFreq = new Dictionary<string, int>();
            // word -> number of distinct documents containing the word
            Dictionary<string, int> docFreq = new Dictionary<string, int>();
            // word -> IDs of the documents containing the word (set gives O(1) membership)
            Dictionary<string, HashSet<string>> docsWord = new Dictionary<string, HashSet<string>>();
            int totalWords = 0;

            foreach (XElement el in xIr.Elements())
            {
                // The explicit cast yields null instead of throwing when the attribute is missing.
                string docId = (string)el.Attribute("ID");
                if (docId == null)
                {
                    // Without an ID the document cannot be tracked in the per-document counts.
                    continue;
                }

                // Strip apostrophes on a local copy so the loaded XML tree is left untouched.
                string text = el.Value.Replace("'", "");

                foreach (Match match in WordRegex.Matches(text))
                {
                    string word = match.Value.ToLower();
                    if (StopWordsHandler.IsStopword(word))
                    {
                        continue;
                    }

                    int occurrences;
                    corpusFreq.TryGetValue(word, out occurrences);
                    corpusFreq[word] = occurrences + 1;

                    HashSet<string> docsWithWord;
                    if (!docsWord.TryGetValue(word, out docsWithWord))
                    {
                        docsWithWord = new HashSet<string>();
                        docsWord.Add(word, docsWithWord);
                    }

                    // Add returns true only the first time this document is seen for the word.
                    if (docsWithWord.Add(docId))
                    {
                        int documents;
                        docFreq.TryGetValue(word, out documents);
                        docFreq[word] = documents + 1;
                    }

                    totalWords++;
                }
            }

            lblTotalWords.Text = Convert.ToString(totalWords);
            lblDistinctWords.Text = Convert.ToString(corpusFreq.Count);

            lvWords.DataSource = corpusFreq;
            lvWords.DataBind();

            lvDocFreq.DataSource = docFreq;
            lvDocFreq.DataBind();

            // Materialize each ordering once; it is read several times below.
            List<KeyValuePair<string, int>> corpusFreqSorted = corpusFreq.OrderByDescending(p => p.Value).ToList();
            lvMostFreq.DataSource = corpusFreqSorted.Take(50);
            lvMostFreq.DataBind();

            List<KeyValuePair<string, int>> docFreqSorted = docFreq.OrderByDescending(p => p.Value).ToList();
            lvMostFreqDocs.DataSource = docFreqSorted.Take(50);
            lvMostFreqDocs.DataBind();

            lblFifthFreq.Text = KeyAtRank(corpusFreqSorted, 50);
            lblHundredthFreq.Text = KeyAtRank(corpusFreqSorted, 100);
            lbl2HundredthFreq.Text = KeyAtRank(corpusFreqSorted, 200);

            lblFifthFreqCol.Text = KeyAtRank(docFreqSorted, 50);
            lblHundredthFreqCol.Text = KeyAtRank(docFreqSorted, 100);
            lbl2HundredthFreqCol.Text = KeyAtRank(docFreqSorted, 200);

            int numWordsOccurOnly = docFreq.Count(p => p.Value == 1);
            // Integer percentage; guarded so an empty corpus does not divide by zero.
            int percentOneDoc = corpusFreq.Count == 0 ? 0 : (numWordsOccurOnly * 100) / corpusFreq.Count;
            lblOneDocWords.Text = Convert.ToString(numWordsOccurOnly) + "((" + percentOneDoc + " %))";
        }

        /// <summary>
        /// Returns the key at the given 1-based rank of a sorted list, falling back to the
        /// last entry when the list is shorter than <paramref name="rank"/>.
        /// </summary>
        /// <param name="sorted">Entries already ordered by descending frequency.</param>
        /// <param name="rank">1-based rank to look up.</param>
        /// <returns>The key at that rank, or an empty string when the list is empty.</returns>
        private static string KeyAtRank(IList<KeyValuePair<string, int>> sorted, int rank)
        {
            if (sorted.Count == 0)
            {
                return string.Empty;
            }

            int index = Math.Min(rank, sorted.Count) - 1;
            return sorted[index].Key;
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Share

About the Author

farzaneh ansari
Web Developer PascalSystem,is located in Iran-Isfahan
Iran (Islamic Republic Of) Iran (Islamic Republic Of)
I graduated from Esfahan University (BS Applied Mathematics)
Master of Computer Science at Saarland University

| Advertise | Privacy | Terms of Use | Mobile
Web02 | 2.8.1411023.1 | Last Updated 31 May 2012
Article Copyright 2012 by farzaneh ansari
Everything else Copyright © CodeProject, 1999-2014
Layout: fixed | fluid