Click here to Skip to main content
15,881,248 members
Articles / Programming Languages / XML

Large XML Files Processing and Indexing

Rate me:
Please Sign up or sign in to vote.
4.88/5 (10 votes)
14 Nov 2008 · CPOL · 3 min read · 68.7K · 1.4K · 47
Index large XML file for fast access. Use IO and XMLReader for parsing with Regex.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Xml;
using System.Xml.XPath;
using System.Text.RegularExpressions;

namespace FileContextIndexer {
    /// <summary>
    /// @Author: Slava Khristich
    /// 
    /// Indexes large XML files into a separate index file, then reads individual nodes back by their indexed byte position.
    /// 
    /// </summary>
    public partial class Form1 : Form {
        // Path of the XML document chosen by the user; empty when no file is selected.
        string filePath = string.Empty;
        // Path of the private copy of the document that is scanned and read back.
        string wokingCopy = string.Empty;
        // Path of the tab-separated "key<TAB>byte offset" index file.
        string xmlIndexFile = string.Empty;
        //Output document writer
        StreamWriter sw = null;
        //Sorted list of IDs from XML document, mapped to the byte offset of their node.
        //Offsets are 64-bit so documents larger than 2 GB can be indexed.
        SortedList<int, long> indexList = new SortedList<int, long>();

        public Form1() {
            InitializeComponent();
        }

        /// <summary>
        /// Navigate to XML document for import. When the dialog is cancelled
        /// <c>filePath</c> is left empty and no copy is made.
        /// </summary>
        private void ImportXMLDoc() {
            filePath = string.Empty;
            openFileDlg = new OpenFileDialog();
            openFileDlg.Filter = "XML files|*.xml|All files|*.*";
            if (openFileDlg.ShowDialog() != DialogResult.OK || string.IsNullOrEmpty(openFileDlg.FileName)) {
                return;
            }
            filePath = openFileDlg.FileName;
            MakeCopy();
            btnIndex.Enabled = true;
        }

        /// <summary>
        /// Indexing process. Specify your node of interest in txtNode field.
        /// Prompts for a document, indexes every occurrence of the node, then
        /// writes the ".sorted" output document.
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        private void btnIndex_Click(object sender, EventArgs e) {
            string parseText = txtNode.Text.Trim();
            if (parseText.Length == 0) {
                // An empty name would turn the pattern into a bare "<" and match every tag.
                MessageBox.Show(this, "Enter the name of the node to index.");
                return;
            }

            ImportXMLDoc();
            if (string.IsNullOrEmpty(filePath)) {
                return; // the open-file dialog was cancelled
            }

            // Start from a clean index so that re-running does not collide with earlier keys.
            indexList.Clear();

            System.Diagnostics.Stopwatch watch = System.Diagnostics.Stopwatch.StartNew();
            Console.Out.WriteLine("Start time:" + DateTime.Now.ToString());
            try {
                IndexWorkingCopy(parseText);
            } finally {
                // The index writer is opened by MakeCopy; release it even when indexing fails.
                if (sw != null) {
                    sw.Dispose();
                    sw = null;
                }
            }

            WriteSortedDocument();

            Console.Out.WriteLine("End time:" + DateTime.Now.ToString());
            watch.Stop();
            this.Text = "Process time: " + watch.Elapsed.TotalSeconds + " sec";
            btnReadIndexed.Enabled = true;
        }

        /// <summary>
        /// Scan the working copy line by line and index every start tag of the given node.
        /// The file is read as raw bytes so that the recorded offsets are exact byte
        /// positions regardless of line-ending style, a UTF-8 byte order mark or
        /// multi-byte characters.
        /// </summary>
        /// <param name="nodeName">Element name to look for (matched case-insensitively).</param>
        private void IndexWorkingCopy(string nodeName) {
            // Escape the name (XML names may contain '.') and require a delimiter after it
            // so that "item" does not also match "items".
            Regex rx = new Regex(@"<" + Regex.Escape(nodeName) + @"(?=[\s/>]|$)",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            using (FileStream scan = new FileStream(wokingCopy, FileMode.Open, FileAccess.Read, FileShare.Read, 64 * 1024))
            using (FileStream fs = new FileStream(wokingCopy, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (MemoryStream rawLine = new MemoryStream()) {
                long lineStart = 0;
                while (ReadRawLine(scan, rawLine)) {
                    string line = Encoding.UTF8.GetString(rawLine.GetBuffer(), 0, (int)rawLine.Length);

                    // Convert each match's character index into a byte offset incrementally;
                    // matches are returned in increasing index order.
                    int charCursor = 0;
                    long byteCursor = lineStart;
                    foreach (Match mt in rx.Matches(line)) {
                        byteCursor += Encoding.UTF8.GetByteCount(line.Substring(charCursor, mt.Index - charCursor));
                        charCursor = mt.Index;
                        ValidateXPathCondition(fs, byteCursor);
                    }
                    lineStart += rawLine.Length;
                }
            }
        }

        /// <summary>
        /// Read bytes from <paramref name="input"/> up to and including the next line feed
        /// (or the end of the stream) into <paramref name="buffer"/>, replacing its content.
        /// </summary>
        /// <param name="input">Stream to read from.</param>
        /// <param name="buffer">Receives the raw bytes of the line, terminator included.</param>
        /// <returns>false when the stream is exhausted and nothing was read.</returns>
        private static bool ReadRawLine(Stream input, MemoryStream buffer) {
            buffer.SetLength(0);
            int b;
            while ((b = input.ReadByte()) >= 0) {
                buffer.WriteByte((byte)b);
                if (b == '\n') {
                    break;
                }
            }
            return buffer.Length > 0;
        }

        /// <summary>
        /// Load the element that starts at the given byte offset into its own document.
        /// </summary>
        /// <param name="fs">Open stream over the working copy.</param>
        /// <param name="offset">Byte offset of the element's opening '&lt;'.</param>
        /// <returns>A document whose root is the element found at the offset.</returns>
        private static XmlDocument ReadNodeAt(FileStream fs, long offset) {
            fs.Seek(offset, SeekOrigin.Begin);
            using (XmlReader reader = XmlReader.Create(fs)) {
                reader.MoveToContent();
                XmlDocument d = new XmlDocument();
                using (XmlReader subtree = reader.ReadSubtree()) {
                    d.Load(subtree);
                }
                return d;
            }
        }

        /// <summary>
        /// Create an output document with selected indexed nodes, ordered by key.
        /// </summary>
        private void WriteSortedDocument() {
            string rootName = txtNode.Text.Trim().ToUpperInvariant() + "S";
            using (FileStream fs = new FileStream(wokingCopy, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (StreamWriter wr = new StreamWriter(wokingCopy + ".sorted", false)) {
                wr.WriteLine("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
                wr.WriteLine("<" + rootName + ">");
                foreach (KeyValuePair<int, long> entry in indexList) {
                    wr.WriteLine(ReadNodeAt(fs, entry.Value).InnerXml);
                }
                wr.WriteLine("</" + rootName + ">");
            }
        }

        /// <summary>
        /// Write value and index in original file into the indexing file.
        /// Values that are not integers are skipped; when a key occurs more than once
        /// only its first offset is kept in the sorted list.
        /// </summary>
        /// <param name="value">Key text extracted from the node.</param>
        /// <param name="startIndex">Byte offset of the node in the working copy.</param>
        private void SaveMatchedIndex(string value, long startIndex) {
            int key;
            if (!Int32.TryParse(value, System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture, out key)) {
                Console.Out.WriteLine("Skipping node at byte " + startIndex + ": key is not an integer.");
                return;
            }
            sw.WriteLine(value.Trim() + "\t" + startIndex);
            //Add new element to sorted list. Sorting is by key
            if (!indexList.ContainsKey(key)) {
                indexList.Add(key, startIndex);
            }
        }

        /// <summary>
        /// Validate condition for output file and, when it holds, record the node's key
        /// (selected by the XPath in txtCondition) together with its position.
        /// </summary>
        /// <param name="fs">Original File stream</param>
        /// <param name="startIndex">Position in the file</param>
        /// <returns>Always null.</returns>
        private string ValidateXPathCondition(FileStream fs, long startIndex) {
            XmlDocument d;
            try {
                d = ReadNodeAt(fs, startIndex);
            } catch (XmlException ex) {
                // The text pattern can hit something that is not a well-formed element;
                // skip that match rather than abort the whole run.
                Console.Out.WriteLine("Skipping match at byte " + startIndex + ": " + ex.Message);
                return null;
            }

            //TODO: Enter any condition you like for validation of this node to write to output file
            //by returning early here when the node should not be indexed.

            //In this case we want to get value of ID attribute of this node
            XmlNode keyNode = d.SelectSingleNode(txtCondition.Text.Trim());
            if (keyNode == null) {
                return null; // the XPath selected nothing in this node
            }
            // Value is null for element nodes; fall back to their text content.
            string value = keyNode.Value != null ? keyNode.Value : keyNode.InnerText;
            SaveMatchedIndex(value, startIndex);
            return null;
        }

        /// <summary>
        /// Make a copy of document you want to process and open a fresh index file for it.
        /// </summary>
        private void MakeCopy() {
            string curDir = Application.StartupPath;
            FileInfo fi = new FileInfo(filePath);
            wokingCopy = Path.Combine(curDir, Guid.NewGuid() + "_" + fi.Name);
            this.Text = "File size is:" + fi.Length / 1000000 + " - MB";
            File.Copy(filePath, wokingCopy, true);
            xmlIndexFile = wokingCopy + ".idx";
            this.Text += " Copied to:" + curDir;
            // Release a writer left over from an earlier import before replacing it.
            if (sw != null) {
                sw.Dispose();
            }
            sw = new StreamWriter(xmlIndexFile, false, Encoding.UTF8);
        }

        /// <summary>
        /// Testing function to output node from index. Reads the byte position from
        /// txtPos and shows the node found there in tbOutput.
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        private void btnReadIndexed_Click(object sender, EventArgs e) {
            long startIdx;
            if (!Int64.TryParse(txtPos.Text.Trim(), System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture, out startIdx) || startIdx < 0) {
                tbOutput.Text = "Enter a non-negative byte position taken from the index file.";
                return;
            }

            using (FileStream fs = new FileStream(wokingCopy, FileMode.Open, FileAccess.Read, FileShare.Read)) {
                if (startIdx >= fs.Length) {
                    tbOutput.Text = "Position " + startIdx + " is past the end of the file.";
                    return;
                }
                try {
                    tbOutput.Text = ReadNodeAt(fs, startIdx).InnerXml;
                } catch (XmlException ex) {
                    tbOutput.Text = "No readable XML node at position " + startIdx + ": " + ex.Message;
                } catch (InvalidOperationException ex) {
                    // ReadSubtree requires the reader to be positioned on an element.
                    tbOutput.Text = "No XML element starts at position " + startIdx + ": " + ex.Message;
                }
            }
        }


    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer (Senior) Tateeda Media Networks
United States United States

Software development is my passion, as is photography.


If you've got a sec, stop by to see my photography work at http://sk68.com


Tateeda Media Network

Comments and Discussions