|
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.Globalization;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using System.Xml;
using System.Xml.XPath;
namespace FileContextIndexer {
/// <summary>
/// @Author: Slava Khristich
///
/// Index large xml files into indexing file. Read node from index position.
///
/// Workflow: the chosen XML file is copied next to the executable, the copy is scanned
/// for every start tag of the node named in txtNode, and the integer selected by the
/// XPath in txtCondition is stored together with the node's byte offset. The offsets are
/// written to "&lt;copy&gt;.idx" and the nodes, ordered by that integer, to "&lt;copy&gt;.sorted".
/// The file is treated as UTF-8 throughout.
/// </summary>
public partial class Form1 : Form {
    //Read-buffer size (bytes) used for the sequential scan of the working copy
    private const int ScanBufferSize = 64 * 1024;

    string filePath = string.Empty;
    string wokingCopy = string.Empty;
    string xmlIndexFile = string.Empty;
    //XPath from txtCondition, captured once per indexing run instead of once per node
    string conditionXPath = string.Empty;
    //Index file writer; only open while the working copy is being scanned
    StreamWriter sw = null;
    //Byte offsets of the indexed nodes grouped by ID and kept sorted by ID.
    //A list per ID keeps nodes that share an ID (in document order) instead of failing on them.
    SortedDictionary<int, List<long>> indexList = new SortedDictionary<int, List<long>>();
    //Candidates of the current run that could not be indexed (malformed, no ID, non-numeric ID)
    int skippedNodes = 0;

    public Form1() {
        InitializeComponent();
    }

    /// <summary>
    /// Navigate to XML document for import and copy it into the application directory.
    /// </summary>
    /// <returns>true when a file was chosen and copied; false when the dialog was cancelled.</returns>
    private bool ImportXMLDoc() {
        //openFileDlg is declared outside this file; reuse it when it already exists
        //so that no undisposed dialog is left behind on every click.
        if (openFileDlg == null) {
            openFileDlg = new OpenFileDialog();
        }
        openFileDlg.Filter = "XML files|*.xml|All files|*.*";
        if (openFileDlg.ShowDialog(this) != DialogResult.OK || string.IsNullOrEmpty(openFileDlg.FileName)) {
            return false;
        }
        filePath = openFileDlg.FileName;
        MakeCopy();
        btnIndex.Enabled = true;
        return true;
    }

    /// <summary>
    /// Indexing process. Specify your node of interest in txtNode field and the XPath
    /// that selects its integer ID in txtCondition.
    /// </summary>
    /// <param name="sender">Button that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void btnIndex_Click(object sender, EventArgs e) {
        string nodeName = txtNode.Text.Trim();
        string condition = txtCondition.Text.Trim();
        //Reject bad input before the (possibly very large) file is copied
        if (!CheckInputs(nodeName, condition)) {
            return;
        }
        try {
            if (!ImportXMLDoc()) {
                return; //dialog cancelled, nothing to index
            }
            //Stopwatch is unaffected by midnight roll-over or clock changes
            Stopwatch timer = Stopwatch.StartNew();
            Console.Out.WriteLine("Start time:" + DateTime.Now.ToString());
            conditionXPath = condition;
            IndexWorkingCopy(nodeName);
            WriteSortedDocument();
            Console.Out.WriteLine("End time:" + DateTime.Now.ToString());
            timer.Stop();
            string summary = "Process time: " + timer.Elapsed.TotalSeconds + " sec";
            if (skippedNodes > 0) {
                summary += " (" + skippedNodes + " node(s) skipped)";
            }
            this.Text = summary;
            btnReadIndexed.Enabled = true;
        } catch (IOException ex) {
            ReportProblem("The file could not be processed: " + ex.Message);
        } catch (UnauthorizedAccessException ex) {
            ReportProblem("The file could not be processed: " + ex.Message);
        } catch (XmlException ex) {
            ReportProblem("The document could not be read: " + ex.Message);
        } catch (XPathException ex) {
            ReportProblem("The condition could not be evaluated: " + ex.Message);
        }
    }

    /// <summary>
    /// Check the node name and the XPath condition typed by the user and tell the user what is wrong.
    /// </summary>
    /// <param name="nodeName">Trimmed content of txtNode.</param>
    /// <param name="condition">Trimmed content of txtCondition.</param>
    /// <returns>true when both values can be used for indexing.</returns>
    private bool CheckInputs(string nodeName, string condition) {
        if (nodeName.Length == 0) {
            ReportProblem("Enter the name of the node to index.");
            return false;
        }
        try {
            //The name is also used to build the root element of the sorted document
            XmlConvert.VerifyName(nodeName);
        } catch (XmlException) {
            ReportProblem("'" + nodeName + "' is not a valid XML element name.");
            return false;
        }
        if (condition.Length == 0) {
            ReportProblem("Enter the XPath expression that selects the node ID.");
            return false;
        }
        try {
            XPathExpression.Compile(condition);
        } catch (XPathException ex) {
            ReportProblem("The condition is not a valid XPath expression: " + ex.Message);
            return false;
        } catch (ArgumentException ex) {
            ReportProblem("The condition is not a valid XPath expression: " + ex.Message);
            return false;
        }
        return true;
    }

    /// <summary>
    /// Scan the working copy for start tags of the given node and index every match.
    /// Offsets are counted on the raw bytes, so they stay exact for LF or CRLF line
    /// endings, for a leading byte-order mark and for non-ASCII text.
    /// </summary>
    /// <param name="nodeName">Element name to look for (compared case-insensitively).</param>
    private void IndexWorkingCopy(string nodeName) {
        //A previous run must not leak its IDs/offsets into this one
        indexList.Clear();
        skippedNodes = 0;
        //Escape the name and require a delimiter after it so that "item" does not match "<items"
        Regex rx = new Regex("<" + Regex.Escape(nodeName) + @"(?=[\s/>])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
        using (FileStream scan = new FileStream(wokingCopy, FileMode.Open, FileAccess.Read, FileShare.Read, ScanBufferSize))
        using (FileStream fs = new FileStream(wokingCopy, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (MemoryStream rawLine = new MemoryStream()) {
            sw = new StreamWriter(xmlIndexFile, false, Encoding.UTF8);
            try {
                long lineStart = 0;
                while (ReadRawLine(scan, rawLine)) {
                    int byteCount = (int)rawLine.Length;
                    //The terminator stays in the text so a tag name at the end of a line still matches
                    string line = Encoding.UTF8.GetString(rawLine.GetBuffer(), 0, byteCount);
                    MatchCollection hits = rx.Matches(line);
                    if (hits.Count > 0) {
                        char[] chars = line.ToCharArray();
                        int charsDone = 0;
                        long byteOffset = lineStart;
                        foreach (Match hit in hits) {
                            //Convert the char index of the match into a byte offset, one segment at a time
                            byteOffset += Encoding.UTF8.GetByteCount(chars, charsDone, hit.Index - charsDone);
                            charsDone = hit.Index;
                            ValidateXPathCondition(fs, byteOffset);
                        }
                    }
                    lineStart += byteCount;
                }
            } finally {
                //Flushes the index file and releases it even when the scan fails
                sw.Close();
                sw = null;
            }
        }
    }

    /// <summary>
    /// Read the bytes of one line, including its terminating '\n' when present.
    /// </summary>
    /// <param name="input">Stream to read from.</param>
    /// <param name="line">Receives the bytes of the line; previous content is discarded.</param>
    /// <returns>false when the end of the stream was reached and nothing was read.</returns>
    private static bool ReadRawLine(Stream input, MemoryStream line) {
        line.SetLength(0);
        int b;
        while ((b = input.ReadByte()) != -1) {
            line.WriteByte((byte)b);
            //0x0A never occurs inside a multi-byte UTF-8 sequence, so this split is safe
            if (b == '\n') {
                break;
            }
        }
        return line.Length > 0;
    }

    /// <summary>
    /// Create an output document with selected indexed nodes, ordered by ID.
    /// </summary>
    private void WriteSortedDocument() {
        //Invariant casing keeps the root name independent of the machine's culture
        string rootName = txtNode.Text.Trim().ToUpperInvariant() + "S";
        using (FileStream fs = new FileStream(wokingCopy, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (StreamWriter wr = new StreamWriter(wokingCopy + ".sorted", false)) {
            wr.WriteLine("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
            wr.WriteLine("<" + rootName + ">");
            foreach (KeyValuePair<int, List<long>> entry in indexList) {
                foreach (long offset in entry.Value) {
                    wr.WriteLine(LoadNodeAt(fs, offset).InnerXml);
                }
            }
            wr.WriteLine("</" + rootName + ">");
        }
    }

    /// <summary>
    /// Write value and index in original file into the indexing file and remember them for sorting.
    /// </summary>
    /// <param name="value">Text of the node ID; must be an integer.</param>
    /// <param name="startIndex">Byte offset of the node in the working copy.</param>
    /// <returns>true when the node was indexed; false when the value is not an integer.</returns>
    private bool SaveMatchedIndex(string value, long startIndex) {
        int id;
        if (!int.TryParse(value, NumberStyles.Integer, CultureInfo.InvariantCulture, out id)) {
            return false;
        }
        sw.WriteLine(id.ToString(CultureInfo.InvariantCulture) + "\t" + startIndex.ToString(CultureInfo.InvariantCulture));
        //Add new element to sorted collection. Sorting is by key
        List<long> offsets;
        if (!indexList.TryGetValue(id, out offsets)) {
            offsets = new List<long>(1);
            indexList.Add(id, offsets);
        }
        offsets.Add(startIndex);
        return true;
    }

    /// <summary>
    /// Validate condition for output file: load the node at the given position, evaluate
    /// the XPath condition against it and index the node when an integer ID is found.
    /// </summary>
    /// <param name="fs">Original File stream</param>
    /// <param name="startIndex">Position in the file</param>
    /// <returns>The ID text of the indexed node, or null when the node was skipped.</returns>
    private string ValidateXPathCondition(FileStream fs, long startIndex) {
        XmlDocument d;
        try {
            d = LoadNodeAt(fs, startIndex);
        } catch (XmlException) {
            //The text search also finds look-alikes (e.g. inside comments); count them and go on
            skippedNodes++;
            return null;
        }
        //Extension point: add any further condition on the loaded node here before it is indexed.
        XmlNode hit = d.SelectSingleNode(conditionXPath);
        if (hit == null) {
            skippedNodes++;
            return null;
        }
        //Value is null for element nodes; fall back to their text content
        string value = hit.Value ?? hit.InnerText;
        if (!SaveMatchedIndex(value, startIndex)) {
            skippedNodes++;
            return null;
        }
        return value;
    }

    /// <summary>
    /// Parse the element that starts at the given byte offset into its own document.
    /// The stream is left open for further reads.
    /// </summary>
    /// <param name="fs">Open stream over the working copy.</param>
    /// <param name="offset">Byte offset of the element's opening '&lt;'.</param>
    /// <returns>A document whose root is the element found at the offset.</returns>
    /// <exception cref="XmlException">No well-formed element starts at the offset.</exception>
    private static XmlDocument LoadNodeAt(FileStream fs, long offset) {
        fs.Seek(offset, SeekOrigin.Begin);
        using (XmlReader reader = XmlReader.Create(fs)) {
            reader.MoveToContent();
            if (reader.NodeType != XmlNodeType.Element) {
                throw new XmlException("No XML element starts at byte offset " + offset.ToString(CultureInfo.InvariantCulture) + ".");
            }
            XmlDocument d = new XmlDocument();
            //ReadSubtree stops at the matching end tag, so the rest of the file is never parsed
            d.Load(reader.ReadSubtree());
            return d;
        }
    }

    /// <summary>
    /// Make a copy of document you want to process
    /// </summary>
    private void MakeCopy() {
        string curDir = Application.StartupPath;
        FileInfo fi = new FileInfo(filePath);
        string copyPath = Path.Combine(curDir, Guid.NewGuid() + "_" + fi.Name);
        this.Text = "File size is:" + fi.Length / 1000000 + " - MB";
        File.Copy(filePath, copyPath, true);
        //Publish the paths only after the copy succeeded
        wokingCopy = copyPath;
        xmlIndexFile = copyPath + ".idx";
        this.Text += " Copied to:" + curDir + "/";
    }

    /// <summary>
    /// Show a warning to the user.
    /// </summary>
    /// <param name="message">Text to display.</param>
    private void ReportProblem(string message) {
        MessageBox.Show(this, message, "XML indexer", MessageBoxButtons.OK, MessageBoxIcon.Warning);
    }

    /// <summary>
    /// Testing function to output node from index: shows the node that starts at the
    /// byte position typed into txtPos.
    /// </summary>
    /// <param name="sender">Button that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void btnReadIndexed_Click(object sender, EventArgs e) {
        long startIdx;
        if (!long.TryParse(txtPos.Text.Trim(), NumberStyles.Integer, CultureInfo.InvariantCulture, out startIdx) || startIdx < 0) {
            ReportProblem("Enter the node position as a non-negative whole number of bytes.");
            return;
        }
        if (string.IsNullOrEmpty(wokingCopy) || !File.Exists(wokingCopy)) {
            ReportProblem("Index a document first.");
            return;
        }
        try {
            using (FileStream fs = new FileStream(wokingCopy, FileMode.Open, FileAccess.Read, FileShare.Read)) {
                if (startIdx >= fs.Length) {
                    ReportProblem("The position lies beyond the end of the document.");
                    return;
                }
                tbOutput.Text = LoadNodeAt(fs, startIdx).InnerXml;
            }
        } catch (XmlException ex) {
            ReportProblem("No node could be read at that position: " + ex.Message);
        } catch (IOException ex) {
            ReportProblem("The document could not be read: " + ex.Message);
        }
    }
}
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.