Hi, I'm trying to create a program which finds whether there are 3 or more consecutive nodes of a particular type and shows them in a console.
For example, if my file contains consecutive nodes in the format
<xref ref-type="bibr" rid="ref11">[11]</xref>, <xref ref-type="bibr" rid="ref12">[12]</xref>, <xref ref-type="bibr" rid="ref13">[13]</xref>
then this is printed on the console, however
<xref ref-type="bibr" rid="ref11">[11]</xref>, <xref ref-type="bibr" rid="ref12">[12]</xref>, <xref ref-type="bibr" rid="ref14">[14]</xref>
as the
rid
value increased form 12 to 14, it should find matches only when the
rid
value is increment by +1.
Anyways, the below code does the job
What I have tried:
using System;
using System.Collections.Generic;
using System.Linq;
using System.IO;
using System.Text;
using System.Xml;
using System.Text.RegularExpressions;
namespace CityRemover
{
class Program
{
public static void Main(string[] args)
{
string[] files=Directory.GetFiles(@"D:\test\Jobs\12335","*.xml");
foreach (var file in files) {
XmlDocument doc = new XmlDocument();
doc.PreserveWhitespace = true;
doc.Load(file);
XmlNodeList nodes = doc.DocumentElement.SelectNodes("//*[count(xref[@ref-type='bibr' and starts-with(@rid,'ref')])>2]");
List<string> results = new List<string>();
foreach (XmlNode x in nodes)
{
XmlNodeList xrefs = x.SelectNodes(".//xref[@ref-type='bibr' and starts-with(@rid,'ref')]");
List<StartEnd> startEndOfEachTag = new List<StartEnd>();
string temp = x.OuterXml;
foreach (XmlNode xN in xrefs){
StartEnd se = new StartEnd(temp.IndexOf(xN.OuterXml), temp.IndexOf(xN.OuterXml) + xN.OuterXml.Length);
startEndOfEachTag.Add(se);
}
for (int i=0; i<xrefs.Count; i++)
{
int newIterator = i;
string regCompare = Regex.Escape(xrefs[i].OuterXml);
int count = 1;
string tempRes = "";
int consecutive = Int32.Parse(xrefs[i].Attributes["rid"].Value.Substring(3));
for (int j=i+1; j<xrefs.Count; j++)
{
if(consecutive == Int32.Parse(xrefs[j].Attributes["rid"].Value.Substring(3)) - 1)
{
consecutive++;
}
else { break; }
regCompare += "([ ]|(, ))" + Regex.Escape(xrefs[j].OuterXml);
Match matchReg;
try
{
matchReg = Regex.Match(temp.Substring(startEndOfEachTag[i].start, startEndOfEachTag[j].end - startEndOfEachTag[i].start),
regCompare);
}
catch
{
i = j;
break;
}
if (matchReg.Success){
count++;
tempRes = matchReg.Value;
newIterator = j;
}
else {
i = j;
break;
}
}
i = newIterator;
if (count > 2)
{
results.Add(tempRes);
}
}
}
Console.WriteLine("Results: {0}",file.ToString());
foreach(string s in results)
{
Console.WriteLine(s+"\n");
}
}
Console.ReadKey();
}
}
class StartEnd
{
public int start=-1;
public int end = -1;
public StartEnd(int start, int end)
{
this.start = start;
this.end = end;
}
}
}
However I get dtd processing errors in some files as there are dtd declared in the file and I want to ignore it.
So I tried
XmlReaderSettings settings = new XmlReaderSettings();
settings.XmlResolver = null;
settings.DtdProcessing = DtdProcessing.Ignore;
FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read);
XmlReader reader = XmlTextReader.Create(fs, settings);
XmlDocument doc = new XmlDocument();
doc.Load(reader);
instead of
XmlDocument doc = new XmlDocument();
doc.PreserveWhitespace = true;
doc.Load(file);
I don't get any error but the matched expressions are also not displayed. I'm not that familiar with Filestream though. Can anyone tell where am I doing it wrong?