Click here to Skip to main content
15,894,540 members
Articles / Web Development / ASP.NET

A Web Spider Library in C#

Rate me:
Please Sign up or sign in to vote.
4.28/5 (40 votes)
18 Sep 2007CPOL2 min read 265.3K   15.4K   170  
An article about a spider library to grab websites and store them locally
/*
* 
* An XmlReader implementation for loading SGML (including HTML) converting it
* to well formed XML, by adding missing quotes, empty attribute values, ignoring
* duplicate attributes, case folding on tag names, adding missing closing tags
* based on SGML DTD information, and so on.
*
* Copyright (c) 2002 Microsoft Corporation. All rights reserved.
*
* Chris Lovett
* 
*/

using System;
using System.Xml;
using System.IO;
using System.Collections;
using System.Text;
using System.Reflection;

namespace Sgml {

    /// <summary>
    /// SGML is case insensitive, so here you can choose between converting
    /// to lower case or upper case tags.  "None" means that the case is left
    /// alone, except that end tags will be folded to match the start tags.
    /// </summary>
    public enum CaseFolding {
        /// <summary>Leave names as written; only end tags are folded to match their start tags.</summary>
        None,
        /// <summary>Convert names to upper case.</summary>
        ToUpper,
        /// <summary>Convert names to lower case.</summary>
        ToLower
    }

    /// <summary>
    /// This stack maintains a high water mark for allocated objects so the client
    /// can reuse the objects in the stack to reduce memory allocations, this is
    /// used to maintain current state of the parser for element stack, and attributes
    /// in each element.
    /// </summary>
    internal class HWStack {
        // Backing store. Slots at or above 'count' may still hold objects that
        // were used earlier; Push hands them back so callers can recycle them.
        object[] items;
        // Allocated length of 'items'.
        int size;
        // Number of slots currently in use.
        int count;
        // Number of slots added whenever the backing store has to grow.
        int growth;

        /// <summary>
        /// Creates an empty stack; storage is allocated on the first Push.
        /// </summary>
        /// <param name="growth">How many slots to add each time the stack is full.</param>
        public HWStack(int growth) {
            this.growth = growth;
        }

        /// <summary>
        /// Number of slots in use.  Setting it (e.g. to zero) logically truncates the
        /// stack while keeping the stored objects available for reuse by Push.
        /// </summary>
        public int Count {
            get { return this.count; }
            set { this.count = value; }
        }

        /// <summary>
        /// Number of slots currently allocated.
        /// </summary>
        public int Size {
            get { return this.size; }
        }

        /// <summary>
        /// Gets the item at the requested index, or null if the index lies outside the
        /// allocated slots (note: bounded by Size, not Count).  The setter performs no
        /// bounds checking of its own.
        /// </summary>
        public object this[int i] {
            get { return (i>=0 && i < this.size) ? items[i] : null; }
            set { this.items[i] = value; }
        }

        /// <summary>
        /// Removes the top item and returns the item that is on top afterwards
        /// (not the removed one), or null when the stack is then empty.
        /// </summary>
        public object Pop(){
            // Guard against underflow: popping an empty stack used to leave Count at -1,
            // which made the following Push index the array at -1.
            if (this.count > 0) {
                this.count--;
            }
            if (this.count>0){
                return items[this.count-1];
            }
            return null;
        }

        /// <summary>
        /// Claims the next slot, growing the storage if needed, and returns whatever
        /// object is already stored there.  A null result means the slot is empty and
        /// the caller must create an object and store it at index Count-1.
        /// </summary>
        public object Push(){
            if (this.count == this.size){
                int newsize = this.size+this.growth;
                object[] newarray = new object[newsize];
                if (this.items != null)
                    Array.Copy(this.items, newarray, this.size);
                this.size = newsize;
                this.items = newarray;
            }
            return items[this.count++];
        }

        /// <summary>
        /// Removes the item at index <paramref name="i"/> and shifts the items above it down by one.
        /// </summary>
        public void RemoveAt(int i){
            this.items[i] = null;
            Array.Copy(this.items, i+1, this.items, i, this.count - i - 1);
            this.count--;
            // The shift leaves a second reference to the old top item in the slot that
            // was just vacated.  Clear it, otherwise the next Push would return an object
            // that is still live one slot below and the caller would overwrite it.
            this.items[this.count] = null;
        }
    }

    /// <summary>
    /// This class represents an attribute.  The AttDef is assigned
    /// from a validation process, and is used to provide default values.
    /// </summary>
    internal class Attribute {
        internal string Name;         // attribute name, atomized through the XmlNameTable.
        internal AttDef DtdType;      // attribute definition from the SGML DTD, or null if none is assigned.
        internal char QuoteChar;      // quote character used around the attribute value.
        internal string literalValue; // the attribute value as supplied; null when no value was given.

        /// <summary>
        /// Re-initializes this object for a new attribute.  Instances are recycled
        /// while parsing to cut down on allocations, so any DTD definition left over
        /// from the previous use is cleared as well.
        /// </summary>
        public void Reset(string name, string value, char quote) {
            this.DtdType = null;
            this.QuoteChar = quote;
            this.literalValue = value;
            this.Name = name;
        }

        /// <summary>
        /// The effective value: the literal value when one is present, otherwise the
        /// default from the DTD definition, otherwise null.  Setting it replaces the
        /// literal value.
        /// </summary>
        public string Value {
            get {
                if (this.literalValue == null) {
                    return (this.DtdType == null) ? null : this.DtdType.Default;
                }
                return this.literalValue;
            }
            set {
                this.literalValue = value;
            }
        }

        /// <summary>
        /// True when no literal value is stored, i.e. Value can only come from the DTD default.
        /// </summary>
        public bool IsDefault {
            get {
                return null == this.literalValue;
            }
        }
    }

    /// <summary>
    /// This class models an XML node, an array of elements in scope is maintained while parsing
    /// for validation purposes, and these Node objects are reused to reduce object allocation,
    /// hence the reset method.  
    /// </summary>
    internal class Node {
        internal XmlNodeType NodeType;
        internal string Value;
        internal XmlSpace Space;
        internal string XmlLang;
        internal bool IsEmpty;
        internal string Name;
        internal ElementDecl DtdType; // the DTD type found via validation
        internal State CurrentState;  // reader state saved for this node (see SgmlReader.MoveToAttribute / Read).
        internal bool Simulated; // tag was injected into result stream.
        HWStack attributes = new HWStack(10); // recycled Attribute objects belonging to this node.

        /// <summary>
        /// Node objects are reused during parsing to reduce memory allocations,
        /// hence the Reset method.  It re-initializes the name, type, value,
        /// whitespace/language info, DTD type and attribute list; CurrentState
        /// and Simulated are left untouched.
        /// </summary>
        public void Reset(string name, XmlNodeType nt, string value) {
            this.Value = value;
            this.Name = name;
            this.NodeType = nt;
            this.Space = XmlSpace.None;
            this.XmlLang= null;
            this.IsEmpty = true;
            // Only the count is reset; the Attribute objects stay in the stack for reuse.
            this.attributes.Count = 0;
            this.DtdType = null;
        }

        /// <summary>
        /// Adds an attribute unless one with the same name already exists.
        /// </summary>
        /// <param name="name">The attribute name (atomized; the case-sensitive check compares references).</param>
        /// <param name="value">The literal value, or null.</param>
        /// <param name="quotechar">The quote character used around the value.</param>
        /// <param name="caseInsensitive">Whether names differing only by case count as duplicates.</param>
        /// <returns>The new attribute, or null if it was rejected as a duplicate.</returns>
        public Attribute AddAttribute(string name, string value, char quotechar, bool caseInsensitive) {
            Attribute a;
            // check for duplicates!
            for (int i = 0, n = this.attributes.Count; i < n; i++) {
                a = (Attribute)this.attributes[i];
                // Compare with the invariant culture so that duplicate detection does not
                // depend on the current thread culture (e.g. Turkish casing of 'i'/'I').
                if (caseInsensitive && string.Compare(a.Name, name, true, System.Globalization.CultureInfo.InvariantCulture) == 0) {
                    return null;
                } else if ((object)a.Name == (object)name) {
                    return null;
                }
            }
            // This code makes use of the high water mark for attribute objects,
            // and reuses existing Attribute objects to avoid memory allocation.
            a = (Attribute)this.attributes.Push();
            if (a == null) {
                a = new Attribute();
                this.attributes[this.attributes.Count-1] = a;
            }
            a.Reset(name, value, quotechar);
            return a;
        }

        /// <summary>
        /// Removes the first attribute whose name equals <paramref name="name"/>, if any.
        /// </summary>
        public void RemoveAttribute(string name) {
            for (int i = 0, n = this.attributes.Count; i < n; i++) {
                Attribute a  = (Attribute)this.attributes[i];
                if (a.Name == name) {
                    this.attributes.RemoveAt(i);
                    return;
                }
            }
        }

        /// <summary>
        /// Copies the attributes of <paramref name="n"/> onto this node, including their
        /// DTD definitions.  Attributes that this node already has are skipped.
        /// </summary>
        public void CopyAttributes(Node n) {
            for (int i = 0, len = n.attributes.Count; i < len; i++) {
                Attribute a = (Attribute)n.attributes[i];
                Attribute na = this.AddAttribute(a.Name, a.Value, a.QuoteChar, false);
                // AddAttribute returns null for a duplicate; skip it instead of
                // failing with a NullReferenceException.
                if (na != null) {
                    na.DtdType = a.DtdType;
                }
            }
        }

        /// <summary>
        /// Number of attributes currently on this node.
        /// </summary>
        public int AttributeCount {
            get {
                return this.attributes.Count;
            }
        }

        /// <summary>
        /// Returns the index of the attribute with the given name, or -1 if there is none.
        /// </summary>
        public int GetAttribute(string name) {
            for (int i = 0, n = this.attributes.Count; i < n; i++) {
                Attribute a = (Attribute)this.attributes[i];
                if (a.Name == name) {
                    return i;
                }
            }
            return -1;
        }

        /// <summary>
        /// Returns the attribute at index <paramref name="i"/>, or null if the index is out of range.
        /// </summary>
        public Attribute GetAttribute(int i) {
            if (i>=0 && i<this.attributes.Count) {
                Attribute a = (Attribute)this.attributes[i];
                return a;
            }
            return null;
        }
    }

    // This enum is used to track the current state of the SgmlReader
    internal enum State {
        Initial,    // The initial state (Read has not been called yet)
        Markup,     // Expecting text or markup
        EndTag,     // Positioned on an end tag
        Attr,       // Positioned on an attribute
        AttrValue,  // Positioned in an attribute value
        Text,       // Positioned on a Text node.
        PartialTag, // Positioned on a text node, and we have hit a start tag
        AutoClose,  // We are auto-closing tags (this is like State.EndTag), but end tag was generated
        CData,      // We are on a CDATA type node, e.g. <script>, where we have special parsing rules.
        PartialText, // Read continues parsing text from the entity's last character (see SgmlReader.Read).
        PseudoStartTag, // we pushed a pseudo-start tag, need to continue with previous start tag.
        Eof         // End of the current input entity; Read moves to the parent entity or finishes.
    }


    /// <summary>
    /// SgmlReader is an XmlReader API over any SGML document (including built in 
    /// support for HTML).  
    /// </summary>
    public class SgmlReader : XmlReader {
        SgmlDtd dtd; // DTD in use; loaded on demand by LazyLoadDtd or assigned through the Dtd property.
        Entity current; // entity currently being read; null until OpenInput has run.
        State state; // current state of the reader state machine driven by Read.
        XmlNameTable nametable; // created in the constructor and handed to SgmlDtd.Parse.
        char partial; // character passed to ParseTag when Read resumes from State.PartialTag.
        object endTag; // compared by reference with node.Name in State.EndTag to find the element being closed.
        HWStack stack; // stack of Node objects; Init pushes the Document node first.
        Node node; // current node (except for attributes)
        // Attributes are handled separately using these members.
        Attribute a;
        int apos; // which attribute are we positioned on in the collection.
        Uri baseUri; // base for resolving relative URIs; set via SetBaseUri, Href or OpenInput.
        StringBuilder sb; // scratch buffer allocated by Init (used outside the portion of the file shown here).
        StringBuilder name; // scratch buffer allocated by Init (used outside the portion of the file shown here).
        TextWriter log; // backing field of ErrorLog.
        bool foundRoot; // set by Read once the first Element, Text or CDATA node has been returned.

        // autoclose support
        Node newnode; // node that Read pushes once auto-closing has reached poptodepth.
        int poptodepth; // stack depth at which State.AutoClose stops popping.
        int rootCount; // reset to zero by Init; other uses are not visible in this part of the file.
        bool isHtml; // set by LazyLoadDtd: true when the DTD name is "html" (ignoring case).
        string rootElementName; // DTD name after applying the CaseFolding setting (see LazyLoadDtd).

        string href; // backing field of Href.
        string errorLogFile; // backing field of ErrorLogFile.
        Entity lastError; // entity for which Log last wrote a context message.
        string proxy; // backing field of WebProxy.
        TextReader inputStream; // backing field of InputStream.
        string syslit; // backing field of SystemLiteral.
        string pubid; // backing field of PublicIdentifier.
        string subset; // backing field of InternalSubset.
        string docType; // backing field of DocType.
        WhitespaceHandling whitespaceHandling; // backing field of WhitespaceHandling.
        CaseFolding folding = CaseFolding.None; // backing field of CaseFolding.
        bool stripDocType = true;      // backing field of StripDocType.
        string startTag; // only mentioned in a commented-out line of Read within this part of the file.

        /// <summary>
        /// Creates a reader with its own name table and freshly initialized parser
        /// state.  The input still has to be supplied through Href or InputStream.
        /// </summary>
        public SgmlReader() {
            // The two steps are independent: Init does not touch the name table.
            this.nametable = new NameTable();
            Init();
        }

        /// <summary>
        /// Specify the SgmlDtd object directly.  This allows you to cache the Dtd and share
        /// it across multiple SgmlReaders.  To load a DTD from a URL use the SystemLiteral property.
        /// </summary>
        public SgmlDtd Dtd {
            get {
                // Reading the property first gives LazyLoadDtd the chance to load a DTD
                // if none has been assigned yet; the result may still be null.
                LazyLoadDtd(this.baseUri);
                SgmlDtd loaded = this.dtd;
                return loaded;
            }
            set {
                this.dtd = value;
            }
        }

        /// <summary>
        /// Loads the DTD if none is set yet.  Without a system literal the embedded
        /// "Html.dtd" resource is used, but only when DocType is "html" (ignoring case).
        /// Otherwise the system literal is resolved against the given base URI, the
        /// reader's base URI, or the current directory, in that order.  Whenever this
        /// method runs while no DTD was set and a DTD with a name is then available, the
        /// root element name (case folded per CaseFolding) and the HTML flag are derived
        /// from it.
        /// </summary>
        /// <param name="baseUri">Base URI for resolving the system literal; may be null.</param>
        private void LazyLoadDtd(Uri baseUri) {
            if (this.dtd == null) {
                if (this.syslit == null || this.syslit == "") {
                    if (this.docType != null && StringUtilities.EqualsIgnoreCase(this.docType, "html")) {
                        // The built-in HTML DTD ships as a manifest resource named
                        // "<assembly name>.Html.dtd".
                        Assembly a = typeof(SgmlReader).Assembly;
                        string name = a.FullName.Split(',')[0]+".Html.dtd";
                        Stream stm = a.GetManifestResourceStream(name);
                        if (stm != null){
                            StreamReader sr = new StreamReader(stm);
                            this.dtd = SgmlDtd.Parse(baseUri, "HTML", null, sr, null, this.proxy, this.nametable);
                        }
                    }
                } else { 
                    if (baseUri != null) {
                        baseUri = new Uri(baseUri, this.syslit);
                    } else if (this.baseUri != null) {
                        baseUri = new Uri(this.baseUri, this.syslit);
                    } else {
                        // Resolve against the current directory.  The trailing separator makes
                        // the Uri treat the path as a directory; use the platform's separator
                        // rather than a hard-coded backslash, which is an ordinary file name
                        // character on non-Windows systems and would resolve against the
                        // parent directory instead.
                        string currentDirectory = Directory.GetCurrentDirectory() + Path.DirectorySeparatorChar;
                        baseUri = new Uri(new Uri(currentDirectory), this.syslit);
                    }
                    this.dtd = SgmlDtd.Parse(baseUri, this.docType, this.pubid, baseUri.AbsoluteUri, this.subset, this.proxy, this.nametable);
                }

                if (this.dtd != null && this.dtd.Name != null){
                    switch (this.CaseFolding){
                        case CaseFolding.ToUpper:
                            this.rootElementName = this.dtd.Name.ToUpper();
                            break;
                        case CaseFolding.ToLower:
                            this.rootElementName = this.dtd.Name.ToLower();
                            break;
                        default:
                            this.rootElementName = this.dtd.Name;
                            break;
                    }
                    this.isHtml = StringUtilities.EqualsIgnoreCase(this.dtd.Name, "html");
                }

            }
        }

        /// <summary>
        /// The name of root element specified in the DOCTYPE tag.
        /// </summary>
        public string DocType {
            get {
                return this.docType;
            }
            set {
                // Only stored here; LazyLoadDtd consults it when it resolves the DTD.
                this.docType = value;
            }
        }

        /// <summary>
        /// The PUBLIC identifier in the DOCTYPE tag
        /// </summary>
        public string PublicIdentifier {
            get {
                return this.pubid;
            }
            set {
                // Passed on to SgmlDtd.Parse when the DTD is loaded from a system literal.
                this.pubid = value;
            }
        }

        /// <summary>
        /// The SYSTEM literal in the DOCTYPE tag identifying the location of the DTD.
        /// </summary>
        public string SystemLiteral {
            get {
                return this.syslit;
            }
            set {
                // Takes effect only if no DTD has been loaded or assigned yet (see LazyLoadDtd).
                this.syslit = value;
            }
        }

        /// <summary>
        /// The DTD internal subset in the DOCTYPE tag
        /// </summary>
        public string InternalSubset {
            get {
                return this.subset;
            }
            set {
                // Passed on to SgmlDtd.Parse when the DTD is loaded from a system literal.
                this.subset = value;
            }
        }

        /// <summary>
        /// The input stream containing SGML data to parse.
        /// You must specify this property or the Href property before calling Read().
        /// </summary>
        public TextReader InputStream {
            get {
                return this.inputStream;
            }
            set {
                // Assigning a new source discards all parser state.
                this.inputStream = value;
                Init();
            }
        }

        /// <summary>
        /// Sometimes you need to specify a proxy server in order to load data via HTTP
        /// from outside the firewall.  For example: "itgproxy:80".
        /// </summary>
        public string WebProxy {
            get {
                return this.proxy;
            }
            set {
                // Forwarded to the Entity and SgmlDtd.Parse calls that load the document and DTD.
                this.proxy = value;
            }
        }

        /// <summary>
        /// The base Uri is used to resolve relative Uri's like the SystemLiteral and
        /// Href properties.  This is a method because BaseURI is a read-only
        /// property on the base XmlReader class.
        /// </summary>
        public void SetBaseUri(string uri)  {
            // The Uri constructor validates the string and throws if it is not a usable URI.
            Uri parsed = new Uri(uri);
            this.baseUri = parsed;
        }

        /// <summary>
        /// Specify the location of the input SGML document as a URL.
        /// </summary>
        public string Href {
            get { return this.href; }
            set {
                this.href = value;
                // A new input location discards all parser state.
                Init();
                // Derive a base URI only when none has been set yet.  A null Href is a
                // valid value (OpenInput then falls back to InputStream), so it must not
                // be dereferenced here; it simply leaves the base URI unset.
                if (this.baseUri == null && this.href != null) {
                    if (this.href.IndexOf("://")>0) {
                        // Looks like an absolute URL: use it as the base directly.
                        this.baseUri = new Uri(this.href);
                    } else {
                        // Otherwise relative references are resolved against the current directory.
                        this.baseUri = new Uri("file:///"+Directory.GetCurrentDirectory()+"//");
                    }
                }
            }
        }

        /// <summary>
        /// Whether to strip out the DOCTYPE tag from the output (default true)
        /// </summary>
        public bool StripDocType {
            get {
                return this.stripDocType;
            }
            set {
                this.stripDocType = value;
            }
        }

        /// <summary>
        /// The case folding to apply to names.  LazyLoadDtd reads it when it derives
        /// the root element name from the DTD.
        /// </summary>
        public CaseFolding CaseFolding {
            get {
                return this.folding;
            }
            set {
                this.folding = value;
            }
        }

        /// <summary>
        /// DTD validation errors are written to this stream.
        /// </summary>
        public TextWriter ErrorLog {
            get {
                return this.log;
            }
            set {
                // A null writer switches logging off (see Log).
                this.log = value;
            }
        }

        /// <summary>
        /// DTD validation errors are written to this log file.
        /// </summary>
        public string ErrorLogFile {
            get {
                return this.errorLogFile;
            }
            set {
                this.errorLogFile = value;
                // Opens (or creates) the file and routes ErrorLog to it.  A writer that was
                // installed earlier is not closed here.
                TextWriter writer = new StreamWriter(value);
                this.ErrorLog = writer;
            }
        }

        /// <summary>
        /// Writes a formatted error message to ErrorLog, if one is set.  The first
        /// message for an entity is followed by that entity's context; further
        /// messages for the same entity carry its path, name, line and position.
        /// </summary>
        /// <param name="msg">A composite format string.</param>
        /// <param name="args">The values to insert into <paramref name="msg"/>.</param>
        void Log(string msg, params string[] args) {
            if (ErrorLog == null) {
                return; // logging is switched off
            }

            string err = String.Format(msg, args);

            if (this.lastError != this.current) {
                // First error reported for this entity: append its context and remember it.
                err = err + "    " + this.current.Context();
                this.lastError = this.current;
                ErrorLog.WriteLine("### Error:"+err);
                return;
            }

            string path = "";
            if (this.current.ResolvedUri != null) {
                path = this.current.ResolvedUri.AbsolutePath;
            }
            ErrorLog.WriteLine("### Error in "+
                path+"#"+
                this.current.Name+
                ", line "+this.current.Line + ", position " + this.current.LinePosition + ": "+
                err);
        }
        /// <summary>
        /// Convenience overload that logs a message with a single character argument.
        /// </summary>
        void Log(string msg, char ch) {
            string text = ch.ToString();
            Log(msg, text);
        }


        /// <summary>
        /// Puts the reader back into its initial state: a new element stack holding
        /// only the Document node, fresh scratch buffers, and no current entity,
        /// attribute or pending auto-close work.
        /// </summary>
        void Init() {
            // New stack with the (non-empty) Document node at the bottom.
            this.stack = new HWStack(10);
            Node document = Push(null, XmlNodeType.Document, null);
            document.IsEmpty = false;
            this.node = document;

            // Scratch buffers.
            this.sb = new StringBuilder();
            this.name = new StringBuilder();

            // Parser position.
            this.state = State.Initial;
            this.current = null;
            this.partial = '\0';
            this.endTag = null;
            this.foundRoot = false;
            this.rootCount = 0;

            // Attribute cursor.
            this.a = null;
            this.apos = 0;

            // Auto-close bookkeeping.
            this.newnode = null;
            this.poptodepth = 0;
        }

        /// <summary>
        /// Pushes a node onto the element stack, reusing a Node object left in the
        /// claimed slot when there is one, and makes it the current node.
        /// </summary>
        /// <returns>The node that is now on top of the stack.</returns>
        Node Push(string name, XmlNodeType nt, string value) {
            object recycled = this.stack.Push();
            Node result;
            if (recycled != null) {
                result = (Node)recycled;
            } else {
                // The slot was empty: allocate a node and store it there.
                result = new Node();
                this.stack[this.stack.Count-1] = result;
            }
            result.Reset(name, nt, value);
            this.node = result;
            return result;
        }

        /// <summary>
        /// Exchanges the two topmost entries of the element stack.  Does nothing
        /// when the stack holds fewer than two entries.
        /// </summary>
        void SwapTopNodes() {
            int top = this.stack.Count-1;
            if (top <= 0) {
                return;
            }
            int below = top - 1;
            Node upper = (Node)this.stack[top];
            this.stack[top] = this.stack[below];
            this.stack[below] = upper;
        }

        /// <summary>
        /// Pushes a copy of <paramref name="n"/> onto the element stack and makes it the
        /// current node.  A copy is required because Node objects in the stack are
        /// recycled.  The Simulated flag is not copied.
        /// </summary>
        /// <returns>The copy that is now on top of the stack.</returns>
        Node Push(Node n) {
            Node copy = Push(n.Name, n.NodeType, n.Value);
            copy.CurrentState = n.CurrentState;
            copy.XmlLang = n.XmlLang;
            copy.Space = n.Space;
            copy.IsEmpty = n.IsEmpty;
            copy.DtdType = n.DtdType;
            copy.CopyAttributes(n);
            this.node = copy;
            return copy;
        }

        /// <summary>
        /// Removes the top node from the element stack and makes the node below it
        /// current.  The bottom entry (pushed by Init) is never removed.
        /// </summary>
        void Pop() {
            if (this.stack.Count <= 1) {
                return;
            }
            this.node = (Node)this.stack.Pop();
        }

        /// <summary>
        /// Returns the node on top of the element stack, or null when only the bottom
        /// entry (or nothing) is on the stack.
        /// </summary>
        Node Top() {
            int top = this.stack.Count - 1;
            return (top > 0) ? (Node)this.stack[top] : null;
        }

        /// <summary>
        /// The type of the item the reader is positioned on.  Attribute, attribute
        /// value and end-tag positions are derived from the reader state; otherwise
        /// the current node's own type is reported.
        /// </summary>
        public override XmlNodeType NodeType {
            get {
                switch (this.state) {
                    case State.Attr:
                        return XmlNodeType.Attribute;
                    case State.AttrValue:
                        return XmlNodeType.Text;
                    case State.EndTag:
                    case State.AutoClose:
                        return XmlNodeType.EndElement;
                    default:
                        return this.node.NodeType;
                }
            }
        }

        /// <summary>
        /// Same as LocalName: no prefix is ever reported.
        /// </summary>
        public override string Name {
            get {
                string local = this.LocalName;
                return local;
            }
        }

        /// <summary>
        /// The name of the current attribute when positioned on one, null when
        /// positioned in an attribute value, otherwise the current node's name.
        /// </summary>
        public override string LocalName { 
            get {
                switch (this.state) {
                    case State.Attr:
                        return this.a.Name;
                    case State.AttrValue:
                        return null;
                    default:
                        return this.node.Name;
                }
            }
        }

        /// <summary>
        /// Empty, except on an attribute named "xmlns" (ignoring case), which
        /// reports the XML namespace-declaration namespace.
        /// </summary>
        public override string NamespaceURI { 
            get {
                // SGML itself has no namespaces; only an xmlns attribute gets one.
                bool onXmlns = this.state == State.Attr
                    && StringUtilities.EqualsIgnoreCase(this.a.Name, "xmlns");
                return onXmlns ? "http://www.w3.org/2000/xmlns/" : String.Empty;
            }
        }

        /// <summary>
        /// Always empty: SGML has no namespaces, hence no prefixes.
        /// </summary>
        public override string Prefix { 
            get { return String.Empty; }
        }

        /// <summary>
        /// True on attributes and attribute values; elsewhere true only when the
        /// current node carries a value.
        /// </summary>
        public override bool HasValue { 
            get {
                bool onAttribute = this.state == State.Attr || this.state == State.AttrValue;
                return onAttribute || this.node.Value != null;
            }
        }

        /// <summary>
        /// The value of the current attribute when positioned on an attribute or
        /// attribute value, otherwise the value of the current node.
        /// </summary>
        public override string Value { 
            get {
                switch (this.state) {
                    case State.Attr:
                    case State.AttrValue:
                        return this.a.Value;
                    default:
                        return this.node.Value;
                }
            }
        }

        /// <summary>
        /// Nesting depth derived from the element stack: one less than the stack
        /// count for nodes, the stack count for attributes, and one more than that
        /// for attribute values.
        /// </summary>
        public override int Depth { 
            get {
                int entries = this.stack.Count;
                switch (this.state) {
                    case State.Attr:
                        return entries;
                    case State.AttrValue:
                        return entries + 1;
                    default:
                        return entries - 1;
                }
            }
        }

        /// <summary>
        /// The absolute form of the base URI, or an empty string when none is set.
        /// </summary>
        public override string BaseURI { 
            get {
                if (this.baseUri != null) {
                    return this.baseUri.AbsoluteUri;
                }
                return "";
            }
        }

        /// <summary>
        /// Reports the current node's IsEmpty flag while the reader is in the Markup,
        /// Attr or AttrValue state; false in every other state.
        /// </summary>
        public override bool IsEmptyElement { 
            get {
                switch (this.state) {
                    case State.Markup:
                    case State.Attr:
                    case State.AttrValue:
                        return this.node.IsEmpty;
                    default:
                        return false;
                }
            }
        }
        /// <summary>
        /// True when positioned on an attribute (or its value) that has no literal value.
        /// </summary>
        public override bool IsDefault { 
            get {
                bool onAttribute = this.state == State.Attr || this.state == State.AttrValue;
                return onAttribute && this.a.IsDefault;
            }
        }
        /// <summary>
        /// The quote character of the current attribute, or '\0' when the reader is
        /// not positioned on an attribute.
        /// </summary>
        public override char QuoteChar { 
            get {
                return (this.a == null) ? '\0' : this.a.QuoteChar;
            }
        }

        /// <summary>
        /// The innermost whitespace setting found on the element stack, searching from
        /// the top downwards, or XmlSpace.None if no node specifies one.
        /// </summary>
        public override XmlSpace XmlSpace { 
            get {
                // NOTE(review): the search stops before index 1, so the entries at
                // indexes 0 and 1 are never consulted - confirm this is intended.
                int i = this.stack.Count-1;
                while (i > 1) {
                    XmlSpace declared = ((Node)this.stack[i]).Space;
                    if (declared != XmlSpace.None) {
                        return declared;
                    }
                    i--;
                }
                return XmlSpace.None;
            }
        }

        /// <summary>
        /// The innermost language setting found on the element stack, searching from
        /// the top downwards, or an empty string if no node specifies one.
        /// </summary>
        public override string XmlLang { 
            get {
                // NOTE(review): the search stops before index 1, so the entries at
                // indexes 0 and 1 are never consulted - confirm this is intended.
                int i = this.stack.Count-1;
                while (i > 1) {
                    string declared = ((Node)this.stack[i]).XmlLang;
                    if (declared != null) {
                        return declared;
                    }
                    i--;
                }
                return String.Empty;
            }
        }

        /// <summary>
        /// Controls whitespace reporting.  With WhitespaceHandling.None, Read skips
        /// nodes of type Whitespace.
        /// </summary>
        public WhitespaceHandling WhitespaceHandling {
            get { return this.whitespaceHandling; }
            set { this.whitespaceHandling = value; }
        }

        /// <summary>
        /// Number of attributes on the current node.  Zero while positioned on an
        /// attribute or attribute value, and for nodes that are neither an Element
        /// nor a DocumentType.
        /// </summary>
        public override int AttributeCount { 
            get {
                if (this.state == State.Attr || this.state == State.AttrValue) {
                    return 0;
                }
                switch (this.node.NodeType) {
                    case XmlNodeType.Element:
                    case XmlNodeType.DocumentType:
                        return this.node.AttributeCount;
                    default:
                        return 0;
                }
            }
        }

        /// <summary>
        /// Returns the value of the named attribute of the current node, or null if
        /// there is no such attribute or the reader is positioned on an attribute.
        /// </summary>
        public override string GetAttribute(string name) {
            if (this.state == State.Attr || this.state == State.AttrValue) {
                return null;
            }
            int index = this.node.GetAttribute(name);
            return (index >= 0) ? GetAttribute(index) : null;
        }

        /// <summary>
        /// Same as GetAttribute(name); the namespace is ignored because SGML has no namespaces.
        /// </summary>
        public override string GetAttribute(string name, string namespaceURI) {
            string value = GetAttribute(name);
            return value;
        }

        /// <summary>
        /// Returns the value of the attribute at index <paramref name="i"/> of the current node.
        /// </summary>
        /// <exception cref="IndexOutOfRangeException">
        /// The index is out of range, or the reader is positioned on an attribute or attribute value.
        /// </exception>
        public override string GetAttribute(int i) {
            bool onAttribute = this.state == State.Attr || this.state == State.AttrValue;
            Attribute found = onAttribute ? null : this.node.GetAttribute(i);
            if (found == null) {
                throw new IndexOutOfRangeException();
            }
            return found.Value;
        }

        /// <summary>
        /// Indexer form of GetAttribute(int).
        /// </summary>
        public override string this [ int i ] { 
            get {
                string value = GetAttribute(i);
                return value;
            }
        }

        /// <summary>
        /// Indexer form of GetAttribute(string).
        /// </summary>
        public override string this [ string name ] { 
            get {
                string value = GetAttribute(name);
                return value;
            }
        }

        /// <summary>
        /// Indexer form of GetAttribute(string, string).
        /// </summary>
        public override string this [ string name,string namespaceURI ] { 
            get {
                string value = GetAttribute(name, namespaceURI);
                return value;
            }
        }

        /// <summary>
        /// Positions the reader on the named attribute of the current node.
        /// </summary>
        /// <returns>True if the attribute exists; otherwise false and the position is unchanged.</returns>
        public override bool MoveToAttribute(string name) {
            int index = this.node.GetAttribute(name);
            if (index < 0) {
                return false;
            }
            MoveToAttribute(index);
            return true;
        }

        /// <summary>
        /// Same as MoveToAttribute(name); the namespace argument is ignored.
        /// </summary>
        public override bool MoveToAttribute(string name, string ns) {
            bool moved = MoveToAttribute(name);
            return moved;
        }

        /// <summary>
        /// Positions the reader on the attribute at index <paramref name="i"/> of the current node.
        /// </summary>
        /// <exception cref="IndexOutOfRangeException">The index is out of range.</exception>
        public override void MoveToAttribute(int i) {
            Attribute target = this.node.GetAttribute(i);
            if (target == null) {
                throw new IndexOutOfRangeException();
            }
            if (this.state != State.Attr) {
                // Remember where we were so that MoveToElement can restore it.
                this.node.CurrentState = this.state;
            }
            this.state = State.Attr;
            this.a = target;
            this.apos = i;
        }

        /// <summary>
        /// Positions the reader on the first attribute of the current node.
        /// </summary>
        /// <returns>False if the node has no attributes.</returns>
        public override bool MoveToFirstAttribute() {
            if (this.node.AttributeCount <= 0) {
                return false;
            }
            MoveToAttribute(0);
            return true;
        }

        /// <summary>
        /// Advances to the next attribute, or to the first one when the reader is not
        /// yet positioned on an attribute.
        /// </summary>
        /// <returns>False if there is no further attribute.</returns>
        public override bool MoveToNextAttribute() {
            bool onAttribute = this.state == State.Attr || this.state == State.AttrValue;
            if (!onAttribute) {
                return MoveToFirstAttribute();
            }
            int next = this.apos + 1;
            if (next >= this.node.AttributeCount) {
                return false;
            }
            MoveToAttribute(next);
            return true;
        }

        /// <summary>
        /// Leaves attribute positioning and restores the state saved on the node.
        /// </summary>
        /// <returns>
        /// True if the reader was on an attribute or attribute value; otherwise
        /// whether the current node is an Element.
        /// </returns>
        public override bool MoveToElement() {
            switch (this.state) {
                case State.Attr:
                case State.AttrValue:
                    this.state = this.node.CurrentState;
                    this.a = null;
                    return true;
                default:
                    return this.node.NodeType == XmlNodeType.Element;
            }
        }

        /// <summary>
        /// Whether the loaded DTD is the HTML one (flag maintained by LazyLoadDtd).
        /// </summary>
        bool IsHtml {
            get { return this.isHtml; }
        }

        /// <summary>
        /// Returns the encoding reported by the current input entity, opening the
        /// input first if that has not happened yet.
        /// </summary>
        public Encoding GetEncoding(){
            bool inputOpened = this.current != null;
            if (!inputOpened) {
                OpenInput();
            }
            return this.current.GetEncoding();
        }

        /// <summary>
        /// Creates and opens the "#document" entity from Href or, failing that, from
        /// InputStream.  The base URI is updated to the entity's resolved URI when it
        /// has one, and if the entity turns out to be HTML while no DTD is loaded, the
        /// doc type is forced to "HTML" and the DTD load is retried.
        /// </summary>
        /// <exception cref="InvalidOperationException">Neither Href nor InputStream is set.</exception>
        void OpenInput(){
            LazyLoadDtd(this.baseUri);

            // Href takes precedence over InputStream.
            if (this.Href == null && this.inputStream == null) {
                throw new InvalidOperationException("You must specify input either via Href or InputStream properties");
            }
            if (this.Href != null) {
                this.current = new Entity("#document", null, this.href, this.proxy);
            } else {
                this.current = new Entity("#document", null, this.inputStream, this.proxy);
            }

            this.current.Html = this.IsHtml;
            this.current.Open(null, this.baseUri);
            if (this.current.ResolvedUri != null) {
                this.baseUri = this.current.ResolvedUri;
            }

            bool needHtmlDtd = this.current.Html && this.dtd == null;
            if (needHtmlDtd) {
                this.docType = "HTML";
                LazyLoadDtd(this.baseUri);
            }
        }

        /// <summary>
        /// Advances the reader to the next node.  The input is opened on first use.
        /// The method runs a state machine until a node to report has been found,
        /// skipping whitespace nodes when WhitespaceHandling is None and generating
        /// end elements for anything still open when the input ends.  When the DTD is
        /// HTML and the first element/text/CDATA node is not an "html" element, a
        /// simulated "html" root element is reported first.
        /// </summary>
        /// <returns>True if a node was read; false at the end of the input.</returns>
        public override bool Read() {
            if (current == null) {
                OpenInput();
            }
            State start = this.state; // not used further in this method
            if (node.Simulated) {
                // The previous call reported an injected node; now report the real
                // node on top of the stack and restore the state saved on it.
                // return the next node
                node.Simulated = false;
                this.node = Top();
                this.state = this.node.CurrentState;
                return true;
            }

            bool foundnode = false;
            while (! foundnode) {
                switch (this.state) {
                    case State.Initial:
                        // Prime the entity with its first character, then parse markup.
                        this.state = State.Markup;
                        this.current.ReadChar();
                        goto case State.Markup;
                    case State.Eof:
                        // End of the current entity: continue with the parent entity
                        // if there is one, otherwise the document is finished.
                        if (this.current.Parent != null) {
                            this.current.Close();
                            this.current = this.current.Parent;
                        } else {                           
                            return false;
                        }
                        break;
                    case State.EndTag:
                        // endTag is compared by reference with the current node's name.
                        if (this.endTag == (object)this.node.Name) {
                            Pop(); // we're done!
                            this.state = State.Markup;
                            goto case State.Markup;
                        }                     
                        Pop(); // close one element
                        foundnode = true;// return another end element.
                        break;
                    case State.Markup:
                        // A node flagged IsEmpty is popped before parsing continues.
                        if (this.node.IsEmpty) {
                            Pop();
                        }
                        Node n = this.node; // not used further in this method
                        foundnode = ParseMarkup();
                        break;
                    case State.PartialTag:
                        Pop(); // remove text node.
                        this.state = State.Markup;
                        // Continue with the tag whose first character was saved in 'partial'.
                        foundnode = ParseTag(this.partial);
                        break;
                    case State.PseudoStartTag:
                        foundnode = ParseStartTag('<');                        
                        break;
                    case State.AutoClose:
                        Pop(); // close next node.
                        if (this.stack.Count <= this.poptodepth) {
                            // Target depth reached: resume normal parsing, start the
                            // pending node if there is one, or finish at the Document node.
                            this.state = State.Markup;
                            if (this.newnode != null) {
                                Push(this.newnode); // now we're ready to start the new node.
                                this.newnode = null;
                                this.state = State.Markup;
                            } else if (this.node.NodeType == XmlNodeType.Document) {
                                this.state = State.Eof;
                                goto case State.Eof;
                            }
                        } 
                        foundnode = true;
                        break;
                    case State.CData:
                        foundnode = ParseCData();
                        break;
                    case State.Attr:
                        goto case State.AttrValue;
                    case State.AttrValue:
                        // Leaving attribute positioning: carry on as in the Markup state.
                        this.state = State.Markup;
                        goto case State.Markup;
                    case State.Text:
                        // Remove the text node that was reported, then parse on.
                        Pop();
                        goto case State.Markup;
                    case State.PartialText:
                        // NOTE(review): a true result from ParseText is treated as
                        // "whitespace only" here - confirm against ParseText.
                        if (ParseText(this.current.Lastchar, false)) {
                            this.node.NodeType = XmlNodeType.Whitespace;
                        }
                        foundnode = true;
                        break;
                }
                if (foundnode && this.node.NodeType == XmlNodeType.Whitespace && this.whitespaceHandling == WhitespaceHandling.None) {
                    // strip out whitespace (caller is probably pretty printing the XML).
                    foundnode = false;
                }
                if (!foundnode && this.state == State.Eof && this.stack.Count>1) {
                    // Input ended with nodes still on the stack: report end elements
                    // via AutoClose until only the bottom stack entry is left.
                    this.poptodepth = 1;
                    state = State.AutoClose;
                    this.node = Top();
                    return true;
                }
            }
            if (!foundRoot && (this.NodeType == XmlNodeType.Element ||
                    this.NodeType == XmlNodeType.Text ||
                    this.NodeType == XmlNodeType.CDATA)) {
                // First element, text or CDATA node of the document.
                foundRoot = true;
                if (this.IsHtml && (this.NodeType != XmlNodeType.Element ||
                    string.Compare(this.LocalName, "html", true, System.Globalization.CultureInfo.InvariantCulture) != 0)) {
                    // Simulate an HTML root element!
                    // The real node keeps its state and is reported by the next Read call.
                    this.node.CurrentState = this.state;
                    Node root = Push("html", XmlNodeType.Element, null);
                    SwapTopNodes(); // make html the outer element.
                    this.node = root;
                    root.Simulated = true;
                    root.IsEmpty = false;
                    this.state = State.Markup;
                    //this.state = State.PseudoStartTag;
                    //this.startTag = name;
                }
                return true;
            }
            return true;
        }

        /// <summary>
        /// Dispatches on the last character read: a tag opener is handed to
        /// ParseTag, end of input switches the reader to State.Eof, and anything
        /// else is consumed as text (or deferred to the CDATA parser when the
        /// current element's DTD content model is declared CDATA).
        /// </summary>
        /// <returns>true when a node was produced; false otherwise.</returns>
        bool ParseMarkup() {
            char last = this.current.Lastchar;
            if (last == '<') {
                return ParseTag(this.current.ReadChar());
            }
            if (last == Entity.EOF) {
                this.state = State.Eof;
                return false;
            }
            ElementDecl decl = this.node.DtdType;
            if (decl != null && decl.ContentModel.DeclaredContent == DeclaredContent.CDATA) {
                // e.g. SCRIPT or STYLE tags which contain unparsed character data;
                // the next pass through the state machine runs ParseCData.
                this.partial = '\0';
                this.state = State.CData;
                return false;
            }
            bool whitespaceOnly = ParseText(last, true);
            if (whitespaceOnly) {
                this.node.NodeType = XmlNodeType.Whitespace;
            }
            return true;
        }

        static string declterm = " \t\r\n><";

        /// <summary>
        /// Parses whatever follows a tag opener. <paramref name="ch"/> is the
        /// character immediately after the opening angle bracket and selects the
        /// construct: '%' (ASP.NET block), '!' (comment, CDATA section or
        /// DOCTYPE), '?' (processing instruction), '/' (end tag) or anything else
        /// (start tag).
        /// </summary>
        /// <param name="ch">The character following the tag opener.</param>
        /// <returns>true when a node was produced; false when the markup was skipped.</returns>
        bool ParseTag(char ch) {
            if (ch == '%') {
                return ParseAspNet();
            }
            if (ch == '!') {
                ch = this.current.ReadChar();
                if (ch == '-') {
                    return ParseComment();
                } else if (ch == '[') {
                    return ParseConditionalBlock();
                } else if (ch != '_' && !Char.IsLetter(ch)) {
                    // perhaps it's one of those nasty office document hacks like '<![if ! ie ]>'
                    string value = this.current.ScanToEnd(this.sb, "Recovering", ">"); // skip it
                    // Pass the scanned text as a format ARGUMENT: it comes from the
                    // document, so braces in it must not be interpreted as format items.
                    Log("Ignoring invalid markup '<!{0}>'", value);
                    return false;
                }
                else {
                    string name = this.current.ScanToken(this.sb, SgmlReader.declterm, false);
                    if (name == "DOCTYPE") {
                        ParseDocType();
                        // In SGML DOCTYPE SYSTEM attribute is optional, but in XML it is required,
                        // therefore if there is no SYSTEM literal then add an empty one.
                        if (this.GetAttribute("SYSTEM") == null && this.GetAttribute("PUBLIC") != null) {
                            this.node.AddAttribute("SYSTEM", "", '"', this.folding == CaseFolding.None);
                        }
                        if (stripDocType) {
                            return false;
                        } else {
                            this.node.NodeType = XmlNodeType.DocumentType;
                            return true;
                        }
                    }
                    else {
                        Log("Invalid declaration '<!{0}...'.  Expecting '<!DOCTYPE' only.", name);
                        this.current.ScanToEnd(null, "Recovering", ">"); // skip it
                        return false;
                    }
                }
            }
            else if (ch == '?') {
                this.current.ReadChar();// consume the '?' character.
                return ParsePI();
            }
            else if (ch == '/') {
                return ParseEndTag();
            }
            else {
                return ParseStartTag(ch);
            }
        }

        /// <summary>
        /// Scans a name token from the current input, applies the configured
        /// case folding and returns the atomized string from the name table.
        /// </summary>
        /// <param name="terminators">Characters that end the token.</param>
        /// <returns>The folded, atomized name.</returns>
        string ScanName(string terminators) {
            string name = this.current.ScanToken(this.sb, terminators, false);
            // Fold with the invariant culture: markup names are identifiers, not
            // linguistic text, so the result must not depend on the thread's
            // current culture (e.g. the Turkish dotted/dotless 'i' mappings).
            switch (this.folding){
                case CaseFolding.ToUpper:
                    name = name.ToUpper(System.Globalization.CultureInfo.InvariantCulture);
                    break;
                case CaseFolding.ToLower:
                    name = name.ToLower(System.Globalization.CultureInfo.InvariantCulture);
                    break;
            }
            return this.nametable.Add(name);
        }

        // Terminator sets handed to the token scanner:
        //   tagterm - ends an element name
        //   aterm   - ends an attribute name
        //   avterm  - ends an unquoted attribute value
        static string tagterm = " \t\r\n=/><";
        static string aterm = " \t\r\n='\"/>";
        static string avterm = " \t\r\n>";

        /// <summary>
        /// Parses a start tag and its attributes and pushes the resulting
        /// element node. When the reader is in State.PseudoStartTag the element
        /// name is taken from this.startTag instead of being scanned from input.
        /// </summary>
        /// <param name="ch">The character following the tag opener.</param>
        /// <returns>
        /// true when an element node was produced; false when the opener turned
        /// out to be plain text, the tag was malformed and skipped, or a second
        /// root-level element was encountered.
        /// </returns>
        bool ParseStartTag(char ch) {
            string name = null;
            if (state != State.PseudoStartTag){
                if (SgmlReader.tagterm.IndexOf(ch)>=0) {
                    // The opener is followed by a terminator rather than a name, so
                    // it is not a tag: seed the buffer with '<' and let the
                    // PartialText state continue it as text.
                    this.sb.Length = 0;
                    this.sb.Append('<');
                    this.state = State.PartialText;
                    return false;
                }
                name = ScanName(SgmlReader.tagterm);                
            } else {
                name = this.startTag;
                state = State.Markup;
            }
            Node n = Push(name, XmlNodeType.Element, null);
            n.IsEmpty = false;
            Validate(n); // attaches DTD info; may flip IsEmpty for EMPTY content models
            ch = this.current.SkipWhitespace();
            // Attribute loop: runs until the closing '>' (or EOF).
            while (ch != Entity.EOF && ch != '>') {
                if (ch == '/') {
                    n.IsEmpty = true;
                    ch = this.current.ReadChar();
                    if (ch != '>') {
                        Log("Expected empty start tag '/>' sequence instead of '{0}'", ch);
                        this.current.ScanToEnd(null, "Recovering", ">");
                        return false;
                    }
                    break;
                } 
                else if (ch == '<') {
                    // A new tag opener before this one was closed: stop here and
                    // leave the '<' for the next parse step.
                    Log("Start tag '{0}' is missing '>'", name);
                    break;
                }
                string aname = ScanName(SgmlReader.aterm);
                ch = this.current.SkipWhitespace();
                if (aname == "," || aname == "=" || aname == ":" || aname == ";") {
                    // stray punctuation between attributes - ignore it
                    continue;
                }
                string value = null; // stays null for attributes without a value
                char quote = '\0';   // stays '\0' for unquoted values
                if (ch == '=' || ch == '"' || ch == '\'') {
                    if (ch == '=' ){
                        this.current.ReadChar();
                        ch = this.current.SkipWhitespace();
                    }
                    if (ch == '\'' || ch == '\"') {
                        quote = ch;
                        value = ScanLiteral(this.sb, ch);
                    } 
                    else if (ch != '>') {
                        string term = SgmlReader.avterm;
                        value = this.current.ScanToken(this.sb, term, false);
                    }
                } 
                if (aname.Length > 0) {
                    // AddAttribute returns null when the name is already present.
                    Attribute a = n.AddAttribute(aname, value, quote, this.folding == CaseFolding.None);
                    if (a == null) {
                        Log("Duplicate attribute '{0}' ignored", aname);
                    } else {
                        ValidateAttribute(n, a);
                    }
                }
                ch = this.current.SkipWhitespace();
            }
            if (ch == Entity.EOF) {
                this.current.Error("Unexpected EOF parsing start tag '{0}'", name);
            } 
            else if (ch == '>') {
                this.current.ReadChar(); // consume '>'
            }
            if (this.Depth == 1) {
                if (this.rootCount == 1) {
                    // Hmmm, we found another root level tag, soooo, the only
                    // thing we can do to keep this a valid XML document is stop
                    this.state = State.Eof;
                    return false;
                }
                this.rootCount++;
            }
            // May switch to State.AutoClose if the DTD forbids this element here.
            ValidateContent(n);
            return true;
        }

        /// <summary>
        /// Parses an end tag (input positioned on the '/') and looks for a
        /// matching open element on the stack. The name found is stored in
        /// this.endTag and this.node is set to the top of the stack.
        /// </summary>
        /// <returns>
        /// true when a matching start tag is on the stack (state is left as
        /// State.EndTag); false when there is none (state is reset to
        /// State.Markup and the end tag is ignored).
        /// </returns>
        bool ParseEndTag() {
            this.state = State.EndTag;
            this.current.ReadChar(); // consume '/' char.
            string name = this.ScanName(SgmlReader.tagterm);
            char ch = this.current.SkipWhitespace();
            if (ch != '>') {
                // This is an END tag; the old message wrongly talked about '/>'.
                Log("Expected end tag terminator '>' instead of '{0}'", ch);
                this.current.ScanToEnd(null, "Recovering", ">");
            }
            this.current.ReadChar(); // consume '>'

            this.endTag = name;
            // Make sure there's a matching start tag for it.
            // With CaseFolding.None names keep their original case, so the match
            // has to be case-insensitive; otherwise names are atomized and already
            // folded, so a reference comparison is enough.
            bool caseInsensitive = (this.folding == CaseFolding.None);
            this.node = (Node)this.stack[this.stack.Count-1];
            for (int i = this.stack.Count-1; i>0; i--) {
                Node n = (Node)this.stack[i];
                if (caseInsensitive && string.Compare(n.Name, name, true, System.Globalization.CultureInfo.InvariantCulture) == 0) {
                    // Fold the end tag to the spelling used by the start tag.
                    this.endTag = n.Name;
                    return true;
                } else if ((object)n.Name == (object)name) {
                    return true;
                }
            }
            Log("No matching start tag for '</{0}>'", name);
            this.state = State.Markup;
            return false;
        }

        /// <summary>
        /// Reads an ASP.NET block up to its closing "%&gt;" and pushes it,
        /// delimiters included, as a CDATA node.
        /// </summary>
        /// <returns>Always true.</returns>
        bool ParseAspNet() {
            string body = this.current.ScanToEnd(this.sb, "AspNet", "%>");
            string block = "<%" + body + "%>";
            Push(null, XmlNodeType.CDATA, block);
            return true;
        }

        /// <summary>
        /// Parses a comment (input positioned after the first '-' of the
        /// opening sequence) and pushes it as a Comment node. The text is
        /// sanitized so it is a legal XML comment: every run of two or more
        /// dashes is collapsed to a single dash and a trailing dash is padded
        /// with a space.
        /// </summary>
        /// <returns>true when a comment node was pushed; false when the markup was not a comment.</returns>
        bool ParseComment() {
            char ch = this.current.ReadChar();
            if (ch != '-') {
                Log("Expecting comment '<!--' but found {0}", ch);
                this.current.ScanToEnd(null, "Comment", ">");
                return false;
            }
            string value = this.current.ScanToEnd(this.sb, "Comment", "-->");

            // Make sure it's a valid comment: "--" may not occur inside one.
            int i = value.IndexOf("--");
            while (i >= 0) {
                // [i, j) is the whole run of dashes starting at i.
                int j = i + 2;
                while (j < value.Length && value[j] == '-') {
                    j++;
                }
                // Keep everything before the run (the previous code used i-1 here
                // and silently dropped the character preceding the dashes).
                value = value.Substring(0, i) + "-" + value.Substring(j);
                i = value.IndexOf("--");
            }
            if (value.Length>0 && value[value.Length-1] == '-') {
                value += " "; // '-' cannot be last character
            }
            Push(null, XmlNodeType.Comment, value);
            return true;
        }

        static string cdataterm = "\t\r\n[<>";

        /// <summary>
        /// Parses a marked section (input positioned on the '[' after "&lt;!").
        /// Only CDATA sections are accepted; they are pushed as a CDATA node.
        /// Anything else is logged and skipped up to the next '&gt;'.
        /// </summary>
        /// <returns>true when a CDATA node was pushed; false when the block was skipped.</returns>
        bool ParseConditionalBlock(){
            current.ReadChar(); // skip '['
            current.SkipWhitespace();
            string keyword = current.ScanToken(sb, cdataterm, false);

            bool accepted = (keyword == "CDATA");
            if (!accepted) {
                Log("Expecting CDATA but found '{0}'", keyword);
            } else {
                char opener = current.SkipWhitespace();
                if (opener != '[') {
                    Log("Expecting '[' but found '{0}'", opener);
                    accepted = false;
                }
            }
            if (!accepted) {
                current.ScanToEnd(null, "CDATA", ">");
                return false;
            }

            string content = current.ScanToEnd(sb, "CDATA", "]]>");
            Push(null, XmlNodeType.CDATA, content);
            return true;
        }

        static string dtterm = " \t\r\n>";

        /// <summary>
        /// Parses the remainder of a DOCTYPE declaration: the document type
        /// name, optional PUBLIC and SYSTEM literals and an optional internal
        /// subset. Pushes a DocumentType node carrying PUBLIC/SYSTEM attributes
        /// and, when no DTD is loaded yet, records the identifiers and calls
        /// LazyLoadDtd.
        /// </summary>
        void ParseDocType() {
            this.current.SkipWhitespace();
            string name = this.ScanName(SgmlReader.dtterm);
            Push(name, XmlNodeType.DocumentType, null);
            char ch = this.current.SkipWhitespace();
            if (ch != '>') {
                string subset = "";
                string pubid = "";
                string syslit = "";

                if (ch != '[') {
                    string token = this.current.ScanToken(this.sb, SgmlReader.dtterm, false);
                    // SGML keywords are case insensitive, so accept "public"/"system"
                    // in any case; the attribute is always stored under the
                    // upper-case name, which is what ParseTag looks up.
                    if (string.Compare(token, "PUBLIC", true, System.Globalization.CultureInfo.InvariantCulture) == 0) {
                        ch = this.current.SkipWhitespace();
                        if (ch == '\"' || ch == '\'') {
                            pubid = this.current.ScanLiteral(this.sb, ch);
                            this.node.AddAttribute(this.nametable.Add("PUBLIC"), pubid, ch, this.folding == CaseFolding.None);
                        }
                    }
                    else if (string.Compare(token, "SYSTEM", true, System.Globalization.CultureInfo.InvariantCulture) != 0) {
                        Log("Unexpected token in DOCTYPE '{0}'", token);
                        this.current.ScanToEnd(null, "DOCTYPE", ">");
                    }
                    // A system literal may follow either keyword.
                    ch = this.current.SkipWhitespace();
                    if (ch == '\"' || ch == '\'') {
                        token = this.nametable.Add("SYSTEM");
                        syslit = this.current.ScanLiteral(this.sb, ch);
                        this.node.AddAttribute(token, syslit, ch, this.folding == CaseFolding.None);
                    }
                    ch = this.current.SkipWhitespace();
                }
                if (ch == '[') {
                    // Internal subset becomes the node's value.
                    subset = this.current.ScanToEnd(this.sb, "Internal Subset", "]");
                    this.node.Value = subset;
                }
                ch = this.current.SkipWhitespace();
                if (ch != '>') {
                    Log("Expecting end of DOCTYPE tag, but found '{0}'", ch);
                    this.current.ScanToEnd(null, "DOCTYPE", ">");
                }

                if (this.dtd == null) {
                    this.docType = name;
                    this.pubid = pubid;
                    this.syslit = syslit;
                    this.subset = subset;
                    LazyLoadDtd(this.current.ResolvedUri);
                }
            }
            this.current.ReadChar();
        }

        static string piterm = " \t\r\n?";

        /// <summary>
        /// Parses a processing instruction (input positioned after the '?')
        /// and pushes it as a ProcessingInstruction node. XML declarations
        /// (target "xml") are dropped because they are generated on output.
        /// </summary>
        /// <returns>true when a node was pushed; false for a skipped "xml" declaration.</returns>
        bool ParsePI() {
            string target = this.current.ScanToken(this.sb, SgmlReader.piterm, false);
            // Scan to ">" rather than "?>": Office generates bogus PI's that end
            // with "/>". The same scan also serves as error recovery when the
            // target is immediately followed by '?'.
            string data = this.current.ScanToEnd(this.sb, "Processing Instruction", ">");
            // skip xml declarations, since these are generated in the output instead.
            if (target == "xml") {
                return false;
            }
            Push(nametable.Add(target), XmlNodeType.ProcessingInstruction, data);
            return true;
        }

        /// <summary>
        /// Accumulates character data into the shared buffer until a tag
        /// opener or end of input is reached, expanding entity references on
        /// the way, then pushes the result as a Text node.
        /// </summary>
        /// <param name="ch">The first character of the text run.</param>
        /// <param name="newtext">
        /// true to start a fresh buffer; false to continue the text already in
        /// the buffer.
        /// </param>
        /// <returns>true when only whitespace was seen during this call.</returns>
        bool ParseText(char ch, bool newtext) {
            bool onlyWhitespace = newtext ? this.current.IsWhitespace : true;
            if (newtext) {
                this.sb.Length = 0;
            }
            this.state = State.Text;
            while (ch != Entity.EOF) {
                if (ch == '&') {
                    ExpandEntity(this.sb, '<');
                    onlyWhitespace = false;
                    ch = this.current.Lastchar;
                    continue;
                }
                if (ch != '<') {
                    if (!this.current.IsWhitespace) {
                        onlyWhitespace = false;
                    }
                    this.sb.Append(ch);
                    ch = this.current.ReadChar();
                    continue;
                }

                // Saw '<': peek at what follows to decide whether it opens a tag.
                char next = this.current.ReadChar();
                bool opensTag = next == '/' || next == '!' || next == '?' || Char.IsLetter(next);
                if (opensTag) {
                    // Hit a tag, so return the text gathered so far and remember
                    // that a new tag has been partially read.
                    this.state = State.PartialTag;
                    this.partial = next;
                    break;
                }
                // not a tag, so keep both characters as text and proceed.
                this.sb.Append('<');
                this.sb.Append(next);
                onlyWhitespace = false;
                ch = this.current.ReadChar();
            }
            Push(null, XmlNodeType.Text, this.sb.ToString());
            return onlyWhitespace;
        }

        /// <summary>
        /// Scans a quoted literal from the current input. This version is
        /// slightly different from Entity.ScanLiteral in that it also expands
        /// entities.
        /// </summary>
        /// <param name="sb">Buffer that is cleared and receives the literal's text.</param>
        /// <param name="quote">The quote character that terminates the literal.</param>
        /// <returns>The literal's text with entity references expanded.</returns>
        public string ScanLiteral(StringBuilder sb, char quote) {
            sb.Length = 0;
            char ch = this.current.ReadChar();
            while (ch != Entity.EOF && ch != quote ) {
                if (ch == '&') {
                    // Expand into the caller's buffer (previously this.sb, which
                    // lost the expansion whenever a different buffer was passed).
                    ExpandEntity(sb, quote);
                    ch = this.current.Lastchar;
                }
                else {
                    sb.Append(ch);
                    ch = this.current.ReadChar();
                }
            }
            this.current.ReadChar(); // consume end quote.
            return sb.ToString();
        }

        /// <summary>
        /// Parses the content of an element whose DTD content model is CDATA.
        /// The method is re-entered through State.CData; this.partial records
        /// what has to happen on the next entry ('!' parse a comment, '?' parse
        /// a PI, '/' report the end tag, ' ' just pop the previously returned
        /// node, '\0' first entry).
        /// </summary>
        /// <returns>
        /// The result of ParseComment or ParsePI when delegating to them;
        /// otherwise true.
        /// </returns>
        bool ParseCData() {
            // Like ParseText(), only it doesn't allow elements in the content.  
            // It allows comments and processing instructions and text only and
            // text is not returned as text but CDATA (since it may contain angle brackets).
            // And initial whitespace is ignored.  It terminates when we hit the
            // end tag for the current CDATA node (e.g. </style>).
            bool ws = this.current.IsWhitespace;
            this.sb.Length = 0;
            char ch = this.current.Lastchar;
            if (this.partial != '\0') {
                // Re-entry: first discard the node returned by the previous call.
                Pop(); // pop the CDATA
                switch (this.partial) {
                    case '!':
                        this.partial = ' '; // and pop the comment next time around
                        return ParseComment();
                    case '?':
                        this.partial = ' '; // and pop the PI next time around
                        return ParsePI();
                    case '/':
                        this.state = State.EndTag;
                        return true;    // we are done!
                    case ' ':
                        break; // means we just needed to pop the Comment, PI or CDATA.
                }
            } else {
                // First entry for this element: start reading its content.
                ch = this.current.ReadChar();
            }            
            
            // if this.partial == '!' then parse the comment and return
            // if this.partial == '?' then parse the processing instruction and return.            
            while (ch != Entity.EOF) {
                if (ch == '<') {
                    ch = this.current.ReadChar();
                    if (ch == '!') {
                        ch = this.current.ReadChar();
                        if (ch == '-') {
                            // return what CDATA we have accumulated so far
                            // then parse the comment and return to here.
                            if (ws) {
                                this.partial = ' '; // pop comment next time through
                                return ParseComment();
                            } 
                            else {
                                // return what we've accumulated so far then come
                                // back in and parse the comment.
                                this.partial = '!';
                                break; 
                            }
#if FIX
                        } else if (ch == '['){
                            // We are about to wrap this node as a CDATA block because of it's
                            // type in the DTD, but since we found a CDATA block in the input
                            // we have to parse it as a CDATA block, otherwise we will attempt
                            // to output nested CDATA blocks which of course is illegal.
                            if (this.ParseConditionalBlock()){
                                this.partial = ' ';
                                return true;
                            }
#endif
                        } else {
                            // not a comment, so ignore it and continue on.
                            this.sb.Append('<');
                            this.sb.Append('!');
                            this.sb.Append(ch);
                            ws = false;
                        }
                    } 
                    else if (ch == '?') {
                        // processing instruction.
                        this.current.ReadChar();// consume the '?' character.
                        if (ws) {
                            this.partial = ' '; // pop PI next time through
                            return ParsePI();
                        } 
                        else {
                            this.partial = '?';
                            break;
                        }
                    }
                    else if (ch == '/') {
                        // see if this is the end tag for this CDATA node.
                        // ParseEndTag scans through the shared buffer, so save its
                        // contents first.
                        string temp = this.sb.ToString();
                        if (ParseEndTag() && this.endTag == (object)this.node.Name) {
                            if (ws || temp == "") {
                                // we are done!
                                return true;
                            } 
                            else {
                                // return CDATA text then the end tag
                                this.partial = '/';
                                this.sb.Length = 0; // restore buffer!
                                this.sb.Append(temp); 
                                this.state = State.CData;
                                break;
                            }
                        } 
                        else {
                            // wrong end tag, so continue on.
                            this.sb.Length = 0; // restore buffer!
                            this.sb.Append(temp); 
                            this.sb.Append("</"+this.endTag+">");
                            ws = false;
                        }
                    }
                    else {
                        // must be just part of the CDATA block, so proceed.
                        this.sb.Append('<'); 
                        this.sb.Append(ch);
                        ws = false;
                    }
                } 
                else {
                    if (!this.current.IsWhitespace && ws) ws = false;
                    this.sb.Append(ch);
                }
                ch = this.current.ReadChar();
            }
            // Reached on EOF or on one of the 'break's above: emit what was gathered.
            string value = this.sb.ToString();
            Push(null, XmlNodeType.CDATA, value);
            if (this.partial == '\0')
                this.partial = ' ';// force it to pop this CDATA next time in.
            return true;
        }

        /// <summary>
        /// Expands the entity reference that starts at the '&amp;' just read.
        /// Character references are expanded via Entity.ExpandCharEntity;
        /// internal DTD entities are replaced by their literal; external
        /// entities become the current input. References that cannot be
        /// resolved are copied to the output unchanged.
        /// </summary>
        /// <param name="sb">Buffer that receives the expansion.</param>
        /// <param name="terminator">
        /// Character that ends the enclosing construct; it is never consumed
        /// here so the caller still sees it.
        /// </param>
        void ExpandEntity(StringBuilder sb, char terminator) {
            char ch = this.current.ReadChar();
            if (ch == '#') {
                string charent = this.current.ExpandCharEntity();
                sb.Append(charent);
                return;
            }

            // Collect the entity name. Digits are allowed after the first
            // character so that names such as "sup2" or "frac12" are recognised.
            this.name.Length = 0;
            while (ch != Entity.EOF &&
                (Char.IsLetter(ch) || ch == '_' || ch == '-' ||
                 (this.name.Length > 0 && Char.IsDigit(ch)))) {
                this.name.Append(ch);
                ch = this.current.ReadChar();
            }
            string name = this.name.ToString();
            if (this.dtd != null && name != "") {
                Entity e = (Entity)this.dtd.FindEntity(name);
                if (e != null) {
                    if (e.Internal) {
                        sb.Append(e.Literal);
                        // Step past the character that ended the name (normally ';'),
                        // but leave the terminator, the start of another reference
                        // and EOF for the caller to handle.
                        if (ch != terminator && ch != '&' && ch != Entity.EOF)
                            this.current.ReadChar();
                        return;
                    }
                    else {
                        Entity ex = new Entity(name, e.PublicId, e.Uri, this.current.Proxy);
                        // Open the entity that becomes the current input; the old
                        // code opened the DTD's declaration 'e' and then read from
                        // the never-opened 'ex'.
                        ex.Open(this.current, new Uri(e.Uri));
                        this.current = ex;
                        this.current.ReadChar();
                        return;
                    }
                }
                else {
                    Log("Undefined entity '{0}'", name);
                }
            }
            // Entity is not defined, so just keep it in with the rest of the
            // text.
            sb.Append("&");
            sb.Append(name);
            // Never copy the EOF sentinel into the text, and leave a following '&'
            // in place so the caller expands it as its own reference.
            if (ch != terminator && ch != '&' && ch != Entity.EOF) {
                sb.Append(ch);
                this.current.ReadChar();
            }
        }

        /// <summary>
        /// Gets whether the reader's state machine has reached State.Eof.
        /// </summary>
        public override bool EOF {
            get {
                bool atEnd = (this.state == State.Eof);
                return atEnd;
            }
        }

        /// <summary>
        /// Closes the current input entity and the log, if they are set, and
        /// clears both references so a repeated call does nothing.
        /// </summary>
        public override void Close() {
            bool hasInput = (this.current != null);
            if (hasInput) {
                this.current.Close();
                this.current = null;
            }

            bool hasLog = (this.log != null);
            if (hasLog) {
                this.log.Close();
                this.log = null;
            }
        }

        /// <summary>
        /// Maps the internal parser state onto the XmlReader read state:
        /// Initial, EndOfFile, or Interactive for everything in between.
        /// </summary>
        public override ReadState ReadState {
            get {
                switch (this.state) {
                    case State.Initial:
                        return ReadState.Initial;
                    case State.Eof:
                        return ReadState.EndOfFile;
                    default:
                        return ReadState.Interactive;
                }
            }
        }

        /// <summary>
        /// When positioned on an element, reads forward and concatenates the
        /// values of consecutive text-like nodes (Text, CDATA, Whitespace,
        /// SignificantWhitespace), stopping at the first other node or at end
        /// of input. On any other node type, returns the current node's value.
        /// </summary>
        public override string ReadString() {
            if (this.node.NodeType != XmlNodeType.Element) {
                return this.node.Value;
            }

            this.sb.Length = 0;
            bool more = Read();
            while (more) {
                XmlNodeType kind = this.NodeType;
                bool textual = kind == XmlNodeType.CDATA
                    || kind == XmlNodeType.SignificantWhitespace
                    || kind == XmlNodeType.Whitespace
                    || kind == XmlNodeType.Text;
                if (!textual) {
                    break;
                }
                this.sb.Append(this.node.Value);
                more = Read();
            }
            return this.sb.ToString();
        }


        /// <summary>
        /// Serializes the content of the current node as indented XML. For an
        /// element, the child nodes are written up to the matching end element,
        /// which is then consumed; for an attribute, its value is returned; for
        /// any other node type the result is an empty string.
        /// </summary>
        public override string ReadInnerXml() {
            StringWriter buffer = new StringWriter();
            XmlTextWriter writer = new XmlTextWriter(buffer);
            writer.Formatting = Formatting.Indented;

            XmlNodeType kind = this.NodeType;
            if (kind == XmlNodeType.Element) {
                Read();
                while (!(this.EOF || this.NodeType == XmlNodeType.EndElement)) {
                    writer.WriteNode(this, true);
                }
                Read(); // consume the end tag
            }
            else if (kind == XmlNodeType.Attribute) {
                buffer.Write(this.Value);
            }
            // any other node type: return empty string according to XmlReader spec.

            writer.Close();
            return buffer.ToString();
        }

        /// <summary>
        /// Serializes the current node, including its own markup and all of
        /// its content, as indented XML.
        /// </summary>
        public override string ReadOuterXml() {
            StringWriter buffer = new StringWriter();
            XmlTextWriter writer = new XmlTextWriter(buffer);
            writer.Formatting = Formatting.Indented;
            writer.WriteNode(this, true);
            writer.Close();
            string xml = buffer.ToString();
            return xml;
        }

        /// <summary>
        /// Gets the name table this reader uses to atomize names.
        /// </summary>
        public override XmlNameTable NameTable {
            get {
                XmlNameTable table = this.nametable;
                return table;
            }
        }

        /// <summary>
        /// Always returns null: there are no namespaces in SGML.
        /// </summary>
        /// <param name="prefix">Ignored.</param>
        public override string LookupNamespace(string prefix) {
            string noNamespace = null;
            return noNamespace;
        }

        /// <summary>
        /// Not supported: this reader never returns entity reference nodes, so
        /// a call is always an error.
        /// </summary>
        /// <exception cref="InvalidOperationException">Always thrown.</exception>
        public override void ResolveEntity() {
            InvalidOperationException notOnEntity =
                new InvalidOperationException("Not on an entity reference.");
            throw notOnEntity;
        }

        /// <summary>
        /// Steps from an attribute to its value. The first call on an attribute
        /// moves to State.AttrValue and returns true; a further call returns
        /// false.
        /// </summary>
        /// <exception cref="InvalidOperationException">The reader is not positioned on an attribute.</exception>
        public override bool ReadAttributeValue() {
            switch (this.state) {
                case State.Attr:
                    this.state = State.AttrValue;
                    return true;
                case State.AttrValue:
                    return false;
                default:
                    throw new InvalidOperationException("Not on an attribute.");
            }
        }   

        /// <summary>
        /// Looks the node's name up in the DTD, if one is loaded, and attaches
        /// the element declaration to the node. Elements whose declared content
        /// is EMPTY are marked as empty.
        /// </summary>
        void Validate(Node node) {
            if (this.dtd == null) {
                return;
            }
            ElementDecl decl = this.dtd.FindElement(node.Name);
            if (decl == null) {
                return;
            }
            node.DtdType = decl;
            if (decl.ContentModel.DeclaredContent == DeclaredContent.EMPTY) {
                node.IsEmpty = true;
            }
        }

        /// <summary>
        /// Attaches the DTD attribute definition to the attribute when the
        /// owning element has a declaration that defines it.
        /// </summary>
        void ValidateAttribute(Node node, Attribute a) {
            ElementDecl owner = node.DtdType;
            if (owner == null) {
                return;
            }
            AttDef definition = owner.FindAttribute(a.Name);
            if (definition != null) {
                a.DtdType = definition;
            }
        }   

        /// <summary>
        /// Checks, against the DTD, whether the element just pushed is allowed
        /// inside the elements currently open. If it is not, the reader is put
        /// into State.AutoClose: the new node is taken off the stack and kept
        /// in this.newnode, and this.poptodepth records how far to close.
        /// Does nothing when no DTD is loaded.
        /// </summary>
        /// <param name="node">The element node that was just pushed.</param>
        void ValidateContent(Node node) {
            if (this.dtd != null) {
                // See if this element is allowed inside the current element.
                // If it isn't, then auto-close elements until we find one
                // that it is allowed to be in.
                // DTD is in upper case; fold with the invariant culture so the
                // lookup does not depend on the thread's current culture.
                string name = this.nametable.Add(node.Name.ToUpper(System.Globalization.CultureInfo.InvariantCulture));
                int i = 0;
                int top = this.stack.Count-2; // index of the new node's parent
                if (node.DtdType != null) {
                    // it is a known element, let's see if it's allowed in the
                    // current context.
                    for (i = top; i>0; i--) {
                        Node n = (Node)this.stack[i];
                        if (n.IsEmpty)
                            continue; // we'll have to pop this one
                        ElementDecl f = n.DtdType;
                        if (f != null) {
                            if (f.Name == this.dtd.Name)
                                break; // can't pop the root element.
                            if (f.CanContain(name, this.dtd)) {
                                break;
                            }
                            else if (!f.EndTagOptional) {
                                // If the end tag is not optional then we can't
                                // auto-close it.  We'll just have to live with the
                                // junk we've found and move on.
                                break;
                            }
                        }
                        else {
                            // Since we don't understand this tag anyway,
                            // we might as well allow this content!
                            break;
                        }
                    }
                }
                if (i == 0) {
                    // Tag was not found or is not allowed anywhere, ignore it and
                    // continue on.
                }
                else if (i < top) {
                    Node n = (Node)this.stack[top];
                    // 'name' is upper case while n.Name keeps the reader's folding,
                    // so this comparison has to ignore case.
                    if (i == top - 1 && string.Compare(name, n.Name, true, System.Globalization.CultureInfo.InvariantCulture) == 0) {
                        // e.g. p not allowed inside p, not an interesting error.
                    } else {
                        string closing = "";
                        for (int k = top; k >= i+1; k--) {
                            if (closing != "") closing += ",";
                            Node n2 = (Node)this.stack[k];
                            closing += "<"+n2.Name+">";
                        }
                        Log("Element '{0}' not allowed inside '{1}', closing {2}.",
                            name, n.Name, closing);
                    }
                    this.state = State.AutoClose;
                    this.newnode = node;
                    Pop(); // save this new node until we pop the others
                    this.poptodepth = i+1;
                }
            }
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Chief Technology Officer Zeta Software GmbH
Germany Germany
Uwe has been programming since 1989, with experience in Assembler, C++, MFC and lots of web and database stuff, and now uses ASP.NET and C# extensively, too. He has also taught programming to students at the local university.

➡️ Give me a tip 🙂

In his free time, he does climbing, running and mountain biking. In 2012 he became a father of a cute boy and in 2014 of an awesome girl.

Some cool, free software from us:

Windows 10 Ereignisanzeige  
German Developer Community  
Free Test Management Software - Intuitive, competitive, Test Plans.  
Homepage erstellen - Intuitive, very easy to use.  
Offline-Homepage-Baukasten

Comments and Discussions