Click here to Skip to main content
15,881,757 members
Articles / Web Development / CSS3

AngleSharp

Rate me:
Please Sign up or sign in to vote.
5.00/5 (87 votes)
3 Jul 2013 | BSD | 28 min read | 260.4K | 4.3K | 166
Bringing the DOM to C# with an HTML5/CSS3 parser written in C#.
using AngleSharp.DOM;
using AngleSharp.DOM.Xml;
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;

namespace AngleSharp.Xml
{
    /// <summary>
    /// Builds an XMLDocument from the token stream produced by the XML tokenizer.
    /// For more details see http://www.w3.org/TR/xml11/ and
    /// http://www.w3.org/html/wg/drafts/html/master/the-xhtml-syntax.html#xml-parser.
    /// </summary>
    /// <remarks>
    /// The parser runs at most once per instance. It performs no internal
    /// synchronization, so an instance should not be driven from several
    /// threads at the same time.
    /// </remarks>
    public class XmlParser : IParser
    {
        #region Members

        readonly XmlTokenizer tokenizer;
        readonly XMLDocument doc;
        readonly List<Element> open;
        Boolean started;
        XmlTreeMode insert;
        TaskCompletionSource<Boolean> tcs;
        Boolean standalone;

        #endregion

        #region Events

        /// <summary>
        /// This event is raised once a parser error occurred.
        /// </summary>
        public event EventHandler<ParseErrorEventArgs> ErrorOccurred;

        #endregion

        #region ctor

        /// <summary>
        /// Creates a new instance of the XML parser with a new document
        /// based on the given source.
        /// </summary>
        /// <param name="source">The source code as a string.</param>
        public XmlParser(String source)
            : this(new XMLDocument(), new SourceManager(source))
        {
        }

        /// <summary>
        /// Creates a new instance of the XML parser with a new document
        /// based on the given stream.
        /// </summary>
        /// <param name="stream">The stream to use as source.</param>
        public XmlParser(Stream stream)
            : this(new XMLDocument(), new SourceManager(stream))
        {
        }

        /// <summary>
        /// Creates a new instance of the XML parser with the specified document
        /// based on the given source.
        /// </summary>
        /// <param name="document">The document instance to be constructed.</param>
        /// <param name="source">The source code as a string.</param>
        public XmlParser(XMLDocument document, String source)
            : this(document, new SourceManager(source))
        {
        }

        /// <summary>
        /// Creates a new instance of the XML parser with the specified document
        /// based on the given stream.
        /// </summary>
        /// <param name="document">The document instance to be constructed.</param>
        /// <param name="stream">The stream to use as source.</param>
        public XmlParser(XMLDocument document, Stream stream)
            : this(document, new SourceManager(stream))
        {
        }

        /// <summary>
        /// Creates a new instance of the XML parser with the specified document
        /// based on the given source manager.
        /// </summary>
        /// <param name="document">The document instance to be constructed.</param>
        /// <param name="source">The source to use.</param>
        internal XmlParser(XMLDocument document, SourceManager source)
        {
            tokenizer = new XmlTokenizer(source);

            // Forward tokenizer errors through the parser's own event, so that
            // subscribers only have to listen to a single source.
            tokenizer.ErrorOccurred += (s, ev) =>
            {
                // Snapshot the delegate: a subscriber might detach between the
                // null check and the invocation.
                var handler = ErrorOccurred;

                if (handler != null)
                    handler(this, ev);
            };

            started = false;
            doc = document;
            standalone = false;
            open = new List<Element>();
            insert = XmlTreeMode.Initial;
        }

        #endregion

        #region Properties

        /// <summary>
        /// Gets the current node, i.e. the innermost open element or the
        /// document itself if no element is open.
        /// </summary>
        internal Node CurrentNode
        {
            get { return open.Count > 0 ? (Node)open[open.Count - 1] : (Node)doc; }
        }

        /// <summary>
        /// Gets the (maybe intermediate) result of the parsing process. If the
        /// parser has not been started yet, the source is parsed synchronously
        /// first; otherwise the document is returned in its current state.
        /// </summary>
        public XMLDocument Result
        {
            get
            {
                Parse();
                return doc;
            }
        }

        /// <summary>
        /// Gets if the XML is standalone (as stated by the XML declaration).
        /// </summary>
        public Boolean Standalone
        {
            get { return standalone; }
        }

        /// <summary>
        /// Gets if the parser has been started asynchronously.
        /// </summary>
        public Boolean IsAsync
        {
            get { return tcs != null; }
        }

        #endregion

        #region Methods

        /// <summary>
        /// Parses the given source and creates the document. Calling this
        /// method after the parser has already been started does nothing.
        /// </summary>
        /// <exception cref="ArgumentException">
        /// The declared XML version is not supported, or an end-tag does not
        /// match the currently open element.
        /// </exception>
        public void Parse()
        {
            if (!started)
            {
                started = true;
                Kernel();
            }
        }

        /// <summary>
        /// Parses the given source asynchronously and creates the document.
        /// The work is scheduled on the default task scheduler; exceptions
        /// thrown while parsing are reported through the returned task.
        /// </summary>
        /// <returns>The task which could be awaited or continued differently.</returns>
        public Task ParseAsync()
        {
            if (!started)
            {
                started = true;

                // Keep a local reference for the closure, so that the worker
                // always completes exactly the task handed out here.
                var completion = new TaskCompletionSource<Boolean>();
                tcs = completion;

                Task.Factory.StartNew(() =>
                {
                    try
                    {
                        Kernel();
                        completion.SetResult(true);
                    }
                    catch (Exception ex)
                    {
                        // Surface the failure to whoever awaits the task
                        // instead of losing it on the worker thread.
                        completion.SetException(ex);
                    }
                },
                System.Threading.CancellationToken.None,
                TaskCreationOptions.None,
                TaskScheduler.Default);

                return completion.Task;
            }
            else if (tcs == null)
            {
                // The parser has already been run synchronously: hand out a
                // task that is completed right away.
                var temp = new TaskCompletionSource<Boolean>();
                temp.SetResult(true);
                return temp.Task;
            }

            return tcs.Task;
        }

        /// <summary>
        /// Pulls tokens from the tokenizer and consumes them until (and
        /// including) the EOF token.
        /// </summary>
        void Kernel()
        {
            XmlToken token;

            do
            {
                token = tokenizer.Get();
                Consume(token);
            }
            while (token.Type != XmlTokenType.EOF);
        }

        /// <summary>
        /// Consumes a token and processes it according to the current
        /// insertion mode.
        /// </summary>
        /// <param name="token">The token to consume.</param>
        void Consume(XmlToken token)
        {
            switch (insert)
            {
                case XmlTreeMode.Initial:
                    Initial(token);
                    break;
                case XmlTreeMode.Prolog:
                    BeforeDoctype(token);
                    break;
                case XmlTreeMode.Body:
                    InBody(token);
                    break;
            }
        }

        #endregion

        #region States

        /// <summary>
        /// Initial mode: handles the XML declaration. Any other non-ignorable
        /// token is reported as an error and reprocessed in the prolog mode.
        /// </summary>
        /// <param name="token">The token to consume.</param>
        /// <exception cref="ArgumentException">
        /// The declared version is not a number or is 2.0 or higher.
        /// </exception>
        void Initial(XmlToken token)
        {
            if (token.Type == XmlTokenType.Declaration)
            {
                var tok = (XmlDeclarationToken)token;
                Double ver;
                standalone = tok.Standalone;

                if (!tok.IsEncodingMissing)
                    SetEncoding(tok.Encoding);

                // The declaration has been handled, hence leave the initial
                // mode. Otherwise the next significant token would be reported
                // as an error and further declarations would be accepted.
                insert = XmlTreeMode.Prolog;

                // The version number always uses '.' as decimal separator, so
                // it must be parsed independently of the current culture.
                if (!Double.TryParse(tok.Version,
                        System.Globalization.NumberStyles.Float,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out ver) || ver >= 2.0)
                    throw new ArgumentException("The given version number is not supported.");
            }
            else if (!token.IsIgnorable)
            {
                RaiseErrorOccurred(ErrorCode.UndefinedMarkupDeclaration);
                insert = XmlTreeMode.Prolog;
                BeforeDoctype(token);
            }
        }

        /// <summary>
        /// Prolog mode: handles the doctype as well as processing instructions
        /// and comments in front of it. Any other non-ignorable token switches
        /// to the body mode and is reprocessed there.
        /// </summary>
        /// <param name="token">The token to consume.</param>
        void BeforeDoctype(XmlToken token)
        {
            if (token.Type == XmlTokenType.DOCTYPE)
            {
                var tok = (XmlDoctypeToken)token;
                var doctype = new DocumentType();
                doctype.SystemId = tok.SystemIdentifier;
                doctype.PublicId = tok.PublicIdentifier;
                doctype.Name = tok.Name;
                doc.AppendChild(doctype);
                insert = XmlTreeMode.Body;
            }
            else if (token.Type == XmlTokenType.ProcessingInstruction)
            {
                var tok = (XmlPIToken)token;
                var pi = doc.CreateProcessingInstruction(tok.Target, tok.Content);
                doc.AppendChild(pi);
            }
            else if (token.Type == XmlTokenType.Comment)
            {
                var tok = (XmlCommentToken)token;
                var com = doc.CreateComment(tok.Data);
                doc.AppendChild(com);
            }
            else if (!token.IsIgnorable)
            {
                insert = XmlTreeMode.Body;
                InBody(token);
            }
        }

        /// <summary>
        /// Body mode: builds the element tree from tags, comments and
        /// processing instructions.
        /// </summary>
        /// <param name="token">The token to consume.</param>
        /// <exception cref="ArgumentException">
        /// An end-tag appeared without an open element or does not match the
        /// name of the current element.
        /// </exception>
        void InBody(XmlToken token)
        {
            switch (token.Type)
            {
                case XmlTokenType.StartTag:
                {
                    var tok = (XmlTagToken)token;
                    var tag = doc.CreateElement(tok.Name);

                    // Attach the element to its parent BEFORE pushing it onto
                    // the stack of open elements; otherwise CurrentNode would
                    // already be the new element and it would be appended to
                    // itself.
                    CurrentNode.AppendChild(tag);

                    if (!tok.IsSelfClosing)
                        open.Add(tag);

                    for (int i = 0; i < tok.Attributes.Count; i++)
                        tag.SetAttribute(tok.Attributes[i].Key, tok.Attributes[i].Value);

                    break;
                }
                case XmlTokenType.EndTag:
                {
                    if (open.Count == 0)
                        throw new ArgumentException("Unexpected end-tag (no current element).");

                    var tok = (XmlTagToken)token;

                    if (CurrentNode.NodeName != tok.Name)
                        throw new ArgumentException("Mismatched end-tag.");

                    open.RemoveAt(open.Count - 1);
                    break;
                }
                case XmlTokenType.Comment:
                {
                    var tok = (XmlCommentToken)token;
                    var com = doc.CreateComment(tok.Data);
                    CurrentNode.AppendChild(com);
                    break;
                }
                case XmlTokenType.ProcessingInstruction:
                {
                    var tok = (XmlPIToken)token;
                    var pi = doc.CreateProcessingInstruction(tok.Target, tok.Content);
                    CurrentNode.AppendChild(pi);
                    break;
                }
                case XmlTokenType.Character:
                {
                    // NOTE(review): character data is currently discarded, i.e.
                    // no text nodes are created. TODO: append the character to
                    // the current node once the token's payload is available.
                    break;
                }
                case XmlTokenType.EOF:
                {
                    // Elements that are still open at the end of the source
                    // are reported once and then dropped from the stack.
                    if (open.Count != 0)
                    {
                        RaiseErrorOccurred(ErrorCode.EOF);
                        open.Clear();
                    }
                    break;
                }
                case XmlTokenType.DOCTYPE:
                {
                    RaiseErrorOccurred(ErrorCode.DoctypeUnexpected);
                    break;
                }
                case XmlTokenType.Declaration:
                {
                    RaiseErrorOccurred(ErrorCode.UndefinedMarkupDeclaration);
                    break;
                }
            }
        }

        #endregion

        #region Helpers

        /// <summary>
        /// Applies the given encoding to the document and the underlying
        /// source, provided that it is supported and can be resolved.
        /// Otherwise nothing is changed.
        /// </summary>
        /// <param name="encoding">The name of the encoding to use.</param>
        void SetEncoding(String encoding)
        {
            if (HtmlEncoding.IsSupported(encoding))
            {
                var enc = HtmlEncoding.Resolve(encoding);

                if (enc != null)
                {
                    doc.InputEncoding = enc.WebName;
                    tokenizer.Stream.Encoding = enc;
                }
            }
        }

        #endregion

        #region Handlers

        /// <summary>
        /// Fires an error occurred event that carries the current line and
        /// column of the source. Nothing is done if nobody is subscribed.
        /// </summary>
        /// <param name="code">The associated error code.</param>
        void RaiseErrorOccurred(ErrorCode code)
        {
            // Snapshot the delegate: a subscriber might detach between the
            // null check and the invocation.
            var handler = ErrorOccurred;

            if (handler != null)
            {
                var pck = new ParseErrorEventArgs((int)code, Errors.GetError(code));
                pck.Line = tokenizer.Stream.Line;
                pck.Column = tokenizer.Stream.Column;
                handler(this, pck);
            }
        }

        #endregion
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The BSD License


Written By
Chief Technology Officer
Germany
Florian lives in Munich, Germany. He started his programming career with Perl. After programming C/C++ for some years he discovered his favorite programming language C#. He did work at Siemens as a programmer until he decided to study Physics.

During his studies he worked as an IT consultant for various companies. After graduating with a PhD in theoretical particle Physics he is working as a senior technical consultant in the field of home automation and IoT.

Florian has been giving lectures in C#, HTML5 with CSS3 and JavaScript, software design, and other topics. He is regularly giving talks at user groups, conferences, and companies. He is actively contributing to open-source projects. Florian is the maintainer of AngleSharp, a completely managed browser engine.

Comments and Discussions