Click here to Skip to main content
15,894,405 members
Articles / Desktop Programming / XAML

Dead Simple HTML Sanitizer

Rate me:
Please Sign up or sign in to vote.
4.75/5 (4 votes)
18 Jan 2013 | CPOL | 34.3K | 600 | 8
A dead simple HTML Sanitizer (and HTML Parser) you can use to clean user HTML input.

namespace WhichMan.Utilities.HtmlUtils
{
    using System;
    using System.Collections.Generic;
    using System.Collections.ObjectModel;
    using System.Linq;

    /// <summary>
    /// Removes potentially dangerous markup from an HTML fragment: blacklisted elements,
    /// event-handler attributes, attributes carrying script URLs, comments and processing
    /// instructions. Submit inputs are downgraded to plain buttons.
    /// </summary>
    /// <remarks>
    /// This is a blacklist-based sanitizer. It only sees what the project's HTML parser
    /// hands it, so values that are still entity-encoded at this point are compared in
    /// their encoded form.
    /// </remarks>
    public static class HtmlSanitizer
    {
        // Wrapper added around the fragment so the parser always receives a single root element.
        private const string WrapperOpen = "<html>";
        private const string WrapperClose = "</html>";

        // Names that are removed whether they appear as an element name or an attribute name.
        // HTML names are case-insensitive, so the lookup must be as well.
        private static readonly HashSet<string> Unsafe = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmousemove",
            "onmouseout", "onkeypress", "onkeydown", "onkeyup", "script", "applet", "embed", "frameset",
            "iframe", "frame", "object", "ilayer", "layer"
        };

        // URL schemes that execute script when a browser follows them.
        private static readonly string[] ScriptSchemes = { "javascript:", "vbscript:" };

        /// <summary>
        /// Sanitizes an HTML fragment supplied as text.
        /// </summary>
        /// <param name="html">The HTML fragment to clean. <c>null</c> is treated as an empty fragment.</param>
        /// <returns>
        /// The sanitized fragment, or <c>null</c> when the parsed root has no child nodes left
        /// after sanitizing.
        /// </returns>
        public static string Sanitize(string html)
        {
            var doc = HtmlParser.Parse(WrapperOpen + html + WrapperClose);
            Sanitize(doc);

            if (doc.ChildNodes.Count == 0)
                return null;

            var result = doc.ToString();

            // Strip the wrapper only when it is really there, instead of blindly cutting
            // 6 + 7 characters (which throws on short output and corrupts anything else).
            var wrapperLength = WrapperOpen.Length + WrapperClose.Length;
            if (result.Length >= wrapperLength
                && result.StartsWith(WrapperOpen, StringComparison.OrdinalIgnoreCase)
                && result.EndsWith(WrapperClose, StringComparison.OrdinalIgnoreCase))
            {
                return result.Substring(WrapperOpen.Length, result.Length - wrapperLength);
            }

            return result;
        }

        /// <summary>
        /// Sanitizes an already parsed element tree in place and returns its serialized form.
        /// </summary>
        /// <param name="doc">Root element to clean; the element and its descendants are modified.</param>
        /// <returns>The result of <c>doc.ToString()</c> after sanitizing.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="doc"/> is <c>null</c>.</exception>
        public static string Sanitize(HtmlElement doc)
        {
            if (doc == null)
                throw new ArgumentNullException("doc");

            doc.RemoveUnSafe();
            return doc.ToString();
        }

        /// <summary>
        /// Recursively removes unsafe attributes and child nodes from <paramref name="element"/>.
        /// </summary>
        private static void RemoveUnSafe(this HtmlElement element)
        {
            // Materialise the matches first: a lazy query over element.Attributes must not be
            // enumerated while RemoveAll is mutating that same collection.
            var attributesToRemove = element.Attributes.Where(attribute => attribute.IsUnSafe()).ToList();
            element.RemoveAll(attributesToRemove);

            // Turn submit inputs into inert buttons. Ordinal, case-insensitive comparison avoids
            // culture-specific lower-casing (e.g. the Turkish dotless i) and tolerates a null value.
            if (string.Equals(element.Name, "input", StringComparison.OrdinalIgnoreCase))
            {
                foreach (var attribute in element.Attributes)
                {
                    if (string.Equals(attribute.Name, "type", StringComparison.OrdinalIgnoreCase)
                        && string.Equals(attribute.Value, "submit", StringComparison.OrdinalIgnoreCase))
                    {
                        attribute.Value = "button";
                    }
                }
            }

            // Walk the children back to front so RemoveAt never shifts an index that is still
            // to be visited; surviving child elements are cleaned recursively in the same pass.
            for (var i = element.ChildNodes.Count - 1; i >= 0; i--)
            {
                var node = element.ChildNodes[i];
                if (node is HtmlInstruction || node is HtmlComment || node.IsUnSafe())
                {
                    element.ChildNodes.RemoveAt(i);
                    continue;
                }

                var child = node as HtmlElement;
                if (child != null)
                    child.RemoveUnSafe();
            }
        }

        /// <summary>
        /// Determines whether an attribute must be removed: its name is blacklisted, it is an
        /// event handler (any name starting with "on"), or its value is a script URL.
        /// </summary>
        private static bool IsUnSafe(this HtmlAttribute attribute)
        {
            var name = attribute.Name;
            if (!string.IsNullOrEmpty(name))
            {
                // The explicit list only names a handful of handlers; onerror, onload, onfocus
                // and the rest are just as dangerous, so every on* attribute is rejected.
                if (Unsafe.Contains(name) || name.StartsWith("on", StringComparison.OrdinalIgnoreCase))
                    return true;
            }

            return HasScriptScheme(attribute.Value);
        }

        /// <summary>
        /// Determines whether a node is an element whose name is blacklisted.
        /// </summary>
        private static bool IsUnSafe(this HtmlNode node)
        {
            var element = node as HtmlElement;
            return element != null && Unsafe.Contains(element.Name);
        }

        /// <summary>
        /// Returns <c>true</c> when <paramref name="value"/> starts with a script-executing URL scheme.
        /// </summary>
        private static bool HasScriptScheme(string value)
        {
            if (string.IsNullOrEmpty(value))
                return false;

            // Drop spaces and control characters before comparing, so forms such as
            // "java\tscript:" or " javascript:" are caught as well.
            var compact = new string(value.Where(c => c > ' ').ToArray());
            return ScriptSchemes.Any(scheme => compact.StartsWith(scheme, StringComparison.OrdinalIgnoreCase));
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Architect
United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions