Click here to Skip to main content
Click here to Skip to main content
Articles » Languages » C# » General » Downloads
 
Add your own
alternative version

A non-well-formed HTML Parser and CSS Resolver

, 20 Jul 2007
A parser for non-well-formed HTML and a CSS resolver, built in pure .NET C#
dols_html.zip
DOLS
Backup
DFuzzy.rar
DOLRss.rar
DXHtmlObjectModel.rar
bin
Debug
doc
Check.doc
DOL
DBase
DHtml
DCssResolver
DHtmlParser
Node
DOLBase.csproj.user
DOLHtml.csproj.user
DOLS.suo
HTMLTreeDemo
bin
Debug
HTMLTreeDemo.vshost.exe
Google News.files
cleardot.gif
envelope.gif
news(1).jpg
news(10).jpg
news(11).jpg
news(12).jpg
news(13).jpg
news(14).jpg
news(15).jpg
news(16).jpg
news(17).jpg
news(18).jpg
news(19).jpg
news(2).jpg
news(20).jpg
news(21).jpg
news(22).jpg
news(23).jpg
news(24).jpg
news(25).jpg
news(3).jpg
news(4).jpg
news(5).jpg
news(6).jpg
news(7).jpg
news(8).jpg
news(9).jpg
news.gif
news.jpg
Thumbs.db
obj
Debug
TempPE
Properties
Settings.settings
VTune
HTMLTreeDemo.vpj
obj
Debug
TempPE
Properties
Settings.settings
VTune
DOLBase.vpj
DOLHtml.vpj
DOLHtml.vws
dols_html_20070322.zip
DFuzzy.rar
DOLRss.rar
DXHtmlObjectModel.rar
Demo
bin
Debug
Google News.files
cleardot.gif
envelope.gif
news(1).jpg
news(10).jpg
news(11).jpg
news(12).jpg
news(13).jpg
news(14).jpg
news(15).jpg
news(16).jpg
news(17).jpg
news(18).jpg
news(19).jpg
news(2).jpg
news(20).jpg
news(21).jpg
news(22).jpg
news(23).jpg
news(24).jpg
news(25).jpg
news(3).jpg
news(4).jpg
news(5).jpg
news(6).jpg
news(7).jpg
news(8).jpg
news(9).jpg
news.gif
news.jpg
obj
Properties
Check.doc
DOLBase.csproj.user
DOLS.suo
DOLS.vsmdi
Settings.settings
/*****************************************************************************\
>	Copyright 2004 DOL for design studio.
>
>	DOLS DHtmlDocument Class
>
>	E-mail:	  nomad_libra.tw@yahoo.com.tw
>	E-mail:	  jameshrsp@ms2.url.com.tw
>
\*****************************************************************************/

// DHtmlDocument.cs: implementation of the DHtmlDocument class.
//
///////////////////////////////////////////////////////////////////////////////

using System;
using System.IO;
using System.Text;
using DOL.DHtml.DHtmlParser.Node;

namespace DOL.DHtml.DHtmlParser
{
	/// <summary>
/// This is the basic HTML document object used to represent a sequence of HTML.
	/// </summary>
    public class DHtmlDocument : DOL.DBase.DIDiagnosisable, ICloneable
	{

    /////////////////////////////////////////////////////////////////////////////////
    #region Basic operations

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Creates a document that uses a new <see cref="DHtmlGeneralParser"/>.
        /// Nothing is parsed by this constructor.
        /// </summary>
        public DHtmlDocument()
            : this(new DHtmlGeneralParser())
        {
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Creates a document that uses the supplied parser.
        /// Nothing is parsed by this constructor.
        /// </summary>
        /// <param name="parser">Parser stored for later load calls; expected to be non-null (checked by a debug assertion only).</param>
        public DHtmlDocument(DIHtmlParser parser)
        {
            bool parserSupplied = (parser != null);
            System.Diagnostics.Debug.Assert(parserSupplied);

            this.m_parser = parser;
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Creates a document by parsing the given HTML with a new <see cref="DHtmlGeneralParser"/>.
        /// </summary>
        /// <param name="html">HTML text to parse; expected to be non-null (checked by a debug assertion only).</param>
        public DHtmlDocument(string html)
            : this(html, new DHtmlGeneralParser())
        {
        }

		/////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// Creates a document that uses the supplied parser and immediately parses the given HTML.
		/// </summary>
		/// <param name="html">HTML text to parse; expected to be non-null (checked by a debug assertion only).</param>
		/// <param name="parser">Parser stored in the document and used for this and later loads; expected to be non-null (debug assertion only).</param>
		public DHtmlDocument(string html, DIHtmlParser parser)
		{
			bool htmlSupplied = (html != null);
			System.Diagnostics.Debug.Assert(htmlSupplied);

			bool parserSupplied = (parser != null);
			System.Diagnostics.Debug.Assert(parserSupplied);

			this.m_parser = parser;

			// LoadHtml is virtual, so an override in a derived class is what runs here.
			this.LoadHtml(html);
		}

        /////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Creates a copy of this document: the charset reference is copied and each
        /// top-level node is cloned into the copy's node list.
        /// </summary>
        /// <returns>The new <see cref="DHtmlDocument"/>, returned as <c>object</c>.</returns>
        public object Clone()
        {
            // NOTE(review): the copy is created with the parameterless constructor, so it
            // holds a new DHtmlGeneralParser rather than this document's parser - confirm intended.
            DHtmlDocument copy = new DHtmlDocument();
            copy.m_charset = m_charset;

            // Reserve room for all nodes before adding them.
            int total = m_nodeList.Count;
            copy.m_nodeList.Capacity = total;

            int position = 0;
            while(position < total)
            {
                copy.m_nodeList.Add((DHtmlNode)m_nodeList[position].Clone());
                ++position;
            }

            return copy;
        }

        /////////////////////////////////////////////////////////////////////////////////        
		/// <summary>
		/// Checks that this object is valid by calling AssertValid on every top-level node.
		/// </summary>
		public void AssertValid()
		{
			int total = m_nodeList.Count;
			int position = 0;
			while(position < total)
			{
				DOL.DBase.DIDiagnosisable diagnosable = (DOL.DBase.DIDiagnosisable)m_nodeList[position];
				diagnosable.AssertValid();
				++position;
			}
		}

        /////////////////////////////////////////////////////////////////////////////////       
		/// <summary>
		/// Appends a text dump of this document, followed by the dump of each top-level node, to the buffer.
		/// </summary>
		/// <param name="buffer">Receives the dump text.</param>
		/// <param name="prefix">Text written in front of the header line; an extended prefix is used for the remaining lines and passed on to the nodes.</param>
		public void Dump(StringBuilder buffer, string prefix)
		{
			AssertValid();

			// The header line uses the caller's prefix unchanged.
			buffer.Append(prefix).Append("�uObject ").Append(GetType().Name).Append(" Dump : \n");

			// Everything after the header is written with the extended prefix.
			string childPrefix = prefix + "�x�@";
			int total = m_nodeList.Count;
			buffer.Append(childPrefix).Append("DHtmlNode number: ").Append(total).Append("\n");

			if(total == 0)
				return;

			buffer.Append(childPrefix).Append("Deep dump in the following:\n");

			// Visit every node once and let it dump itself.
			for(int position = 0; position < total; ++position)
			{
				buffer.Append(childPrefix).Append("�x\n");
				m_nodeList[position].Dump(buffer, childPrefix);
			}
		}

    #endregion 

    /////////////////////////////////////////////////////////////////////////////////
    #region Document operations

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Replaces the document content with HTML read from a file.
        /// The file is opened read-only (shared for reading) and handed to <c>Load(Stream)</c>.
        /// </summary>
        /// <param name="filePath">Path of the file to read; expected to be non-null (checked by a debug assertion only).</param>
        public virtual void Load(string filePath)
        {
            System.Diagnostics.Debug.Assert(filePath != null);

            // The stream was previously never closed, which leaked the file handle.
            // Load(Stream) in this class reads the stream to the end before returning,
            // so the file can be released as soon as it comes back (also when it throws).
            using(FileStream fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                Load(fileStream);
            }
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Replaces the document content with HTML read from a stream.
        /// The encoding is taken from a charset declaration found in the stream text when
        /// there is one; otherwise the encoding the reader detected while reading is used.
        /// The chosen encoding becomes the document charset.
        /// </summary>
        /// <param name="inStream">Stream to read to its end; expected to be non-null (checked by a debug assertion only). It is not closed by this method.</param>
        public virtual void Load(Stream inStream)
        {
            System.Diagnostics.Debug.Assert(inStream != null);

            // Keep the result in a local so that m_charset is never left null if detection throws.
            Encoding declared = DetectCharset(inStream);

            // Not disposed on purpose: disposing the reader would also close the caller's stream.
            StreamReader streamReader = (declared != null)
                ? new StreamReader(inStream, declared)
                : new StreamReader(inStream, true);

            // Read first: StreamReader only performs its byte-order-mark detection on the
            // first read, so CurrentEncoding is not meaningful before this call.
            string html = streamReader.ReadToEnd();

            m_charset = (declared != null) ? declared : streamReader.CurrentEncoding;

            m_nodeList.Clear();
            m_parser.Parse(html, m_nodeList);
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Replaces the document content with HTML read from a text reader.
        /// The document charset is re-detected from the parsed nodes by <c>LoadHtml</c>.
        /// </summary>
        /// <param name="reader">Reader to consume to its end; expected to be non-null (checked by a debug assertion only).</param>
        public virtual void Load(TextReader reader)
        {
            System.Diagnostics.Debug.Assert(reader != null);

            // Read before touching any state, so a failing reader leaves the document unchanged.
            string html = reader.ReadToEnd();

            Encoding previous = m_charset;

            // A null charset is what makes LoadHtml detect the charset from the parsed nodes.
            m_charset = null;
            try
            {
                LoadHtml(html);
            }
            catch
            {
                // Do not leave the document with a null charset: the Charset setter and
                // Save(Stream) both use m_charset without a null check.
                m_charset = previous;
                throw;
            }
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Replaces the document content by parsing the given HTML text.
        /// If no charset is currently set, it is taken from the parsed nodes, and
        /// <see cref="Encoding.Unicode"/> is used when none is found there.
        /// </summary>
        /// <param name="html">HTML text to parse; expected to be non-null (checked by a debug assertion only).</param>
        public virtual void LoadHtml(string html)
        {
            System.Diagnostics.Debug.Assert(html != null);

            m_nodeList.Clear();
            m_parser.Parse(html, m_nodeList);

            // An existing charset is kept as it is.
            if(m_charset != null)
                return;

            Encoding declared = DetectCharset(m_nodeList);
            m_charset = (declared != null) ? declared : Encoding.Unicode;
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Writes the document to a file, creating it or overwriting an existing one.
        /// The file is opened for exclusive write access and handed to <c>Save(Stream)</c>.
        /// </summary>
        /// <param name="filePath">Path of the file to write; expected to be non-null (checked by a debug assertion only).</param>
        public virtual void Save(string filePath)
        {
            System.Diagnostics.Debug.Assert(filePath != null);

            // using guarantees the file is flushed and closed even when Save(Stream) throws;
            // previously an exception left the file open and locked (FileShare.None).
            using(FileStream fileStream = new FileStream(filePath, FileMode.Create, FileAccess.Write, FileShare.None))
            {
                Save(fileStream);
            }
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Writes the document to a stream, encoded with the current document charset.
        /// The stream is flushed through the writer but not closed.
        /// </summary>
        /// <param name="outStream">Stream that receives the encoded HTML.</param>
        public virtual void Save(Stream outStream)
        {
            TextWriter encodedWriter = new StreamWriter(outStream, m_charset);

            // Delegate to the TextWriter overload, then push buffered text to the stream.
            Save(encodedWriter);
            encodedWriter.Flush();
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Writes the value of the <c>HTML</c> property to the given writer.
        /// </summary>
        /// <param name="writer">Writer that receives the HTML text.</param>
        public virtual void Save(TextWriter writer)
        {
            string markup = this.HTML;
            writer.Write(markup);
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Passes the visitor to every top-level node, in list order, by calling the node's Accept method.
        /// </summary>
        /// <param name="visitor">Visitor handed to each node.</param>
        public void Visit(DOL.DBase.DIBaseVisitor visitor)
        {
            int total = m_nodeList.Count;
            int position = 0;
            while(position < total)
            {
                m_nodeList[position].Accept(visitor);
                ++position;
            }
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Gets or sets the encoding associated with the document.
        /// Setting a different encoding also rewrites the charset token inside the
        /// <c>content</c> attribute of the meta elements found under html/head, so the
        /// markup declares the new encoding's <see cref="Encoding.WebName"/>.
        /// </summary>
        public virtual Encoding Charset
        {
            get
            {
                return m_charset;
            }
            set
            {
                System.Diagnostics.Debug.Assert(value != null);

                // Nothing to do when the encoding is unchanged. The null check avoids
                // dereferencing a charset that is currently not set.
                if(m_charset != null && m_charset.Equals(value))
                    return;

                m_charset = value;

                const string charsetKey = "charset";

                // Collect the candidate meta elements under html/head
                // (presumably those that carry a content attribute - verify against FindByNameAttribute).
                DHtmlNodeCollection metaNodes = new DHtmlNodeCollection();
                DHtmlElement node = m_nodeList["html"] as DHtmlElement;
                if(node != null) node = node.Nodes["head"] as DHtmlElement;
                if(node != null) node.Nodes.FindByNameAttribute(metaNodes, "meta", "content", false);

                for(int nodeIndex = 0, nodeCount = metaNodes.Count; nodeIndex < nodeCount; ++nodeIndex)
                {
                    DHtmlElement metaElement = metaNodes[nodeIndex] as DHtmlElement;
                    if(metaElement == null)
                        continue;

                    DHtmlAttributeCollection attributes = metaElement.Attributes.FindByName("content");
                    for(int attributeIndex = 0, attributeCount = attributes.Count; attributeIndex < attributeCount; ++attributeIndex)
                    {
                        DHtmlAttribute attribute = attributes[attributeIndex];
                        string attributeValue = attribute.Value;

                        // Ordinal, case-insensitive search: "Charset=" and "CHARSET=" must match too,
                        // and the result must not depend on the current culture.
                        int index = attributeValue.IndexOf(charsetKey, StringComparison.OrdinalIgnoreCase);
                        if(index == -1)
                            continue;

                        // Start of the encoding name: skip the blanks and '=' after the keyword.
                        int startIndex = index + charsetKey.Length;
                        while(startIndex < attributeValue.Length && DHtmlTextProcessor.EqualesOfAnyChar(attributeValue[startIndex], " =")) ++startIndex;

                        // End of the encoding name: the next blank or ';' (a trailing ';' is not part of the name).
                        int endIndex = startIndex + 1;
                        while(endIndex < attributeValue.Length && !DHtmlTextProcessor.EqualesOfAnyChar(attributeValue[endIndex], " ;")) ++endIndex;

                        // Replace the old name with the new encoding's web name.
                        if(startIndex < attributeValue.Length)
                        {
                            attributeValue = attributeValue.Remove(startIndex, endIndex - startIndex);
                            attributeValue = attributeValue.Insert(startIndex, value.WebName);
                            attribute.Value = attributeValue;
                        }
                    }
                }
            }
        }

		/////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// Gets the HTML text of the document, built by letting every top-level node
		/// write itself, in list order, into one string.
		/// </summary>
		public virtual string HTML
		{
			get
			{
				StringBuilder markup = new StringBuilder();

				int total = m_nodeList.Count;
				int position = 0;
				while(position < total)
				{
					// NOTE(review): the second argument is passed as 0 for top-level nodes; its meaning is defined by TransformHTML.
					m_nodeList[position].TransformHTML(markup, 0);
					++position;
				}

				return markup.ToString();
			}
		}

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Gets the collection holding the document's top-level nodes.
        /// The internal collection itself is returned, not a copy.
        /// </summary>
        public DHtmlNodeCollection Nodes
        {
            get { return this.m_nodeList; }
        }

    #endregion

	/////////////////////////////////////////////////////////////////////////////////
	#region Private helpers

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Looks for a charset declaration in the <c>content</c> attribute of the meta
        /// elements under html/head and returns the first one that names a usable encoding.
        /// </summary>
        /// <param name="nodes">Top-level nodes of a parsed document.</param>
        /// <returns>The declared encoding, or null when none is declared or none can be resolved.</returns>
        private Encoding DetectCharset(DHtmlNodeCollection nodes)
        {
            const string charsetKey = "charset";

            // Collect the candidate meta elements under html/head
            // (presumably those that carry a content attribute - verify against FindByNameAttribute).
            DHtmlNodeCollection metaNodes = new DHtmlNodeCollection();
            DHtmlElement node = nodes["html"] as DHtmlElement;
            if(node != null) node = node.Nodes["head"] as DHtmlElement;
            if(node != null) node.Nodes.FindByNameAttribute(metaNodes, "meta", "content", false);

            for(int nodeIndex = 0, count = metaNodes.Count; nodeIndex < count; ++nodeIndex)
            {
                DHtmlElement metaElement = metaNodes[nodeIndex] as DHtmlElement;
                if(metaElement == null)
                    continue;

                DHtmlAttributeCollection attributes = metaElement.Attributes.FindByName("content");
                for(int attributeIndex = 0, attributeCount = attributes.Count; attributeIndex < attributeCount; ++attributeIndex)
                {
                    string value = attributes[attributeIndex].Value;

                    // Ordinal, case-insensitive search: "Charset=" and "CHARSET=" must match too,
                    // and the result must not depend on the current culture.
                    int index = value.IndexOf(charsetKey, StringComparison.OrdinalIgnoreCase);
                    if(index == -1)
                        continue;

                    // Start of the encoding name: skip the blanks and '=' after the keyword.
                    int startIndex = index + charsetKey.Length;
                    while(startIndex < value.Length && DHtmlTextProcessor.EqualesOfAnyChar(value[startIndex], " =")) ++startIndex;

                    // End of the encoding name: the next blank or ';' (a trailing ';' is not part of the name).
                    int endIndex = startIndex + 1;
                    while(endIndex < value.Length && !DHtmlTextProcessor.EqualesOfAnyChar(value[endIndex], " ;")) ++endIndex;

                    if(startIndex >= value.Length)
                        continue;

                    string charset = value.Substring(startIndex, endIndex - startIndex);
                    try
                    {
                        // Return at once so the first valid declaration wins. The former
                        // "break" only left the attribute loop, letting a later meta
                        // element overwrite the result.
                        return Encoding.GetEncoding(charset);
                    }
                    catch(ArgumentException)
                    {
                        // Unknown encoding name: keep looking at the remaining declarations.
                    }
                    catch(NotSupportedException)
                    {
                        // Encoding not available on this platform: keep looking.
                    }
                }
            }

            return null;
        }

        /////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Scans the stream line by line for the first "charset" declaration that names a
        /// usable encoding, then puts the stream back at the position it had on entry.
        /// </summary>
        /// <param name="inStream">Stream to inspect. It is not closed.</param>
        /// <returns>
        /// The declared encoding, or null when none is found, none can be resolved, or the
        /// stream cannot seek (it could not be rewound after the scan).
        /// </returns>
        private Encoding DetectCharset(Stream inStream)
        {
            // The scan consumes the stream, so it is only safe when the position can be
            // restored afterwards. Reading Position on a non-seekable stream would throw.
            if(!inStream.CanSeek)
                return null;

            const string charsetKey = "charset";
            const string leadingSkip = " \r\n\t=\'\"<>";
            const string terminators = " \r\n\t=\'\"<>;";   // ';' added: a trailing ';' is not part of the name

            long position = inStream.Position;
            try
            {
                // Not disposed on purpose: disposing the reader would also close the caller's stream.
                StreamReader reader = new StreamReader(inStream);

                string line;
                while((line = reader.ReadLine()) != null)
                {
                    // Ordinal, case-insensitive search: "Charset=" and "CHARSET=" must match too,
                    // and the result must not depend on the current culture.
                    int index = line.IndexOf(charsetKey, StringComparison.OrdinalIgnoreCase);
                    if(index == -1)
                        continue;

                    // Start of the encoding name: skip separators and quotes after the keyword.
                    int startIndex = index + charsetKey.Length;
                    while(startIndex < line.Length && DHtmlTextProcessor.EqualesOfAnyChar(line[startIndex], leadingSkip)) ++startIndex;

                    // End of the encoding name: the next separator, quote, bracket or ';'.
                    int endIndex = startIndex + 1;
                    while(endIndex < line.Length && !DHtmlTextProcessor.EqualesOfAnyChar(line[endIndex], terminators)) ++endIndex;

                    if(startIndex >= line.Length)
                        continue;

                    string charset = line.Substring(startIndex, endIndex - startIndex);
                    try
                    {
                        return Encoding.GetEncoding(charset);
                    }
                    catch(ArgumentException)
                    {
                        // Unknown encoding name: keep scanning the following lines.
                    }
                    catch(NotSupportedException)
                    {
                        // Encoding not available on this platform: keep scanning.
                    }
                }

                return null;
            }
            finally
            {
                // Rewind even when reading fails, so the caller's stream is never left mid-way.
                inStream.Position = position;
            }
        }

    #endregion

	/////////////////////////////////////////////////////////////////////////////////
	#region Private fields

        /// <summary>
        /// Parser used by the load methods to turn HTML text into nodes.
        /// Every constructor assigns it.
        /// </summary>
        private DIHtmlParser m_parser = null;
        /// <summary>
        /// Encoding associated with the document. Starts as Encoding.Default, is replaced
        /// when content is loaded or the Charset property is set, and is used to encode
        /// the output of Save(Stream).
        /// </summary>
        private Encoding m_charset = Encoding.Default;
		/// <summary>
		/// Top-level nodes of the document; filled by the parser and exposed through the Nodes property.
		/// </summary>
		private DHtmlNodeCollection m_nodeList = new DHtmlNodeCollection(null);		

    #endregion 

	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here

About the Author

James S.F. Hsieh
Web Developer
United States United States
James S.F. Hsieh (Nomad Libra) works as an engineer for the "Corel Intervideo" company located in Taiwan.
He received his master's degree from the Graduate Institute of Network Learning Technology, National Central University, Taiwan, in 2006.
His research interests are semantic Web services, intelligent software agents, machine learning, algorithms, software
engineering, and multimedia programming.

| Advertise | Privacy | Mobile
Web01 | 2.8.140721.1 | Last Updated 20 Jul 2007
Article Copyright 2007 by James S.F. Hsieh
Everything else Copyright © CodeProject, 1999-2014
Terms of Service
Layout: fixed | fluid