/**
* MIL HTML Parser
* ===============
*
* Version: 1.1
* Date: 17th March 2004
* Author: Andy Powney (andy@powney.demon.co.uk)
*
* DESCRIPTION
* ===========
*
* I have long been after code to represent an HTML page as a set of objects, but without
* having to insist that the page developer implement SGML- or XML-style formatting. This was
* rather a tricky process, but one which I am sure others will find useful.
*
* This initial version has a couple of known issues, but is generally stable. Currently, it
* only handles a string of HTML. It is trivial for you to make a minor change to read from
* a stream etc., so I have not bothered writing that five line function.
*
* Furthermore, this initial version does not report errors. It simply builds a tree as best
* it can. In this respect, it is not a true parser, but is just as useful for most applications.
* It does not enforce conformance of any kind (to DTDs etc.), so should be able to handle just
* about anything you throw at it.
*
* The parser is case insensitive, both in element/tag names and attribute names.
*
* HOW IT WORKS
* ============
*
* The Parse() method on this object will build a tree of nodes that represent the HTML you
* pass in as a parameter. It will go though each piece of HTML, getting rid of comments, and
* building the tree. The tree itself will consist of a number of HtmlNode objects. A node may
* be either text (HtmlText) or an element (HtmlElement).
*
* The tree that gets produced is relatively simple once you get your head round it. I think it
* best to demonstrate it using some examples:
*
* For example,
*
* <BODY>
* <IMG SRC="world.jpg">
* </BODY>
*
* Results in:
*
* BODY
* IMG SRC="world.jpg"
*
* For example,
*
* This is a <B>bold</B> text example.
*
* Results in,
*
* This is a
* B
* bold
* text example
*
* For example,
*
* <TABLE>
* <TR>
* <TD>
* Row One, Column One
* </TD>
* <TD>
* Row One, Column Two
* </TD>
* </TR>
* <TR>
* <TD ColSpan="2">
* Bottom Row, Both Columns
* </TD>
* </TR>
* </TABLE>
*
* Results in:
*
* TABLE
* TR
* TD
* Row One, Column One
* TD
* Row One, Column Two
* TR
* TD ColSpan="2"
* Bottom Row, Both Columns
*
* KNOWN ISSUES
* ============
*
* 1. Scripts that contain the </script> tag in quotation marks get extracted improperly.
* 2. Backslash escaping of quotation marks inside an attribute value doesn't work.
* 3. Full SGML comments are not supported - only "<!-- comment -->" format is
*
* FUTURE DEVELOPMENT
* ==================
*
* I am thinking about creating a wrapper library for the latest version of HTML. These
* wrappers would encapsulate the HtmlElement objects and expose the appropriate attributes
* as parameters. For example, a HtmlElementTD object would expose a Width property. etc. You'll
* probably see that it's quite a bit of work, so I haven't scheduled any of that yet.
*
* I'm going to implement methods particulary for SGML compliance.
*
* I'm going to implement a flag indicating that the parser should not return whitespace-only
* text items (except inside <pre></pre> tags). I thought this current behaviour would be
* more useful, but I am gradually leaning towards this new change.
*
* The known issues (listed above) will be resolved in v1.2.
*
* I am not working to a schedule, so I tend to implement things when I get time.
*
* REPORTING BUGS AND CONTACTING ME
* ================================
*
* This is only v1.1, so there are bound to be bugs that I haven't found. If you want to
* contact me, please do so via email: andy@powney.demon.co.uk
*
* Please bear in mind that I go though periods of utter laziness...
*
* If you find any bugs or want to furnish me with some wishlists, get in touch. I'd also
* like to hear how you use this code.
*
* LEGAL
* =====
*
* Use this code at your own risk. It's free to use and modify, but don't release any variants
* of this code without my permission. (Usual GPL things).
*/
using System;
using System.IO;
using System.Text;
using System.Collections;
using System.Collections.Specialized;
namespace MIL.Html
{
/// <summary>
/// This is the main HTML parser class. I recommend you don't play around too much in here
/// as it's a little fiddly.
///
/// Bascially, this class will build a tree containing HtmlNode elements.
/// </summary>
public class HtmlParser
{
private static char[] WHITESPACE_CHARS = " \t\r\n".ToCharArray();
/// <summary>
/// Internal FSM to represent the state of the parser
/// </summary>
private enum ParseStatus
{
ReadText = 0,
ReadEndTag = 1,
ReadStartTag = 2,
ReadAttributeName = 3,
ReadAttributeValue = 4
};
/// <summary>
/// This constructs a new parser. Even though this object is currently stateless,
/// in the future, parameters coping for tollerance and SGML (etc.) will be passed.
/// </summary>
public HtmlParser()
{
}
#region The main parser
/// <summary>
/// This will parse a string containing HTML and will produce a domain tree.
/// </summary>
/// <param name="html">The HTML to be parsed</param>
/// <returns>A tree representing the elements</returns>
public HtmlNodeCollection Parse(string html)
{
HtmlNodeCollection nodes = new HtmlNodeCollection();
html = PreprocessScript( html );
html = RemoveComments( html );
StringCollection tokens = GetTokens( html );
int index = 0;
while( index < tokens.Count )
{
if( "<".Equals( tokens[index] ) )
{
// Read open tag
index++;
if( index >= tokens.Count ) break;
string tag_name = tokens[index];
index++;
HtmlElement element = new HtmlElement( tag_name );
// read the attributes and values
while( index < tokens.Count && ! ">".Equals( tokens[index] ) )
{
string attribute_name = tokens[ index ];
index++;
if( index < tokens.Count && "=".Equals( tokens[ index ] ) )
{
index++;
string attribute_value;
if( index < tokens.Count )
{
attribute_value = tokens[ index ];
}
else
{
attribute_value = null;
}
index++;
HtmlAttribute attribute = new HtmlAttribute( attribute_name , attribute_value );
element.Attributes.Add( attribute );
}
else if( index < tokens.Count )
{
// Null-value attribute
HtmlAttribute attribute = new HtmlAttribute( attribute_name , null );
element.Attributes.Add( attribute );
}
}
if( index < tokens.Count && ">".Equals( tokens[ index ] ) )
{
index++;
}
nodes.Add( element );
}
else if( ">".Equals( tokens[index] ) )
{
index++;
}
else if( "</".Equals( tokens[index] ) )
{
// Read close tag
index++;
if( index >= tokens.Count ) break;
string tag_name = tokens[index];
index++;
int open_index = FindTagOpenNodeIndex( nodes , tag_name );
if( open_index != -1 )
{
MoveNodesDown( ref nodes , open_index + 1 , nodes[open_index] );
}
else
{
// Er, there is a close tag without an opening tag!!
}
// Skip to the end of this tag
while( index < tokens.Count && ! ">".Equals( tokens[ index ] ) )
{
index++;
}
if( index < tokens.Count && ">".Equals( tokens[ index ] ) )
{
index++;
}
}
else
{
// Read text
HtmlText node = new HtmlText( DecodeScript( tokens[ index ] ) );
index++;
nodes.Add( node );
}
}
return nodes;
}
/// <summary>
/// This will move all the nodes from the specified index to the new parent.
/// </summary>
/// <param name="nodes">The collection of nodes</param>
/// <param name="node_index">The index of the first node (in the above collection) to move</param>
/// <param name="new_parent">The node which will become the parent of the moved nodes</param>
private void MoveNodesDown(ref HtmlNodeCollection nodes,int node_index,HtmlNode new_parent)
{
for( int i = node_index ; i < nodes.Count ; i++ )
{
new_parent.Nodes.Add( nodes[i] );
}
int c = nodes.Count;
for( int i = node_index ; i < c ; i++ )
{
nodes.RemoveAt( node_index );
}
}
/// <summary>
/// This will find the corresponding opening tag for the named one. This is identified as
/// the most recently read node with the same name, but with no child nodes.
/// </summary>
/// <param name="nodes">The collection of nodes</param>
/// <param name="name">The name of the tag</param>
/// <returns>The index of the opening tag, or -1 if it was not found</returns>
private int FindTagOpenNodeIndex(HtmlNodeCollection nodes,string name)
{
for( int index = nodes.Count - 1 ; index >= 0 ; index-- )
{
if( nodes[index] is HtmlElement )
{
if( ( (HtmlElement) nodes[index] ).Name.ToLower().Equals( name.ToLower() ) && ( (HtmlElement) nodes[index] ).Nodes.Count == 0 )
{
return index;
}
}
}
return -1;
}
#endregion
#region HTML clean-up functions
/// <summary>
/// This will remove all HTML comments from the input string. This will
/// not remove comment markers from inside tag attribute values.
/// </summary>
/// <param name="input">Input HTML containing comments</param>
/// <returns>HTML containing no comments</returns>
private string RemoveComments(string input)
{
StringBuilder output = new StringBuilder();
int i = 0;
bool inTag = false;
while( i < input.Length )
{
if( i + 4 < input.Length && input.Substring( i , 4 ).Equals( "<!--" ) )
{
i += 4;
i = input.IndexOf( "-->" , i );
if( i == -1 )
{
break;
}
i += 3;
}
else if( input.Substring( i , 1 ).Equals( "<" ) )
{
inTag = true;
output.Append( "<" );
i++;
}
else if( input.Substring( i , 1 ).Equals( ">" ) )
{
inTag = false;
output.Append( ">" );
i++;
}
else if( input.Substring( i , 1 ).Equals( "\"" ) && inTag )
{
int string_start = i;
i++;
i = input.IndexOf( "\"" , i );
if( i == -1 )
{
break;
}
i++;
output.Append( input.Substring( string_start , i - string_start ) );
}
else if( input.Substring( i , 1 ).Equals( "\'" ) && inTag )
{
int string_start = i;
i++;
i = input.IndexOf( "\'" , i );
if( i == -1 )
{
break;
}
i++;
output.Append( input.Substring( string_start , i - string_start ) );
}
else
{
output.Append( input.Substring( i , 1 ) );
i++;
}
}
return output.ToString();
}
/// <summary>
/// This will encode the scripts within the page so they get passed through the
/// parser properly. This is due to some people using comments protect the script
/// and others who don't. It also takes care of issues where the script itself has
/// HTML comments in (in strings, for example).
/// </summary>
/// <param name="input">The HTML to examine</param>
/// <returns>The HTML with the scripts marked up differently</returns>
private string PreprocessScript(string input)
{
StringBuilder output = new StringBuilder();
int index = 0;
while( index < input.Length )
{
if( index + 7 < input.Length && input.Substring( index , 7 ).ToLower().Equals( "<script" ) )
{
// Look for the end of the tag (we pass the attributes through as normal)
do
{
if( index >= input.Length )
{
break;
}
else if( input.Substring( index , 1 ).Equals( ">" ) )
{
output.Append( ">" );
index++;
break;
}
else if( input.Substring( index , 1 ).Equals( "\"" ) )
{
output.Append( "\"" );
index++;
while( index < input.Length && ! input.Substring( index , 1 ).Equals( "\"" ) )
{
output.Append( input.Substring( index , 1 ) );
index++;
}
if( index < input.Length )
{
index++;
output.Append( "\"" );
}
}
else if( input.Substring( index , 1 ).Equals( "\'" ) )
{
output.Append( "\'" );
index++;
while( index < input.Length && ! input.Substring( index , 1 ).Equals( "\'" ) )
{
output.Append( input.Substring( index , 1 ) );
index++;
}
if( index < input.Length )
{
index++;
output.Append( "\'" );
}
}
else
{
output.Append( input.Substring( index , 1 ) );
index++;
}
} while( true );
if( index >= input.Length ) break;
// Phew! Ok now we are reading the script body
StringBuilder script_body = new StringBuilder();
while( index + 9 < input.Length && ! input.Substring( index , 9 ).ToLower().Equals( "</script>" ) )
{
script_body.Append( input.Substring( index , 1 ) );
index++;
}
// Done - now encode the script
output.Append( EncodeScript( script_body.ToString() ) );
output.Append( "</script>" );
if( index + 9 < input.Length )
{
index += 9;
}
}
else
{
output.Append( input.Substring( index , 1 ) );
index++;
}
}
return output.ToString();
}
private static string EncodeScript(string script)
{
string output = script.Replace( "<" , "[MIL-SCRIPT-LT]" );
output = output.Replace( ">" , "[MIL-SCRIPT-GT]" );
return output;
}
private static string DecodeScript(string script)
{
string output = script.Replace( "[MIL-SCRIPT-LT]" , "<" );
output = output.Replace( "[MIL-SCRIPT-GT]" , ">" );
return output;
}
#endregion
#region HTML tokeniser
/// <summary>
/// This will tokenise the HTML input string.
/// </summary>
/// <param name="input"></param>
/// <returns></returns>
private StringCollection GetTokens(string input)
{
StringCollection tokens = new StringCollection();
int i = 0;
ParseStatus status = ParseStatus.ReadText;
while( i < input.Length )
{
if( status == ParseStatus.ReadText )
{
if( i+2 < input.Length && input.Substring( i , 2 ).Equals( "</" ) )
{
i += 2;
tokens.Add( "</" );
status = ParseStatus.ReadEndTag;
}
else if( input.Substring( i , 1 ).Equals( "<" ) )
{
i++;
tokens.Add( "<" );
status = ParseStatus.ReadStartTag;
}
else
{
int next_index = input.IndexOf( "<" , i );
if( next_index == -1 )
{
tokens.Add( input.Substring( i ) );
break;
}
else
{
tokens.Add( input.Substring( i , next_index - i ) );
i = next_index;
}
}
}
else if( status == ParseStatus.ReadStartTag )
{
// Skip leading whitespace in tag
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( WHITESPACE_CHARS ) != -1 )
{
i++;
}
// Read tag name
int tag_name_start = i;
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( " \r\n\t>".ToCharArray() ) == -1 )
{
i++;
}
tokens.Add( input.Substring( tag_name_start , i - tag_name_start ) );
// Skip trailing whitespace in tag
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( WHITESPACE_CHARS ) != -1 )
{
i++;
}
if( i<input.Length && input.Substring( i , 1 ).Equals( ">" ) )
{
tokens.Add( ">" );
status = ParseStatus.ReadText;
i++;
}
else
{
status = ParseStatus.ReadAttributeName;
}
}
else if( status == ParseStatus.ReadEndTag )
{
// Skip leading whitespace in tag
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( WHITESPACE_CHARS ) != -1 )
{
i++;
}
// Read tag name
int tag_name_start = i;
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( " \r\n\t>".ToCharArray() ) == -1 )
{
i++;
}
tokens.Add( input.Substring( tag_name_start , i - tag_name_start ) );
// Skip trailing whitespace in tag
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( WHITESPACE_CHARS ) != -1 )
{
i++;
}
if( i<input.Length && input.Substring( i , 1 ).Equals( ">" ) )
{
tokens.Add( ">" );
status = ParseStatus.ReadText;
i++;
}
}
else if( status == ParseStatus.ReadAttributeName )
{
// Read attribute name
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( WHITESPACE_CHARS ) != -1 )
{
i++;
}
int attribute_name_start = i;
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( " \r\n\t>=".ToCharArray() ) == -1 )
{
i++;
}
tokens.Add( input.Substring( attribute_name_start , i - attribute_name_start ) );
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( WHITESPACE_CHARS ) != -1 )
{
i++;
}
if( i<input.Length && input.Substring( i , 1 ).Equals( ">" ) )
{
tokens.Add( ">" );
status = ParseStatus.ReadText;
i++;
}
else if( i<input.Length && input.Substring( i , 1 ).Equals( "=" ) )
{
tokens.Add( "=" );
i++;
status = ParseStatus.ReadAttributeValue;
}
}
else if( status == ParseStatus.ReadAttributeValue )
{
// Read the attribute value
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( WHITESPACE_CHARS ) != -1 )
{
i++;
}
if( i<input.Length && input.Substring( i , 1 ).Equals( "\"" ) )
{
int value_start = i;
i++;
while( i<input.Length && ! input.Substring( i , 1 ).Equals( "\"" ) )
{
i++;
}
if( i<input.Length && input.Substring( i , 1 ).Equals( "\"" ) )
{
i++;
}
tokens.Add( input.Substring( value_start + 1 , i - value_start - 2) );
status = ParseStatus.ReadAttributeName;
}
else if( i<input.Length && input.Substring( i , 1 ).Equals( "\'" ) )
{
int value_start = i;
i++;
while( i<input.Length && ! input.Substring( i , 1 ).Equals( "\'" ) )
{
i++;
}
if( i<input.Length && input.Substring( i , 1 ).Equals( "\'" ) )
{
i++;
}
tokens.Add( input.Substring( value_start + 1, i - value_start - 2 ) );
status = ParseStatus.ReadAttributeName;
}
else
{
int value_start = i;
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( " \r\n\t>".ToCharArray() ) == -1 )
{
i++;
}
tokens.Add( input.Substring( value_start , i - value_start ) );
while( i<input.Length && input.Substring( i , 1 ).IndexOfAny( WHITESPACE_CHARS ) != -1 )
{
i++;
}
status = ParseStatus.ReadAttributeName;
}
if( i<input.Length && input.Substring( i , 1 ).Equals( ">" ) )
{
tokens.Add( ">" );
i++;
status = ParseStatus.ReadText;
}
}
}
return tokens;
}
#endregion
}
}