Click here to Skip to main content
15,886,362 members
Articles / Web Development / ASP.NET

HTML fragments parsing and creation

Rate me:
Please Sign up or sign in to vote.
3.08/5 (9 votes)
15 Mar 2003 · 3 min read · 84K   964   43
Classes to parse HTML parts into an object tree and back
using System;
using System.Diagnostics;
using System.Collections;
using System.Text;
using System.Text.RegularExpressions;
using System.Diagnostics;

namespace HtmlFragments
{
  /// <summary>
  /// A collection of fragments organized in a list.
  /// The collection can be filled by parsing markup text (see the constructor
  /// taking a string) or by hand through the list members.
  /// </summary>
  public class Fragments: Fragment, IList
  {
    /// <summary>
    /// Tokenizer used by the parsing constructor. It is built once and shared,
    /// because the pattern does not depend on the parsed input.
    /// </summary>
    private static readonly Regex tokenizer = new Regex( BuildPattern(), RegexOptions.ExplicitCapture );

    /// <summary>
    /// Backing store holding the child fragments in document order.
    /// </summary>
    private ArrayList nodes = new ArrayList();

    /// <summary>
    /// Construct an empty collection.
    /// </summary>
    public Fragments()
    {
    }

    /// <summary>
    /// Constructs a collection and parses the given text into fragments.
    /// </summary>
    /// <remarks>
    /// Opening tags start a new nesting level; the matching closing tag ends it.
    /// A closing tag that appears while no tag is open wraps all fragments
    /// collected so far on that level into a tag of type Close.
    /// White space inside a text run is collapsed to a single blank; white space
    /// between other tokens is consumed but not stored.
    /// </remarks>
    /// <param name="fragment">The text to be parsed.</param>
    /// <exception cref="System.ArgumentNullException">Will be thrown, if <paramref name="fragment"/> is null.</exception>
    /// <exception cref="HtmlFragments.FragmentParsingException">Will be thrown, if there is an error in the input.</exception>
    public Fragments( string fragment )
    {
      if ( fragment == null )
      {
        throw new ArgumentNullException( "fragment" );
      }

      // Tags that have been opened but not yet closed; the innermost one is on top.
      Stack openTags = new Stack();
      FragmentTag currentTag = null;
      Fragments currentNodes = this;

      // Every token has to start exactly where the previous one ended.
      int mustMatchAt = 0;
      for ( Match match = tokenizer.Match( fragment ); match.Success; match = match.NextMatch() )
      {
        if ( match.Index != mustMatchAt )
        {
          throw new FragmentParsingException( "unmatched text found.", mustMatchAt, fragment );
        }
        mustMatchAt = match.Index + match.Length;

        if ( match.Groups["Text"].Success )
        {
          currentNodes.Add( new FragmentText( CollapseWhiteSpace( match ) ) );
        }
        else if ( match.Groups["Doctype"].Success )
        {
          string root = match.Groups["root"].Value;
          string type = match.Groups["type"].Value;
          string identifier = match.Groups["identifier"].Value;
          string uri = match.Groups["uri"].Value;
          currentNodes.Add( new FragmentDoctype( root, type, identifier, uri ) );
        }
        else if ( match.Groups["Comment"].Success )
        {
          currentNodes.Add( new FragmentComment( match.Groups["Comment"].Value ) );
        }
        else if ( match.Groups["OpenCloseTag"].Success )
        {
          FragmentTag tag = new FragmentTag( match.Groups["Tag"].Value, FragmentTagType.OpenCloseShort );
          AppendAttributes( tag, match );
          currentNodes.Add( tag );
        }
        else if ( match.Groups["OpenTag"].Success )
        {
          FragmentTag tag = new FragmentTag( match.Groups["Tag"].Value, FragmentTagType.Open );
          AppendAttributes( tag, match );
          currentNodes.Add( tag );

          // Descend: following fragments become children of this tag.
          openTags.Push( tag );
          currentTag = tag;
          currentNodes = currentTag.Nodes;
        }
        else if ( match.Groups["CloseTag"].Success )
        {
          if ( currentTag != null )
          {
            if ( currentTag.Name == match.Groups["Tag"].Value )
            {
              currentTag.Type = FragmentTagType.OpenClose;

              // currentTag is always the top of the stack, so popping removes exactly it.
              openTags.Pop();
              if ( openTags.Count > 0 )
              {
                currentTag = (FragmentTag)openTags.Peek();
                currentNodes = currentTag.Nodes;
              }
              else
              {
                currentTag = null;
                currentNodes = this;
              }
            }
            else
            {
              throw new FragmentParsingException( "unmatched closing tag '" + match.Groups["Tag"].Value + "'. Should match '" + currentTag.Name + "'.", match.Index, fragment );
            }
          }
          else
          {
            // No tag is open: everything collected so far becomes the content of the closing tag.
            FragmentTag tag = new FragmentTag( match.Groups["Tag"].Value, FragmentTagType.Close );
            foreach ( Fragment node in currentNodes )
            {
              tag.Nodes.Add( node );
            }
            currentNodes.Clear();
            currentNodes.Add( tag );
          }
        }
        // A pure white space token matches none of the branches and is dropped on purpose.
      }

      // The loop ends at the first position where no token matches; if that is
      // not the end of the input, the remainder is text the grammar does not accept.
      if ( mustMatchAt != fragment.Length )
      {
        throw new FragmentParsingException( "unmatched text found.", mustMatchAt, fragment );
      }
    }

    /// <summary>
    /// Builds the regular expression that recognizes one token of the input:
    /// opening tag, short tag, closing tag, doctype, comment, text or white space.
    /// </summary>
    /// <returns>The pattern, to be compiled with RegexOptions.ExplicitCapture.</returns>
    private static string BuildPattern()
    {
      // {0} is replaced by the name of the capturing group. Both quoting styles
      // capture into that same group, and the alternation is wrapped in a group
      // so that it cannot split the expression it is embedded in.
      string patternQuoted = "(?:[\"](?'{0}'[^\"]*)[\"]|['](?'{0}'[^']*)['])";
      string patternAttributes = @"(\s+(?'attribute'(?'name'[\w-]+)=" + String.Format( patternQuoted, "value" ) + "))*";
      string patternTag = @"(?'Tag'\w+)";
      string patternOpenTag = @"(?'OpenTag'<" + patternTag + patternAttributes + @"\s*>)";
      string patternOpenCloseTag = @"(?'OpenCloseTag'<" + patternTag + patternAttributes + @"\s*/>)";
      string patternCloseTag = @"(?'CloseTag'</" + patternTag + @"\s*>)";
      string patternWhiteSpace = @"(?'WhiteSpace'\s+)";
      string patternText = @"(?'Text'[^<>\s](" + patternWhiteSpace + @"[^<>\s]|[^<>\s])*)";
      string patternDoctype = @"(?'Doctype'<!DOCTYPE\s+(?'root'\w+)\s+(?'type'\w+)(\s+" + String.Format( patternQuoted, "identifier" ) + @")?(\s+" + String.Format( patternQuoted, "uri" ) + @")?\s*>)";
      // Lazy quantifier: a comment ends at the first "-->", not at the last one in the input.
      string patternComment = @"<!--(?'Comment'[\s\S]*?)-->";

      return patternOpenTag + "|" + patternOpenCloseTag + "|" + patternCloseTag + "|" + patternDoctype + "|" + patternComment + "|" + patternText + "|" + patternWhiteSpace;
    }

    /// <summary>
    /// Returns the text of a text token with every white space run replaced by one blank.
    /// </summary>
    /// <param name="match">A match whose "Text" group succeeded.</param>
    /// <returns>The normalized text.</returns>
    private static string CollapseWhiteSpace( Match match )
    {
      StringBuilder text = new StringBuilder( match.Groups["Text"].Value );

      // Capture positions refer to the input; offset tracks how much the
      // builder has shrunk through the replacements done so far.
      int offset = 0;
      foreach ( Capture white in match.Groups["WhiteSpace"].Captures )
      {
        int start = white.Index - match.Index + offset;
        text.Remove( start, white.Length );
        text.Insert( start, " " );
        offset += 1 - white.Length;
      }
      return text.ToString();
    }

    /// <summary>
    /// Copies the attributes captured for a tag token into the tag.
    /// </summary>
    /// <param name="tag">Tag receiving the attributes.</param>
    /// <param name="match">The match of the tag token.</param>
    private void AppendAttributes( FragmentTag tag, Match match )
    {
      Group nameGroup = match.Groups["name"];
      Group valueGroup = match.Groups["value"];

      if ( nameGroup.Success && valueGroup.Success )
      {
        CaptureCollection names = nameGroup.Captures;
        CaptureCollection values = valueGroup.Captures;

        // Each attribute contributes one name and one value capture, so the
        // i-th name belongs to the i-th value; the second bound is only a guard.
        for ( int i = 0; i < names.Count && i < values.Count; i++ )
        {
          tag.Attributes[ names[i].Value ] = values[i].Value;
        }
      }
    }

    #region Append
    /// <summary>
    /// Appends the text of all child fragments to the string.
    /// </summary>
    /// <param name="builder">To this object all text is appended.</param>
    public override void Append( StringBuilder builder )
    {
      foreach ( Fragment node in nodes )
      {
        node.Append( builder );
      }
    }
    #endregion

    #region Implementation of IList
    /// <summary>
    /// Removes the fragment at an index.
    /// </summary>
    /// <param name="index">Index of the element to remove.</param>
    public void RemoveAt( int index )
    {
      nodes.RemoveAt( index );
    }

    /// <summary>
    /// Inserts the fragment at the given index.
    /// </summary>
    /// <param name="index">Where to insert.</param>
    /// <param name="fragment">Fragment to insert.</param>
    public void Insert( int index, Fragment fragment )
    {
      nodes.Insert( index, fragment );
    }

    /// <summary>
    /// Untyped insert; the value has to be a fragment.
    /// </summary>
    void IList.Insert( int index, object value )
    {
      Insert( index, (Fragment)value );
    }

    /// <summary>
    /// Removes the given element.
    /// </summary>
    /// <param name="fragment">Fragment to remove.</param>
    public void Remove( Fragment fragment )
    {
      nodes.Remove( fragment );
    }

    /// <summary>
    /// Untyped remove; a value that is not in the collection is ignored.
    /// </summary>
    void IList.Remove( object value )
    {
      // No cast: an object of a foreign type simply is not contained.
      nodes.Remove( value );
    }

    /// <summary>
    /// Untyped lookup; returns false for anything that is not in the collection.
    /// </summary>
    bool IList.Contains( object value )
    {
      // No cast: an object of a foreign type simply is not contained.
      return nodes.Contains( value );
    }

    /// <summary>
    /// Checks, whether a fragment is contained in the collection.
    /// </summary>
    /// <param name="fragment">Fragment to look for.</param>
    /// <returns>true if the fragment is in the collection, else false.</returns>
    public bool Contains( Fragment fragment )
    {
      return nodes.Contains( fragment );
    }

    /// <summary>
    /// Clear the collection.
    /// </summary>
    public void Clear()
    {
      nodes.Clear();
    }

    /// <summary>
    /// Untyped lookup; returns -1 for anything that is not in the collection.
    /// </summary>
    int IList.IndexOf( object value )
    {
      // No cast: an object of a foreign type simply is not contained.
      return nodes.IndexOf( value );
    }

    /// <summary>
    /// Returns the index of a fragment.
    /// </summary>
    /// <param name="fragment">Fragment to look for.</param>
    /// <returns>Index of the fragment.</returns>
    public int IndexOf( Fragment fragment )
    {
      return nodes.IndexOf( fragment );
    }

    /// <summary>
    /// Untyped add; the value has to be a fragment.
    /// </summary>
    int IList.Add( object value )
    {
      return Add( (Fragment)value );
    }

    /// <summary>
    /// Add a fragment to the end of the collection.
    /// </summary>
    /// <param name="fragment">Fragment to add.</param>
    /// <returns>Index of the appended fragment.</returns>
    public int Add( Fragment fragment )
    {
      return nodes.Add( fragment );
    }

    /// <summary>
    /// Is the collection read-only.
    /// </summary>
    public bool IsReadOnly
    {
      get
      {
        return nodes.IsReadOnly;
      }
    }

    /// <summary>
    /// Untyped indexed access; only fragments can be stored.
    /// </summary>
    object IList.this[int index]
    {
      get
      {
        return nodes[index];
      }
      set
      {
        if ( !( value is Fragment ) )
        {
          throw new ArgumentException( "Does not inherit " + typeof( Fragment ).FullName + ".", "value" );
        }
        nodes[index] = value;
      }
    }

    /// <summary>
    /// Indexed access to the collection.
    /// </summary>
    public Fragment this[int index]
    {
      get
      {
        return (Fragment)nodes[index];
      }
      set
      {
        nodes[index] = value;
      }
    }

    /// <summary>
    /// Is the collection of fixed size.
    /// </summary>
    public bool IsFixedSize
    {
      get
      {
        return nodes.IsFixedSize;
      }
    }
    #endregion

    #region Implementation of ICollection
    /// <summary>
    /// Copies the collection into an array.
    /// </summary>
    /// <param name="array">The one-dimensional Array that is the destination of the elements copied from the current collection.</param>
    /// <param name="index">The index in array at which copying begins.</param>
    public void CopyTo( System.Array array, int index )
    {
      nodes.CopyTo( array, index );
    }

    /// <summary>
    /// Gets a value indicating whether access to the collection is synchronized (thread-safe).
    /// </summary>
    public bool IsSynchronized
    {
      get
      {
        return nodes.IsSynchronized;
      }
    }

    /// <summary>
    /// Gets the number of elements in the collection.
    /// </summary>
    public int Count
    {
      get
      {
        return nodes.Count;
      }
    }

    /// <summary>
    /// Gets an object that can be used to synchronize access to the collection.
    /// </summary>
    public object SyncRoot
    {
      get
      {
        return nodes.SyncRoot;
      }
    }
    #endregion

    #region Implementation of IEnumerable
    /// <summary>
    /// Returns an enumerator that can iterate through the collection.
    /// </summary>
    /// <returns>An IEnumerator for the entire collection.</returns>
    public System.Collections.IEnumerator GetEnumerator()
    {
      return nodes.GetEnumerator();
    }
    #endregion
  }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
Germany
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions