Click here to Skip to main content
15,891,184 members
Articles / Programming Languages / C#

Harvesting Web Content into MHTML Archive

Rate me:
Please Sign up or sign in to vote.
4.69/5 (14 votes)
18 Feb 2006 · 8 min read 61.9K   729   49  
Capture and archive web resources by saving them in RFC-2557 (MHTML) compliant format. This library includes a framework for augmenting the capture process and allowing programmatic control, from downloading through saving. A replacement for CDOSYS/CDONTS.
using Rilling.Web.Mhtml;
using System.Text;
using Rilling.MhtmlLib.Media.Artifacts;
using Rilling.Web.Mhtml.Configuration;
using System.Xml;
using System.Diagnostics;
using System.Reflection;
using Rilling.MhtmlLib.Media;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO;
using System;

namespace Rilling.MhtmlLib.Media.Resolvers
{
	/// <summary>
	///		This resolver is responsible for locating all image references
	///		within the HTML or CSS content.
	/// </summary>
	/// <remarks>
	///		Which references are reported is controlled by four switches
	///		(<see cref="IncludeBackgroundImages"/>,
	///		<see cref="IncludeForegroundImages"/>,
	///		<see cref="EvaluateCssFiles"/> and
	///		<see cref="EvaluateHtmlFiles"/>), all of which default to
	///		<c>true</c>.  They can be set directly or loaded from an XML
	///		configuration node through <see cref="InitializeContext"/>.
	/// </remarks>
	public class ImageLinkResolver : LinkResolverBase, IExpandedConfiguration
	{
		/// <summary>
		///		Sets or retrieves whether background images are also located.
		/// </summary>
		/// <remarks>
		///		Background images are defined as the following.
		///			1)  CSS styles (in the style tag or inline attributes)
		///				that define the background-image or
		///				list-style-image attributes.
		///			2)  The BACKGROUND attribute that can exist on tags
		///				such as BODY and TD.
		/// </remarks>
		public bool IncludeBackgroundImages
		{
			get{return m_inclBackImgs;}
			set{m_inclBackImgs = value;}
		}

		/// <summary>
		///		Sets or retrieves whether foreground images are also located.
		/// </summary>
		/// <remarks>
		///		Foreground images are defined as the following:
		///			1)  Paths provided in the SRC attribute of the IMG tag.
		///			2)  Paths provided in an INPUT type=image tag.
		/// </remarks>
		public bool IncludeForegroundImages
		{
			get{return m_inclForeImgs;}
			set{m_inclForeImgs = value;}
		}

		/// <summary>
		///		Sets or retrieves whether CSS files are processed.
		/// </summary>
		public bool EvaluateCssFiles
		{
			get{return m_srchCssFiles;}
			set{m_srchCssFiles = value;}
		}

		/// <summary>
		///		Sets or retrieves whether HTML files are processed.
		/// </summary>
		public bool EvaluateHtmlFiles
		{
			get{return m_srchHtmlFiles;}
			set{m_srchHtmlFiles = value;}
		}

		/// <summary>
		///		Parses the file contents and locates any potential external
		///		image references.
		/// </summary>
		/// <param name="contentData">
		///		The contents that will be parsed.
		/// </param>
		/// <param name="baseUrl">
		///		The <see cref="Uri"/> used to locate relative references.
		///		In order for a download to occur, an absolute reference is
		///		required.  If the reference is already absolute, this parameter
		///		is not used.
		/// </param>
		/// <returns>
		///		An array of <see cref="LinkIdentification"/> objects that
		///		describe the references such as their address and location
		///		within the text stream.  The array is empty when nothing
		///		was found or when the content type is not being evaluated.
		/// </returns>
		/// <exception cref="ArgumentNullException">
		///		<paramref name="contentData"/> or <paramref name="baseUrl"/>
		///		is <c>null</c>.
		/// </exception>
		/// <remarks>
		///		Simply finding a reference does not, by itself, indicate 
		///		whether or not the resource actually exists.  The purpose
		///		of this method is to only locate patterns within the text
		///		that could be a reference.  Once the potential links are
		///		returned, the calling engine will attempt to resolve them
		///		to physical resources.
		/// </remarks>
		/// <remarks>
		///		If parameter baseUrl ends in a forward-slash (/), then the 
		///		path is considered to be a directory, which is then merged
		///		with the located reference.  If there is no forward-slash,
		///		then the base url is considered a file, in which case the parent
		///		directory is then used when forming an absolute address with
		///		the located reference.
		/// </remarks>
		/// <remarks>
		///		There is no guarantee of order when locating references.  Meaning
		///		they may not be returned in the same order that they existed
		///		within the documents.
		/// </remarks>
		/// <remarks>
		///		The hyperlink-found event is raised once per located
		///		reference, after all searching has completed and before the
		///		array is returned.
		/// </remarks>
		public override LinkIdentification[]
			LocateReferences(TextArtifact contentData, Uri baseUrl)
		{
			if (contentData == null) throw new ArgumentNullException("contentData");
			if (baseUrl == null) throw new ArgumentNullException("baseUrl");

			ArrayList al = new ArrayList();

			string contentSource = contentData.ToString();

			// 
			// Locate all references within a CSS document.  Note that CSS images are always
			// considered part of the background, so both the CSS switch and the
			// background switch must be enabled.
			//
			if( contentData.ContentType == ContentTypes.Css && 
				EvaluateCssFiles && 
				IncludeBackgroundImages)
			{
				al.AddRange(Utilities.SearchContent(m_linkPatternStyleAtt,
					contentSource,
					baseUrl));
			}

			// 
			// Locate all references within an HTML document.
			//
			if( contentData.ContentType == ContentTypes.Html && EvaluateHtmlFiles)
			{
				//
				// Determine if background images are to be located.
				//
				if(IncludeBackgroundImages)
				{
					//
					// To simplify the pattern matching, each style section is isolated
					// so a second search can be performed in that limited scope.  There
					// can be more than one style section in a page.  Same for the 
					// style attributes.
					//
					// NOTE(review): only the matched fragment (m.Value) is handed to
					// Utilities.SearchContent, so any position it records is presumably
					// relative to that fragment rather than to the whole document --
					// confirm that consumers of the result account for this.
					//
					foreach(Match m in m_styleTagContainment.Matches(contentSource))
					{
						al.AddRange(Utilities.SearchContent(m_linkPatternStyleAtt,
												  m.Value,
												  baseUrl));
					}

					foreach(Match m in m_styleAttContainment.Matches(contentSource))
					{
						al.AddRange(Utilities.SearchContent(m_linkPatternStyleAtt,
												  m.Value,
												  baseUrl));
					}

					// The BACKGROUND attribute is searched across the whole document.
					al.AddRange(Utilities.SearchContent(m_linkPatternBkTag, 
						contentSource,
						baseUrl));
				}

				//
				// Determine if foreground images are to be located.
				//
				if(IncludeForegroundImages)
				{
					al.AddRange(Utilities.SearchContent(m_linkPatternImgTag, 
						contentSource,
						baseUrl));
					al.AddRange(Utilities.SearchContent(m_linkPatternImgBntTag,
						contentSource,
						baseUrl));
				}
			}

			//
			// Convert to a strongly-typed array.
			//
			LinkIdentification[] linkInfos = new LinkIdentification[al.Count];
			al.CopyTo(linkInfos);

			//
			// Send event for each link found.
			//
			foreach (LinkIdentification link in linkInfos)
			{
				OnHyperlinkFound(new HyperlinkFoundEventArgs(link.Url));
			}

			return linkInfos;
		}

		/// <summary>
		///		Configures this resolver from the attributes of an XML
		///		configuration node.
		/// </summary>
		/// <param name="node">
		///		The node whose attributes carry the settings.  The recognized
		///		attributes are <c>evalCss</c>, <c>evalHtml</c>,
		///		<c>includeBackImg</c> and <c>includeForeImg</c>; each is
		///		optional and, when absent, the corresponding property keeps
		///		its current value.
		/// </param>
		/// <exception cref="ArgumentNullException">
		///		<paramref name="node"/> is <c>null</c>.
		/// </exception>
		/// <exception cref="FormatException">
		///		An attribute is present but its value cannot be parsed by
		///		<see cref="bool.Parse(string)"/>.
		/// </exception>
		/// <remarks>
		///		A node that cannot carry attributes (anything other than an
		///		element, for which <see cref="XmlNode.Attributes"/> is
		///		<c>null</c>) leaves all settings unchanged.
		/// </remarks>
		public void InitializeContext(XmlNode node)
		{
			if (node == null) throw new ArgumentNullException("node");

			//
			// XmlNode.Attributes is null for non-element nodes (text, comment,
			// document, ...).  There is nothing to read in that case, so the
			// current configuration is kept instead of failing with a
			// NullReferenceException.
			//
			XmlAttributeCollection attributes = node.Attributes;
			if (attributes == null) return;

			//
			// Extract the known parameters and configure this object.
			//
			XmlAttribute evalCssAtt = attributes["evalCss"];
			XmlAttribute evalHtmlAtt = attributes["evalHtml"];
			XmlAttribute includeBackImgAtt = attributes["includeBackImg"];
			XmlAttribute includeForeImgAtt = attributes["includeForeImg"];

			if (evalCssAtt != null) 
				this.EvaluateCssFiles = bool.Parse(evalCssAtt.Value);
			if (evalHtmlAtt != null) 
				this.EvaluateHtmlFiles = bool.Parse(evalHtmlAtt.Value);
			if (includeBackImgAtt != null) 
				this.IncludeBackgroundImages = bool.Parse(includeBackImgAtt.Value);
			if (includeForeImgAtt != null) 
				this.IncludeForegroundImages = bool.Parse(includeForeImgAtt.Value);
		}

		// Backing fields for the four configuration switches; everything is
		// enabled unless configured otherwise.
		private bool m_inclBackImgs = true;
		private bool m_inclForeImgs = true;
		private bool m_srchCssFiles = true;
		private bool m_srchHtmlFiles = true;

		//
		// The following expressions are used to locate areas of the page
		// to ensure the correct match context.  These areas are often containers
		// for actual references, such as the <STYLE> tag.
		//
		private static readonly Regex m_styleTagContainment =
			new Regex(Utilities.GetResolutionPattern("LR_STYLETAG_CONTAINMENT"),
			RegexOptions.Compiled | RegexOptions.IgnoreCase);
		private static readonly Regex m_styleAttContainment =
			new Regex(Utilities.GetResolutionPattern("LR_STYLEATT_CONTAINMENT"),
			RegexOptions.Compiled | RegexOptions.IgnoreCase);

		//
		// Expressions used to locate image references.  They are built once
		// and shared by every instance, hence static and read-only.
		//
		private static readonly Regex m_linkPatternBkTag =
			new Regex(LinkMatchPatterns.BackImagePattern1,
			RegexOptions.Compiled | RegexOptions.IgnoreCase);
		private static readonly Regex m_linkPatternStyleAtt =
			new Regex(LinkMatchPatterns.BackImagePattern2,
			RegexOptions.Compiled | RegexOptions.IgnoreCase);
		private static readonly Regex m_linkPatternImgTag =
			new Regex(LinkMatchPatterns.ForeImagePattern1,
			RegexOptions.Compiled | RegexOptions.IgnoreCase);
		private static readonly Regex m_linkPatternImgBntTag =
			new Regex(LinkMatchPatterns.ForeImagePattern2,
			RegexOptions.Compiled | RegexOptions.IgnoreCase);
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Software Developer (Senior)
United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions