Click here to Skip to main content
15,892,059 members
Articles / Programming Languages / C#

Harvesting Web Content into MHTML Archive

Rate me:
Please Sign up or sign in to vote.
4.69/5 (14 votes)
18 Feb 2006 · 8 min read 62K   729   49
Capture and archive web resources by saving them in RFC-2557 (MHTML) compliant format. This library includes a framework for augmenting the capture process and allowing programmatic control, from downloading through saving. A replacement for CDOSYS/CDONTS.
using System;
using System.Reflection;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO;
using Rilling.MhtmlLib.Collections;
using System.Text;
using Rilling.Web.Mhtml;

namespace Rilling.MhtmlLib.Media.Artifacts
{
	/// <summary>
	///		Base class for all types that hold a piece of web content.
	///		The content is stored as raw bytes; derived classes are
	///		responsible for interpreting the data and making it meaningful.
	/// </summary>
	public abstract class ArtifactBase
	{
		/// <summary>
		///		Initializes a new instance that wraps the supplied content.
		/// </summary>
		/// <param name="data">
		///		Source content for this object.  Must not be null.
		/// </param>
		/// <exception cref="ArgumentNullException">
		///		<paramref name="data"/> is null.
		/// </exception>
		public ArtifactBase(Stream data)
		{
			if (data == null)
				throw new ArgumentNullException("data");

			m_contentData = new VersionStream(data, true);
		}

		/// <summary>
		///		Intended to return the parent artifacts that reference this
		///		content.  Duplicate content is only included once in the
		///		MHTML, so a single artifact may belong to more than one
		///		parent (for example an image used by several pages).
		/// </summary>
		/// <remarks>
		///		NOTE: the parent collection is never created by this class,
		///		so this property currently returns null whenever an owner
		///		is set.  Callers must be prepared for a null result.
		/// </remarks>
		/// <exception cref="InvalidOperationException">
		///		The artifact has no <see cref="Owner"/>.
		/// </exception>
		public ArtifactCollection ParentArtifacts
		{
			get
			{
				if (Owner == null)
					throw new InvalidOperationException("No document context is defined.");

				// todo: create the artifact collection.

				return m_parentColl;
			}
		}

		/// <summary>
		///		Returns a collection containing all the artifacts referenced
		///		by this content.  The collection is created on first access
		///		from the owning document's graph and cached until the
		///		<see cref="Owner"/> changes.
		/// </summary>
		/// <exception cref="InvalidOperationException">
		///		The artifact has no <see cref="Owner"/>.
		/// </exception>
		public ArtifactCollection ChildArtifacts
		{
			get
			{
				if (Owner == null)
					throw new InvalidOperationException("No document context is defined.");

				if (m_coll == null)
					m_coll = new ArtifactCollection(new ChildrenArtifactView(m_owner.m_graph, this));

				return m_coll;
			}
		}

		/// <summary>
		///		Gets or sets the <see cref="MhtmlDocument"/> that contains
		///		this artifact, or null if the artifact does not belong to
		///		any document.
		/// </summary>
		/// <remarks>
		///		Assigning an owner only records the reference; this setter
		///		does not add the artifact to, or remove it from, the
		///		document's artifact list.
		/// </remarks>
		public MhtmlDocument Owner
		{
			get { return m_owner; }
			set
			{
				if (m_owner != value)
				{
					m_owner = value;

					// The cached child collection was built over the previous
					// owner's graph; drop it so the next ChildArtifacts access
					// rebuilds it against the new owner.
					m_coll = null;
				}
			}
		}

		/// <summary>
		///		The length, in bytes, of the underlying content stream.
		///		This is strictly a byte count and does not reflect any
		///		conversions such as encoding formats.
		/// </summary>
		public long DataSize
		{
			get { return m_contentData.Length; }
		}

		/// <summary>
		///		Gets the <see cref="VersionStream"/> that holds the raw
		///		content data.
		/// </summary>
		/// <remarks>
		///		The same stream instance is returned on every call and its
		///		position is not modified by this getter; callers that need
		///		to read from the start must seek explicitly.
		/// </remarks>
		public VersionStream BaseStream
		{
			get { return m_contentData; }
		}

		/// <summary>
		///		Gets or sets the location this content was originally
		///		retrieved from.  The location may no longer exist.
		/// </summary>
		/// <remarks>
		///		The getter returns null until a location has been assigned.
		///		Once assigned, the location cannot be reset to null.
		/// </remarks>
		/// <exception cref="ArgumentNullException">
		///		The assigned value is null.
		/// </exception>
		public Uri Location
		{
			get { return m_location; }
			set
			{
				if (value == null)
					throw new ArgumentNullException("value");

				m_location = value;
			}
		}

		/// <summary>
		///		The media type for this content.  This maps to the
		///		CONTENT-TYPE field in the content's header.
		/// </summary>
		/// <remarks>
		///		A media type is always present: assigning null, an empty
		///		string or whitespace stores <see cref="DefaultMediaType"/>
		///		(plain binary content) instead.
		/// </remarks>
		public string ContentType
		{
			get { return m_mediaType; }
			set
			{
				if (value == null || value.Trim() == String.Empty)
					value = DefaultMediaType;

				m_mediaType = value;
			}
		}

		private const string DefaultMediaType = "application/octet-stream";

		private VersionStream m_contentData			= null;
		private Uri m_location						= null;
		private string m_mediaType					= DefaultMediaType;
		private MhtmlDocument m_owner				= null;
		// Never assigned yet; see the todo in ParentArtifacts.
		private ArtifactCollection m_parentColl		= null;
		// Lazily created child view; reset whenever the owner changes.
		private ArtifactCollection m_coll			= null;
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Software Developer (Senior)
United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions