Click here to Skip to main content
15,896,063 members
Articles / Programming Languages / C#

Harvesting Web Content into MHTML Archive

Rate me:
Please Sign up or sign in to vote.
4.69/5 (14 votes)
18 Feb 2006 · 8 min read · 62.1K   729   49
Capture and archive web resources by saving them in RFC-2557 (MHTML) compliant format. This library includes a framework for augmenting the capture process and allowing programmatic control, from downloading through saving. A replacement for CDOSYS/CDONTS.
using System;
using Rilling.MhtmlLib.Media.Artifacts;
using System.Collections;
using System.Web;
using System.Net;
using System.Security.Cryptography;
using System.Text;
using System.IO;
using Rilling.MhtmlLib.Media.Handlers;
using Rilling.MhtmlLib.Media;
using Rilling.MhtmlLib.Media.Resolvers;
using Rilling.MhtmlLib.IO;
using Rilling.MhtmlLib.Collections;

namespace Rilling.Web.Mhtml
{
	/// <summary>
	///		In-memory representation of an MHTML document: a collection of
	///		artifacts plus the bookkeeping (base URL, base HTML artifact,
	///		relation map, capture date) that accompanies them.
	/// </summary>
	/// <remarks>
	///		Several members are unfinished stubs in this version:
	///		<see cref="LoadFromFile"/> always throws, <see cref="ToXml"/> and
	///		<see cref="Clone"/> return <c>null</c>, the <see cref="ReferenceMode"/>
	///		setter discards its value, and the four Content* events are never
	///		raised. They are kept as-is so existing callers keep compiling.
	/// </remarks>
	public class MhtmlDocument : ICloneable
	{
		// Lazily created wrapper around m_graph; see the Artifacts property.
		private ArtifactCollection m_coll = null;

		// Underlying artifact graph. Internal so other types in this assembly can reach it directly.
		internal GraphCollection m_graph = new GraphCollection();

		// Relation map exposed through RelationMap; created in the constructor.
		internal ArtifactRelationMap m_pageMap = null;

		private Uri m_baseUrl = null;
		private HtmlArtifact m_baseHtml = null;
		private DateTime m_captureDate;

		/// <summary>
		///		Creates an empty document with a fresh relation map.
		/// </summary>
		public MhtmlDocument()
		{
			m_pageMap = new ArtifactRelationMap();
		}

		/// <summary>
		///		Gets the artifacts of this document. The collection is created
		///		on first access as an <c>ArtifactCollection</c> over a
		///		<c>CompleteArtifactView</c> of the internal graph, and the same
		///		instance is returned on every later access.
		/// </summary>
		public ArtifactCollection Artifacts
		{
			get
			{
				if (m_coll == null)
				{
					m_coll = new ArtifactCollection(new CompleteArtifactView(m_graph));
				}

				return m_coll;
			}
		}

		/// <summary>
		///		Gets the relation map created by the constructor.
		/// </summary>
		public Rilling.MhtmlLib.Media.ArtifactRelationMap RelationMap
		{
			get { return m_pageMap; }
		}

		/// <summary>
		///		Gets or sets the base URL associated with this document.
		///		Defaults to <c>null</c>.
		/// </summary>
		public Uri BaseUrl
		{
			get { return m_baseUrl; }
			set { m_baseUrl = value; }
		}

		/// <summary>
		///		Gets or sets the capture date stored with this document. This
		///		class never assigns it itself; it holds whatever the caller
		///		(or another type in the assembly) stores.
		/// </summary>
		public DateTime CaptureDate
		{
			get { return m_captureDate; }
			set { m_captureDate = value; }
		}

		/// <summary>
		///		Gets or sets the HTML artifact treated as the base of this
		///		document. Defaults to <c>null</c>.
		/// </summary>
		public HtmlArtifact BaseHtml
		{
			get { return m_baseHtml; }
			set { m_baseHtml = value; }
		}

		/// <summary>
		///		Gets the reference mode of the document.
		/// </summary>
		/// <remarks>
		///		NOTE(review): stub. The getter always reports the enum value 0
		///		(<c>ReferenceByLocation</c>) and the setter silently ignores
		///		the assigned value.
		/// </remarks>
		public ReferenceMode ReferenceMode
		{
			get { return 0; } // 0 is ReferenceByLocation, the first enum member
			set { }
		}

		// No code in this class raises these events yet; they are part of
		// the public surface and kept for source compatibility.
		public event EventHandler ContentLoading;
		public event EventHandler ContentLoaded;
		public event EventHandler ContentSaving;
		public event EventHandler ContentSaved;

		/// <summary>
		///		Clears the <see cref="Artifacts"/> collection. No other field
		///		of the document is reset by this method.
		/// </summary>
		public void Clear()
		{
			Artifacts.Clear();
		}

		/// <summary>
		///		Populates this document instance with content from the
		///		specified web address by handing the document to an
		///		<c>HttpContentAdapter</c>.
		/// </summary>
		/// <param name="webAddress">
		///		Address of the resource to load. Must not be <c>null</c>.
		/// </param>
		/// <exception cref="ArgumentNullException">
		///		<paramref name="webAddress"/> is <c>null</c>.
		/// </exception>
		public void LoadFromUrl(Uri webAddress)
		{
			if (webAddress == null)
			{
				// The single-string constructor treats its argument as the
				// parameter name, so use the (paramName, message) overload.
				throw new ArgumentNullException("webAddress", "No address provided.");
			}

			HttpContentAdapter adapter = new HttpContentAdapter(webAddress);
			adapter.LoadMhtml(this);
		}

		/// <summary>
		///		Asks <c>ContentHandlerBase.GetArtifact</c> for an artifact
		///		matching the response and, when one is returned, marks this
		///		document as its owner.
		/// </summary>
		/// <param name="src">
		///		The web response to build the artifact from.
		/// </param>
		/// <returns>
		///		The artifact owned by this document, or <c>null</c> when
		///		<c>GetArtifact</c> returned <c>null</c>.
		/// </returns>
		public ArtifactBase CreateArtifact(HttpWebResponse src)
		{
			ArtifactBase artifact = ContentHandlerBase.GetArtifact(src);

			if (artifact != null)
			{
				artifact.Owner = this;
			}

			return artifact;
		}

		/// <summary>
		///		Not implemented in this version.
		/// </summary>
		/// <param name="localPath">
		///		Ignored.
		/// </param>
		/// <exception cref="NotImplementedException">
		///		Always thrown.
		/// </exception>
		public void LoadFromFile(string localPath)
		{
			throw new NotImplementedException();
		}

		/// <summary>
		///		Saves this document by handing it to a
		///		<c>FileContentAdapter</c> created for the given path.
		/// </summary>
		/// <param name="localPath">
		///		Path passed to the <c>FileContentAdapter</c> constructor.
		/// </param>
		public void SaveToFile(string localPath)
		{
			FileContentAdapter adapter = new FileContentAdapter(localPath);
			adapter.SaveMhtml(this);
		}

		/// <summary>
		///		Returns the same artifact as <see cref="BaseHtml"/>, typed as
		///		<c>ArtifactBase</c>. No lookup is performed; the result is
		///		<c>null</c> until <see cref="BaseHtml"/> has been assigned.
		/// </summary>
		/// <returns>
		///		The current base HTML artifact, or <c>null</c>.
		/// </returns>
		public ArtifactBase GetBaseHtml()
		{
			return m_baseHtml;
		}

		/// <summary>
		///		NOTE(review): stub. XML serialization is not implemented.
		/// </summary>
		/// <returns>
		///		Always <c>null</c>.
		/// </returns>
		public string ToXml()
		{
			return null;
		}

		/// <summary>
		///		NOTE(review): stub. Cloning is not implemented, so callers
		///		must be prepared for a <c>null</c> result.
		/// </summary>
		/// <returns>
		///		Always <c>null</c>.
		/// </returns>
		public object Clone()
		{
			return null;
		}
	}

	/// <summary>
	///		Modes reported by <c>MhtmlDocument.ReferenceMode</c>.
	/// </summary>
	/// <remarks>
	///		NOTE(review): the member names suggest a choice between referring
	///		to archived parts by their location and by an identifier; nothing
	///		in this file consumes the value, so confirm against the code that
	///		writes the archive.
	/// </remarks>
	public enum ReferenceMode
	{
		/// <summary>Underlying value 0, which is also the enum's default value.</summary>
		ReferenceByLocation = 0,

		/// <summary>Underlying value 1.</summary>
		ReferenceByIdentifier = 1
	}

}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Software Developer (Senior)
United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions