Click here to Skip to main content
15,879,326 members
Articles / Programming Languages / C#

Harvesting Web Content into MHTML Archive

Rate me:
Please Sign up or sign in to vote.
4.69/5 (14 votes)
18 Feb 2006 · 8 min read 61.6K   729   49  
Capture and archive web resources by saving them in RFC-2557 (MHTML) compliant format. This library includes a framework for augmenting the capture process and allowing programmatic control, from downloading through saving. A replacement for CDOSYS/CDONTS.
//using System;
//using Rilling.Web.Mhtml.Mime.Artifacts;
//using System.Diagnostics;
//using System.Collections;
//using System.IO;
//using System.Net;
//using Rilling.Web.Mhtml.Resolvers;
//using Rilling.Web.Mhtml;
//using Rilling.Web.Mhtml.Mime.Handlers;
//
//namespace Rilling.Web.Mhtml
//{
//	public class WebCreationAdapter : ICreationAdapter
//	{
//		static WebCreationAdapter()
//		{
//			MimeHandlerInfo[] handlers = ConfigManager.GetMimeHandlers();
//			foreach(MimeHandlerInfo handler in handlers)
//			{
//				MimeHandlerBase.RegisteredHandlers.Add(handler);
//			}
//
//			ILinkResolver[] resolvers = ConfigManager.GetLinkResolvers();
//			foreach(ILinkResolver resolver in resolvers)
//			{
//				LinkResolverBase.Register(resolver);
//			}
//		}
//
//		/// <summary>
//		///		Loads a page from the web and populates the 
//		///		specified <see cref="MhtmlDocument"/>.
//		/// </summary>
//		/// <param name="mht"></param>
//		public void LoadMhtml(MhtmlDocument mht)
//		{
//			if(mht == null)
//				throw(new ArgumentNullException("Must specify a document to populate.", "mht"));
//			string contentName = null;
//			string contentPath = null;
//
//			// So other methods have access to content.
//			m_affectedMht = mht;
//
//			// Make sure the document is empty.
//			mht.Clear();
//
//			// Download and collect all the referenced web content.
//			WebHarvester wh = new WebHarvester(BaseUrl);
//			foreach(ArtifactBase content in wh.Capture())
//			{
//				// All sections in the MHTML are identified by a 
//				// unique identifier.  Generate this identifier
//				// based on the path that it came from.
//				contentPath = content.Location.AbsoluteUri;
//				contentName = 
//					MhtmlDocument.GenerateContentName(contentPath);
//
//				// Add the content to the document.
//				m_affectedMht.EmbeddedFiles.Add(contentName, content);
//			}
//
//			mht.CaptureDate		= DateTime.Now;
//			mht.BaseUrl			= BaseUrl;
//		}
//
//		/// <summary>
//		///		Helper class that collects all the web contents
//		///		from the page.  This class basically does most of the
//		///		work.
//		/// </summary>
//		private class WebHarvester
//		{
//			/// <summary>
//			///		Starting at the base page, collects all content
//			///		from that page, parsing references as needed so
//			///		that the returned content is a complete collection
//			///		of what is desired.
//			/// </summary>
//			/// <returns></returns>
//			public ArtifactBase[] Capture()
//			{
//				Queue webLinks = new Queue();
//				Hashtable pageParts = new Hashtable();
//
//				webLinks.Enqueue( m_baseHtmlAddress );
//
//				while( webLinks.Count != 0 )
//				{
//					LinkReplacementInfo tmp = 
//						(LinkReplacementInfo)webLinks.Dequeue();
//
//					if( !pageParts.Contains( tmp.TargetUrl.AbsoluteUri ) )
//					{
//						ArtifactBase content = null;
//
//						Trace.WriteLine("Downloading the resource '" + tmp.TargetUrl.AbsoluteUri + "'.");
//						Trace.Indent();
//
//						try
//						{
//							content = GetWebContent( tmp.TargetUrl );
//						} 
//						catch(WebException e)
//						{
//							object status = null;
//							if(e.Response == null)
//								status = e.Status;
//							else
//								status = ((HttpWebResponse)e.Response).StatusCode;
//
//							Trace.WriteLine("... Unable to download (" + status + ").");
//							Trace.Unindent();
//							continue;
//						}
//
//						if( content.MediaType == "text/css" || 
//							content.MediaType == "text/html" )
//						{
//							LinkReplacementInfo[] foundLinks = 
//								ResolveLinks( (TextArtifact)content );
//
//							Trace.WriteLine("... Resolved " + foundLinks.Length + " references.");
//							Utilities.AddRangeToQueue( webLinks, foundLinks );
//						}
//
//						pageParts.Add( tmp.TargetUrl.AbsoluteUri, content );
//
//						Trace.Unindent();
//					}
//				}
//
//				ArtifactBase[] allContent = 
//					new ArtifactBase[pageParts.Count];
//				int i = 0;
//				foreach( DictionaryEntry de in pageParts )
//				{
//					allContent[i++] = (ArtifactBase)de.Value;
//				}
//
//				return allContent;
//			}
//
//			private LinkReplacementInfo[] 
//				ResolveLinks( TextArtifact content )
//			{
//				ArrayList webLinks = new ArrayList();
//
//				foreach( ILinkResolver lr in LinkResolverBase.RegisteredResolvers )
//				{
//					webLinks.AddRange(lr.LocateReferences(content, 
//											content.Location));
//
//
//
//	//				webLinks.AddRange(lr.LocateReferences(new StreamReader(content.GetStream()), content.Location));
//				}
//
//				return (LinkReplacementInfo[])(webLinks.ToArray(typeof(LinkReplacementInfo)));
//			}
//
////				//ArrayList webLinks		= new ArrayList();
////				ArrayList webContents	= new ArrayList();
////
////				//webLinks.Add(m_baseHtmlAddress);
////				webLinks.Enqueue(m_baseHtmlAddress);
////
////				while(webLinks.Count != 0)
////				{
////					ArtifactBase content = 
////						GetWebContent(webLinks.Dequeue());
////
////					ResolverLinks(webLinks, content);
////				}
////
////
////				for(int i=0; i<webLinks.Count; i++)
////				{
////					ArtifactBase content = 
////						GetWebContent(webLinks[i]);
////
////					webContents.Add(content);
////
////					foreach(ILinkResolver lr in LinkResolverBase.Resolvers)
////					{
////
////					}
////				}
////
////
////
////
////
////				HtmlArtifact htmlContent = GetBaseHtmlContent();
////				ArrayList al = new ArrayList();
////				ArrayList links = new ArrayList();
////
////				if(htmlContent.MediaType == "text/html")
////				{
////					al.Add(htmlContent);
////
////					foreach(ILinkResolver linkResolver in LinkResolverBase.Resolvers)
////					{
////						links.AddRange(linkResolver.LocateReferences(new StreamReader(htmlContent.GetStream()), htmlContent.Location));
////					}
////
////					foreach(LinkReplacementInfo lr in links)
////					{
////						al.Add(GetWebContent(lr.TargetUrl));
////					}
////				}
////
////				ArtifactBase[] temp = 
////					new ArtifactBase[al.Count];
////				al.CopyTo(temp);
//
////				return null;
////			}
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//			public WebHarvester(Uri webAddress)
//			{
//				if(webAddress == null)
//					throw(new ArgumentNullException("webAddress"));
//
//				LinkReplacementInfo linkInfo = new LinkReplacementInfo();
//									linkInfo.Length = -1;
//									linkInfo.StartOffset = -1;
//									linkInfo.TargetUrl = webAddress;
//
//				m_baseHtmlAddress = linkInfo;
//			}
//
//
////			private HtmlArtifact GetBaseHtmlContent()
////			{
////				Uri baseAddress = m_baseHtmlAddress;
////				ArtifactBase ef = GetWebContent(baseAddress);
////				string contentMedia = ef.MediaType;
////
////				if(contentMedia != "text/html")
////					throw(new InvalidOperationException("Expected HTML content."));	// Create new exception.
////
////				return (HtmlArtifact)ef;
////			}
//
//			private ArtifactBase GetWebContent(Uri webAddress)
//			{
//				HttpWebRequest  request = (HttpWebRequest)WebRequest.Create(webAddress);
//				HttpWebResponse response= (HttpWebResponse)request.GetResponse();
//
//				string contentMedia = response.ContentType.Split(';')[0];
//
//				ArtifactBase cc =  MimeHandlerBase.GetArtifact(response);
//
//				response.Close();
//
//				return cc;
//			}
//
//			private int CaptureHelper(Uri webAddress)
//			{
//				return 0;
//			}
//
//			private LinkReplacementInfo m_baseHtmlAddress;
//		}
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//		public WebCreationAdapter(Uri webAddress)
//		{
//			if(webAddress == null) 
//				throw(new ArgumentNullException("Must specify web location.", "webAddress"));
//
//			m_base = webAddress;
//		}
//
//		public WebCreationAdapter(string baseAddress) :
//			this(new Uri(baseAddress))
//		{}
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//		public Uri BaseUrl
//		{
//			get{return m_base;}
//		}
//
//		private string AddToMhtml(HttpWebResponse webData)
//		{
//			return null;
////			if(webData == null) 
////				throw(new ArgumentNullException("webData"));
////
////			EmbeddedFile file = new EmbeddedFile();
////						 file.MediaType = webData.ContentType;
////						 file.OriginalLocation = webData.ResponseUri;
////
//////			StreamReader sr = new StreamReader(webData.GetResponseStream(), true);
//////						 file.Content = sr.ReadToEnd();
////
////			if(webData.ContentType == "text/html" || webData.ContentType == "text/css")
////			{
//////				Uri[] targetUrls = 
//////					LinkResolverBase.LocateReferences(file.Content, file.OriginalLocation);
////
////				// TODO: how do the URLs get updated in the content?
////				foreach(Uri url in targetUrls)
////				{
////					AddToMhtml(GetFileFromWeb(url));
//////				}
////			}
////
////			string refName =
////				GenerateContentName(webData.ResponseUri.AbsoluteUri); 
////
////			m_affectedMht.EmbeddedFiles.Add(refName, file);
////
////			return refName;
//		}
//
//		public static string GenerateContentName()
//		{
//			return (Guid.NewGuid()) + "$pmr";
//		}
//
//		public static string GenerateContentName(string hashSeed)
//		{
//			return (Guid.NewGuid()) + "$pmr";
//		}
//
//		private HttpWebResponse GetFileFromWeb(Uri baseUri)
//		{
//			if(baseUri == null)
//				throw(new ArgumentNullException("baseUri"));
//
//			HttpWebRequest request = (HttpWebRequest)WebRequest.Create(baseUri);
//			return (HttpWebResponse)request.GetResponse();
//		}
//
//		private Uri m_base;
//		private MhtmlDocument m_affectedMht;
//	}
//}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Software Developer (Senior)
United States United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions