Harvesting Web Content into MHTML Archive

Peter Rilling

Rate me:

4.69/5 (14 votes)

18 Feb 2006 · 8 min read

61.6K

729

Capture and archive web resources by saving them in RFC-2557 (MHTML) compliant format. This library includes a framework for augmenting the capture process and allowing programmatic control, from downloading through saving. A replacement for CDOSYS/CDONTS.

mhtmllib_src.zip
- Rilling.MhtmlLib
  - docs
    - api
    - design
      - ClassDiagram.vsd
    - tutorials
  - MhtmlLib.sln
  - src
    - Rilling.Web.Mhtml.TestHarness
      - ArtifactBaseTest.cs
      - AssemblyInfo.cs
      - AudioArtifactTest.cs
      - AudioContentHandlerTest.cs
      - AudioLinkResolverTest.cs
      - AudioMimeHandlerTest.cs
      - bin
        
        Debug
        
        ConfigurationSettings
        
        SampleData
        
        AudioSample.wav
        
        HtmlSample.html
        
        ImageSample.gif
        
        ScriptSample.js
        
        StylesheetSample.css
        
        TextSample.txt
        
        vssver.scc
        
        XmlSample.xml
        
        XmlSampleBad.xsd
        
        XmlSampleBad.xsx
        
        XmlSampleGood.xsd
        
        XmlSampleGood.xsx
        
        Release
      - BinaryArtifactTest.cs
      - BinaryContentHandlerTest.cs
      - BinaryMimeHandlerTest.cs
      - ConfigurationSettings
        
        LinkResolverSettings_AllResolversDefined.xml
        
        LinkResolverSettings_BadResolverDefined.xml
        
        LinkResolverSettings_DisabledResolversDefined.xml
        
        LinkResolverSettings_NoResolversDefined.xml
      - ContentHandlerBaseTest.cs
      - FrameLinkResolverTest.cs
      - HelperUtilities.cs
      - HtmlArtifactTest.cs
      - HtmlContentHandlerTest.cs
      - HtmlMimeHandlerTest.cs
      - ImageArtifactTest.cs
      - ImageContentHandlerTest.cs
      - ImageLinkResolverTest.cs
      - ImageMimeHandlerTest.cs
      - LinkIdentificationTest.cs
      - LinkResolution
        
        LinkResolverConfigurationHandlerTest.cs
      - LinkResolverBaseTest.cs
      - MhtmlTextWriterTest.cs
      - MimeHandlerBaseTest.cs
      - MimeHandlerCollectionTest.cs
      - mssccprj.scc
      - obj
        
        Debug
        
        Refactor
        
        temp
        
        TempPE
        
        Release
        
        TempPE
      - Properties
        
        AssemblyInfo.cs
      - Rilling.Web.Mhtml.TestHarness.csproj
      - Rilling.Web.Mhtml.TestHarness.csproj.user
      - Rilling.Web.Mhtml.TestHarness.csproj.vspscc
      - Rilling.Web.Mhtml.TestHarness.dll.config
      - SampleData
        
        AudioSample.wav
        
        HtmlSample.html
        
        ImageSample.gif
        
        ScriptSample.js
        
        StylesheetSample.css
        
        TextSample.txt
        
        vssver.scc
        
        XmlSample.xml
        
        XmlSampleBad.xsd
        
        XmlSampleBad.xsx
        
        XmlSampleGood.xsd
        
        XmlSampleGood.xsx
      - Samples
      - ScriptArtifactTest.cs
      - ScriptContentHandlerTest.cs
      - ScriptLinkResolverTest.cs
      - ScriptMimeHandlerTest.cs
      - StylesheetArtifactTest.cs
      - StylesheetContentHandlerTest.cs
      - StylesheetLinkResolverTest.cs
      - StylesheetMimeHandlerTest.cs
      - TextArtifactTest.cs
      - TextContentHandlerTest.cs
      - TextMimeHandlerTest.cs
      - vssver.scc
      - XmlArtifactTest.cs
      - XmlContentHandlerTest.cs
      - XmlIslandLinkResolverTest.cs
      - XmlMimeHandlerTest.cs
    - Rilling.Web.Mhtml.TestHarness_old
      - ArtifactBaseTest.cs
      - AssemblyInfo.cs
      - AudioArtifactTest.cs
      - AudioLinkResolverTest.cs
      - AudioMimeHandlerTest.cs
      - bin
        
        Debug
        
        ConfigurationSettings
        
        SampleData
        
        AudioSample.wav
        
        HtmlSample.html
        
        ImageSample.gif
        
        ScriptSample.js
        
        StylesheetSample.css
        
        TextSample.txt
        
        vssver.scc
        
        XmlSample.xml
        
        XmlSampleBad.xsd
        
        XmlSampleBad.xsx
        
        XmlSampleGood.xsd
        
        XmlSampleGood.xsx
      - BinaryArtifactTest.cs
      - BinaryMimeHandlerTest.cs
      - ConfigurationSettings
        
        LinkResolverSettings_AllResolversDefined.xml
        
        LinkResolverSettings_BadResolverDefined.xml
        
        LinkResolverSettings_DisabledResolversDefined.xml
        
        LinkResolverSettings_NoResolversDefined.xml
      - FrameLinkResolverTest.cs
      - HelperUtilities.cs
      - HtmlArtifactTest.cs
      - HtmlMimeHandlerTest.cs
      - ImageArtifactTest.cs
      - ImageLinkResolverTest.cs
      - ImageMimeHandlerTest.cs
      - LinkResolution
        
        LinkResolverConfigurationHandlerTest.cs
      - MhtmlTextWriterTest.cs
      - MimeHandlerBaseTest.cs
      - MimeHandlerCollectionTest.cs
      - mssccprj.scc
      - obj
        
        Debug
        
        temp
        
        TempPE
      - Rilling.Web.Mhtml.TestHarness.csproj
      - Rilling.Web.Mhtml.TestHarness.csproj.user
      - Rilling.Web.Mhtml.TestHarness.csproj.vspscc
      - Rilling.Web.Mhtml.TestHarness.dll.config
      - SampleData
        
        AudioSample.wav
        
        HtmlSample.html
        
        ImageSample.gif
        
        ScriptSample.js
        
        StylesheetSample.css
        
        TextSample.txt
        
        vssver.scc
        
        XmlSample.xml
        
        XmlSampleBad.xsd
        
        XmlSampleBad.xsx
        
        XmlSampleGood.xsd
        
        XmlSampleGood.xsx
      - ScriptArtifactTest.cs
      - ScriptLinkResolverTest.cs
      - ScriptMimeHandlerTest.cs
      - StylesheetArtifactTest.cs
      - StylesheetLinkResolverTest.cs
      - StylesheetMimeHandlerTest.cs
      - TextArtifactTest.cs
      - TextMimeHandlerTest.cs
      - vssver.scc
      - XmlArtifactTest.cs
      - XmlIslandLinkResolverTest.cs
      - XmlMimeHandlerTest.cs
    - Rilling.Web.Mhtml.TestSite
      - _vti_cnf
      - _vti_pvt
        
        access.cnf
        
        deptodoc.btr
        
        doctodep.btr
        
        service.cnf
        
        service.lck
        
        services.cnf
      - _vti_script
      - _vti_txt
      - AudioFile1.wav
      - bin
        
        Rilling.Web.Mhtml.TestSite.dll
      - BinaryFile1.wav
      - Common
        
        Styles
      - CSSSample.css
      - ExtSiteImages
        
        Google
        
        Google.htm
        
        Google_files
        
        summer2004_closing.gif
        
        vssver.scc
        
        vssver.scc
      - HtmlFile1.htm
      - ImageFile1.gif
      - Images
        
        s001.jpg
        
        s002.jpg
        
        s003.jpg
        
        s004.jpg
        
        s005.jpg
        
        s006.jpg
        
        s007.jpg
        
        s008.jpg
        
        s009.jpg
        
        s010.jpg
        
        s011.jpg
        
        s012.jpg
        
        s013.jpg
        
        s014.jpg
        
        s015.jpg
        
        SampleImg.gif
        
        SampleImg.jpg
        
        vssver.scc
      - ImageSamples.htm
      - mssccprj.scc
      - RefSamples
        
        css01.css
        
        css02.css
        
        css03.css
        
        css04.css
        
        css05.css
        
        css06.css
        
        css07.css
        
        css08.css
        
        frame01.html
        
        frame02.html
        
        frame03.html
        
        frame04.html
        
        frame05.html
        
        frame06.html
        
        frame07.html
        
        frame08.html
        
        frame09.html
        
        frame10.html
        
        frame11.html
        
        frame12.html
        
        frame13.html
        
        frame14.html
        
        frame15.html
        
        frame16.html
        
        frame17.html
        
        frame18.html
        
        frame19.html
        
        frame20.html
        
        frame21.html
        
        frame22.html
        
        frame23.html
        
        frame24.html
        
        img01.jpg
        
        img02.jpg
        
        img03.jpg
        
        img04.jpg
        
        img05.jpg
        
        img06.jpg
        
        img07.jpg
        
        img08.jpg
        
        img09.jpg
        
        img10.jpg
        
        img11.jpg
        
        img12.jpg
        
        img13.jpg
        
        img14.jpg
        
        img15.jpg
        
        script01.js
        
        script01.vbs
        
        script02.js
        
        script02.vbs
        
        script03.js
        
        script03.vbs
        
        script04.js
        
        script04.vbs
        
        script05.js
        
        script05.vbs
        
        script06.js
        
        script06.vbs
        
        script07.js
        
        script07.vbs
        
        script08.js
        
        script08.vbs
        
        script09.js
        
        script09.vbs
        
        script10.js
        
        script10.vbs
        
        script11.js
        
        script11.vbs
        
        script12.js
        
        script12.vbs
        
        sound01.wav
        
        sound02.wav
        
        sound03.wav
        
        sound04.wav
        
        sound05.wav
        
        sound06.wav
        
        sound07.wav
        
        sound08.wav
        
        sound09.wav
        
        sound10.wav
        
        sound11.wav
        
        sound12.wav
        
        vssver.scc
        
        xml01.xml
        
        xml02.xml
        
        xml03.xml
        
        xml04.xml
        
        xml05.xml
        
        xml06.xml
        
        xml07.xml
        
        xml08.xml
        
        xml09.xml
        
        xml10.xml
        
        xml11.xml
        
        xml12.xml
      - Rilling.Web.Mhtml.TestSite.csproj
      - Rilling.Web.Mhtml.TestSite.csproj.vspscc
      - Rilling.Web.Mhtml.TestSite.csproj.webinfo
      - ScriptFile1.js
      - StylesheetFile1.css
      - TextFile1.txt
      - vssver.scc
      - XMLFile1.xml
      - XMLFile1.xsd
      - XMLFile1.xsx
      - XMLFile2.xsd.xsd
      - XMLFile2.xsd.xsx
    - Rilling.Web.Mhtml
      - AssemblyInfo.cs
      - BadImageFormatException.cs
      - bin
        
        Debug
        
        Rilling.Web.Mhtml.XML
        
        Release
      - ClassDiagram1.cd
      - ClassDiagram2.cd
      - ClassDiagram3.cd
      - Collections
        
        ArtifactCollection.cs
        
        ArtifactViewBase.cs
        
        ChildrenArtifactCollection.cs
        
        ChildrenArtifactView.cs
        
        CompleteArtifactCollection.cs
        
        CompleteArtifactView.cs
        
        Graph
        
        AdjacencyList.cs
        
        EdgeToNeighbor.cs
        
        GraphCollection.cs
        
        Node.cs
        
        NodeList.cs
        
        MimeHandlerCollection.cs
        
        ParentArtifactCollection.cs
        
        vssver.scc
      - Configuration
        
        ConfigManager.cs
        
        IExpandedConfiguration.cs
        
        LinkResolverConfigurationHandler.cs
        
        MimeHandlersConfigurationHandler.cs
        
        vssver.scc
      - ErrorMessages.resx
      - Exceptions
        
        ArtifactFormatException.cs
        
        BadImageFormatException.cs
        
        DuplicateHandlerRegistrationException.cs
        
        IncompatibleArtifactException.cs
        
        LinkReplacementOverlapException.cs
        
        MhtmlException.cs
        
        MimeFormatException.cs
        
        ResourceNotFoundException.cs
        
        ResourceTimeoutException.cs
        
        TypeNotLoadedException.cs
        
        vssver.scc
      - ExtractionAdapters
        
        tmp
        
        FileCreationAdapter.cs
        
        FilePersistAdapter.cs
        
        ICreationAdapter.cs
        
        IPersistAdapter.cs
        
        StreamFormationAdapter.cs
        
        vssver.scc
        
        WebCreationAdapter.cs
      - HttpHeaderField.cs
      - IContentDecorator.cs
      - IO
        
        FileContentAdapter.cs
        
        HttpContentAdapter.cs
        
        IContentReader.cs
        
        IContentWriter.cs
        
        MhtmlTextReader.cs
        
        MhtmlTextWriter.cs
        
        VersionStream.cs
        
        vssver.scc
      - IReferenceSubsituteHandler.cs
      - LinkMatchPatterns.Designer.cs
      - LinkMatchPatterns.resx
      - Media
        
        ArtifactRelationMap.cs
        
        Artifacts
        
        ArtifactBase.cs
        
        AudioArtifact.cs
        
        BinaryArtifact.cs
        
        HtmlArtifact.cs
        
        ImageArtifact.cs
        
        ScriptArtifact.cs
        
        StylesheetArtifact.cs
        
        TextArtifact.cs
        
        vssver.scc
        
        XmlArtifact.cs
        
        ContentHarvestedEventArgs.cs
        
        ContentHarvestedEventHandler.cs
        
        ContentTypes.cs
        
        Handlers
        
        AudioContentHandler.cs
        
        BinaryContentHandler.cs
        
        ContentHandlerBase.cs
        
        HtmlContentHandler.cs
        
        IContentFactory.cs
        
        ImageContentHandler.cs
        
        ScriptContentHandler.cs
        
        StylesheetContentHandler.cs
        
        TextContentHandler.cs
        
        vssver.scc
        
        XmlContentHandler.cs
        
        MhtmlContext.cs
        
        Resolvers
        
        AudioLinkResolver.cs
        
        FrameLinkResolver.cs
        
        HyperlinkLocatedEventArgs.cs
        
        HyperlinkLocatedEventHandler.cs
        
        ILinkResolver.cs
        
        ImageLinkResolver.cs
        
        LinkIdentification.cs
        
        LinkResolverBase.cs
        
        ScriptLinkResolver.cs
        
        StylesheetLinkResolver.cs
        
        vssver.scc
        
        XmlIslandLinkResolver.cs
      - MhtmlDocument.cs
      - mssccprj.scc
      - obj
        
        Debug
        
        Refactor
        
        temp
        
        TempPE
        
        Release
        
        TempPE
      - ResolverInfo.cs
      - Resources
      - Rilling.MhtmlLib.csproj
      - Rilling.MhtmlLib.csproj.user
      - Rilling.Web.Mhtml.csproj.vspscc
      - Rilling.Web.Mhtml.xml
      - Utilities.cs
      - VersionStream.cs
      - vssver.scc
      - zzzJunk
        
        ExtractionAdapters
        
        tmp
        
        FileCreationAdapter.cs
        
        FilePersistAdapter.cs
        
        ICreationAdapter.cs
        
        IPersistAdapter.cs
        
        StreamFormationAdapter.cs
        
        vssver.scc
        
        WebCreationAdapter.cs
        
        IMimeHandlerFactory.cs
        
        IPassthroughInitializer.cs
        
        LinkReference.cs
        
        ReferencedTarget.cs
        
        vssver.scc
    - WinCapture
      - App.config
      - App.ico
      - AssemblyInfo.cs
      - AudioDocumentInfo.cs
      - bin
        
        Debug
        
        Release
        
        WinCapture.exe
        
        WinCapture.exe.config
        
        WinCapture.vshost.exe
        
        WinCapture.vshost.exe.config
      - Capture.cs
      - Capture.resx
      - DocumentInfoBase.cs
      - EmbeddedTextFile.cs
      - GenealogyTreeNodeHandler.cs
      - GenericContentInfo.cs
      - HtmlDocumentInfo.cs
      - ImageDocumentInfo.cs
      - MhtmlAgentInfo.cs
      - MhtmlCapture.build
      - MhtmlDocumentInfo.cs
      - MhtmlTreeNodeHandler.cs
      - mssccprj.scc
      - obj
        
        Debug
        
        Refactor
        
        temp
        
        TempPE
        
        Release
        
        ResolveAssemblyReference.cache
        
        TempPE
      - OtherDocumentInfo.cs
      - PageTypeTreeNodeHandler.cs
      - Properties
        
        Resources.Designer.cs
        
        Resources.resx
      - ScriptDocumentInfo.cs
      - StylesheetDocumentInfo.cs
      - TextDocumentInfo.cs
      - TreeNodeHandlerBase.cs
      - UrlPrompt.cs
      - UrlPrompt.resx
      - vssver.scc
      - WinCapture.csproj
      - WinCapture.csproj.user
      - WinCapture.csproj.vspscc
      - XmlDocumentInfo.cs
mhtmllib_demo.zip
- Rilling.MhtmlLib.dll
- WinCapture.exe
- WinCapture.exe.config

//using System;
//using Rilling.Web.Mhtml.Mime.Artifacts;
//using System.Diagnostics;
//using System.Collections;
//using System.IO;
//using System.Net;
//using Rilling.Web.Mhtml.Resolvers;
//using Rilling.Web.Mhtml;
//using Rilling.Web.Mhtml.Mime.Handlers;
//
//namespace Rilling.Web.Mhtml
//{
//	public class WebCreationAdapter : ICreationAdapter
//	{
//		static WebCreationAdapter()
//		{
//			MimeHandlerInfo[] handlers = ConfigManager.GetMimeHandlers();
//			foreach(MimeHandlerInfo handler in handlers)
//			{
//				MimeHandlerBase.RegisteredHandlers.Add(handler);
//			}
//
//			ILinkResolver[] resolvers = ConfigManager.GetLinkResolvers();
//			foreach(ILinkResolver resolver in resolvers)
//			{
//				LinkResolverBase.Register(resolver);
//			}
//		}
//
//		/// <summary>
//		///		Loads a page from the web and populates the 
//		///		specified <see cref="MhtmlDocument"/>.
//		/// </summary>
//		/// <param name="mht"></param>
//		public void LoadMhtml(MhtmlDocument mht)
//		{
//			if(mht == null)
//				throw(new ArgumentNullException("Must specify a document to populate.", "mht"));
//			string contentName = null;
//			string contentPath = null;
//
//			// So other methods have access to content.
//			m_affectedMht = mht;
//
//			// Make sure the document is empty.
//			mht.Clear();
//
//			// Download and collect all the referenced web content.
//			WebHarvester wh = new WebHarvester(BaseUrl);
//			foreach(ArtifactBase content in wh.Capture())
//			{
//				// All sections in the MHTML are identified by a 
//				// unique identifier.  Generate this identifier
//				// based on the path that it came from.
//				contentPath = content.Location.AbsoluteUri;
//				contentName = 
//					MhtmlDocument.GenerateContentName(contentPath);
//
//				// Add the content to the document.
//				m_affectedMht.EmbeddedFiles.Add(contentName, content);
//			}
//
//			mht.CaptureDate		= DateTime.Now;
//			mht.BaseUrl			= BaseUrl;
//		}
//
//		/// <summary>
//		///		Helper class that collects all the web contents
//		///		from the page.  This class basically does most of the
//		///		work.
//		/// </summary>
//		private class WebHarvester
//		{
//			/// <summary>
//			///		Starting at the base page, collects all content
//			///		from that page, parsing references as needed so
//			///		that the returned content is a complete collection
//			///		of what is desired.
//			/// </summary>
//			/// <returns></returns>
//			public ArtifactBase[] Capture()
//			{
//				Queue webLinks = new Queue();
//				Hashtable pageParts = new Hashtable();
//
//				webLinks.Enqueue( m_baseHtmlAddress );
//
//				while( webLinks.Count != 0 )
//				{
//					LinkReplacementInfo tmp = 
//						(LinkReplacementInfo)webLinks.Dequeue();
//
//					if( !pageParts.Contains( tmp.TargetUrl.AbsoluteUri ) )
//					{
//						ArtifactBase content = null;
//
//						Trace.WriteLine("Downloading the resource '" + tmp.TargetUrl.AbsoluteUri + "'.");
//						Trace.Indent();
//
//						try
//						{
//							content = GetWebContent( tmp.TargetUrl );
//						} 
//						catch(WebException e)
//						{
//							object status = null;
//							if(e.Response == null)
//								status = e.Status;
//							else
//								status = ((HttpWebResponse)e.Response).StatusCode;
//
//							Trace.WriteLine("... Unable to download (" + status + ").");
//							Trace.Unindent();
//							continue;
//						}
//
//						if( content.MediaType == "text/css" || 
//							content.MediaType == "text/html" )
//						{
//							LinkReplacementInfo[] foundLinks = 
//								ResolveLinks( (TextArtifact)content );
//
//							Trace.WriteLine("... Resolved " + foundLinks.Length + " references.");
//							Utilities.AddRangeToQueue( webLinks, foundLinks );
//						}
//
//						pageParts.Add( tmp.TargetUrl.AbsoluteUri, content );
//
//						Trace.Unindent();
//					}
//				}
//
//				ArtifactBase[] allContent = 
//					new ArtifactBase[pageParts.Count];
//				int i = 0;
//				foreach( DictionaryEntry de in pageParts )
//				{
//					allContent[i++] = (ArtifactBase)de.Value;
//				}
//
//				return allContent;
//			}
//
//			private LinkReplacementInfo[] 
//				ResolveLinks( TextArtifact content )
//			{
//				ArrayList webLinks = new ArrayList();
//
//				foreach( ILinkResolver lr in LinkResolverBase.RegisteredResolvers )
//				{
//					webLinks.AddRange(lr.LocateReferences(content, 
//											content.Location));
//
//
//
//	//				webLinks.AddRange(lr.LocateReferences(new StreamReader(content.GetStream()), content.Location));
//				}
//
//				return (LinkReplacementInfo[])(webLinks.ToArray(typeof(LinkReplacementInfo)));
//			}
//
////				//ArrayList webLinks		= new ArrayList();
////				ArrayList webContents	= new ArrayList();
////
////				//webLinks.Add(m_baseHtmlAddress);
////				webLinks.Enqueue(m_baseHtmlAddress);
////
////				while(webLinks.Count != 0)
////				{
////					ArtifactBase content = 
////						GetWebContent(webLinks.Dequeue());
////
////					ResolverLinks(webLinks, content);
////				}
////
////
////				for(int i=0; i<webLinks.Count; i++)
////				{
////					ArtifactBase content = 
////						GetWebContent(webLinks[i]);
////
////					webContents.Add(content);
////
////					foreach(ILinkResolver lr in LinkResolverBase.Resolvers)
////					{
////
////					}
////				}
////
////
////
////
////
////				HtmlArtifact htmlContent = GetBaseHtmlContent();
////				ArrayList al = new ArrayList();
////				ArrayList links = new ArrayList();
////
////				if(htmlContent.MediaType == "text/html")
////				{
////					al.Add(htmlContent);
////
////					foreach(ILinkResolver linkResolver in LinkResolverBase.Resolvers)
////					{
////						links.AddRange(linkResolver.LocateReferences(new StreamReader(htmlContent.GetStream()), htmlContent.Location));
////					}
////
////					foreach(LinkReplacementInfo lr in links)
////					{
////						al.Add(GetWebContent(lr.TargetUrl));
////					}
////				}
////
////				ArtifactBase[] temp = 
////					new ArtifactBase[al.Count];
////				al.CopyTo(temp);
//
////				return null;
////			}
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//			public WebHarvester(Uri webAddress)
//			{
//				if(webAddress == null)
//					throw(new ArgumentNullException("webAddress"));
//
//				LinkReplacementInfo linkInfo = new LinkReplacementInfo();
//									linkInfo.Length = -1;
//									linkInfo.StartOffset = -1;
//									linkInfo.TargetUrl = webAddress;
//
//				m_baseHtmlAddress = linkInfo;
//			}
//
//
////			private HtmlArtifact GetBaseHtmlContent()
////			{
////				Uri baseAddress = m_baseHtmlAddress;
////				ArtifactBase ef = GetWebContent(baseAddress);
////				string contentMedia = ef.MediaType;
////
////				if(contentMedia != "text/html")
////					throw(new InvalidOperationException("Expected HTML content."));	// Create new exception.
////
////				return (HtmlArtifact)ef;
////			}
//
//			private ArtifactBase GetWebContent(Uri webAddress)
//			{
//				HttpWebRequest  request = (HttpWebRequest)WebRequest.Create(webAddress);
//				HttpWebResponse response= (HttpWebResponse)request.GetResponse();
//
//				string contentMedia = response.ContentType.Split(';')[0];
//
//				ArtifactBase cc =  MimeHandlerBase.GetArtifact(response);
//
//				response.Close();
//
//				return cc;
//			}
//
//			private int CaptureHelper(Uri webAddress)
//			{
//				return 0;
//			}
//
//			private LinkReplacementInfo m_baseHtmlAddress;
//		}
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//		public WebCreationAdapter(Uri webAddress)
//		{
//			if(webAddress == null) 
//				throw(new ArgumentNullException("Must specify web location.", "webAddress"));
//
//			m_base = webAddress;
//		}
//
//		public WebCreationAdapter(string baseAddress) :
//			this(new Uri(baseAddress))
//		{}
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//		public Uri BaseUrl
//		{
//			get{return m_base;}
//		}
//
//		private string AddToMhtml(HttpWebResponse webData)
//		{
//			return null;
////			if(webData == null) 
////				throw(new ArgumentNullException("webData"));
////
////			EmbeddedFile file = new EmbeddedFile();
////						 file.MediaType = webData.ContentType;
////						 file.OriginalLocation = webData.ResponseUri;
////
//////			StreamReader sr = new StreamReader(webData.GetResponseStream(), true);
//////						 file.Content = sr.ReadToEnd();
////
////			if(webData.ContentType == "text/html" || webData.ContentType == "text/css")
////			{
//////				Uri[] targetUrls = 
//////					LinkResolverBase.LocateReferences(file.Content, file.OriginalLocation);
////
////				// TODO: how do the URLs get updated in content?
////				foreach(Uri url in targetUrls)
////				{
////					AddToMhtml(GetFileFromWeb(url));
//////				}
////			}
////
////			string refName =
////				GenerateContentName(webData.ResponseUri.AbsoluteUri); 
////
////			m_affectedMht.EmbeddedFiles.Add(refName, file);
////
////			return refName;
//		}
//
//		public static string GenerateContentName()
//		{
//			return (Guid.NewGuid()) + "$pmr";
//		}
//
//		public static string GenerateContentName(string hashSeed)
//		{
//			return (Guid.NewGuid()) + "$pmr";
//		}
//
//		private HttpWebResponse GetFileFromWeb(Uri baseUri)
//		{
//			if(baseUri == null)
//				throw(new ArgumentNullException("baseUri"));
//
//			HttpWebRequest request = (HttpWebRequest)WebRequest.Create(baseUri);
//			return (HttpWebResponse)request.GetResponse();
//		}
//
//		private Uri m_base;
//		private MhtmlDocument m_affectedMht;
//	}
//}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here

Written By

Peter Rilling

Software Developer (Senior)

United States

This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Harvesting Web Content into MHTML Archive

License

Comments and Discussions