Click here to Skip to main content
15,886,689 members
Articles / Programming Languages / Visual Basic

Convert any URL to a MHTML archive using native .NET code

Rate me:
Please Sign up or sign in to vote.
4.85/5 (59 votes)
3 Apr 20055 min read 640.5K   5.7K   164  
A native .NET class for saving URLs: text-only, HTML page, HTML archive, or HTML complete.
Imports System.Text
Imports System.Text.RegularExpressions
Imports System.IO

''' <summary>
''' This class builds the following from a URL:
'''
'''   .mht file (Web Archive, single file)
'''   .htm file with dereferenced (absolute) references (Web Page, HTML Only)
'''   .htm file plus all referenced files in a local subfolder (Web Page, complete) 
'''   .txt file (non-HTML contents of Web Page)
'''
''' The .mht format is based on RFC2557 
'''    "compliant Multipart MIME Message (mhtml web archive)"
'''    http://www.ietf.org/rfc/rfc2557.txt
''' </summary>
''' <remarks>
'''   Jeff Atwood
'''   http://www.codinghorror.com/
''' </remarks>
Public Class Builder

    Private _MhtBuilder As StringBuilder
    Private _StripScriptFromHtml As Boolean = False
    Private _StripIframeFromHtml As Boolean = False
    Private _AllowRecursion As Boolean = True
    Private _AddWebMark As Boolean = True
    Private _ForcedEncoding As System.Text.Encoding = Nothing

    Private _HtmlFile As WebFile

    Friend WebFiles As New SortedList
    Friend WebClient As New WebClientEx

    Private Const _MimeBoundaryTag As String = "----=_NextPart_000_00"

    Public Enum FileStorage
        Memory
        DiskPermanent
        DiskTemporary
    End Enum

    Public Sub New()
        _HtmlFile = New WebFile(Me, FileStorage.Memory)
    End Sub

#Region "  Properties"

    ''' <summary>
    ''' Specifies the target Url we want to save
    ''' </summary>
    Public Property Url() As String
        Get
            Return _HtmlFile.Url
        End Get
        Set(ByVal Value As String)
            WebFiles.Clear()
            _HtmlFile.Url = Value
        End Set
    End Property

    ''' <summary>
    ''' returns the Mime content-type string designation of a mht file
    ''' </summary>
    Public ReadOnly Property MhtContentType() As String
        Get
            Return "message/rfc822"
        End Get
    End Property

    ''' <summary>
    ''' *only* set this if you want to FORCE a specific text encoding for all the HTML pages you're downloading;
    ''' otherwise the text encoding is autodetected, which is generally what you want
    ''' </summary>
    Public Property TextEncoding() As System.Text.Encoding
        Get
            Return _ForcedEncoding
        End Get
        Set(ByVal Value As System.Text.Encoding)
            _ForcedEncoding = Value
        End Set
    End Property

    ''' <summary>
    ''' Add the "Mark of the web" to retrieved HTML content so it can run 
    ''' locally on Windows XP SP2
    ''' </summary>
    ''' <remarks>
    '''   http://www.microsoft.com/technet/prodtechnol/winxppro/maintain/sp2brows.mspx#XSLTsection133121120120
    ''' </remarks>
    Public Property AddWebMark() As Boolean
        Get
            Return _AddWebMark
        End Get
        Set(ByVal Value As Boolean)
            _AddWebMark = Value
        End Set
    End Property

    ''' <summary>
    ''' Strip all &lt;SCRIPT&gt; blocks from any retrieved HTML
    ''' </summary>
    Public Property StripScripts() As Boolean
        Get
            Return _StripScriptFromHtml
        End Get
        Set(ByVal Value As Boolean)
            _StripScriptFromHtml = Value
        End Set
    End Property

    ''' <summary>
    ''' Strip all &lt;IFRAME&gt; blocks from any retrieved HTML
    ''' </summary>
    Public Property StripIframes() As Boolean
        Get
            Return _StripIframeFromHtml
        End Get
        Set(ByVal Value As Boolean)
            _StripIframeFromHtml = Value
        End Set
    End Property

    ''' <summary>
    ''' The browser identification string that is sent in all HTTP requests;
    ''' using a different string can produce simplified (downlevel) HTML
    ''' </summary>
    ''' <remarks>
    ''' defaults to browser ID string of vanilla IE6 as seen in XP SP2
    ''' </remarks>
    Public Property BrowserIdString() As String
        Get
            Return WebClient.BrowserIdString
        End Get
        Set(ByVal Value As String)
            WebClient.BrowserIdString = Value
        End Set
    End Property

    ''' <summary>
    ''' the target URL requires authentication
    ''' if not provided, the current user's credentials will automatically be sent
    ''' </summary>
    Public Property AuthenticationRequired() As Boolean
        Get
            Return WebClient.AuthenticationRequired
        End Get
        Set(ByVal Value As Boolean)
            WebClient.AuthenticationRequired = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP Authentication user for Url
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property AuthenticationUser() As String
        Get
            Return WebClient.AuthenticationUser
        End Get
        Set(ByVal Value As String)
            WebClient.AuthenticationUser = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP Authentication password for Url
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property AuthenticationPassword() As String
        Get
            Return WebClient.AuthenticationPassword
        End Get
        Set(ByVal Value As String)
            WebClient.AuthenticationPassword = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP proxy username
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property ProxyUser() As String
        Get
            Return WebClient.ProxyUser
        End Get
        Set(ByVal Value As String)
            WebClient.ProxyUser = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP proxy password
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property ProxyPassword() As String
        Get
            Return WebClient.ProxyPassword
        End Get
        Set(ByVal Value As String)
            WebClient.ProxyPassword = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP proxy URL
    ''' if provided, proxy will always be used; if left blank, proxy will not be used
    ''' </summary>
    Public Property ProxyUrl() As String
        Get
            Return WebClient.ProxyUrl
        End Get
        Set(ByVal Value As String)
            WebClient.ProxyUrl = Value
        End Set
    End Property

    ''' <summary>
    ''' Proxy requires authentication
    ''' if not provided, the current user's credentials will automatically be sent
    ''' </summary>
    Public Property ProxyAuthenticationRequired() As Boolean
        Get
            Return WebClient.ProxyAuthenticationRequired
        End Get
        Set(ByVal Value As Boolean)
            WebClient.ProxyAuthenticationRequired = Value
        End Set
    End Property

    ''' <summary>
    ''' allow recursive retrieval of any embedded HTML (typically IFRAME or FRAME)
    ''' turn off to prevent infinite recursion in the case of pages that reference themselves..
    ''' </summary>
    Public Property AllowRecursiveFileRetrieval() As Boolean
        Get
            Return _AllowRecursion
        End Get
        Set(ByVal Value As Boolean)
            _AllowRecursion = Value
        End Set
    End Property

#End Region

#Region "  Public"

    ''' <summary>
    ''' Saves URL to disk as a single HTML file, modified with absolute external references
    ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
    ''' </summary>
    ''' <param name="outputFilePath">path to generate to, or filename to generate</param>
    ''' <param name="url">fully qualified URL you wish to save</param>
    ''' <returns>the complete path of the HTML file that was saved to disk</returns>
    Public Function SavePage(ByVal outputFilePath As String, Optional ByVal url As String = "") As String
        ValidateFilename(outputFilePath, ".htm;.html")
        DownloadHtmlFile(url)
        _HtmlFile.UseHtmlTitleAsFilename = True
        _HtmlFile.DownloadPath = outputFilePath
        _HtmlFile.SaveToFile()
        Return _HtmlFile.DownloadPath
    End Function

    ''' <summary>
    ''' Saves URL to disk as a plain text file, stripping all HTML from it
    ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
    ''' </summary>
    ''' <param name="outputFilePath">path to generate to, or filename to generate</param>
    ''' <param name="url">fully qualified URL you wish to save</param>
    ''' <returns>the complete path of the text file that was saved to disk</returns>
    Public Function SavePageText(ByVal outputFilePath As String, Optional ByVal url As String = "") As String
        ValidateFilename(outputFilePath, ".txt")
        DownloadHtmlFile(url)
        _HtmlFile.UseHtmlTitleAsFilename = True
        _HtmlFile.DownloadPath = outputFilePath
        _HtmlFile.SaveAsTextFile()
        Return Path.ChangeExtension(_HtmlFile.DownloadPath, ".txt")
    End Function

    ''' <summary>
    ''' Saves URL to disk as multiple files: a single HTML file, modified with local references
    ''' to externally referenced files in a subfolder
    ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
    ''' </summary>
    ''' <param name="outputFilePath">path to generate to, or filename to generate</param>
    ''' <param name="url">fully qualified URL you wish to save</param>
    ''' <returns>the complete path of the HTML file that was saved to disk</returns>
    Public Function SavePageComplete(ByVal outputFilePath As String, Optional ByVal url As String = "") As String
        ValidateFilename(outputFilePath, ".htm;.html")
        DownloadHtmlFile(url)

        '-- first, let's get all the external files
        _HtmlFile.DownloadPath = outputFilePath
        _HtmlFile.UseHtmlTitleAsFilename = True
        _HtmlFile.DownloadExternalFiles(FileStorage.DiskPermanent, _AllowRecursion)

        '-- convert any references in external files
        For Each de As DictionaryEntry In WebFiles
            Dim ef As WebFile = DirectCast(de.Value, WebFile)
            If ef.IsHtml Or ef.IsCss Then
                ef.ConvertReferencesToLocal()
                ef.SaveToFile()
            End If
        Next

        '-- convert the main HTML references
        _HtmlFile.ConvertReferencesToLocal()
        _HtmlFile.SaveToFile()

        Return _HtmlFile.DownloadPath
    End Function

    ''' <summary>
    ''' Generates a string representation of the URL as a Mht archive file
    ''' using exclusively in-memory storage
    ''' </summary>
    ''' <param name="url">fully qualified URL you wish to render to Mht</param>
    ''' <returns>string representation of MHT file</returns>
    Public Function GetPageArchive(Optional ByVal url As String = "") As String
        DownloadHtmlFile(url)
        _HtmlFile.DownloadExternalFiles(FileStorage.Memory, _AllowRecursion)
        AppendMhtHeader(_HtmlFile)
        AppendMhtFiles()
        Return FinalizeMht()
    End Function

    ''' <summary>
    ''' Saves URL to disk as a single file Mht archive
    ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
    ''' </summary>
    ''' <param name="outputFilePath">path to generate to, or filename to generate</param>
    ''' <param name="st">type of storage to use when generating the Mht archive</param>
    ''' <param name="url">fully qualified URL you wish to save as Mht</param>
    ''' <returns>the complete path of the Mht archive file that was generated</returns>
    Public Function SavePageArchive(ByVal outputFilePath As String, ByVal st As FileStorage, Optional ByVal url As String = "") As String
        ValidateFilename(outputFilePath, ".mht")
        DownloadHtmlFile(url)

        _HtmlFile.DownloadPath = outputFilePath
        _HtmlFile.UseHtmlTitleAsFilename = True

        '-- if set to permanent disk storage, make a local copy of the HTML
        If st = FileStorage.DiskPermanent Then
            _HtmlFile.SaveToFile(Path.ChangeExtension(_HtmlFile.DownloadPath, ".htm"))
        End If

        '-- download all references
        _HtmlFile.DownloadExternalFiles(st, _AllowRecursion)

        '-- build the Mht 
        AppendMhtHeader(_HtmlFile)
        AppendMhtFiles()
        FinalizeMht(Path.ChangeExtension(_HtmlFile.DownloadPath, ".mht"))

        '-- possibly destroy temporary resources
        If st = FileStorage.DiskTemporary Then
            For Each de As DictionaryEntry In WebFiles
                Dim ef As WebFile = DirectCast(de.Value, WebFile)
                If ef.Storage = FileStorage.DiskTemporary Then
                    File.Delete(ef.DownloadPath)
                End If
                '-- if the temp folder is empty, kill that too
                If Directory.GetFileSystemEntries(ef.DownloadFolder).Length = 0 Then
                    Directory.Delete(ef.DownloadFolder)
                End If
            Next
        End If
        WebFiles.Clear()

        Return Path.ChangeExtension(_HtmlFile.DownloadPath, ".mht")
    End Function

#End Region

#Region "  Private"

    ''' <summary>
    ''' returns the root HTML we'll use to generate everything else;
    ''' this is tracked in the _HtmlFile object, which is always FileStorage.Memory
    ''' </summary>
    Private Sub DownloadHtmlFile(ByVal url As String)
        If url <> "" Then
            Me.Url = url
        End If
        _HtmlFile.Storage = FileStorage.Memory
        _HtmlFile.WasAppended = False
        _HtmlFile.Download()
        If Not _HtmlFile.WasDownloaded Then
            Throw New Exception("unable to download '" & Me.Url & "': " & _
                _HtmlFile.DownloadException.Message, _HtmlFile.DownloadException)
        End If
    End Sub

    ''' <summary>
    ''' returns true if this path refers to a directory (vs. a filename)
    ''' </summary>
    Private Function IsDirectory(ByVal FilePath As String) As Boolean
        Return FilePath.EndsWith("\")
    End Function


    ''' <summary>
    ''' ensures that the path, if it contains a filename, matches one of the semicolon delimited 
    ''' filetypes provided in fileExtension
    ''' </summary>
    Private Sub ValidateFilename(ByVal FilePath As String, ByVal fileExtensions As String)
        If IsDirectory(FilePath) Then Return
        Dim ext As String = Path.GetExtension(FilePath)
        If ext = "" Then
            Throw New Exception("The filename provided, '" & Path.GetFileName(FilePath) & _
                "', has no extension. If are specifying a folder, make sure it ends in a trailing slash. " & _
                "The expected file extension(s) are '" & fileExtensions & "'")
        End If
        If Not Regex.IsMatch(fileExtensions, ext & "(;|$)", RegexOptions.IgnoreCase) Then
            Throw New Exception("The extension of the filename provided, '" & Path.GetFileName(FilePath) & _
                "', does not have the expected extension(s) '" & fileExtensions & "'")
        End If
    End Sub

    ''' <summary>
    ''' removes all unsafe filesystem characters to form a valid filesystem filename
    ''' </summary>
    Private Function MakeValidFilename(ByVal s As String) As String
        '-- replace any invalid filesystem chars with underscore
        Return Regex.Replace(s, "[\/\\\:\*\?\""\<\>\|]", "_")
    End Function

    ''' <summary>
    ''' appends all downloaded files (from _ExternalFiles) to our MhtBuilder
    ''' </summary>
    ''' <param name="st">type of storage to use when downloading external files</param>
    ''' <param name="storagePath">path to use for downloaded external files</param>
    Private Sub AppendMhtFiles()
        For Each de As DictionaryEntry In WebFiles
            Dim ef As WebFile = DirectCast(de.Value, WebFile)
            AppendMhtFile(ef)
        Next
        AppendMhtBoundary()
    End Sub

    ''' <summary>
    ''' appends the Mht header, which includes the root HTML
    ''' </summary>
    Private Sub AppendMhtHeader(ByVal ef As WebFile)
        '-- clear the stringbuilder contents
        _MhtBuilder = New StringBuilder

        AppendMhtLine("From: <Saved by " & Environment.UserName & " on " & Environment.MachineName & ">")
        AppendMhtLine("Subject: " & ef.HtmlTitle)
        AppendMhtLine("Date: " & DateTime.Now.ToString("ddd, dd MMM yyyy HH:mm:ss zzz"))
        AppendMhtLine("MIME-Version: 1.0")
        AppendMhtLine("Content-Type: multipart/related;")
        AppendMhtLine(Convert.ToChar(9) & "type=""text/html"";")
        AppendMhtLine(Convert.ToChar(9) & "boundary=""" & _MimeBoundaryTag & """")
        AppendMhtLine("X-MimeOLE: Produced by " & Me.GetType.ToString & " " & _
            Reflection.Assembly.GetExecutingAssembly.GetName.Version.ToString())
        AppendMhtLine()
        AppendMhtLine("This is a multi-part message in MIME format.")

        AppendMhtFile(ef)
    End Sub

    ''' <summary>
    ''' append a single line, with trailing CRLF, to our MhtBuilder
    ''' </summary>
    Private Sub AppendMhtLine(Optional ByVal s As String = "")
        _MhtBuilder.Append(s)
        _MhtBuilder.Append(Environment.NewLine)
    End Sub

    ''' <summary>
    ''' appends a boundary marker to our MhtBuilder
    ''' </summary>
    Private Sub AppendMhtBoundary()
        AppendMhtLine()
        AppendMhtLine("--" & _MimeBoundaryTag)
    End Sub

    ''' <summary>
    ''' Appends a downloaded external file to our MhtBuilder
    ''' </summary>
    Private Sub AppendMhtFile(ByVal ef As WebFile)
        If ef.WasDownloaded And Not ef.WasAppended Then
            If ef.IsBinary Then
                AppendMhtBinaryFile(ef)
            Else
                AppendMhtTextFile(ef)
            End If
        End If
        ef.WasAppended = True
    End Sub

    ''' <summary>
    ''' Appends a downloaded external text file to our MhtBuilder using Quoted-Printable encoding
    ''' </summary>
    Private Sub AppendMhtTextFile(ByVal ef As WebFile)
        AppendMhtBoundary()
        AppendMhtLine("Content-Type: " & ef.ContentType & ";")
        AppendMhtLine(Convert.ToChar(9) & "charset=""" & ef.TextEncoding.WebName & """")
        AppendMhtLine("Content-Transfer-Encoding: quoted-printable")
        AppendMhtLine("Content-Location: " & ef.Url)
        AppendMhtLine()
        AppendMhtLine(QuotedPrintableEncode(ef.ToString, ef.TextEncoding))
    End Sub

    ''' <summary>
    ''' Appends a downloaded external binary file to our MhtBuilder using Base64 encoding
    ''' </summary>
    Private Sub AppendMhtBinaryFile(ByVal ef As WebFile)
        AppendMhtBoundary()
        AppendMhtLine("Content-Type: " & ef.ContentType)
        AppendMhtLine("Content-Transfer-Encoding: base64")
        AppendMhtLine("Content-Location: " & ef.Url)
        AppendMhtLine()

        '-- note that chunk size is equal to maximum line width (expanded = 75 chars)
        Const ChunkSize As Integer = 57

        If ef.Storage = FileStorage.Memory Then
            Dim len As Integer = ef.DownloadedBytes.Length
            If len <= ChunkSize Then
                AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, 0, len))
            Else
                Dim i As Integer = 0
                Do While i + ChunkSize < len
                    AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, i, ChunkSize))
                    i += ChunkSize
                Loop
                If i <> len Then
                    AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, i, len - i))
                End If
            End If
        Else
            Dim fs As IO.FileStream
            Dim b(ChunkSize) As Byte
            Dim BytesRead As Integer
            Try
                fs = New FileStream(ef.DownloadPath, FileMode.Open, FileAccess.Read)
                BytesRead = fs.Read(b, 0, ChunkSize)
                Do While BytesRead > 0
                    AppendMhtLine(Convert.ToBase64String(b, 0, BytesRead))
                    BytesRead = fs.Read(b, 0, ChunkSize)
                Loop
            Finally
                If Not fs Is Nothing Then
                    fs.Close()
                End If
            End Try
        End If
    End Sub

    ''' <summary>
    ''' dumps our MhtBuilder to disk and clears it
    ''' </summary>
    Private Sub FinalizeMht(ByVal outputFilePath As String)
        Dim sr As New StreamWriter(outputFilePath, False, _HtmlFile.TextEncoding)
        sr.Write(_MhtBuilder.ToString)
        sr.Close()
        _MhtBuilder = Nothing
    End Sub

    ''' <summary>
    ''' dumps our MhtBuilder as a string and clears it
    ''' </summary>
    Private Function FinalizeMht() As String
        Dim s As String = _MhtBuilder.ToString
        _MhtBuilder = Nothing
        Return s
    End Function

#End Region

#Region "  Quoted-Printable encoding"

    ''' <summary>
    ''' converts a string into Quoted-Printable encoding
    '''   http://www.freesoft.org/CIE/RFC/1521/6.htm
    ''' </summary>
    Private Function QuotedPrintableEncode(ByVal s As String, ByVal e As System.Text.Encoding) As String
        Dim Ascii As Integer
        Dim LastSpace As Integer = 0
        Dim LineLength As Integer = 0
        Dim LineBreaks As Integer = 0
        Dim sb As New StringBuilder
        Dim longchar As String

        If s Is Nothing OrElse s.Length = 0 Then
            Return ""
        End If

        For Each c As Char In s

            Ascii = Convert.ToInt32(c)

            If Ascii = 61 Or Ascii > 126 Then
                If Ascii <= 255 Then
                    sb.Append("=")
                    sb.Append(Convert.ToString(Ascii, 16).ToUpper)
                    LineLength += 3
                Else
                    '-- double-byte land..
                    For Each b As Byte In e.GetBytes(c)
                        sb.Append("=")
                        sb.Append(Convert.ToString(b, 16).ToUpper)
                        LineLength += 3
                    Next
                End If
            Else
                sb.Append(c)
                LineLength += 1
                If Ascii = 32 Then LastSpace = sb.Length
            End If

            If LineLength >= 73 Then
                If LastSpace = 0 Then
                    sb.Insert(sb.Length, "=" & Environment.NewLine)
                    LineLength = 0
                Else
                    sb.Insert(LastSpace, "=" & Environment.NewLine)
                    LineLength = sb.Length - LastSpace - 1
                End If
                LineBreaks += 1
                LastSpace = 0
            End If

        Next

        '-- if we split the line, have to indicate trailing spaces
        If LineBreaks > 0 Then
            If sb.Chars(sb.Length - 1) = " " Then
                sb.Remove(sb.Length - 1, 1)
                sb.Append("=20")
            End If
        End If

        Return sb.ToString
    End Function

#End Region


End Class

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
United States United States
My name is Jeff Atwood. I live in Berkeley, CA with my wife, two cats, and far more computers than I care to mention. My first computer was the Texas Instruments TI-99/4a. I've been a Microsoft Windows developer since 1992; primarily in VB. I am particularly interested in best practices and human factors in software development, as represented in my recommended developer reading list. I also have a coding and human factors related blog at www.codinghorror.com.

Comments and Discussions