Click here to Skip to main content
15,880,651 members
Articles / Programming Languages / Visual Basic

Convert any URL to a MHTML archive using native .NET code

Rate me:
Please Sign up or sign in to vote.
4.85/5 (59 votes)
3 Apr 20055 min read 638.1K   5.7K   164  
A native .NET class for saving URLs: text-only, HTML page, HTML archive, or HTML complete.
Imports System.Text.RegularExpressions
Imports System.IO

''' <summary>
''' represents an external file referenced in our parent HTML at the target URL
''' </summary>
''' <remarks>
'''   Jeff Atwood
'''   http://www.codinghorror.com/
''' </remarks>
Friend Class WebFile
    Private _Builder As Builder
    Private _Url As String
    Private _UrlUnmodified As String
    Private _UrlRoot As String
    Private _UrlFolder As String
    Private _ContentType As String
    Private _IsBinary As Boolean
    Private _TextEncoding As System.Text.Encoding
    Private _ContentLocation As String
    Private _DownloadedBytes() As Byte
    Private _DownloadException As Exception = Nothing
    Private _WasDownloaded As Boolean = False
    Private _DownloadFilename As String = ""
    Private _DownloadFolder As String = ""
    Private _DownloadExtension As String = ""
    Private _UseHtmlFilename As Boolean = False
    Private _ExternalFileCollection As Specialized.NameValueCollection

    Public Storage As Builder.FileStorage = Builder.FileStorage.DiskPermanent
    Public WasAppended As Boolean = False

    Public Sub New(ByVal parent As Builder, ByVal st As Builder.FileStorage)
        Me.Storage = st
        _Builder = parent
    End Sub

    Public Sub New(ByVal parent As Builder, ByVal url As String, ByVal st As Builder.FileStorage)
        _Builder = parent
        If url <> "" Then
            Me.Url = url
        End If
        Me.Storage = st
    End Sub

    ''' <summary>
    ''' The URL target for this file
    ''' </summary>
    Public Property Url() As String
        Get
            Return _Url
        End Get
        Set(ByVal Value As String)
            _UrlUnmodified = Value
            SetUrl(Value, True)
            ReDim _DownloadedBytes(0)
            _ExternalFileCollection = Nothing
            _DownloadException = Nothing
            _TextEncoding = Nothing
            _ContentType = ""
            _ContentLocation = ""
            _IsBinary = False
            _WasDownloaded = False
        End Set
    End Property

    ''' <summary>
    ''' If enabled, will use the first 50 characters of the TITLE tag 
    ''' to form the filename when saved to disk
    ''' </summary>
    Public Property UseHtmlTitleAsFilename() As Boolean
        Get
            Return _UseHtmlFilename
        End Get
        Set(ByVal Value As Boolean)
            _UseHtmlFilename = Value
        End Set
    End Property

    ''' <summary>
    ''' the folder name used in the DownloadFolder
    ''' </summary>
    Public ReadOnly Property DownloadFolderName() As String
        Get
            Return Regex.Match(Me.DownloadFolder, "(?<Folder>[^\\]+)\\*$").Groups("Folder").Value
        End Get
    End Property

    ''' <summary>
    ''' folder to download this file to
    ''' if no folder is provided, the current application folder will be used
    ''' </summary>
    Public Property DownloadFolder() As String
        Get
            If _DownloadFolder = "" Then
                _DownloadFolder = AppDomain.CurrentDomain.BaseDirectory
            End If
            Return _DownloadFolder
        End Get
        Set(ByVal Value As String)
            _DownloadFolder = Value
        End Set
    End Property

    ''' <summary>
    ''' filename to download this file as
    ''' if no filename is provided, a filename will be auto-generated based on
    ''' the URL; if the UseHtmlTitleAsFilename property is true, then the
    ''' title tag will be used to generate the filename
    ''' </summary>
    Public Property DownloadFilename() As String
        Get
            If _DownloadFilename = "" Then
                If _UseHtmlFilename AndAlso (Me.WasDownloaded AndAlso Me.IsHtml) Then
                    Dim htmlTitle As String = Me.HtmlTitle
                    If htmlTitle <> "" Then
                        _DownloadFilename = MakeValidFilename(htmlTitle) & ".htm"
                    End If
                Else
                    _DownloadFilename = FilenameFromUrl()
                End If
            End If
            Return _DownloadFilename
        End Get
        Set(ByVal Value As String)
            _DownloadFilename = Value
        End Set
    End Property

    ''' <summary>
    ''' fully qualified path and filename to download this file to
    ''' </summary>
    Public Property DownloadPath() As String
        Get
            If Path.GetExtension(Me.DownloadFilename) = "" Then
                Return Path.Combine(Me.DownloadFolder, Me.DownloadFilename & Me.DownloadExtension)
            Else
                Return Path.Combine(Me.DownloadFolder, Me.DownloadFilename)
            End If
        End Get
        Set(ByVal Value As String)
            _DownloadFilename = Path.GetFileName(Value)
            If _DownloadFilename = "" Then
                _DownloadFolder = Value
            Else
                _DownloadFolder = Value.Replace(_DownloadFilename, "")
            End If
        End Set
    End Property

    ''' <summary>
    ''' file type extension to use on downloaded file
    ''' this property is only used if the DownloadFilename property does not
    ''' already contain a file extension
    ''' </summary>
    Public Property DownloadExtension() As String
        Get
            If _DownloadExtension = "" Then
                If Me.WasDownloaded Then
                    _DownloadExtension = ExtensionFromContentType()
                End If
            End If
            Return _DownloadExtension
        End Get
        Set(ByVal Value As String)
            _DownloadExtension = Value
        End Set
    End Property

    ''' <summary>
    ''' If this file has external dependencies, the folder they will be stored on disk
    ''' </summary>
    Public ReadOnly Property ExternalFilesFolder() As String
        Get
            Return Path.Combine(Me.DownloadFolder, Path.GetFileNameWithoutExtension(Me.DownloadFilename)) & "_files"
        End Get
    End Property

    ''' <summary>
    ''' The unmodified "raw" URL as originally provided
    ''' </summary>
    Public ReadOnly Property UrlUnmodified() As String
        Get
            Return _UrlUnmodified
        End Get
    End Property

    ''' <summary>
    ''' The Content-Location of this URL as provided by the server,
    ''' only if the URL was not fully qualified;
    ''' eg, http://mywebsite.com/ actually maps to http://mywebsite.com/default.htm 
    ''' </summary>
    Public ReadOnly Property UrlContentLocation() As String
        Get
            Return _ContentLocation
        End Get
    End Property

    ''' <summary>
    ''' The root of the URL, eg, http://mywebsite.com/
    ''' </summary>
    Public ReadOnly Property UrlRoot() As String
        Get
            Return _UrlRoot
        End Get
    End Property

    ''' <summary>
    ''' The root and folder of the URL, eg, http://mywebsite.com/myfolder
    ''' </summary>
    Public ReadOnly Property UrlFolder() As String
        Get
            Return _UrlFolder
        End Get
    End Property

    ''' <summary>
    ''' Was this file successfully downloaded via HTTP?
    ''' </summary>
    Public ReadOnly Property WasDownloaded() As Boolean
        Get
            Return _WasDownloaded
        End Get
    End Property

    ''' <summary>
    ''' The Content-Type of this file as returned by the server
    ''' </summary>
    Public ReadOnly Property ContentType() As String
        Get
            Return _ContentType
        End Get
    End Property

    ''' <summary>
    ''' Does this file contain binary data? If not, it must be text data.
    ''' </summary>
    Public ReadOnly Property IsBinary() As Boolean
        Get
            Return _IsBinary
        End Get
    End Property

    ''' <summary>
    ''' The raw bytes returned from the server for this file
    ''' </summary>
    Public ReadOnly Property DownloadedBytes() As Byte()
        Get
            Return _DownloadedBytes
        End Get
    End Property

    ''' <summary>
    ''' If not .WasDownloaded, the exception that prevented download is stored here
    ''' </summary>
    Public ReadOnly Property DownloadException() As Exception
        Get
            Return _DownloadException
        End Get
    End Property

    ''' <summary>
    ''' If this file is text (eg, it isn't binary), the type of text encoding used
    ''' </summary>
    Public ReadOnly Property TextEncoding() As System.Text.Encoding
        Get
            Return _TextEncoding
        End Get
    End Property

    ''' <summary>
    ''' Is this file HTML content?
    ''' </summary>
    Public ReadOnly Property IsHtml() As Boolean
        Get
            Return Regex.IsMatch(_ContentType, "text/html", RegexOptions.IgnoreCase)
        End Get
    End Property

    ''' <summary>
    ''' Is this file CSS content?
    ''' </summary>
    Public ReadOnly Property IsCss() As Boolean
        Get
            Return Regex.IsMatch(_ContentType, "text/css", RegexOptions.IgnoreCase)
        End Get
    End Property

    ''' <summary>
    ''' If this file is HTML, retrieve the &lt;TITLE&gt; tag from the HTML
    ''' (maximum of 50 characters)
    ''' </summary>
    Public ReadOnly Property HtmlTitle() As String
        Get
            If Not Me.IsHtml Then
                Throw New Exception("This file isn't HTML, so it has no HTML <TITLE> tag.")
            End If
            Const maxLength As Integer = 50
            Dim s As String = _
                Regex.Match(Me.ToString, "<title[^>]*?>(?<text>[^<]+)</title>", _
                RegexOptions.IgnoreCase Or RegexOptions.Singleline).Groups("text").Value()
            If s.Length > maxLength Then
                Return s.Substring(0, maxLength)
            Else
                Return s
            End If
        End Get
    End Property

    ''' <summary>
    ''' Returns a string representation of the data downloaded for this file
    ''' </summary>
    Public Overrides Function ToString() As String
        If Not _WasDownloaded Then
            Download()
        End If
        If Not _WasDownloaded Then
            Return ""
        Else
            If _DownloadedBytes.Length > 0 Then
                If _IsBinary Then
                    Return "[" & _DownloadedBytes.Length & " bytes of binary data]"
                Else
                    Return TextEncoding.GetString(_DownloadedBytes)
                End If
            End If
        End If
    End Function

    ''' <summary>
    ''' Download this file from the target URL
    ''' </summary>
    Public Sub Download()
        Debug.Write("Downloading " & _Url & "  ..")
        DownloadBytes()
        If _DownloadException Is Nothing Then
            Debug.WriteLine("OK")
        Else
            Debug.WriteLine("failed: ", "Error")
            Debug.WriteLine("    " & _DownloadException.Message, "Error")
            Return
        End If

        If Me.IsHtml Then
            _DownloadedBytes = _TextEncoding.GetBytes(ProcessHtml(Me.ToString))
        End If

        If Me.IsCss Then
            _DownloadedBytes = _TextEncoding.GetBytes(ProcessHtml(Me.ToString))
        End If

        If Me.Storage <> Builder.FileStorage.Memory Then
            Me.SaveToFile()
        End If
    End Sub

    ''' <summary>
    ''' download this file from the target URL;
    ''' place the bytes downloaded in _DownloadedBytes
    ''' if an exception occurs, capture it in _DownloadException
    ''' </summary>
    Private Sub DownloadBytes()
        If Me.WasDownloaded Then Return

        '-- always download to memory first
        Try
            _DownloadedBytes = _Builder.WebClient.DownloadBytes(_Url)
            _WasDownloaded = True
        Catch ex As Net.WebException
            _DownloadException = ex
            _Builder.WebClient.ClearDownload()
        End Try

        '-- necessary if the original client URL was imprecise; 
        '-- server location is always authoritatitve
        If _Builder.WebClient.ContentLocation <> "" Then
            _ContentLocation = _Builder.WebClient.ContentLocation
            SetUrl(_ContentLocation, False)
        End If

        _IsBinary = _Builder.WebClient.ResponseIsBinary
        _ContentType = _Builder.WebClient.ResponseContentType
        _TextEncoding = _Builder.WebClient.DetectedEncoding
        _Builder.WebClient.ClearDownload()
    End Sub

    Private Sub SetUrl(ByVal url As String, ByVal validate As Boolean)
        If validate Then
            _Url = ResolveUrl(url)
        Else
            _Url = url
        End If
        '-- http://mywebsite
        _UrlRoot = Regex.Match(url, "http://[^/'""]+", RegexOptions.IgnoreCase).ToString
        '-- http://mywebsite/myfolder
        If _Url.LastIndexOf("/") > 7 Then
            _UrlFolder = _Url.Substring(0, _Url.LastIndexOf("/"))
        Else
            _UrlFolder = _UrlRoot
        End If
    End Sub

    ''' <summary>
    ''' Pre-process the CSS using global preference settings
    ''' </summary>
    Private Function ProcessCss(ByVal css As String) As String
        Return ConvertRelativeToAbsoluteRefs(css)
    End Function

    ''' <summary>
    ''' Pre-process the HTML using global preference settings
    ''' </summary>
    Private Function ProcessHtml(ByVal html As String) As String
        Debug.WriteLine("Downloaded content was HTML/CSS -- processing: resolving URLs, getting <base>, etc")
        If _Builder.AddWebMark Then
            '-- add "mark of the web":
            '-- http://www.microsoft.com/technet/prodtechnol/winxppro/maintain/sp2brows.mspx#XSLTsection133121120120
            html = "<!-- saved from url=(" & String.Format("{0:0000}", _Url.Length) & ")" & _Url & " -->" & _
                Environment.NewLine & html
        End If

        '-- see if we need to strip elements from the HTML
        If _Builder.StripScripts Then
            html = StripHtmlTag("script", html)
        End If
        If _Builder.StripIframes Then
            html = StripHtmlTag("iframe", html)
        End If

        '-- if we have a <base>, we must use it as the _UrlFolder, 
        '-- not what was parsed from the original _Url
        Dim BaseUrlFolder As String = _
            Regex.Match(html, _
            "<base[^>]+?href=['""]{0,1}(?<BaseUrl>[^'"">]+)['""]{0,1}", _
            RegexOptions.IgnoreCase).Groups("BaseUrl").Value
        If BaseUrlFolder <> "" Then
            If BaseUrlFolder.EndsWith("/") Then
                _UrlFolder = BaseUrlFolder.Substring(0, BaseUrlFolder.Length - 1)
            Else
                _UrlFolder = BaseUrlFolder
            End If
        End If

        '-- remove the <base href=''> tag if present; causes problems when viewing locally.
        html = Regex.Replace(html, "<base[^>]*?>", "")

        '-- relative URLs are a PITA for the processing we're about to do, 
        '-- so convert them all to absolute up front
        Return ConvertRelativeToAbsoluteRefs(html)
    End Function

    ''' <summary>
    ''' converts all relative url references
    '''    href="myfolder/mypage.htm"
    ''' into absolute url references
    '''    href="http://mywebsite/myfolder/mypage.htm"
    ''' </summary>
    Private Function ConvertRelativeToAbsoluteRefs(ByVal html As String) As String
        Dim r As Regex

        Dim urlPattern As String = _
            "(?<attrib>\shref|\ssrc|\sbackground)\s*?=\s*?" & _
            "(?<delim1>[""'\\]{0,2})(?!\s*\+|#|http:|ftp:|mailto:|javascript:)" & _
            "/(?<url>[^""'>\\]+)(?<delim2>[""'\\]{0,2})"

        Dim cssPattern As String = _
            "(?<attrib>@import\s|\S+-image:|background:)\s*?(url)*['""(]{1,2}" & _
            "(?!http)\s*/(?<url>[^""')]+)['"")]{1,2}"

        '-- href="/anything" to href="http://www.web.com/anything"
        r = New Regex(urlPattern, _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        html = r.Replace(html, "${attrib}=${delim1}" & _UrlRoot & "/${url}${delim2}")

        '-- href="anything" to href="http://www.web.com/folder/anything"
        r = New Regex(urlPattern.Replace("/", ""), _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        html = r.Replace(html, "${attrib}=${delim1}" & _UrlFolder & "/${url}${delim2}")

        '-- @import(/anything) to @import url(http://www.web.com/anything)
        r = New Regex(cssPattern, _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        html = r.Replace(html, "${attrib} url(" & _UrlRoot & "/${url})")

        '-- @import(anything) to @import url(http://www.web.com/folder/anything)
        r = New Regex(cssPattern.Replace("/", ""), _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        html = r.Replace(html, "${attrib} url(" & _UrlFolder & "/${url})")

        Return html
    End Function

    ''' <summary>
    ''' returns a name/value collection of all external files referenced in HTML:
    ''' 
    '''     "/myfolder/blah.png"
    '''     'http://mywebsite/blah.gif'
    '''     src=blah.jpg  
    ''' 
    ''' note that the Key includes the delimiting quotes or parens (if present), but the Value does not
    ''' this is important because the delimiters are used for matching and replacement to make the
    ''' match more specific!
    ''' </summary>
    Private Function ExternalHtmlFiles() As Specialized.NameValueCollection
        '-- avoid doing this work twice, however, be careful that the HTML hasn't
        '-- changed since the last time we called this function
        If Not _ExternalFileCollection Is Nothing Then
            Return _ExternalFileCollection
        End If

        _ExternalFileCollection = New Specialized.NameValueCollection
        Dim r As Regex
        Dim html As String = Me.ToString

        Debug.WriteLine("Resolving all external HTML references from URL:")
        Debug.WriteLine("    " & Me.Url)

        '-- src='filename.ext' ; background="filename.ext"
        '-- note that we have to test 3 times to catch all quote styles: '', "", and none
        r = New Regex( _
            "(\ssrc|\sbackground)\s*=\s*((?<Key>'(?<Value>[^']+)')|(?<Key>""(?<Value>[^""]+)"")|(?<Key>(?<Value>[^ \n\r\f]+)))", _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        AddMatchesToCollection(html, r, _ExternalFileCollection)

        '-- @import "style.css" or @import url(style.css)
        r = New Regex( _
            "(@import\s|\S+-image:|background:)\s*?(url)*\s*?(?<Key>[""'(]{1,2}(?<Value>[^""')]+)[""')]{1,2})", _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        AddMatchesToCollection(html, r, _ExternalFileCollection)

        '-- <link rel=stylesheet href="style.css">
        r = New Regex( _
            "<link[^>]+?href\s*=\s*(?<Key>('|"")*(?<Value>[^'"">]+)('|"")*)", _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        AddMatchesToCollection(html, r, _ExternalFileCollection)

        '-- <iframe src="mypage.htm"> or <frame src="mypage.aspx">
        r = New Regex( _
            "<i*frame[^>]+?src\s*=\s*(?<Key>['""]{0,1}(?<Value>[^'""\\>]+)['""]{0,1})", _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        AddMatchesToCollection(html, r, _ExternalFileCollection)

        Return _ExternalFileCollection
    End Function


    ''' <summary>
    ''' perform the regex replacement of all &lt;tagName&gt; .. &lt;/tagName&gt; blocks
    ''' </summary>
    Private Function StripHtmlTag(ByVal tagName As String, ByVal html As String) As String
        Dim reg As Regex = New Regex( _
        String.Format("<{0}[^>]*?>[\w|\t|\r|\W]*?</{0}>", tagName), _
            RegexOptions.Multiline Or RegexOptions.IgnoreCase)
        Return reg.Replace(html, "")
    End Function

    ''' <summary>
    ''' Returns the plain text representation of the data in this file, 
    ''' stripping out any HTML tags and codes
    ''' </summary>
    Public Function ToTextString(Optional ByVal removeWhitespace As Boolean = False) As String
        Dim html As String = Me.ToString
        '-- get rid of <script> .. </script>
        html = StripHtmlTag("script", html)
        '-- get rid of <style> .. </style>
        html = StripHtmlTag("style", html)

        '-- get rid of all HTML tags
        html = Regex.Replace(html, "<\w+(\s+[A-Za-z0-9_\-]+\s*=\s*(""([^""]*)""|'([^']*)'))*\s*(/)*>|<[^>]+>", " ")
        '-- convert escaped HTML to plaintext
        html = Web.HttpUtility.HtmlDecode(html)

        If removeWhitespace Then
            '-- clean up whitespace (optional, depends what you want..)
            html = Regex.Replace(html, "[\n\r\f\t]", " ", RegexOptions.Multiline)
            html = Regex.Replace(html, " {2,}", " ", RegexOptions.Multiline)
        End If
        Return html
    End Function

    ''' <summary>
    ''' Saves this file to disk as a plain text file
    ''' </summary>
    Public Sub SaveAsTextFile()
        SaveToFile(Path.ChangeExtension(Me.DownloadPath, ".txt"), True)
    End Sub

    ''' <summary>
    ''' Saves this file to disk as a plain text file, to an arbitrary path
    ''' </summary>
    Public Sub SaveAsTextFile(ByVal filePath As String)
        SaveToFile(filePath, True)
    End Sub

    ''' <summary>
    ''' writes contents of file to DownloadPath, using appropriate encoding as necessary
    ''' </summary>
    Public Sub SaveToFile()
        SaveToFile(Me.DownloadPath, False)
    End Sub

    ''' <summary>
    ''' writes contents of file to DownloadPath, using appropriate encoding as necessary
    ''' </summary>
    Public Sub SaveToFile(ByVal filePath As String)
        SaveToFile(filePath, False)
    End Sub

    ''' <summary>
    ''' sets the DownloadPath and writes contents of file, using appropriate encoding as necessary
    ''' </summary>
    Private Sub SaveToFile(ByVal filePath As String, ByVal asText As Boolean)
        Debug.WriteLine("Saving to file " & filePath)
        Dim fs As New FileStream(filePath, FileMode.OpenOrCreate)
        Try
            Dim bw As New BinaryWriter(fs)
            If Me.IsBinary Then
                bw.Write(_DownloadedBytes)
            Else
                If asText Then
                    bw.Write(Me.ToTextString)
                Else
                    bw.Write(_DownloadedBytes)
                End If
            End If
            bw.Close()
        Finally
            If Not fs Is Nothing Then
                fs.Close()
            End If
        End Try
    End Sub

    ''' <summary>
    ''' fully resolves any relative pathing inside the URL, and other URL oddities
    ''' </summary>
    Private Function ResolveUrl(ByVal url As String) As String
        '-- resolve any relative pathing
        Try
            url = (New System.Uri(url)).AbsoluteUri
        Catch ex As System.UriFormatException
            Throw New ArgumentException("'" & url & "' does not appear to be a valid URL.", ex)
        End Try
        '-- remove any anchor tags from the end of URLs
        If url.IndexOf("#") > -1 Then
            Dim jump As String = Regex.Match(url, "/[^/]*?(?<jump>#[^/?.]+$)").Groups("jump").Value
            If jump <> "" Then
                url = url.Replace(jump, "")
            End If
        End If
        Return url
    End Function

    ''' <summary>
    ''' if the user passed in a directory, form the filename automatically using the Html title tag
    ''' if the user passed in a filename, make sure the extension matches our desired extension
    ''' </summary>
    Private Function DeriveFilename(ByVal FilePath As String, _
        ByVal html As String, _
        ByVal fileExtension As String) As String

        If IsDirectory(FilePath) Then
            Dim htmlTitle As String = Me.HtmlTitle
            If htmlTitle = "" Then
                Throw New Exception("No filename was provided, and the HTML title tag was not found, " & _
                    "so a filename could not be automatically generated. You'll need to provide a filename and not a folder.")
            End If
            FilePath = Path.Combine(Path.GetDirectoryName(FilePath), MakeValidFilename(htmlTitle) & fileExtension)
        Else
            If Path.GetExtension(FilePath) <> fileExtension Then
                Return Path.ChangeExtension(FilePath, fileExtension)
            End If
        End If
        Return FilePath
    End Function

    ''' <summary>
    ''' removes all unsafe filesystem characters to form a valid filesystem filename
    ''' </summary>
    Private Function MakeValidFilename(ByVal s As String, Optional ByVal enforceLength As Boolean = False) As String
        If enforceLength Then
        End If
        '-- replace any invalid filesystem chars, plus leading/trailing/doublespaces
        Return Regex.Replace(Regex.Replace(s, "[\/\\\:\*\?\""""\<\>\|]|^\s+|\s+$", ""), "\s{2,}", " ")
    End Function

    ''' <summary>
    ''' returns true if this path refers to a directory (vs. a filename)
    ''' </summary>
    Private Function IsDirectory(ByVal FilePath As String) As Boolean
        Return FilePath.EndsWith("\")
    End Function

    ''' <summary>
    ''' converts all external Html files (gif, jpg, css, etc) to local refs
    ''' external ref:
    '''    &lt;img src="http://mywebsite/myfolder/myimage.gif"&gt;
    ''' into local refs:
    '''    &lt;img src="mypage_files/myimage.gif"&gt;
    ''' </summary>
    Public Sub ConvertReferencesToLocal()

        If Not Me.IsHtml And Not Me.IsCss Then
            Throw New Exception("Converting references only makes sense for HTML or CSS files; this file is of type '" & Me.ContentType & "'")
        End If

        '-- get a list of all external references
        Dim html As String = Me.ToString
        Dim FileCollection As Specialized.NameValueCollection = Me.ExternalHtmlFiles()

        '-- no external refs? nothing to do
        If FileCollection.Count = 0 Then Return

        Dim FolderName As String
        Dim FileUrl As String
        For Each DelimitedFileUrl As String In FileCollection.AllKeys

            FileUrl = FileCollection.Item(DelimitedFileUrl)
            If _Builder.WebFiles.Contains(FileUrl) Then
                Dim wf As WebFile = DirectCast(_Builder.WebFiles.Item(FileUrl), WebFile)

                Dim NewPath As String = Me.ExternalFilesFolder & "/" & wf.DownloadFilename
                Dim DelimitedReplacement As String = Regex.Replace(DelimitedFileUrl, _
                    "^(?<StartDelim>""|'|\()*(?<Value>[^'"")]*)(?<EndDelim>""|'|\))*$", _
                    "${StartDelim}" & NewPath & "${EndDelim}")

                '-- correct original Url references in Html so they point to our local files
                html = html.Replace(DelimitedFileUrl, DelimitedReplacement)
            End If

        Next

        _DownloadedBytes = _TextEncoding.GetBytes(html)
    End Sub


    ''' <summary>
    ''' appends key=value named matches in a regular expression
    ''' to a target NameValueCollection
    ''' </summary>
    Private Sub AddMatchesToCollection(ByVal s As String, _
        ByVal r As Regex, _
        ByRef nvc As Specialized.NameValueCollection)

        Dim key As String
        Dim value As String
        Dim headerDisplayed As Boolean = False
        Dim urlRegex As New Regex("^https*://\w+", RegexOptions.IgnoreCase)

        For Each m As Match In r.Matches(s)
            If Not headerDisplayed Then
                Debug.WriteLine("Matches added from regex:")
                Debug.WriteLine("""" & r.ToString & """")
                headerDisplayed = True
            End If
            key = m.Groups("Key").ToString
            value = m.Groups("Value").ToString
            If nvc.Item(key) Is Nothing Then
                Debug.WriteLine(" Match: " & m.ToString)
                Debug.WriteLine("   Key: " & key)
                Debug.WriteLine(" Value: " & value)
                If Not urlRegex.IsMatch(value) Then
                    Debug.WriteLine("Match discarded; does not appear to be valid fully qualified http:// Url", "Error")
                Else
                    nvc.Add(key, value)
                End If
            End If
        Next
    End Sub

    ''' <summary>
    ''' download ALL externally referenced files in this file's html, potentially recursively,
    ''' to the default download path for this page
    ''' </summary>
    Public Sub DownloadExternalFiles(ByVal st As Builder.FileStorage, Optional ByVal recursive As Boolean = False)
        DownloadExternalFiles(st, Me.ExternalFilesFolder, recursive)
    End Sub

    ''' <summary>
    ''' download ALL externally referenced files in this html, potentially recursively
    ''' to a specific download path
    ''' </summary>
    Private Sub DownloadExternalFiles(ByVal st As Builder.FileStorage, ByVal targetFolder As String, ByVal recursive As Boolean)
        Dim FileCollection As Specialized.NameValueCollection = ExternalHtmlFiles()
        If Not FileCollection.HasKeys Then Return
        Debug.WriteLine("Downloading all external files collected from URL:")
        Debug.WriteLine("    " & Url)
        For Each Key As String In FileCollection.AllKeys
            DownloadExternalFile(FileCollection.Item(Key), st, targetFolder, recursive)
        Next
    End Sub

    ''' <summary>
    ''' Download a single externally referenced file (if we haven't already downloaded it)
    ''' </summary>
    Private Sub DownloadExternalFile(ByVal url As String, ByVal st As Builder.FileStorage, _
        ByVal targetFolder As String, Optional ByVal recursive As Boolean = False)

        Dim wf As WebFile
        Dim isNew As Boolean

        '-- have we already downloaded (or attempted to) this file?
        If _Builder.WebFiles.Contains(url) Or _Builder.Url = url Then
            wf = DirectCast(_Builder.WebFiles.Item(url), WebFile)
            isNew = False
        Else
            wf = New WebFile(_Builder, url, st)
            isNew = True
        End If

        '-- if we're planning to store this file on disk, make sure we can
        If st = Builder.FileStorage.DiskPermanent Or st = Builder.FileStorage.DiskTemporary Then
            If Not Directory.Exists(targetFolder) Then
                Directory.CreateDirectory(targetFolder)
            End If
            wf.DownloadFolder = targetFolder
        End If

        wf.Download()

        If isNew Then
            '-- add this (possibly) downloaded file to our shared collection
            _Builder.WebFiles.Add(wf.UrlUnmodified, wf)

            '-- if this is an HTML file, it has dependencies of its own;
            '-- download them into a subfolder
            If (wf.IsHtml Or wf.IsCss) And recursive Then
                wf.DownloadExternalFiles(st, recursive)
            End If
        End If

    End Sub

    ''' <summary>
    ''' attempt to get a coherent filename out of the Url
    ''' </summary>
    Private Function FilenameFromUrl() As String
        '-- first, try to get a filename out of the URL itself;
        '-- this means anything past the final slash that doesn't include another slash
        '-- or a question mark, eg http://mywebsite/myfolder/crazy?param=1&param=2
        Dim filename As String = Regex.Match(_Url, "/(?<Filename>[^/?]+)[^/]*$").Groups("Filename").Value
        If filename <> "" Then
            '-- that worked, but we need to make sure the filename is unique
            '-- if query params were passed to the URL file
            Dim u As New Uri(_Url)
            If u.Query <> "" Then
                filename = Path.GetFileNameWithoutExtension(filename) & "_" & u.Query.GetHashCode.ToString & Me.DownloadExtension
            End If
        End If
        '-- ok, that didn't work; if this file is HTML try to get the TITLE tag
        If filename = "" Then
            If Me.IsHtml Then
                filename = Me.HtmlTitle
                If filename <> "" Then
                    filename &= ".htm"
                End If
            End If
        End If
        '-- now we're really desperate. Hash the URL and make that the filename.
        If filename = "" Then
            filename = _Url.GetHashCode.ToString & Me.DownloadExtension
        End If
        Return MakeValidFilename(filename)
    End Function

    ''' <summary>
    ''' if we weren't given a filename extension, infer it from the download
    ''' Content-Type header
    ''' </summary>
    ''' <remarks>
    ''' http://www.utoronto.ca/webdocs/HTMLdocs/Book/Book-3ed/appb/mimetype.html
    ''' </remarks>
    Private Function ExtensionFromContentType() As String
        Select Case Regex.Match(Me.ContentType, "^[^ ;]+").Value.ToLower
            Case "text/html"
                Return ".htm"
            Case "image/gif"
                Return ".gif"
            Case "image/jpeg"
                Return ".jpg"
            Case "text/javascript", "application/x-javascript"
                Return ".js"
            Case "image/x-png"
                Return ".png"
            Case "text/css"
                Return ".css"
            Case "text/plain"
                Return ".txt"
            Case Else
                Debug.WriteLine("Unknown content-type '" & Me.ContentType & "'", "Error")
                Return ".htm"
        End Select
    End Function

End Class

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
United States United States
My name is Jeff Atwood. I live in Berkeley, CA with my wife, two cats, and far more computers than I care to mention. My first computer was the Texas Instruments TI-99/4a. I've been a Microsoft Windows developer since 1992; primarily in VB. I am particularly interested in best practices and human factors in software development, as represented in my recommended developer reading list. I also have a coding and human factors related blog at www.codinghorror.com.

Comments and Discussions