Click here to Skip to main content
15,881,803 members
Articles / Programming Languages / Visual Basic

Convert any URL to a MHTML archive using native .NET code

Rate me:
Please Sign up or sign in to vote.
4.85/5 (59 votes)
3 Apr 20055 min read 638.4K   5.7K   164  
A native .NET class for saving URLs: text-only, HTML page, HTML archive, or HTML complete.
Imports System.IO
Imports System.Net
Imports System.Text.RegularExpressions
Imports ICSharpCode.SharpZipLib

''' <summary>
''' This is a class similar to <c>System.Net.WebClient</c>, but with:
'''   - Autodetection of content encoding
'''   - Proxy support
'''   - Authentication support
'''   - Cookie retention
'''   - HTTP compression (via SharpZipLib)
'''   - IfModifiedSince
''' </summary>
''' <remarks>
'''   Jeff Atwood
'''   http://www.codinghorror.com/
''' </remarks>
Friend Class WebClientEx

    Private Const _AcceptedEncodings As String = "gzip,deflate"

    Private _DefaultEncoding As System.Text.Encoding
    Private _DetectedEncoding As System.Text.Encoding
    Private _ForcedEncoding As System.Text.Encoding
    Private _DetectedContentType As String
    Private _ContentLocation As String
    Private _ResponseBytes As Byte()
    Private _AuthenticationRequired As Boolean
    Private _ProxyAuthenticationRequired As Boolean
    Private _ProxyUrl As String
    Private _ProxyUser As String
    Private _ProxyPassword As String
    Private _AuthenticationUser As String
    Private _AuthenticationPassword As String
    Private _KeepCookies As Boolean
    Private _RequestTimeoutMilliseconds As Integer
    Private _PersistedCookies As CookieContainer
    '-- http://www.zytrax.com/tech/web/browser_ids.htm
    Private _HttpUserAgent As String

    Public Sub New()
        '-- sets default values
        Clear()
    End Sub

#Region "  Properties"

    ''' <summary>
    ''' this is the string encoding that was autodetected from the HTML content
    ''' </summary>
    Public ReadOnly Property DetectedEncoding() As System.Text.Encoding
        Get
            Return _DetectedEncoding
        End Get
    End Property

    ''' <summary>
    ''' Bypass detection of content encoding and force a specific encoding
    ''' </summary>
    Public Property ForcedEncoding() As System.Text.Encoding
        Get
            Return _ForcedEncoding
        End Get
        Set(ByVal Value As System.Text.Encoding)
            _ForcedEncoding = Value
        End Set
    End Property

    ''' <summary>
    ''' if the correct string encoding type cannot be detected, or detection is disabled
    ''' this is the default string encoding that will be used.
    ''' </summary>
    Public Property DefaultEncoding() As System.Text.Encoding
        Get
            Return _DefaultEncoding
        End Get
        Set(ByVal Value As System.Text.Encoding)
            _DefaultEncoding = Value
        End Set
    End Property

    ''' <summary>
    ''' this is the string encoding that was autodetected from the HTML content
    ''' </summary>
    Public ReadOnly Property ResponseContentType() As String
        Get
            Return _DetectedContentType
        End Get
    End Property

    ''' <summary>
    ''' Returns true if the last HTTP response was in a non-text format
    ''' </summary>
    Public ReadOnly Property ResponseIsBinary() As Boolean
        Get
            '-- if we truly have no content-type, we're kinda hosed, but 
            '-- let's assume the response is text data to be on the safe side
            If _DetectedContentType = "" Then
                Return False
            Else
                Return _DetectedContentType.IndexOf("text") = -1
            End If
        End Get
    End Property

    ''' <summary>
    ''' Returns a string representation, with encoding, of the last HTTP response data
    ''' </summary>
    Public ReadOnly Property ResponseString() As String
        Get
            If Me.ResponseIsBinary Then
                Return "(" & _ResponseBytes.Length & " bytes of binary data)"
            Else
                If _ForcedEncoding Is Nothing Then
                    Return _DetectedEncoding.GetString(_ResponseBytes)
                Else
                    Return _ForcedEncoding.GetString(_ResponseBytes)
                End If
            End If
        End Get
    End Property

    ''' <summary>
    ''' Returns the raw bytestream representing the last HTTP response data
    ''' </summary>
    Public ReadOnly Property ResponseBytes() As Byte()
        Get
            Return _ResponseBytes
        End Get
    End Property

    ''' <summary>
    ''' Returns the actual location of the downloaded content, which can 
    ''' be different than the requested URL, eg, http://web.com/IsThisAfolderOrNot
    ''' </summary>
    Public ReadOnly Property ContentLocation() As String
        Get
            Return _ContentLocation
        End Get
    End Property

    ''' <summary>
    ''' Browser ID string to send with web requests
    ''' note that many popular websites will serve alternate content based on this value
    ''' </summary>
    ''' <remarks>
    ''' defaults to browser ID string of vanilla IE6 as seen in XP SP2
    ''' </remarks>
    Public Property BrowserIdString() As String
        Get
            Return _HttpUserAgent
        End Get
        Set(ByVal Value As String)
            _HttpUserAgent = Value
        End Set
    End Property

    ''' <summary>
    ''' how long, in milliseconds, to wait for HTTP content to arrive before timing out
    ''' </summary>
    Public Property TimeoutMilliseconds() As Integer
        Get
            Return _RequestTimeoutMilliseconds
        End Get
        Set(ByVal Value As Integer)
            _RequestTimeoutMilliseconds = Value
        End Set
    End Property

    ''' <summary>
    ''' URL of the web proxy to use
    ''' if left blank, no Proxy will be used; if provided, will ALWAYS be used!
    ''' </summary>
    Public Property ProxyUrl() As String
        Get
            Return _ProxyUrl
        End Get
        Set(ByVal Value As String)
            _ProxyUrl = Value
        End Set
    End Property

    ''' <summary>
    ''' username to use for Proxy authentication
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property ProxyUser() As String
        Get
            Return _ProxyUser
        End Get
        Set(ByVal Value As String)
            _ProxyUser = Value
        End Set
    End Property

    ''' <summary>
    ''' password to use for Proxy authentication
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property ProxyPassword() As String
        Get
            Return _ProxyPassword
        End Get
        Set(ByVal Value As String)
            _ProxyPassword = Value
        End Set
    End Property

    ''' <summary>
    ''' the Proxy requires authentication
    ''' if credentials are not explicitly provided, the current user's credentials will automatically be sent
    ''' </summary>
    Public Property ProxyAuthenticationRequired() As Boolean
        Get
            Return _ProxyAuthenticationRequired
        End Get
        Set(ByVal Value As Boolean)
            _ProxyAuthenticationRequired = Value
        End Set
    End Property

    ''' <summary>
    ''' username for authentication to the target URL
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property AuthenticationUser() As String
        Get
            Return _AuthenticationUser
        End Get
        Set(ByVal Value As String)
            _AuthenticationUser = Value
        End Set
    End Property

    ''' <summary>
    ''' password for authentication to the target URL
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property AuthenticationPassword() As String
        Get
            Return _AuthenticationPassword
        End Get
        Set(ByVal Value As String)
            _AuthenticationPassword = Value
        End Set
    End Property

    ''' <summary>
    ''' the target URL requires authentication
    ''' if credentials are not explicitly provided, the current user's credentials will automatically be sent
    ''' </summary>
    Public Property AuthenticationRequired() As Boolean
        Get
            Return _AuthenticationRequired
        End Get
        Set(ByVal Value As Boolean)
            _AuthenticationRequired = Value
        End Set
    End Property

    ''' <summary>
    ''' Retains cookies for all subsequent HTTP requests from this object
    ''' </summary>
    Public Property KeepCookies() As Boolean
        Get
            Return _KeepCookies
        End Get
        Set(ByVal Value As Boolean)
            _KeepCookies = Value
        End Set
    End Property

#End Region

    ''' <summary>
    ''' The Content-Encoding entity-header field is used as a modifier to the media-type. 
    ''' When present, its value indicates what additional content codings have been applied 
    ''' to the entity-body, and thus what decoding mechanisms must be applied in order to 
    ''' obtain the media-type referenced by the Content-Type header field. Content-Encoding 
    ''' is primarily used to allow a document to be compressed without losing the identity 
    ''' of its underlying media type. 
    ''' </summary>
    ''' <remarks>
    ''' http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.5
    ''' </remarks>
    Private Enum HttpContentEncoding
        None
        Gzip
        Compress
        Deflate
        Unknown
    End Enum

    ''' <summary>
    ''' attempt to convert this charset string into a named .NET text encoding
    ''' </summary>
    Private Function CharsetToEncoding(ByVal Charset As String) As System.Text.Encoding
        If Charset = "" Then Return Nothing
        Try
            Return System.Text.Encoding.GetEncoding(Charset)
        Catch ex As System.ArgumentException
            Return Nothing
        End Try
    End Function

    ''' <summary>
    ''' try to determine string encoding using Content-Type HTTP header and
    ''' raw HTTP content bytes
    ''' "Content-Type: text/html; charset=us-ascii"
    ''' &lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"/&gt;
    ''' </summary>
    Private Function DetectEncoding(ByVal ContentTypeHeader As String, ByVal ResponseBytes() As Byte) As System.Text.Encoding

        Dim s As String
        Dim encoding As System.Text.Encoding

        '-- first try the header
        s = Regex.Match(ContentTypeHeader, "charset=([^;""'/>]+)", _
                RegexOptions.IgnoreCase).Groups(1).ToString.ToLower
        encoding = CharsetToEncoding(s)

        '-- if we can't get it from header, try the body bytes forced to ASCII
        If encoding Is Nothing Then
            s = Regex.Match(System.Text.Encoding.ASCII.GetString(ResponseBytes), _
                "<meta[^>]+content-type[^>]+charset=([^;""'/>]+)", _
                RegexOptions.IgnoreCase).Groups(1).ToString.ToLower
            encoding = CharsetToEncoding(s)
            If encoding Is Nothing Then
                Return _DefaultEncoding
            End If
        End If

        Return encoding
    End Function

    ''' <summary>
    ''' returns a collection of bytes from a Url
    ''' </summary>
    ''' <param name="Url">URL to retrieve</param>
    Public Sub GetUrlData(ByVal Url As String, ByVal ifModifiedSince As DateTime)

        Dim wreq As HttpWebRequest = DirectCast(WebRequest.Create(Url), HttpWebRequest)

        '-- do we need to use a proxy to get to the web?
        If _ProxyUrl <> "" Then
            Dim wp As New WebProxy(_ProxyUrl)
            If _ProxyAuthenticationRequired Then
                If _ProxyUser <> "" And _ProxyPassword <> "" Then
                    wp.Credentials = New NetworkCredential(_ProxyUser, _ProxyPassword)
                Else
                    wp.Credentials = CredentialCache.DefaultCredentials
                End If
                wreq.Proxy = wp
            End If
        End If

        '-- does the target website require credentials?
        If _AuthenticationRequired Then
            If _AuthenticationUser <> "" And _AuthenticationPassword <> "" Then
                wreq.Credentials = New NetworkCredential(_AuthenticationUser, _AuthenticationPassword)
            Else
                wreq.Credentials = CredentialCache.DefaultCredentials
            End If
        End If

        wreq.Method = "GET"
        wreq.Timeout = _RequestTimeoutMilliseconds
        wreq.UserAgent = _HttpUserAgent
        wreq.Headers.Add("Accept-Encoding", _AcceptedEncodings)
        '-- note that, if present, this will trigger a 304 exception
        '-- if the URL being retrieved is not newer than the specified
        '-- date/time
        If ifModifiedSince <> DateTime.MinValue Then
            wreq.IfModifiedSince = ifModifiedSince
        End If

        '-- sometimes we need to transfer cookies to another URL; 
        '-- this keeps them around in the object
        If KeepCookies Then
            If _PersistedCookies Is Nothing Then
                _PersistedCookies = New CookieContainer
            End If
            wreq.CookieContainer = _PersistedCookies
        End If

        '-- download the target URL into a byte array
        Dim wresp As HttpWebResponse = DirectCast(wreq.GetResponse, HttpWebResponse)

        '-- convert response stream to byte array
        Dim ebr As New ExtendedBinaryReader(wresp.GetResponseStream)
        _ResponseBytes = ebr.ReadToEnd()

        '-- determine if body bytes are compressed, and if so, 
        '-- decompress the bytes
        Dim ContentEncoding As HttpContentEncoding
        If wresp.Headers.Item("Content-Encoding") Is Nothing Then
            ContentEncoding = HttpContentEncoding.None
        Else
            Select Case wresp.Headers.Item("Content-Encoding").ToLower
                Case "gzip"
                    ContentEncoding = HttpContentEncoding.Gzip
                Case "deflate"
                    ContentEncoding = HttpContentEncoding.Deflate
                Case Else
                    ContentEncoding = HttpContentEncoding.Unknown
            End Select
            _ResponseBytes = Decompress(_ResponseBytes, ContentEncoding)
        End If

        '-- sometimes URL is indeterminate, eg, "http://website.com/myfolder"
        '-- in that case the folder and file resolution MUST be done on 
        '-- the server, and returned to the client as ContentLocation
        _ContentLocation = wresp.Headers("Content-Location")
        If _ContentLocation Is Nothing Then
            _ContentLocation = ""
        End If

        '-- if we have string content, determine encoding type
        '-- (must cast to prevent Nothing)
        _DetectedContentType = wresp.Headers("Content-Type")
        If _DetectedContentType Is Nothing Then
            _DetectedContentType = ""
        End If
        If Me.ResponseIsBinary Then
            _DetectedEncoding = Nothing
        Else
            If _ForcedEncoding Is Nothing Then
                _DetectedEncoding = DetectEncoding(_DetectedContentType, _ResponseBytes)
            End If
        End If
    End Sub

    ''' <summary>
    ''' decompresses a compressed array of bytes via the specified HTTP compression type
    ''' </summary>
    ''' <returns>decompressed array of bytes</returns>
    Private Function Decompress(ByVal b() As Byte, ByVal CompressionType As HttpContentEncoding) As Byte()
        Dim s As Stream
        Select Case CompressionType
            Case HttpContentEncoding.Deflate
                s = New Zip.Compression.Streams.InflaterInputStream(New MemoryStream(b), _
                    New Zip.Compression.Inflater(True))
            Case HttpContentEncoding.Gzip
                s = New GZip.GZipInputStream(New MemoryStream(b))
            Case Else
                Return b
        End Select

        Dim ms As New MemoryStream
        Const chunkSize As Integer = 2048

        Dim sizeRead As Integer
        Dim unzipBytes(chunkSize) As Byte
        While True
            sizeRead = s.Read(unzipBytes, 0, chunkSize)
            If sizeRead > 0 Then
                ms.Write(unzipBytes, 0, sizeRead)
            Else
                Exit While
            End If
        End While
        s.Close()

        Return ms.ToArray
    End Function

    ''' <summary>
    ''' download URL contents to a file, using HTTP compression if possible
    ''' </summary>
    Public Sub DownloadFile(ByVal Url As String, ByVal FilePath As String)
        DownloadFile(Url, FilePath, DateTime.MinValue)
    End Sub

    ''' <summary>
    ''' download URL contents to a file, using HTTP compression if possible
    ''' URL contents will only be downloaded if newer than the specified date
    ''' </summary>
    Public Sub DownloadFile(ByVal Url As String, ByVal FilePath As String, ByVal ifModifiedSince As DateTime)
        GetUrlData(Url, ifModifiedSince)
        SaveResponseToFile(FilePath)
    End Sub

    Private Sub SaveResponseToFile(ByVal FilePath As String)
        Dim fs As FileStream
        Dim bw As BinaryWriter
        Try
            fs = New FileStream(FilePath, FileMode.OpenOrCreate)
            bw = New BinaryWriter(fs)
            bw.Write(_ResponseBytes)
            bw.Close()
        Finally
            If Not fs Is Nothing Then fs.Close()
        End Try
        Return
    End Sub

    ''' <summary>
    ''' download URL contents to an array of bytes, using HTTP compression if possible
    ''' </summary>
    Public Function DownloadBytes(ByVal Url As String) As Byte()
        Return DownloadBytes(Url, DateTime.MinValue)
    End Function

    ''' <summary>
    ''' download URL contents to an array of bytes, using HTTP compression if possible
    ''' URL contents will only be downloaded if newer than the specified date
    ''' </summary>
    Public Function DownloadBytes(ByVal Url As String, ByVal ifModifiedSince As DateTime) As Byte()
        GetUrlData(Url, ifModifiedSince)
        Return _ResponseBytes
    End Function

    ''' <summary>
    ''' download URL contents to a string (with appropriate encoding), using HTTP compression if possible
    ''' </summary>
    Public Function DownloadString(ByVal Url As String) As String
        GetUrlData(Url, DateTime.MinValue)
        Return Me.ResponseString
    End Function

    ''' <summary>
    ''' download URL contents to a string (with appropriate encoding), using HTTP compression if possible
    ''' URL contents will only be downloaded if newer than the specified date
    ''' </summary>
    Public Function DownloadString(ByVal Url As String, ByVal ifModifiedSince As DateTime) As String
        GetUrlData(Url, ifModifiedSince)
        Return Me.ResponseString
    End Function

    ''' <summary>
    ''' clears any downloaded content and resets all properties to default values
    ''' </summary>
    Public Sub Clear()
        ClearDownload()
        _DefaultEncoding = System.Text.Encoding.GetEncoding("Windows-1252")
        _ForcedEncoding = Nothing
        _AuthenticationRequired = False
        _ProxyAuthenticationRequired = False
        _ProxyUrl = ""
        _ProxyUser = ""
        _ProxyPassword = ""
        _AuthenticationUser = ""
        _AuthenticationPassword = ""
        _KeepCookies = False
        _RequestTimeoutMilliseconds = 60000
        _HttpUserAgent = _
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)"
    End Sub

    ''' <summary>
    ''' clears any downloaded content
    ''' </summary>
    Public Sub ClearDownload()
        _ResponseBytes = Nothing
        _DetectedEncoding = Nothing
        _DetectedContentType = ""
        _ContentLocation = ""
        _PersistedCookies = Nothing
    End Sub

#Region " ExtendedBinaryReader"

    ''' <summary>
    '''   Extends the <c>System.IO.BinaryReader</c> class by a <c>ReadToEnd</c>
    '''   method that can be used to read a whole file.
    ''' </summary>
    ''' <remarks>
    ''' http://dotnet.mvps.org/dotnet/faqs/?id=readfile&amp;lang=en
    ''' </remarks>
    Public Class ExtendedBinaryReader
        Inherits BinaryReader

        ''' <summary>
        '''   Creates a new instance of the <c>ExtendedBinaryReader</c> class.
        ''' </summary>
        ''' <param name="Input">A stream.</param>
        Public Sub New(ByVal Input As Stream)
            MyBase.New(Input)
        End Sub

        ''' <summary>
        '''   Creates a new instance of the <c>ExtendedBinaryReader</c> class.
        ''' </summary>
        ''' <param name="Input">The provided stream.</param>
        ''' <param name="Encoding">The character encoding.</param>
        Public Sub New(ByVal Input As Stream, ByVal Encoding As System.Text.Encoding)
            MyBase.New(Input, Encoding)
        End Sub

        ''' <summary>
        '''   Reads the whole data in the base stream and returns it in an
        '''   array of bytes.
        ''' </summary>
        ''' <returns>The streams whole binary data.</returns>
        Public Function ReadToEnd() As Byte()
            Return ReadToEnd(Short.MaxValue)
        End Function

        ''' <summary>
        '''   Reads the whole data in the base stream and returns it in an
        '''   array of bytes.
        ''' </summary>
        ''' <param name="InitialLength">The initial buffer length.</param>
        ''' <returns>The stream's whole binary data.</returns>
        ' Based on an implementation by Jon Skeet [MVP]
        ' (<URL:http://www.yoda.arachsys.com/csharp/readbinary.html>).
        Public Function ReadToEnd(ByVal InitialLength As Integer) As Byte()

            ' If an unhelpful initial length was passed, just use 32K.
            If InitialLength < 1 Then
                InitialLength = Short.MaxValue
            End If
            Dim Buffer(InitialLength - 1) As Byte
            Dim Read As Integer
            Dim Chunk As Integer = _
                Me.BaseStream.Read(Buffer, Read, Buffer.Length - Read)
            Do While Chunk > 0
                Read = Read + Chunk

                ' If the end of the buffer is reached, check to see if there is
                ' any more data.
                If Read = Buffer.Length Then
                    Dim NextByte As Integer = Me.BaseStream.ReadByte()

                    ' If the end of the stream is reached, we are done.
                    If NextByte = -1 Then
                        Return Buffer
                    End If

                    ' Nope.  Resize the buffer, put in the byte we have just
                    ' read, and continue.
                    Dim NewBuffer(Buffer.Length * 2 - 1) As Byte
                    System.Buffer.BlockCopy( _
                        Buffer, _
                        0, _
                        NewBuffer, _
                        0, _
                        Buffer.Length _
                    )
                    'Array.Copy(Buffer, NewBuffer, Buffer.Length)
                    NewBuffer(Read) = CByte(NextByte)
                    Buffer = NewBuffer
                    Read = Read + 1
                End If
                Chunk = Me.BaseStream.Read(Buffer, Read, Buffer.Length - Read)
            Loop

            ' The buffer is now too big.  Shrink it.
            Dim ReturnBuffer(Read - 1) As Byte
            System.Buffer.BlockCopy(Buffer, 0, ReturnBuffer, 0, Read)
            'Array.Copy(Buffer, ReturnBuffer, Read)
            Return ReturnBuffer
        End Function
    End Class

#End Region

End Class

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
United States United States
My name is Jeff Atwood. I live in Berkeley, CA with my wife, two cats, and far more computers than I care to mention. My first computer was the Texas Instruments TI-99/4a. I've been a Microsoft Windows developer since 1992; primarily in VB. I am particularly interested in best practices and human factors in software development, as represented in my recommended developer reading list. I also have a coding and human factors related blog at www.codinghorror.com.

Comments and Discussions