Imports System.Text
Imports System.Text.RegularExpressions
Imports System.IO
''' <summary>
''' This class builds the following from a URL:
'''
''' .mht file (Web Archive, single file)
''' .htm file with dereferenced (absolute) references (Web Page, HTML Only)
''' .htm file plus all referenced files in a local subfolder (Web Page, complete)
''' .txt file (non-HTML contents of Web Page)
'''
''' The .mht format is based on RFC2557
''' "compliant Multipart MIME Message (mhtml web archive)"
''' http://www.ietf.org/rfc/rfc2557.txt
''' </summary>
''' <remarks>
''' Jeff Atwood
''' http://www.codinghorror.com/
''' </remarks>
Public Class Builder
Private _MhtBuilder As StringBuilder
Private _StripScriptFromHtml As Boolean = False
Private _StripIframeFromHtml As Boolean = False
Private _AllowRecursion As Boolean = True
Private _AddWebMark As Boolean = True
Private _ForcedEncoding As System.Text.Encoding = Nothing
Private _HtmlFile As WebFile
Friend WebFiles As New SortedList
Friend WebClient As New WebClientEx
Private Const _MimeBoundaryTag As String = "----=_NextPart_000_00"
Public Enum FileStorage
Memory
DiskPermanent
DiskTemporary
End Enum
Public Sub New()
_HtmlFile = New WebFile(Me, FileStorage.Memory)
End Sub
#Region " Properties"
''' <summary>
''' Specifies the target Url we want to save
''' </summary>
Public Property Url() As String
Get
Return _HtmlFile.Url
End Get
Set(ByVal Value As String)
WebFiles.Clear()
_HtmlFile.Url = Value
End Set
End Property
''' <summary>
''' returns the Mime content-type string designation of a mht file
''' </summary>
Public ReadOnly Property MhtContentType() As String
Get
Return "message/rfc822"
End Get
End Property
''' <summary>
''' *only* set this if you want to FORCE a specific text encoding for all the HTML pages you're downloading;
''' otherwise the text encoding is autodetected, which is generally what you want
''' </summary>
Public Property TextEncoding() As System.Text.Encoding
Get
Return _ForcedEncoding
End Get
Set(ByVal Value As System.Text.Encoding)
_ForcedEncoding = Value
End Set
End Property
''' <summary>
''' Add the "Mark of the web" to retrieved HTML content so it can run
''' locally on Windows XP SP2
''' </summary>
''' <remarks>
''' http://www.microsoft.com/technet/prodtechnol/winxppro/maintain/sp2brows.mspx#XSLTsection133121120120
''' </remarks>
Public Property AddWebMark() As Boolean
Get
Return _AddWebMark
End Get
Set(ByVal Value As Boolean)
_AddWebMark = Value
End Set
End Property
''' <summary>
''' Strip all <SCRIPT> blocks from any retrieved HTML
''' </summary>
Public Property StripScripts() As Boolean
Get
Return _StripScriptFromHtml
End Get
Set(ByVal Value As Boolean)
_StripScriptFromHtml = Value
End Set
End Property
''' <summary>
''' Strip all <IFRAME> blocks from any retrieved HTML
''' </summary>
Public Property StripIframes() As Boolean
Get
Return _StripIframeFromHtml
End Get
Set(ByVal Value As Boolean)
_StripIframeFromHtml = Value
End Set
End Property
''' <summary>
''' The browser identification string that is sent in all HTTP requests;
''' using a different string can produce simplified (downlevel) HTML
''' </summary>
''' <remarks>
''' defaults to browser ID string of vanilla IE6 as seen in XP SP2
''' </remarks>
Public Property BrowserIdString() As String
Get
Return WebClient.BrowserIdString
End Get
Set(ByVal Value As String)
WebClient.BrowserIdString = Value
End Set
End Property
''' <summary>
''' the target URL requires authentication
''' if not provided, the current user's credentials will automatically be sent
''' </summary>
Public Property AuthenticationRequired() As Boolean
Get
Return WebClient.AuthenticationRequired
End Get
Set(ByVal Value As Boolean)
WebClient.AuthenticationRequired = Value
End Set
End Property
''' <summary>
''' HTTP Authentication user for Url
''' if left blank, the current user's credentials will be sent
''' </summary>
Public Property AuthenticationUser() As String
Get
Return WebClient.AuthenticationUser
End Get
Set(ByVal Value As String)
WebClient.AuthenticationUser = Value
End Set
End Property
''' <summary>
''' HTTP Authentication password for Url
''' if left blank, the current user's credentials will be sent
''' </summary>
Public Property AuthenticationPassword() As String
Get
Return WebClient.AuthenticationPassword
End Get
Set(ByVal Value As String)
WebClient.AuthenticationPassword = Value
End Set
End Property
''' <summary>
''' HTTP proxy username
''' if left blank, the current user's credentials will be sent
''' </summary>
Public Property ProxyUser() As String
Get
Return WebClient.ProxyUser
End Get
Set(ByVal Value As String)
WebClient.ProxyUser = Value
End Set
End Property
''' <summary>
''' HTTP proxy password
''' if left blank, the current user's credentials will be sent
''' </summary>
Public Property ProxyPassword() As String
Get
Return WebClient.ProxyPassword
End Get
Set(ByVal Value As String)
WebClient.ProxyPassword = Value
End Set
End Property
''' <summary>
''' HTTP proxy URL
''' if provided, proxy will always be used; if left blank, proxy will not be used
''' </summary>
Public Property ProxyUrl() As String
Get
Return WebClient.ProxyUrl
End Get
Set(ByVal Value As String)
WebClient.ProxyUrl = Value
End Set
End Property
''' <summary>
''' Proxy requires authentication
''' if not provided, the current user's credentials will automatically be sent
''' </summary>
Public Property ProxyAuthenticationRequired() As Boolean
Get
Return WebClient.ProxyAuthenticationRequired
End Get
Set(ByVal Value As Boolean)
WebClient.ProxyAuthenticationRequired = Value
End Set
End Property
''' <summary>
''' allow recursive retrieval of any embedded HTML (typically IFRAME or FRAME)
''' turn off to prevent infinite recursion in the case of pages that reference themselves..
''' </summary>
Public Property AllowRecursiveFileRetrieval() As Boolean
Get
Return _AllowRecursion
End Get
Set(ByVal Value As Boolean)
_AllowRecursion = Value
End Set
End Property
#End Region
#Region " Public"
''' <summary>
''' Saves URL to disk as a single HTML file, modified with absolute external references
''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
''' </summary>
''' <param name="outputFilePath">path to generate to, or filename to generate</param>
''' <param name="url">fully qualified URL you wish to save</param>
''' <returns>the complete path of the HTML file that was saved to disk</returns>
Public Function SavePage(ByVal outputFilePath As String, Optional ByVal url As String = "") As String
ValidateFilename(outputFilePath, ".htm;.html")
DownloadHtmlFile(url)
_HtmlFile.UseHtmlTitleAsFilename = True
_HtmlFile.DownloadPath = outputFilePath
_HtmlFile.SaveToFile()
Return _HtmlFile.DownloadPath
End Function
''' <summary>
''' Saves URL to disk as a plain text file, stripping all HTML from it
''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
''' </summary>
''' <param name="outputFilePath">path to generate to, or filename to generate</param>
''' <param name="url">fully qualified URL you wish to save</param>
''' <returns>the complete path of the text file that was saved to disk</returns>
Public Function SavePageText(ByVal outputFilePath As String, Optional ByVal url As String = "") As String
ValidateFilename(outputFilePath, ".txt")
DownloadHtmlFile(url)
_HtmlFile.UseHtmlTitleAsFilename = True
_HtmlFile.DownloadPath = outputFilePath
_HtmlFile.SaveAsTextFile()
Return Path.ChangeExtension(_HtmlFile.DownloadPath, ".txt")
End Function
''' <summary>
''' Saves URL to disk as multiple files: a single HTML file, modified with local references
''' to externally referenced files in a subfolder
''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
''' </summary>
''' <param name="outputFilePath">path to generate to, or filename to generate</param>
''' <param name="url">fully qualified URL you wish to save</param>
''' <returns>the complete path of the HTML file that was saved to disk</returns>
Public Function SavePageComplete(ByVal outputFilePath As String, Optional ByVal url As String = "") As String
ValidateFilename(outputFilePath, ".htm;.html")
DownloadHtmlFile(url)
'-- first, let's get all the external files
_HtmlFile.DownloadPath = outputFilePath
_HtmlFile.UseHtmlTitleAsFilename = True
_HtmlFile.DownloadExternalFiles(FileStorage.DiskPermanent, _AllowRecursion)
'-- convert any references in external files
For Each de As DictionaryEntry In WebFiles
Dim ef As WebFile = DirectCast(de.Value, WebFile)
If ef.IsHtml Or ef.IsCss Then
ef.ConvertReferencesToLocal()
ef.SaveToFile()
End If
Next
'-- convert the main HTML references
_HtmlFile.ConvertReferencesToLocal()
_HtmlFile.SaveToFile()
Return _HtmlFile.DownloadPath
End Function
''' <summary>
''' Generates a string representation of the URL as a Mht archive file
''' using exclusively in-memory storage
''' </summary>
''' <param name="url">fully qualified URL you wish to render to Mht</param>
''' <returns>string representation of MHT file</returns>
Public Function GetPageArchive(Optional ByVal url As String = "") As String
DownloadHtmlFile(url)
_HtmlFile.DownloadExternalFiles(FileStorage.Memory, _AllowRecursion)
AppendMhtHeader(_HtmlFile)
AppendMhtFiles()
Return FinalizeMht()
End Function
''' <summary>
''' Saves URL to disk as a single file Mht archive
''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
''' </summary>
''' <param name="outputFilePath">path to generate to, or filename to generate</param>
''' <param name="st">type of storage to use when generating the Mht archive</param>
''' <param name="url">fully qualified URL you wish to save as Mht</param>
''' <returns>the complete path of the Mht archive file that was generated</returns>
Public Function SavePageArchive(ByVal outputFilePath As String, ByVal st As FileStorage, Optional ByVal url As String = "") As String
ValidateFilename(outputFilePath, ".mht")
DownloadHtmlFile(url)
_HtmlFile.DownloadPath = outputFilePath
_HtmlFile.UseHtmlTitleAsFilename = True
'-- if set to permanent disk storage, make a local copy of the HTML
If st = FileStorage.DiskPermanent Then
_HtmlFile.SaveToFile(Path.ChangeExtension(_HtmlFile.DownloadPath, ".htm"))
End If
'-- download all references
_HtmlFile.DownloadExternalFiles(st, _AllowRecursion)
'-- build the Mht
AppendMhtHeader(_HtmlFile)
AppendMhtFiles()
FinalizeMht(Path.ChangeExtension(_HtmlFile.DownloadPath, ".mht"))
'-- possibly destroy temporary resources
If st = FileStorage.DiskTemporary Then
For Each de As DictionaryEntry In WebFiles
Dim ef As WebFile = DirectCast(de.Value, WebFile)
If ef.Storage = FileStorage.DiskTemporary Then
File.Delete(ef.DownloadPath)
End If
'-- if the temp folder is empty, kill that too
If Directory.GetFileSystemEntries(ef.DownloadFolder).Length = 0 Then
Directory.Delete(ef.DownloadFolder)
End If
Next
End If
WebFiles.Clear()
Return Path.ChangeExtension(_HtmlFile.DownloadPath, ".mht")
End Function
#End Region
#Region " Private"
''' <summary>
''' returns the root HTML we'll use to generate everything else;
''' this is tracked in the _HtmlFile object, which is always FileStorage.Memory
''' </summary>
Private Sub DownloadHtmlFile(ByVal url As String)
If url <> "" Then
Me.Url = url
End If
_HtmlFile.Storage = FileStorage.Memory
_HtmlFile.WasAppended = False
_HtmlFile.Download()
If Not _HtmlFile.WasDownloaded Then
Throw New Exception("unable to download '" & Me.Url & "': " & _
_HtmlFile.DownloadException.Message, _HtmlFile.DownloadException)
End If
End Sub
''' <summary>
''' returns true if this path refers to a directory (vs. a filename)
''' </summary>
Private Function IsDirectory(ByVal FilePath As String) As Boolean
Return FilePath.EndsWith("\")
End Function
''' <summary>
''' ensures that the path, if it contains a filename, matches one of the semicolon delimited
''' filetypes provided in fileExtension
''' </summary>
Private Sub ValidateFilename(ByVal FilePath As String, ByVal fileExtensions As String)
If IsDirectory(FilePath) Then Return
Dim ext As String = Path.GetExtension(FilePath)
If ext = "" Then
Throw New Exception("The filename provided, '" & Path.GetFileName(FilePath) & _
"', has no extension. If are specifying a folder, make sure it ends in a trailing slash. " & _
"The expected file extension(s) are '" & fileExtensions & "'")
End If
If Not Regex.IsMatch(fileExtensions, ext & "(;|$)", RegexOptions.IgnoreCase) Then
Throw New Exception("The extension of the filename provided, '" & Path.GetFileName(FilePath) & _
"', does not have the expected extension(s) '" & fileExtensions & "'")
End If
End Sub
''' <summary>
''' removes all unsafe filesystem characters to form a valid filesystem filename
''' </summary>
Private Function MakeValidFilename(ByVal s As String) As String
'-- replace any invalid filesystem chars with underscore
Return Regex.Replace(s, "[\/\\\:\*\?\""\<\>\|]", "_")
End Function
''' <summary>
''' appends all downloaded files (from _ExternalFiles) to our MhtBuilder
''' </summary>
''' <param name="st">type of storage to use when downloading external files</param>
''' <param name="storagePath">path to use for downloaded external files</param>
Private Sub AppendMhtFiles()
For Each de As DictionaryEntry In WebFiles
Dim ef As WebFile = DirectCast(de.Value, WebFile)
AppendMhtFile(ef)
Next
AppendMhtBoundary()
End Sub
''' <summary>
''' appends the Mht header, which includes the root HTML
''' </summary>
Private Sub AppendMhtHeader(ByVal ef As WebFile)
'-- clear the stringbuilder contents
_MhtBuilder = New StringBuilder
AppendMhtLine("From: <Saved by " & Environment.UserName & " on " & Environment.MachineName & ">")
AppendMhtLine("Subject: " & ef.HtmlTitle)
AppendMhtLine("Date: " & DateTime.Now.ToString("ddd, dd MMM yyyy HH:mm:ss zzz"))
AppendMhtLine("MIME-Version: 1.0")
AppendMhtLine("Content-Type: multipart/related;")
AppendMhtLine(Convert.ToChar(9) & "type=""text/html"";")
AppendMhtLine(Convert.ToChar(9) & "boundary=""" & _MimeBoundaryTag & """")
AppendMhtLine("X-MimeOLE: Produced by " & Me.GetType.ToString & " " & _
Reflection.Assembly.GetExecutingAssembly.GetName.Version.ToString())
AppendMhtLine()
AppendMhtLine("This is a multi-part message in MIME format.")
AppendMhtFile(ef)
End Sub
''' <summary>
''' append a single line, with trailing CRLF, to our MhtBuilder
''' </summary>
Private Sub AppendMhtLine(Optional ByVal s As String = "")
_MhtBuilder.Append(s)
_MhtBuilder.Append(Environment.NewLine)
End Sub
''' <summary>
''' appends a boundary marker to our MhtBuilder
''' </summary>
Private Sub AppendMhtBoundary()
AppendMhtLine()
AppendMhtLine("--" & _MimeBoundaryTag)
End Sub
''' <summary>
''' Appends a downloaded external file to our MhtBuilder
''' </summary>
Private Sub AppendMhtFile(ByVal ef As WebFile)
If ef.WasDownloaded And Not ef.WasAppended Then
If ef.IsBinary Then
AppendMhtBinaryFile(ef)
Else
AppendMhtTextFile(ef)
End If
End If
ef.WasAppended = True
End Sub
''' <summary>
''' Appends a downloaded external text file to our MhtBuilder using Quoted-Printable encoding
''' </summary>
Private Sub AppendMhtTextFile(ByVal ef As WebFile)
AppendMhtBoundary()
AppendMhtLine("Content-Type: " & ef.ContentType & ";")
AppendMhtLine(Convert.ToChar(9) & "charset=""" & ef.TextEncoding.WebName & """")
AppendMhtLine("Content-Transfer-Encoding: quoted-printable")
AppendMhtLine("Content-Location: " & ef.Url)
AppendMhtLine()
AppendMhtLine(QuotedPrintableEncode(ef.ToString, ef.TextEncoding))
End Sub
''' <summary>
''' Appends a downloaded external binary file to our MhtBuilder using Base64 encoding
''' </summary>
Private Sub AppendMhtBinaryFile(ByVal ef As WebFile)
AppendMhtBoundary()
AppendMhtLine("Content-Type: " & ef.ContentType)
AppendMhtLine("Content-Transfer-Encoding: base64")
AppendMhtLine("Content-Location: " & ef.Url)
AppendMhtLine()
'-- note that chunk size is equal to maximum line width (expanded = 75 chars)
Const ChunkSize As Integer = 57
If ef.Storage = FileStorage.Memory Then
Dim len As Integer = ef.DownloadedBytes.Length
If len <= ChunkSize Then
AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, 0, len))
Else
Dim i As Integer = 0
Do While i + ChunkSize < len
AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, i, ChunkSize))
i += ChunkSize
Loop
If i <> len Then
AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, i, len - i))
End If
End If
Else
Dim fs As IO.FileStream
Dim b(ChunkSize) As Byte
Dim BytesRead As Integer
Try
fs = New FileStream(ef.DownloadPath, FileMode.Open, FileAccess.Read)
BytesRead = fs.Read(b, 0, ChunkSize)
Do While BytesRead > 0
AppendMhtLine(Convert.ToBase64String(b, 0, BytesRead))
BytesRead = fs.Read(b, 0, ChunkSize)
Loop
Finally
If Not fs Is Nothing Then
fs.Close()
End If
End Try
End If
End Sub
''' <summary>
''' dumps our MhtBuilder to disk and clears it
''' </summary>
Private Sub FinalizeMht(ByVal outputFilePath As String)
Dim sr As New StreamWriter(outputFilePath, False, _HtmlFile.TextEncoding)
sr.Write(_MhtBuilder.ToString)
sr.Close()
_MhtBuilder = Nothing
End Sub
''' <summary>
''' dumps our MhtBuilder as a string and clears it
''' </summary>
Private Function FinalizeMht() As String
Dim s As String = _MhtBuilder.ToString
_MhtBuilder = Nothing
Return s
End Function
#End Region
#Region " Quoted-Printable encoding"
''' <summary>
''' converts a string into Quoted-Printable encoding
''' http://www.freesoft.org/CIE/RFC/1521/6.htm
''' </summary>
Private Function QuotedPrintableEncode(ByVal s As String, ByVal e As System.Text.Encoding) As String
Dim Ascii As Integer
Dim LastSpace As Integer = 0
Dim LineLength As Integer = 0
Dim LineBreaks As Integer = 0
Dim sb As New StringBuilder
Dim longchar As String
If s Is Nothing OrElse s.Length = 0 Then
Return ""
End If
For Each c As Char In s
Ascii = Convert.ToInt32(c)
If Ascii = 61 Or Ascii > 126 Then
If Ascii <= 255 Then
sb.Append("=")
sb.Append(Convert.ToString(Ascii, 16).ToUpper)
LineLength += 3
Else
'-- double-byte land..
For Each b As Byte In e.GetBytes(c)
sb.Append("=")
sb.Append(Convert.ToString(b, 16).ToUpper)
LineLength += 3
Next
End If
Else
sb.Append(c)
LineLength += 1
If Ascii = 32 Then LastSpace = sb.Length
End If
If LineLength >= 73 Then
If LastSpace = 0 Then
sb.Insert(sb.Length, "=" & Environment.NewLine)
LineLength = 0
Else
sb.Insert(LastSpace, "=" & Environment.NewLine)
LineLength = sb.Length - LastSpace - 1
End If
LineBreaks += 1
LastSpace = 0
End If
Next
'-- if we split the line, have to indicate trailing spaces
If LineBreaks > 0 Then
If sb.Chars(sb.Length - 1) = " " Then
sb.Remove(sb.Length - 1, 1)
sb.Append("=20")
End If
End If
Return sb.ToString
End Function
#End Region
End Class