Imports System.Text.RegularExpressions
Imports System.IO
''' <summary>
''' represents an external file referenced in our parent HTML at the target URL
''' </summary>
''' <remarks>
''' Jeff Atwood
''' http://www.codinghorror.com/
''' </remarks>
Friend Class WebFile
'-- owning Builder; this class reads its WebClient, WebFiles, Url and the
'-- AddWebMark / StripScripts / StripIframes options
Private _Builder As Builder
'-- URL actually requested; set by SetUrl, and replaced by the server's
'-- Content-Location when one is returned (see DownloadBytes)
Private _Url As String
'-- URL exactly as handed to the Url property setter
Private _UrlUnmodified As String
'-- scheme + host portion extracted by SetUrl
Private _UrlRoot As String
'-- _Url up to (not including) its last "/"; overridden by a <base href> in ProcessHtml
Private _UrlFolder As String
'-- the next three are copied from the WebClient after every download attempt
Private _ContentType As String
Private _IsBinary As Boolean
Private _TextEncoding As System.Text.Encoding
'-- Content-Location reported by the WebClient, if any
Private _ContentLocation As String
'-- raw (and, for HTML/CSS, post-processed) content
Private _DownloadedBytes() As Byte
'-- WebException captured by DownloadBytes when the download fails
Private _DownloadException As Exception = Nothing
Private _WasDownloaded As Boolean = False
'-- backing fields for the DownloadFilename / DownloadFolder / DownloadExtension
'-- properties; "" means "not decided yet, derive lazily in the getter"
Private _DownloadFilename As String = ""
Private _DownloadFolder As String = ""
Private _DownloadExtension As String = ""
Private _UseHtmlFilename As Boolean = False
'-- cache of the result of ExternalHtmlFiles(); reset to Nothing when Url changes
Private _ExternalFileCollection As Specialized.NameValueCollection
'-- Download() saves to disk unless this is Builder.FileStorage.Memory
Public Storage As Builder.FileStorage = Builder.FileStorage.DiskPermanent
'-- NOTE(review): not read or written anywhere in this class; presumably
'-- maintained by Builder -- confirm
Public WasAppended As Boolean = False
''' <summary>
''' Creates a file that has no target URL yet
''' </summary>
''' <param name="parent">the Builder this file belongs to</param>
''' <param name="st">storage mode for the downloaded content</param>
Public Sub New(ByVal parent As Builder, ByVal st As Builder.FileStorage)
    _Builder = parent
    Storage = st
End Sub
''' <summary>
''' Creates a file for the given target URL
''' </summary>
''' <param name="parent">the Builder this file belongs to</param>
''' <param name="url">target URL; an empty string leaves the URL unset</param>
''' <param name="st">storage mode for the downloaded content</param>
Public Sub New(ByVal parent As Builder, ByVal url As String, ByVal st As Builder.FileStorage)
    _Builder = parent
    Storage = st
    '-- only run the Url setter (which validates) when something was supplied
    If Not (url = "") Then
        Me.Url = url
    End If
End Sub
''' <summary>
''' The URL target for this file.
''' Assigning a new URL resolves it (see SetUrl) and discards all state
''' belonging to any previous download.
''' </summary>
''' <exception cref="ArgumentException">the assigned value is not a valid absolute URL</exception>
Public Property Url() As String
    Get
        Return _Url
    End Get
    Set(ByVal Value As String)
        _UrlUnmodified = Value
        SetUrl(Value, True)
        '-- start from a genuinely empty buffer; note that "ReDim x(0)" would
        '-- allocate ONE element (index 0), not zero elements
        _DownloadedBytes = New Byte() {}
        _ExternalFileCollection = Nothing
        _DownloadException = Nothing
        _TextEncoding = Nothing
        _ContentType = ""
        _ContentLocation = ""
        _IsBinary = False
        _WasDownloaded = False
    End Set
End Property
''' <summary>
''' When True, DownloadFilename is derived from the HTML title
''' (HtmlTitle returns at most 50 characters) instead of from the URL
''' </summary>
Public Property UseHtmlTitleAsFilename() As Boolean
    Get
        Return _UseHtmlFilename
    End Get
    Set(ByVal Value As Boolean)
        _UseHtmlFilename = Value
    End Set
End Property
''' <summary>
''' The last path segment of DownloadFolder, ignoring any trailing backslashes
''' </summary>
Public ReadOnly Property DownloadFolderName() As String
    Get
        Dim folderMatch As Match = Regex.Match(Me.DownloadFolder, "(?<Folder>[^\\]+)\\*$")
        Return folderMatch.Groups("Folder").Value
    End Get
End Property
''' <summary>
''' Folder this file is downloaded to.
''' Defaults to the application's base directory the first time it is read
''' while still empty.
''' </summary>
Public Property DownloadFolder() As String
    Get
        If _DownloadFolder <> "" Then
            Return _DownloadFolder
        End If
        '-- nothing chosen yet: remember the application folder as the default
        _DownloadFolder = AppDomain.CurrentDomain.BaseDirectory
        Return _DownloadFolder
    End Get
    Set(ByVal Value As String)
        _DownloadFolder = Value
    End Set
End Property
''' <summary>
''' Filename to download this file as.
''' If no filename was provided one is generated: from the HTML title when
''' UseHtmlTitleAsFilename is set and the file is downloaded HTML with a usable
''' title, otherwise from the URL.
''' </summary>
Public Property DownloadFilename() As String
    Get
        If _DownloadFilename = "" Then
            If _UseHtmlFilename AndAlso Me.WasDownloaded AndAlso Me.IsHtml Then
                Dim titleName As String = MakeValidFilename(Me.HtmlTitle)
                If titleName <> "" Then
                    _DownloadFilename = titleName & ".htm"
                End If
            End If
            '-- no title requested, or the page had no usable title:
            '-- always fall back to a URL-based name rather than returning ""
            If _DownloadFilename = "" Then
                _DownloadFilename = FilenameFromUrl()
            End If
        End If
        Return _DownloadFilename
    End Get
    Set(ByVal Value As String)
        _DownloadFilename = Value
    End Set
End Property
''' <summary>
''' Fully qualified path and filename to download this file to.
''' Reading combines DownloadFolder and DownloadFilename, appending
''' DownloadExtension when the filename has no extension of its own.
''' Assigning splits the value into DownloadFolder and DownloadFilename.
''' </summary>
Public Property DownloadPath() As String
    Get
        If Path.GetExtension(Me.DownloadFilename) = "" Then
            Return Path.Combine(Me.DownloadFolder, Me.DownloadFilename & Me.DownloadExtension)
        Else
            Return Path.Combine(Me.DownloadFolder, Me.DownloadFilename)
        End If
    End Get
    Set(ByVal Value As String)
        _DownloadFilename = Path.GetFileName(Value)
        If _DownloadFilename = "" Then
            _DownloadFolder = Value
        Else
            '-- GetFileName returns the trailing segment of Value, so strip exactly
            '-- that suffix; String.Replace would also remove the same text from
            '-- folder names (eg "C:\data\data" would become "C:\\")
            _DownloadFolder = Value.Substring(0, Value.Length - _DownloadFilename.Length)
        End If
    End Set
End Property
''' <summary>
''' File type extension for the downloaded file; DownloadPath only applies it
''' when DownloadFilename carries no extension of its own.
''' If unset, it is inferred from the Content-Type once the file was downloaded.
''' </summary>
Public Property DownloadExtension() As String
    Get
        If _DownloadExtension = "" AndAlso Me.WasDownloaded Then
            _DownloadExtension = ExtensionFromContentType()
        End If
        Return _DownloadExtension
    End Get
    Set(ByVal Value As String)
        _DownloadExtension = Value
    End Set
End Property
''' <summary>
''' Folder on disk that holds this file's external dependencies:
''' DownloadFolder + the download filename without extension + "_files"
''' </summary>
Public ReadOnly Property ExternalFilesFolder() As String
    Get
        Dim parentFolder As String = Me.DownloadFolder
        Dim baseName As String = Path.GetFileNameWithoutExtension(Me.DownloadFilename)
        Return Path.Combine(parentFolder, baseName) & "_files"
    End Get
End Property
''' <summary>
''' The URL exactly as it was assigned to the Url property, before resolution
''' </summary>
Public ReadOnly Property UrlUnmodified() As String
    Get
        Return _UrlUnmodified
    End Get
End Property
''' <summary>
''' The Content-Location reported for this URL by the last download attempt, if any;
''' eg, http://mywebsite.com/ may actually map to http://mywebsite.com/default.htm.
''' Empty when none was reported.
''' </summary>
Public ReadOnly Property UrlContentLocation() As String
    Get
        Return _ContentLocation
    End Get
End Property
''' <summary>
''' The scheme and host part of the URL, eg, http://mywebsite.com
''' </summary>
Public ReadOnly Property UrlRoot() As String
    Get
        Return _UrlRoot
    End Get
End Property
''' <summary>
''' The root and folder of the URL, eg, http://mywebsite.com/myfolder
''' (replaced by the page's base href when the downloaded HTML declares one)
''' </summary>
Public ReadOnly Property UrlFolder() As String
    Get
        Return _UrlFolder
    End Get
End Property
''' <summary>
''' True once the content of this file has been fetched successfully
''' </summary>
Public ReadOnly Property WasDownloaded() As Boolean
    Get
        Return _WasDownloaded
    End Get
End Property
''' <summary>
''' The Content-Type reported for this file by the last download attempt
''' </summary>
Public ReadOnly Property ContentType() As String
    Get
        Return _ContentType
    End Get
End Property
''' <summary>
''' True when the downloaded content is binary data; False means it is treated as text
''' </summary>
Public ReadOnly Property IsBinary() As Boolean
    Get
        Return _IsBinary
    End Get
End Property
''' <summary>
''' The bytes held for this file: the server response, re-encoded after
''' processing when the content is HTML or CSS
''' </summary>
Public ReadOnly Property DownloadedBytes() As Byte()
    Get
        Return _DownloadedBytes
    End Get
End Property
''' <summary>
''' The exception captured when the download failed; Nothing if no failure was recorded
''' </summary>
Public ReadOnly Property DownloadException() As Exception
    Get
        Return _DownloadException
    End Get
End Property
''' <summary>
''' For text (non-binary) content, the text encoding detected during download
''' </summary>
Public ReadOnly Property TextEncoding() As System.Text.Encoding
    Get
        Return _TextEncoding
    End Get
End Property
''' <summary>
''' Is this file HTML content? (the Content-Type contains "text/html")
''' Returns False when no Content-Type is known.
''' </summary>
Public ReadOnly Property IsHtml() As Boolean
    Get
        '-- _ContentType is Nothing until Url is assigned (the URL-less constructor
        '-- never initialises it); Regex.IsMatch would throw on a null input
        If _ContentType Is Nothing Then
            Return False
        End If
        Return Regex.IsMatch(_ContentType, "text/html", RegexOptions.IgnoreCase)
    End Get
End Property
''' <summary>
''' Is this file CSS content? (the Content-Type contains "text/css")
''' Returns False when no Content-Type is known.
''' </summary>
Public ReadOnly Property IsCss() As Boolean
    Get
        '-- _ContentType is Nothing until Url is assigned (the URL-less constructor
        '-- never initialises it); Regex.IsMatch would throw on a null input
        If _ContentType Is Nothing Then
            Return False
        End If
        Return Regex.IsMatch(_ContentType, "text/css", RegexOptions.IgnoreCase)
    End Get
End Property
''' <summary>
''' The text of the title tag of this HTML file, truncated to 50 characters;
''' empty when the document has no title tag.
''' Throws when the file is not HTML.
''' </summary>
Public ReadOnly Property HtmlTitle() As String
    Get
        If Not Me.IsHtml Then
            Throw New Exception("This file isn't HTML, so it has no HTML <TITLE> tag.")
        End If
        Const maxLength As Integer = 50
        Dim titleMatch As Match = Regex.Match(Me.ToString, _
            "<title[^>]*?>(?<text>[^<]+)</title>", _
            RegexOptions.IgnoreCase Or RegexOptions.Singleline)
        Dim title As String = titleMatch.Groups("text").Value()
        If title.Length <= maxLength Then
            Return title
        End If
        Return title.Substring(0, maxLength)
    End Get
End Property
''' <summary>
''' Returns a string representation of the data downloaded for this file,
''' downloading it first if that has not happened yet.
''' </summary>
''' <returns>
''' the decoded text for text content, a short "[n bytes of binary data]"
''' placeholder for binary content, or an empty string (never Nothing) when the
''' download failed or produced no data
''' </returns>
Public Overrides Function ToString() As String
    If Not _WasDownloaded Then
        Download()
    End If
    If Not _WasDownloaded Then
        Return ""
    End If
    '-- an empty response previously fell off the end of the function and
    '-- returned Nothing, which callers feed straight into Regex calls
    If _DownloadedBytes Is Nothing OrElse _DownloadedBytes.Length = 0 Then
        Return ""
    End If
    If _IsBinary Then
        Return "[" & _DownloadedBytes.Length & " bytes of binary data]"
    End If
    Return TextEncoding.GetString(_DownloadedBytes)
End Function
''' <summary>
''' Download this file from the target URL, pre-process it when it is HTML or
''' CSS, and save it to disk unless Storage is Memory.
''' On failure the exception is available through DownloadException and nothing
''' is processed or saved.
''' Calling this again on an already downloaded file only repeats the save.
''' </summary>
Public Sub Download()
    '-- content fetched by an earlier call has already been processed; running
    '-- the processing again would (for example) prepend a second web mark
    Dim alreadyProcessed As Boolean = _WasDownloaded
    Debug.Write("Downloading " & _Url & " ..")
    DownloadBytes()
    If Not _DownloadException Is Nothing Then
        Debug.WriteLine("failed: ", "Error")
        Debug.WriteLine(" " & _DownloadException.Message, "Error")
        Return
    End If
    Debug.WriteLine("OK")
    If Not alreadyProcessed Then
        If Me.IsHtml Then
            _DownloadedBytes = _TextEncoding.GetBytes(ProcessHtml(Me.ToString))
        ElseIf Me.IsCss Then
            '-- CSS only needs its references made absolute; the HTML pipeline
            '-- would also inject an HTML comment (the web mark) into the stylesheet
            _DownloadedBytes = _TextEncoding.GetBytes(ProcessCss(Me.ToString))
        End If
    End If
    If Me.Storage <> Builder.FileStorage.Memory Then
        Me.SaveToFile()
    End If
End Sub
''' <summary>
''' Download this file from the target URL into _DownloadedBytes.
''' A Net.WebException is captured in _DownloadException instead of being thrown.
''' Afterwards the content location, binary flag, content type and detected
''' encoding are copied from the builder's WebClient.
''' Does nothing if the file was already downloaded.
''' </summary>
Private Sub DownloadBytes()
    If Me.WasDownloaded Then Return
    '-- forget the failure of any earlier attempt; otherwise a successful retry
    '-- would still be reported as failed by Download()
    _DownloadException = Nothing
    '-- always download to memory first
    Try
        _DownloadedBytes = _Builder.WebClient.DownloadBytes(_Url)
        _WasDownloaded = True
    Catch ex As Net.WebException
        _DownloadException = ex
        _Builder.WebClient.ClearDownload()
    End Try
    '-- necessary if the original client URL was imprecise;
    '-- server location is always authoritative
    '-- NOTE(review): after a failure these values are read from a client that
    '-- was just cleared; presumably they are empty then -- confirm
    If _Builder.WebClient.ContentLocation <> "" Then
        _ContentLocation = _Builder.WebClient.ContentLocation
        SetUrl(_ContentLocation, False)
    End If
    _IsBinary = _Builder.WebClient.ResponseIsBinary
    _ContentType = _Builder.WebClient.ResponseContentType
    _TextEncoding = _Builder.WebClient.DetectedEncoding
    _Builder.WebClient.ClearDownload()
End Sub
''' <summary>
''' Stores the URL and derives _UrlRoot and _UrlFolder from it
''' </summary>
''' <param name="url">the URL to store</param>
''' <param name="validate">
''' True to resolve the URL through ResolveUrl first (which throws on an invalid
''' URL); False to store it as given
''' </param>
Private Sub SetUrl(ByVal url As String, ByVal validate As Boolean)
    If validate Then
        _Url = ResolveUrl(url)
    Else
        _Url = url
    End If
    '-- http://mywebsite or https://mywebsite
    '-- (matching only "http://" left the root empty for every https site)
    _UrlRoot = Regex.Match(url, "https?://[^/'""]+", RegexOptions.IgnoreCase).ToString
    '-- http://mywebsite/myfolder
    '-- an index above 7 means the last slash lies beyond the "//" of either scheme
    If _Url.LastIndexOf("/") > 7 Then
        _UrlFolder = _Url.Substring(0, _Url.LastIndexOf("/"))
    Else
        _UrlFolder = _UrlRoot
    End If
End Sub
''' <summary>
''' Pre-process downloaded CSS: rewrites its relative references as absolute URLs
''' </summary>
Private Function ProcessCss(ByVal css As String) As String
    Dim absolutized As String = ConvertRelativeToAbsoluteRefs(css)
    Return absolutized
End Function
''' <summary>
''' Pre-process the HTML using the builder's preference settings:
''' optionally prepends the "mark of the web", optionally strips script and
''' iframe blocks, honours and then removes any base tag, and finally converts
''' relative references to absolute ones.
''' </summary>
''' <param name="html">the downloaded markup</param>
''' <returns>the processed markup</returns>
Private Function ProcessHtml(ByVal html As String) As String
    Debug.WriteLine("Downloaded content was HTML/CSS -- processing: resolving URLs, getting <base>, etc")
    If _Builder.AddWebMark Then
        '-- add "mark of the web":
        '-- http://www.microsoft.com/technet/prodtechnol/winxppro/maintain/sp2brows.mspx#XSLTsection133121120120
        html = "<!-- saved from url=(" & String.Format("{0:0000}", _Url.Length) & ")" & _Url & " -->" & _
            Environment.NewLine & html
    End If
    '-- see if we need to strip elements from the HTML
    If _Builder.StripScripts Then
        html = StripHtmlTag("script", html)
    End If
    If _Builder.StripIframes Then
        html = StripHtmlTag("iframe", html)
    End If
    '-- if we have a <base>, we must use it as the _UrlFolder,
    '-- not what was parsed from the original _Url
    Dim BaseUrlFolder As String = _
        Regex.Match(html, _
        "<base[^>]+?href=['""]{0,1}(?<BaseUrl>[^'"">]+)['""]{0,1}", _
        RegexOptions.IgnoreCase).Groups("BaseUrl").Value
    If BaseUrlFolder <> "" Then
        If BaseUrlFolder.EndsWith("/") Then
            _UrlFolder = BaseUrlFolder.Substring(0, BaseUrlFolder.Length - 1)
        Else
            _UrlFolder = BaseUrlFolder
        End If
    End If
    '-- remove the <base href=''> tag if present; causes problems when viewing locally.
    '-- case-insensitive like the detection above (so <BASE ...> is removed too);
    '-- \b keeps <basefont> tags from being deleted by accident
    html = Regex.Replace(html, "<base\b[^>]*>", "", RegexOptions.IgnoreCase)
    '-- relative URLs are a PITA for the processing we're about to do,
    '-- so convert them all to absolute up front
    Return ConvertRelativeToAbsoluteRefs(html)
End Function
''' <summary>
''' converts all relative url references
''' href="myfolder/mypage.htm"
''' into absolute url references
''' href="http://mywebsite/myfolder/mypage.htm"
''' for href/src/background attributes and for CSS @import, *-image: and
''' background: declarations. References that start with http:, https:, ftp:,
''' mailto:, javascript: or # are left untouched.
''' </summary>
''' <param name="html">HTML or CSS text</param>
''' <returns>the text with root-relative references prefixed by _UrlRoot and
''' all other relative references prefixed by _UrlFolder</returns>
Private Function ConvertRelativeToAbsoluteRefs(ByVal html As String) As String
    Dim r As Regex
    '-- "https?:" (not just "http:") in the exclusion list: otherwise absolute
    '-- https links are mistaken for relative ones and get a second prefix
    Dim urlPattern As String = _
        "(?<attrib>\shref|\ssrc|\sbackground)\s*?=\s*?" & _
        "(?<delim1>[""'\\]{0,2})(?!\s*\+|#|https?:|ftp:|mailto:|javascript:)" & _
        "/(?<url>[^""'>\\]+)(?<delim2>[""'\\]{0,2})"
    Dim cssPattern As String = _
        "(?<attrib>@import\s|\S+-image:|background:)\s*?(url)*['""(]{1,2}" & _
        "(?!http)\s*/(?<url>[^""')]+)['"")]{1,2}"
    '-- each pattern contains exactly one "/" (the leading slash of a
    '-- root-relative URL); removing it yields the folder-relative variant
    '-- href="/anything" to href="http://www.web.com/anything"
    r = New Regex(urlPattern, _
        RegexOptions.IgnoreCase Or RegexOptions.Multiline)
    html = r.Replace(html, "${attrib}=${delim1}" & _UrlRoot & "/${url}${delim2}")
    '-- href="anything" to href="http://www.web.com/folder/anything"
    r = New Regex(urlPattern.Replace("/", ""), _
        RegexOptions.IgnoreCase Or RegexOptions.Multiline)
    html = r.Replace(html, "${attrib}=${delim1}" & _UrlFolder & "/${url}${delim2}")
    '-- @import(/anything) to @import url(http://www.web.com/anything)
    r = New Regex(cssPattern, _
        RegexOptions.IgnoreCase Or RegexOptions.Multiline)
    html = r.Replace(html, "${attrib} url(" & _UrlRoot & "/${url})")
    '-- @import(anything) to @import url(http://www.web.com/folder/anything)
    r = New Regex(cssPattern.Replace("/", ""), _
        RegexOptions.IgnoreCase Or RegexOptions.Multiline)
    html = r.Replace(html, "${attrib} url(" & _UrlFolder & "/${url})")
    Return html
End Function
''' <summary>
''' Collects every external file referenced by this file's content, eg
'''
''' "/myfolder/blah.png"
''' 'http://mywebsite/blah.gif'
''' src=blah.jpg
'''
''' Each Key is the reference including its delimiting quotes or parens (when
''' present) and each Value is the bare reference; the delimiters are kept in the
''' Key so that later search-and-replace is as specific as possible.
''' The result is computed once and cached until the Url changes.
''' </summary>
Private Function ExternalHtmlFiles() As Specialized.NameValueCollection
    '-- serve the cached result when there is one; note the cache does not
    '-- notice changes made to the content after the first call
    If Not _ExternalFileCollection Is Nothing Then
        Return _ExternalFileCollection
    End If
    _ExternalFileCollection = New Specialized.NameValueCollection
    Dim content As String = Me.ToString
    Debug.WriteLine("Resolving all external HTML references from URL:")
    Debug.WriteLine(" " & Me.Url)
    '-- the patterns, applied in this order:
    '--  1. src='filename.ext' ; background="filename.ext"
    '--     (three alternatives to catch all quote styles: '', "", and none)
    '--  2. @import "style.css" or @import url(style.css)
    '--  3. <link rel=stylesheet href="style.css">
    '--  4. <iframe src="mypage.htm"> or <frame src="mypage.aspx">
    Dim patterns() As String = { _
        "(\ssrc|\sbackground)\s*=\s*((?<Key>'(?<Value>[^']+)')|(?<Key>""(?<Value>[^""]+)"")|(?<Key>(?<Value>[^ \n\r\f]+)))", _
        "(@import\s|\S+-image:|background:)\s*?(url)*\s*?(?<Key>[""'(]{1,2}(?<Value>[^""')]+)[""')]{1,2})", _
        "<link[^>]+?href\s*=\s*(?<Key>('|"")*(?<Value>[^'"">]+)('|"")*)", _
        "<i*frame[^>]+?src\s*=\s*(?<Key>['""]{0,1}(?<Value>[^'""\\>]+)['""]{0,1})"}
    For Each pattern As String In patterns
        Dim finder As New Regex(pattern, RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        AddMatchesToCollection(content, finder, _ExternalFileCollection)
    Next
    Return _ExternalFileCollection
End Function
''' <summary>
''' Removes every &lt;tagName ...&gt; .. &lt;/tagName&gt; block, contents included
''' </summary>
''' <param name="tagName">element name, matched case-insensitively</param>
''' <param name="html">markup to clean</param>
Private Function StripHtmlTag(ByVal tagName As String, ByVal html As String) As String
    Dim blockPattern As String = String.Format("<{0}[^>]*?>[\w|\t|\r|\W]*?</{0}>", tagName)
    Return Regex.Replace(html, blockPattern, "", _
        RegexOptions.Multiline Or RegexOptions.IgnoreCase)
End Function
''' <summary>
''' Plain-text rendering of this file: script and style blocks are dropped,
''' all remaining tags are replaced by a space and HTML entities are decoded.
''' </summary>
''' <param name="removeWhitespace">
''' when True, newlines, form feeds and tabs become spaces and runs of spaces
''' are collapsed to one
''' </param>
Public Function ToTextString(Optional ByVal removeWhitespace As Boolean = False) As String
    '-- drop <script> .. </script> first, then <style> .. </style>
    Dim plain As String = StripHtmlTag("style", StripHtmlTag("script", Me.ToString))
    '-- every remaining tag turns into a single space
    plain = Regex.Replace(plain, "<\w+(\s+[A-Za-z0-9_\-]+\s*=\s*(""([^""]*)""|'([^']*)'))*\s*(/)*>|<[^>]+>", " ")
    '-- &amp; and friends back to plain characters
    plain = Web.HttpUtility.HtmlDecode(plain)
    If Not removeWhitespace Then
        Return plain
    End If
    '-- optional whitespace clean-up
    plain = Regex.Replace(plain, "[\n\r\f\t]", " ", RegexOptions.Multiline)
    Return Regex.Replace(plain, " {2,}", " ", RegexOptions.Multiline)
End Function
''' <summary>
''' Saves the plain-text form of this file next to its download path,
''' using a .txt extension
''' </summary>
Public Sub SaveAsTextFile()
    Dim textPath As String = Path.ChangeExtension(Me.DownloadPath, ".txt")
    SaveToFile(textPath, True)
End Sub
''' <summary>
''' Saves the plain-text form of this file to the given path
''' </summary>
''' <param name="filePath">destination path and filename</param>
Public Sub SaveAsTextFile(ByVal filePath As String)
    Const asPlainText As Boolean = True
    SaveToFile(filePath, asPlainText)
End Sub
''' <summary>
''' Writes the downloaded content of this file to DownloadPath
''' </summary>
Public Sub SaveToFile()
    Dim target As String = Me.DownloadPath
    SaveToFile(target, False)
End Sub
''' <summary>
''' Writes the downloaded content of this file to the given path
''' </summary>
''' <param name="filePath">destination path and filename</param>
Public Sub SaveToFile(ByVal filePath As String)
    Const asPlainText As Boolean = False
    SaveToFile(filePath, asPlainText)
End Sub
''' <summary>
''' Writes the contents of this file to the given path, replacing any existing file.
''' </summary>
''' <param name="filePath">destination path and filename</param>
''' <param name="asText">
''' True to write the plain-text rendering (ToTextString) of text content;
''' binary content is always written as the raw downloaded bytes
''' </param>
Private Sub SaveToFile(ByVal filePath As String, ByVal asText As Boolean)
    Debug.WriteLine("Saving to file " & filePath)
    '-- work out what to write before touching the file, so that a failure
    '-- here cannot leave a truncated file behind
    Dim content() As Byte
    If asText AndAlso Not Me.IsBinary Then
        '-- encode the text ourselves: BinaryWriter.Write(String) would put a
        '-- length prefix in front of the characters and corrupt the text file
        Dim enc As System.Text.Encoding = _TextEncoding
        If enc Is Nothing Then
            enc = System.Text.Encoding.UTF8
        End If
        content = enc.GetBytes(Me.ToTextString)
    Else
        content = _DownloadedBytes
    End If
    '-- FileMode.Create truncates an existing file; OpenOrCreate would leave the
    '-- tail of a longer, older file in place after the new content
    Dim fs As New FileStream(filePath, FileMode.Create)
    Try
        If Not content Is Nothing Then
            fs.Write(content, 0, content.Length)
        End If
    Finally
        fs.Close()
    End Try
End Sub
''' <summary>
''' Normalises a URL: resolves relative path segments through System.Uri and
''' removes a trailing #anchor
''' </summary>
''' <param name="url">an absolute URL</param>
''' <returns>the normalised URL</returns>
''' <exception cref="ArgumentException">the value cannot be parsed as a URL</exception>
Private Function ResolveUrl(ByVal url As String) As String
    Dim resolved As String = Nothing
    Try
        '-- System.Uri takes care of "../" style pathing
        resolved = (New System.Uri(url)).AbsoluteUri
    Catch ex As System.UriFormatException
        Throw New ArgumentException("'" & url & "' does not appear to be a valid URL.", ex)
    End Try
    '-- no "#" at all: nothing to strip
    If resolved.IndexOf("#") < 0 Then
        Return resolved
    End If
    Dim jump As String = Regex.Match(resolved, "/[^/]*?(?<jump>#[^/?.]+$)").Groups("jump").Value
    If jump = "" Then
        Return resolved
    End If
    Return resolved.Replace(jump, "")
End Function
''' <summary>
''' Completes a user-supplied path: a directory gets a filename built from the
''' HTML title; a filename gets its extension forced to fileExtension
''' </summary>
''' <param name="FilePath">a directory (ending in "\") or a file path</param>
''' <param name="html">not used by this method</param>
''' <param name="fileExtension">desired extension, including the leading dot</param>
Private Function DeriveFilename(ByVal FilePath As String, _
    ByVal html As String, _
    ByVal fileExtension As String) As String
    If Not IsDirectory(FilePath) Then
        '-- a filename was given: only the extension may need correcting
        If Path.GetExtension(FilePath) <> fileExtension Then
            Return Path.ChangeExtension(FilePath, fileExtension)
        End If
        Return FilePath
    End If
    '-- a folder was given: the name has to come from the title
    Dim htmlTitle As String = Me.HtmlTitle
    If htmlTitle = "" Then
        Throw New Exception("No filename was provided, and the HTML title tag was not found, " & _
            "so a filename could not be automatically generated. You'll need to provide a filename and not a folder.")
    End If
    Dim generatedName As String = MakeValidFilename(htmlTitle) & fileExtension
    Return Path.Combine(Path.GetDirectoryName(FilePath), generatedName)
End Function
''' <summary>
''' Forms a valid filesystem filename: removes the characters / \ : * ? " &lt; &gt; |
''' and control characters, turns every run of whitespace (including tabs and
''' line breaks) into a single space, and trims leading/trailing spaces
''' </summary>
''' <param name="s">the proposed name, eg an HTML title</param>
''' <param name="enforceLength">currently ignored; no length limit is applied</param>
''' <returns>the cleaned name, which may be empty</returns>
Private Function MakeValidFilename(ByVal s As String, Optional ByVal enforceLength As Boolean = False) As String
    '-- normalise whitespace first so tabs and line breaks become plain spaces
    '-- instead of surviving into the filename
    Dim cleaned As String = Regex.Replace(s, "\s+", " ")
    '-- drop characters the filesystem does not accept
    cleaned = Regex.Replace(cleaned, "[\/\\\:\*\?\""\<\>\|\x00-\x1F]", "")
    '-- removing characters can leave doubled or leading/trailing spaces behind
    '-- (eg "? a" or "a ? b"), so tidy up only after the removal
    cleaned = Regex.Replace(cleaned, " {2,}", " ")
    Return cleaned.Trim()
End Function
''' <summary>
''' True when the path denotes a directory rather than a file, which is
''' decided purely by a trailing backslash
''' </summary>
Private Function IsDirectory(ByVal FilePath As String) As Boolean
    Const separator As String = "\"
    Dim endsWithSeparator As Boolean = FilePath.EndsWith(separator)
    Return endsWithSeparator
End Function
''' <summary>
''' Rewrites references to external files (gif, jpg, css, etc) that are present
''' in the builder's WebFiles collection so that they point into this file's
''' ExternalFilesFolder, eg
''' &lt;img src="http://mywebsite/myfolder/myimage.gif"&gt;
''' becomes
''' &lt;img src="mypage_files/myimage.gif"&gt;
''' Only valid for HTML or CSS content; throws otherwise.
''' </summary>
Public Sub ConvertReferencesToLocal()
    If Not (Me.IsHtml Or Me.IsCss) Then
        Throw New Exception("Converting references only makes sense for HTML or CSS files; this file is of type '" & Me.ContentType & "'")
    End If
    Dim content As String = Me.ToString
    Dim references As Specialized.NameValueCollection = Me.ExternalHtmlFiles()
    '-- nothing referenced, nothing to rewrite
    If references.Count = 0 Then Return
    For Each delimitedUrl As String In references.AllKeys
        Dim bareUrl As String = references.Item(delimitedUrl)
        '-- only files the builder knows about can be pointed at locally
        If _Builder.WebFiles.Contains(bareUrl) Then
            Dim localFile As WebFile = DirectCast(_Builder.WebFiles.Item(bareUrl), WebFile)
            Dim localPath As String = Me.ExternalFilesFolder & "/" & localFile.DownloadFilename
            '-- keep the original delimiters, swap only what sits between them
            Dim delimitedLocal As String = Regex.Replace(delimitedUrl, _
                "^(?<StartDelim>""|'|\()*(?<Value>[^'"")]*)(?<EndDelim>""|'|\))*$", _
                "${StartDelim}" & localPath & "${EndDelim}")
            content = content.Replace(delimitedUrl, delimitedLocal)
        End If
    Next
    _DownloadedBytes = _TextEncoding.GetBytes(content)
End Sub
''' <summary>
''' Runs the regex over the text and adds the "Key"/"Value" named groups of
''' every match to the collection. A key already present is skipped, and a value
''' that is not a fully qualified http(s):// URL is discarded.
''' </summary>
''' <param name="s">text to search</param>
''' <param name="r">regex exposing the named groups Key and Value</param>
''' <param name="nvc">collection that receives the pairs</param>
Private Sub AddMatchesToCollection(ByVal s As String, _
    ByVal r As Regex, _
    ByRef nvc As Specialized.NameValueCollection)
    Dim fullyQualified As New Regex("^https*://\w+", RegexOptions.IgnoreCase)
    Dim found As MatchCollection = r.Matches(s)
    Dim index As Integer
    For index = 0 To found.Count - 1
        Dim current As Match = found(index)
        '-- announce the regex once, ahead of its first match
        If index = 0 Then
            Debug.WriteLine("Matches added from regex:")
            Debug.WriteLine("""" & r.ToString & """")
        End If
        Dim entryKey As String = current.Groups("Key").ToString
        Dim entryValue As String = current.Groups("Value").ToString
        If nvc.Item(entryKey) Is Nothing Then
            Debug.WriteLine(" Match: " & current.ToString)
            Debug.WriteLine(" Key: " & entryKey)
            Debug.WriteLine(" Value: " & entryValue)
            If fullyQualified.IsMatch(entryValue) Then
                nvc.Add(entryKey, entryValue)
            Else
                Debug.WriteLine("Match discarded; does not appear to be valid fully qualified http:// Url", "Error")
            End If
        End If
    Next
End Sub
''' <summary>
''' Downloads every file referenced by this file's content into
''' ExternalFilesFolder
''' </summary>
''' <param name="st">storage mode for the downloaded files</param>
''' <param name="recursive">also download the dependencies of downloaded HTML/CSS files</param>
Public Sub DownloadExternalFiles(ByVal st As Builder.FileStorage, Optional ByVal recursive As Boolean = False)
    Dim defaultTarget As String = Me.ExternalFilesFolder
    DownloadExternalFiles(st, defaultTarget, recursive)
End Sub
''' <summary>
''' Downloads every file referenced by this file's content into a specific folder
''' </summary>
''' <param name="st">storage mode for the downloaded files</param>
''' <param name="targetFolder">folder the files are saved to</param>
''' <param name="recursive">also download the dependencies of downloaded HTML/CSS files</param>
Private Sub DownloadExternalFiles(ByVal st As Builder.FileStorage, ByVal targetFolder As String, ByVal recursive As Boolean)
    Dim references As Specialized.NameValueCollection = ExternalHtmlFiles()
    If Not references.HasKeys Then Return
    Debug.WriteLine("Downloading all external files collected from URL:")
    Debug.WriteLine(" " & Url)
    Dim keys() As String = references.AllKeys
    Dim i As Integer
    For i = 0 To keys.Length - 1
        DownloadExternalFile(references.Item(keys(i)), st, targetFolder, recursive)
    Next
End Sub
''' <summary>
''' Download a single externally referenced file, reusing the WebFile already
''' registered for that URL in the builder's WebFiles collection when there is one.
''' A reference back to the builder's own URL that is not registered is skipped.
''' </summary>
''' <param name="url">fully qualified URL of the referenced file</param>
''' <param name="st">storage mode for the file</param>
''' <param name="targetFolder">folder to save into when storing on disk; created if missing</param>
''' <param name="recursive">also download the dependencies of a newly downloaded HTML/CSS file</param>
Private Sub DownloadExternalFile(ByVal url As String, ByVal st As Builder.FileStorage, _
    ByVal targetFolder As String, Optional ByVal recursive As Boolean = False)
    Dim wf As WebFile
    Dim isNew As Boolean
    '-- have we already downloaded (or attempted to) this file?
    If _Builder.WebFiles.Contains(url) Then
        wf = DirectCast(_Builder.WebFiles.Item(url), WebFile)
        isNew = False
    ElseIf _Builder.Url = url Then
        '-- the reference points back at the builder's own URL and there is no
        '-- entry for it in WebFiles; previously this looked the entry up anyway
        '-- and then dereferenced the missing result
        Return
    Else
        wf = New WebFile(_Builder, url, st)
        isNew = True
    End If
    '-- if we're planning to store this file on disk, make sure we can
    If st = Builder.FileStorage.DiskPermanent Or st = Builder.FileStorage.DiskTemporary Then
        If Not Directory.Exists(targetFolder) Then
            Directory.CreateDirectory(targetFolder)
        End If
        wf.DownloadFolder = targetFolder
    End If
    wf.Download()
    If isNew Then
        '-- add this (possibly) downloaded file to our shared collection
        _Builder.WebFiles.Add(wf.UrlUnmodified, wf)
        '-- if this is an HTML file, it has dependencies of its own;
        '-- download them into a subfolder
        If (wf.IsHtml Or wf.IsCss) And recursive Then
            wf.DownloadExternalFiles(st, recursive)
        End If
    End If
End Sub
''' <summary>
''' Builds a filename for this file from its URL, falling back to the HTML
''' title and finally to a hash of the URL
''' </summary>
Private Function FilenameFromUrl() As String
    '-- first choice: whatever follows the final slash of the URL, up to any
    '-- question mark, eg "crazy" in http://mywebsite/myfolder/crazy?param=1&param=2
    Dim candidate As String = Regex.Match(_Url, "/(?<Filename>[^/?]+)[^/]*$").Groups("Filename").Value
    If candidate <> "" Then
        '-- the same page with different query params must not collide,
        '-- so fold a hash of the query string into the name
        Dim parsed As New Uri(_Url)
        If parsed.Query <> "" Then
            candidate = Path.GetFileNameWithoutExtension(candidate) & "_" & parsed.Query.GetHashCode.ToString & Me.DownloadExtension
        End If
    ElseIf Me.IsHtml Then
        '-- second choice: the TITLE tag of an HTML page
        candidate = Me.HtmlTitle
        If candidate <> "" Then
            candidate &= ".htm"
        End If
    End If
    '-- last resort: hash the whole URL
    If candidate = "" Then
        candidate = _Url.GetHashCode.ToString & Me.DownloadExtension
    End If
    Return MakeValidFilename(candidate)
End Function
''' <summary>
''' if we weren't given a filename extension, infer it from the download
''' Content-Type header (parameters such as "; charset=..." are ignored)
''' </summary>
''' <returns>
''' the extension including the leading dot; ".htm" for an unknown or missing
''' content type
''' </returns>
''' <remarks>
''' http://www.utoronto.ca/webdocs/HTMLdocs/Book/Book-3ed/appb/mimetype.html
''' </remarks>
Private Function ExtensionFromContentType() As String
    Dim rawType As String = Me.ContentType
    If rawType Is Nothing Then
        rawType = ""
    End If
    '-- lower-case with the invariant culture so the comparison does not depend
    '-- on the machine's locale (eg the Turkish dotless i in "image/...")
    Dim mediaType As String = Regex.Match(rawType, "^[^ ;]+").Value.ToLower( _
        System.Globalization.CultureInfo.InvariantCulture)
    Select Case mediaType
        Case "text/html"
            Return ".htm"
        Case "image/gif"
            Return ".gif"
        Case "image/jpeg"
            Return ".jpg"
        Case "text/javascript", "application/x-javascript", "application/javascript"
            Return ".js"
        Case "image/png", "image/x-png"
            '-- "image/png" is the registered type; "image/x-png" is the legacy spelling
            Return ".png"
        Case "text/css"
            Return ".css"
        Case "text/plain"
            Return ".txt"
        Case Else
            Debug.WriteLine("Unknown content-type '" & Me.ContentType & "'", "Error")
            Return ".htm"
    End Select
End Function
End Class