Imports System.Net
Imports System.Net.Sockets
Imports System.IO
Imports System.Text
Public MustInherit Class WebResourceProvider
' WebResourceProvider.vb
'
' Written by Marcel Spring, ADVIS AG, Switzerland <marcel.spring@advis.ch>
' Copyright (c) 2002. All Rights Reserved.
'
' code adapted from original c++ library from
' Ravi Bhavnani <ravib@ravib.com>
' http:'www.codeproject.com/internet/WebResourceProvider.asp.
' Copyright (c) 2002. All Rights Reserved.
'
' This code may be used in compiled form in any way you desire. This
' file may be redistributed unmodified by any means PROVIDING it is
' not sold for profit without the author's written consent, and
' providing that this notice and the author's name and all copyright
' notices remains intact.
'
' An email letting us know how you are using it would be nice as well.
'
' This code is provided "as is" with no expressed or implied warranty.
' The author accepts no liability for any damage/loss of business that
' this product may cause.
'
'
' Revision history:
'
' 06 Jun 2002 - Initial release
' 22 Aug 2002 - Version 1.1
' - RemoveHTML is now treating special characters like Umlaut etc. well
' - ReplaceSpecialChars function added
' - StoreIndex function added, to store the current pointer while parsing
' - RestoreIndex function added, to retrieve the previously stored pointer
' - SkipBackTo and SkipBackToExact function added to parse backward
'------------------------------------------------------------------------------------------
#Region "---- Private class members"
'==========================================================================================
'Private class members
'==========================================================================================
Private myIndex As Integer
Private myStoredIndex As Integer
Private myAgent As String
Private myContent As String
Private myURL As String
Private myFetchTime As System.DateTime
#End Region
#Region "---- Protected class properties"
'==========================================================================================
'Protected class properties
'==========================================================================================
'------------------------------------------------------------------------------------------
Protected ReadOnly Property FetchTime() As System.DateTime
Get
Return myFetchTime
End Get
End Property
'------------------------------------------------------------------------------------------
Protected Property URL() As String
'URL will be set trough overridden method constructURL
Get
Return URL
End Get
Set(ByVal Value As String)
myURL = Value
End Set
End Property
'------------------------------------------------------------------------------------------
Protected Property Agent() As String
Get
Return myAgent
End Get
Set(ByVal Value As String)
myAgent = Value
End Set
End Property
#End Region
#Region "---- Overridable class properties"
'==========================================================================================
'Overridable class properties
'==========================================================================================
'------------------------------------------------------------------------------------------
Overridable ReadOnly Property Name() As String
Get
Return ""
End Get
End Property
'------------------------------------------------------------------------------------------
Overridable ReadOnly Property Version() As String
Get
Return ""
End Get
End Property
'------------------------------------------------------------------------------------------
Overridable ReadOnly Property Copyright() As String
Get
Return ""
End Get
End Property
'------------------------------------------------------------------------------------------
Overridable ReadOnly Property OtherInformation() As String
Get
Return ""
End Get
End Property
#End Region
#Region "---- Class methods"
'==========================================================================================
'Class methods
'==========================================================================================
Public Sub New()
myIndex = 0
End Sub
#End Region
#Region "---- Public methods"
'==========================================================================================
'Public methods
'==========================================================================================
'------------------------------------------------------------------------------------------
Public Sub FetchResource()
'reset fetch status
myContent = ""
myFetchTime = Now
Do
'construct myURL to be fetched
ConstructURL(myURL)
'get and parse fetched myContent
ResetIndex()
Try
GetContent(myURL)
Catch ex As Exception
Throw New System.Exception("Error fetching myContent", ex)
End Try
Try
ParseContent()
Catch ex As Exception
Throw New System.Exception("Error parsing myContent", ex)
End Try
Loop Until Not MoreAvailable()
End Sub
'------------------------------------------------------------------------------------------
Public Function URLExists() As Boolean
'is a bit slow, cause it has to get the whole page from the specified myURL
'but some webservers respond with a error page if we use a GETHEADER call
Dim myResponse As WebResponse
Dim myRequest As WebRequest = HttpWebRequest.Create(myURL)
Try
myResponse = myRequest.GetResponse()
Catch
Dim ex As Exception
Return False
Finally
myResponse.Close()
End Try
Return True
End Function
#End Region
#Region "---- Overridable methods for derived classes"
'==========================================================================================
'Overridable methods for derived classes
'==========================================================================================
'------------------------------------------------------------------------------------------
MustOverride Sub ConstructURL(ByRef myURL As String)
'------------------------------------------------------------------------------------------
Overridable Function IsPost() As Boolean
Return False
End Function
'------------------------------------------------------------------------------------------
Overridable Sub GetPostData(ByRef myPostData As Collections.Specialized.NameValueCollection)
End Sub
'------------------------------------------------------------------------------------------
Overridable Function Init() As Boolean
Return True
End Function
'------------------------------------------------------------------------------------------
Overridable Function MoreAvailable() As Boolean
Return False
End Function
'------------------------------------------------------------------------------------------
MustOverride Sub ParseContent()
#End Region
#Region "---- Protected helper methods used by derived class"
'==========================================================================================
'Protected helper methods used by derived class
'==========================================================================================
'------------------------------------------------------------------------------------------
Protected Function At(ByVal text As String) As Boolean
Return FindNoCase(myContent, text, myIndex) = myIndex
End Function
'------------------------------------------------------------------------------------------
Protected Function AtExact(ByVal text As String) As Boolean
Return myContent.IndexOf(text, myIndex) = myIndex
End Function
'------------------------------------------------------------------------------------------
Protected Function SkipTo(ByVal text As String) As Boolean
Dim newIndex As Integer = FindNoCase(myContent, text, myIndex)
If newIndex = -1 Then
Return False
End If
myIndex = newIndex + text.Length
Return True
End Function
'------------------------------------------------------------------------------------------
Protected Function SkipToExact(ByVal text As String) As Boolean
Dim newIndex As Integer = myContent.IndexOf(text, myIndex)
If newIndex = -1 Then
Return False
End If
myIndex = newIndex + text.Length
Return True
End Function
'------------------------------------------------------------------------------------------
Protected Function SkipBackTo(ByVal text As String) As Boolean
Dim newIndex As Integer = FindRevNoCase(myContent, text, myIndex)
If newIndex = -1 Then
Return False
End If
myIndex = newIndex + text.Length
Return True
End Function
'------------------------------------------------------------------------------------------
Protected Function SkipBackToExact(ByVal text As String) As Boolean
Dim newIndex As Integer = InStrRev(myContent, text, myIndex, CompareMethod.Text)
If newIndex = -1 Then
Return False
End If
myIndex = newIndex + text.Length
Return True
End Function
'------------------------------------------------------------------------------------------
Protected Function ExtractTo(ByVal terminator As String, ByRef result As String) As Boolean
Dim Length As Integer = myContent.Length
If myIndex < (Length - 1) Then
Dim endSegment As Integer = FindNoCase(myContent, terminator, myIndex)
If endSegment <> -1 Then
result = Mid(myContent, myIndex + 1, endSegment - myIndex)
myIndex = endSegment + terminator.Length
Return True
End If
End If
Return False
End Function
'------------------------------------------------------------------------------------------
Protected Function ExtractToExact(ByVal terminator As String, ByRef result As String) As Boolean
Dim Length As Integer = myContent.Length
If myIndex < (Length - 1) Then
Dim endSegment As Integer = myContent.IndexOf(terminator, myIndex)
If endSegment <> -1 Then
result = Mid(myContent, myIndex + 1, endSegment - myIndex)
myIndex = endSegment + terminator.Length
Return True
End If
End If
Return False
End Function
'------------------------------------------------------------------------------------------
Protected Sub ExtractToEnd(ByRef result As String)
Dim Length As Integer = myContent.Length
If myIndex < (Length - 1) Then
result = Mid(myContent, myIndex + 1, Length - myIndex)
End If
End Sub
'------------------------------------------------------------------------------------------
Protected Sub GetLinks(ByRef documents As String(), ByRef images As String())
'remove comments and script and fix links
Dim OriginalmyContent As String = myContent
RemoveComments()
RemoveScripts()
myContent = ReplaceEvery(myContent, "'", """")
'create URL prefix - this will be used to create fully qualified URLs
Dim myURLPrefix As String = myURL
Dim Slash As Integer = myURLPrefix.LastIndexOf("/")
If Slash > 7 Then
myURLPrefix = Left(myURLPrefix, Slash)
End If
'extract HREF targets - prepend base URL if extracted myURL is relative
Dim baseURL As String
While SkipTo("href=""")
If ExtractTo("""", baseURL) Then
Trim(baseURL)
If baseURL.IndexOf("mailto:") = -1 Then
If baseURL.IndexOf("http://") <> 0 Then
If baseURL.Chars(0) = "/" Then
baseURL = [String].Format("{0}{1}", myURLPrefix, baseURL)
Else
baseURL = [String].Format("{0}/{1}", myURLPrefix, baseURL)
End If
End If
If documents Is Nothing Then
ReDim Preserve documents(0)
documents(0) = baseURL
Else
If FindStringInArray(documents, baseURL) = -1 Then
Dim i As Integer = documents.Length
ReDim Preserve documents(i)
documents(i) = baseURL
End If
End If
End If
End If
End While
'extract SRC targets - prepend base myURL if extracted myURL is relative
ResetIndex()
While SkipTo("src=""")
If ExtractTo("""", baseURL) Then
Trim(baseURL)
If baseURL.IndexOf("http://") <> 0 Then
If baseURL.Chars(0) = "/" Then
baseURL = [String].Format("{0}{1}", myURLPrefix, baseURL)
Else
baseURL = [String].Format("{0}/{1}", myURLPrefix, baseURL)
End If
End If
If images Is Nothing Then
ReDim Preserve images(0)
images(0) = baseURL
Else
If FindStringInArray(images, baseURL) = -1 Then
Dim i As Integer = images.Length
ReDim Preserve images(i)
images(i) = baseURL
End If
End If
End If
End While
'restore original myContent
myContent = OriginalmyContent
ResetIndex()
End Sub
'------------------------------------------------------------------------------------------
Protected Sub ResetIndex()
myIndex = 0
End Sub
'------------------------------------------------------------------------------------------
Protected Sub StoreIndex()
myStoredIndex = myIndex
End Sub
'------------------------------------------------------------------------------------------
Protected Sub RestoreIndex()
myIndex = myStoredIndex
End Sub
'------------------------------------------------------------------------------------------
Protected Function OffsetIndex() As Integer
Return myIndex - myStoredIndex
End Function
'------------------------------------------------------------------------------------------
Protected Function ReplaceEvery(ByVal target As String, ByVal occurence As String, ByVal replacement As String) As String
Dim lowerTarget As String = target.ToLower
Dim lowerOccurence As String = occurence.ToLower
'search the lowercase version, replace in the original version
Dim pos As Integer = 0
pos = lowerTarget.IndexOf(lowerOccurence, pos)
While pos <> -1
'need for empty "newstr" cases
lowerTarget.Remove(pos, occurence.Length)
lowerTarget.Insert(pos, replacement)
'actually replace
target.Remove(pos, occurence.Length)
target.Insert(pos, replacement)
'ckeck next occurence
pos += replacement.Length
pos = lowerTarget.IndexOf(lowerOccurence, pos)
End While
Return target
End Function
'------------------------------------------------------------------------------------------
Protected Sub RemoveComments()
ResetIndex()
Dim myContentBody As String
Dim myContentSegment As String
While ExtractTo("<!--", myContentSegment)
myContentBody += myContentSegment
If Not SkipTo("-->") Then
myContent = myContentBody
ResetIndex()
Return
End If
End While
ExtractToEnd(myContentSegment)
myContentBody += myContentSegment
'replace existing myContent
myContent = myContentBody
ResetIndex()
End Sub
'------------------------------------------------------------------------------------------
Protected Sub RemoveScripts()
ResetIndex()
Dim myContentBody As String
Dim myContentSegment As String
While ExtractTo("<script", myContentSegment)
myContentBody += myContentSegment
If Not SkipTo("/script>") Then
myContent = myContentBody
ResetIndex()
Return
End If
End While
ExtractToEnd(myContentSegment)
myContentBody += myContentSegment
'replace existing myContent
myContent = myContentBody
ResetIndex()
End Sub
'------------------------------------------------------------------------------------------
Protected Sub RemoveEnclosingAnchorTag(ByRef text As String)
If text.IndexOf("<a") = 0 Or text.IndexOf("<A") = 0 Then
Dim newIndex As Integer = text.IndexOf(">")
If newIndex <> -1 Then
text = Right(text, text.Length - newIndex - 1)
newIndex = text.IndexOf("<")
If newIndex <> -1 Then
text = Left(text, newIndex)
End If
End If
End If
End Sub
'------------------------------------------------------------------------------------------
Protected Sub RemoveEnclosingQuotes(ByRef text As String)
Dim myLength As Integer = text.Length
If text.Chars(0) = """" And text.Chars(myLength - 1) = """" Then
text = Mid(text, 2, myLength - 2)
End If
End Sub
'------------------------------------------------------------------------------------------
Protected Sub RemoveHTML(ByRef text As String)
'do some common replacements
ReplaceSpecialChars(text)
'remove all tags
Dim cleanText As String = ""
Dim myIndex As Integer = 0
Dim startTag As Integer = text.IndexOf("<", myIndex)
While startTag <> -1
'extract the start of tag
Dim subText As String = Mid(text, myIndex + 1, startTag - myIndex)
cleanText += subText
myIndex = startTag + 1
'skip over tag
Dim endTag As Integer = text.IndexOf(">", myIndex)
If endTag = -1 Then
Exit Sub
End If
myIndex = endTag + 1
startTag = text.IndexOf("<", myIndex)
End While
'gather remaining text
If myIndex < text.Length Then
cleanText += Right(text, text.Length - myIndex)
End If
text = cleanText
cleanText = ""
End Sub
'------------------------------------------------------------------------------------------
Protected Sub ReplaceSpecialChars(ByRef text As String)
Static Dim myHash As Hashtable
If myHash Is Nothing Then
myHash = New Hashtable(100)
'the first time we have to build the hash table
With myHash
.Add("<p>", vbCrLf)
.Add("Á", "�")
.Add("á", "�")
.Add("Â", "�")
.Add("â", "�")
.Add("´", "�")
.Add("Æ", "�")
.Add("æ", "�")
.Add("À", "�")
.Add("à", "�")
.Add("&", "&")
.Add("Å", "�")
.Add("å", "�")
.Add("Ã", "�")
.Add("ã", "�")
.Add("Ä", "�")
.Add("ä", "�")
.Add("¦", "�")
.Add("&brkbar;", "�")
.Add("Ç", "�")
.Add("ç", "�")
.Add("¸", "�")
.Add("¢", "�")
.Add("©", "�")
.Add("¤", "�")
.Add("°", "�")
.Add("÷", "�")
.Add("É", "�")
.Add("é", "�")
.Add("Ê", "�")
.Add("ê", "�")
.Add("È", "�")
.Add("è", "�")
.Add("Ð", "�")
.Add("ð", "�")
.Add("Ë", "�")
.Add("ë", "�")
.Add("½", "�")
.Add("¼", "�")
.Add("¾", "�")
.Add(">", ">")
.Add("Í", "�")
.Add("í", "�")
.Add("Î", "�")
.Add("î", "�")
.Add("¡", "�")
.Add("Ì", "�")
.Add("ì", "�")
.Add("¿", "�")
.Add("Ï", "�")
.Add("ï", "�")
.Add("«", "�")
.Add("<", "<")
.Add("¯", "�")
.Add("&hibar;", "�")
.Add("µ", "�")
.Add("·", "�")
.Add(" ", " ")
.Add("¬", "�")
.Add("Ñ", "�")
.Add("ñ", "�")
.Add("Ó", "�")
.Add("ó", "�")
.Add("Ô", "�")
.Add("ô", "�")
.Add("Ò", "�")
.Add("ò", "�")
.Add("ª", "�")
.Add("º", "�")
.Add("Ø", "�")
.Add("ø", "�")
.Add("Õ", "�")
.Add("õ", "�")
.Add("Ö", "�")
.Add("ö", "�")
.Add("¶", "�")
.Add("±", "�")
.Add("£", "�")
.Add(""", """")
.Add("»", "�")
.Add("®", "�")
.Add("§", "�")
.Add("­", "�")
.Add("¹", "�")
.Add("²", "�")
.Add("³", "�")
.Add("ß", "�")
.Add("Þ", "�")
.Add("þ", "�")
.Add("×", "�")
.Add("Ú", "�")
.Add("ú", "�")
.Add("Û", "�")
.Add("û", "�")
.Add("Ù", "�")
.Add("ù", "�")
.Add("¨", "�")
.Add("¨", "�")
.Add("Ü", "�")
.Add("ü", "�")
.Add("Ý", "�")
.Add("ý", "�")
.Add("¥", "�")
.Add("ÿ", "�")
End With
End If
Dim myDE As DictionaryEntry
For Each myDE In myHash
If text.IndexOf(CStr(myDE.Key)) <> -1 Then
text = text.Replace(CStr(myDE.Key), CStr(myDE.Value))
End If
Next
End Sub
'------------------------------------------------------------------------------------------
Protected Sub Trim(ByRef text As String)
text = text.Trim()
text = text.Replace(vbCrLf, " ")
text = text.Replace(vbCr, " ")
text = text.Replace(vbLf, " ")
End Sub
'------------------------------------------------------------------------------------------
Sub RemoveWhiteSpace(ByRef text As String)
Dim RegEx As New RegularExpressions.Regex("\s+")
text = RegEx.Replace(text, " ")
End Sub
#End Region
#Region "---- Private helper functions"
'==========================================================================================
'Private helper functions
'==========================================================================================
Private Function FindNoCase(ByVal text As String, ByVal subText As String, ByVal startIndex As Integer) As Integer
'get lowercase version of substring
Dim lcSubText As String = subText.ToLower
'initialize indices
Dim textstartIndex As Integer = startIndex
Dim subtextstartIndex As Integer = 0
'walk trough string
While textstartIndex < text.Length
Dim ch As Char
ch = ch.ToLower(text.Chars(textstartIndex))
If subtextstartIndex < lcSubText.Length Then
If ch = lcSubText.Chars(subtextstartIndex) Then
'if source and substring characters match, continue seraching for the rest of the substring
textstartIndex += 1
subtextstartIndex += 1
Else
'otherwise restart search from the next location in source string
startIndex += 1
textstartIndex = startIndex
subtextstartIndex = 0
End If
Else
'if the entire substring has been found, return its location in the source string
Return startIndex
End If
End While
'if the search ended because the entire source string segment matched, return the location where the match began
If subtextstartIndex = lcSubText.Length Then
Return startIndex
End If
'otherwise indicate that the substring wasnt found
Return -1
End Function
'------------------------------------------------------------------------------------------
Private Function FindRevNoCase(ByVal text As String, ByVal subText As String, ByVal startIndex As Integer) As Integer
'get lowercase version of substring
Dim lcSubText As String = subText.ToLower
'initialize indices
Dim textstartIndex As Integer = startIndex
Dim subtextstartIndex As Integer = 0
'walk trough string
While textstartIndex < text.Length
Dim ch As Char
ch = ch.ToLower(text.Chars(textstartIndex))
If subtextstartIndex < lcSubText.Length Then
If ch = lcSubText.Chars(subtextstartIndex) Then
'if source and substring characters match, continue seraching for the rest of the substring
textstartIndex += 1
subtextstartIndex += 1
Else
'otherwise restart search from the next location in source string
startIndex -= 1
textstartIndex = startIndex
subtextstartIndex = 0
End If
Else
'if the entire substring has been found, return its location in the source string
Return startIndex
End If
End While
'if the search ended because the entire source string segment matched, return the location where the match began
If subtextstartIndex = lcSubText.Length Then
Return startIndex
End If
'otherwise indicate that the substring wasnt found
Return -1
End Function
'------------------------------------------------------------------------------------------
Private Function FindStringInArray(ByVal searchArray As String(), ByVal searchText As String) As Integer
Dim newIndex As Integer = 0
While newIndex < searchArray.Length
If [String].Compare(searchText, searchArray(newIndex)) = 0 Then
Return newIndex
End If
newIndex += 1
End While
Return -1
End Function
'------------------------------------------------------------------------------------------
Sub GetContent(ByVal url As String)
Dim myWebClient As WebClient = New WebClient()
Dim parameterTable As New Collections.Specialized.NameValueCollection()
Dim responseArray As Byte()
If IsPost() Then
GetPostData(parameterTable)
Try
responseArray = myWebClient.UploadValues(url, "POST", parameterTable)
Catch ex As Exception
Throw New System.Exception("Error reading from website (POST)", ex)
End Try
Else
Try
responseArray = myWebClient.DownloadData(url)
Catch ex As Exception
Throw New System.Exception("Error reading from website (POST)", ex)
End Try
End If
myContent = Encoding.Default.GetString(responseArray)
myWebClient.Dispose()
End Sub
#End Region
End Class