65.9K
CodeProject is changing. Read more.
Home

Parsing an HTML document by using a recursive function

starIconstarIconstarIconstarIconstarIcon

5.00/5 (3 votes)

Mar 22, 2010

CPOL
viewsIcon

54719

This is an example of one way to parse an HTML document by using a recursive function. In this example, an HTML document is loaded from a text file, but the code also demonstrates (in remarks) using a web page as a source file. When the recursive function is called, a conditional statement...

This is an example of one way to parse an HTML document by using a recursive function. In this example, an HTML document is loaded from a text file, but the code also demonstrates (in commented-out remark lines) using a web page as the source instead. When the recursive function is called, a conditional statement checks each HTML element for child elements. If the element has children, the recursion occurs (the function calls itself) and the children of that element are evaluated for children in turn. Eventually, the function finds an element with no children, and that element's inner text, etc., is appended to a text box. Running this program in debug mode might be helpful if my description is confusing. Import the namespaces as shown:
Imports System.Windows.Forms.HtmlDocument
Imports System.Xml
    ''' <summary>
    ''' Loads an HTML document from a text file into WebBrowser1 and writes a
    ''' textual outline of its head and body elements through append().
    ''' </summary>
    ''' <param name="sender">The control that raised the event (not used).</param>
    ''' <param name="e">Event data (not used).</param>
    ''' <remarks>
    ''' Any exception raised while reading the file or walking the document is
    ''' caught and its full text is written through append() instead of being
    ''' rethrown.
    ''' </remarks>
    Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click

        Try
            'Alternative source: download the HTML from a web page instead of a file.
            'Dim webclient As System.Net.WebClient = New System.Net.WebClient
            'Dim url As String = "http://www.somewebsite.com"
            Dim myHTML As String '= webclient.DownloadString(url)

            'instead of downloading the html, lets get it from a file
            Dim filePath As String = "C:\htmlsourcefile.txt"

            'Using guarantees the reader (and its file handle) is released even
            'if ReadToEnd throws.
            Using myStreamReader As New System.IO.StreamReader(filePath)
                myHTML = myStreamReader.ReadToEnd()
            End Using

            'Suppress script error dialogs before any markup is written, so the
            'setting is already in effect when the written HTML is processed.
            WebBrowser1.ScriptErrorsSuppressed = True
            WebBrowser1.Navigate("about:blank")
            WebBrowser1.Document.Write(myHTML)

            Dim HTMLDocument As HtmlDocument = WebBrowser1.Document

            append("The document title is: " & HTMLDocument.Title)

            Dim headElementCollection As HtmlElementCollection = _
            HTMLDocument.GetElementsByTagName("head")

            'call the function (no value is returned)
            getChildren(headElementCollection)
            append(vbCrLf)

            'same function again, just for the body this time
            Dim bodyElementCollection As HtmlElementCollection = _
            HTMLDocument.GetElementsByTagName("body")
            getChildren(bodyElementCollection)

        Catch ex As Exception

            'Report the failure in the output text rather than crashing the form.
            append(ex.ToString)

        End Try

    End Sub
This is the recursive function:
    ''' <summary>
    ''' Recursively walks a collection of HTML elements and writes an outline of
    ''' it through append().
    ''' </summary>
    ''' <param name="xElementCollection">The elements to walk.</param>
    ''' <returns>Always Nothing; the function is called for its output only.</returns>
    ''' <remarks>
    ''' An element that has children is written as an opening marker, followed
    ''' by the outline of its children, followed by a marker of the form
    ''' "&lt;Label /&gt;". An element without children is written as its inner
    ''' text (or the text of vbNull.ToString when the inner text is empty),
    ''' followed by its href attribute when that attribute is not empty.
    ''' </remarks>
    Private Function getChildren(ByVal xElementCollection As HtmlElementCollection)

        For Each parentElement As HtmlElement In xElementCollection
            If parentElement.Children.Count > 0 Then

                Dim xLabel As String

                'ToLowerInvariant keeps the match independent of the current
                'culture (for example, a Turkish culture lowercases "I" to a
                'dotless "ı", which would stop "DIV" from matching "div").
                Select Case parentElement.TagName.ToLowerInvariant()
                    Case "tr" : xLabel = "Row"
                    Case "td" : xLabel = "Cell"
                    Case "th" : xLabel = "Header"
                    Case "a" : xLabel = "Anchor"
                    Case "tbody" : xLabel = "T-Body"
                    Case "div" : xLabel = "Division"
                    Case "head" : xLabel = "Head"
                    Case "body" : xLabel = "Body"
                    Case "table" : xLabel = "Table"
                    Case "p" : xLabel = "Paragraph"
                    Case Else : xLabel = "element not specified"
                End Select

                append("<" & xLabel & ">")
                'Recurse: the children are outlined between the two markers.
                getChildren(parentElement.Children)
                append("<" & xLabel & " />")

            Else

                'Leaf element: write its text, or a placeholder when it has none.
                If parentElement.InnerText <> "" Then
                    append("     " & parentElement.InnerText)
                Else
                    append("     " & vbNull.ToString)
                End If

                'Read the attribute once and reuse the value.
                Dim href As String = parentElement.GetAttribute("href")
                If href <> "" Then
                    append("     " & href)
                End If

            End If
        Next

        Return Nothing

    End Function

One last thing: I prefer not to call TextBox1.AppendText("one two three") directly everywhere, so I do it this way:
    ''' <summary>
    ''' Writes one line of output: the text plus a trailing CR/LF is appended to
    ''' TextBox1 and to the outputXL accumulator.
    ''' </summary>
    ''' <param name="myTextToAppend">The text to write, without a line terminator.</param>
    Private Sub append(ByVal myTextToAppend As String)
        'Build the terminated line once and reuse it for both destinations.
        Dim terminatedLine As String = myTextToAppend & vbCrLf

        TextBox1.AppendText(terminatedLine)
        'Presumably here so the text box repaints while a long parse is running.
        Application.DoEvents()

        outputXL = outputXL & terminatedLine
    End Sub