Click here to Skip to main content
15,881,882 members
Articles / Desktop Programming / Win32

Making a Search Engine

Rate me:
Please Sign up or sign in to vote.
4.94/5 (51 votes)
3 May 2013CPOL6 min read 239.6K   27.6K   124  
This article discusses the making of a search engine.
''' <summary>
''' Coordinates a pool of <c>panda1</c> spiders: hands crawl requests to an idle
''' spider, records link state through <c>url_webpage_manger</c>, and, when
''' auto-start is enabled, gives a spider its next link as soon as it reports
''' that its current work has finished.
''' </summary>
Public Class pandamanger
    Private sql As New url_webpage_manger()
    '' Spider variables
    Dim Employes As New List(Of panda1)
    '' Upper bound on the number of spiders kept in the pool
    Dim _maxEmployes As Integer = 10
    '' When True, a spider that finishes is immediately given the next queued link
    Dim _autoStartNewWork As Boolean = True

    ''' <summary>
    ''' Crawl the given url for keywords and follow the url in the page
    ''' </summary>
    ''' <param name="link">url of the page</param>
    ''' <remarks>
    ''' Does nothing (apart from a console message) when no spider is available.
    ''' A link that is already stored with a state other than
    ''' <c>LinkState.None</c> is not crawled; its state is reported instead.
    ''' </remarks>
    Public Sub Crawl(ByVal link As String)
        Dim panda As panda1 = _getSpider(True)

        '' Check if we have spider
        If panda Is Nothing Then
            Console.WriteLine("panda_manger::crawl()->no spider available")
            Exit Sub
        End If

        Dim lnk As Link = sql.get_link(func.GetMd5Hash(link))

        If lnk._Empty Then
            '' Add the url to db and then proceed
            lnk.inset_url_with_hash(link)
            lnk.backlink = 0
            lnk.crawldate = "null"
            lnk.priority = LinkPriority.High ' Because user has requested to perform
            lnk.state = LinkState.Crawling
            lnk._Empty = False
            sql.add_link(lnk)
        Else
            '' Already in db check if we can crawl or not
            If Not (lnk.state = LinkState.None) Then
                Dim msg As String = String.Format("{0} has a state of {1}", lnk.url, lnk.state)
                Console.WriteLine("panda_manger::crawl()->" + msg)
                MsgBox(msg)
                Exit Sub
            End If
        End If

        '' The WorkComplete handler was attached when the spider was created,
        '' so the completion of this crawl cannot be missed.
        sql.update_urlstate(lnk.urlhash, LinkState.Crawling)
        panda.Crawl(lnk.url)
    End Sub

    ''' <summary>
    ''' Search for an employee sitting idle.
    ''' If there is no employee in the office at all, one is hired automatically.
    ''' </summary>
    ''' <returns>
    ''' An idle employee to assign work to, or Nothing when every employee is busy
    ''' and no new one may be hired.
    ''' </returns>
    ''' <param name="_add_if_needed">
    ''' Whether a new employee may be hired when all existing ones are busy
    ''' (the pool never grows beyond <c>_maxEmployes</c>).
    ''' </param>
    Private Function _getSpider(ByVal _add_if_needed As Boolean) As panda1
        '' We need one employee at least at work, whatever the caller asked for
        If Employes.Count = 0 Then
            Return _hireSpider()
        End If

        For Each emp As panda1 In Employes
            If Not emp.Busy Then
                Return emp
            End If
        Next

        '' If we are here then no employee is idle.
        '' Strict comparison: hiring at Count = _maxEmployes would exceed the limit.
        If _add_if_needed AndAlso Employes.Count < _maxEmployes Then
            Return _hireSpider()
        End If

        Return Nothing
    End Function

    ''' <summary>
    ''' Creates a new spider, subscribes to its WorkComplete event and adds it to the pool.
    ''' </summary>
    ''' <returns>The newly created spider</returns>
    ''' <remarks>
    ''' Subscribing here, exactly once per spider, prevents the handler from being
    ''' attached again every time the same spider is reused, which would run
    ''' <c>_TimeForNextWork</c> several times for a single completion.
    ''' </remarks>
    Private Function _hireSpider() As panda1
        Dim panda As New panda1
        AddHandler panda.WorkComplete, AddressOf _TimeForNextWork
        Employes.Add(panda)
        Return panda
    End Function

    ''' <summary>
    ''' Marks the link as being crawled and starts the given spider on it.
    ''' </summary>
    ''' <param name="panda">Spider that will do the work; must not be Nothing</param>
    ''' <param name="lnk">Link to crawl</param>
    Private Sub _assignWork(ByVal panda As panda1, ByVal lnk As Link)
        Console.WriteLine("Spider got new work " + lnk.url)
        sql.update_urlstate(lnk.urlhash, LinkState.Crawling)
        panda.Crawl(lnk.url)
    End Sub

    ''' <summary>
    ''' WorkComplete handler: stores the outcome of the finished crawl and, when
    ''' auto-start is enabled, gives the same spider the next available link.
    ''' </summary>
    ''' <param name="sender">Spider that finished its work</param>
    ''' <param name="state">Outcome reported by the spider</param>
    Private Sub _TimeForNextWork(ByVal sender As panda1, ByVal state As WorkState)
        Try
            Select Case state
                Case WorkState.Url_error, WorkState.Work_error
                    sql.update_urlstate(sender.URL_Hash, LinkState.CrawlingError)
                    Console.WriteLine("Spider got error on work complete")
                Case WorkState.Complete
                    sql.update_urlstate(sender.URL_Hash, LinkState.Crawled)
                    Console.WriteLine("Spider got work complete")
            End Select
            If _autoStartNewWork Then
                Dim lnk As Link = sql.get_work()
                If Not lnk._Empty Then
                    _assignWork(sender, lnk)
                Else
                    Console.WriteLine("No work to do")
                End If
            End If
        Catch ex As Exception
            '' Keep the best-effort behaviour (never throw out of the event
            '' handler) but do not hide the failure.
            Console.WriteLine("panda_manger::_TimeForNextWork()->" + ex.ToString())
        End Try
    End Sub

    ''' <summary>
    ''' Starts crawling the next available link, if auto-start is enabled and
    ''' both a link and a spider are available.
    ''' </summary>
    Public Sub self_start()
        Try
            If _autoStartNewWork Then
                Dim lnk As Link = sql.get_work()
                If Not lnk._Empty Then
                    Dim panda As panda1 = _getSpider(True)
                    '' Check for a spider BEFORE marking the link as Crawling,
                    '' otherwise the link would stay stuck in that state.
                    If panda Is Nothing Then
                        Console.WriteLine("panda_manger::self_start()->no spider available")
                        Exit Sub
                    End If
                    _assignWork(panda, lnk)
                Else
                    Console.WriteLine("No work to do")
                End If
            End If
        Catch ex As Exception
            '' Best effort: report the failure instead of swallowing it silently
            Console.WriteLine("panda_manger::self_start()->" + ex.ToString())
        End Try
    End Sub
End Class

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Student
India India
I just love coding, but because of my studies it has become very tough for me to manage both.

Comments and Discussions