
Image and Location Search(aroo)

11 Jun 2008 · CPOL · 12 min read
Index your website (including images and geographic data), search and display results in Google Earth.
using System;
using System.Xml.Serialization;
using System.Collections.Specialized;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using Searcharoo.Common;

namespace Searcharoo.Indexer
{
    /// <summary>
    /// The Spider that crawls your website, link by link.
    /// </summary>
    /// <remarks>
    /// In Searcharoo v2, this code was 'embedded' in an ASPX page, which made it
    /// easy to report 'progress' via Response.Write statements. The code now uses
    /// an EventHandler to trigger progress reporting in the calling code - so the
    /// output could be Response.Write, saved to a file, or any other mechanism.
    /// v4 takes advantage of this by wrapping the Spider in a console application
    /// so you can run it outside of a website.
    /// 
    /// Some of the references used when researching this code:
    ///
    /// C# and the Web: Writing a Web Client Application with Managed Code in the Microsoft .NET Framework - not helpful...
    /// http://msdn.microsoft.com/msdnmag/issues/01/09/cweb/default.aspx
    ///
    /// Retrieving a List of Links & Images from a Web Page
    /// http://www.dotnetjunkies.com/Tutorial/1B219C93-7702-4ADF-9106-DFFDF90914CF.dcik
    /// 
    /// FUTURE: In case connecting via a Proxy is required for the spidering
    /// http://www.experts-exchange.com/Programming/Programming_Languages/Dot_Net/Q_20974147.html
    /// http://msdn.microsoft.com/library/en-us/cpref/html/frlrfsystemnetglobalproxyselectionclasstopic.asp
    /// </remarks>
    public class Spider
    {
        #region Private fields: visited, catalog, stemmer, stopper, goword, robot
        private Uri _CurrentStartUri = null;
        private string _CurrentStartUriString = String.Empty;
        /// <summary>Urls already visited, to prevent re-spidering</summary>
        private ArrayList _Visited = new ArrayList();
        /// <summary>Hashtable lookup of visited Urls</summary>
        private Hashtable _VisitedHashtable = new Hashtable();
        /// <summary>Catalog being built by this Spider run</summary>
        private Catalog _Catalog;
        
        /// <summary>Stemmer to use</summary>
        private IStemming _Stemmer;

        /// <summary>Stopper to use</summary>
        private IStopper _Stopper;

        /// <summary>Go word parser to use</summary>
        private IGoWord _GoChecker;

        /// <summary>Loads and acts as 'authorisation' for robot-excluded Urls</summary>
        private RobotsTxt _Robot;

        /// <summary>Cookie container shared across requests, so session cookies persist (SIMONJONES)</summary>
        System.Net.CookieContainer _CookieContainer = new System.Net.CookieContainer();
        #endregion

        #region Public events/handlers: SpiderProgressEvent
        /// <summary>
        /// Event Handler to communicate progress and errors back to the calling code
        /// </summary>
        /// <remarks>
        /// Learn about Events from a few different places
        /// http://www.codeproject.com/csharp/csevents01.asp
        /// http://www.csharphelp.com/archives/archive253.html
        /// http://www.devhood.com/Tutorials/tutorial_details.aspx?tutorial_id=380
        /// </remarks>
        public event SpiderProgressEventHandler SpiderProgressEvent;

        /// <summary>
        /// Only trigger the event if a Handler has been attached.
        /// </summary>
        private void ProgressEvent(object sender, ProgressEventArgs pea)
        {
            if (this.SpiderProgressEvent != null)
            {
                SpiderProgressEvent(sender, pea);
            }
        }
        #endregion
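        /* Illustrative usage (a sketch only - the host program and handler shown
           here are hypothetical, and it assumes ProgressEventArgs exposes the
           message string passed to its constructor):

             Spider spider = new Spider();
             spider.SpiderProgressEvent += new SpiderProgressEventHandler(OnSpiderProgress);
             Catalog catalog = spider.BuildCatalog(new Uri("http://localhost/"));

             static void OnSpiderProgress(object source, ProgressEventArgs pea)
             {   // a console host might simply echo each progress message
                 Console.WriteLine(pea.Message);
             }
        */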

        /// <summary>
        /// Takes a single Uri (Url) and returns the catalog that is generated
        /// by following all the links from that point.
        /// </summary>
        /// <remarks>
        /// This is the MAIN method of the indexing system.
        /// </remarks>
        public Catalog BuildCatalog (Uri startPageUri)
        {
            _Catalog = new Catalog();
            _CurrentStartUri = startPageUri;    // to compare against fully qualified links
            _CurrentStartUriString = _CurrentStartUri.AbsoluteUri.ToLower(); // AbsoluteUri is already a string
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (single Uri) " + startPageUri.AbsoluteUri));
            // Setup Stop, Go, Stemming
            SetPreferences();

            _Robot = new RobotsTxt(startPageUri, Preferences.RobotUserAgent);

            // GETS THE FIRST DOCUMENT, AND STARTS THE SPIDER! -- create the 'root' document to start the search
            // HtmlDocument htmldoc = new HtmlDocument(startPageUri);
            // RECURSIVE CALL TO 'Process()' STARTS HERE
            ProcessUri(startPageUri, 0);

            // Now we've FINISHED Spidering
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog() complete."));
            ProgressEvent(this, new ProgressEventArgs(2, "Serializing to disk location " + Preferences.CatalogFileName));

            // Serialization of the Catalog, so we can load it again if the server Application is restarted
            _Catalog.Save();

            ProgressEvent(this, new ProgressEventArgs(3, "Save to disk " + Preferences.CatalogFileName + " successful"));

            return _Catalog;// finished, return to the calling code to 'use'
        }
        /// <summary>
        /// Takes an array of start Uris and returns the catalog generated
        /// by following all the links from each starting point.
        /// </summary>
        /// <param name="startPageUris">array of start pages</param>
        /// <returns>the Catalog generated from all start Uris</returns>
        public Catalog BuildCatalog(Uri[] startPageUris)
        {
            _Catalog = new Catalog();
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (Uri Array) count: " + startPageUris.Length.ToString()));
            // Setup Stop, Go, Stemming
            SetPreferences();

            foreach (Uri startPageUri in startPageUris)
            {
                _CurrentStartUri = startPageUri;    // to compare against fully qualified links (as in the single-Uri overload)
                _CurrentStartUriString = _CurrentStartUri.AbsoluteUri.ToLower();
                ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (start Uri) " + startPageUri.AbsoluteUri));

                _Robot = new RobotsTxt(startPageUri, Preferences.RobotUserAgent);

                // GETS THE FIRST DOCUMENT, AND STARTS THE SPIDER! -- create the 'root' document to start the search
                // HtmlDocument htmldoc = new HtmlDocument(startPageUri);
                // RECURSIVE CALL TO 'Process()' STARTS HERE
                ProcessUri(startPageUri, 0);

                ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (end Uri) " + startPageUri.AbsoluteUri));
            }
            // Now we've FINISHED Spidering
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog() complete."));
            ProgressEvent(this, new ProgressEventArgs(2, "Serializing to disk location " + Preferences.CatalogFileName));

            // Serialization of the Catalog, so we can load it again if the server Application is restarted
            _Catalog.Save();

            ProgressEvent(this, new ProgressEventArgs(3, "Save to disk " + Preferences.CatalogFileName + " successful"));

            return _Catalog;// finished, return to the calling code to 'use'
        }

        /// <summary>
        /// Setup Stop, Go, Stemming
        /// </summary>
        private void SetPreferences()
        {
            switch (Preferences.StemmingMode)
            {
                case 1: // (originally intended for a Snowball stemmer)
                case 2: // both modes currently use the Porter stemmer
                    ProgressEvent(this, new ProgressEventArgs(1, "Stemming enabled."));
                    _Stemmer = new PorterStemmer();
                    break;
                default:
                    ProgressEvent(this, new ProgressEventArgs(1, "Stemming DISabled."));
                    _Stemmer = new NoStemming();
                    break;
            }
            switch (Preferences.StoppingMode)
            {
                case 1:
                    ProgressEvent(this, new ProgressEventArgs(1, "Stop words shorter than 3 chars."));
                    _Stopper = new ShortStopper();
                    break;
                case 2:
                    ProgressEvent(this, new ProgressEventArgs(1, "Stop words from list."));
                    _Stopper = new ListStopper();
                    break;
                default:
                    ProgressEvent(this, new ProgressEventArgs(1, "Stopping DISabled."));
                    _Stopper = new NoStopping();
                    break;
            }
            switch (Preferences.GoWordMode)
            {
                case 1:
                    ProgressEvent(this, new ProgressEventArgs(1, "Go Words enabled."));
                    _GoChecker = new ListGoWord();
                    break;
                default:
                    ProgressEvent(this, new ProgressEventArgs(1, "Go Words DISabled."));
                    _GoChecker = new NoGoWord();
                    break;
            }
        }
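        /* The three modes above come from Preferences (backed by .config appSettings).
           A sketch of the corresponding settings - the key names here are illustrative
           and may not match Searcharoo's actual .config exactly:

             <appSettings>
               <add key="Searcharoo_StemmingMode" value="1" />  <!-- 0=off; 1,2=Porter stemmer -->
               <add key="Searcharoo_StoppingMode" value="2" />  <!-- 0=off; 1=short words; 2=stop list -->
               <add key="Searcharoo_GoWordMode"   value="0" />  <!-- 0=off; 1=go-word list -->
             </appSettings>
        */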


        /// <summary>
        /// Recursive 'process' method: takes the input Uri, downloads it (following
        /// redirects if required) to receive a Document subclass, then calls the
        /// Parse() method to extract the words, which are then added to the Catalog.
        /// </summary>
        protected int ProcessUri(Uri uri, int level)
        {
            // [j105 Rob] recursion fix 
            // http://www.codeproject.com/aspnet/Spideroo.asp?df=100&forumid=71481&select=1862807#xx1862807xx
            if (level > Preferences.RecursionLimit) return Preferences.RecursionLimit;

            int wordcount = 0;
            string url = uri.AbsoluteUri.ToLower(); // v6

            if (!_Robot.Allowed(uri))
            {
                ProgressEvent(this, new ProgressEventArgs(2, "RobotsTxt exclusion prevented indexing of " + url + ""));
            }
            else
            {
                if (_Visited.Contains(url))
                {
                    ProgressEvent(this, new ProgressEventArgs(2, url + " already spidered"));
                }
                else
                {
                    _Visited.Add(url);
                    ProgressEvent(this, new ProgressEventArgs(2, url + " being downloaded"));
                    // ### IMPORTANT ### 
                    // Uri is actually retrieved here!
                    Document downloadDocument = Download(uri);

                    if (null == downloadDocument)
                    {
                        ProgressEvent(this, new ProgressEventArgs(1, "Download() failed on " + url + ""));
                    }
                    else
                    {
                        // ### IMPORTANT ### 
                        // Uri is actually parsed here!
                        downloadDocument.Parse();
                        if (downloadDocument.RobotIndexOK)
                        {
                            wordcount = AddToCatalog (downloadDocument);
                        }
                        else
                        {
                            ProgressEvent(this, new ProgressEventArgs(2, "RobotMeta exclusion prevented indexing of " + url + ""));
                        }
                    }

                    if (wordcount > 0)
                    {
                        ProgressEvent(this, new ProgressEventArgs(1, downloadDocument.Title + " parsed " + wordcount + " words!"));
                        ProgressEvent(this, new ProgressEventArgs(4, downloadDocument.Title + " " + downloadDocument.Uri.AbsoluteUri + System.Environment.NewLine
                                                                    + (downloadDocument.RobotIndexOK ? "Indexed" : "RobotMeta Excluded Index")
                                                                    + downloadDocument.Description));
                    }
                    else
                    {
                        ProgressEvent(this, new ProgressEventArgs(2, url + " parsed but zero words found."));
                    }

                    if (null != downloadDocument)
                    {   // null check must come before ExternalLinks is dereferenced
                        // Move some 'External' links to 'Local' where they match the current start Uri
                        ArrayList elinks = (ArrayList)downloadDocument.ExternalLinks.Clone();
                        for (int l = 0; l < elinks.Count; l++)
                        {
                            string link = elinks[l].ToString();
                            if (link.ToLower().StartsWith(this._CurrentStartUriString))
                            {
                                downloadDocument.ExternalLinks.Remove(link);
                                downloadDocument.LocalLinks.Add(link);
                            }
                        }

                        // ### Loop through the 'local' links in the document ###
                        // ### and parse each of them recursively ###
                        if (null != downloadDocument.LocalLinks && downloadDocument.RobotFollowOK)
                        {   // only if the Robot meta says it's OK
                            foreach (object link in downloadDocument.LocalLinks)
                            {
                                try
                                {
                                    Uri urlToFollow = new Uri(downloadDocument.Uri, link.ToString());
                                    ProcessUri(urlToFollow, level + 1); // calls THIS method!
                                }
                                catch (Exception ex)
                                {
                                    ProgressEvent(this, new ProgressEventArgs(2, "new Uri(" + downloadDocument.Uri + ", " + link.ToString() + ") invalid : " + ex.Message));
                                }
                            }
                        } // process local links
                    } // document downloaded
                } // not visited
            } // robot allowed
            return level;
        }
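        /* Example trace (illustrative): with Preferences.RecursionLimit = 2,
           ProcessUri(start, 0) follows the start page's links via ProcessUri(child, 1),
           those pages' links via ProcessUri(grandchild, 2), and any call at level 3
           returns immediately - so pages more than two links deep are not indexed. */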

        /// <summary>
        /// Attempts to download the Uri and (based on its MimeType) uses the DocumentFactory
        /// to get a Document subclass object that is able to parse the downloaded data.
        /// </summary>
        /// <remarks>
        /// http://www.123aspx.com/redir.aspx?res=28320
        /// </remarks>
        protected Document Download (Uri uri)
        {
            bool success = false;
            // Open the requested URL

            System.Net.WebProxy proxyObject = null;
            if (Preferences.UseProxy)
            {   // [v6] stephenlane80 suggested proxy code
                proxyObject = new System.Net.WebProxy(Preferences.ProxyUrl, true);
                proxyObject.Credentials = System.Net.CredentialCache.DefaultCredentials;
            }
            // [v6] Erick Brown [work] suggested fix for & in querystring
            string unescapedUri = Regex.Replace(uri.AbsoluteUri, @"&amp;", @"&", RegexOptions.IgnoreCase);
            System.Net.HttpWebRequest req = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(unescapedUri); 

            req.AllowAutoRedirect = true;
            req.MaximumAutomaticRedirections = 3;
            req.UserAgent = Preferences.UserAgent; //"Mozilla/6.0 (MSIE 6.0; Windows NT 5.1; Searcharoo.NET; robot)";
            req.KeepAlive = true;
            req.Timeout = Preferences.RequestTimeout * 1000; // Preferences value is in seconds; Timeout is in milliseconds
            if (Preferences.UseProxy) req.Proxy = proxyObject; // [v6] stephenlane80

            // SIMONJONES http://codeproject.com/aspnet/spideroo.asp?msg=1421158#xx1421158xx
            req.CookieContainer = new System.Net.CookieContainer();
            req.CookieContainer.Add(_CookieContainer.GetCookies(uri));

            // Get the stream from the returned web response
            System.Net.HttpWebResponse webresponse = null;
            try
            {
                webresponse = (System.Net.HttpWebResponse)req.GetResponse();
            }
            catch (System.Net.WebException we)
            {   //remote url not found, 404; remote url forbidden, 403
                ProgressEvent(this, new ProgressEventArgs(2, "skipped  " + uri.AbsoluteUri + " response exception:" + we.ToString() + ""));
            }

            Document currentUriDocument = null;
            if (webresponse != null)
            {
                /* SIMONJONES */
                /* **************** this doesn't necessarily work yet...
                if (webresponse.ResponseUri != htmldoc.Uri)
                {	// we've been redirected, 
                    if (visited.Contains(webresponse.ResponseUri.ToString().ToLower()))
                    {
                        return true;
                    }
                    else
                    {
                        visited.Add(webresponse.ResponseUri.ToString().ToLower());
                    }
                }*/

                try
                {
                    webresponse.Cookies = req.CookieContainer.GetCookies(req.RequestUri);
                    // handle cookies (need to do this in case we have any session cookies)
                    foreach (System.Net.Cookie retCookie in webresponse.Cookies)
                    {
                        bool cookieFound = false;
                        foreach (System.Net.Cookie oldCookie in _CookieContainer.GetCookies(uri))
                        {
                            if (retCookie.Name.Equals(oldCookie.Name))
                            {
                                oldCookie.Value = retCookie.Value;
                                cookieFound = true;
                            }
                        }
                        if (!cookieFound)
                        {
                            _CookieContainer.Add(retCookie);
                        }
                    }
                }
                catch (Exception ex)
                {
                    ProgressEvent(this, new ProgressEventArgs(3, "Cookie processing error : " + ex.Message + ""));
                }
                /* end SIMONJONES */
                
                currentUriDocument = DocumentFactory.New(uri, webresponse);
                success = currentUriDocument.GetResponse(webresponse);
                webresponse.Close();
                ProgressEvent(this, new ProgressEventArgs(2, "Trying index mime type: " + currentUriDocument.MimeType + " for " + currentUriDocument.Uri + ""));
            }
            else
            {
                ProgressEvent(this, new ProgressEventArgs(2, "No WebResponse for " + uri + ""));
                success = false;
            }
            return currentUriDocument;
        }

        /// <summary>
        /// Adds the Document subclass to the catalog, by first 'copying' the main
        /// properties into a File object. The distinction is a bit arbitrary: Documents
        /// are downloaded and indexed, but their content is modelled as a File
        /// in the Catalog (and represented as a ResultFile object in the search ASPX page).
        /// </summary>
        /// <returns>Number of words catalogued in the Document</returns>
        protected int AddToCatalog(Document downloadDocument)
        {
            File infile = new File(downloadDocument.Uri.AbsoluteUri
                , downloadDocument.Title
                , downloadDocument.Description
                , DateTime.Now
                , downloadDocument.Length
                , downloadDocument.GpsLocation
                , downloadDocument.Extension
                , downloadDocument.KeywordString); // [v6] Gps, Extension, keywords

            // ### Loop through words in the file ###
            int i = 0;          // count of words
            string key = "";    // temp variables
            
            foreach (string word in downloadDocument.WordsArray)
            {
                key = word.ToLower();
                if (!_GoChecker.IsGoWord(key))
                {	// not a special case, parse like any other word
                    RemovePunctuation(ref key);

                    if (!IsNumber(ref key))
                    {   // not a number, so get rid of numeric separators and catalog as a word
                        // TODO: remove inline punctuation, split hyphenated words?
                        // http://blogs.msdn.com/ericgu/archive/2006/01/16/513645.aspx
                        key = System.Text.RegularExpressions.Regex.Replace(key, "[,.]", "", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                        // Apply Stemmer (set by preferences)
                        key = _Stemmer.StemWord(key);

                        // Apply Stopper (set by preferences)
                        key = _Stopper.StopWord(key);
                    }
                }
                else
                {
                    ProgressEvent(this, new ProgressEventArgs(4, "Found GoWord " + key + " in " + downloadDocument.Title));
                }
                if (key != String.Empty)
                {
                    _Catalog.Add(key, infile, i);
                    i++;
                }
            }
            return i;
        }
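        /* Example word pipeline (illustrative; exact output depends on the configured
           stemmer/stopper implementations):
             "Running,"  ->  ToLower "running,"  ->  RemovePunctuation "running"
                         ->  PorterStemmer "run" ->  Stopper passes  ->  catalogued as "run" */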

        /// <summary>
        /// Each word is identified purely by the whitespace around it. It could still include
        /// punctuation attached to either end of the word, or 'in' the word (i.e. a dash,
        /// which we will remove for indexing purposes).
        /// </summary>
        /// <remarks>
        /// Andrey Shchekin suggests 'unicode' regex [\w] - equivalent to [\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}\p{Pc}]
        /// http://www.codeproject.com/cs/internet/Searcharoo_4.asp?df=100&forumid=397394&select=1992575#xx1992575xx
        /// so [^\w0-9,.] as a replacement for [^a-z0-9,.]
        /// which might remove the need for 'AssumeAllWordsAreEnglish'. TO BE TESTED.
        /// </remarks>
        private void RemovePunctuation(ref string word)
        {   // this stuff is a bit 'English-language-centric'
            if (Preferences.AssumeAllWordsAreEnglish)
            {   // if all words are english, this strict parse to remove all punctuation ensures
                // words are reduced to their least unique form before indexing
                //word = System.Text.RegularExpressions.Regex.Replace(word, @"[^a-z0-9,.]", "", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                
                // [v6] testing better i18n
                word = System.Text.RegularExpressions.Regex.Replace(word, @"[^\w0-9,.]", "", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            }
            else 
            {   // by stripping out this specific list of punctuation only, there is potential to leave lots 
                // of cruft in the word before indexing BUT this will allow any language to be indexed
                word = word.Trim(' ','?','\"',',','\'',';',':','.','(',')','[',']','%','*','$','-'); 
            }
        }
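        /* For example (illustrative only): with the unicode-aware pattern,
             Regex.Replace("naïve-user's", @"[^\w0-9,.]", "")  -->  "naïveusers"
           whereas the original [^a-z0-9,.] pattern would also have stripped the 'ï'. */
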
        /// <summary>
        /// TODO: parse numbers properly here,
        /// i.e. remove the thousands separator, currency symbol, etc.
        /// and also trim the decimal part, so number searches are only on the integer value
        /// </summary>
        private bool IsNumber(ref string word)
        {
            try
            {
                long number = Convert.ToInt64(word); // throws if word is not an integer
                word = number.ToString();
                return (word != String.Empty);
            }
            catch
            {
                return false;
            }
        }
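        /* A possible shape for the TODO above (a sketch only, not part of the
           shipping code): decimal.TryParse with NumberStyles accepts thousands
           separators and currency symbols, and truncating means number searches
           match on the integer value only.

             private bool IsNumberSketch(ref string word)
             {
                 decimal value;
                 if (decimal.TryParse(word,
                     System.Globalization.NumberStyles.Number | System.Globalization.NumberStyles.AllowCurrencySymbol,
                     System.Globalization.CultureInfo.CurrentCulture, out value))
                 {
                     word = decimal.Truncate(value).ToString();
                     return true;
                 }
                 return false;
             }
        */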
    }
}


License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

