Click here to Skip to main content
15,893,487 members
Articles / Web Development / HTML

HTML Parser Technique for Parsing Search Engines (Google)

Rate me:
Please Sign up or sign in to vote.
3.06/5 (11 votes)
23 Sep 2005 · 4 min read · 131.8K   2.6K   75  
Set of libraries for parsing results of popular search engines (Google, Yahoo!, Lycos, MSN, Netscape, Ask, AllTheWeb, AltaVista).
using System;
using System.Collections;
using System.Text.RegularExpressions;

namespace Slaks.Web.Parser
{
	/// <summary>
	/// Search engine parser for Google result pages. Builds the request URL
	/// from the query and a start offset, collects result links from the parsed
	/// HTML structure and extracts the reported total number of results.
	/// </summary>
	public class GoogleParser : SearchEngineParser
	{
		// Captures, in the named group "total", the text between "about" and "for".
		private static readonly string TotalSearchResultPattern = @".*?about\s+(?<total>.*?)\s+for.*";
		// Cheap substring pre-check performed before the regular expression is run.
		private static readonly string SearchResultTermPattern = "of about";

		// Built once and reused; declared after the pattern string because static
		// initializers run in textual order.
		private static readonly Regex TotalSearchResultRegex =
			new Regex(TotalSearchResultPattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);

		private string m_queryPathString;
		private string m_startQuerySearchPattern = "http://www.google.com/search?hl=en&hs=23e&lr=&rls=en&ie=utf-8&oe=utf-8&sa=N&q=";
		private string m_startSearchPattern = "&start=";

		/// <summary>
		/// Creates a parser for the given query, pointing at the first result
		/// page (start offset 0), and loads "google.config".
		/// </summary>
		/// <param name="query">Search query, forwarded to the base class.</param>
		public GoogleParser(string query) : base(query)
		{
			m_queryPathString = BuildQueryPath(0);
			m_fileName = m_queryPathString;
			m_encoding = System.Text.Encoding.UTF8;
			m_source = ReadSource.Web;
			m_baseUri = new Uri(m_fileName);
			m_addressLinkLocation = new Uri("http://www.google.com");
			base.LoadConfiguration("google.config");
		}

		/// <summary>
		/// Builds the request URL for the result page starting at the given offset.
		/// </summary>
		/// <param name="startIndex">Value appended after the start parameter.</param>
		/// <returns>URL prefix + query + start parameter + offset.</returns>
		private string BuildQueryPath(int startIndex)
		{
			// NOTE(review): m_query is appended as-is. If the base class does not
			// already URL-escape it, queries containing '&', '#', '+' or spaces
			// will produce a wrong request - confirm and escape if needed.
			return m_startQuerySearchPattern
				+ m_query
				+ m_startSearchPattern
				+ startIndex.ToString(System.Globalization.CultureInfo.InvariantCulture);
		}

		/// <summary>
		/// Decides whether an anchor should be reported as a search result.
		/// Anchors whose text contains "cached", "similar" or "view as", and
		/// anchors whose target contains "google", are rejected.
		/// </summary>
		private static bool IsResultAnchor(string text, string href)
		{
			if (text.IndexOf("cached") >= 0) return false;
			if (text.IndexOf("similar") >= 0) return false;
			if (text.IndexOf("view as") >= 0) return false;
			if (href.IndexOf("google") >= 0) return false;
			return true;
		}

		/// <summary>
		/// Recursively walks the HTML structure and adds every accepted anchor
		/// found inside "PARAGRAPH" nodes to <paramref name="linkCollection"/>.
		/// </summary>
		/// <param name="structure">Node to inspect; null is ignored.</param>
		/// <param name="linkCollection">Collection that receives the links.</param>
		protected override void GetLinks(HtmlStructure structure,AddressLinkCollection linkCollection)
		{
			if (structure == null) return;

			if (structure.TagName == "PARAGRAPH")
			{
				IList anchors = structure.Anchors;
				if (anchors != null && anchors.Count != 0)
				{
					foreach(HtmlAnchor anchor in anchors)
					{
						// An anchor without a target cannot be reported as a link.
						if (anchor == null || anchor.Href == null) continue;

						string href = anchor.Href.ToString();
						// Treat missing anchor text as empty instead of throwing.
						string text = anchor.Text == null ? string.Empty : anchor.Text;

						if (!IsResultAnchor(text, href)) continue;

						linkCollection.Add(new AddressLink(href, text));
					}
				}
			}

			// Anchors is null-checked above, so guard the child list the same way.
			IList children = structure.Structure;
			if (children == null) return;

			foreach(HtmlStructure child in children)
			{
				GetLinks(child,linkCollection);
			}
		}

		/// <summary>
		/// Recursively searches the HTML structure, depth first, for a "TABLE"
		/// node whose text contains the total result count. On success the count
		/// is also stored in m_totalSearchResults and m_isTotaSearchResultsFound
		/// is set.
		/// </summary>
		/// <param name="structure">Node to inspect; null yields -1.</param>
		/// <returns>The total that was found, or -1 when none was found.</returns>
		protected override int GetTotalSearchResults(HtmlStructure structure)
		{
			int totalSearchResults = -1;
			if (structure == null) return -1;

			if (structure.TagName == "TABLE")
			{
				if (FindTotalSearchResults(structure.TextArray,out totalSearchResults))
				{
					m_totalSearchResults = totalSearchResults;
					m_isTotaSearchResultsFound = true;
					return totalSearchResults;
				}
			}

			IList children = structure.Structure;
			if (children == null) return totalSearchResults;

			foreach(HtmlStructure child in children)
			{
				totalSearchResults = GetTotalSearchResults(child);
				// Stop at the first subtree that produced a count.
				if (totalSearchResults >= 0) break;
			}
			return totalSearchResults;
		}

		/// <summary>
		/// Tries to extract the total result count from a piece of page text.
		/// </summary>
		/// <param name="text">Text to examine; null is treated as "not found".</param>
		/// <param name="total">Parsed count on success, otherwise -1.</param>
		/// <returns>
		/// True when the text contains "of about", the pattern matches and the
		/// captured value (thousands separators removed) parses as an int.
		/// Values that are not numeric or do not fit into an int yield false.
		/// </returns>
		protected override bool FindTotalSearchResults(string text,out int total)
		{
			total = -1;
			if (text == null) return false;
			if (text.IndexOf(SearchResultTermPattern) < 0) return false;

			Match m = TotalSearchResultRegex.Match(text);
			if (!m.Success) return false;

			string totalString = m.Groups["total"].Value.Replace(",","");
			try
			{
				// The page is requested with hl=en, so parse independently of the
				// culture of the current thread.
				total = int.Parse(totalString, System.Globalization.CultureInfo.InvariantCulture);
			}
			catch(FormatException)
			{
				return false;
			}
			catch(OverflowException)
			{
				return false;
			}
			return true;
		}

		/// <summary>
		/// Points the parser at the result page starting at the given offset,
		/// parses it and, when parsing succeeds, replaces the link collection
		/// with the links of that page and updates the retrieved-link counters.
		/// </summary>
		/// <param name="startIndex">Start offset of the page to request.</param>
		/// <returns>The result of ParseMe().</returns>
		private bool SearchFrom(int startIndex)
		{
			m_fileName = m_queryPathString = BuildQueryPath(startIndex);
			m_baseUri = new Uri(m_fileName);

			bool isParsed = this.ParseMe();
			if (isParsed)
			{
				m_addressLinkCollection = new AddressLinkCollection();
				this.GetLinks(this.RootStructure,m_addressLinkCollection);
				m_numberOfLinksRetrieved = m_addressLinkCollection.Count;
				m_totalLinksRetrieved += m_numberOfLinksRetrieved;
			}

			return isParsed;
		}

		/// <summary>
		/// Fetches the next result page, using the number of links retrieved so
		/// far as the start offset.
		/// </summary>
		/// <returns>True when the page was parsed.</returns>
		public override bool Search()
		{
			return SearchFrom(m_totalLinksRetrieved);
		}

		/// <summary>
		/// Fetches the result page that starts at <paramref name="nextIndex"/>.
		/// </summary>
		/// <param name="nextIndex">Start offset of the page to request.</param>
		/// <returns>True when the page was parsed.</returns>
		public override bool Search(int nextIndex)
		{
			return SearchFrom(nextIndex);
		}

		/// <summary>Returns the display name of this search engine.</summary>
		public override string ToString()
		{
			return "Google";
		}
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Engineer
Germany Germany
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions