- searchengineparser.zip
- SearchEngineParsers
- bin
- Slaks.Web.Parser.AllTheWebParser.dll
- Slaks.Web.Parser.AltVistaParser.dll
- Slaks.Web.Parser.AskParser.dll
- Slaks.Web.Parser.HtmlParser.dll
- Slaks.Web.Parser.LycosParser.dll
- Slaks.Web.Parser.MsnParser.dll
- Slaks.Web.Parser.NetscapeParser.dll
- Slaks.Web.Parser.YahooParser.dll
- GoogleParser
- SearchEngineParser
- SearchEngineParsers.resharperoptions
- SearchEngineParsers.sln
- TestSearchEngines
|
using System;
using System.Collections;
using System.Text.RegularExpressions;
namespace Slaks.Web.Parser
{
/// <summary>
/// <see cref="SearchEngineParser"/> implementation that fetches a Google result page
/// over HTTP, collects the result links found in it and extracts the reported
/// total number of hits.
/// </summary>
public class GoogleParser : SearchEngineParser
{
    /// <summary>
    /// Regular expression that captures, as the named group "total", the shortest
    /// text found between the words "about" and "for".
    /// </summary>
    private static readonly string TotalSearchResultPattern = @".*?about\s+(?<total>.*?)\s+for.*";

    /// <summary>
    /// Literal that must occur in a text before the (more expensive) regular
    /// expression is attempted on it.
    /// </summary>
    private static readonly string SearchResultTermPattern = "of about";

    /// <summary>Full URL of the most recently built query.</summary>
    private string m_queryPathString;

    /// <summary>Fixed leading part of the query URL; the query text is appended to it.</summary>
    private readonly string m_startQuerySearchPattern = "http://www.google.com/search?hl=en&hs=23e&lr=&rls=en&ie=utf-8&oe=utf-8&sa=N&q=";

    /// <summary>Query-string fragment that precedes the index of the first result to return.</summary>
    private readonly string m_startSearchPattern = "&start=";

    /// <summary>
    /// Creates a parser for <paramref name="query"/>, pointing it at the first
    /// result page (start index 0) and loading "google.config".
    /// </summary>
    /// <param name="query">Search text; passed on to the base class.</param>
    public GoogleParser(string query) : base(query)
    {
        m_queryPathString = BuildGoogleQueryPath("0");
        m_fileName = m_queryPathString;
        m_encoding = System.Text.Encoding.UTF8;
        m_source = ReadSource.Web;
        m_baseUri = new Uri(m_fileName);
        m_addressLinkLocation = new Uri("http://www.google.com");
        base.LoadConfiguration("google.config");
    }

    /// <summary>
    /// Walks <paramref name="structure"/> recursively and appends to
    /// <paramref name="linkCollection"/> every anchor found inside a "PARAGRAPH"
    /// node that is not filtered out.
    /// </summary>
    /// <param name="structure">Node to inspect; <c>null</c> is ignored.</param>
    /// <param name="linkCollection">Collection that receives the accepted links.</param>
    protected override void GetLinks(HtmlStructure structure, AddressLinkCollection linkCollection)
    {
        if (structure == null)
        {
            return;
        }

        if (structure.TagName == "PARAGRAPH")
        {
            IList anchors = structure.Anchors;
            if (anchors != null && anchors.Count != 0)
            {
                foreach (HtmlAnchor anchor in anchors)
                {
                    // Anchors whose caption contains one of these markers are skipped
                    // (presumably Google's own per-result utility links - confirm).
                    if (anchor.Text.IndexOf("cached") >= 0) continue;
                    if (anchor.Text.IndexOf("similar") >= 0) continue;
                    if (anchor.Text.IndexOf("view as") >= 0) continue;

                    // Any target that mentions "google" is skipped as well.
                    string href = anchor.Href.ToString();
                    if (href.IndexOf("google") >= 0) continue;

                    linkCollection.Add(new AddressLink(href, anchor.Text));
                }
            }
        }

        // Anchors is null-checked above, so the child list is treated the same way:
        // a node without children must not abort the whole traversal.
        IList children = structure.Structure;
        if (children == null)
        {
            return;
        }

        foreach (HtmlStructure child in children)
        {
            GetLinks(child, linkCollection);
        }
    }

    /// <summary>
    /// Searches <paramref name="structure"/> depth-first for a "TABLE" node whose
    /// text yields a total hit count, stopping at the first one found.
    /// </summary>
    /// <param name="structure">Node to inspect; <c>null</c> yields -1.</param>
    /// <returns>
    /// The total found (also stored in <c>m_totalSearchResults</c>, with
    /// <c>m_isTotaSearchResultsFound</c> set to <c>true</c>), or -1 when none was found.
    /// </returns>
    protected override int GetTotalSearchResults(HtmlStructure structure)
    {
        if (structure == null)
        {
            return -1;
        }

        int totalSearchResults = -1;
        if (structure.TagName == "TABLE"
            && FindTotalSearchResults(structure.TextArray, out totalSearchResults))
        {
            m_totalSearchResults = totalSearchResults;
            m_isTotaSearchResultsFound = true;
            return totalSearchResults;
        }

        // A node without a child list simply has nothing further to search.
        IList children = structure.Structure;
        if (children == null)
        {
            return totalSearchResults;
        }

        foreach (HtmlStructure child in children)
        {
            totalSearchResults = GetTotalSearchResults(child);
            if (totalSearchResults >= 0)
            {
                break;
            }
        }

        return totalSearchResults;
    }

    /// <summary>
    /// Tries to read the total number of hits out of <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text to scan; <c>null</c> is treated as "not found".</param>
    /// <param name="total">Receives the parsed total, or -1 when the method returns <c>false</c>.</param>
    /// <returns>
    /// <c>true</c> when the text contains "of about", matches the total pattern and the
    /// captured value (with commas removed) parses as an <see cref="int"/>; otherwise <c>false</c>.
    /// </returns>
    protected override bool FindTotalSearchResults(string text, out int total)
    {
        total = -1;
        if (text == null || text.IndexOf(SearchResultTermPattern) < 0)
        {
            return false;
        }

        Match m = Regex.Match(text, TotalSearchResultPattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);
        if (!m.Success)
        {
            return false;
        }

        // Thousands separators are stripped before parsing, e.g. "1,230,000" -> "1230000".
        string totalString = m.Groups["total"].Value.Replace(",", "");
        try
        {
            // The value is machine-read page content, so it is parsed culture-invariantly.
            total = int.Parse(
                totalString,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture);
        }
        catch (FormatException)
        {
            // Captured text is not a number.
            return false;
        }
        catch (OverflowException)
        {
            // Captured number does not fit into an int.
            return false;
        }

        return true;
    }

    /// <summary>
    /// Fetches and parses the result page that starts at the number of links
    /// retrieved so far (<c>m_totalLinksRetrieved</c>).
    /// </summary>
    /// <returns>The value returned by <c>ParseMe()</c>.</returns>
    public override bool Search()
    {
        return SearchFromOffset(m_totalLinksRetrieved.ToString());
    }

    /// <summary>
    /// Fetches and parses the result page that starts at <paramref name="nextIndex"/>.
    /// </summary>
    /// <param name="nextIndex">Value sent as the "start" query parameter.</param>
    /// <returns>The value returned by <c>ParseMe()</c>.</returns>
    public override bool Search(int nextIndex)
    {
        return SearchFromOffset(nextIndex.ToString());
    }

    /// <summary>Returns the display name of this parser, "Google".</summary>
    public override string ToString()
    {
        return "Google";
    }

    /// <summary>Builds the query URL for the given start offset.</summary>
    /// <param name="startOffset">Text appended after "&amp;start=".</param>
    private string BuildGoogleQueryPath(string startOffset)
    {
        // NOTE(review): m_query is appended verbatim; presumably it is already
        // URL-encoded by the base class or the caller - confirm.
        return m_startQuerySearchPattern + m_query + m_startSearchPattern + startOffset;
    }

    /// <summary>
    /// Shared implementation of both <c>Search</c> overloads: points the parser at the
    /// page for <paramref name="startOffset"/>, parses it and, on success, replaces
    /// the link collection and updates the retrieved-link counters.
    /// </summary>
    /// <param name="startOffset">Text appended after "&amp;start=".</param>
    /// <returns>The value returned by <c>ParseMe()</c>.</returns>
    private bool SearchFromOffset(string startOffset)
    {
        m_queryPathString = BuildGoogleQueryPath(startOffset);
        m_fileName = m_queryPathString;
        m_baseUri = new Uri(m_fileName);

        bool isParsed = this.ParseMe();
        if (isParsed)
        {
            m_addressLinkCollection = new AddressLinkCollection();
            this.GetLinks(this.RootStructure, m_addressLinkCollection);
            m_numberOfLinksRetrieved = m_addressLinkCollection.Count;
            m_totalLinksRetrieved += m_numberOfLinksRetrieved;
        }

        return isParsed;
    }
}
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.