|
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Announcements
Want a new Job?
Chapters
Services
Feature Zones
|
Introduction
This code parses hyperlinks from an HTML document and follows them recursively, like a Web spider. The sample is embedded in a Windows form containing a list view that shows the parsed hyperlinks. It doesn't parse only hyperlinks (<a> tags): it also picks up the targets of <frame>, <iframe> and <area> tags and honors <base href>, all by means of regular expressions. This is NOT an introduction to regular expressions!
Background
I needed this code to create a "Google Sitemap" for my own private homepage.
Using the Code
The code supports a maximum depth parameter so that you will not follow too many hyperlinks. It also follows only URLs which are "below" the URL you have specified for the start. Here's the nearly complete code of this sample, including a lot of comments explaining what it does.
protected Regex m_rxBaseHref;
// Matches <a ... href="...">text</a>; group 2 is the URL, group 3 the link text.
protected Regex m_rxHref;
// Matches <frame ... src="...">; group 2 is the URL.
protected Regex m_rxFrame;
// Matches <iframe ... src="...">; group 2 is the URL.
protected Regex m_rxIframe;
// Matches <area ... href="...">; group 2 is the URL.
protected Regex m_rxArea;
// URLs that have already been added to the list view during the current run.
protected List<string> m_strListUrlsAdded;
// URLs that have already been requested (followed) during the current run.
protected List<string> m_strListUrlsFollowed;
/// <summary>
/// Initializes the form and compiles the regular expressions used to
/// extract link targets from downloaded HTML.
/// </summary>
public Form1()
{
    InitializeComponent();

    // Singleline lets "." span line breaks (tags and link texts may wrap),
    // IgnoreCase accepts upper-case markup, Compiled trades start-up time
    // for faster matching because the expressions are reused for every page.
    RegexOptions rxOpt = RegexOptions.Singleline |
                         RegexOptions.Compiled |
                         RegexOptions.IgnoreCase;

    // In every expression group 1 is the opening quote character (matched
    // again by the \1 back-reference) and group 2 is the URL.
    //
    // The \b after each tag name is essential: without it "<a[^>]*href="
    // also matches "<area ... href=...>" and then runs on to the next
    // "</a>", swallowing every real anchor in between.
    m_rxHref = new Regex("<a\\b[^>]*href=(\"|')(.*?)\\1[^>]*>(.*?)</a>", rxOpt);
    m_rxFrame = new Regex("<frame\\b[^>]*src=(\"|')(.*?)\\1[^>]*>", rxOpt);
    m_rxIframe = new Regex("<iframe\\b[^>]*src=(\"|')(.*?)\\1[^>]*>", rxOpt);
    m_rxArea = new Regex("<area\\b[^>]*href=(\"|')(.*?)\\1[^>]*>", rxOpt);
    m_rxBaseHref = new Regex("<base\\b[^>]* href=(\"|')(.*?)\\1[^>]*>", rxOpt);
}
/// <summary>
/// Click handler of the "read" button: clears the previous results and
/// starts crawling at the URL entered in the text box.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btnReadFromUrl_Click(object sender, EventArgs e)
{
    Cursor = Cursors.WaitCursor;
    try
    {
        //Clear existing URLs
        lvUrls.Items.Clear();
        m_strListUrlsAdded = new List<string>();
        m_strListUrlsFollowed = new List<string>();

        // The entered URL is both the first page to fetch and the prefix
        // every followed URL has to start with.
        ReadUrls(tbUrl.Text, tbUrl.Text, ref m_strListUrlsAdded,
            ref m_strListUrlsFollowed, (int)numMaxDepth.Value, 0);
    }
    finally
    {
        // Restore the cursor even when the crawl throws, otherwise the
        // form would be stuck showing the wait cursor.
        Cursor = Cursors.Default;
    }
}
/// <summary>
/// Downloads <paramref name="strURL"/>, extracts the targets of all
/// &lt;a&gt;, &lt;frame&gt;, &lt;iframe&gt; and &lt;area&gt; tags, adds every
/// URL that lies below <paramref name="strStartBase"/> to the list view and
/// follows it recursively.
/// </summary>
/// <param name="strURL">The URL of the document to download.</param>
/// <param name="strStartBase">Prefix a URL must start with to be listed and followed.</param>
/// <param name="strUrlsAdded">URLs already shown in the list view; extended by this call.</param>
/// <param name="strUrlsFollowed">URLs already followed; extended by this call.</param>
/// <param name="iMaximumDepth">
/// Depth at which recursion stops. The counter is incremented on entry and
/// compared for equality, so 1 fetches nothing, 2 fetches only the start
/// page, and a value below 1 never matches (no depth limit).
/// </param>
/// <param name="iCurrentDepth">Depth of the caller; pass 0 for the start URL.</param>
protected void ReadUrls(string strURL, string strStartBase,
    ref List<string> strUrlsAdded,
    ref List<string> strUrlsFollowed,
    int iMaximumDepth,
    int iCurrentDepth)
{
    //Increase the depth. If we reach the maximum depth: return
    if (++iCurrentDepth == iMaximumDepth)
    {
        return;
    }

    //Now we create the WebRequest and get the response. If something fails
    //we return
    HttpWebRequest req = null;
    try
    {
        req = HttpWebRequest.Create(strURL) as HttpWebRequest;
    }
    catch (Exception) { }
    if (req == null)
    {
        return;
    }
    req.Method = "GET";

    HttpWebResponse res = null;
    try
    {
        res = req.GetResponse() as HttpWebResponse;
    }
    catch (Exception) { }
    if (res == null)
    {
        return;
    }

    //Read the whole content of the response stream into a string.
    //The response, the stream and the reader are released on every path,
    //including the non-OK status and a failed read.
    string strHTML;
    try
    {
        if (res.StatusCode != HttpStatusCode.OK)
        {
            return;
        }
        using (Stream s = res.GetResponseStream())
        using (StreamReader sr = new StreamReader(s))
        {
            strHTML = sr.ReadToEnd();
        }
    }
    catch (IOException)
    {
        //A broken connection while reading is treated like any other
        //failed request: skip this document.
        return;
    }
    catch (WebException)
    {
        return;
    }
    finally
    {
        res.Close();
    }

    int iPos, iPos2;
    /*After getting a response the Address property of the
    Web request contains the real URL from the served document.
    This is automatically done by reading the HTTP header
    "Content-Location"
    For example, you request the URL "http://www.test.abc/test
    But "test" is just a directory, so the server returns
    the index document of this directory. In this case the
    address property is "http://www.test.abc/test/index.htm"
    */
    //We need the base to follow relative URLs
    string strBase = req.Address.AbsoluteUri;
    //If the base contains a query string, we remove that string
    //because we don't need it.
    iPos = strBase.IndexOf('?');
    if (iPos > -1)
    {
        strBase = strBase.Substring(0, iPos);
    }
    //Assure that the base ends with a slash
    if (strBase[strBase.Length - 1] != '/')
    {
        iPos = strBase.LastIndexOf('/');
        if (iPos < 0)
        {
            return;
        }
        strBase = strBase.Substring(0, iPos + 1);
    }
    //URLs are compared ordinally; culture-sensitive matching has no
    //meaning for them.
    iPos = strBase.IndexOf("://", StringComparison.Ordinal);
    if (iPos < 0)
    {
        return;
    }
    iPos = strBase.IndexOf('/', iPos + 3);
    if (iPos < 0)
    {
        return;
    }
    //We need the base host URL for hyperlinks that start with a slash
    string strBaseHostUrl = strBase.Substring(0, iPos + 1);

    //Test if the HTML contains a base href
    Match matchBaseHref = m_rxBaseHref.Match(strHTML);
    if (matchBaseHref.Success)
    {
        string strHtmlBase = matchBaseHref.Groups[2].Value.Trim();
        if (strHtmlBase.StartsWith("/", StringComparison.Ordinal))
        {
            strBase = strBaseHostUrl + strHtmlBase.Substring(1);
        }
        else
        {
            strBase = strHtmlBase;
        }
    }

    //This dictionary contains all hyperlinks and their
    //associated "texts" (anything between <a> and </a>)
    Dictionary<string, string> dictHrefs = new Dictionary<string, string>();
    MatchCollection matchesHref = m_rxHref.Matches(strHTML);
    AddHrefMatches(matchesHref, ref dictHrefs);
    MatchCollection matchesFrame = m_rxFrame.Matches(strHTML);
    AddHrefMatches(matchesFrame, ref dictHrefs);
    MatchCollection matchesIframe = m_rxIframe.Matches(strHTML);
    AddHrefMatches(matchesIframe, ref dictHrefs);
    MatchCollection matchesArea = m_rxArea.Matches(strHTML);
    AddHrefMatches(matchesArea, ref dictHrefs);

    //Now we iterate through all Hyperlinks we found
    foreach (string strUrlFound in dictHrefs.Keys)
    {
        string strUrlNew = strUrlFound;
        //Skip this links if it starts with ftp://, news://, mailto:,
        //JavaScript:
        if (IsAbsoluteUrl(strUrlNew) && !IsHttpUrl(strUrlNew))
        {
            continue;
        }
        //if this isn't an absolute URL
        if (!IsHttpUrl(strUrlNew))
        {
            if (strUrlNew.StartsWith("/", StringComparison.Ordinal))
            {
                strUrlNew = strBaseHostUrl + strUrlNew.Substring(1);
            }
            else
            {
                strUrlNew = strBase + strUrlNew;
            }
        }

        //Now we remove all parent paths: each "../" is dropped together
        //with the path segment in front of it.
        bool bParentPathsResolved = true;
        while ((iPos = strUrlNew.IndexOf("../", StringComparison.Ordinal)) > -1)
        {
            //Slash that ends the segment to drop ...
            iPos2 = strUrlNew.Substring(0, iPos).LastIndexOf('/');
            if (iPos2 > -1)
            {
                //... and the slash that starts it
                iPos2 = strUrlNew.Substring(0, iPos2).LastIndexOf('/');
            }
            if (iPos2 < 0)
            {
                //There is no segment left to climb out of (for example
                //when a <base href> without slashes was used). The URL
                //cannot be resolved; Substring would throw on -1.
                bParentPathsResolved = false;
                break;
            }
            strUrlNew = strUrlNew.Substring(0, iPos2) +
                "/" + strUrlNew.Substring(iPos + 3);
        }
        if (!bParentPathsResolved)
        {
            continue;
        }

        //if the URL doesn't start with our starting base
        //(the address we entered into our textbox) then
        //skip this
        if (!strUrlNew.StartsWith(strStartBase, StringComparison.Ordinal))
        {
            continue;
        }
        //If we haven't added the URL yet to our listview do it now
        if (!strUrlsAdded.Contains(strUrlNew))
        {
            ListViewItem lvi = new ListViewItem(new string[]{
                strUrlNew,
                dictHrefs[strUrlFound]
            });
            lvUrls.Items.Add(lvi);
            strUrlsAdded.Add(strUrlNew);
        }
        //Follow this URL if not already done
        if (!strUrlsFollowed.Contains(strUrlNew))
        {
            strUrlsFollowed.Add(strUrlNew);
            ReadUrls(strUrlNew, strStartBase,
                ref strUrlsAdded, ref strUrlsFollowed,
                iMaximumDepth, iCurrentDepth);
        }
    }
}
Points of Interest
It was a little tricky to work out how to handle parent links ("../"), base hrefs, Content-Location headers and so on, but now I know a little more about them.
History
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||