Link Scanner






4.61/5 (10 votes)
Gets all links that a page contains.
Introduction
Developers often have to write applications that need to parse something. This is a small example of how to parse a web page and get all the links that it contains. Such examples are really good for beginner developers, and I think that it will give an idea of how to create a nice parser. This example was created for a concrete problem, so it is not that abstract. The path of the web page must be a URL.
Using the Code
Scanner.cs contains all of the logic:
/// <summary>
/// Downloads a web page and collects the targets of the anchor tags it contains.
/// </summary>
public class Scanner
{
    // Regular expression patterns. They are only ever read, hence readonly.

    // Matches an absolute http/https URL anywhere inside the tested string.
    private static readonly string urlPattern = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";

    // Matches the start of an anchor tag and everything up to (not including) the next '>'.
    private static readonly string tagPattern = @"<a\b[^>]*(.*?)";

    // Matches an e-mail address anywhere inside the tested string.
    private static readonly string emailPattern = @"\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";

    /// <summary>
    /// Gets all links that the page at <paramref name="url"/> contains.
    /// </summary>
    /// <param name="url">Address of the page to scan, e.g. "http://www.codeproject.com".</param>
    /// <returns>
    /// The href values found on the page, in the order they were matched. Values that
    /// contain an absolute http(s) URL or an e-mail address are returned unchanged;
    /// every other value is passed through <c>getAbsoluteUrl</c> first.
    /// </returns>
    /// <remarks>
    /// Nothing is caught here: any exception raised while creating the request or
    /// reading the response propagates to the caller.
    /// </remarks>
    public static List<string> getInnerUrls(string url) {
        var innerUrls = new List<string>();

        // Create the WebRequest for the url.
        WebRequest request = WebRequest.Create(url);

        // Read the whole page. The response and the reader hold the network
        // connection, so both are disposed as soon as the text has been read.
        string htmlCode;
        using (WebResponse response = request.GetResponse())
        using (var reader = new StreamReader(response.GetResponseStream())) {
            htmlCode = reader.ReadToEnd();
        }

        // Resolved on first use only: it is the same for every link, and pages
        // without relative links never need it.
        string domainName = null;

        foreach (string link in getMatches(htmlCode)) {
            // A link that already carries a full URL or an e-mail address is kept as is.
            bool isAbsolute = Regex.IsMatch(link, urlPattern) || Regex.IsMatch(link, emailPattern);
            if (isAbsolute) {
                innerUrls.Add(link);
                continue;
            }

            // Otherwise the link refers to the same site: form an absolute url for it.
            if (domainName == null) {
                domainName = getDomainName(url);
            }
            innerUrls.Add(getAbsoluteUrl(domainName, link));
        }

        return innerUrls;
    }

    /// <summary>
    /// Gets the href values of all anchor tags found in <paramref name="source"/>.
    /// </summary>
    /// <param name="source">HTML text to search.</param>
    /// <returns>
    /// One entry per matched anchor tag that contains the exact text <c>href="</c>
    /// (double-quoted, case-sensitive); tags without it are skipped.
    /// </returns>
    private static List<string> getMatches(string source) {
        var matchesList = new List<string>();

        // Get the collection of anchor tags that match the tag pattern.
        MatchCollection matches = Regex.Matches(source, tagPattern);

        // Add the text under the href attribute to the list.
        foreach (Match match in matches) {
            string val = match.Value.Trim();
            if (val.Contains("href=\"")) {
                string link = getSubstring(val, "href=\"", "\"");
                matchesList.Add(link);
            }
        }

        return matchesList;
    }

    /// <summary>
    /// Returns a substring of <paramref name="source"/> selected by the two markers.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the body is not shown here. Judging by the call in
    /// <c>getMatches</c>, it presumably returns the text between
    /// <paramref name="start"/> and the following <paramref name="end"/> - confirm
    /// against the full source.
    /// </remarks>
    private static string getSubstring(string source, string start, string end) {
        // return the substring
    }

    /// <summary>
    /// Creates an absolute url for a resource that belongs to the scanned site.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the body is not shown here; how <paramref name="domainName"/>
    /// and <paramref name="path"/> are combined must be confirmed against the full source.
    /// </remarks>
    private static string getAbsoluteUrl(string domainName, string path) {
        // forms and returns an absolute url for the source that is referred to the site
    }

    /// <summary>
    /// Returns the location under which the page at <paramref name="url"/> is stored.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the body is not shown here; whether the result is only the host
    /// or also includes the page's directory must be confirmed against the full source.
    /// </remarks>
    private static string getDomainName(string url) {
        // return the url path where the page is stored
    }
}