
Introduction
Developers often have to write applications that parse something. This is a small example of how to parse a web page and get all the links that it contains. Such examples are really good for beginner developers, and I think that it will give an idea of how to create a nice parser. This example was created for a concrete problem, so it is not that abstract. The path of the web page must be a URL.
Using the Code
Scanner.cs contains all of the logic:
/// <summary>
/// Collects the <c>href</c> targets of the anchor (<c>&lt;a&gt;</c>) tags found in a web page.
/// Links that are neither absolute http(s) URLs nor e-mail addresses are resolved
/// against the site the page was loaded from.
/// </summary>
public class Scanner
{
    // Matches an absolute http/https URL anywhere inside the tested string (the match is not anchored).
    private static readonly string urlPattern = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    // Matches the opening part of an anchor tag, up to but not including its closing '>'.
    private static readonly string tagPattern = @"<a\b[^>]*(.*?)";
    // Matches an e-mail address anywhere inside the tested string (e.g. inside "mailto:" links).
    private static readonly string emailPattern = @"\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the targets of its anchor tags.
    /// </summary>
    /// <param name="url">Absolute URL of the page to scan.</param>
    /// <returns>
    /// One entry per anchor tag that has a double-quoted <c>href</c> attribute, in document order.
    /// </returns>
    public static List<string> getInnerUrls(string url) {
        WebRequest request = WebRequest.Create(url);
        string htmlCode;
        // Dispose the response and the reader (which also closes the response stream)
        // so that the underlying connection is released even if reading fails.
        using (WebResponse response = request.GetResponse())
        using (var reader = new StreamReader(response.GetResponseStream())) {
            htmlCode = reader.ReadToEnd();
        }
        return getInnerUrls(url, htmlCode);
    }

    /// <summary>
    /// Returns the targets of the anchor tags found in already-downloaded HTML.
    /// </summary>
    /// <param name="url">Absolute URL the HTML was loaded from; used to resolve relative links.</param>
    /// <param name="htmlCode">HTML source of the page.</param>
    /// <returns>
    /// One entry per anchor tag that has a double-quoted <c>href</c> attribute, in document order.
    /// Links containing an http(s) URL or an e-mail address are returned unchanged; every other
    /// link is resolved against the scheme and host of <paramref name="url"/>.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="url"/> or <paramref name="htmlCode"/> is null.
    /// </exception>
    /// <exception cref="System.UriFormatException">
    /// A relative link was found and <paramref name="url"/> is not a valid absolute URL.
    /// </exception>
    public static List<string> getInnerUrls(string url, string htmlCode) {
        if (url == null) {
            throw new System.ArgumentNullException("url");
        }
        if (htmlCode == null) {
            throw new System.ArgumentNullException("htmlCode");
        }

        var innerUrls = new List<string>();
        // The domain is the same for every link, so compute it once, and only
        // when the first relative link is actually encountered.
        string domainName = null;
        foreach (string link in getMatches(htmlCode)) {
            bool isAbsolute = Regex.IsMatch(link, urlPattern) || Regex.IsMatch(link, emailPattern);
            if (isAbsolute) {
                innerUrls.Add(link);
                continue;
            }
            if (domainName == null) {
                domainName = getDomainName(url);
            }
            innerUrls.Add(getAblosuteUrl(domainName, link));
        }
        return innerUrls;
    }

    /// <summary>
    /// Extracts the value of the double-quoted <c>href</c> attribute of every anchor tag in
    /// <paramref name="source"/>. Anchor tags without <c>href="</c> are skipped.
    /// </summary>
    private static List<string> getMatches(string source) {
        var matchesList = new List<string>();
        foreach (Match match in Regex.Matches(source, tagPattern)) {
            string tag = match.Value.Trim();
            if (!tag.Contains("href=\"")) {
                continue;
            }
            matchesList.Add(getSubstring(tag, "href=\"", "\""));
        }
        return matchesList;
    }

    /// <summary>
    /// Returns the text of <paramref name="source"/> located between the first occurrence of
    /// <paramref name="start"/> and the next occurrence of <paramref name="end"/> after it.
    /// Returns an empty string when <paramref name="start"/> is absent, and the remainder of
    /// the string when <paramref name="end"/> is absent.
    /// </summary>
    // NOTE(review): the body of this method was empty in the original; this implementation is
    // reconstructed from its single call site in getMatches - confirm against the full Scanner.cs.
    private static string getSubstring(string source, string start, string end) {
        int startIndex = source.IndexOf(start, System.StringComparison.Ordinal);
        if (startIndex < 0) {
            return string.Empty;
        }
        startIndex += start.Length;
        int endIndex = source.IndexOf(end, startIndex, System.StringComparison.Ordinal);
        if (endIndex < 0) {
            return source.Substring(startIndex);
        }
        return source.Substring(startIndex, endIndex - startIndex);
    }

    /// <summary>
    /// Resolves <paramref name="path"/> against <paramref name="domainName"/> and returns the
    /// resulting absolute URL.
    /// </summary>
    // NOTE(review): the body of this method was empty in the original; this implementation is
    // reconstructed from its call site in getInnerUrls - confirm against the full Scanner.cs.
    private static string getAblosuteUrl(string domainName, string path) {
        System.Uri baseUri;
        System.Uri combined;
        if (System.Uri.TryCreate(domainName, System.UriKind.Absolute, out baseUri)
            && System.Uri.TryCreate(baseUri, path, out combined)) {
            return combined.AbsoluteUri;
        }
        // Fall back to plain concatenation with exactly one separating slash.
        return domainName.TrimEnd('/') + "/" + path.TrimStart('/');
    }

    /// <summary>
    /// Returns the scheme and authority part of <paramref name="url"/>,
    /// e.g. <c>http://example.com</c> for <c>http://example.com/a/b.html</c>.
    /// </summary>
    // NOTE(review): the body of this method was empty in the original; this implementation is
    // reconstructed from its call site in getInnerUrls - confirm against the full Scanner.cs.
    private static string getDomainName(string url) {
        return new System.Uri(url).GetLeftPart(System.UriPartial.Authority);
    }
}