|
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
namespace CPVanityLite
{
/// <summary>
/// Downloads CodeProject member listing pages and raises an event for every
/// distinct member found, carrying the member id, display name and reputation score.
/// </summary>
class RepScoreScraper
{
    /// <summary>Highest page number (<c>pgnum</c>) requested for each <c>ml_ob</c> value; pages start at 1.</summary>
    private const int LastPageNumber = 5;

    /// <summary>
    /// Values substituted for the <c>ml_ob</c> query parameter.
    /// NOTE(review): presumably the column the member list is ordered by - confirm against the site.
    /// </summary>
    private static readonly string[] MlObValues = new[] { "ArticleCount", "MessageCount" };

    // Each pattern has exactly one capture group holding the value of interest.
    private static readonly Regex memberNumberRegex = new Regex("Member No. (\\d*)");
    private static readonly Regex repScoreRegex = new Regex("<span id=\"ctl00_.*?_TotalRep\" class=\"large-text\">([\\s\\S]*?)</span>");
    private static readonly Regex displayNameRegex = new Regex("<h2 id=\"ctl00_.*?_P_Name\">([\\s\\S]*?)</h2>");

    /// <summary>Value assigned to <see cref="WebRequest.Timeout"/> for every page request (milliseconds).</summary>
    private readonly int timeOut = 30 * 1000;

    /// <summary>Creates a scraper that uses the default request timeout of 30 * 1000.</summary>
    public RepScoreScraper()
    {
    }

    /// <summary>Creates a scraper that uses the supplied request timeout.</summary>
    /// <param name="timeOut">Value assigned to <see cref="WebRequest.Timeout"/> for every page request.</param>
    public RepScoreScraper(int timeOut)
    {
        // The field must receive the argument; the assignment used to be reversed,
        // which silently discarded the caller's value and kept the default.
        this.timeOut = timeOut;
    }

    /// <summary>Raised once for each distinct member id encountered during <see cref="StartScraping"/>.</summary>
    public event EventHandler<RepScoreScraperEventArgs> MemberInfoScraped;

    /// <summary>Raised after <see cref="StartScraping"/> has processed every page.</summary>
    public event EventHandler ScrapeFinished;

    /// <summary>Issues a blocking request for <paramref name="url"/> and returns the response body as text.</summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <param name="timeout">Value assigned to the request's <see cref="WebRequest.Timeout"/>.</param>
    /// <returns>The entire response body.</returns>
    private string GetHttpPage(string url, int timeout)
    {
        var request = WebRequest.Create(new Uri(url, UriKind.Absolute));
        request.Timeout = timeout;

        using (var response = request.GetResponse())
        using (var responseStream = response.GetResponseStream())
        using (var reader = new StreamReader(responseStream))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Fetches pages 1 to 5 of the member listing for each <c>ml_ob</c> value, raises
    /// <see cref="MemberInfoScraped"/> for every member id not reported earlier in this
    /// call, and finally raises <see cref="ScrapeFinished"/>.
    /// </summary>
    /// <remarks>
    /// Runs synchronously on the calling thread. Nothing here catches exceptions, so a
    /// failed request propagates to the caller and <see cref="ScrapeFinished"/> is not raised.
    /// </remarks>
    public void StartScraping()
    {
        // Shared across all listings so a member appearing in several is reported once.
        HashSet<int> seenIds = new HashSet<int>();

        foreach (string mlOb in MlObValues)
        {
            for (int page = 1; page <= LastPageNumber; page++)
            {
                string url = String.Format("http://www.codeproject.com/script/Membership/Profiles.aspx?ml_ob={0}&mgtid=-1&mgm=False&pgnum={1}", mlOb, page);
                ScrapePage(url, seenIds);
            }
        }

        OnScrapeFinished();
    }

    /// <summary>Downloads one listing page and reports every parsable member whose id is not yet in <paramref name="seenIds"/>.</summary>
    /// <param name="url">Absolute URL of the listing page.</param>
    /// <param name="seenIds">Ids already reported; newly reported ids are added to it.</param>
    private void ScrapePage(string url, HashSet<int> seenIds)
    {
        string html = GetHttpPage(url, timeOut);

        MatchCollection memberNumberMatches = memberNumberRegex.Matches(html);
        MatchCollection repScoreMatches = repScoreRegex.Matches(html);
        MatchCollection displayNameMatches = displayNameRegex.Matches(html);

        // The three patterns are matched independently and paired up by index, which is
        // only meaningful when each produced the same number of hits; otherwise skip the page.
        if (memberNumberMatches.Count != repScoreMatches.Count ||
            memberNumberMatches.Count != displayNameMatches.Count)
        {
            return;
        }

        for (int i = 0; i < memberNumberMatches.Count; i++)
        {
            int id;
            double score;

            // The page text is machine data, so parse with the invariant culture: under the
            // thread's current culture a grouped score such as "1,234" can be misread.
            // The NumberStyles are the defaults of the two-argument TryParse overloads.
            bool parsed =
                Int32.TryParse(
                    memberNumberMatches[i].Groups[1].Value,
                    System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out id) &&
                Double.TryParse(
                    repScoreMatches[i].Groups[1].Value,
                    System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out score);

            if (!parsed)
            {
                continue;
            }

            // Add returns false when the id is already present, so one call both tests and records it.
            if (seenIds.Add(id))
            {
                OnMemberInfoScraped(id, displayNameMatches[i].Groups[1].Value.Trim(), (int)score);
            }
        }
    }

    /// <summary>Raises <see cref="MemberInfoScraped"/> if it has subscribers.</summary>
    private void OnMemberInfoScraped(int id, string displayName, int reputationScore)
    {
        // Snapshot the delegate so the null check and the invocation see the same value.
        var handler = MemberInfoScraped;
        if (handler != null)
        {
            handler(this, new RepScoreScraperEventArgs() { Id = id, DisplayName = displayName, ReputationScore = reputationScore });
        }
    }

    /// <summary>Raises <see cref="ScrapeFinished"/> if it has subscribers.</summary>
    private void OnScrapeFinished()
    {
        var handler = ScrapeFinished;
        if (handler != null)
        {
            handler(this, EventArgs.Empty);
        }
    }
}
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.
Nish Nishant is a technology enthusiast from Columbus, Ohio. He has over 20 years of software industry experience in various roles including Chief Technology Officer, Senior Solution Architect, Lead Software Architect, Principal Software Engineer, and Engineering/Architecture Team Leader. Nish is a 14-time recipient of the Microsoft Visual C++ MVP Award.
Nish authored C++/CLI in Action for Manning Publications in 2005, and co-authored Extending MFC Applications with the .NET Framework for Addison Wesley in 2003. In addition, he has over 140 published technology articles on CodeProject.com and another 250+ blog articles on his WordPress blog. Nish is experienced in technology leadership, solution architecture, software architecture, cloud development (AWS and Azure), REST services, software engineering best practices, CI/CD, mentoring, and directing all stages of software development.
Nish's Technology Blog:
voidnish.wordpress.com