using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace WindowsFormsApplication9
{
/// <summary>
/// A caller-supplied regular expression together with the name of the column
/// its matches should be stored under and the number of matches to keep.
/// </summary>
class AdditionalRegexExpression
{
    /// <summary>Regular-expression pattern text.</summary>
    public string RgExp { get; set; }

    /// <summary>Name of the column the matched text is written to.</summary>
    public string RgColumnTarget { get; set; }

    /// <summary>How many matches to keep.</summary>
    public int NumOfResults { get; set; }

    /// <summary>Creates an expression descriptor.</summary>
    /// <param name="Name">Target column name; stored in <see cref="RgColumnTarget"/>.</param>
    /// <param name="Regex">Pattern text; stored in <see cref="RgExp"/>.</param>
    /// <param name="NumberOfResults">Match limit; stored in <see cref="NumOfResults"/>.</param>
    public AdditionalRegexExpression(string Name, string Regex, int NumberOfResults)
    {
        RgExp = Regex;
        RgColumnTarget = Name;
        NumOfResults = NumberOfResults;
    }
}
/// <summary>
/// Central collection of the regular-expression pattern strings used to pull
/// contact details out of free text.
/// </summary>
class RegexExpressionCollection
{
    /// <summary>
    /// "Fax:" label, one whitespace character, optional '+' signs, then a number that
    /// starts with a digit and continues with digits or dashes.
    /// </summary>
    public static string Fax
    {
        // The previous class "[\d+[-]*" inside "(...)+" also accepted a bare "Fax: " with no digits.
        get { return @"Fax:\s[+]*\d[\d-]*"; }
    }

    /// <summary>E-mail address pattern.</summary>
    public static string Email
    {
        get { return @"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*"; }
    }

    /// <summary>
    /// Phone label (Tel, phone, telephone or mobile) followed by ':', one whitespace
    /// character, optional '+' signs and a number made of digits and dashes.
    /// </summary>
    public static string Phone
    {
        // Same number shape as Fax: at least one digit is required after the label.
        get { return @"(Tel|phone|telephone|mobile):\s[+]*\d[\d-]*"; }
    }

    /// <summary>
    /// Name rules treated as "male" by the callers. A new array is returned on every
    /// access, so callers that loop should read the property once.
    /// </summary>
    public static string[] Male
    {
        get
        {
            return new string[]
            {
                "^[^S].*r[rv]e?y?$",                        // Barry, Larry, Perry,...
                "^[^G].*v[ei]$",                            // Clive, Dave, Steve,...
                "^[^AJKLMNP][^o][^eit]*([glrsw]ey|lie)$",   // Dewey, Stanley, Wesley,...
                "^[CGJWZ][^o][^dnt]*y$",                    // Gregory, Jeremy, Zachary,...
                "^.*[Rlr][abo]y$",                          // Leroy, Murray, Roy,...
                "^.*[GRguw][ae]y?ne$",                      // Duane, Eugene, Rene,...
                "^[CLMQTV].*[^dl][in]c.*[ey]$",             // Lance, Quincy, Vince,...
                "^.*[ay][dl]e$",                            // Clyde, Kyle, Pascale,...
                "^[^o]*ke$",                                // Blake, Luke, Mike,...
                "^[^EL].*o(rg?|sh?)?(e|ua)$",               // George, Joshua, Theodore,...
                "^[^JPSWZ].*[denor]n.*y$",                  // Anthony, Henry, Rodney,...
                "^Br[aou][cd].*[ey]$",                      // Bradley, Brady, Bruce,...
                "^[ILW][aeg][^ir]*e$",                      // Ignace, Lee, Wallace,...
                "^[ABEIUY][euz]?[blr][aeiy]$",              // Ari, Bela, Ira,...
                "^[ART][^r]*[dhn]e?y$",                     // Randy, Timothy, Tony,...
                "^.*oi?[mn]e$",                             // Antoine, Jerome, Tyrone,...
                "^D.*[mnw].*[iy]$",                         // Danny, Demetri, Dondi,...
                "^[^BG](e[rst]|ha)[^il]*e$"                 // Pete, Serge, Shane,...
            };
        }
    }

    /// <summary>
    /// Name rules treated as "female" by the callers. A new array is returned on every
    /// access, so callers that loop should read the property once.
    /// </summary>
    public static string[] Female
    {
        get
        {
            return new string[]
            {
                "^[ADFGIM][^r]*([bg]e[lr]|il|wn)$",         // Angel, Gail, Isabel,...
                "^.*[aeiy]$",                               // Names ending in a, e, i or y.
                "^All?[iy]((ss?)|z)on$",                    // Allison (and variations)
                "^.*een$",                                  // Cathleen, Eileen, Maureen,...
                "^[^BD].*(b[iy]|y|via)nn?$",                // Carolyn, Gwendolyn, Vivian,...
                "^[AEHJL].*il.*$",                          // Abigail, Jill, Lillian,...
                "^.*[Jj](o|o?[ae]a?n.*)$",                  // Janet, Jennifer, Joan,...
                "^[FLM].*ur(.*[^eotuy])?$",                 // Fleur, Lauren, Muriel,...
                "^M[aei]r[^tv].*([^cklnos]|([^o]n))$",      // Margaret, Marylou, Miriam,...
                "^[CKS]h?(ar[^lst]|ry).+$",                 // Carol, Karen, Sharon,...
                "^[PR]e?a([^dfju]|qu)*[lm]$",               // Pam, Pearl, Rachel,...
                "^.*[Aa]nn.*$",                             // Annacarol, Leann, Ruthann,...
                "^.*[^cio]ag?h$",                           // Deborah, Leah, Sarah,...
                "^[^EK].*[grsz]h?an(ces)?$",                // Frances, Megan, Susan,...
                "^[^P]*([Hh]e|[Ee][lt])[^s]*[ey].*[^t]$",   // Ethel, Helen, Gretchen,...
                "^[DP][eo]?[lr].*se$",                      // Delores, Doris, Precious,...
                // The next three rules used to end in "$," (a stray comma inside the string),
                // which can never match; the comma has been removed.
                "^K[^v]*i.*[mns]$",                         // Karin, Kim, Kristin,...
                "^[ACGK].*[deinx][^aor]s$",                 // Agnes, Alexis, Glynis,...
                "^[^AGW][iu][gl].*[drt]$",                  // Juliet, Mildred, Millicent,...
                "^[EGILP][^eu]*i[ds]$",                     // Iris, Lois, Phyllis,...
                "^[BHL].*i.*[rtxz]$",                       // Beatriz, Bridget, Harriet,...
                "^[^GKSW].*(th|lv)(e[rt])?$"                // Heather, Ruth, Velvet,...
            };
        }
    }

    /// <summary>
    /// The word "name", one separator character (':', '|', whitespace or '+'), then a
    /// run of non-digit characters. The match includes the "name" label itself.
    /// </summary>
    public static string name
    {
        get { return @"name[:||\s+]\D+"; }
    }

    /// <summary>
    /// Five-digit or five-plus-four-digit postal code. The alternatives are anchored with
    /// ^ and $, so without RegexOptions.Multiline they only match a whole input.
    /// </summary>
    public static string PostalCode
    {
        get { return @"^\d{5}$|^\d{5}-\d{4}$"; }
    }

    /// <summary>
    /// Birth-date label (Birthday, Birthdate / Birth date, Date of birth, DOB or Born),
    /// an optional ':', whitespace, then a numeric date whose three parts are separated
    /// by '-', '/' or '.'. The match includes the label.
    /// </summary>
    public static string BirthDate
    {
        // Previously this returned a copy of the Fax pattern, so fax numbers were reported as birthdays.
        get { return @"(Birthday|Birth\s?date|Date of birth|DOB|Born):?\s+\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}"; }
    }

    /// <summary>http, https or ftp URL pattern.</summary>
    public static string WebPage
    {
        get { return @"(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z 0-9\-\._\?\,\'/\\\+&%\$#\=~])*"; }
    }

    /// <summary>Department pattern.</summary>
    // NOTE(review): inside a character class "\b" means backspace, not word boundary, and the
    // pattern requires a trailing space at the end of the input - confirm this is intended.
    public static string Department { get { return @"department [\b\w\:]+ $"; } }

    /// <summary>Business phone pattern.</summary>
    // NOTE(review): each "||" introduces an empty alternative, so this pattern can match the
    // empty string at any position. It is left unchanged on purpose: a caller that indexes
    // the first match unconditionally relies on there always being one.
    public static string BussinessPhone { get { return @"work||company||unversity||bussiness[\w]{0,10}[\b]*[:]*[\b]*[.]*^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$"; } }

    /// <summary>One or more literal dots followed by a newline.</summary>
    // NOTE(review): "[.]" is a literal dot, not "any character" - confirm against the callers
    // before changing, since they split on this pattern.
    public static string FirstLine { get { return @"[.]+\n"; } }

    /// <summary>One or more literal dots followed by '@'.</summary>
    public static string EmailFirstPart { get { return @"[.]+\@"; } }

    /// <summary>Whitespace, escape character or comma.</summary>
    // NOTE(review): presumably used to strip these characters before exporting to Excel/CSV - verify against callers.
    public static string CleanStringForExcel { get { return @"\n|\s|\t|\f|\v|\e|,"; } }
}
/// <summary>
/// One extracted value together with the label of the field it belongs to.
/// </summary>
class PersonData
{
    /// <summary>
    /// Column names as a freshly allocated array on every access.
    /// </summary>
    public static string[] DataColumns
    {
        get
        {
            string[] columns =
            {
                "Title", "First Name", "Middle Name", "Last Name", "Suffix", "Company", "Department", "Job Title",
                "Business Street", "Business Street 2", "Business Street 3", "Business City", "Business State", "Business Postal Code", "Business Country",
                "Home Street", "Home Street 2", "Home Street 3", "Home City", "Home State", "Home Postal Code", "Home Country",
                "Other Street", "Other Street 2", "Other Street 3", "Other City", "Other State", "Other Postal Code", "Other Country",
                "Assistant's Phone", "Business Fax", "Business Phone", "Business Phone 2", "Callback", "Car Phone", "Company Main Phone",
                "Home Fax", "Home Phone", "Home Phone 2", "ISDN", "Mobile Phone", "Other Fax", "Other Phone", "Pager", "Primary Phone",
                "Radio Phone", "TTY/TDD Phone", "Telex", "Account", "Anniversary", "Assistant's Name", "Billing Information", "Birthday",
                "Categories", "Children", "E-mail Address", "E-mail Display Name", "E-mail 2 Address", "E-mail 2 Display Name",
                "E-mail 3 Address", "E-mail 3 Display Name", "Gender", "Government ID Number", "Hobby", "Initials", "Keywords",
                "Language", "Location", "Mileage", "Notes", "Office Location", "Organizational ID Number", "PO Box", "Private",
                "Profession", "Referred By", "Spouse", "User 1", "User 2", "User 3", "User 4", "Web Page"
            };
            return columns;
        }
    }

    /// <summary>Label of the field this value belongs to.</summary>
    public string dataType { get; set; }

    /// <summary>The extracted value.</summary>
    public string Data { get; set; }

    /// <summary>Creates an instance with both properties left at their defaults (null).</summary>
    public PersonData()
    {
    }

    /// <summary>Creates an instance from a free-form field label and its value.</summary>
    /// <param name="dataType">Field label.</param>
    /// <param name="data">Extracted value.</param>
    public PersonData(string dataType, string data)
    {
        this.dataType = dataType;
        Data = data;
    }

    /// <summary>
    /// Creates an instance whose label is the name of the given
    /// <see cref="Person.DataType"/> member.
    /// </summary>
    /// <param name="dataType">Field kind; its ToString() becomes the label.</param>
    /// <param name="data">Extracted value.</param>
    public PersonData(Person.DataType dataType, string data)
        : this(dataType.ToString(), data)
    {
    }
}
/// <summary>
/// Heuristically extracts contact fields (name, e-mail, phone, gender, ...) from one
/// block of free text, using the patterns in <see cref="RegexExpressionCollection"/>.
/// </summary>
class Person
{
    /// <summary>Kinds of field the extraction methods produce.</summary>
    public enum DataType
    {
        First_Name, Middle_Name, Last_Name, Department, Job_Title, Business_State, Home_Postal_Code, Business_Phone, Home_Phone, Birthday, Email_Address, Email_Display_Name, Gender, Language, Notes, Web_Page, Business_Fax
    }

    // Gender hint words, matched as whole words so that e.g. "female", "the" or
    // "manager" no longer count as the male hints "male", "he" or "man".
    private const string MaleWordsPattern = @"\b(male|sir|mister|father|man|he|his)\b";
    private const string FemaleWordsPattern = @"\b(female|madam|mademoiselle|miss|mother|women|teacher|she|her)\b";

    // Pattern texts that made Regex throw in GetAddationalData. Keyed by the pattern
    // itself (not by list index) so a bad entry in one list cannot mask a valid entry
    // at the same position in another list.
    static readonly HashSet<string> WrongRegex = new HashSet<string>();

    string DataBlock { get; set; }

    /// <summary>Wraps a text block for extraction.</summary>
    /// <param name="DataBlock">Text to analyse; null is treated as an empty string.</param>
    public Person(string DataBlock)
    {
        this.DataBlock = DataBlock ?? string.Empty;
    }

    /// <summary>Returns the first e-mail address in the block, or an empty value when there is none.</summary>
    public PersonData getEmailPersonFromBlock()
    {
        return new PersonData(DataType.Email_Address,
            FirstMatchOrEmpty(RegexExpressionCollection.Email, RegexOptions.IgnoreCase));
    }

    /// <summary>Guesses the gender from the first extracted name part and the block text.</summary>
    public PersonData Gender()
    {
        // The name list can be empty; fall back to text-only detection instead of throwing.
        List<PersonData> nameParts = GetPersonNameFromBlock();
        return Gender(nameParts.Count > 0 ? nameParts[0].Data : "");
    }

    /// <summary>
    /// Guesses the gender: name rules first (male, then female), then hint words found
    /// in the block text (male, then female).
    /// </summary>
    /// <param name="Name">First name to test against the name rules; null or empty skips them.</param>
    /// <returns>A Gender entry holding "male", "Female" or "UnKnown".</returns>
    public PersonData Gender(string Name)
    {
        string candidate = ToNameCase(Name);
        if (IsMale(candidate))
            return new PersonData(DataType.Gender, "male");
        if (IsFemale(candidate))
            return new PersonData(DataType.Gender, "Female");
        if (Regex.IsMatch(this.DataBlock, MaleWordsPattern, RegexOptions.IgnoreCase))
            return new PersonData(DataType.Gender, "male");
        if (Regex.IsMatch(this.DataBlock, FemaleWordsPattern, RegexOptions.IgnoreCase))
            return new PersonData(DataType.Gender, "Female");
        return new PersonData(DataType.Gender, "UnKnown");
    }

    /// <summary>Returns the first line of the block that is exactly a postal code, or an empty value.</summary>
    public PersonData GetPersonPostalCodeFromBlocK()
    {
        // The pattern is anchored with ^ and $; Multiline makes the anchors apply per line
        // instead of to the whole block. Carriage returns are dropped so "$" also works
        // in front of "\r\n" line endings.
        string text = this.DataBlock.Replace("\r", "");
        Match code = Regex.Match(text, RegexExpressionCollection.PostalCode, RegexOptions.Multiline);
        return new PersonData(DataType.Home_Postal_Code, code.Success ? code.Value : "");
    }

    /// <summary>Extracts all fields without additional expressions.</summary>
    /// <param name="EmailIsMainData">When true, a block without an e-mail address yields an empty list.</param>
    public List<PersonData> GetPersonFromBlock(bool EmailIsMainData)
    {
        return GetPersonFromBlock(new List<AdditionalRegexExpression>(), EmailIsMainData);
    }

    /// <summary>Extracts all fields without additional expressions; an e-mail address is not required.</summary>
    public List<PersonData> GetPersonFromBlock()
    {
        return GetPersonFromBlock(new List<AdditionalRegexExpression>(), false);
    }

    /// <summary>
    /// Extracts every supported field from the block.
    /// </summary>
    /// <param name="AddationalRegex">Extra caller-defined expressions; their results are appended last.</param>
    /// <param name="EmailIsMainData">When true, a block without an e-mail address yields an empty list.</param>
    /// <returns>Name parts first, then the built-in fields, then the additional results.</returns>
    public List<PersonData> GetPersonFromBlock(List<AdditionalRegexExpression> AddationalRegex, bool EmailIsMainData)
    {
        PersonData email = getEmailPersonFromBlock();
        if (EmailIsMainData && email.Data == "")
            return new List<PersonData>();

        // Names are only looked up when an e-mail address was found.
        List<PersonData> nameParts = new List<PersonData>();
        if (email.Data != "")
            nameParts = GetPersonNameFromBlock(email.Data);
        string firstPart = nameParts.Count > 0 ? nameParts[0].Data : "";

        PersonData phone = GetPhonePersonFormBlock();
        PersonData birthDay = GetPhonePersonBirthDate();
        // Always typed entries: an untyped PersonData (null label, null value) used to be
        // added here when no name was found.
        PersonData emailDisplayName = new PersonData(DataType.Email_Display_Name, firstPart);
        PersonData postalCode = GetPersonPostalCodeFromBlocK();
        PersonData genderData = this.Gender(firstPart);
        // The whole block is kept as notes in case the other fields were extracted wrongly.
        PersonData notes = new PersonData(DataType.Notes, this.DataBlock);
        PersonData webPage = GetPersonWebPageFromBlock();
        PersonData businessPhone = GetPersonBussinessPhone();
        PersonData language = GetPersonLanguage();
        PersonData businessFax = getPersonBusinessFax();
        PersonData businessState = GetPersonBusiness_State();
        PersonData jobTitle = new PersonData(DataType.Job_Title, businessState.Data);
        PersonData department = GetPersonDepartment();
        List<PersonData> additional = GetAddationalData(AddationalRegex);

        List<PersonData> total = new List<PersonData>();
        total.AddRange(nameParts);
        total.AddRange(new List<PersonData> { businessFax, email, phone, birthDay, emailDisplayName, postalCode, genderData, notes, webPage, businessPhone, language, businessState, jobTitle, department });
        // Must stay last: a later entry may overwrite data taken by a built-in expression.
        total.AddRange(additional);
        return total;
    }

    // Splits on runs of whitespace, '.', ',', '-' or '|' and drops empty tokens.
    // ('|' is kept as a separator because the previous pattern treated it as one.)
    private string[] StringSplittededBySpace(string Data)
    {
        return Regex.Split(Data ?? "", @"[\s|.,\-]+")
                    .Where(delegate(string part) { return part.Length > 0; })
                    .ToArray();
    }

    /// <summary>Extracts the name parts, using the block's first e-mail address as fallback.</summary>
    public List<PersonData> GetPersonNameFromBlock()
    {
        string Email = getEmailPersonFromBlock().Data;
        return GetPersonNameFromBlock(Email);
    }

    /// <summary>
    /// Extracts first / middle / last name. Sources, in order: a "name" label in the block,
    /// the part of <paramref name="Email"/> before '@', the text before the first
    /// <see cref="RegexExpressionCollection.FirstLine"/> match.
    /// </summary>
    /// <param name="Email">E-mail address used as fallback; null is treated as empty.</param>
    /// <returns>Zero to three entries (First_Name, Middle_Name, Last_Name).</returns>
    public List<PersonData> GetPersonNameFromBlock(string Email)
    {
        string rawName = ExtractLabelledName();
        if (rawName == "")
        {
            rawName = (Email ?? "").Split('@')[0];
            if (rawName == "")
                rawName = Regex.Split(this.DataBlock, RegexExpressionCollection.FirstLine, RegexOptions.Singleline)[0];
        }

        string[] parts = StringSplittededBySpace(rawName);
        string first = parts.Length > 0 ? parts[0] : "";
        string middle = "";
        string last = "";
        if (parts.Length == 2)
        {
            last = parts[1];
        }
        else if (parts.Length > 2)
        {
            middle = parts[1];
            last = parts[2];
        }

        List<PersonData> result = new List<PersonData>();
        if (DataIsName(first))
            result.Add(new PersonData(DataType.First_Name, first));
        if (DataIsName(middle))
            result.Add(new PersonData(DataType.Middle_Name, middle));
        if (DataIsName(last))
            result.Add(new PersonData(DataType.Last_Name, last));
        return result;
    }

    // Returns the text that follows a "name" label, limited to its first line, or "".
    // Matching is case-insensitive on the original text so the name keeps its casing,
    // and the label itself is removed so it does not end up as the first name.
    private string ExtractLabelledName()
    {
        Match labelled = Regex.Match(this.DataBlock, RegexExpressionCollection.name,
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
        if (!labelled.Success)
            return "";
        string value = Regex.Replace(labelled.Value, @"^name\W*", "", RegexOptions.IgnoreCase);
        // The pattern runs on to the first digit, possibly across lines; keep one line only.
        int lineEnd = value.IndexOfAny(new char[] { '\r', '\n' });
        if (lineEnd >= 0)
            value = value.Substring(0, lineEnd);
        return value.Trim();
    }

    // The name rules are written for capitalised names ("Barry", "Kim"); bring the
    // candidate into that shape so lower-case input such as an e-mail local part is
    // judged by the same rules.
    private static string ToNameCase(string name)
    {
        if (name == null)
            return "";
        string trimmed = name.Trim();
        if (trimmed.Length == 0)
            return "";
        return char.ToUpperInvariant(trimmed[0]) + trimmed.Substring(1).ToLowerInvariant();
    }

    private static bool IsMale(string Name)
    {
        return MatchesAnyRule(Name, RegexExpressionCollection.Male);
    }

    private bool DataIsName(string Data)
    {
        return Regex.IsMatch(Data, @"[\s|\w|\.]+");
    }

    private static bool IsFemale(string Name)
    {
        return MatchesAnyRule(Name, RegexExpressionCollection.Female);
    }

    // The rule arrays are re-created on every property access, so they are read once by
    // the caller and passed in here.
    private static bool MatchesAnyRule(string name, string[] rules)
    {
        foreach (string rule in rules)
        {
            if (Regex.IsMatch(name ?? "", rule))
                return true;
        }
        return false;
    }

    // First match of the pattern in the block, or "" when nothing matches.
    private string FirstMatchOrEmpty(string pattern, RegexOptions options)
    {
        Match first = Regex.Match(this.DataBlock, pattern, options);
        return first.Success ? first.Value : "";
    }

    /// <summary>Returns the first URL in the block, or an empty value.</summary>
    public PersonData GetPersonWebPageFromBlock()
    {
        return new PersonData(DataType.Web_Page,
            FirstMatchOrEmpty(RegexExpressionCollection.WebPage, RegexOptions.None));
    }

    /// <summary>Returns all fax matches in the block concatenated without a separator, or an empty value.</summary>
    public PersonData getPersonBusinessFax()
    {
        StringBuilder fax = new StringBuilder();
        foreach (Match match in Regex.Matches(this.DataBlock, RegexExpressionCollection.Fax, RegexOptions.IgnoreCase))
        {
            fax.Append(match.Value);
        }
        return new PersonData(DataType.Business_Fax, fax.ToString());
    }

    /// <summary>Returns the first department match, or an empty value.</summary>
    public PersonData GetPersonDepartment()
    {
        return new PersonData(DataType.Department,
            FirstMatchOrEmpty(RegexExpressionCollection.Department, RegexOptions.None));
    }

    /// <summary>
    /// Derives a title from keywords in the block. The first hit wins, in the order
    /// Doctor, Professor, Manager, Headmaster, killer note; otherwise the value is empty.
    /// </summary>
    public PersonData GetPersonBusiness_State()
    {
        // These patterns are verbatim strings on purpose: in a regular C# string "\b" is
        // the backspace character, which made the previous patterns unmatchable.
        string state = "";
        if (BlockContains(@"\bDr\b|\bPh\.?D\b"))
            state = "Doctor";
        else if (BlockContains(@"\bprof(essor)?\b"))
            state = "Professor";
        else if (BlockContains(@"\bmanager\b"))
            state = "Manager";
        else if (BlockContains(@"\bheadmaster\b"))
            state = "Headmaster"; // this branch used to be a second, unreachable Doctor check
        else if (BlockContains(@"killer"))
            state = "Killer Don't Add Him °O° ";
        return new PersonData(DataType.Business_State, state);
    }

    // Case-insensitive "does the block contain a match" test.
    private bool BlockContains(string pattern)
    {
        return Regex.IsMatch(this.DataBlock, pattern, RegexOptions.IgnoreCase);
    }

    /// <summary>Returns "English" when the block contains at least one ASCII letter, otherwise "Unknown".</summary>
    public PersonData GetPersonLanguage()
    {
        // "\w" also matches digits and letters of every script, so it could not tell anything apart.
        bool hasLatinLetter = Regex.IsMatch(this.DataBlock, @"[A-Za-z]");
        return new PersonData(DataType.Language, hasLatinLetter ? "English" : "Unknown");
    }

    /// <summary>Returns the first business-phone match, or an empty value when there is none.</summary>
    public PersonData GetPersonBussinessPhone()
    {
        // The first match used to be indexed without checking that one exists.
        return new PersonData(DataType.Business_Phone,
            FirstMatchOrEmpty(RegexExpressionCollection.BussinessPhone, RegexOptions.None));
    }

    /// <summary>Returns the first phone match (label matched case-insensitively), or an empty value.</summary>
    public PersonData GetPhonePersonFormBlock()
    {
        return new PersonData(DataType.Home_Phone,
            FirstMatchOrEmpty(RegexExpressionCollection.Phone, RegexOptions.IgnoreCase));
    }

    /// <summary>Returns the first birth-date match (label matched case-insensitively), or an empty value.</summary>
    public PersonData GetPhonePersonBirthDate()
    {
        return new PersonData(DataType.Birthday,
            FirstMatchOrEmpty(RegexExpressionCollection.BirthDate, RegexOptions.IgnoreCase));
    }

    /// <summary>
    /// Runs caller-defined expressions against the block.
    /// </summary>
    /// <param name="AdditionalRegex">
    /// Expressions to run; null is treated as an empty list and null entries are skipped.
    /// For each entry the first NumOfResults matches (all matches when it is 0) are
    /// concatenated and stored under RgColumnTarget.
    /// </param>
    /// <returns>One entry per expression that was run.</returns>
    /// <exception cref="ArgumentException">
    /// The pattern is null or invalid. The pattern is remembered and skipped on later
    /// calls, and the exception is rethrown to the caller.
    /// </exception>
    public List<PersonData> GetAddationalData(List<AdditionalRegexExpression> AdditionalRegex)
    {
        List<PersonData> results = new List<PersonData>();
        if (AdditionalRegex == null)
            return results;

        foreach (AdditionalRegexExpression expression in AdditionalRegex)
        {
            if (expression == null || WrongRegex.Contains(expression.RgExp))
                continue;
            try
            {
                MatchCollection matches = Regex.Matches(this.DataBlock, expression.RgExp);
                int limit = expression.NumOfResults == 0
                    ? matches.Count
                    : Math.Min(expression.NumOfResults, matches.Count);
                StringBuilder data = new StringBuilder();
                for (int c = 0; c < limit; c++)
                {
                    data.Append(matches[c].Value);
                }
                results.Add(new PersonData(expression.RgColumnTarget, data.ToString()));
            }
            catch (ArgumentException)
            {
                WrongRegex.Add(expression.RgExp);
                throw; // plain rethrow keeps the original stack trace
            }
        }
        return results;
    }
}
}