Click here to Skip to main content
15,895,746 members
Articles / Programming Languages / C#

Emails Extractor

Rate me:
Please Sign up or sign in to vote.
4.85/5 (4 votes)
30 Sep 2012CPOL10 min read 40.1K   2.6K   17  
This program extracts e-mail addresses and related data (names, phone numbers, and gender) for a group of people from a text file and arranges them in a DataGridView, which can be saved as a .csv contacts file that can be imported into a Yahoo Mail account.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace WindowsFormsApplication9
{
    /// <summary>
    /// A caller-supplied regular expression, the name of the column its matches
    /// are stored under, and how many matches to keep.
    /// </summary>
    class AdditionalRegexExpression
    {
        /// <summary>Regular-expression pattern text.</summary>
        public string RgExp { get; set; }

        /// <summary>Name of the column the matched text is written to.</summary>
        public string RgColumnTarget { get; set; }

        /// <summary>Number of matches to keep.</summary>
        public int NumOfResults { get; set; }

        /// <summary>Creates an expression descriptor.</summary>
        /// <param name="Name">Target column name; stored in <see cref="RgColumnTarget"/>.</param>
        /// <param name="Regex">Pattern text; stored in <see cref="RgExp"/>.</param>
        /// <param name="NumberOfResults">Match limit; stored in <see cref="NumOfResults"/>.</param>
        public AdditionalRegexExpression(string Name, string Regex, int NumberOfResults)
        {
            RgColumnTarget = Name;
            RgExp = Regex;
            NumOfResults = NumberOfResults;
        }
    }
    /// <summary>
    /// Catalogue of the regular-expression patterns used to pull contact details
    /// out of a free-text block. Every member returns pattern text only; callers
    /// decide which RegexOptions to apply.
    /// </summary>
    class RegexExpressionCollection
    {
        /// <summary>"Fax:" label, one whitespace character, then digits / '+' / '-' / '['.</summary>
        public static string Fax
        {
            get { return @"Fax:\s[+]*([\d+[-]*)+"; }
        }

        /// <summary>E-mail address.</summary>
        public static string Email
        {
            get { return @"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*"; }
        }

        /// <summary>Phone label ("Tel", "phone", "telephone" or "mobile"), a colon, then the number.</summary>
        public static string Phone
        {
            get { return @"(Tel|phone|telephone|mobile):\s[+]*([\d+[-]*)+"; }
        }

        /// <summary>
        /// Heuristic patterns for first names treated as male. The patterns are
        /// case-sensitive and anchored to the whole name. A new array is built on
        /// every access, so cache the result when looping over it.
        /// </summary>
        public static string[] Male
        {
            get
            {
                return new string[]
                {
                    "^[^S].*r[rv]e?y?$",                       // Barry, Larry, Perry,...
                    "^[^G].*v[ei]$",                           // Clive, Dave, Steve,...
                    "^[^AJKLMNP][^o][^eit]*([glrsw]ey|lie)$",  // Dewey, Stanley, Wesley,...
                    "^[CGJWZ][^o][^dnt]*y$",                   // Gregory, Jeremy, Zachary,...
                    "^.*[Rlr][abo]y$",                         // Leroy, Murray, Roy,...
                    "^.*[GRguw][ae]y?ne$",                     // Duane, Eugene, Rene,...
                    "^[CLMQTV].*[^dl][in]c.*[ey]$",            // Lance, Quincy, Vince,...
                    "^.*[ay][dl]e$",                           // Clyde, Kyle, Pascale,...
                    "^[^o]*ke$",                               // Blake, Luke, Mike,...
                    "^[^EL].*o(rg?|sh?)?(e|ua)$",              // George, Joshua, Theodore,...
                    "^[^JPSWZ].*[denor]n.*y$",                 // Anthony, Henry, Rodney,...
                    "^Br[aou][cd].*[ey]$",                     // Bradley, Brady, Bruce,...
                    "^[ILW][aeg][^ir]*e$",                     // Ignace, Lee, Wallace,...
                    "^[ABEIUY][euz]?[blr][aeiy]$",             // Ari, Bela, Ira,...
                    "^[ART][^r]*[dhn]e?y$",                    // Randy, Timothy, Tony,...
                    "^.*oi?[mn]e$",                            // Antoine, Jerome, Tyrone,...
                    "^D.*[mnw].*[iy]$",                        // Danny, Demetri, Dondi,...
                    "^[^BG](e[rst]|ha)[^il]*e$"                // Pete, Serge, Shane,...
                };
            }
        }

        /// <summary>
        /// Heuristic patterns for first names treated as female. The patterns are
        /// case-sensitive and anchored to the whole name. A new array is built on
        /// every access, so cache the result when looping over it.
        /// </summary>
        public static string[] Female
        {
            get
            {
                return new string[]
                {
                    "^[ADFGIM][^r]*([bg]e[lr]|il|wn)$",        // Angel, Gail, Isabel,...
                    "^.*[aeiy]$",                              // Female names ending in a, e, i or y.
                    "^All?[iy]((ss?)|z)on$",                   // Allison (and variations)
                    "^.*een$",                                 // Cathleen, Eileen, Maureen,...
                    "^[^BD].*(b[iy]|y|via)nn?$",               // Carolyn, Gwendolyn, Vivian,...
                    "^[AEHJL].*il.*$",                         // Abigail, Jill, Lillian,...
                    "^.*[Jj](o|o?[ae]a?n.*)$",                 // Janet, Jennifer, Joan,...
                    "^[FLM].*ur(.*[^eotuy])?$",                // Fleur, Lauren, Muriel,...
                    "^M[aei]r[^tv].*([^cklnos]|([^o]n))$",     // Margaret, Marylou, Miriam,...
                    "^[CKS]h?(ar[^lst]|ry).+$",                // Carol, Karen, Sharon,...
                    "^[PR]e?a([^dfju]|qu)*[lm]$",              // Pam, Pearl, Rachel,...
                    "^.*[Aa]nn.*$",                            // Annacarol, Leann, Ruthann,...
                    "^.*[^cio]ag?h$",                          // Deborah, Leah, Sarah,...
                    "^[^EK].*[grsz]h?an(ces)?$",               // Frances, Megan, Susan,...
                    "^[^P]*([Hh]e|[Ee][lt])[^s]*[ey].*[^t]$",  // Ethel, Helen, Gretchen,...
                    "^[DP][eo]?[lr].*se$",                     // Delores, Doris, Precious,...
                    // The next three patterns used to carry a ',' inside the literal
                    // after '$', which made them impossible to match.
                    "^K[^v]*i.*[mns]$",                        // Karin, Kim, Kristin,...
                    "^[ACGK].*[deinx][^aor]s$",                // Agnes, Alexis, Glynis,...
                    "^[^AGW][iu][gl].*[drt]$",                 // Juliet, Mildred, Millicent,...
                    "^[EGILP][^eu]*i[ds]$",                    // Iris, Lois, Phyllis,...
                    "^[BHL].*i.*[rtxz]$",                      // Beatriz, Bridget, Harriet,...
                    "^[^GKSW].*(th|lv)(e[rt])?$"               // Heather, Ruth, Velvet,...
                };
            }
        }

        /// <summary>
        /// "name" label, one separator character, then a run of non-digit characters.
        /// </summary>
        /// <remarks>
        /// NOTE(review): inside the character class '|' and '+' are literal characters,
        /// not alternation/repetition, and \D also matches line breaks, so a match
        /// includes the label itself and can run across several lines.
        /// </remarks>
        public static string name
        {
            get { return @"name[:||\s+]\D+"; }
        }

        /// <summary>Five-digit or ZIP+4 postal code.</summary>
        /// <remarks>
        /// Anchored at both ends: without RegexOptions.Multiline it only matches when
        /// the entire input is a postal code.
        /// </remarks>
        public static string PostalCode
        {
            get { return @"^\d{5}$|^\d{5}-\d{4}$"; }
        }

        /// <summary>
        /// A birth-date label ("birthday", "birth date", "date of birth", "DOB",
        /// "born"; case-insensitive), an optional colon, then a numeric date whose
        /// parts are separated by '-', '/' or '.'.
        /// </summary>
        /// <remarks>
        /// This member previously returned the <see cref="Fax"/> pattern verbatim, so
        /// fax numbers were reported as birthdays. As with Fax and Phone, the matched
        /// text includes the label.
        /// </remarks>
        public static string BirthDate
        {
            get { return @"(?i)(birth\s?da(te|y)|date\sof\sbirth|dob|born)\s*:?\s*\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}"; }
        }

        /// <summary>http / https / ftp URL.</summary>
        public static string WebPage
        {
            get { return @"(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z 0-9\-\._\?\,\'/\\\+&%\$#\=~])*"; }
        }

        /// <summary>Lower-case "department " label followed by word characters up to the end of input.</summary>
        /// <remarks>
        /// NOTE(review): inside a character class \b means the backspace character, not
        /// a word boundary, and the pattern requires a trailing space before the end of
        /// the input - confirm this is intended.
        /// </remarks>
        public static string Department { get { return @"department [\b\w\:]+ $"; } }

        /// <summary>Business phone pattern.</summary>
        /// <remarks>
        /// NOTE(review): "||" creates empty alternatives, so this pattern matches the
        /// empty string at any position; it looks like single '|' was intended
        /// ("unversity" and "bussiness" also look misspelled). It is left unchanged here
        /// because a caller that indexes the first match relies on a match always existing.
        /// </remarks>
        public static string BussinessPhone { get { return @"work||company||unversity||bussiness[\w]{0,10}[\b]*[:]*[\b]*[.]*^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$"; } }

        /// <summary>One or more literal dots followed by a line feed.</summary>
        /// <remarks>NOTE(review): "[.]" is a literal dot, not "any character" - confirm intent.</remarks>
        public static string FirstLine { get { return @"[.]+\n"; } }

        /// <summary>One or more literal dots followed by '@'.</summary>
        /// <remarks>NOTE(review): "[.]" is a literal dot, not "any character" - confirm intent.</remarks>
        public static string EmailFirstPart { get { return @"[.]+\@"; } }

        /// <summary>Whitespace, escape and comma characters to strip before writing a CSV cell.</summary>
        public static string CleanStringForExcel { get { return @"\n|\s|\t|\f|\v|\e|,"; } }
    }
    /// <summary>
    /// One extracted value together with the name of the data type / column it
    /// belongs to.
    /// </summary>
    class PersonData
    {
        /// <summary>Name of the data type this value belongs to.</summary>
        public string dataType { get; set; }

        /// <summary>The extracted text.</summary>
        public string Data { get; set; }

        /// <summary>Creates an instance with both properties left unset (null).</summary>
        public PersonData() { }

        /// <summary>Creates a value tagged with an arbitrary type/column name.</summary>
        public PersonData(string dataType, string data)
        {
            this.dataType = dataType;
            this.Data = data;
        }

        /// <summary>Creates a value tagged with the enum member's name (its ToString() text).</summary>
        public PersonData(Person.DataType dataType, string data)
            : this(dataType.ToString(), data)
        {
        }

        /// <summary>
        /// Full list of contact column names. A new array is returned on every access.
        /// </summary>
        public static string[] DataColumns
        {
            get
            {
                string[] columns =
                {
                    "Title", "First Name", "Middle Name", "Last Name", "Suffix",
                    "Company", "Department", "Job Title",
                    "Business Street", "Business Street 2", "Business Street 3",
                    "Business City", "Business State", "Business Postal Code", "Business Country",
                    "Home Street", "Home Street 2", "Home Street 3",
                    "Home City", "Home State", "Home Postal Code", "Home Country",
                    "Other Street", "Other Street 2", "Other Street 3",
                    "Other City", "Other State", "Other Postal Code", "Other Country",
                    "Assistant's Phone", "Business Fax", "Business Phone", "Business Phone 2",
                    "Callback", "Car Phone", "Company Main Phone",
                    "Home Fax", "Home Phone", "Home Phone 2", "ISDN", "Mobile Phone",
                    "Other Fax", "Other Phone", "Pager", "Primary Phone", "Radio Phone",
                    "TTY/TDD Phone", "Telex",
                    "Account", "Anniversary", "Assistant's Name", "Billing Information",
                    "Birthday", "Categories", "Children",
                    "E-mail Address", "E-mail Display Name",
                    "E-mail 2 Address", "E-mail 2 Display Name",
                    "E-mail 3 Address", "E-mail 3 Display Name",
                    "Gender", "Government ID Number", "Hobby", "Initials", "Keywords",
                    "Language", "Location", "Mileage", "Notes", "Office Location",
                    "Organizational ID Number", "PO Box", "Private", "Profession",
                    "Referred By", "Spouse",
                    "User 1", "User 2", "User 3", "User 4",
                    "Web Page"
                };
                return columns;
            }
        }
    }
    class Person
        {
           /// <summary>
           /// Kinds of data that can be extracted for a person. The member name
           /// (via ToString()) is used as the type tag of a PersonData value.
           /// </summary>
           public enum DataType
           {
               First_Name,
               Middle_Name,
               Last_Name,
               Department,
               Job_Title,
               Business_State,
               Home_Postal_Code,
               Business_Phone,
               Home_Phone,
               Birthday,
               Email_Address,
               Email_Display_Name,
               Gender,
               Language,
               Notes,
               Web_Page,
               Business_Fax
           }
          
          
            /// <summary>The raw text block every extractor method searches.</summary>
            private string DataBlock { get; set; }

            /// <summary>Creates a person backed by the given text block.</summary>
            /// <param name="DataBlock">Free text describing one person; null is treated as empty.</param>
            public Person(string DataBlock)
            {
                // The extractors hand this text straight to Regex (or call string
                // methods on it), all of which throw on null. Normalising here
                // lets a missing block simply yield empty results.
                this.DataBlock = DataBlock ?? string.Empty;
            }
            /// <summary>
            /// Finds the first e-mail address in the block (case-insensitive).
            /// </summary>
            /// <returns>
            /// An Email_Address value holding the first address, or an empty string
            /// when the block contains none.
            /// </returns>
            public PersonData getEmailPersonFromBlock()
            {
                // Only the first address is ever returned, so stop at the first match
                // instead of collecting every match into a temporary list.
                Match firstEmail = Regex.Match(this.DataBlock, RegexExpressionCollection.Email, RegexOptions.IgnoreCase);
                string address = firstEmail.Success ? firstEmail.Value : "";
                return new PersonData(DataType.Email_Address, address);
            }
            /// <summary>
            /// Guesses the gender using the first name part extracted from the block.
            /// </summary>
            /// <returns>A Gender value ("male", "Female" or "UnKnown").</returns>
            public PersonData Gender()
            {
                List<PersonData> nameParts = GetPersonNameFromBlock();
                // The name list is empty when nothing usable was extracted; indexing
                // it unconditionally threw ArgumentOutOfRangeException. Fall back to
                // an empty name so only the text-based hints are used.
                string firstName = nameParts.Count > 0 ? nameParts[0].Data : "";
                return Gender(firstName);
            }
            /// <summary>
            /// Guesses the gender from a first name and, failing that, from gendered
            /// words in the block.
            /// </summary>
            /// <param name="Name">First name to classify; null is treated as empty.</param>
            /// <returns>A Gender value: "male", "Female" or "UnKnown".</returns>
            public PersonData Gender(string Name)
            {
                string candidate = Name ?? "";

                // The name heuristics take precedence over words found in the text.
                if (IsMale(candidate))
                {
                    return new PersonData(DataType.Gender, "male");
                }
                if (IsFemale(candidate))
                {
                    return new PersonData(DataType.Gender, "Female");
                }

                // Keywords must be matched as whole words and case-insensitively:
                // as bare substrings "male" hit "female", "he" hit "she"/"the" and
                // "man" hit "woman", so female text was reported as male.
                bool maleByText = Regex.IsMatch(
                    this.DataBlock,
                    @"\b(male|sir|mister|father|man|he|his)\b",
                    RegexOptions.IgnoreCase);
                if (maleByText)
                {
                    return new PersonData(DataType.Gender, "male");
                }

                bool femaleByText = Regex.IsMatch(
                    this.DataBlock,
                    @"\b(female|madam|mademoiselle|miss|mother|woman|women|teacher|she|her)\b",
                    RegexOptions.IgnoreCase);
                if (femaleByText)
                {
                    return new PersonData(DataType.Gender, "Female");
                }

                return new PersonData(DataType.Gender, "UnKnown");
            }
            /// <summary>
            /// Finds the first line of the block that is a postal code.
            /// </summary>
            /// <returns>
            /// A Home_Postal_Code value with the code, or an empty string when no line
            /// is a postal code.
            /// </returns>
            public PersonData GetPersonPostalCodeFromBlocK()
            {
                // The postal-code pattern is anchored with ^ and $, so applied to the
                // whole block it could only match when the block was nothing but a
                // postal code. Test each trimmed line instead; this also copes with
                // "\r\n" line endings, which RegexOptions.Multiline alone would not.
                string postalCode = "";
                foreach (string rawLine in this.DataBlock.Split('\r', '\n'))
                {
                    Match hit = Regex.Match(rawLine.Trim(), RegexExpressionCollection.PostalCode);
                    if (hit.Success)
                    {
                        postalCode = hit.Value;
                        break;
                    }
                }
                return new PersonData(DataType.Home_Postal_Code, postalCode);
            }
            /// <summary>
            /// Extracts all person data using only the built-in patterns.
            /// </summary>
            /// <param name="EmailIsMainData">Forwarded unchanged to the full overload.</param>
            public List<PersonData> GetPersonFromBlock(bool EmailIsMainData)
            {
                List<AdditionalRegexExpression> noExtraPatterns = new List<AdditionalRegexExpression>();
                return GetPersonFromBlock(noExtraPatterns, EmailIsMainData);
            }

            /// <summary>
            /// Extracts all person data using only the built-in patterns, with the
            /// e-mail flag set to false.
            /// </summary>
            public List<PersonData> GetPersonFromBlock()
            {
                return GetPersonFromBlock(false);
            }
            /// <summary>
            /// Runs every extractor over the block and returns the combined results.
            /// </summary>
            /// <param name="AddationalRegex">Extra caller-defined patterns; their results are appended last.</param>
            /// <param name="EmailIsMainData">
            /// When true, a block without a valid e-mail address yields an empty list.
            /// </param>
            /// <returns>
            /// Name parts first, then the fixed set of built-in fields, then the
            /// additional-pattern results.
            /// </returns>
            public List<PersonData> GetPersonFromBlock(List<AdditionalRegexExpression> AddationalRegex, bool EmailIsMainData)
            {
                PersonData email = getEmailPersonFromBlock();

                // An e-mail is mandatory in this mode: give up on blocks without one.
                if (EmailIsMainData && !Regex.IsMatch(email.Data, RegexExpressionCollection.Email))
                {
                    return new List<PersonData>();
                }

                // Names are only looked up when an e-mail exists; its local part is
                // the fallback when the block carries no explicit name.
                List<PersonData> nameParts = new List<PersonData>();
                if (email.Data != "")
                {
                    nameParts = GetPersonNameFromBlock(email.Data);
                }
                bool hasName = nameParts.Count > 0;

                PersonData phone = GetPhonePersonFormBlock();
                PersonData birthday = GetPhonePersonBirthDate();

                // Without a name these two used to be emitted as PersonData objects
                // with null dataType/Data. Emit typed, empty values instead, like
                // every other extractor does when it finds nothing.
                PersonData displayName = new PersonData(
                    DataType.Email_Display_Name,
                    hasName ? nameParts[0].Data : "");
                PersonData postalCode = GetPersonPostalCodeFromBlocK();
                PersonData gender = hasName
                    ? this.Gender(nameParts[0].Data)
                    : new PersonData(DataType.Gender, "");

                // The whole block is kept as notes in case any extraction was wrong.
                PersonData notes = new PersonData(DataType.Notes, this.DataBlock);
                PersonData webPage = GetPersonWebPageFromBlock();
                PersonData businessPhone = GetPersonBussinessPhone();
                PersonData language = GetPersonLanguage();
                PersonData businessFax = getPersonBusinessFax();
                PersonData businessState = GetPersonBusiness_State();
                // The job title reuses the text detected for the business state.
                PersonData jobTitle = new PersonData(DataType.Job_Title, businessState.Data);
                PersonData department = GetPersonDepartment();
                List<PersonData> additional = GetAddationalData(AddationalRegex);

                List<PersonData> all = new List<PersonData>();
                all.AddRange(nameParts);
                all.AddRange(new List<PersonData>
                {
                    businessFax, email, phone, birthday, displayName, postalCode, gender,
                    notes, webPage, businessPhone, language, businessState, jobTitle, department
                });
                // Must stay last: a caller pattern may target a column already filled above.
                all.AddRange(additional);
                return all;
            }
            /// <summary>
            /// Splits text into tokens on runs of whitespace, '|', '.', ',' and '-'.
            /// </summary>
            /// <param name="Data">Text to split; null is treated as empty.</param>
            /// <returns>
            /// The non-empty tokens in order. The array always has at least one
            /// element: a single empty string when there are no tokens.
            /// </returns>
            private string[] StringSplittededBySpace(string Data)
            {
                // The previous pattern used a lazy "+?", so "a, b" split into
                // "a", "", "b" and the empty token shifted the name parts. Split on
                // whole separator runs and drop empty leading/trailing pieces.
                List<string> tokens = new List<string>();
                foreach (string piece in Regex.Split(Data ?? "", @"[\s|.,\-]+"))
                {
                    if (piece.Length > 0)
                    {
                        tokens.Add(piece);
                    }
                }

                // Callers read element 0 unconditionally, so never return an empty array.
                if (tokens.Count == 0)
                {
                    tokens.Add("");
                }
                return tokens.ToArray();
            }
            /// <summary>
            /// Extracts the name parts, using the block's own first e-mail address as
            /// the fallback source for the name.
            /// </summary>
            public List<PersonData> GetPersonNameFromBlock()
            {
                return GetPersonNameFromBlock(getEmailPersonFromBlock().Data);
            }
            /// <summary>
            /// Extracts first / middle / last name parts for the person.
            /// </summary>
            /// <remarks>
            /// Sources are tried in this order: text following a "name" label in the
            /// block, the part of <paramref name="Email"/> before '@', and finally the
            /// block split by the FirstLine pattern.
            /// </remarks>
            /// <param name="Email">E-mail address used as a fallback; null is treated as empty.</param>
            /// <returns>
            /// The name parts that pass DataIsName, in first / middle / last order.
            /// The list may be empty.
            /// </returns>
            public List<PersonData> GetPersonNameFromBlock(string Email)
            {
                string Name = "";

                // Match the label case-insensitively on the original text rather
                // than on a lower-cased copy, so the name keeps its capitalisation
                // (the gender patterns are case-sensitive).
                Match labelled = Regex.Match(
                    this.DataBlock,
                    RegexExpressionCollection.name,
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);
                if (labelled.Success)
                {
                    // The match starts with the "name" label and its separator, and
                    // can run over following lines. Previously the label itself
                    // became the first name. Keep only the rest of the label's line.
                    string afterLabel = Regex.Replace(labelled.Value, @"^name[:|\s+]*", "", RegexOptions.IgnoreCase);
                    int lineEnd = afterLabel.IndexOfAny(new[] { '\r', '\n' });
                    Name = (lineEnd >= 0 ? afterLabel.Substring(0, lineEnd) : afterLabel).Trim();
                }

                if (Name == "")
                {
                    // Fall back to the e-mail's local part ...
                    Name = Regex.Split(Email ?? "", @"@", RegexOptions.Singleline)[0];
                    if (Name == "")
                    {
                        // ... and finally to the leading piece of the block itself.
                        Name = Regex.Split(this.DataBlock, RegexExpressionCollection.FirstLine, RegexOptions.Singleline)[0];
                    }
                }

                // 1 token: first name. 2 tokens: first + last. 3 or more: first,
                // middle and last taken from the first three tokens.
                string[] parts = StringSplittededBySpace(Name);
                string first = parts.Length > 0 ? parts[0] : "";
                string middle = "";
                string last = "";
                if (parts.Length > 2)
                {
                    middle = parts[1];
                    last = parts[2];
                }
                else if (parts.Length == 2)
                {
                    last = parts[1];
                }

                List<PersonData> result = new List<PersonData>();
                if (DataIsName(first))
                {
                    result.Add(new PersonData(DataType.First_Name, first));
                }
                if (DataIsName(middle))
                {
                    result.Add(new PersonData(DataType.Middle_Name, middle));
                }
                if (DataIsName(last))
                {
                    result.Add(new PersonData(DataType.Last_Name, last));
                }
                return result;
            }
            /// <summary>
            /// Tells whether a first name matches any of the male name patterns.
            /// </summary>
            /// <param name="Name">Name to test; null never matches.</param>
            private static bool IsMale(string Name)
            {
                if (Name == null)
                {
                    return false;
                }

                // The Male property builds a new array on every access; read it once
                // instead of twice per loop iteration.
                string[] malePatterns = RegexExpressionCollection.Male;
                foreach (string pattern in malePatterns)
                {
                    if (Regex.IsMatch(Name, pattern))
                    {
                        return true;
                    }
                }
                return false;
            }
            /// <summary>
            /// Tells whether the text contains at least one whitespace character,
            /// word character, '|' or '.'.
            /// </summary>
            private bool DataIsName(string Data)
            {
                Match nameCharacters = Regex.Match(Data, @"[\s|\w|\.]+");
                return nameCharacters.Success;
            }
            /// <summary>
            /// Tells whether a first name matches any of the female name patterns.
            /// </summary>
            /// <param name="Name">Name to test; null never matches.</param>
            private static bool IsFemale(string Name)
            {
                if (Name == null)
                {
                    return false;
                }

                // The Female property builds a new array on every access; read it
                // once instead of twice per loop iteration.
                string[] femalePatterns = RegexExpressionCollection.Female;
                foreach (string pattern in femalePatterns)
                {
                    if (Regex.IsMatch(Name, pattern))
                    {
                        return true;
                    }
                }
                return false;
            }
            /// <summary>
            /// Finds the first URL in the block.
            /// </summary>
            /// <returns>A Web_Page value with the URL, or an empty string when none is found.</returns>
            public PersonData GetPersonWebPageFromBlock()
            {
                Match firstUrl = Regex.Match(this.DataBlock, RegexExpressionCollection.WebPage);
                string url = firstUrl.Success ? firstUrl.Value : "";
                return new PersonData(DataType.Web_Page, url);
            }
            /// <summary>
            /// Collects every fax entry in the block.
            /// </summary>
            /// <returns>
            /// A Business_Fax value holding all matches joined back to back with no
            /// separator, or an empty string when there are none.
            /// </returns>
            public PersonData getPersonBusinessFax()
            {
                StringBuilder allFaxes = new StringBuilder();
                foreach (Match faxHit in Regex.Matches(this.DataBlock, RegexExpressionCollection.Fax))
                {
                    allFaxes.Append(faxHit.Value);
                }
                return new PersonData(DataType.Business_Fax, allFaxes.ToString());
            }
            /// <summary>
            /// Finds the first department entry in the block.
            /// </summary>
            /// <returns>A Department value with the matched text, or an empty string when none is found.</returns>
            public PersonData GetPersonDepartment()
            {
                Match firstDepartment = Regex.Match(this.DataBlock, RegexExpressionCollection.Department);
                string department = firstDepartment.Success ? firstDepartment.Value : "";
                return new PersonData(DataType.Department, department);
            }
            /// <summary>
            /// Derives a title from keywords in the block (case-insensitive).
            /// </summary>
            /// <returns>
            /// A Business_State value for the first rule that matches, checked in the
            /// order listed below, or an empty string when none matches.
            /// </returns>
            public PersonData GetPersonBusiness_State()
            {
                // The patterns used to be ordinary (non-verbatim) strings, in which
                // "\b" is the backspace character rather than a regex word boundary,
                // so "Dr.", "prof.", "manager" and "headmaster" could never match real
                // text. The headmaster flag was also computed but never returned (the
                // Doctor check was duplicated in its place).
                string[,] titleRules =
                {
                    { @"\bDr\b|\bPh\.?D\b",  "Doctor" },
                    { @"\bprof(essor)?\b",   "Professor" },
                    { @"manager\b",          "Manager" },
                    { @"headmaster\b",       "Headmaster" },
                    // Equivalent to the old "serial\bkiller\b|Killer": its first branch
                    // could never match, so any "killer" triggered it. Just in case :)
                    { @"killer",             "Killer Don't Add Him °O° " }
                };

                for (int rule = 0; rule < titleRules.GetLength(0); rule++)
                {
                    if (Regex.IsMatch(this.DataBlock, titleRules[rule, 0], RegexOptions.IgnoreCase))
                    {
                        return new PersonData(DataType.Business_State, titleRules[rule, 1]);
                    }
                }

                return new PersonData(DataType.Business_State, "");
            }
            /// <summary>
            /// Very rough language guess for the block.
            /// </summary>
            /// <returns>
            /// A Language value: "English" when the block contains at least one ASCII
            /// letter, digit or underscore, otherwise "Unknown".
            /// </returns>
            public PersonData GetPersonLanguage()
            {
                // By default .NET's \w also matches letters and digits of every other
                // script, so any non-empty text was reported as English.
                // RegexOptions.ECMAScript limits \w to [a-zA-Z_0-9].
                bool hasAsciiWordCharacter = Regex.IsMatch(this.DataBlock, @"\w", RegexOptions.ECMAScript);
                string language = hasAsciiWordCharacter ? "English" : "Unknown";
                return new PersonData(DataType.Language, language);
            }
            /// <summary>
            /// Finds the first business phone entry in the block.
            /// </summary>
            /// <returns>A Business_Phone value with the first match, or an empty string when there is none.</returns>
            public PersonData GetPersonBussinessPhone()
            {
                // Indexing [0] on the match collection throws when nothing matches.
                // That only stayed hidden because the current pattern can match the
                // empty string. Regex.Match gives the same first match without
                // depending on that.
                Match firstPhone = Regex.Match(this.DataBlock, RegexExpressionCollection.BussinessPhone);
                string phone = firstPhone.Success ? firstPhone.Value : "";
                return new PersonData(DataType.Business_Phone, phone);
            }
            /// <summary>
            /// Finds the first labelled phone number in the block.
            /// </summary>
            /// <returns>
            /// A Home_Phone value with the first match (label included), or an empty
            /// string when there is none.
            /// </returns>
            public PersonData GetPhonePersonFormBlock()
            {
                // Only the first number is used, so stop at the first match; the
                // unused PhoneNumber local has been removed.
                Match firstPhone = Regex.Match(this.DataBlock, RegexExpressionCollection.Phone);
                string phone = firstPhone.Success ? firstPhone.Value : "";
                return new PersonData(DataType.Home_Phone, phone);
            }
            /// <summary>
            /// Finds the first match of the birth-date pattern in the block.
            /// </summary>
            /// <returns>A Birthday value with the matched text, or an empty string when there is none.</returns>
            public PersonData GetPhonePersonBirthDate()
            {
                Match firstHit = Regex.Match(this.DataBlock, RegexExpressionCollection.BirthDate);
                string birthday = firstHit.Success ? firstHit.Value : "";
                return new PersonData(DataType.Birthday, birthday);
            }
            // Positions (indexes into the caller's list) of additional patterns that
            // have already failed once. Shared by all Person instances, so a failing
            // pattern is skipped for every later block.
            static List<int> WrongRegex = new List<int>();

            /// <summary>
            /// Applies the caller-supplied patterns to the block.
            /// </summary>
            /// <param name="AdditionalRegex">Patterns to run; null is treated as an empty list.</param>
            /// <returns>
            /// One value per pattern that was run, tagged with the pattern's target
            /// column and holding its matches joined back to back. NumOfResults limits
            /// how many matches are kept; 0 keeps all of them.
            /// </returns>
            /// <exception cref="Exception">
            /// Whatever evaluating a pattern throws (for example an invalid expression)
            /// is rethrown after the pattern's index has been blacklisted.
            /// </exception>
            public List<PersonData> GetAddationalData(List<AdditionalRegexExpression> AdditionalRegex)
            {
                List<PersonData> AdditionalData = new List<PersonData>();
                if (AdditionalRegex == null)
                {
                    return AdditionalData;
                }

                for (int i = 0; i < AdditionalRegex.Count; i++)
                {
                    if (WrongRegex.Contains(i))
                    {
                        continue;
                    }

                    try
                    {
                        AdditionalRegexExpression rule = AdditionalRegex[i];
                        MatchCollection hits = Regex.Matches(this.DataBlock, rule.RgExp);

                        // 0 means "all matches"; otherwise keep at most NumOfResults.
                        int wanted = rule.NumOfResults == 0
                            ? hits.Count
                            : Math.Min(rule.NumOfResults, hits.Count);

                        StringBuilder joined = new StringBuilder();
                        for (int c = 0; c < wanted; c++)
                        {
                            joined.Append(hits[c].Value);
                        }
                        AdditionalData.Add(new PersonData(rule.RgColumnTarget, joined.ToString()));
                    }
                    catch (Exception)
                    {
                        // Remember the failing position so later blocks skip it, then
                        // rethrow with "throw;" to keep the original stack trace.
                        WrongRegex.Add(i);
                        throw;
                    }
                }
                return AdditionalData;
            }
            
        }
    
    }

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer Agriculture Genetic Engineering Research Institute
Egypt Egypt
I have adored bioinformatics since my second year in college, and I specialized in this field during my final year in the biotechnology department. I have learned both the C# and Perl languages in order to build open software containing specialized tools for this particular science.
I'm trying to enhance my knowledge of this beautiful field by both reading and writing articles, and I also hope to earn both a master's and a doctoral degree in it.
If you have any scholarship, source, opportunity, project, or idea that can help me reach my goal, don't hesitate to contact me at:
E-Mail:samman_mahmoud@yahoo.com
Facebook:Samman Mahmoud
Tel: +20118904500

Comments and Discussions