Click here to Skip to main content
15,879,535 members
Articles / Programming Languages / C#

An Implementation of Regular Expression Parser in C#

Rate me:
Please Sign up or sign in to vote.
4.91/5 (43 votes)
24 Jun 2009CPOL8 min read 184.9K   7.2K   116  
An article on how one can implement regular expression parser
using System;
using System.Collections.Generic;
using System.Text;
using System.Windows.Forms;
using System.Diagnostics;


namespace RegularExpression
{


  public enum ErrorCode
  {
    ERR_SUCCESS,        // the patten is in correct format
    ERR_PREN_MISMATCH,  // "(A(D*)"
    ERR_EMPTY_PREN,     // "()"
    ERR_EMPTY_BRACKET,  // "[]"
    ERR_BRACKET_MISMATCH,  // "["
    ERR_OPERAND_MISSING,  // "A|"
    ERR_INVALID_ESCAPE,   // "\A"
    ERR_INVALID_RANGE,  // "[C-A]" 
    ERR_EMPTY_STRING,   // ""
  }

  internal class ValidationInfo
  {
    public ErrorCode ErrorCode = ErrorCode.ERR_SUCCESS;
    public int ErrorStartAt = -1;
    public int ErrorLength = -1;
    public string FormattedString = String.Empty;
    public bool MatchAtStart = false;
    public bool MatchAtEnd = false;
  }

  internal class MetaSymbol
  {
    public const char CONCANATE = '.';
    public const char ALTERNATE = '|';
    public const char ZERO_OR_MORE = '*';
    public const char ONE_OR_MORE = '+';
    public const char ZERO_OR_ONE = '?';
    public const char OPEN_PREN = '(';
    public const char CLOSE_PREN = ')';
    public const char COMPLEMENT = '^';
    public const char ANY_ONE_CHAR = '_';  // this used in the syntex i.e., A_B => AEB or AcB
    public const string ANY_ONE_CHAR_TRANS = "AnyChar";  // this is the actual transitional symbol
    public const char ESCAPE = '\\';
    public const string EPSILON = "epsilon";
    public const char CHARSET_START = '[';
    public const char CHARSET_END = ']';
    public const char RANGE = '-';
    public const string DUMMY = "Dummy";  // if you draw the model on paper, you should not draw this transition
    public const char MATCH_START = '^';  // token to specify to match at the begining of the string
    public const char MATCH_END = '$';  // token to specify to match at the end of string
    public const char NEW_LINE = 'n';
    public const char TAB = 't';
  }


  /// <summary>
  /// This class is used to validate a pattern.  Validation is done using Recursive Descent Parsing.
  /// Beside validating the pattern, it does two other tasks: insertion of implicit tokens and expanding charecter classes.
  /// i.e., "AB"    -> "A.B"        (inserting the concatening quatifyer)
  ///       "A.B"   -> "A\.B"       (inserting the escape)
  ///       "[A-C]" -> "(A|B|C)"    (expanding the range)
  ///       "(AB"   -> Reports error with mismatch prenthesis
  /// </summary>
  internal class RegExValidator
  {

    private bool m_bConcante = false;
    private bool m_bAlternate = false;

    private const char m_chNull = '\0'; // null symbol;
    private char m_chSym = m_chNull;

    private int m_nPatternLength = -1;
    private string m_sPattern = String.Empty;
    private int m_nCurrPos = -1;
    private StringBuilder m_sb = null;

    private ValidationInfo m_validationInfo = null;

    public RegExValidator()
    {

    }

    public ValidationInfo Validate(string sPattern)
    {
      m_chSym = m_chNull;
      m_bConcante = false;
      m_bAlternate = false;
      m_sb = new StringBuilder(1024);
      m_nCurrPos = -1;
      m_sPattern = sPattern;
      m_nPatternLength = m_sPattern.Length;

      m_validationInfo = new ValidationInfo();

      if (sPattern.Length == 0)
      {
        m_validationInfo.ErrorCode = ErrorCode.ERR_EMPTY_STRING;
        return m_validationInfo;
      }

      GetNextSymbol();

      string sLit1 = MetaSymbol.MATCH_START.ToString();
      string sLit2 = MetaSymbol.MATCH_END.ToString();
      string sLit3 = sLit1 + sLit2;

      if (!(sPattern.CompareTo(sLit1) == 0 || sPattern.CompareTo(sLit2) == 0 || sPattern.CompareTo(sLit3) == 0))
      {
        if (sPattern[0] == MetaSymbol.MATCH_START)
        {
          m_validationInfo.MatchAtStart = true;
          Accept(MetaSymbol.MATCH_START);
        }
        if (m_sPattern[m_nPatternLength - 1] == MetaSymbol.MATCH_END)
        {
          m_validationInfo.MatchAtEnd = true;
          m_nPatternLength--;
        }
      }

      try
      {
        while (m_nCurrPos < m_nPatternLength)
        {
          switch (m_chSym)
          {
            case MetaSymbol.ALTERNATE:
            case MetaSymbol.ONE_OR_MORE:
            case MetaSymbol.ZERO_OR_MORE:
            case MetaSymbol.ZERO_OR_ONE:
              Abort(ErrorCode.ERR_OPERAND_MISSING, m_nCurrPos, 1);
              break;
            case MetaSymbol.CLOSE_PREN:
              Abort(ErrorCode.ERR_PREN_MISMATCH, m_nCurrPos, 1);
              break;
            default:
              Expression();
              break;
          }
        }
      }
      catch (Exception ex)
      {
        Console.WriteLine(ex.Message);
      }

      m_validationInfo.FormattedString = m_sb.ToString();

      return m_validationInfo;
    }
    private void GetNextSymbol()
    {
      m_nCurrPos++;
      if (m_nCurrPos < m_nPatternLength)
      {
        m_chSym = m_sPattern[m_nCurrPos];
      }
      else
      {
        m_chSym = m_chNull;
      }
    }
    private bool Accept(char ch)
    {
      if (m_chSym == ch)
      {
        GetNextSymbol();
        return true;
      }
      return false;
    }
    private bool AcceptPostfixOperator()
    {
      switch (m_chSym)
      {
        case MetaSymbol.ONE_OR_MORE:
        case MetaSymbol.ZERO_OR_MORE:
        case MetaSymbol.ZERO_OR_ONE:
          m_sb.Append(m_chSym);
          return Accept(m_chSym);
        default:
          return false;
      }
    }
    private bool AcceptNonEscapeChar()
    {
      switch (m_chSym)
      {
        case MetaSymbol.ALTERNATE:
        case MetaSymbol.CHARSET_START:
        case MetaSymbol.CLOSE_PREN:
        case MetaSymbol.ESCAPE:
        case MetaSymbol.ONE_OR_MORE:
        case MetaSymbol.OPEN_PREN:
        case MetaSymbol.ZERO_OR_MORE:
        case MetaSymbol.ZERO_OR_ONE:
        case MetaSymbol.CONCANATE:
        case m_chNull:
          return false;
        default:
          AppendConate();
          m_sb.Append(m_chSym);
          Accept(m_chSym);
          break;
      }
      return true;
    }
    private bool Expect(char ch)
    {
      if (Accept(ch))
      {
        return true;
      }
      return false;
    }
    private bool ExpectEscapeChar()
    {
      switch (m_chSym)
      {
        case MetaSymbol.ALTERNATE:
        case MetaSymbol.ANY_ONE_CHAR:
        case MetaSymbol.CHARSET_START:
        case MetaSymbol.CLOSE_PREN:
        case MetaSymbol.COMPLEMENT:
        case MetaSymbol.ESCAPE:
        case MetaSymbol.ONE_OR_MORE:
        case MetaSymbol.OPEN_PREN:
        case MetaSymbol.ZERO_OR_MORE:
        case MetaSymbol.ZERO_OR_ONE:
          m_sb.Append(MetaSymbol.ESCAPE);
          m_sb.Append(m_chSym);
          Accept(m_chSym);
          break;
        case MetaSymbol.NEW_LINE:
          m_sb.Append('\n');
          Accept(m_chSym);
          break;
        case MetaSymbol.TAB:
          m_sb.Append('\t');
          Accept(m_chSym);
          break;
        default:
          return false;
      }
      return true;
    }
    private void Abort(ErrorCode errCode, int nErrPosition, int nErrLen)
    {
      m_validationInfo.ErrorCode = errCode;
      m_validationInfo.ErrorStartAt = nErrPosition;
      m_validationInfo.ErrorLength = nErrLen;

      throw new Exception("Syntex error.");
    }
    private void AppendConate()
    {
      if (m_bConcante)
      {
        m_sb.Append(MetaSymbol.CONCANATE);
        m_bConcante = false;
      }
    }
    private void AppendAlternate()
    {
      if (m_bAlternate)
      {
        m_sb.Append(MetaSymbol.ALTERNATE);
        m_bAlternate = false;
      }
    }
    private void Expression()
    {
      while (Accept(MetaSymbol.ESCAPE))
      {
        AppendConate();
        if (!ExpectEscapeChar())
        {
          Abort(ErrorCode.ERR_INVALID_ESCAPE, m_nCurrPos - 1, 1);
        }
        AcceptPostfixOperator();
        m_bConcante = true;
      }

      while (Accept(MetaSymbol.CONCANATE))
      {
        AppendConate();
        m_sb.Append(MetaSymbol.ESCAPE);
        m_sb.Append(MetaSymbol.CONCANATE);
        AcceptPostfixOperator();
        m_bConcante = true;
      }

      while (Accept(MetaSymbol.COMPLEMENT))
      {
        AppendConate();
        m_sb.Append(MetaSymbol.ESCAPE);
        m_sb.Append(MetaSymbol.COMPLEMENT);
        AcceptPostfixOperator();
        m_bConcante = true;
      }

      while (AcceptNonEscapeChar())
      {
        AcceptPostfixOperator();
        m_bConcante = true;
        Expression();
      }

      if (Accept(MetaSymbol.OPEN_PREN))
      {
        int nEntryPos = m_nCurrPos - 1;
        AppendConate();
        m_sb.Append(MetaSymbol.OPEN_PREN);
        Expression();
        if (!Expect(MetaSymbol.CLOSE_PREN))
        {
          Abort(ErrorCode.ERR_PREN_MISMATCH, nEntryPos, m_nCurrPos - nEntryPos );
        }
        m_sb.Append(MetaSymbol.CLOSE_PREN);

        int nLen = m_nCurrPos - nEntryPos;
        if (nLen == 2)
        {
          Abort(ErrorCode.ERR_EMPTY_PREN, nEntryPos, m_nCurrPos - nEntryPos );
        }

        AcceptPostfixOperator();
        m_bConcante = true;
        Expression();
      }


      if (Accept(MetaSymbol.CHARSET_START))
      {
        int nEntryPos = m_nCurrPos - 1;
        bool bComplement = false;

        AppendConate();

        if (Accept(MetaSymbol.COMPLEMENT))
        {
          bComplement = true;
        }

        string sTmp = m_sb.ToString();

        m_sb = new StringBuilder(1024);
        m_bAlternate = false;
        CharecterSet();

        if (!Expect(MetaSymbol.CHARSET_END))
        {
          Abort(ErrorCode.ERR_BRACKET_MISMATCH, nEntryPos, m_nCurrPos - nEntryPos);
        }

        int nLen = m_nCurrPos - nEntryPos;

        if (nLen == 2)  // "[]"
        {
          Abort(ErrorCode.ERR_EMPTY_BRACKET, nEntryPos, m_nCurrPos - nEntryPos);
        }
        else if (nLen == 3 && bComplement == true) // "[^]"  - treat the complement as literal
        {
          m_sb = new StringBuilder(1024);
          m_sb.Append(sTmp);
          m_sb.Append(MetaSymbol.OPEN_PREN);
          m_sb.Append(MetaSymbol.ESCAPE);
          m_sb.Append(MetaSymbol.COMPLEMENT);
          m_sb.Append(MetaSymbol.CLOSE_PREN);
        }
        else
        {
          string sCharset = m_sb.ToString();
          m_sb = new StringBuilder(1024);
          m_sb.Append(sTmp);
          if (bComplement)
          {
            m_sb.Append(MetaSymbol.COMPLEMENT);
          }
          m_sb.Append(MetaSymbol.OPEN_PREN);
          m_sb.Append(sCharset /*ExpandRange(sCharset, nEntryPos) */   );
          m_sb.Append(MetaSymbol.CLOSE_PREN);
        }

        AcceptPostfixOperator();

        m_bConcante = true;

        Expression();
      }


      if (Accept(MetaSymbol.ALTERNATE))
      {
        int nEntryPos = m_nCurrPos - 1;
        m_bConcante = false;
        m_sb.Append(MetaSymbol.ALTERNATE);
        Expression();
        int nLen = m_nCurrPos - nEntryPos;
        if (nLen == 1)
        {
          Abort(ErrorCode.ERR_OPERAND_MISSING, nEntryPos, m_nCurrPos - nEntryPos );
        }
        Expression();
      }


    }
    private string ExpectEscapeCharInBracket()
    {
      char ch = m_chSym;

      switch (m_chSym)
      {
        case MetaSymbol.CHARSET_END:
        case MetaSymbol.ESCAPE:
          //AppendAlternate();
          //m_sb.Append(MetaSymbol.ESCAPE);
          //m_sb.Append(m_chSym);
          //return Accept(m_chSym);
          Accept(m_chSym);
          return MetaSymbol.ESCAPE.ToString() + ch.ToString();
        case MetaSymbol.NEW_LINE:
          //AppendAlternate();
          //m_sb.Append(Environment.NewLine);
          //return Accept(m_chSym);
          Accept(m_chSym);
          //return Environment.NewLine;
          return ('\n').ToString();
        case MetaSymbol.TAB:
          //AppendAlternate();
          //m_sb.Append('\t');
          //return Accept(m_chSym);
          Accept(m_chSym);
          return ('\t').ToString();
        default:
          return String.Empty;
      }
    }
    private string AcceptNonEscapeCharInBracket()
    {
      char ch = m_chSym;

      switch (ch)
      {
        case MetaSymbol.CHARSET_END:
        case MetaSymbol.ESCAPE:
        case m_chNull:
          return String.Empty;
        case MetaSymbol.ALTERNATE:
        case MetaSymbol.ANY_ONE_CHAR:
        case MetaSymbol.CLOSE_PREN:
        case MetaSymbol.COMPLEMENT:
        case MetaSymbol.ONE_OR_MORE:
        case MetaSymbol.OPEN_PREN:
        case MetaSymbol.ZERO_OR_MORE:
        case MetaSymbol.ZERO_OR_ONE:
        case MetaSymbol.CONCANATE:
          Accept(m_chSym);
          return MetaSymbol.ESCAPE.ToString() + ch.ToString();
        default:
          Accept(m_chSym);
          return ch.ToString();
      }
    }
    private void CharecterSet()
    {
      int nRangeFormStartAt = -1;
      int nStartAt = -1;
      int nLength = -1;

      // xx-xx form
      string sLeft = String.Empty;
      string sRange = String.Empty;
      string sRight = String.Empty;


      string sTmp = String.Empty;

      while(true)
      {
        sTmp = String.Empty;

        nStartAt = m_nCurrPos;

        if (Accept(MetaSymbol.ESCAPE))
        {
          if ((sTmp = ExpectEscapeCharInBracket()) == String.Empty)
          {
            Abort(ErrorCode.ERR_INVALID_ESCAPE, m_nCurrPos - 1, 1);
          }
          nLength = 2;
        }

        if (sTmp == String.Empty)
        {
          sTmp = AcceptNonEscapeCharInBracket();
          nLength = 1;
        }

        if (sTmp == String.Empty)
        {
          break;
        }

        if (sLeft == String.Empty)
        {
          nRangeFormStartAt = nStartAt;
          sLeft = sTmp;
          AppendAlternate();
          m_sb.Append(sTmp);
          m_bAlternate = true;
          continue;
        }

        if (sRange == String.Empty)
        {
          if (sTmp != MetaSymbol.RANGE.ToString())
          {
            nRangeFormStartAt = nStartAt;
            sLeft = sTmp;
            AppendAlternate();
            m_sb.Append(sTmp);
            m_bAlternate = true;
            continue;
          }
          else
          {
            sRange = sTmp;
          }
          continue;
        }

        sRight = sTmp;


        bool bOk = ExpandRange(sLeft, sRight);

        if (bOk == false)
        {
          int nSubstringLen =  (nStartAt + nLength) - nRangeFormStartAt;

          Abort(ErrorCode.ERR_INVALID_RANGE, nRangeFormStartAt, nSubstringLen);
        }
        sLeft = String.Empty;
        sRange = String.Empty;
        sRange = String.Empty;
      }

      if (sRange != String.Empty)
      {
        AppendAlternate();
        m_sb.Append(sRange);
        m_bAlternate = true;
      }

    }
    private bool ExpandRange(string sLeft, string sRight)
    {
      char chLeft = (sLeft.Length > 1 ? sLeft[1] : sLeft[0]);
      char chRight = (sRight.Length > 1 ? sRight[1] : sRight[0]);

      if (chLeft > chRight)
      {
        return false;
      }

      chLeft++;
      while (chLeft <= chRight)
      {
        AppendAlternate();

        switch (chLeft)
        {
          case MetaSymbol.ALTERNATE:
          case MetaSymbol.ANY_ONE_CHAR:
          case MetaSymbol.CLOSE_PREN:
          case MetaSymbol.COMPLEMENT:
          case MetaSymbol.CONCANATE:
          case MetaSymbol.ESCAPE:
          case MetaSymbol.ONE_OR_MORE:
          case MetaSymbol.ZERO_OR_MORE:
          case MetaSymbol.ZERO_OR_ONE:
          case MetaSymbol.OPEN_PREN:
            m_sb.Append(MetaSymbol.ESCAPE);
            break;
          default:
            break;
        }

        m_sb.Append(chLeft);
        m_bAlternate = true;
        chLeft++;
      }

      return true;

    }




  }

}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Denmark Denmark
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions