Click here to Skip to main content
15,884,298 members
Articles / Programming Languages / C#

An Implementation of Regular Expression Parser in C#

Rate me:
Please Sign up or sign in to vote.
4.91/5 (43 votes)
24 Jun 2009CPOL8 min read 185.2K   7.2K   116  
An article on how one can implement regular expression parser
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Windows.Forms;
using System.Data;
using System.Diagnostics;

namespace RegularExpression
{

  /// <summary>
  /// the regular expression recognizer class
  /// </summary>
  public class RegEx
  {
    /// <summary>
    /// Used to validate the pattern string and make sure it is in correct form
    /// </summary>
    static private RegExValidator m_reValidator = new RegExValidator();

    /// <summary>
    /// Position of the last error in the pattern string.
    /// 
    /// </summary>
    private int m_nLastErrorIndex = -1;

    /// <summary>
    /// Length of the error substring in the pattern string
    /// </summary>
    private int m_nLastErrorLength = -1;

    /// <summary>
    /// ErrorCode indicating what if the last compilation succeded.
    /// </summary>
    private ErrorCode m_LastErrorCode = ErrorCode.ERR_SUCCESS;

    /// <summary>
    /// Specifies match must occure at the begining of the input string (^)
    /// </summary>
    private bool m_bMatchAtStart = false; 
 
    /// <summary>
    ///  Specifies match must occure at the end of the input string ($)
    /// </summary>
    private bool m_bMatchAtEnd = false; 
   
    /// <summary>
    /// Behave greedy, default is true.
    /// If true, Find will stop at the first accepting state, otherwisie it will try process more charectr
    /// </summary>
    private bool m_bGreedy = true;         

    /// <summary>
    /// Start state of the DFA M'.
    /// </summary>
    private State m_stateStartDfaM = null;


    /// <summary>
    /// public default constructor
    /// </summary>
    public RegEx()
    {

    }

 
    /// <summary>
    /// to keep set of distinct input symbol. needed for NFA to DFA conversion
    /// </summary>
    private Set m_setInputSymbol = new Set();

    /// <summary>
    /// Converts a regular expression from infix form to postfix form
    /// </summary>
    /// <param name="sInfixPattern">regular expression in infix form</param>
    /// <returns>regular expression in postfix form</returns>
    private string ConvertToPostfix(string sInfixPattern)
    {
      Stack stackOperator = new Stack();
      Queue queuePostfix = new Queue();
      bool bEscape = false;

      for (int i = 0; i < sInfixPattern.Length; i++)
      {
        char ch = sInfixPattern[i];


        if (bEscape == false && ch == MetaSymbol.ESCAPE)
        {
          queuePostfix.Enqueue(ch);
          bEscape = true;
          continue;
        }

        if (bEscape == true)
        {
          queuePostfix.Enqueue(ch);
          bEscape = false;
          continue;
        }
        switch (ch)
        {
          case MetaSymbol.OPEN_PREN:
            stackOperator.Push(ch);
            break;
          case MetaSymbol.CLOSE_PREN:
            while ((char)stackOperator.Peek() != MetaSymbol.OPEN_PREN)
            {
              queuePostfix.Enqueue(stackOperator.Pop());
            }
            stackOperator.Pop();  // pop the '('

            break;
          default:
            while (stackOperator.Count > 0)
            {
              char chPeeked = (char)stackOperator.Peek();

              int nPriorityPeek = GetOperatorPriority(chPeeked);
              int nPriorityCurr = GetOperatorPriority(ch);

              if (nPriorityPeek >= nPriorityCurr)
              {
                queuePostfix.Enqueue(stackOperator.Pop());
              }
              else
              {
                break;
              }
            }
            stackOperator.Push(ch);
            break;
        }

      }  // end of for..loop

      while (stackOperator.Count > 0)
      {
        queuePostfix.Enqueue((char)stackOperator.Pop());
      }
      StringBuilder sb = new StringBuilder(1024);
      while (queuePostfix.Count > 0)
      {
        sb.Append((char)queuePostfix.Dequeue());
      }


      return sb.ToString();
    }

    /// <summary>
    /// helper function. needed for postfix conversion
    /// </summary>
    /// <param name="chOpt">literal symbol</param>
    /// <returns>priority</returns>
    private int GetOperatorPriority(char chOpt)
    {
      switch (chOpt)
      {
        case MetaSymbol.OPEN_PREN:
          return 0;
        case MetaSymbol.ALTERNATE:
          return 1;
        case MetaSymbol.CONCANATE:
          return 2;
        case MetaSymbol.ZERO_OR_ONE:
        case MetaSymbol.ZERO_OR_MORE:
        case MetaSymbol.ONE_OR_MORE:
          return 3;
        case MetaSymbol.COMPLEMENT:
          return 4;
        default:
          return 5;

      }
    }


    /// <summary>
    /// Compiles a pattern string produces a Minimum DFA model.
    /// </summary>
    /// <param name="sPattern">Actual pattern string in the correct format</param>
    /// <param name="sbStats">This will receive the statistics. Can be null.</param>
    /// <returns>ErrorCode indicating how the compilatio went.</returns>
    public ErrorCode CompileWithStats(string sPattern, StringBuilder sbStats)
    {
      Console.WriteLine("Orig RE: " + sPattern);


      if (sbStats == null)
      {
        return Compile(sPattern);  // no statistics required
      }

      m_setInputSymbol.Clear();
      State.ResetCounter();

      string sUnit = "Tick(s)";
      long tcStart = DateTime.Now.Ticks;
      ValidationInfo vi = m_reValidator.Validate(sPattern);

      UpdateValidationInfo(vi);

      if (vi.ErrorCode != ErrorCode.ERR_SUCCESS)
      {
        return vi.ErrorCode;
      }
      string sRegExConcat = vi.FormattedString;
      long tcConcat = DateTime.Now.Ticks - tcStart;

      tcStart = DateTime.Now.Ticks;
      string sRegExPostfix = ConvertToPostfix(sRegExConcat);
      long tcPostfix = DateTime.Now.Ticks - tcStart;

      Set setMasterNfa = new Set();
      tcStart = DateTime.Now.Ticks;
      State stateStartNfa = CreateNfa(sRegExPostfix, setMasterNfa);
      long tcNfa = DateTime.Now.Ticks - tcStart;
      int nNfaStateCount = setMasterNfa.Count;
      string sFsaNfa = GetFsaTable(stateStartNfa, setMasterNfa, m_setInputSymbol);

      State.ResetCounter();
      Set setMasterDfa = new Set();
      tcStart = DateTime.Now.Ticks;
      State stateStartDfa = ConvertToDfa(stateStartNfa, setMasterDfa);
      m_stateStartDfaM = stateStartDfa;
      long tcDfa = DateTime.Now.Ticks - tcStart;
      int nDfaStateCount = setMasterDfa.Count;
      string sFsaDfa = GetFsaTable(stateStartDfa, setMasterDfa, m_setInputSymbol);

      Set setMasterDfaM = new Set();
      setMasterDfaM.Union(setMasterDfa); // working on the references of DFA states
      tcStart = DateTime.Now.Ticks;
      State stateStartDfaM = ReduceDfa(stateStartDfa, setMasterDfaM, m_setInputSymbol);
      m_stateStartDfaM = stateStartDfaM;
      long tcDfaM = DateTime.Now.Ticks - tcStart;
      int nDfaMStateCount = setMasterDfaM.Count;
      string sFsaDfaM = GetFsaTable(stateStartDfaM, setMasterDfaM, m_setInputSymbol);

      sbStats.Append("Original String:\t" + sPattern);
      sbStats.AppendLine();
      sbStats.Append("After Validation:\t" + sRegExConcat + " (TimeSpan: " + tcConcat.ToString() + " " + sUnit + ")");
      sbStats.AppendLine();
      sbStats.Append("After Postfix:\t\t" + sRegExPostfix + " (TimeSpan: " + tcPostfix.ToString() + " " + sUnit + ")");
      sbStats.AppendLine();
      sbStats.Append("Input Symbol Count:\t" + m_setInputSymbol.Count.ToString());
      sbStats.AppendLine();
      sbStats.AppendLine();
      sbStats.Append("NFA Table(TimeSpan: " + tcNfa.ToString() + " " + sUnit + "):");
      sbStats.AppendLine();
      sbStats.Append(sFsaNfa);
      sbStats.AppendLine();
      sbStats.AppendLine();
      sbStats.Append("DFA Table(TimeSpan: " + tcDfa.ToString() + " " + " " + sUnit + "):");
      sbStats.AppendLine();
      sbStats.Append(sFsaDfa);
      sbStats.AppendLine();
      sbStats.AppendLine();
      sbStats.Append("DFA M' Table(TimeSpan: " + tcDfaM.ToString() + " " + sUnit + "):");
      sbStats.AppendLine();
      sbStats.Append(sFsaDfaM);
      sbStats.AppendLine();
      sbStats.AppendLine();

      long nTotal = (tcConcat + tcPostfix + tcNfa + tcDfa + tcDfaM);
      sbStats.AppendLine();
      sbStats.Append("Total TimeSpan: " + nTotal.ToString() + " " + sUnit);

      return ErrorCode.ERR_SUCCESS;
    }

    /// <summary>
    /// Compiles a pattern string produces a Minimum DFA model.
    /// </summary>
    /// <param name="sPattern">Actual pattern string in the correct format</param>
    /// <returns>ErrorCode indicating how the compilatio went.</returns>
    public ErrorCode Compile(string sPattern)
    {
      ValidationInfo vi = m_reValidator.Validate(sPattern);

      UpdateValidationInfo(vi);

      if (vi.ErrorCode != ErrorCode.ERR_SUCCESS)
      {
        return vi.ErrorCode;
      }

      m_setInputSymbol.Clear();

      State.ResetCounter();
      string sRegExConcat = sPattern;

      string sRegExPostfix = ConvertToPostfix(sRegExConcat);

      Set setMasterNfa = new Set();
      State stateStartNfa = CreateNfa(sRegExPostfix, setMasterNfa);

      State.ResetCounter();
      Set setMasterDfa = new Set();
      State stateStartDfa = ConvertToDfa(stateStartNfa, setMasterDfa);
      m_stateStartDfaM = stateStartDfa;

      Set setMasterDfaM = new Set();
      setMasterDfaM.Union(setMasterDfa); // working on the references of DFA states
      m_stateStartDfaM = ReduceDfa(stateStartDfa, setMasterDfaM, m_setInputSymbol);

      return ErrorCode.ERR_SUCCESS;

    }

    /// <summary>
    /// Finds all state reachable from the specic state on Epsilon transition
    /// </summary>
    /// <param name="stateStart">State from which search begins</param>
    /// <returns>A set of all state reachable from teh startState on Epsilon transtion</returns>
    private Set Eclosure(State stateStart)
    {
      Set setProcessed = new Set();
      Set setUnprocessed = new Set();

      setUnprocessed.AddElement(stateStart);

      while (setUnprocessed.Count > 0)
      {
        State state = (State)setUnprocessed[0];
        State[] arrTrans = state.GetTransitions(MetaSymbol.EPSILON);
        setProcessed.AddElement(state);
        setUnprocessed.RemoveElement(state);

        if (arrTrans != null)
        {
          foreach (State stateEpsilon in arrTrans)
          {
            if (!setProcessed.ElementExist(stateEpsilon))
            {
              setUnprocessed.AddElement(stateEpsilon);
            }
          }
        }


      }

      return setProcessed;

    }

    /// <summary>
    /// Finds all state reachable from the set of states on Epsilon transition
    /// </summary>
    /// <param name="setState">Set of states to search from</param>
    /// <returns></returns>
    private Set Eclosure(Set setState)
    {
      Set setAllEclosure = new Set();
      State state = null;
      foreach (object obj in setState)
      {
        state = (State)obj;

        Set setEclosure = Eclosure(state);
        setAllEclosure.Union(setEclosure);
      }
      return setAllEclosure;
    }

    /// <summary>
    /// Gets Move of a set states.
    /// </summary>
    /// <param name="setState">Set of state for which to get Move </param>
    /// <param name="chInputSymbol">Innput symbol</param>
    /// <returns>Set of Move</returns>
    private Set Move(Set setState, string sInputSymbol)
    {
      Set set = new Set();
      State state = null;
      foreach (object obj in setState)
      {
        state = (State)obj;
        Set setMove = Move(state, sInputSymbol);
        set.Union(setMove);
      }
      return set;
    }

    /// <summary>
    /// Gets Move of a state.
    /// </summary>
    /// <param name="state">state for which to get Move</param>
    /// <param name="chInputSymbol">Input symbol</param>
    /// <returns>Set of Move</returns>
    private Set Move(State state, string sInputSymbol)
    {
      Set set = new Set();

      State[] arrTrans = state.GetTransitions(sInputSymbol);

      if (arrTrans != null)
      {
        set.AddElementRange(arrTrans);
      }

      return set;

    }

    /// <summary>
    /// Converts a regular expression in posfix form to NFA using "Thompson�s Construction"
    /// </summary>
    /// <param name="sRegExPosfix">Regulare expression in postfix form (pattern)</param>
    /// <returns>Start state of the NFA</returns>
    private State CreateNfa(string sRegExPosfix, Set setMasterNfa)
    {
      Stack stackNfa = new Stack();
      NfaExpression expr = null;
      NfaExpression exprA = null;
      NfaExpression exprB = null;
      NfaExpression exprNew = null;
      bool bEscape = false;

      foreach (char ch in sRegExPosfix)
      {
        if (bEscape == false && ch == MetaSymbol.ESCAPE)
        {
          bEscape = true;
          continue;
        }

        if (bEscape == true)
        {
          exprNew = new NfaExpression();
          exprNew.StartState().AddTransition(ch.ToString(), exprNew.FinalState());

          stackNfa.Push(exprNew);

          m_setInputSymbol.AddElement(ch);  // this set is later used to convert from NFA to DFA
          setMasterNfa.AddElement(exprNew.StartState());
          setMasterNfa.AddElement(exprNew.FinalState());
          bEscape = false;
          continue;
        }

        switch (ch)
        {
          case MetaSymbol.ZERO_OR_MORE:  // A*  Kleene closure

            exprA = (NfaExpression)stackNfa.Pop();
            exprNew = new NfaExpression();

            exprA.FinalState().AddTransition(MetaSymbol.EPSILON, exprA.StartState());
            exprA.FinalState().AddTransition(MetaSymbol.EPSILON, exprNew.FinalState());

            exprNew.StartState().AddTransition(MetaSymbol.EPSILON, exprA.StartState());
            exprNew.StartState().AddTransition(MetaSymbol.EPSILON, exprNew.FinalState());

            stackNfa.Push(exprNew);
            setMasterNfa.AddElement(exprNew.StartState());
            setMasterNfa.AddElement(exprNew.FinalState());
            break;
          case MetaSymbol.ALTERNATE:  // A|B
            exprB = (NfaExpression)stackNfa.Pop();
            exprA = (NfaExpression)stackNfa.Pop();

            exprNew = new NfaExpression();

            exprA.FinalState().AddTransition(MetaSymbol.EPSILON, exprNew.FinalState());
            exprB.FinalState().AddTransition(MetaSymbol.EPSILON, exprNew.FinalState());

            exprNew.StartState().AddTransition(MetaSymbol.EPSILON, exprA.StartState());
            exprNew.StartState().AddTransition(MetaSymbol.EPSILON, exprB.StartState());

            stackNfa.Push(exprNew);
            setMasterNfa.AddElement(exprNew.StartState());
            setMasterNfa.AddElement(exprNew.FinalState());
            break;

          case MetaSymbol.CONCANATE:  // AB
            exprB = (NfaExpression)stackNfa.Pop();
            exprA = (NfaExpression)stackNfa.Pop();

            exprA.FinalState().AddTransition(MetaSymbol.EPSILON, exprB.StartState());

            exprNew = new NfaExpression(exprA.StartState(), exprB.FinalState());
            stackNfa.Push(exprNew);
            setMasterNfa.AddElement(exprNew.StartState());
            setMasterNfa.AddElement(exprNew.FinalState());
            break;

          case MetaSymbol.ONE_OR_MORE:  // A+ => AA* => A.A*

            exprA = (NfaExpression)stackNfa.Pop();
            exprNew = new NfaExpression();

            exprNew.StartState().AddTransition(MetaSymbol.EPSILON, exprA.StartState());
            exprNew.FinalState().AddTransition(MetaSymbol.EPSILON, exprA.StartState());
            exprA.FinalState().AddTransition(MetaSymbol.EPSILON, exprNew.FinalState());

            stackNfa.Push(exprNew);
            setMasterNfa.AddElement(exprNew.StartState());
            setMasterNfa.AddElement(exprNew.FinalState());
            break;
          case MetaSymbol.ZERO_OR_ONE:  // A? => A|empty  
            exprA = (NfaExpression)stackNfa.Pop();
            exprNew = new NfaExpression();

            exprNew.StartState().AddTransition(MetaSymbol.EPSILON, exprA.StartState());
            exprNew.StartState().AddTransition(MetaSymbol.EPSILON, exprNew.FinalState());
            exprA.FinalState().AddTransition(MetaSymbol.EPSILON, exprNew.FinalState());

            stackNfa.Push(exprNew);
            setMasterNfa.AddElement(exprNew.StartState());
            setMasterNfa.AddElement(exprNew.FinalState());
            break;
          case MetaSymbol.ANY_ONE_CHAR:
            exprNew = new NfaExpression();
            exprNew.StartState().AddTransition(MetaSymbol.ANY_ONE_CHAR_TRANS, exprNew.FinalState());
            stackNfa.Push(exprNew);

            m_setInputSymbol.AddElement(MetaSymbol.ANY_ONE_CHAR_TRANS);  // this set is later used to convert from NFA to DFA
            setMasterNfa.AddElement(exprNew.StartState());
            setMasterNfa.AddElement(exprNew.FinalState());
            break;

          case MetaSymbol.COMPLEMENT:  // ^ 

            exprA = (NfaExpression)stackNfa.Pop();

            NfaExpression exprDummy = new NfaExpression();
            exprDummy.StartState().AddTransition(MetaSymbol.DUMMY, exprDummy.FinalState());

            exprA.FinalState().AddTransition(MetaSymbol.EPSILON, exprDummy.StartState());

            NfaExpression exprAny = new NfaExpression();
            exprAny.StartState().AddTransition(MetaSymbol.ANY_ONE_CHAR_TRANS, exprAny.FinalState());


            exprNew = new NfaExpression();
            exprNew.StartState().AddTransition(MetaSymbol.EPSILON, exprA.StartState());
            exprNew.StartState().AddTransition(MetaSymbol.EPSILON, exprAny.StartState());

            exprAny.FinalState().AddTransition(MetaSymbol.EPSILON, exprNew.FinalState());
            exprDummy.FinalState().AddTransition(MetaSymbol.EPSILON, exprNew.FinalState());

            stackNfa.Push(exprNew);

            m_setInputSymbol.AddElement(MetaSymbol.ANY_ONE_CHAR_TRANS);  // this set is later used to convert from NFA to DFA
            m_setInputSymbol.AddElement(MetaSymbol.DUMMY);

            setMasterNfa.AddElement(exprNew.StartState());
            setMasterNfa.AddElement(exprNew.FinalState());

            setMasterNfa.AddElement(exprAny.StartState());
            setMasterNfa.AddElement(exprAny.FinalState());

            setMasterNfa.AddElement(exprDummy.StartState());
            setMasterNfa.AddElement(exprDummy.FinalState());

            break;
          default:
            exprNew = new NfaExpression();
            exprNew.StartState().AddTransition(ch.ToString(), exprNew.FinalState());

            stackNfa.Push(exprNew);

            m_setInputSymbol.AddElement(ch);  // this set is later used to convert from NFA to DFA
            setMasterNfa.AddElement(exprNew.StartState());
            setMasterNfa.AddElement(exprNew.FinalState());
            break;

        } // end of switch statement


      }  // end of for..each loop

      Debug.Assert(stackNfa.Count == 1);
      expr = (NfaExpression)stackNfa.Pop();  // pop the very last one.  YES, THERE SHOULD ONLY BE ONE LEFT AT THIS POINT
      expr.FinalState().AcceptingState = true;  // the very last state is the accepting state of the NFA

      return expr.StartState();  // retun the start state of NFA

    }  // end of CreateNfa method

    /// <summary>
    /// Converts NFA to DFA using "Subset Construction"
    /// </summary>
    /// <param name="stateStartNfa">Starting state of NFA</param>
    /// <param name="setMasterDfa">Contains set of all DFA states when this function returns</param>
    /// <returns>Starting state of DFA</returns>
    private State ConvertToDfa(State stateStartNfa, Set setMasterDfa)
    {
      NfaToDfaHelper helper = new NfaToDfaHelper();
      Set setMove = null;
      Set setEclosure = null;

      // first, we get Eclosure of the start state of NFA ( just following the algoritham)
      setEclosure = Eclosure(stateStartNfa);
      State stateStartDfa = new State();  // create a new DFA state to represent the above Eclosure

      // NOTE: 
      // we keep track of the NFA Eclosure and the DFA state that represent the Eclosure.
      // we maintain a relationship between the NFA Eclosure and DFA state that represents the NFA Eclosure.
      // all these are done in the NfaToDfaHelper class.

      if (IsAcceptingGroup(setEclosure) == true)
      {
        stateStartDfa.AcceptingState = true;
      }

      helper.AddDfaState(stateStartDfa, setEclosure);

      string sInputSymbol = String.Empty; // dummy

      // please see "subset construction" algoritham
      // for clear understanding

      State stateT = null;
      Set setT = null;
      State stateU = null;

      while ((stateT = helper.GetNextUnmarkedDfaState()) != null)
      {
        helper.Mark(stateT);   // flag it to indicate that we have processed this state.

        // the DFA state stateT represents a set of NFA Eclosure.
        // so, we retrieve the Eclosure.
        setT = helper.GetEclosureByDfaState(stateT);

        foreach (object obj in m_setInputSymbol)
        {
          sInputSymbol = obj.ToString();

          setMove = Move(setT, sInputSymbol);

          if (setMove.IsEmpty() == false)
          {
            setEclosure = Eclosure(setMove);

            stateU = helper.FindDfaStateByEclosure(setEclosure);

            if (stateU == null) // so set setEclosure must be a new one and we should crate a new DFA state
            {
              stateU = new State();
              if (IsAcceptingGroup(setEclosure) == true)
              {
                stateU.AcceptingState = true;
              }

              helper.AddDfaState(stateU, setEclosure);  // add new state (as unmarked by default)
            }

            stateT.AddTransition(sInputSymbol, stateU);
          }

        }  // end of foreach..loop

      }  // end of while..loop

      setMasterDfa.Union(helper.GetAllDfaState());
      return stateStartDfa;

    }  // end of ConverToDfa method

    /// <summary>
    /// Converts DFA to Minimum DFA or DFA M'.
    /// </summary>
    /// <param name="stateStartDfa">Starting state of DFA</param>
    /// <param name="setMasterDfa">Set of all DFA state (including the starting one)</param>
    /// <param name="setInputSymbol">Set of all input symbol</param>
    /// <returns>Starting state of DFA M'</returns>
    private State ReduceDfa(State stateStartDfa, Set setMasterDfa, Set setInputSymbol)
    {
      State stateStartReducedDfa = null;   // start state of the Reduced DFA
      ArrayList arrGroup = null;  // master array of all possible partitions/groups

      // STEP 1: partition the DFS states.
      // we do this by calling another method 
      arrGroup = PartitionDfaGroups(setMasterDfa, setInputSymbol);

      // NOTE: arrGroup now contains all possible groups for all the DFA state.


      // STEP 2: now we go through all the groups and select a group representative for each group.
      // eventually the representive becomes one of the state in DFA M'.  All other members of the groups get eliminited (deleted).
      foreach (object objGroup in arrGroup)
      {
        Set setGroup = (Set)objGroup;

        bool bAcceptingGroup = IsAcceptingGroup(setGroup);  // see if the group contains any accepting state
        bool bStartingGroup = setGroup.ElementExist(stateStartDfa); // check if the group contains the starting DFA state

        // choose group representative
        State stateRepresentative = (State)setGroup[0]; // just choose the first one as group representative


        // should the representative be start state of DFA M'
        if (bStartingGroup == true)
        {
          stateStartReducedDfa = stateRepresentative;
        }

        // should the representative be an accepting state of DFA M'
        if (bAcceptingGroup == true)
        {
          stateRepresentative.AcceptingState = true;
        }

        if (setGroup.GetCardinality() == 1)
        {
          continue;  // no need for further processing
        }


        // STEP 3: remove the representative from its group
        // and replace all the references of the remaining member of the group with the representative
        setGroup.RemoveElement(stateRepresentative);

        State stateToBeReplaced = null;   // state to be replaced with the group representative
        int nReplecementCount = 0;
        foreach (object objStateToReplaced in setGroup)
        {
          stateToBeReplaced = (State)objStateToReplaced;

          setMasterDfa.RemoveElement(stateToBeReplaced);  // remove this member from the master set as well

          foreach (object objState in setMasterDfa)
          {
            State state = (State)objState;
            nReplecementCount += state.ReplaceTransitionState(stateToBeReplaced, stateRepresentative);
          }

          // here, in C++, you would actully delete the stateA object by calling:
          // delete stateToBeRemoved;

        }
      }  // end of outer foreach..loop

      //  STEP 4: now remove all "dead states"
      int nIndex = 0;
      while (nIndex < setMasterDfa.Count)
      {
        State state = (State)setMasterDfa[nIndex];
        if (state.IsDeadState())
        {
          setMasterDfa.RemoveAt(nIndex);
          // here, in C++, you would actully delete the stateDead object by calling:
          // delete stateDead;
          continue;
        }
        nIndex++;
      }


      return stateStartReducedDfa;
    }

    /// <summary>
    /// Helper function. Check to see if a set contains any accpeting state.
    /// </summary>
    /// <param name="setGroup">Set of state</param>
    /// <returns>true if set contains any accpting states, otherwise false</returns>
    private bool IsAcceptingGroup(Set setGroup)
    {
      State state = null;

      foreach (object objState in setGroup)
      {
        state = (State)objState;

        if (state.AcceptingState == true)
        {
          return true;
        }
      }

      return false;
    }

    /// <summary>
    /// Partions set of all DFA states into smaller groups (according to the partion rules).
    /// Please see notes for detail of partioning DFA.
    /// </summary>
    /// <param name="setMasterDfa">Set of all DFA states</param>
    /// <param name="setInputSymbol">Set of all input symbol</param>
    /// <returns>Array of DFA groups</returns>
    private ArrayList PartitionDfaGroups(Set setMasterDfa, Set setInputSymbol)
    {
      ArrayList arrGroup = new ArrayList();  // array of all set (group) of DFA states.
      Map map = new Map();   // to keep track of which memeber transition into which group
      Set setEmpty = new Set();

      // first we need to create two partition of the setMasterDfa:
      // one with all the accepting states and the other one with all the non-accpeting states
      Set setAccepting = new Set();  // group of all accepting state
      Set setNonAccepting = new Set();  // group of all non-accepting state

      foreach (object objState in setMasterDfa)
      {
        State state = (State)objState;

        if (state.AcceptingState == true)
        {
          setAccepting.AddElement(state);
        }
        else
        {
          setNonAccepting.AddElement(state);
        }
      }

      if (setNonAccepting.GetCardinality() > 0)
      {
        arrGroup.Add(setNonAccepting);  // add this newly created partition to the master list
      }

      // for accepting state, there should always be at least one state, if NOT then there must be something wrong somewhere
      arrGroup.Add(setAccepting);   // add this newly created partition to the master list


      // now we iterate through these two partitions and see if they can be further partioned.
      // we continuew the iteration until no further paritioning is possible.

      IEnumerator iterInput = setInputSymbol.GetEnumerator();

      iterInput.Reset();

      while (iterInput.MoveNext())
      {
        string sInputSymbol = iterInput.Current.ToString();

        int nPartionIndex = 0;
        while (nPartionIndex < arrGroup.Count)
        {
          Set setToBePartitioned = (Set)arrGroup[nPartionIndex];
          nPartionIndex++;

          if (setToBePartitioned.IsEmpty() || setToBePartitioned.GetCardinality() == 1)
          {
            continue;   // because we can't partition a set with zero or one memeber in it
          }

          foreach (object objState in setToBePartitioned)
          {
            State state = (State)objState;
            State[] arrState = state.GetTransitions(sInputSymbol.ToString());

            if (arrState != null)
            {
              Debug.Assert(arrState.Length == 1);

              State stateTransionTo = arrState[0];  // since the state is DFA state, this array should contain only ONE state

              Set setFound = FindGroup(arrGroup, stateTransionTo);
              map.Add(setFound, state);
            }
            else   // no transition exists, so transition to empty set
            {
              //setEmpty = new Set();
              map.Add(setEmpty, state);  // keep a map of which states transtion into which group

            }
          }  // end of foreach (object objState in setToBePartitioned)

          if (map.Count > 1)  // means some states transition into different groups
          {
            arrGroup.Remove(setToBePartitioned);
            foreach (DictionaryEntry de in map)
            {
              Set setValue = (Set)de.Value;
              arrGroup.Add(setValue);
            }
            nPartionIndex = 0;  // we want to start from the begining again
            iterInput.Reset();  // we want to start from the begining again
          }
          map.Clear();
        }  // end of while..loop


      }  // end of foreach (object objString in setInputSymbol)

      return arrGroup;
    }  // end of PartitionDfaSet method

    /// <summary>
    /// Helper function.  Finds a set in a array of set for a particular state.
    /// </summary>
    /// <param name="arrGroup">Array of set of states</param>
    /// <param name="state">State to search for</param>
    /// <returns>Set the state belongs to</returns>
    private Set FindGroup(ArrayList arrGroup, State state)
    {
      foreach (object objSet in arrGroup)
      {
        Set set = (Set)objSet;

        if (set.ElementExist(state) == true)
        {
          return set;
        }
      }

      return null;
    }

    /// <summary>
    /// Retuns a string format of NFA.Set type.
    /// Only used for debugging.
    /// </summary>
    /// <param name="set">Set of state.</param>
    /// <returns>Formatted string</returns>
    private string SetToString(Set set)
    {
      string s = "";
      foreach (object objState in set)
      {
        State state = (State)objState;
        s += state.Id.ToString() + ", ";
      }

      s = s.TrimEnd(new char[] { ' ', ',' });
      if (s.Length == 0)
      {
        s = "Empty";
      }
      s = "{" + s + "}";
      return s;
    }

    /// <summary>
    /// Converts an Automata into formatted string that represents a Finate Automat State table.
    /// Very useful for understanding automata and drawing the model.
    /// NOTE: Not needed for actual regular expression operations.
    /// </summary>
    /// <param name="startState">Starting state of the automata</param>
    /// <param name="setAllState">Set of all states</param>
    /// <param name="setInputSymbol">Set of all input symbols</param>
    /// <returns>Formatted string</returns>
    private string GetFsaTable(State startState, Set setAllState, Set setInputSymbol)
    {
      FsaDS fsaDS = new FsaDS();
      Set setAllInput = new Set();
      setAllInput.Union(setInputSymbol);
      setAllInput.RemoveElement(MetaSymbol.EPSILON);

      setAllInput.AddElement(MetaSymbol.EPSILON);

      string sInputSymbol = String.Empty;
      foreach (object objString in setAllInput)
      {
        sInputSymbol = objString.ToString();
        System.Data.DataColumn dc = fsaDS.Fsa.Columns.Add(sInputSymbol);
        dc.DataType = typeof(string);
        dc.DefaultValue = "--";
      }

      int nTransitionCount = 0;

      State state = null;
      FsaDS.FsaRow rFsa = null;
      foreach (object objState in setAllState)
      {
        state = (State)objState;

        rFsa = fsaDS.Fsa.NewFsaRow();
        if (state.AcceptingState == true)
        {
          rFsa.State = "{" + "s" + state.Id.ToString() + "}";
        }
        else
        {
          rFsa.State = "s" + state.Id.ToString();
        }

        if (state.Equals(startState) == true)
        {
          rFsa.State = ">" + rFsa.State;
        }


        State[] arrTrans = null;
        foreach (object objString in setAllInput)
        {
          sInputSymbol = objString.ToString();
          arrTrans = state.GetTransitions(sInputSymbol.ToString());
          if (arrTrans != null)
          {
            nTransitionCount += arrTrans.Length;

            StringBuilder sbData = new StringBuilder(1024);
            int i = 0;
            for (i = 0; i < arrTrans.Length - 1; i++)
            {
              sbData.Append("s" + arrTrans[i].Id.ToString() + ", ");
            }
            sbData.Append("s" + arrTrans[i].Id.ToString());

            System.Data.DataColumn dc = fsaDS.Fsa.Columns[sInputSymbol];
            rFsa[dc] = sbData.ToString();
          }
        }
        fsaDS.Fsa.AddFsaRow(rFsa);
      }
      fsaDS.AcceptChanges();


      StringBuilder sb = new StringBuilder(2048);

      // first create the column header using column name for header text
      for (int i = 0; i < fsaDS.Fsa.Columns.Count - 1; i++)
      {
        sb.Append(fsaDS.Fsa.Columns[i].ColumnName);
        sb.Append("\t|\t");
      }
      sb.Append(fsaDS.Fsa.Columns[fsaDS.Fsa.Columns.Count - 1].ColumnName);
      sb.AppendLine();

      // now create the rows
      foreach (FsaDS.FsaRow fr in fsaDS.Fsa)
      {
        for (int i = 0; i < fsaDS.Fsa.Columns.Count - 1; i++)
        {
          sb.Append(fr[i].ToString());
          sb.Append("\t|\t");
        }
        sb.Append(fr[fsaDS.Fsa.Columns.Count - 1].ToString());
        sb.AppendLine();
      }
      string sFormat = "(State Count: {0};  Transition Count: {1})";
      string sStat = String.Format(sFormat, setAllState.Count, nTransitionCount);
      sb.Append(sStat);

      sb.AppendLine();

      return sb.ToString();
    }

    /// <summary>
    /// Search and finds a match for the compiled pattern.
    /// One must call Compile method before calling this method.
    /// </summary>
    /// <param name="sSearchIn">String to search in.</param>
    /// <param name="nSearchStartAt">Index at which to begin the search.</param>
    /// <param name="nSearchEndAt">Index at which to end the search.</param>
    /// <param name="nFoundBeginAt">If match found, recives the index where the match started, otherwise -1</param>
    /// <param name="nFoundEndAt">If match found, recives the index where the match ended, otherwise -1</param>
    /// <returns>true if match found, otherwise false.</returns>
    public bool FindMatch(string sSearchIn,
                          int nSearchStartAt,
                          int nSearchEndAt,
                          ref int nFoundBeginAt,
                          ref int nFoundEndAt)
    {

      if (m_stateStartDfaM == null)
      {
        return false;
      }

      if (nSearchStartAt < 0)
      {
        return false;
      }

      State stateStart = m_stateStartDfaM;

      nFoundBeginAt = -1;
      nFoundEndAt = -1;

      bool bAccepted = false;
      State toState = null;
      State stateCurr = stateStart;
      int nIndex = nSearchStartAt;
      int nSearchUpTo = nSearchEndAt;


      while (nIndex <= nSearchUpTo)
      {

        if (m_bGreedy && IsWildCard(stateCurr) == true)
        {
          if (nFoundBeginAt == -1)
          {
            nFoundBeginAt = nIndex;
          }
          ProcessWildCard(stateCurr, sSearchIn, ref nIndex, nSearchUpTo);
        }

        char chInputSymbol = sSearchIn[nIndex];

        toState = stateCurr.GetSingleTransition(chInputSymbol.ToString());

        if (toState == null)
        {
          toState = stateCurr.GetSingleTransition(MetaSymbol.ANY_ONE_CHAR_TRANS);
        }

        if (toState != null)
        {
          if (nFoundBeginAt == -1)
          {
            nFoundBeginAt = nIndex;
          }

          if (toState.AcceptingState)
          {
            if (m_bMatchAtEnd && nIndex != nSearchUpTo)  // then we ignore the accepting state
            {
              //toState = stateStart ;
            }
            else
            {
              bAccepted = true;
              nFoundEndAt = nIndex;
              if (m_bGreedy == false)
              {
                break;
              }
            }
          }

          stateCurr = toState;
          nIndex++;
        }
        else
        {
          if (!m_bMatchAtStart && !bAccepted )  // we reset everything
          {
            nIndex = (nFoundBeginAt != -1 ? nFoundBeginAt + 1 : nIndex + 1);

            nFoundBeginAt = -1;
            nFoundEndAt = -1;
            //nIndex++;
            stateCurr = stateStart;  // start from begining
          }
          else
          {
            break;
          } 
        }
      }  // end of while..loop 

      if (!bAccepted)
      {
        if (stateStart.AcceptingState == false)
        {
          return false;
        }
        else // matched an empty string
        {
          nFoundBeginAt = nSearchStartAt;
          nFoundEndAt = nFoundBeginAt - 1;
          return true;
        }
      }


      return true;
    }



    /// <summary>
    /// Determins if a state contains a wildcard transition.
    /// i.e., A_*B
    /// </summary>
    /// <param name="state">State to check</param>
    /// <returns>true if the state contains wildcard transition, otherwise false</returns>
    private bool IsWildCard(State state)
    {
      return (state == state.GetSingleTransition(MetaSymbol.ANY_ONE_CHAR_TRANS));
    }

    /// <summary>
    /// Process state that has wildcard transition.
    /// </summary>
    /// <param name="state">State with wildcard transition</param>
    /// <param name="sSearchIn">String to search in.</param>
    /// <param name="nCurrIndex">Current index of the search</param>
    /// <param name="nSearchUpTo">Index where to stop the search.</param>
    private void ProcessWildCard(State state, string sSearchIn, ref int nCurrIndex, int nSearchUpTo)
    {
      State toState = null;
      int nIndex = nCurrIndex;

      while (nIndex <= nSearchUpTo)
      {
        char ch = sSearchIn[nIndex];

        toState = state.GetSingleTransition(ch.ToString());

        if (toState != null)
        {
          nCurrIndex = nIndex;
        }
        nIndex++;
      }

    }


    /// <summary>
    /// Get the ready state of the parser.
    /// </summary>
    /// <returns>true if a Compile method had been called successfully, otherwise false.</returns>
    public bool IsReady()
    {
      return (m_stateStartDfaM != null);
    }

    /// <summary>
    /// Position where error occured during the last compilation
    /// </summary>
    /// <returns>-1 if there was no compilation</returns>
    public int GetLastErrorPosition()
    {
      return m_nLastErrorIndex;
    }

    /// <summary>
    /// Indicate if the last compilation was successfull or error
    /// </summary>
    /// <returns></returns>
    public ErrorCode GetLastErrorCode()
    {
      return m_LastErrorCode;

    }
    /// <summary>
    /// Get last error length.
    /// </summary>
    /// <returns>Length</returns>
    public int GetLastErrorLength()
    {
      return m_nLastErrorLength;

    }

    /// <summary>
    /// Gets/Sets to indicate whether Find should stop at the first accepting state, 
    /// or should continue see if further match is possible (greedy).
    /// </summary>
    public bool UseGreedy
    {
      get
      {
        return m_bGreedy;
      }
      set
      {
        m_bGreedy = value;
      }

    }

    /// <summary>
    /// Helper fucntion. Updates the local variables once the validation returns.
    /// </summary>
    /// <param name="vi"></param>
    private void UpdateValidationInfo(ValidationInfo vi)
    {
      if (vi.ErrorCode == ErrorCode.ERR_SUCCESS)
      {
        m_bMatchAtEnd = vi.MatchAtEnd;
        m_bMatchAtStart = vi.MatchAtStart;
      }

      m_LastErrorCode = vi.ErrorCode;
      m_nLastErrorIndex = vi.ErrorStartAt;
      m_nLastErrorLength = vi.ErrorLength;
    }
  }

}


By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Denmark Denmark
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions