Click here to Skip to main content
15,892,005 members
Articles / Programming Languages / C++

Writing own regular expression parser

Rate me:
Please Sign up or sign in to vote.
4.91/5 (132 votes)
14 Nov 200322 min read 986.5K   12.3K   237  
Explains principles behind writing regular expression parsers.
/*! \file AG_RegEx.h
    \brief Regular Expression Parser
	\author Amer Gerzic

	Use it as following:					\n
											\n
	string strPattern;						\n
	int nPos = 0;							\n
											\n
	if(SetRegEx("....."))					\n
	{										\n
		if(FindFirst(...))					\n
		{									\n
			// Do something with result		\n
			while(FindNext(...))			\n
			{								\n
				// Process Result			\n
			}								\n
		}									\n
	}										\n
*/

#pragma once

// for debug STL exceeds 255 chars and MS compiler complains, so disable it
#pragma warning(disable:4786)
#pragma warning(disable:4503)

#include <string>
#include <stack>
#include <map>
#include <deque>
#include <vector>
#include <set>
#include <algorithm>
#include <list>

using namespace std;

class CAG_RegEx  
{
protected:
	//! State Class
	class CAG_State
	{
	protected:
		//! Transitions from this state to other 
		multimap<char, CAG_State*> m_Transition;

		//! State ID
		int m_nStateID;

		//! Set of NFA state from which this state is constructed
		set<CAG_State*> m_NFAStates;

	public:
		//! Default constructor
		CAG_State() : m_nStateID(-1), m_bAcceptingState(FALSE) {};

		//! parameterized constructor
		CAG_State(int nID) : m_nStateID(nID), m_bAcceptingState(FALSE) {};

		//! Constructs new state from the set of other states
		/*! This is necessary for subset construction algorithm
			because there a new DFA state is constructed from 
			one or more NFA states
		*/
		CAG_State(set<CAG_State*> NFAState, int nID)
		{
			m_NFAStates			= NFAState;
			m_nStateID			= nID;
			
			// DFA state is accepting state if it is constructed from 
			// an accepting NFA state
			m_bAcceptingState	= FALSE;
			set<CAG_State*>::iterator iter;
			for(iter=NFAState.begin(); iter!=NFAState.end(); ++iter)
				if((*iter)->m_bAcceptingState)
					m_bAcceptingState = TRUE;
		};

		//! Copy Constructor
		CAG_State(const CAG_State &other)
		{ *this = other; };

		//! Destructor
		virtual ~CAG_State() {};

		//! True if this state is accepting state
		BOOL m_bAcceptingState;

		//! Adds a transition from this state to the other
		void AddTransition(char chInput, CAG_State *pState)
		{
			ASSERT(pState != NULL);
			m_Transition.insert(make_pair(chInput, pState));
		};

		//! Removes all transition that go from this state to pState
		void RemoveTransition(CAG_State* pState)
		{
			multimap<char, CAG_State*>::iterator iter;
			for(iter=m_Transition.begin(); iter!=m_Transition.end();)
			{
				CAG_State *toState = iter->second;
				if(toState == pState)
					m_Transition.erase(iter++);
				else ++iter;
			}
		};

		//! Returns all transitions from this state on specific input
		void GetTransition(char chInput, vector<CAG_State*> &States)
		{
			// clear the array first
			States.clear();

			// Iterate through all values with the key chInput
			multimap<char, CAG_State*>::iterator iter;
			for(iter = m_Transition.lower_bound(chInput);
				iter!= m_Transition.upper_bound(chInput);
				++iter)
				{
					CAG_State *pState = iter->second;
					ASSERT(pState != NULL);
					States.push_back(pState);
				}
		};

		//! Returns the state id in form of string
		CString GetStateID()
		{
			CString strRes;
			if(m_bAcceptingState)
				strRes.Format(_T("{%d}"), m_nStateID);
			else strRes.Format(_T("%d"), m_nStateID);
			return strRes;
		};

		/*! Returns the set of NFA states from 
			which this DFA state was constructed
		*/
		set<CAG_State*>& GetNFAState()
		{ return m_NFAStates; };

		//! Returns true if this state is dead end
		/*! By dead end I mean that this state is not
			accepting state and there are no transitions
			leading away from this state. This function
			is used for reducing the DFA.
		*/
		BOOL IsDeadEnd()
		{
			if(m_bAcceptingState)
				return FALSE;
			if(m_Transition.empty())
				return TRUE;
			
			multimap<char, CAG_State*>::iterator iter;
			for(iter=m_Transition.begin(); iter!=m_Transition.end(); ++iter)
			{
				CAG_State *toState = iter->second;
				if(toState != this)
					return FALSE;
			}

			TRACE("State %d is dead end.\n", m_nStateID); 
			
			return TRUE;
		};	

		//! Override the assignment operator
		CAG_State& operator=(const CAG_State& other)
		{ 
			m_Transition	= other.m_Transition; 
			m_nStateID		= other.m_nStateID;
			m_NFAStates		= other.m_NFAStates;
		};

		//! Override the comparison operator
		BOOL operator==(const CAG_State& other)
		{
			if(m_NFAStates.empty())
				return(m_nStateID == other.m_nStateID);
			else return(m_NFAStates == other.m_NFAStates);
		};
	};

	//! Structure to track the pattern recognition
	class CAG_PatternState
	{
	public:
		//! Default Constructor
		CAG_PatternState() : m_pState(NULL), m_nStartIndex(-1) {};

		//! Copy Constructor
		CAG_PatternState(const CAG_PatternState &other)
		{ *this = other; };

		//! Destructor
		virtual ~CAG_PatternState() {};

		//! Pointer to current state in DFA
		CAG_State* m_pState;

		//! 0-Based index of the starting position
		int m_nStartIndex;

		//! Override the assignment operator
		CAG_PatternState& operator=(const CAG_PatternState& other)
		{
			ASSERT(other.m_pState != NULL);
			m_pState		= other.m_pState;
			m_nStartIndex	= other.m_nStartIndex;
			return *this;
		};
	};

	//! NFA Table
	typedef deque<CAG_State*> FSA_TABLE;

	/*! NFA Table is stored in a deque of CAG_States.
		Each CAG_State object has a multimap of 
		transitions where the key is the input
		character and values are the references to
		states to which it transfers.
	*/
	FSA_TABLE m_NFATable;

	//! DFA table is stores in same format as NFA table
	FSA_TABLE m_DFATable;

	//! Operand Stack
	/*! If we use the Thompson's Algorithm then we realize
		that each operand is a NFA automata on its own!
	*/
	stack<FSA_TABLE> m_OperandStack;

	//! Operator Stack
	/*! Operators are simple characters like "*" & "|" */
	stack<char> m_OperatorStack;

	//! Keeps track of state IDs
	int m_nNextStateID;

	//! Set of input characters
	set<char> m_InputSet;

	//! List of current partially found patterns
	list<CAG_PatternState*> m_PatternList;

	//! Contains the current patterns accessed
	int m_nPatternIndex;

	//! Contains the text to check
	string m_strText;

	//! Contains all found patterns
	vector<string> m_vecPattern;

	//! Contains all position where these pattern start in the text
	vector<int> m_vecPos;

	//! Constructs basic Thompson Construction Element
	/*! Constructs basic NFA for single character and
		pushes it onto the stack.
	*/
	void Push(char chInput);

	//! Pops an element from the operand stack
	/*! The return value is TRUE if an element 
		was poped successfully, otherwise it is
		FALSE (syntax error) 
	*/
	BOOL Pop(FSA_TABLE &NFATable);

	//! Checks is a specific character and operator
	BOOL IsOperator(char ch) { return((ch == 42) || (ch == 124) || (ch == 40) || (ch == 41) || (ch == 8)); };

	//! Returns operator presedence
	/*! Returns TRUE if presedence of opLeft <= opRight.
	
			Kleens Closure	- highest
			Concatenation	- middle
			Union			- lowest
	*/
	BOOL Presedence(char opLeft, char opRight)
	{
		if(opLeft == opRight)
			return TRUE;

		if(opLeft == '*')
			return FALSE;

		if(opRight == '*')
			return TRUE;

		if(opLeft == 8)
			return FALSE;

		if(opRight == 8)
			return TRUE;

		if(opLeft == '|')
			return FALSE;
		
		return TRUE;
	};

	//! Checks if the specific character is input character
	BOOL IsInput(char ch) { return(!IsOperator(ch)); };

	//! Checks is a character left parantheses
	BOOL IsLeftParanthesis(char ch) { return(ch == 40); };

	//! Checks is a character right parantheses
	BOOL IsRightParanthesis(char ch) { return(ch == 41); };

	//! Evaluates the next operator from the operator stack
	BOOL Eval();

	//! Evaluates the concatenation operator
	/*! This function pops two operands from the stack 
		and evaluates the concatenation on them, pushing
		the result back on the stack.
	*/
	BOOL Concat();

	//! Evaluates the Kleen's closure - star operator
	/*! Pops one operator from the stack and evaluates
		the star operator on it. It pushes the result
		on the operand stack again.
	*/
	BOOL Star();

	//! Evaluates the union operator
	/*! Pops 2 operands from the stack and evaluates
		the union operator pushing the result on the
		operand stack.
	*/
	BOOL Union();

	//! Inserts char 8 where the concatenation needs to occur
	/*! The method used to parse regular expression here is 
		similar to method of evaluating the arithmetic expressions.
		A difference here is that each arithmetic expression is 
		denoted by a sign. In regular expressions the concatenation
		is not denoted by ny sign so I will detect concatenation
		and insert a character 8 between operands.
	*/
	string ConcatExpand(string strRegEx);

	//! Creates Nondeterministic Finite Automata from a Regular Expression
	BOOL CreateNFA(string strRegEx);

	//! Calculates the Epsilon Closure 
	/*! Returns epsilon closure of all states
		given with the parameter.
	*/
	void EpsilonClosure(set<CAG_State*> T, set<CAG_State*> &Res); 

	//! Calculates all transitions on specific input char
	/*! Returns all states rechible from this set of states
		on an input character.
 	*/
	void Move(char chInput, set<CAG_State*> T, set<CAG_State*> &Res);

	//! Converts NFA to DFA using the Subset Construction Algorithm
	void ConvertNFAtoDFA();

	//! Optimizes the DFA
	/*! This function scanns DFA and checks for states that are not
		accepting states and there is no transition from that state
		to any other state. Then after deleting this state we need to
		go through the DFA and delete all transitions from other states
		to this one.
	*/
	void ReduceDFA();

	//! Cleans up the memory
	void CleanUp()
	{
		// Clean up all allocated memory for NFA
		for(int i=0; i<m_NFATable.size(); ++i)
			delete m_NFATable[i];
		m_NFATable.clear();

		// Clean up all allocated memory for DFA
		for(i=0; i<m_DFATable.size(); ++i)
			delete m_DFATable[i];
		m_DFATable.clear();

		// Reset the id tracking
		m_nNextStateID = 0;

		// Clear both stacks
		while(!m_OperandStack.empty())
			m_OperandStack.pop();
		while(!m_OperatorStack.empty())
			m_OperatorStack.pop();

		// Clean up the Input Character set
		m_InputSet.clear();

		// Clean up all pattern states
		list<CAG_PatternState*>::iterator iter;
		for(iter=m_PatternList.begin(); iter!=m_PatternList.end(); ++iter)
			delete *iter;
		m_PatternList.clear();
	};

	//! Searches the text for all occurences of the patterns
	/*! Returns TRUE if single pattern is found or FALSE if 
	    no pattern sis found.
	*/
	BOOL Find();

public:
	//! Default Constructor
	CAG_RegEx();

	//! Destructor
	virtual ~CAG_RegEx();

	//! Set Regular Expression
	/*! Set the string pattern to be searched for.
		Returns TRUE if success othewise it returns FALSE.
	*/
	BOOL SetRegEx(string strRegEx);

	//! Searches for the first occurence of a text pattern
	/*! If the pattern is found the function returns TRUE
		together with starting positions (vecPos) and the patterns
		(vecPattern) found. THIS FUNCTION MUST BE CALLED FIRST.
		The return value is TRUE if a pattern is found or FALSE
		if nothing is found.
	*/
	BOOL FindFirst(string strText, int &nPos, string &strPattern);

	//! Searches for the next occurence of a text pattern
	/*! If the pattern is found the function returns TRUE
		together with starting position array (vecPos) and the patterns
		(vecPattern) found. THIS FUNCTION MUST BE CALLED AFTER 
		THE FUNCTION FindFirst(...). 
		The return value is TRUE if a pattern is found or FALSE
		if nothing is found.
	*/
	BOOL FindNext(int &nPos, string &strPattern);

#ifdef _DEBUG
	//! For Debuging purposes only
	void SaveNFATable()
	{
		CString strNFATable;

		// First line are input characters
		set<char>::iterator iter;
		for(iter = m_InputSet.begin(); iter != m_InputSet.end(); ++iter)
			strNFATable += "\t\t" + CString(*iter);
		// add epsilon
		strNFATable += "\t\tepsilon";
		strNFATable += "\n";

		// Now go through each state and record the transitions
		for(int i=0; i<m_NFATable.size(); ++i)
		{
			CAG_State *pState = m_NFATable[i];
			
			// Save the state id
			strNFATable += pState->GetStateID();

			// now write all transitions for each character
			vector<CAG_State*> State;
			for(iter = m_InputSet.begin(); iter != m_InputSet.end(); ++iter)
			{
				pState->GetTransition(*iter, State);
				CString strState;
				if(State.size()>0)
				{
					strState = State[0]->GetStateID();
					for(int i=1; i<State.size(); ++i)
						strState += "," + State[i]->GetStateID();
				}
				strNFATable += "\t\t" + strState;
			}

			// Add all epsilon transitions
			pState->GetTransition(0, State);
			CString strState;
			if(State.size()>0)
			{
				strState = State[0]->GetStateID();
				for(int i=1; i<State.size(); ++i)
					strState += "," + State[i]->GetStateID();
			}
			strNFATable += "\t\t" + strState + "\n";
		}

		// Save table to the file
		CStdioFile f;
		if(f.Open("C:\\NFATable.txt", CFile::modeCreate | CFile::modeWrite))
		{
			f.WriteString(strNFATable);
			f.Close();
		}
		else AfxMessageBox(_T("Could not save NFA Table to c:\\NFATable.txt"));
	};

	//! For Debuging purposes only
	void SaveDFATable()
	{
		CString strDFATable;

		// First line are input characters
		set<char>::iterator iter;
		for(iter = m_InputSet.begin(); iter != m_InputSet.end(); ++iter)
			strDFATable += "\t\t" + CString(*iter);
		strDFATable += "\n";

		// Now go through each state and record the transitions
		for(int i=0; i<m_DFATable.size(); ++i)
		{
			CAG_State *pState = m_DFATable[i];
			
			// Save the state id
			strDFATable += pState->GetStateID();

			// now write all transitions for each character
			vector<CAG_State*> State;
			for(iter = m_InputSet.begin(); iter != m_InputSet.end(); ++iter)
			{
				pState->GetTransition(*iter, State);
				CString strState;
				if(State.size()>0)
				{
					strState = State[0]->GetStateID();
					for(int i=1; i<State.size(); ++i)
						strState += "," + State[i]->GetStateID();
				}
				strDFATable += "\t\t" + strState;
			}
			strDFATable += "\n";
		}

		// Save table to the file
		CStdioFile f;
		if(f.Open("C:\\DFATable.txt", CFile::modeCreate | CFile::modeWrite))
		{
			f.WriteString(strDFATable);
			f.Close();
		}
		else AfxMessageBox(_T("Could not save DFA Table to c:\\DFATable.txt"));
	};

	void SaveNFAGraph() 
	{
		CString strNFAGraph = _T("digraph{\n");

		// Final states are double circled
		for(int i=0; i<m_NFATable.size(); ++i)
		{
			CAG_State *pState = m_NFATable[i];
			if(pState->m_bAcceptingState)
			{
				CString strStateID = pState->GetStateID();
				strStateID.Remove('{'); strStateID.Remove('}');
				strNFAGraph += _T("\t") + strStateID + _T("\t[shape=doublecircle];\n");
			}
		}
		
		strNFAGraph += _T("\n");

		// Record transitions
		for(i=0; i<m_NFATable.size(); ++i)
		{
			CAG_State *pState = m_NFATable[i];
			vector<CAG_State*> State;

			pState->GetTransition(0, State);
			for(int j=0; j<State.size(); ++j)
			{
				// Record transition
				CString strStateID1 = pState->GetStateID();
				CString strStateID2 = State[j]->GetStateID();
				strStateID1.Remove('{'); strStateID1.Remove('}');
				strStateID2.Remove('{'); strStateID2.Remove('}');
				strNFAGraph += _T("\t") + strStateID1 + _T(" -> ") + strStateID2;
				strNFAGraph += _T("\t[label=\"epsilon\"];\n");
			}

			set<char>::iterator iter;
			for(iter = m_InputSet.begin(); iter != m_InputSet.end(); ++iter)
			{
				pState->GetTransition(*iter, State);
				for(int j=0; j<State.size(); ++j)
				{
					// Record transition
					CString strStateID1 = pState->GetStateID();
					CString strStateID2 = State[j]->GetStateID();
					strStateID1.Remove('{'); strStateID1.Remove('}');
					strStateID2.Remove('{'); strStateID2.Remove('}');
					strNFAGraph += _T("\t") + strStateID1 + _T(" -> ") + strStateID2;
					strNFAGraph += _T("\t[label=\"") + CString(*iter) + _T("\"];\n");
				}
			}
		}

		strNFAGraph += _T("}");

		// Save table to the file
		CStdioFile f;
		if(f.Open("C:\\NFAGraph.dot", CFile::modeCreate | CFile::modeWrite))
		{
			f.WriteString(strNFAGraph);
			f.Close();
		}
		else AfxMessageBox(_T("Could not save NFA Graph to c:\\NFAGraph.txt"));
	};
	
	void SaveDFAGraph()
	{
		CString strDFAGraph = _T("digraph{\n");

		// Final states are double circled
		for(int i=0; i<m_DFATable.size(); ++i)
		{
			CAG_State *pState = m_DFATable[i];
			if(pState->m_bAcceptingState)
			{
				CString strStateID = pState->GetStateID();
				strStateID.Remove('{'); strStateID.Remove('}');
				strDFAGraph += _T("\t") + strStateID + _T("\t[shape=doublecircle];\n");
			}
		}
		
		strDFAGraph += _T("\n");

		// Record transitions
		for(i=0; i<m_DFATable.size(); ++i)
		{
			CAG_State *pState = m_DFATable[i];
			vector<CAG_State*> State;

			pState->GetTransition(0, State);
			for(int j=0; j<State.size(); ++j)
			{
				// Record transition
				CString strStateID1 = pState->GetStateID();
				CString strStateID2 = State[j]->GetStateID();
				strStateID1.Remove('{'); strStateID1.Remove('}');
				strStateID2.Remove('{'); strStateID2.Remove('}');
				strDFAGraph += _T("\t") + strStateID1 + _T(" -> ") + strStateID2;
				strDFAGraph += _T("\t[label=\"epsilon\"];\n");
			}

			set<char>::iterator iter;
			for(iter = m_InputSet.begin(); iter != m_InputSet.end(); ++iter)
			{
				pState->GetTransition(*iter, State);
				for(int j=0; j<State.size(); ++j)
				{
					// Record transition
					CString strStateID1 = pState->GetStateID();
					CString strStateID2 = State[j]->GetStateID();
					strStateID1.Remove('{'); strStateID1.Remove('}');
					strStateID2.Remove('{'); strStateID2.Remove('}');
					strDFAGraph += _T("\t") + strStateID1 + _T(" -> ") + strStateID2;
					strDFAGraph += _T("\t[label=\"") + CString(*iter) + _T("\"];\n");
				}
			}
		}

		strDFAGraph += _T("}");

		// Save table to the file
		CStdioFile f;
		if(f.Open("C:\\DFAGraph.dot", CFile::modeCreate | CFile::modeWrite))
		{
			f.WriteString(strDFAGraph);
			f.Close();
		}
		else AfxMessageBox(_T("Could not save DFA Graph to c:\\DFAGraph.txt"));
	};
#else
	void SaveNFATable() { } ;
	void SaveDFATable() { } ;
	void SaveNFAGraph() { } ;
	void SaveDFAGraph() { } ;
#endif
};

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
President Infinity Software Solutions, LLC.
United States United States
Originally from Bosnia and Herzegovina, but lived for 6 years in Germany where I did majority of education, then moved to US, where I live since 1999. I like programming, computers in general, but also Basketball, Soccer, Tennis, and many other things. Masters graduate from Grand Valley State University in CIS and working as a full time software developer. Please visit my website www.amergerzic.com

Comments and Discussions