Click here to Skip to main content
15,882,113 members
Articles / Programming Languages / C++

Fast regular expressions

Rate me:
Please Sign up or sign in to vote.
4.85/5 (19 votes)
29 Oct 2000 360.8K   5.2K   104  
Compiles a regular expression into a fast automaton.
#include "RexAlgorithm.h"
#pragma warning(disable:4786)
#pragma warning(disable:4503)
#include <iostream>
#include <bitset>
#include <set>
#include <map>
#include <string>
#include <algorithm>
#include <vector>
#include <utility>
#include <fstream>
using namespace std;

typedef bitset<256>			REXA_CharSet;
typedef int					REXA_StateId;

template<class TSet>
struct REXA_LessSet{
    bool operator()(const TSet& rTSet1, const TSet& rTSet2)const
    {
        if( rTSet1==rTSet2 ){
            return false;
        }
        for(int i=0;i<rTSet1.size();i++){
            if( rTSet1[i]!=rTSet2[i] )
                return rTSet1[i]<rTSet2[i];
        }
        return false;
    }
};
typedef REXA_LessSet<REXA_CharSet>  REXA_LessCharSet;
typedef set<REXA_CharSet,REXA_LessCharSet> REXA_CharSetSet;


// State and Translationtable
struct REXA_NDFAState;

struct REXA_StateSet : public  set<const struct REXA_NDFAState*>
{
	bool	IsAccepting		()const;
};

typedef vector<REXA_StateSet>		TransitionVector;



struct REXA_NDFATransition{

	REXA_NDFATransition(const REXA_CharSet& m_rTransChars,REXA_NDFAState* pStateNext)
		:m_transChars(m_rTransChars),m_nextState(pStateNext)
	{
	}
    REXA_NDFATransition(const char* pcszChars=0,REXA_NDFAState* pNextState=0)
    {
		if(!pcszChars || strlen(pcszChars)==0 ){
			m_bIsMute=true;
			return;
		}
        const unsigned char* pucszChars= (const unsigned char*)pcszChars;
        m_nextState= pNextState;
        while( *pucszChars ){
            m_transChars[*pucszChars++]= true;
        }
    }
    REXA_CharSet     m_transChars;
    REXA_NDFAState*  m_nextState;
	bool		m_bIsMute;
};
struct REXA_NDFAState{
    REXA_NDFAState(int id=0,bool isAccepting=false)
        :m_id(id),
        m_nAnswer(REXA_DFAState::eNoAnswer),
		m_isAccepting(isAccepting)
    {
    }

    static void			CollectStates   (const REXA_NDFAState* pState,
											set<const REXA_NDFAState*>& rsetStates);
    static void         CollectStates   (REXA_NDFAState* pState,
                                            set<REXA_NDFAState*>& rsetStates);

    static REXA_NDFAState*	CopyStates	(REXA_NDFAState* pState,
									map<REXA_NDFAState*,REXA_NDFAState*>& rMapStates);
	REXA_NDFAState*			Clone		();
    void				DeleteAll	();

    int                         m_id;
	int							m_nAnswer;
    bool                        m_isAccepting;
    vector<REXA_NDFATransition>      m_transitions;

};
struct REXA_TranslatingTable{
	typedef	map< REXA_StateSet,TransitionVector >	TranslationMap;
	typedef	map<const REXA_NDFAState*,int>			MapStateOccurrenceCount;

    REXA_TranslatingTable    (const REXA_NDFAState* pStartState);

	void					ComputeOccCounts();
	int						GetAnswerCode	(const REXA_StateSet& rSet)const;

	vector<REXA_CharSet>		m_vecCharsets;
    const REXA_NDFAState*	m_pStartState;
	TranslationMap		m_mapTransitions;

	MapStateOccurrenceCount		m_mapStateOccurrenceCount;
    
private:
    vector<REXA_CharSet> 
            CharsetToVector			(const REXA_CharSetSet& rCharSetSet);

    set<REXA_CharSet,REXA_LessCharSet> 
            ComputeDisjointCharSets     (REXA_CharSetSet& rCharSets);

    void    GenerateTranslationTable	( set<const REXA_NDFAState*>& rSetState);

    void    CollectCharSets     (   const REXA_NDFAState* pCurState,
                                    set<int>& rVisitedStates,REXA_CharSetSet& rCharSets);
};

class REXA_NDFAToTable{
public:
    void     NDFA_To_Table      (REXA_NDFAState* pStartState,
                                REXA_TranslatingTable& rTable);

static vector<REXA_CharSet> 
            CharsetToVector     (const REXA_CharSetSet& rCharSetSet);

    set<REXA_CharSet,REXA_LessCharSet> 
            ComputeDisjointCharSets     (REXA_CharSetSet& rCharSets);

static void    GenerateTranslationTable    ( map<int,REXA_NDFAState*>& rMapIdState,
                                                  REXA_TranslatingTable& rTable);


static void CollectCharSets     (   REXA_NDFAState* pCurState,
                                    set<int>& rVisitedStates,
									REXA_CharSetSet& rCharSets);
};

class REXA_NDFAToDfa{
public:
	typedef			map<REXA_NDFAState*,int> MapStateOccurrenceCount;

	REXA_NDFAToDfa		(const REXA_TranslatingTable& rTable,char cEos);
    REXA_DFAState*		TableToDfaStates();
private:
	REXA_DFAState*		CreateDfaStates(
						const REXA_StateSet&				rStateSet,
						map<REXA_StateSet,REXA_DFAState*>&	rMapStateSetDFAState);

	const REXA_TranslatingTable&		m_rTable;
	unsigned char				        m_cEos;
};

class REXA_Scanner{
    friend class REXA_ParserImpl;
    friend class REXA_Exception;
public:
//constructor,desctructor,typedefs
	REXA_Scanner(const char* pcszToParse="")   {SetSource(pcszToParse);}
private:
    void SetSource(const char* pcszSrc){
                                        m_pcszToParse= pcszSrc;m_pTokBeg= pcszSrc;
                                        m_pTokEnd= pcszSrc;m_eLastSymbol= eEof;
                                        }
//operations
	REXA_ESymbols	operator()		();
    
    int             GetLastSymbol   (){return m_eLastSymbol;}
static string       GetSymbolString (REXA_ESymbols eRegexpSymbol);
    void            AdvanceAndMatch (REXA_ESymbols eSymbol);
	void			Match			(REXA_ESymbols eSymbol);
	REXA_BegEnd		GetToken		(){return make_pair(m_pTokBeg,m_pTokEnd);}
	const char*		GetSource		(){return m_pcszToParse;}
	REXA_CharSet			GetCharSet		(){return m_charSet;}
    char            GetChar         (const char*& rpFrom,bool& rbIsEscape);
    void            AddRange        (   unsigned char cFirst,
                                        unsigned char cLast, 
                                        REXA_CharSet& rCharSet);
    const char*     SkipWhiteSpace  ();
	const char*		m_pcszToParse;
	const char*		m_pTokBeg;
    const char*     m_pTokEnd;
	REXA_CharSet			m_charSet;
    string          m_strLiteral;
	REXA_ESymbols   m_eLastSymbol;
};

class REXA_ParserImpl{
public:
	REXA_ParserImpl				(char cEos=0)
        :m_cEos(cEos),m_eAtomSet(eCharSet|eIdent|eLParen),
        m_nStateIdMax(0)
	{
    }
    ~REXA_ParserImpl();
    const REXA_NDFAState* ParseRegExp           (string strRegExp);
    bool       ParseRegExpDefinition (string strRegExpDefiniton,int nAnswer);
 
    char                    m_cEos;
    map<string,REXA_NDFAState*>  m_mapNamedStates;
private:
// grammar related functions
	void       NamedRegExp		(int nAnswer);
	REXA_NDFAState* RegExp			();
	REXA_NDFAState* Term				();
	REXA_NDFAState* Factor			();
	REXA_NDFAState* Atom				();
    REXA_NDFAState* Name             (REXA_BegEnd nameToFind);
    REXA_NDFAState* ParseCharSet     (REXA_CharSet&	rCharSet);	
//NDFA-Automtaton constructors
    REXA_NDFAState* Concat           (REXA_NDFAState* pState0,REXA_NDFAState* pState1);
    REXA_NDFAState* Union            (REXA_NDFAState* pState0,REXA_NDFAState* pState1);
    REXA_NDFAState* MakeStar         (REXA_NDFAState* pState);
    REXA_NDFAState* MakePlus         (REXA_NDFAState* pState);
    REXA_NDFAState* MakeOptional     (REXA_NDFAState* pState);
    REXA_NDFAState* MakeIgnoreCase   (REXA_NDFAState* pState);
//helper functions for grammar related functions
    void			InsertNamedState(	REXA_BegEnd name,
										REXA_NDFAState* pState,int nAnwer);
 
	vector<pair<REXA_NDFAState*,REXA_NDFATransition> >
					CollectAcceptingTransitions(REXA_NDFAState* pState);
    vector<REXA_NDFAState*> CollectAcceptingStates(REXA_NDFAState* pState);
	void			CollectStates(
							REXA_NDFAState* pState,
							set<REXA_NDFAState*>& rsetStates);

    REXA_NDFAState*               CloneNDFA(REXA_NDFAState* pToClone);
    REXA_NDFAState*               GetNewNDFAState(int id=0,bool isAccepting=false);

    int                     NextStateId          ();

	REXA_Scanner   	        m_scanner;
    const int               m_eAtomSet;
    int                     m_nStateIdMax;

    vector<REXA_NDFAState*>      m_vecAllStates;
};


static struct SymbolString{
    REXA_ESymbols   eSym;
    const char*     strSym;
}g_symbolStrings[]={
    {   eEof,       "end of string"     },
    {   eIllegal,   "illegal character" },
    {   eStar,      "*"                 },
    {   ePlus,      "+"                 },
    {   eAssign,    "="                 },
    {   eOpt,       "?"                 },
    {   eAlternative,"|"                },
    {   eCharSet,   "[<characters>]"    },
    {   eLParen,    "("                 },
    {   eRParen,    ")"                 }
};

//-------------------------------------------------------------------------------
REXA_Exception::REXA_Exception(
        const char* pPos,
        const char* pRegExp,
        string strMsg)
//-------------------------------------------------------------------------------
{
    m_strErrMsg=   strMsg;
    m_nErrPos= pPos-pRegExp;
}
//-------------------------------------------------------------------------------
REXA_Exception::REXA_Exception(
		REXA_ESymbols eExpectedSymbol,
		REXA_BegEnd    token,
		const char*   pRegExp,
		string		  strMsg)
//-------------------------------------------------------------------------------
{
    
    m_strErrMsg=  REXA_Scanner::GetSymbolString(eExpectedSymbol)    
		+  " expected\r\n  ";
    m_nErrPos= token.first - pRegExp;
}
//-------------------------------------------------------------------------------
string       REXA_Scanner::GetSymbolString     (REXA_ESymbols eRegexpSymbol)
//-------------------------------------------------------------------------------
{
    for(int i=0;i<sizeof(g_symbolStrings)/sizeof(SymbolString);i++){
        if( g_symbolStrings[i].eSym==eRegexpSymbol ){
            return g_symbolStrings[i].strSym;
        }
    }
    return "";
}

//-------------------------------------------------------------------------------
const char*    REXA_Scanner::SkipWhiteSpace    ()
//-------------------------------------------------------------------------------
{
    const char* pCur= m_pTokEnd;
    while(isspace(*pCur))
        pCur++;
    return pCur;
}

//------------------------------------------------------------------------------
void			REXA_Scanner::Match		    (REXA_ESymbols eSymbol)
//------------------------------------------------------------------------------
{
	if( eSymbol!=m_eLastSymbol ){
        throw REXA_Exception(eSymbol,GetToken(),m_pcszToParse);
	}
}
//------------------------------------------------------------------------------
void            REXA_Scanner::AdvanceAndMatch (REXA_ESymbols eSymbol)
//------------------------------------------------------------------------------
{
    if( eSymbol!=operator()() ){
        throw REXA_Exception(eSymbol,GetToken(),m_pcszToParse);
    }
}
//------------------------------------------------------------------------------
char          REXA_Scanner::GetChar(const char*& rpFrom,bool& rbIsEscape)
//------------------------------------------------------------------------------
{
    rbIsEscape= false;
    if( *rpFrom=='\0' ) return '\0';
    char c;
    if( *rpFrom=='\\' ){
        rbIsEscape= true;
        rpFrom++;
        switch(*rpFrom){
        case '0':  return '\0';
        case 'n':  rpFrom++; return '\n';
        case 't':  rpFrom++; return '\t';
        case 'v':  rpFrom++; return '\v';
        case 'a':  rpFrom++; return '\a';
        case 'b':  rpFrom++; return '\b';
        case 'r':  rpFrom++; return '\r';
        case 'x':  c= *rpFrom++; 
            if( isxdigit(rpFrom[0]) && isxdigit(rpFrom[1]) ){
                unsigned hexChar=0;
                for(int i=0;i<2;i++){
                    hexChar*=16;
                    hexChar+= isdigit(rpFrom[i])?(rpFrom[i]-'0'):(toupper(rpFrom[i])-'A'+16);
                }
                c= (char)hexChar;
                rpFrom+= 2;
            }
            return c;
        default:    c= *rpFrom++; return c;
        }
    }
    return *rpFrom++;
}
//------------------------------------------------------------------------------
void    REXA_Scanner::AddRange(
                           unsigned char cFirst, 
                           unsigned char cLast, 
                           REXA_CharSet& rCharSet)
//------------------------------------------------------------------------------
{
    if( cFirst>cLast ){
        swap(cFirst,cLast);
    }
    for(unsigned char c= cFirst; c<cLast;c++){
        rCharSet[c]= true;
    }
    rCharSet[cLast]= true;
}
//------------------------------------------------------------------------------
REXA_ESymbols REXA_Scanner::operator()	()
//------------------------------------------------------------------------------
{
	m_pTokBeg= m_pTokEnd= SkipWhiteSpace();
    switch(*m_pTokEnd){
    case '\0':  return m_eLastSymbol=eEof;
    case '+':   m_pTokEnd++; return m_eLastSymbol=ePlus;
    case '?':   m_pTokEnd++; return m_eLastSymbol=eOpt;
    case '*':   m_pTokEnd++; return m_eLastSymbol=eStar;
    case '(':   m_pTokEnd++; return m_eLastSymbol=eLParen;
    case ')':   m_pTokEnd++; return m_eLastSymbol=eRParen;
    case '=':   m_pTokEnd++; return m_eLastSymbol=eAssign;
	case '|':   m_pTokEnd++; return m_eLastSymbol=eAlternative;
    case '[': {
                m_pTokEnd++;
                REXA_CharSet cset;
                char c;
                bool bIsEscape;
				bool bIsFirstTime= true;
                bool bPrevWasRangeSeparator= false;
				bool bUseComplement= false;
                char cPrev;
				cPrev=GetChar(m_pTokEnd,bIsEscape);
				if( cPrev=='^' && !bIsEscape ){
					bUseComplement= true;
                    cPrev=GetChar(m_pTokEnd,bIsEscape);
				}
				while(	cPrev	&&	!(cPrev==']' && !bIsEscape) ){
					c=GetChar(m_pTokEnd,bIsEscape);
					if( !bIsEscape && c=='-'  ){
						c=GetChar(m_pTokEnd,bIsEscape);
                        if(!c){
                            break;
                        }
						AddRange(
							(unsigned char)cPrev,(unsigned char)c,cset);
						cPrev= GetChar(m_pTokEnd,bIsEscape);
					}else{
                        cset[(unsigned char)cPrev]= true;
                        cPrev= c;
					}
				}
                if( !cPrev  || !c ){
                    throw REXA_Exception(
                                m_pTokBeg,m_pcszToParse,"closing ] missing");
                }
                m_charSet= cset;
				if(bUseComplement){
					m_charSet.flip();
				}
                return m_eLastSymbol=eCharSet;
              }
    case '.':{
                m_pTokEnd++;
                m_charSet= REXA_CharSet();
                for(int i=0;i<256;i++){
                    m_charSet[i]= true;
                }
                return m_eLastSymbol=eCharSet;
             }
    case '$':{
                if(  isalpha(m_pTokEnd[1])||m_pTokEnd[1]=='_' ){
                    m_pTokEnd++;
                    m_strLiteral="";
                    while(  isalpha(*m_pTokEnd)||isdigit(*m_pTokEnd)||m_pTokEnd[1]=='_' ){
                        m_strLiteral+= *m_pTokEnd++;
                    }
                    return m_eLastSymbol=eIdent;
                 }
             }
    default:
        {
            if( *m_pTokEnd=='\\' && m_pTokEnd[1]=='i' ){
                m_pTokEnd++; 
                m_pTokEnd++;
                return m_eLastSymbol=eIgnoreCase;
            }else{
                bool bIsEscape;
                char c=GetChar(m_pTokEnd,bIsEscape);
                m_charSet= REXA_CharSet();
                m_charSet[(unsigned char)c]= true;
                return m_eLastSymbol=eCharSet;
            }
        }
    }
}

///////////////////////////////////////////////////////////////////////////////
// grammar related functions
///////////////////////////////////////////////////////////////////////////////
//-----------------------------------------------------------------------------
REXA_ParserImpl::~REXA_ParserImpl()
//-----------------------------------------------------------------------------
{
    for(int i=0;i<m_vecAllStates.size();++i){
        delete m_vecAllStates[i];
    }
}

//-----------------------------------------------------------------------------
void		REXA_ParserImpl::InsertNamedState(REXA_BegEnd name,
										   REXA_NDFAState* pState,int nAnswer)
//-----------------------------------------------------------------------------
{
	vector<REXA_NDFAState*>  vecAccepting= CollectAcceptingStates(pState);
	for(int i=0;i<vecAccepting.size();i++){
		vecAccepting[i]->m_nAnswer= nAnswer;
	}
    pair<map<string,REXA_NDFAState*>::iterator,bool> itBool=
        m_mapNamedStates.insert(make_pair(string(name.first,name.second),pState));
    if( !itBool.second ){
        itBool.first->second= pState;
    }
}
//-----------------------------------------------------------------------------
void REXA_ParserImpl::NamedRegExp		(int nAnswer)
//-----------------------------------------------------------------------------
//$1 NamedRegExp= Ident "=" RegExp.
{
	m_scanner.Match(eIdent);
	REXA_BegEnd name= m_scanner.GetToken();

	m_scanner.AdvanceAndMatch(eAssign);
    m_scanner();
	REXA_NDFAState* pState=RegExp();
    if( m_scanner.GetLastSymbol()!=eEof){
         throw REXA_Exception(
                        m_scanner.m_pTokBeg,
                        m_scanner.m_pcszToParse,
                        "unexpected symbol");
    }
	InsertNamedState(name,pState,nAnswer);
}
//-----------------------------------------------------------------------------
const REXA_NDFAState*  REXA_ParserImpl::ParseRegExp          (string strRegExp)
//-----------------------------------------------------------------------------
{
    m_scanner.SetSource(strRegExp.c_str());
    m_scanner();
    const REXA_NDFAState* cRegExp= RegExp();
    if( m_scanner.GetLastSymbol()!=eEof){
         throw REXA_Exception(
                        m_scanner.m_pTokBeg,
                        m_scanner.m_pcszToParse,
                        "unexpected symbol");
    }
    return cRegExp;
}
//-----------------------------------------------------------------------------
bool       REXA_ParserImpl::ParseRegExpDefinition(string strRegExpDefiniton,
											   int nAnswer)
//-----------------------------------------------------------------------------
{
    m_scanner.SetSource(strRegExpDefiniton.c_str());
    m_scanner();
    NamedRegExp(nAnswer);
    return true;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::RegExp			()
//-----------------------------------------------------------------------------
//$2 RegExp= Term {"|" Term}.
{
    REXA_NDFAState* pState= Term();
    while( m_scanner.GetLastSymbol()==eAlternative ){
        m_scanner();
        pState= Union(pState,Term());
    }
    return pState;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Term			()
//-----------------------------------------------------------------------------
//$3 Term= Factor {Factor}.
{
    REXA_NDFAState* pState= Factor();
    while( m_scanner.GetLastSymbol()&m_eAtomSet ){
        pState= Concat(pState,Factor());
    }
    return pState;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Factor			()
//-----------------------------------------------------------------------------
//$4 Factor= Atom ["*"|"+"|"?"|"\i"].
{
    REXA_NDFAState* pState= Atom();
    switch(m_scanner.GetLastSymbol()){
    case eStar:
        {
            pState= MakeStar(pState);
            m_scanner();
            return pState;
        }
    case ePlus:
        {
            pState= MakePlus(pState);
            m_scanner();
            return pState;
        }
    case eOpt:
        {
            pState= MakeOptional(pState);
            m_scanner();
            return pState;
        }
    case eIgnoreCase:
        {
            pState= MakeIgnoreCase(pState);
            m_scanner();
            return pState;
        }
    default:
        return pState;
    }
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Atom				()
//-----------------------------------------------------------------------------
//$5 Atom= REXA_CharSet |  Ident | "(" RegExp ").
{
    switch(m_scanner.GetLastSymbol()){
    case eCharSet:
        {
			REXA_NDFAState* pState= ParseCharSet(m_scanner.GetCharSet());
            m_scanner();
            return pState;
        }

    case eIdent:
        {
            REXA_NDFAState* pState= Name(m_scanner.GetToken());
			m_scanner();
            return pState;
        }
    case eLParen:
        {
			m_scanner();
            REXA_NDFAState* pState= RegExp();
            m_scanner.Match(eRParen);
            m_scanner();
            return pState;
        }
    default: throw REXA_Exception(
                        m_scanner.m_pTokBeg,
                        m_scanner.m_pcszToParse,
                        "[,',( or ident expected");
        
    }
    return 0;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Name             (REXA_BegEnd nameToFind)
//-----------------------------------------------------------------------------
{

	map<string,REXA_NDFAState*>::iterator it=
		m_mapNamedStates.find(string(nameToFind.first,nameToFind.second));
	if( it!= m_mapNamedStates.end() ){
		return CloneNDFA(it->second);;
	}
    throw REXA_Exception(   m_scanner.m_pTokBeg,
                            m_scanner.m_pcszToParse,
                            string(nameToFind.first,nameToFind.second-nameToFind.first)
                            + " not found in name dictionary");
    return 0;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::ParseCharSet   (REXA_CharSet& rCharSet)
//-----------------------------------------------------------------------------
{
    rCharSet[(unsigned char)m_cEos]=false;
    REXA_NDFAState* pStartState= GetNewNDFAState(NextStateId(),false);
	REXA_NDFAState* pFinalState= GetNewNDFAState(NextStateId(),true);
	pStartState->m_transitions.push_back(REXA_NDFATransition(rCharSet,pFinalState));
    return pStartState;
}

///////////////////////////////////////////////////////////////////////////////
//NDFA-Automtaton constructors
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Concat         (REXA_NDFAState* pState0,REXA_NDFAState* pState1)
//-----------------------------------------------------------------------------
{ //Terms: F0 resp. F1: StateGraph reachable from 'pState0' resp. 'pState1'
	//1. for each transition in F0 leading from state q to an accepting state 
	//   add a transition leading from q to 'pState1'
	vector<pair<REXA_NDFAState*,REXA_NDFATransition> > vecTrans= 
								CollectAcceptingTransitions(pState0);
    vector<REXA_NDFAState*>  vecAccepting= CollectAcceptingStates(pState0);
	for(int i=0;i<vecTrans.size();i++){
		vecTrans[i].first->m_transitions.push_back(
			REXA_NDFATransition(vecTrans[i].second.m_transChars,pState1));
	}
	//2. if 'pState0' is accepting 
	//   add all transitions from 'pState1'  to 'pState0'
	if( pState0->m_isAccepting ){
		pState0->m_transitions.insert(	pState0->m_transitions.end(),
										pState1->m_transitions.begin(),
										pState1->m_transitions.end());
	}
	//3. accepting states of F1 are accepting states of the catenation
	//   accepting states of F0 are no langer accepting
	for(int j=0;j<vecAccepting.size();j++){
		vecAccepting[j]->m_isAccepting= false;
	}
    return pState0;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Union          (REXA_NDFAState* pState0,REXA_NDFAState* pState1)
//-----------------------------------------------------------------------------
{ //1. 'pState0' gets also the transitions of 'pState1'
	pState0->m_transitions.insert(	pState0->m_transitions.end(),
									pState1->m_transitions.begin(),
									pState1->m_transitions.end());
 //2. if one of the start states is accepting the new start state is also accepting
	if( pState1->m_isAccepting ){
		pState0->m_isAccepting= true;
	} 
    return pState0;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::MakeStar       (REXA_NDFAState* pState)
//-----------------------------------------------------------------------------
{ // //1. for each transition  leading from state q to an accepting state 
	//   add a transition leading from q to 'pState
    vector<pair<REXA_NDFAState*,REXA_NDFATransition> > vecTrans= 
								CollectAcceptingTransitions(pState);
	for(int i=0;i<vecTrans.size();i++){
		vecTrans[i].first->m_transitions.push_back(
			REXA_NDFATransition(vecTrans[i].second.m_transChars,pState));
	}
	pState->m_isAccepting=true;
    return pState;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::MakePlus       (REXA_NDFAState* pState)
//-----------------------------------------------------------------------------
{ //catenate original state graph and its dupplication; then apply MakeStar
    return Concat(pState,MakeStar(CloneNDFA(pState)));
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::MakeOptional   (REXA_NDFAState* pState)
//-----------------------------------------------------------------------------
{ //start state gets optional
    pState->m_isAccepting= true;
	return pState;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::MakeIgnoreCase (REXA_NDFAState* pState)
//-----------------------------------------------------------------------------
{
    set<REXA_NDFAState*>     setStates;
    REXA_NDFAState::CollectStates(pState,setStates);
    set<REXA_NDFAState*>::iterator it;
    for(it= setStates.begin();it!=setStates.end();++it){
        for(int i=0;i<(*it)->m_transitions.size();i++){
            for(int j=0;j<(*it)->m_transitions[i].m_transChars.size();j++){
                if(     (*it)->m_transitions[i].m_transChars[j]
                    &&  isupper(j) ){
                    (*it)->m_transitions[i].
                        m_transChars[(unsigned char)(tolower(j))]= true;
                }
            }
        }
    }
    return pState;
}

//NDFA-Automaton helper  functions
//-----------------------------------------------------------------------------
vector<REXA_NDFAState*>  REXA_ParserImpl::CollectAcceptingStates(REXA_NDFAState* pState)
//-----------------------------------------------------------------------------
{
    set<REXA_NDFAState*>     setStates;
    vector<REXA_NDFAState*>  vecAcceptingStates;
    REXA_NDFAState::CollectStates(pState,setStates);
    set<REXA_NDFAState*>::iterator it;
    for(it= setStates.begin();it!=setStates.end();++it){
        if( (*it)->m_isAccepting ){
            vecAcceptingStates.push_back(*it);
        }
    }
    return vecAcceptingStates;
}

//-----------------------------------------------------------------------------
vector<pair<REXA_NDFAState*,REXA_NDFATransition> >	
			REXA_ParserImpl::CollectAcceptingTransitions(REXA_NDFAState*	pState)
//-----------------------------------------------------------------------------
{
	vector<pair<REXA_NDFAState*,REXA_NDFATransition> > vecStatTrans;

	set<REXA_NDFAState*> setStates;
    REXA_NDFAState::CollectStates(pState,setStates);
	set<REXA_NDFAState*>::iterator	it;
	for(it= setStates.begin();it!=setStates.end();++it){
        for(int i=0;i<(*it)->m_transitions.size();i++){
            if( (*it)->m_transitions[i].m_nextState->m_isAccepting ){
                vecStatTrans.push_back(make_pair(*it,(*it)->m_transitions[i]));
            }
        }
    }
	return vecStatTrans;
}

//-------------------------------------------------------------------------------
int  REXA_ParserImpl::NextStateId                  ()
//-------------------------------------------------------------------------------
{
    return m_nStateIdMax++;
}
//-------------------------------------------------------------------------------
REXA_NDFAState*   REXA_ParserImpl::CloneNDFA(REXA_NDFAState* pToClone)
//-------------------------------------------------------------------------------
{
    set<REXA_NDFAState*> setStates;
    REXA_NDFAState* pState= pToClone->Clone();
    REXA_NDFAState::CollectStates(pState,setStates);
    set<REXA_NDFAState*>::const_iterator it;
    for(it= setStates.begin();it!=setStates.end();++it){
        (*it)->m_id= NextStateId();
        m_vecAllStates.push_back(*it);
    }
    return pState;
}
//-------------------------------------------------------------------------------
REXA_NDFAState*   REXA_ParserImpl::GetNewNDFAState(int id,bool isAccepting)
//-------------------------------------------------------------------------------
{
    REXA_NDFAState* pState= new REXA_NDFAState(id,isAccepting);
    m_vecAllStates.push_back(pState);
    return pState;
}


///////////////////////////////////////////////////////////////////////////
//  NDFA    State
///////////////////////////////////////////////////////////////////////////

//-----------------------------------------------------------------------------
void    REXA_NDFAState::DeleteAll()
//-----------------------------------------------------------------------------
{
    set<const REXA_NDFAState*> setStates;
    CollectStates(this,setStates);
    set<const REXA_NDFAState*>::iterator it;
    for(it= setStates.begin();it!=setStates.end();++it){
        delete *it;
    }
}
//-------------------------------------------------------------------------------
void         REXA_NDFAState::CollectStates(const REXA_NDFAState* pState,set<const REXA_NDFAState*>& rsetStates)
//-------------------------------------------------------------------------------
{
    pair<set<const REXA_NDFAState*>::iterator,bool> itBool=    rsetStates.insert(pState);
    if( itBool.second ){
        for(int i=0;i<(*itBool.first)->m_transitions.size();i++){
            CollectStates((*itBool.first)->m_transitions[i].m_nextState,rsetStates);
        }
    }
}
//-------------------------------------------------------------------------------
void         REXA_NDFAState::CollectStates   (REXA_NDFAState* pState,
                                            set<REXA_NDFAState*>& rsetStates)
//-------------------------------------------------------------------------------
{
    pair<set<REXA_NDFAState*>::iterator,bool> itBool=    rsetStates.insert(pState);
    if( itBool.second ){
        for(int i=0;i<(*itBool.first)->m_transitions.size();i++){
            CollectStates((*itBool.first)->m_transitions[i].m_nextState,rsetStates);
        }
    }
}
//-------------------------------------------------------------------------------
REXA_NDFAState*  REXA_NDFAState::CopyStates      
                        (REXA_NDFAState* pState,map<REXA_NDFAState*,REXA_NDFAState*>& rMapStates)
//-------------------------------------------------------------------------------
{
	pair<map<REXA_NDFAState*,REXA_NDFAState*>::iterator,bool> itDone= 
			rMapStates.insert(make_pair(pState,(REXA_NDFAState*)0));
	if( itDone.second ){
		REXA_NDFAState* pNewState= new REXA_NDFAState(*pState);
        itDone.first->second= pNewState;
		for(int i=0;i<pNewState->m_transitions.size();i++){
			pNewState->m_transitions[i].m_nextState=
				CopyStates(pNewState->m_transitions[i].m_nextState,rMapStates);
		}
	}
    return itDone.first->second;
}


//-----------------------------------------------------------------------------
REXA_NDFAState*		REXA_NDFAState::Clone()
//-----------------------------------------------------------------------------
{
	map<REXA_NDFAState*,REXA_NDFAState*> mapStates;
	return CopyStates(this,mapStates);
}
///////////////////////////////////////////////////////////////////////////
//  REXA_StateSet
///////////////////////////////////////////////////////////////////////////
//-----------------------------------------------------------------------------
bool	REXA_StateSet::IsAccepting	()const
//-----------------------------------------------------------------------------
{
	REXA_StateSet::const_iterator it;
	for(it= begin();it!=end();++it){
		if( (*it)->m_isAccepting ){
			return true;
		}
	}
	return false;
}


//-----------------------------------------------------------------------------
void PrintCharSet(ostream& os,const REXA_CharSet& rCharSet)
//-----------------------------------------------------------------------------
{
	os	<<	"\"";
	for(int i=0;i<rCharSet.size();i++){
		if( rCharSet[i] ){
			os	<<	(char)i;
		}
	}
	os	<<	"\"";
}
//-----------------------------------------------------------------------------
int				REXA_TranslatingTable::GetAnswerCode	(const REXA_StateSet& rSet)const
//-----------------------------------------------------------------------------
{	
	REXA_StateSet::const_iterator it;
	int nAnswerCode=	REXA_DFAState::eNoAnswer;
	int nMinCount=		INT_MAX;
	for(it= rSet.begin();it!=rSet.end();++it){
		MapStateOccurrenceCount::const_iterator itMap=
			m_mapStateOccurrenceCount.find((*it));
		if( itMap!=m_mapStateOccurrenceCount.end() ){
			if( itMap->second<nMinCount ){
				nMinCount=		itMap->second;
				nAnswerCode=	itMap->first->m_nAnswer;
			}
		}
	}
	return nAnswerCode;
}
//-----------------------------------------------------------------------------
void			REXA_TranslatingTable::ComputeOccCounts	()
//-----------------------------------------------------------------------------
{	//computes the occurrences of all states with an existing answer code
	TranslationMap::const_iterator it;
	for(	it= m_mapTransitions.begin();
			it!=m_mapTransitions.end();
			++it){
				REXA_StateSet::const_iterator it1;
				for( it1= it->first.begin();it1!=it->first.end();++it1){
					if(		(*it1)->m_isAccepting 
						&&	(*it1)->m_nAnswer!=REXA_DFAState::eNoAnswer){
						pair<MapStateOccurrenceCount::iterator,bool> pairRes=
							m_mapStateOccurrenceCount.insert(make_pair(*it1,1));
						if( !pairRes.second ){
							pairRes.first->second++;
						}
					}
				}
			}
}

//-----------------------------------------------------------------------------
set<REXA_CharSet,REXA_LessCharSet> 
            REXA_TranslatingTable::ComputeDisjointCharSets(REXA_CharSetSet& rCharSets)
//-----------------------------------------------------------------------------
{
    vector< REXA_CharSetSet > charSets(256);
    vector< REXA_CharSet >    unionSets(256);
    REXA_CharSetSet           resultSet;
    REXA_CharSetSet::const_iterator it;
    for(it= rCharSets.begin();it!=rCharSets.end();++it){
        charSets[it->count()].insert(*it);
    }

//cumulative union
    for(int k= 1;k<256;k++){
        for(int j=0;j<k;j++){
            for(it= charSets[j].begin();it!= charSets[j].end();++it){
                unionSets[k]|= *it;
            }
        }
    }

//reduce charset into disjoint charsets
    for(int l=0;l<256;l++){
        for(it= charSets[l].begin();it!= charSets[l].end();++it){
            REXA_CharSet reducedSet= *it;
            REXA_CharSet unionSet= ~unionSets[l];
            reducedSet&= unionSet;
            resultSet.insert(reducedSet);
        }
    }

    return resultSet;
}
//-----------------------------------------------------------------------------
void REXA_TranslatingTable::CollectCharSets(const REXA_NDFAState* pCurState,
                     set<int>& rVisitedStates,
                     REXA_CharSetSet& rCharSets)
//-----------------------------------------------------------------------------
{
    if( rVisitedStates.find(pCurState->m_id) == rVisitedStates.end() ){
        rVisitedStates.insert(pCurState->m_id);
        for(int i=0;i<pCurState->m_transitions.size();i++){
            if( pCurState->m_transitions[i].m_transChars.count()!=0 ){
                rCharSets.insert(pCurState->m_transitions[i].m_transChars);
                CollectCharSets(
					pCurState->m_transitions[i].m_nextState,
					rVisitedStates,rCharSets);
            }
        }
    }
}



//# Generates a translation table from the NDFA-states present in 'vecpStates'
//example: NDFA having the States S,A,E.  S('a') -> A. S('a') -> E. A('b') -> A. A('b') -> E.
//TranslationTable: 
//  REXA_StateSet    'a'     'b'
//--------------------------
//    {S}      {A,E}    
//   {A,E}             {A,E}
// corresponding DFA: {S}('a') -> {A,E}. {A,E}('b') -> {A,E}
//-----------------------------------------------------------------------------
void REXA_TranslatingTable::GenerateTranslationTable(set<const REXA_NDFAState*>& rSetState)
//-----------------------------------------------------------------------------
{
    set< REXA_StateSet > setPendingStateSets;
	REXA_StateSet startStateSet;

	startStateSet.insert(m_pStartState);
    setPendingStateSets.insert(startStateSet);

    while( setPendingStateSets.size()>0 ){
        REXA_StateSet curStateSet= *setPendingStateSets.begin();
        setPendingStateSets.erase(curStateSet);
        if( m_mapTransitions.find(curStateSet)==m_mapTransitions.end() ){
            TransitionVector transVector(m_vecCharsets.size());
			for(int i=0;i<m_vecCharsets.size();i++){
				REXA_StateSet::const_iterator it;
				for(it= curStateSet.begin();it!=curStateSet.end();++it){
					for(int j=0;j<(*it)->m_transitions.size();j++){	
						const  REXA_NDFATransition& rTransition= (*it)->m_transitions[j];
						REXA_CharSet charSet= rTransition.m_transChars;
						charSet&= m_vecCharsets[i];
						if( charSet.any() ){
							transVector[i].insert(rTransition.m_nextState);
						}
					}
				}
			}
			m_mapTransitions.insert(make_pair(curStateSet,transVector));
			for(int l=0;l<transVector.size();l++){
				setPendingStateSets.insert(transVector[l]);
			}
        }
    }
}

static REXA_DFAState* GetIllegalState()
{
    static REXA_DFAState illegalState(REXA_DFAState::eNoAnswer,true);
    return &illegalState;
}
static REXA_DFAState* GetEosState()
{
    bool bFirstTime= true;
    static REXA_DFAState eosState(REXA_DFAState::eEos,true);
    if( bFirstTime ){
        bFirstTime= false;
        for(int i=0;i<sizeof(eosState.aNext)/sizeof(eosState.aNext[0]);i++){
            eosState.aNext[i]= &eosState;
        }
    }
    return &eosState;
}

REXA_DFAState* REXA_DFAState::ms_pIllegalState= ::GetIllegalState();
REXA_DFAState* REXA_DFAState::ms_pEosState= ::GetEosState();

//-------------------------------------------------------------------------------
REXA_DFAState::REXA_DFAState(int nAnwserId,bool bIsFinal)
		:m_nAnwserId(nAnwserId),
         m_bIsFinal(bIsFinal),
         m_bIsAccepting(false)	
//-------------------------------------------------------------------------------
{
    for(int i=0;i<256;i++)	aNext[i]=0;
}
//-------------------------------------------------------------------------------
void    REXA_DFAState::CollectStates(REXA_DFAState* pState,set<REXA_DFAState*>& rsetStates)
//-------------------------------------------------------------------------------
{
    if( !pState ) return;

    pair<set<REXA_DFAState*>::iterator,bool> itBool=    rsetStates.insert(pState);
    if( itBool.second ){
        for(int i=0;i<256;i++){
            CollectStates((*itBool.first)->aNext[i],rsetStates);
        }
    }
}

//-----------------------------------------------------------------------------
void    REXA_DFAState::DeleteAll()
//-----------------------------------------------------------------------------
{
    set<REXA_DFAState*> setStates;
    CollectStates(this,setStates);
    set<REXA_DFAState*>::iterator it;
    for(it= setStates.begin();it!=setStates.end();++it){
        if( *it!=ms_pIllegalState && *it!=ms_pEosState ){
            delete *it;
        }
    }
}



//-----------------------------------------------------------------------------
vector<REXA_CharSet> REXA_NDFAToTable::CharsetToVector(const REXA_CharSetSet& rCharSetSet)
//-----------------------------------------------------------------------------
{
	vector<REXA_CharSet> vecCharSet;
	REXA_CharSetSet::const_iterator it;
	for(it= rCharSetSet.begin();it!=rCharSetSet.end();++it){
		vecCharSet.push_back(*it);
	}
	return vecCharSet;
}
//-----------------------------------------------------------------------------
REXA_TranslatingTable::REXA_TranslatingTable    (const REXA_NDFAState* pStartState)
//-----------------------------------------------------------------------------
{
    set<int>                setVisitedStates;
    REXA_CharSetSet              charSets; 
	set<const REXA_NDFAState*>   setStates;
	
	REXA_NDFAState::CollectStates(pStartState,setStates);

	CollectCharSets(pStartState,setVisitedStates,charSets);
	m_vecCharsets=	REXA_NDFAToTable::CharsetToVector(
								ComputeDisjointCharSets(charSets));
    m_pStartState=	pStartState;
	GenerateTranslationTable(setStates);
	ComputeOccCounts();
}


//-----------------------------------------------------------------------------
REXA_NDFAToDfa::REXA_NDFAToDfa	(const REXA_TranslatingTable& rTable,char cEos)
						:m_rTable(rTable),m_cEos((unsigned char)cEos)
//-----------------------------------------------------------------------------
{
	
}
//-----------------------------------------------------------------------------
REXA_DFAState* REXA_NDFAToDfa::CreateDfaStates(
					const REXA_StateSet&				    rStateSet,
					map<REXA_StateSet,REXA_DFAState*>&	rMapStateSetDFAState)
//-----------------------------------------------------------------------------
{  
    pair<map<REXA_StateSet,REXA_DFAState*>::iterator,bool> pairItBool=
			rMapStateSetDFAState.insert(make_pair(rStateSet,(REXA_DFAState*)0));
	if( pairItBool.second ){
		REXA_DFAState* pDFAState= new REXA_DFAState();
		pairItBool.first->second= pDFAState;
		REXA_TranslatingTable::TranslationMap::const_iterator it=
			m_rTable.m_mapTransitions.find(rStateSet);
		if( it!= m_rTable.m_mapTransitions.end() ){
			const TransitionVector& rvecTrans= it->second;
			for(int i=0;i<rvecTrans.size();i++){
				if( rvecTrans[i].size()>0 ){
					REXA_DFAState* pTransState= 
						CreateDfaStates(rvecTrans[i],rMapStateSetDFAState);
					for(int j=0;j<256;j++){
                        if( m_rTable.m_vecCharsets[i][j] ){
                            pDFAState->aNext[j]= pTransState;
                        }
					}
				}
			}
			if( rStateSet.IsAccepting() ){
				int nAnswerCode= m_rTable.GetAnswerCode(rStateSet);
                pDFAState->m_bIsAccepting= true;
                pDFAState->m_nAnwserId= nAnswerCode;
				REXA_DFAState* pAcceptState= 
					new REXA_DFAState(nAnswerCode,true);
                pAcceptState->m_bIsAccepting= true;
				for(int j=0;j<256;j++){
					if( !pDFAState->aNext[j] ){
						pDFAState->aNext[j]= pAcceptState;
					}
				}
			}else{
				for(int j=0;j<256;j++){
					if( !pDFAState->aNext[j] ){
                        pDFAState->aNext[j]= REXA_DFAState::ms_pIllegalState;
					}
				}
                pDFAState->aNext[m_cEos]= REXA_DFAState::ms_pEosState;
			}
		}
        return pDFAState;
    }
	return  pairItBool.first->second;
}
//-----------------------------------------------------------------------------
REXA_DFAState* REXA_NDFAToDfa::TableToDfaStates()
//-----------------------------------------------------------------------------
{ 
	REXA_StateSet startStateSet;
	startStateSet.insert(m_rTable.m_pStartState);

	map<REXA_StateSet,REXA_DFAState*> mapStateSetDFAState;

    REXA_TranslatingTable::TranslationMap::const_iterator it;
	it= m_rTable.m_mapTransitions.find(startStateSet);
	if( it!=m_rTable.m_mapTransitions.end() ){
		return  CreateDfaStates(
					it->first,
					mapStateSetDFAState);
		
    }
    return 0;
}

//-----------------------------------------------------------------------------
REXA_Parser::REXA_Parser     (char cEos)
//-----------------------------------------------------------------------------
{
    m_pParserImp= new REXA_ParserImpl(cEos);

}
//-----------------------------------------------------------------------------
REXA_Parser::~REXA_Parser    ()
//-----------------------------------------------------------------------------
{
    delete m_pParserImp;
}
//-----------------------------------------------------------------------------
REXA_DFAState*      REXA_Parser::ParseRegExp     (string strRegExp)
//-----------------------------------------------------------------------------
{
    const REXA_NDFAState*    pNdfa=  m_pParserImp->ParseRegExp(strRegExp);
	REXA_TranslatingTable    table(pNdfa);
	REXA_NDFAToDfa           ndfaToDfa(table,m_pParserImp->m_cEos);
    return                   ndfaToDfa.TableToDfaStates();
}
//-----------------------------------------------------------------------------
void                REXA_Parser::AddRegDefinition (   string strDefName,
                                                string strDefExpression,
                                                int nAnswer)
//-----------------------------------------------------------------------------
{
    string str= strDefName+ "=" + strDefExpression;
    m_pParserImp->ParseRegExpDefinition(str,nAnswer);
}
//-----------------------------------------------------------------------------
void                REXA_Parser::RemoveRegDefinition (   string strDefName)
//-----------------------------------------------------------------------------
{
    m_pParserImp->m_mapNamedStates.erase(strDefName);
}
//-----------------------------------------------------------------------------
void                REXA_Parser::GetRegDefinitions   (vector<string>& vecDefinitions)
//-----------------------------------------------------------------------------
{
    map<string,REXA_NDFAState*>::iterator it;
    for(it= m_pParserImp->m_mapNamedStates.begin();
        it!=m_pParserImp->m_mapNamedStates.end();
        ++it){
        vecDefinitions.push_back(it->first);
    }
}


By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Switzerland Switzerland
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions