Click here to Skip to main content
15,880,469 members
Articles / Programming Languages / C++

Crafting an interpreter Part 1 - Parsing and Grammars

Rate me:
Please Sign up or sign in to vote.
4.88/5 (77 votes)
29 Apr 2005CPOL20 min read 218.8K   2.7K   200  
First of a series of articles on building a language interpreter, describing basics about parsing and grammars.
/*  samples for 'PARSING and GRAMMARS' 
	Author: Martin.Holzherr@infobrain.com
	Purpose: demonstrate simple recursive descent parsing techniques
	
*/
#pragma warning(disable:4786)
#include <cstdlib>
#include <iostream>
#include <cassert>
#include <cstdio>
#include <string>
#include <algorithm>
#include <time.h>
#include <iomanip>
#include <climits>
#include <sstream>

#include "peg_templ.h"
#include "peg_proc.h"

struct C{
	C():nNofIntegers(0),nNofIdents(0){}
	int nNofIntegers;
	int nNofIdents;
};

class SimpleTimer{
public:
    SimpleTimer(const char* msg):m_msg(msg),m_start(clock()){}
	void SetErr(std::string s){m_err=s;}
    ~SimpleTimer()
    {
        m_stop = clock();
        std::cout   <<  ">> CPU usage ("   << std::setw(35) <<  m_err+m_msg    
					<<  ") = " <<  std::setprecision(12) 
					<< ((double)(m_stop - m_start)/ CLOCKS_PER_SEC)
                    <<  " seconds"   <<  std::endl;
    }
private:
    time_t m_start, m_stop;
    std::string     m_msg;
	std::string		m_err;
};

void ShowOptions()
{
	std::cout	
				<<	"-s\t\tshow built in samples"	<< std::endl
				<<	"-d <digits>\tExpects parenthized digits, e.g. 1132 or (123) or (((5)))" << std::endl
                <<	"-e <email>\tExpects an email as input" 	<< std::endl
				<<	"-r <ident>\tExpects an identifier, uses begend iterator" << std::endl
                <<  "-p\tperformance test for various PEG implementations"  << std::endl
				<<	std::endl	<<  std::endl
				<<	"enter one of the options starting with either " << std::endl
				<<	"-s -d -e -r or -p with parameters as explained above"
				<<	std::endl;
				
}

struct peg_begend_iterator{
	peg_begend_iterator(const char* p=0,const char* pEnd=0)
			:m_p((const unsigned char*)p),
             m_pEnd((const unsigned char*)pEnd){}
	int  operator*()const{
		return m_p==m_pEnd?(INT_MIN+1):*m_p;
	}
	peg_begend_iterator& operator++(){
		if(m_p<m_pEnd)++m_p;
		return *this;
    }
	
	unsigned const char* m_p;
	unsigned const char* m_pEnd;
};

typedef const unsigned char* openend_iterator;
bool MatchesDigits(openend_iterator& it)
{
   return (*it>='0' && *it<='9'?(++it,true):false)//[0-9]
      &&  (MatchesDigits(it),true);                //Digits?
}
bool MatchesEnclosedDigits(openend_iterator& it)
{	openend_iterator itSave= it;
   return 
      MatchesDigits(it)                          // Digits
   ||                                            // |
        (*it=='('?(++it,true):false)             //  '('
      && MatchesEnclosedDigits(it)               //   EnclosedDigits
      && (*it==')'?(++it,true):(it=itSave,false));//  ')'
}

bool MatchesEMail(openend_iterator& it)
{  // EMailChar:   [A-Za-z0-9._%-]; 
   // EMailSuffix:  [A-Za-z]{2,4} !EMailChar ;
   /* EMail:       EMailChar+ 
				   '@' 
                   EMailChar 
 				   ([A-Za-z0-9_%-] |'.'!EMailSuffix)* 
                   '.' 
                   EmailSuffix ;
	*/
    using namespace peg_templ;
    C c;
	typedef In<'A','Z','a','z'>                     Alpha;
	typedef In<'A','Z','a','z','0','9'>             Alnum;
    typedef Or<Alnum,OneOfChars<'.','_','%','-'> >	EMailChar;
    typedef And<For<2,4,Alpha >,Not<EMailChar > >	EMailSuffix;
    typedef And< 
                PlusRepeat<EMailChar>,
                Char<'@'>,
                PlusRepeat<Or< Or<Alnum,OneOfChars<'_','%','-'> >,
							  And<	Char<'.'>,Not<EMailSuffix> > 
						  > 
                >,
				Char<'.'>,
                EMailSuffix
            > EMail;
    return EMail::Matches(it,&c);
}

template<typename Iterator>
bool MatchesIdentOnly(Iterator& it)
{
	using namespace peg_templ;
	C c;
	typedef And<PlusRepeat<In<'a','z','A','Z'> >,Not<In<1,255> > > FullIdent;
	return FullIdent::Matches(it,&c);
}

std::string ShowResult(const char* toDisplay,
					   const char* itBeg, const char* it,bool bOk)
{
    int nPos= it-itBeg;
    std::string sRes=
        std::string(toDisplay,nPos) + "|" + std::string(toDisplay+nPos);
    return ">> " + sRes + (bOk?" : success " : " : fail")+"\n";
}

void Test_MatchesEnclosedDigits(const char* toTest)
{
    openend_iterator it= (openend_iterator)toTest;
    std::cout 
		<< ShowResult(toTest,toTest,(const char*)it,MatchesEnclosedDigits(it));  
        
}
void Test_MatchesEMail(const char* toTest)
{
	openend_iterator it= (openend_iterator)toTest;
	bool bOk= MatchesEMail(it);
	std::cout << ShowResult(toTest,toTest,(const char*)it,bOk);
}
void Test_MatchIdentBegEndIterator(const char* toTest)
{
	size_t n= strlen(toTest);
	peg_begend_iterator it(toTest,toTest+n);
	bool bOk= MatchesIdentOnly(it);
	std::cout << ShowResult(toTest,toTest,(const char*)it.m_p,bOk);
	if( !bOk && n>1){
		for( ;n>0;n--){
			peg_begend_iterator it1(toTest,toTest+n);
			bool bOk= MatchesIdentOnly(it1);
			if( bOk ){
				std::string sMsg(toTest);
				std::ostringstream oss;
				oss<<n;
				sMsg+= " (only first " + oss.str() + ")";
				std::cout << ShowResult(sMsg.c_str(),toTest,(const char*)it1.m_p,bOk);
				break;
			}
		}
	}
}
////////////////{ procedural implementation of Expr and related rules for performance test
namespace test_proc_peg{
	using namespace peg_proc;
inline bool IsExpr(unsigned const char*& p,C* pC);
inline bool IsSpace(unsigned const char*& p,C* pC)
{	(pC);
	while(OneOfChars(p,' ','\t'));
	return true;
}
inline bool IsIdent(const unsigned char*& p,C* pC)
{
	const unsigned char* pSave;
    if( !PEG_OR2(pSave,p,In(p,'a','z','A','Z'),Char(p,'_')) ){return false;}
	while( PEG_OR2(pSave,p,In(p,'a','z','A','Z','0','9'),Char(p,'_')) ){}
	pC->nNofIdents++;
    return true;
}
inline bool IsInteger(const unsigned char*& p,C* pC)
{
    if( !In(p,'0','9') ){return false;}
	while(In(p,'0','9') ){}
	pC->nNofIntegers++;
    return true;
}
inline bool IsFactor(const unsigned char*& p,C* pC)
{
	const unsigned char* pSave;
	return PEG_OR3(pSave,p,
				IsIdent(p,pC),
				IsInteger(p,pC),
				PEG_AND(pSave,p,	
							Char(p,'(')
						&&	IsSpace(p,pC)
						&&	IsExpr(p,pC)
						&&	IsSpace(p,pC)
						&&	Char(p,')')
						)
			);
}
inline bool IsTerm(const unsigned char*& p,C* pC)
{
    if( !IsFactor(p,pC) ){return false;}
    IsSpace(p,pC);
	const unsigned char* pSave;
    while(
			PEG_AND(pSave,p,
					OneOfChars(p,'*','/')
				&&	IsSpace(p,pC)
				&&	IsFactor(p,pC)
				&&	IsSpace(p,pC)
				)	){
	}
    return true;
}
inline bool IsExpr(unsigned const char*& p,C* pC)
{
    if( !IsTerm(p,pC) ){ return false; }
    IsSpace(p,pC);
	const unsigned char* pSave;
	while(
			PEG_AND(pSave,p,
					OneOfChars(p,'+','/')
				&&	IsSpace(p,pC)
				&&	IsTerm(p,pC)
				&&	IsSpace(p,pC)
				)	){
	}
    return true;
}
void ProcPegPerformance(const char* s,size_t nNofExpectedIntsAndIntegers)
{
	C c;
    SimpleTimer st("procedureal PEG implementation");
	const unsigned char* pStart=(const unsigned char*)s;
    while( IsExpr(pStart,&c) ){}
	if( !(		nNofExpectedIntsAndIntegers==c.nNofIntegers
		&&		nNofExpectedIntsAndIntegers==c.nNofIdents) ){
			st.SetErr("EVALUATION ERROR ");
	}
}
}

////////////////{ template implementation of Expr and related rules for performance test
namespace test_templ_peg{
using namespace peg_templ;
struct Term;
struct Factor;
typedef Or<In<'a','z','A','Z'>,Char<'_'> >  BegChar;
typedef Or<BegChar,In<'0','9'> >            SecondChar;
typedef And<BegChar,OptRepeat<SecondChar> > Ident;
typedef PlusRepeat<In<'0','9'> >            Integer;
typedef OptRepeat<OneOfChars<' ','\t'> >      S;



struct CountIdent{
	template<typename It, typename Context>
	 static bool Matches(It& p,Context* pC)
	{ 
		if(Ident::Matches(p,pC) ){
			pC->nNofIdents++;
			return true;
		}else{
			return false;
		}
	}
};

struct CountInteger{
	template<typename It, typename Context>
	 static bool Matches(It& p,Context* pC)
	{ 
		if(Integer::Matches(p,pC) ){
			pC->nNofIntegers++;
			return true;
		}else{
			return false;
		}
	}
};
struct Expr{
    typedef And<Term,S,OptRepeat<And<OneOfChars<'+','-'>,S,Term,S> > >    rule;

    template<typename It, typename Context>
    static bool Matches(It& p,Context* pC)
    {return rule::Matches(p,pC);}
};
struct Term{
    typedef And<S,Factor,S,OptRepeat<And<OneOfChars<'*','/'>,S,Factor,S> > >rule;

    template<typename It, typename Context>
    static bool Matches(It& p,Context* pC)
    {return rule::Matches(p,pC);}
};
struct Factor{
    typedef Or<CountIdent,CountInteger,And<Char<'('>,S,Expr,S,Char<')'> > > rule;

    template<typename It, typename Context>
    static bool Matches(It& p,Context* pC)
    { return rule::Matches(p,pC);}
};


////////////////} template implementation of Expr and related rules for performance test


template<typename Iterator>
void TemplPegPerformanceImpl(const char* sTitle, Iterator it,
									int nNofExpectedIntsAndIntegers)
{   using namespace peg_templ;
	C c;
	SimpleTimer st(sTitle);
    PlusRepeat<And<Expr,S> >::Matches(it,&c);
	if( !(		nNofExpectedIntsAndIntegers==c.nNofIntegers
		&&		nNofExpectedIntsAndIntegers==c.nNofIdents) ){
			st.SetErr("EVALUATION ERROR ");
	}
}



void TemplPegPerformance(const char* s,size_t nNofExpectedIntsAndIntegers)
{
	peg_begend_iterator begEnd(s,s+strlen(s));
	TemplPegPerformanceImpl("templated PEG implementation",
						(const unsigned char*)s,nNofExpectedIntsAndIntegers);
	TemplPegPerformanceImpl("begEnd templated PEG implementation",
								begEnd,nNofExpectedIntsAndIntegers);
}
}
////////////////{ hand crafted implementation of Expr and related rules for performance test
namespace test_handcrafted{
inline bool IsExpr(unsigned const char*& p,C* pC);
inline bool IsSpace(unsigned const char*& p,C* pC)
{	(pC);
	while(*p==' '||*p=='\t'){++p;}
	return true;
}
inline bool IsIdent(const unsigned char*& p,C* pC)
{
    if( !(*p>='a' && *p<='z' || *p>='A'&&*p<='Z' || *p=='_') ){return false;}
    ++p;
    while( *p>='a'&&*p<='z' || *p>='A'&&*p<='Z' || *p>='0'&&*p<='9' || *p=='_'){
        ++p;
    }
	pC->nNofIdents++;
    return true;
}
inline bool IsInteger(const unsigned char*& p,C* pC)
{
    if( !(*p>='0' && *p<='9') ){return false;}
    ++p;
    while( *p>='0'&&*p<='9'){
        ++p;
    }
	pC->nNofIntegers++;
    return true;
}
inline bool IsFactor(const unsigned char*& p,C* pC)
{
    if( IsIdent(p,pC) )    {return true;}
    if( IsInteger(p,pC) )  {return true;}
    const unsigned char* p0=p;
    if(*p=='('){
        ++p;
        IsSpace(p,pC);
        if( IsExpr(p,pC) ){
            IsSpace(p,pC);
            if(*p==')'){
                ++p;
                return true;
            }
        }
    }
    p= p0;
    return false;
}
inline bool IsTerm(const unsigned char*& p,C* pC)
{
    if( !IsFactor(p,pC) ){return false;}
    IsSpace(p,pC);
    while(*p=='*'||*p=='/'){
        const unsigned char* p0=p;
        ++p;
        IsSpace(p,pC);
        if(IsFactor(p,pC)){
            IsSpace(p,pC);
        }else{
            p=p0;
            break;
        }
    }
    return true;
}
inline bool IsExpr(unsigned const char*& p,C* pC)
{
    
    if( !IsTerm(p,pC) ){ return false; }
    IsSpace(p,pC);
    while(*p=='+'||*p=='-'){
        const unsigned char* p0=p;
        ++p;
        IsSpace(p,pC);
        if(IsTerm(p,pC)){
            IsSpace(p,pC);
        }else{
            p=p0;
            break;
        }
    }
    return true;
}
int HandOptimizedPerformance(const char* s,size_t nNofExpectedIntsAndIntegers)
{
    C c;
    SimpleTimer st("hand optimized PEG implementation");
	const unsigned char* pStart=(const unsigned char*)s;
    while( IsExpr(pStart,&c) ){}
	if( !(		nNofExpectedIntsAndIntegers==c.nNofIntegers
		&&		nNofExpectedIntsAndIntegers==c.nNofIdents) ){
			st.SetErr("EVALUATION ERROR ");
	}
    return 0;
}
}
void Test_Performance()
{//Expr: Term S (('+'|'-') S Term)* ;
 //Term= Factor S (('*'|'/') S Factor S)* ;
 //Factor: Ident / Integer / '(' S Expr S ')' ;
 //Ident:[a-zA-Z_][a-zA-Z0-9_]* ; Integer: [0-9]+ ;

	const char sExpr[]= "132*( firstOccurance  +  x2*( 1001/N55 )+19 ) ";
	char s[1000000]={0};
	for(int i=0;i<sizeof(s)/sizeof(sExpr);i++){
		memcpy(s+i*(sizeof(sExpr)-1),sExpr,sizeof(sExpr)-1);
	}
	s[sizeof(s)-1]='\0';
	std::cout << ">> parse a 1 MB buffer containing expressions " << std::endl;
	test_proc_peg::ProcPegPerformance			(s,	3*(sizeof(s)/sizeof(sExpr)));
	test_templ_peg::TemplPegPerformance			(s,	3*(sizeof(s)/sizeof(sExpr)));
	test_handcrafted::HandOptimizedPerformance	(s,	3*(sizeof(s)/sizeof(sExpr)));
	std::cout << ">> performance tests done" << std::endl;
}
void ShowBuiltInSamples();
void ExecuteOption(const char* sOption)
{
	using namespace peg_templ;
	C c;
	typedef OptRepeat<OneOfChars<' ','\t','\r'> > S;
	typedef And<S,Char<'-'> > option;
	if( option::Matches(sOption,&c) ){
		if(        And<Char<'d'>,S>::Matches(sOption,&c)){
			Test_MatchesEnclosedDigits(sOption);
		}else if ( And<Char<'e'>,S>::Matches(sOption,&c)){
			Test_MatchesEMail(sOption);
		}else if ( And<Char<'r'>,S>::Matches(sOption,&c)){
			Test_MatchIdentBegEndIterator(sOption);
		}else if ( And<Char<'p'>,S>::Matches(sOption,&c)){
			Test_Performance();
		}else if ( And<Char<'s'>,S>::Matches(sOption,&c)){
			ShowBuiltInSamples();
		}else{
			ShowOptions();
		}
	}else{
		std::cout 
		<< "your line must start with a '-' followed by 'd', 'e', 'r' or 'p'" 
		<<	std::endl;
		ShowOptions();
	}
}
void SelfExecuteOption(const char* pOption)
{
	std::cout << pOption << std::endl;
	ExecuteOption(pOption);
	std::cout << std::endl;
}
void ShowBuiltInSamples()
{
	SelfExecuteOption("-d (((489)))");
	SelfExecuteOption("-d (((123))");
	SelfExecuteOption("-e  support@codeproject.com");
	SelfExecuteOption("-e  bloom@blo.blo.uk");
	SelfExecuteOption("-e  byrne@bomm.123");
	SelfExecuteOption("-r  fashion week");
	SelfExecuteOption("-p ");
	ExecuteOption("-?");
}



int main(int argc, char *argv[])
{
	std::cout << "welcome to the samples of PARSING & GRAMMARS" << std::endl;
	ShowOptions();
    char buf[1048];
    while( std::cin.getline(buf,1048) ){
		ExecuteOption(buf);
	}
    system("PAUSE");
    return EXIT_SUCCESS;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Switzerland Switzerland
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions