Click here to Skip to main content
15,895,084 members
Articles / Programming Languages / C++

Tokenizer and analyzer package supporting precedence prioritized rules

Rate me:
Please Sign up or sign in to vote.
5.00/5 (4 votes)
1 Jan 20023 min read 183.1K   2.8K   54  
A library allowing you to conveniently build a custom tokenizer and analyzer supporting precedence priorized rules
/*********************************************************************
	Copyright (C) 2001 by

		Alexander Berthold, alexander-berthold@web.de.
		Hoegestr. 54
		79108 Freiburg i. Breisgau
		Germany

    -- This file is part of cxAnalyzer --

    "cxAnalyzer" is free software; you can redistribute it and/or 
	modify it under the terms of the GNU Lesser General Public 
	License as published by the Free Software Foundation; either 
	version 2 of the License, or any later version.

    "cxAnalyzer" is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
	License along with "cxAnalyzer"; if not, write to the Free 
	Software  Foundation, Inc., 59 Temple Place, Suite 330, 
	Boston, MA  02111-1307  USA

    ---------------------------------------------------------------
      If you find any bugs or if you make other corrections/
	  enhancements, i'd appreciate if you'd let me know about 
	  that. My email is
  
       alexander-berthold@web.de
  
      If you share this code, do not remove this text.
    ---------------------------------------------------------------

*********************************************************************/

// cxAnalyzerExpression.cpp: implementation of the cxAnalyzerExpression class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"

#include "cxAnalyzerException.h"
#include "cxAnalyzerTypeMap.h"
#include "cxaToken.h"
#include "cxAnalyzerExpression.h"
#include "cxAnalyzerTree.h"
#include "cxaRuleCacheElement.h"
#include "cxaRuleCache.h"
#include "cxAnalyzerMain.h"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

	cxAnalyzerExpression::cxAnalyzerExpression(
							cxAnalyzerTypeMap* patmTypeMap)
		{
		m_nAtmType		=ATM_ID_INVALID;
		m_nPrecPrio		=-1;
		m_patmTypeMap	=patmTypeMap;
		m_fInitializeComplete=false;
		m_pstrInitTemp	=NULL;
		m_apatiPattern	=NULL;
		m_nSizePattern	=0;
		m_fIsCacheable	=false;
		m_fImplicitIgnore=false;
		}

	cxAnalyzerExpression::~cxAnalyzerExpression()
		{
		if(m_pstrInitTemp)
			delete m_pstrInitTemp;
		if(m_apatiPattern)
			delete[] m_apatiPattern;
		}

//////////////////////////////////////////////////////////////////////
// Protected operations
//////////////////////////////////////////////////////////////////////

	/********************************************************************
	FUNCTION:	fCheckComp()

  PARAMETERS:	'paHost'
					the cxAnalyzerMain object doing the test
				'patsContext'
					the token stream containing 'start'
				'nAtmTypeFirstIs'
					can be ATM_ID_INVALID if n/a, or an AtmID of the token
					preceding 'start' (which may not be equal to *(start-1)).
				'patFirstToken'
					if 'nAtmTypeFirstIs'!=ATM_ID_INVALID, contains the token
					described by nAtmTypeFirstIs
				'pascCondition'
					Can point to an 'cxaStatusCookie' object. Will if != NULL
					contain information on what parsing problem happened.
					CAUTION: Consumes a lot of additional time; use only if
					needed.
				'start'
					the first token to start the test with
				'pend'
					receives the position of the first token after the
					expression if the function is successful
				'patTree'
					if it is needed to 'treeize' the expression, then this
					argument takes the 'cxAnalyzerTree' which does that
					later on.

	PURPOSE:	Performs a test on the given token stream (patsContext)
				if the pattern of this expression matches the token pattern.

	RETURNS:	'true':
					Expression recognized.
					- (*pend) contains the position of the 
					  first token after the expression.
					- 'patTree' (if !=NULL) has been filled with the
					  recognized tokens
				'false':
					Parsing error.
					- if 'pascCondition' was !=NULL: contains the error
					  condition.
	********************************************************************/
	bool	cxAnalyzerExpression::fCheckComp(
				cxAnalyzerMain *paHost,
				const cxaTokenStream *patsContext,
				int nAtmTypeFirstIs,
				const cxaToken* patFirstToken,
				cxaStatusCookie* pascCondition,
				cxaTokenStream::const_iterator startpos,
				cxaTokenStream::const_iterator *pend,
				cxAnalyzerTree* patTree)
		{
#ifdef ANALYZER_STATS_ENABLED
		if(pascCondition!=NULL)
			{
			pascCondition->stats.nCurRecursionLevel++;
			if(	pascCondition->stats.nCurRecursionLevel>
				pascCondition->stats.nMaxRecursionLevel)
				pascCondition->stats.nMaxRecursionLevel = pascCondition->stats.nCurRecursionLevel;
			pascCondition->stats.nAERuleCallsTotal++;
			}
#endif
        if(patTree)
            {
            patTree->vBeginTrans();

			if(!fImplicitIgnore() &&
				!(m_nSizePattern==1 && m_apatiPattern[0]->nGetSubType()==ATM_MTYPE_TOKEN) )
				patTree->vBeginRule(m_nAtmType,nGetIDValue(),this);
            }

		bool				fCheckFirstIs = true;
        bool                fIsFirstToken = true;
        bool                fEndRuleLogged = false;
		int					i;
		cxaTokenStream::const_iterator	
							it, log_it;

		it					=startpos;
        log_it              =patsContext->end();
		for(i=0;i<m_nSizePattern;i++, pascCondition?pascCondition->nCurLength++:0)
			{
            const cxaToken*		patToken = NULL;
			int					nAtmType = ATM_ID_INVALID;
			bool				fDone = false;
            bool                fWasFirstToken = fIsFirstToken;
            fIsFirstToken       =false;

			// Fetch the next 'AtmType' (either from the args or from the next element)
			if(fCheckFirstIs)
				{
				fCheckFirstIs	=false;
				if(nAtmTypeFirstIs!=ATM_ID_INVALID)
					{
					nAtmType		=nAtmTypeFirstIs;
                    patToken        =patFirstToken;
					fDone			=true;
					}
				}
			if(!fDone)
				{
				if(it==patsContext->end())
					{
					if(pascCondition)
						pascCondition->vSetBrkCauseEots(m_nAtmType, m_nIDValue, patToken);
					break;
					}

                patToken        =&(*it);
				nAtmType		=(*it).nGetAtmType();
				it++;
				}

			if(patToken!=NULL && pascCondition!=NULL)
				pascCondition->lCurPosition = patToken->nGetTokenOrderID();
			
			// Compare the patterns
			int					nAtmTypePat, nMTypePat;
			nAtmTypePat			=m_apatiPattern[i]->nGetAtmType();
			nMTypePat			=m_apatiPattern[i]->nGetMType();

			// Already equal?
			if( nAtmTypePat==nAtmType && nMTypePat!=ATM_MTYPE_RULE)
                {
                if(patTree) 
                    patTree->vLogToken(nAtmType,TOKEN_ID_INVALID,NULL,patToken);

				continue;
                }

			// Remember if a token was resolved using 'nResolveUnknownToken'
			// (the original requested nAtmType is used later to 
			int nIDValueResolve = TOKEN_ID_INVALID;
			int nAtmTypeResolve = nAtmType;
			void *pvDataResolve = NULL;

			// If the current token is unknown, it could be that the 
			// consumer can resolve the token to something meaningful ...
			if( nAtmType==ATM_ID_LITERAL && nMTypePat==ATM_MTYPE_COMP_TOKEN )
				{
				const cxaToken* patArgToken = patToken;
				cxaTokenStream::const_iterator tempit = it;
				int nAtmTypeRet = ATM_ID_LITERAL;
				int nIDValueRet = TOKEN_ID_INVALID;
				void *pvDataRet = NULL;

				tempit--;
				if(patArgToken==NULL) patArgToken = &(*tempit);

				if( paHost->nResolveUnknownToken(patTree,nAtmTypePat,patArgToken,&nAtmTypeRet,&nIDValueRet,&pvDataRet) )
					{
					nAtmType = nAtmTypeRet;
					nAtmTypeResolve = nAtmType;
					nIDValueResolve = nIDValueRet;
					pvDataResolve = pvDataRet;

					// Check again for equality
					if( nAtmTypePat==nAtmType && nMTypePat!=ATM_MTYPE_RULE)
						{
						if(patTree) 
							patTree->vLogToken(nAtmTypeResolve,nIDValueResolve,pvDataResolve,patToken);

						continue;
						}
					}
				}


			// Not equal
			// Is 'nAtmTypePat' an expression?
			if(nMTypePat != ATM_MTYPE_RULE)
				{
				if(pascCondition)
					pascCondition->vSetBrkCause(cxaStatusCookie::brkcause_wrongtoken,patToken);
				break;
				}

			// If the first token in the pattern is a 
			// rule (the same rule as the one here)
			// it must be able to be implicitly mapped to
			// the rule.
            if(fWasFirstToken && nAtmTypePat == nGetAtmType())
                {
				if(	nAtmTypePat!=nAtmType &&
					!paHost->fIsImplicitRule(nAtmTypePat, nAtmType, NULL))
					{
					if(pascCondition)
						pascCondition->vSetBrkCause(cxaStatusCookie::brkcause_wrongtoken,patToken);
					break;
					}

                if(patTree)
                    patTree->vLogToken(nAtmTypeResolve,nIDValueResolve,pvDataResolve,patToken);

                continue;
                }
			else
				{
				// the condition below is true, if we are in something
				// like {.xyz}={...}{.xyz}, i.e. in a right-bound expression.
				// then, we can safely leave the function with 'sucess'.
				if(i==(m_nSizePattern-1) && nAtmTypePat==m_nAtmType)
					{
					// the code below is because the continuation
					// is handled by the callee (cxAnalyzerMain)
					i++;
					it--; // most recent token is handled by callee
                       patTree->vLogToken(m_nAtmType,TOKEN_ID_INVALID,NULL,NULL);
					break;
					}
				}

			cxaTokenStream::const_iterator	temp;

			// Is a rule available?
            if(patTree)
                {
                patTree->vBeginTrans();
                if(i==(m_nSizePattern-1) && fIsRightBound(nGetAtmType()))
                    {
	                patTree->vLogToken(nAtmTypeResolve,nIDValueResolve,pvDataResolve,NULL);
                    fEndRuleLogged=true;

					if(!fImplicitIgnore())
						patTree->vEndRule();
                    }
                }

            if(!paHost->fCheckRuleInt(	nAtmTypePat,patsContext,
										nAtmType,patToken,
										pascCondition,
										it,&temp,patTree))
                {
                if(patTree)
                    patTree->vRollbackTrans();

				if(pascCondition)
					pascCondition->vSetBrkCause(cxaStatusCookie::brkcause_subrulefailed,patToken);
				break;
                }
			else
				{
                if(patTree)
                    patTree->vCommitTrans();

				// A rule was available
                if(i==(m_nSizePattern-1))
                    {
                    // This was the last element in the pattern,
                    // and the element was a rule. Since the rule
                    // has already been logged by 'fCheckRule',
                    // in this case the end of _this_ rule
                    // must point to the start of the recently
                    // recognized rule.
                    log_it  =it;
                    it      =temp;
                    }
                else
                    {
                    it		=temp;
                    }
                }
            }
        
        if(i==m_nSizePattern)
            {
#ifdef ANALYZER_STATS_ENABLED
			if(pascCondition!=NULL)
				{
				pascCondition->stats.nCurRecursionLevel--;
				pascCondition->stats.nSuccessfulAERuleCallsTotal++;
				}
#endif
            if(patTree)
                {
				if( !fEndRuleLogged && !fImplicitIgnore() &&
					!(m_nSizePattern==1 && m_apatiPattern[0]->nGetSubType()==ATM_MTYPE_TOKEN))
                    patTree->vEndRule();

                patTree->vCommitTrans();
                }

            (*pend)	=it;
			const cxaToken& pape = (*it);
            return true;
            }

#ifdef ANALYZER_STATS_ENABLED
		if(pascCondition!=NULL)
			pascCondition->stats.nCurRecursionLevel--;
#endif

        if(patTree)
            patTree->vRollbackTrans();

        return false;
        }
    
    
//////////////////////////////////////////////////////////////////////
// Operations
//////////////////////////////////////////////////////////////////////

    bool cxAnalyzerExpression::fCheckValid() const
        {
        if(NULL == this)
            return false;
        
        if(m_nAtmType==ATM_ID_INVALID)
            return false;
        
        if(m_nPrecPrio<0)
            return false;
        
        if(m_fInitializeComplete==false)
            return false;
        
        return true;
        }
/*
#ifndef ANALYZER_NO_OPTIMIZATION
	bool cxAnalyzerExpression::fInsertIntoExclusionMap(int nLength, const cxAnalyzerExpression *paeExpr)
		{
		aeexclude_mmap_type::const_iterator	it;
		bool	fInsert = true;

		it		=m_mapLenToExclusion.find(nLength);
		if(it!=m_mapLenToExclusion.end())
			{
			int iu = (*it).first;
			for(;(*it).first==nLength && it!=m_mapLenToExclusion.end(); it++)
				if( (*it).second==paeExpr)
					{ fInsert = false; break; };
			}
		if(fInsert)
			m_mapLenToExclusion.insert( std::make_pair(nLength,paeExpr) );

		return fInsert;
		}

	void cxAnalyzerExpression::vUpdateExclusionSet(int nLength, std::set<const cxAnalyzerExpression*>& exclset) const
		{
		aeexclude_mmap_type::const_iterator	it;

		it		=m_mapLenToExclusion.find(nLength);
		for(;it!=m_mapLenToExclusion.end() && it->first==nLength;it++)
			if(exclset.find(it->second)==exclset.end())
				exclset.insert(it->second);
		}
#endif
*/  
    bool cxAnalyzerExpression::fInitFromRule(std::tstring strRule)
        {
        // May not be done already
        ASSERT(m_nAtmType==ATM_ID_INVALID);
        ASSERT(m_fInitializeComplete==false);
        
        // This function should use a parser (hehe)
        int				nPosEqual,nSep,nPos0;
        std::tstring	strName;
        std::tstring	strTemp;

		nPos0			=0;
        nPosEqual		=strRule.find(_T('='));
		nSep			=strRule.find(_T(':'));
		if(nSep<nPosEqual)
			{
			m_nIDValue		=_ttoi(strRule.substr(0,nSep).c_str());
			if(m_nIDValue<0)
				{
				m_nIDValue	=-m_nIDValue;
				m_fImplicitIgnore = true;
				}
			nPos0			=nSep+1;
			}
		else
			m_nIDValue		=0;

        nSep			=strRule.find(_T(':'),nPosEqual+1);
        
        if(nPosEqual==-1 || nSep==-1)
            throw cxAnalyzerException(ANERR_RULEDEF_WRONG, strRule.c_str());
        
        strName			=strRule.substr(nPos0,nPosEqual-nPos0);
        strTemp			=strRule.substr(nPosEqual+1,nSep-nPosEqual-1);
        
		// Special flags specified?
		while(isalpha(strName[0]))
			{
			switch(strName[0])
				{
				case 'C':
					m_fIsCacheable = true;
					break;
				default:
					ASSERT(FALSE);
					break;
				}

			strName = strName.substr(1);
			}
        
        // Remove surrounding {}
        if(!ctkMisc::fStripOffBraces(strName))
            throw cxAnalyzerException(ANERR_RULEDEF_EXPR_WRONG, strRule.c_str());
        
		try {
			m_nPrecPrio		=_ttoi(strTemp.data());
			m_nAtmType		=m_patmTypeMap->nGetAtmTypeFor(strName);
			m_patmTypeMap->vSetCustIDToAtmMapping(m_nIDValue,m_nAtmType);
			m_patmTypeMap->patiSetTypeInfo(m_nAtmType,ATM_MTYPE_RULE,ATM_SUBT_RULE_UNDEF);
			}
		catch(cxAnalyzerException& e)
			{
			if(e.ErrorString()==NULL)
				e.SetErrorString(strRule.c_str());
			throw e;
			}
        
        // Temporary store the initialization string
        m_pstrInitTemp	=new std::tstring(strRule.substr(nSep+1));
        
        return true;
        }
    
    bool cxAnalyzerExpression::fFinishInit()
        {
        // May not be done already, and fInitFromRule must have been called
        // already.
        ASSERT(m_fInitializeComplete==false);
        ASSERT(m_nAtmType!=ATM_ID_INVALID);
        ASSERT(m_pstrInitTemp!=NULL);
        ASSERT(m_apatiPattern==NULL);
        ASSERT(m_nSizePattern==0);
        
        // Since we use no parser :), replace all single '{' and '}' with
        // '{{' and '}}', to distinguish them from escaped ones ('\{', '\}')
        std::tstring	strRule = ctkMisc::strEscapeBraces(*m_pstrInitTemp);
        
        // Delete temporary string
        //	delete m_pstrInitTemp;
        //	m_pstrInitTemp	=NULL;
        
        // Go on with parsing the expression
        // (The expression is now in the form {{...}}[{{...}}[...]] )
        int				nCurPos;
        int				nLen = strRule.length();
        if(nLen==0)
            throw cxAnalyzerException(ANERR_RULEDEF_EXPR_EMPTY, (*m_pstrInitTemp).c_str() );
        
        std::vector<int>	vecTemp;
        for(nCurPos=0;nCurPos<nLen;)
            {
            if(strRule.substr(nCurPos,2)!=_T("{{"))
                throw cxAnalyzerException(ANERR_RULEDEF_EXPR_WRONG, (*m_pstrInitTemp).c_str() );

            int				nEndBrace;
			for(nEndBrace=nCurPos;nEndBrace<((int)strRule.size());nEndBrace++)
				{
				if(strRule[nEndBrace]=='\\') { nEndBrace++; continue; }
				if(strRule[nEndBrace]=='}') break;
				}
            int				nAtmType = ATM_ID_INVALID;
            std::tstring	strPart;
            
            if(nEndBrace==-1)
                throw cxAnalyzerException(ANERR_RULEDEF_EXPR_WRONG, (*m_pstrInitTemp).c_str() );
            
            strPart			=ctkMisc::strUnescapeBraces(strRule.substr(nCurPos+2,nEndBrace-nCurPos-2));
			strPart			=ctkMisc::strParseEscapeCharacters(strPart);
            nAtmType		=m_patmTypeMap->nGetAtmTypeFor(strPart,false);
            if(nAtmType==ATM_ID_INVALID)
				{
				m_patmTypeMap->vDump();
                throw cxAnalyzerException(ANERR_RULEDEF_PART_UNDEFINED, (*m_pstrInitTemp).c_str() );
				}
            
            vecTemp.push_back(nAtmType);
            
            nCurPos			=nEndBrace+2;
            }
        
        // Now store the contents of the 'vecTemp' array in a dynamically
        // allocated 'int[]' (cause the length of the array is not subject
        // to change)
        int				i;
        
        m_nSizePattern	=vecTemp.size();
        m_apatiPattern	=new cxAnalyzerTypeInfo*[m_nSizePattern];
        for(i=0;i<m_nSizePattern;i++)
            m_apatiPattern[i]=m_patmTypeMap->patiGetTypeInfo(vecTemp[i]);
        
        // Almost done ...
        m_fInitializeComplete=true;

		// the only thing left is to update the nSubType value of
		// the cxAnalyzerTypeInfo belonging to our nAtmID if
		// necessary
		cxAnalyzerTypeInfo	*patiCur = 
			m_patmTypeMap->patiGetTypeInfo(m_nAtmType);

		if(fIsLeftBound(m_nAtmType) || fIsRightBound(m_nAtmType))
			{
			if(patiCur->nGetSubType()!=ATM_SUBT_RULE_OPEN)
				patiCur->vSetSubType(ATM_SUBT_RULE_OPEN);
			}
		else
			{
			if(patiCur->nGetSubType()==ATM_SUBT_RULE_UNDEF)
				patiCur->vSetSubType(ATM_SUBT_RULE_FINITE);
			}
        
        return true;
        }
    
    bool	cxAnalyzerExpression::fIsImplicitExpressionToAtm(int *pnAtmType) const
        {
        ASSERT(m_fInitializeComplete);
        ASSERT(pnAtmType!=NULL);
        
        if(m_nSizePattern!=1)
            return false;
        
        if(	(m_apatiPattern[0]->nGetMType()==ATM_MTYPE_TOKEN) ||
            (m_apatiPattern[0]->nGetMType()==ATM_MTYPE_COMP_TOKEN) ||
			(m_apatiPattern[0]->nGetMType()==ATM_MTYPE_RULE))
            {
            (*pnAtmType)=m_apatiPattern[0]->nGetAtmType();
            return true;
            }
        
        return false;
        }

	bool	cxAnalyzerExpression::fIsLeftBound(int nAtmType) const
		{
		ASSERT(m_nSizePattern>0);

		if(m_apatiPattern[0]->nGetMType()!=ATM_MTYPE_RULE)
			return false;

		if(m_apatiPattern[0]->nGetAtmType()!=nAtmType)
			return false;

		return true;
		}

	bool	cxAnalyzerExpression::fIsRightBound(int nAtmType) const
		{
		ASSERT(m_nSizePattern>0);

		if(m_apatiPattern[m_nSizePattern-1]->nGetMType()!=ATM_MTYPE_RULE)
			return false;

		if(m_apatiPattern[m_nSizePattern-1]->nGetAtmType()!=nAtmType)
			return false;

		return true;
		}
    

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
Germany Germany
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions