Click here to Skip to main content
15,896,207 members
Articles / Programming Languages / C++

Tokenizer and analyzer package supporting precedence prioritized rules

Rate me:
Please Sign up or sign in to vote.
5.00/5 (4 votes)
1 Jan 20023 min read 183.4K   2.8K   54  
A library allowing you to conveniently build a custom tokenizer and analyzer supporting precedence priorized rules
/*********************************************************************
	Copyright (C) 2001/2 by

		Alexander Berthold, alexander-berthold@web.de.
		Hoegestr. 54
		79108 Freiburg i. Breisgau
		Germany

    -- This file is part of cxAnalyzer --

    "cxAnalyzer" is free software; you can redistribute it and/or 
	modify it under the terms of the GNU Lesser General Public 
	License as published by the Free Software Foundation; either 
	version 2 of the License, or any later version.

    "cxAnalyzer" is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
	License along with "cxAnalyzer"; if not, write to the Free 
	Software  Foundation, Inc., 59 Temple Place, Suite 330, 
	Boston, MA  02111-1307  USA

    ---------------------------------------------------------------
      If you find any bugs or if you make other corrections/
	  enhancements, i'd appreciate if you'd let me know about 
	  that. My email is
  
       alexander-berthold@web.de
  
      If you share this code, do not remove this text.
    ---------------------------------------------------------------

Class:      cxAnalyzerMain
Author:     Alexander Berthold
Copyright:  Alexander Berthold
Date:       2001/12/19
Version:	0.2.01
Purpose:    Main class of this package. 
			- Stores the analyzer rules describing the grammar 
			  (m_mapAtmAEList).
			- Stores implicit mappings (m_mapAtmImplicit)
			- Provides the functionality to analyze a sequence
			  of tokens for a given rule (fCheckRule).


Version history:

	-	2001/06/01
		Optimized fCheckRule. If used for checking for a rule
		which is finite, it was still tried to test if the rule
		continues.

	-	2001/06/02
		Added cxaStatusCookie. Intended for providing error
		information.

	-	2001/06/02
		Source labeled version 0.1.06

	-	2001/06/12
		Fixed some bugs with cxaStatusCookie. Improved accuracy
		of returned information.
		Version number is now 0.1.07

	-	2001/12/16
		Improved speed by introducing a rule cache and by minor
		other optimizations. Currently, the rule cache caches only successfully
		parsed sub-expressions. I will extend it to cache also failed sub-expressions,
		because this saves even more time under certain (but probable) circumstances.

	-	2001/12/19
		Labeled version 0.2.01

ToDo:
	-	Further optimize fCheckRule:
		Currently it is only being differentiated between finite
		and non-finite rules. Can be improved by differentiating
		between left-only non-finite rules and right-only non-finite
		rules.

	-	Further optimize the code. There is still 30%-50% more speed by
		optimizing easily possible.

*********************************************************************/

// cxAnalyzerMain.h: interface for the cxAnalyzerMain class.
//
//////////////////////////////////////////////////////////////////////

#if !defined(AFX_CXANALYZERMAIN_H__CCC86421_3E3E_4694_A24A_2CDC157E268B__INCLUDED_)
#define AFX_CXANALYZERMAIN_H__CCC86421_3E3E_4694_A24A_2CDC157E268B__INCLUDED_

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000

class cxAnalyzerTree;

class cxaStatusCookie
{
// Construction/Destruction
public:
	cxaStatusCookie()
		{
		nMaxLength	=0;
		nCurLength	=0;
		lCurPosition=0;
		lMaxPosition=0;
		nBrkCause	=0;
		patToken	=NULL;
		lPosition	=0;
#ifdef ANALYZER_STATS_ENABLED
		memset( (void*)&stats,0,sizeof(stats));
#endif
		};

	cxaStatusCookie(int nLeft, int nRight, const cxaToken* patToken)
		{
		nMaxLength	=0;
		nCurLength	=0;
		lCurPosition=nLeft;
		lMaxPosition=nRight;
		nBrkCause	=0;
		nBrkAtmType	=0;
		nBrkIDValue	=0;
		patToken	=patToken;
		lPosition	=nLeft;
#ifdef ANALYZER_STATS_ENABLED
		memset(&stats,0,sizeof(stats));
#endif
		}

// Attributes
public:
	enum	{
		brkcause_ok=0,
		brkcause_eots,
		brkcause_wrongtoken,
		brkcause_subrulefailed,
		brkcause_notokens,
		brkcause_unexpected
		};

	int		nMaxLength;			/* Internally used */
	int		nCurLength;			/* Internally used */
	long	lMaxPosition;		/* Internally used */
	long	lCurPosition;		/* Internally used */
	int		nBrkAtmType;		// AtmType of the analyzer expression which hit the end of the token stream
	int		nBrkIDValue;		// IDValue "   "     "         ...

protected:
	int		nBrkCause;			/* parsing error cause */
public:
	const cxaToken* patToken;	/* token where the error happened */
	long	lPosition;			/* Position in the input stream where the error happened */
	long	lLastPosition;		/* Position in the input stream where the error 'ended' */

#ifdef ANALYZER_STATS_ENABLED
// Statistics
public:
	struct
		{
		int		nRuleCallsTotal;
		int		nSuccessfulRuleCallsTotal;
		int		nAERuleCallsTotal;
		int		nSuccessfulAERuleCallsTotal;
		int		nMaxRecursionLevel;
		int		nCurRecursionLevel;
		int		nSkipped;
		}
	stats;
#endif

// Operations
public:
	void	vSetBrkCause(int nBrkCause_, const cxaToken* patToken_, long lEndPosition = 0)
		{
		ASSERT(nBrkCause_!=brkcause_eots);
		if(nBrkCause==brkcause_eots) // must be preserved!
			return; 
		if(nCurLength>=nMaxLength)
			{
			nBrkCause=nBrkCause_;
			nMaxLength=nCurLength;
			patToken=patToken_;
			lPosition=(patToken_!=NULL)?patToken_->nGetTokenOrderID():-1;
			lLastPosition=lEndPosition;
			if(lCurPosition>lMaxPosition)
				lMaxPosition = lCurPosition;
			}
		}

	void	vSetBrkCauseEots(int nAtmType, int nIDValue, const cxaToken* _patToken)
		{
		if(nBrkCause==brkcause_eots)
			return;

		nBrkCause = brkcause_eots;
		nBrkAtmType = nAtmType;
		nBrkIDValue = nIDValue;
		patToken=_patToken;
		lPosition=(patToken!=NULL)?patToken->nGetTokenOrderID():-1;
		lLastPosition=-1;
		lMaxPosition = lCurPosition;
		}

	int		nGetBrkCause() const { return nBrkCause; };

	void	vReset()
		{
		nMaxLength	=0;
		nCurLength	=0;
		lCurPosition=0;
		lMaxPosition=0;
		nBrkCause	=0;
		patToken	=NULL;
		lPosition	=0;
#ifdef ANALYZER_STATS_ENABLED
		memset( (void*)&stats,0,sizeof(stats));
#endif
		}
};

/*
 Callback class for resolving unknown tokens. The callback method
 doesn't need to be highly optimized because the results will be 
 cached (currently not 100% true, see optimization note above).
 */
class cxAnalyzerUnknownTokenCallback
{
public:
	virtual bool nResolveUnknownToken(
					cxAnalyzerTree *patTree,
					int nAtmTypeExpected,
					const cxaToken* patToken,
					int *pnAtmTypeRet, int *pnIDValueRet,
					void **ppvDataRet) const = 0;
};

class cxAnalyzerMain
{
// Construction/Destruction
public:
	cxAnalyzerMain(cxAnalyzerTypeMap *patmTypeMap);
	virtual ~cxAnalyzerMain();

	// Sub-Class for storing data on implicit mappings (ex.: {.expr}=0:{!number} )
	struct	atm_imp {
		atm_imp(int _nAtmType, int _nIDValue, cxAnalyzerExpression *_pExpr)
			{
			nAtmType	=_nAtmType;
			nIDValue	=_nIDValue;
			pExpr		=_pExpr;
			}
		int		nAtmType;
		int		nIDValue;
		cxAnalyzerExpression *pExpr;
		};

// Typedefs
public:
	typedef std::list<cxAnalyzerExpression*>
						ae_list_type;
	typedef std::vector<cxAnalyzerExpression*>
						ae_vec_type;
	typedef	std::map<int,ae_list_type*>
						atmae_map_type;
	typedef std::multimap<int,atm_imp>
						atmimp_map_type;

// Attributes
protected:
	// Already initialized?
	bool				m_fInitialized;
	// The TypeMap this expression belongs to
	cxAnalyzerTypeMap	*m_patmTypeMap;
	// Implicit mappings
	atmimp_map_type		m_mapAtmImplicit;
	// Map from nAtmID's to cxAnalyzerExpression (length sorted) lists
	atmae_map_type		m_mapAtmAEList;
	// Vector of cxAnalyzerExpression*'s to delete
	ae_vec_type			m_vecAEToDelete;
	// Callback class for unknown tokens
	cxAnalyzerUnknownTokenCallback *m_pacbUnknownToken;
#ifndef ANALYZER_NO_OPTIMIZATION
	cxaRuleCache		m_arcRuleCache;
#endif

// Protected operations
protected:
	// Registers the cxAnalyzerExpression.
	void				vRegisterAE(cxAnalyzerExpression* paeExpr);

#ifndef ANALYZER_NO_OPTIMIZATION

	// Insert the rule specified through the tuple 'nAtmTypeFirst',
	// 'start' and 'ende' into the rule cache. 'cpStart' and 'cpEnd'
	// contain the rule log start and end positions of the rule to
	// insert.
	void				vInsertIntoRuleCache(
										cxAnalyzerTree *patTree,
										int nRuleIDValue, int nAtmTypeFirst,
										const cxaToken* patFirstToken,
										bool fCheckFirstIs,
										cxAnalyzerTree::chkpoint_type cpStart,
										cxAnalyzerTree::chkpoint_type cpEnd,
										cxaTokenStream::const_iterator start,
										cxaTokenStream::const_iterator end);
#endif


	// Check if the given position in the token stream matches
	// the expression 'nAtmTypeTestFor', and determine (if it matches)
	// the next token after the expression.
	bool				fCheckRuleInt(	int nAtmTypeTestFor,
										const cxaTokenStream* patsContext,
										int nAtmTypeFirstIs, /*= PTTM_ID_INVALID */
										const cxaToken* patFirstToken /* = NULL */,
										cxaStatusCookie* pascCondition,
										cxaTokenStream::const_iterator start,
										cxaTokenStream::const_iterator *pend,
										cxAnalyzerTree* patTree);

// Operations
public:

	/*** Debugging support ***/
	bool				fCheckValid() const { return true; };
	void				vDump() const		{ ASSERT(FALSE); };


	/*** Initialization ***/
	bool				fLoadFromStream(std::basic_istream<TCHAR>& input);
	void				vReset();

	/*** Raw data access ***/
	const ae_vec_type&	paevecGetExpressionsVector() const
						{ return m_vecAEToDelete; };

	/*** Analyzer operations ***/
	// Test if 'nAtmTypeTestFor' belongs implictly to the
	// expression of type 'nAtmTypeSource'
	bool				fIsImplicitRule(int nAtmTypeTestFor, 
										int nAtmTypeSource,
										int *pnIDValue,
										cxAnalyzerExpression **ppaeExpr = NULL) const;

	bool				fCheckRule(	int nAtmTestFor,
									cxaTokenStream* patsContext,
									cxaStatusCookie* pascCondition,
									cxaTokenStream::const_iterator start,
									cxaTokenStream::const_iterator *pend,
									cxAnalyzerTree* patTree);

	// Virtual member function which is called everytime a ATM_ID_LITERAL
	// token is found (and no exact pattern match was possible i.e. the
	// patterns' token was not ATM_ID_LITERAL).
	virtual bool		nResolveUnknownToken(	cxAnalyzerTree* patTree,
												int nAtmTypeExpected,
												const cxaToken* patToken,
												int *pnAtmTypeRet, int *pnIDValueRet, void **ppvDataRet) const
		{
		if(m_pacbUnknownToken!=NULL)
			return m_pacbUnknownToken->nResolveUnknownToken(patTree,nAtmTypeExpected,patToken,pnAtmTypeRet,pnIDValueRet,ppvDataRet);
		else
			return false;
		}

	void				vSetUnknownTokenCallback(cxAnalyzerUnknownTokenCallback* pacbUnknownToken)
		{
		m_pacbUnknownToken = pacbUnknownToken;
		}

#ifndef ANALYZER_NO_OPTIMIZATION
	void				vClearCache() { m_arcRuleCache.clear(); };
#endif

	friend class cxAnalyzer;
	friend class cxAnalyzerExpression;
};

#endif // !defined(AFX_CXANALYZERMAIN_H__CCC86421_3E3E_4694_A24A_2CDC157E268B__INCLUDED_)

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
Germany Germany
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions