/*********************************************************************
Copyright (C) 2001/2 by
Alexander Berthold, alexander-berthold@web.de.
Hoegestr. 54
79108 Freiburg i. Breisgau
Germany
-- This file is part of cxTokenizer --
"cxTokenizer" is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2 of the License, or any later version.
"cxTokenizer" is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with "cxTokenizer"; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330,
Boston, MA 02111-1307 USA
---------------------------------------------------------------
If you find any bugs or if you make other corrections/
enhancements, i'd appreciate if you'd let me know about
that. My email is
alexander-berthold@web.de
If you share this code, do not remove this text.
---------------------------------------------------------------
Class: cxTokenizerMap
Author: Alexander Berthold
Copyright: Alexander Berthold
Date: 2001/12/19
Version: 0.2.01
Purpose: Container class for the nested maps of rules.
A short explanation:
- There are two different sets of rules: 'first position'
rules and 'within' rules. 'within' rules can appear
anywhere, like the C++ bracket ('(') token. 'First position' rules
are those which apply only to a complete string. Sample:
return(5);
Here, '(' is found and breaks the input text into
'return' and '('. If there is a 'first position' rule which
matches 'return', it is used and recognized. If not, the
'NULL' rule applies.
- To speed up the recognition process, all rules which could
apply are tested in parallel. C++ operators '=' and '==',
for example, both start with the same character. So when the
tokenizer reads a '=', it finds that both rules could be
valid. If the next character is _not_ another '=',
it recognizes an '=' token. If it _is_ '=', then the
'==' token is recognized.
- This is done by traversing a tree structure, which
would look like this for the tokens 'if', 'elseif', 'else'
and 'endif':
1st 2nd 3rd 4th 5th 6th 7th
'i'--'f'--(*)
'e'--'l'--'s'--'e'--(*)
| |
| +---'i'--'f'--(*)
|
+---'n'--'d'--'i'--'f'--(*)
(*) stands for a recognized token.
Version history:
- 2001/05/19
Renamed class from 'cpLexxerMap' to 'cxTokenizerMap'.
- 2001/12/19
Current source labeled version 0.2.01
*********************************************************************/
// cxTokenizerMap.h: interface for the cxTokenizerMap class.
//
//////////////////////////////////////////////////////////////////////
#if !defined(AFX_CXTOKENIZERMAP_H__F4E70A9E_72B3_4707_85A1_C5FDB8CE26C3__INCLUDED_)
#define AFX_CXTOKENIZERMAP_H__F4E70A9E_72B3_4707_85A1_C5FDB8CE26C3__INCLUDED_
#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
// Forward declaration only; cxTokenizerInputStream is not referenced by name
// in the declarations visible in this header.
class cxTokenizerInputStream;
// cxTokenizerMap
//
// Holds the two rule maps consulted by the tokenizer: one for rules that may
// only match starting at the first character of a new token ("FP", first
// position) and one for rules that may also match inside other tokens
// ("WI", within). Derives from ctkCheckValid and grants cxTokenizer friend
// access to its internals.
class cxTokenizerMap : public ctkCheckValid
{
public:
    // ----- Construction / destruction -----------------------------------

    cxTokenizerMap();

    // Constructs the map from a character stream.
    // NOTE(review): presumably forwards to vLoadFromStream(input) - confirm
    // in the implementation file.
    cxTokenizerMap(std::basic_istream<TCHAR>& input);

    virtual ~cxTokenizerMap();

    // ----- Types ----------------------------------------------------------

    // Vector of rule pointers; the element type used by m_vecRulesToDelete.
    typedef std::vector<cxTokenizerTokenRule*> ttrdel_vec_type;

protected:
    // ----- State ----------------------------------------------------------

    // Whether the map has already been initialized.
    bool                m_fInitialized;

    // FP (first position) rules: apply only from the first character of a
    // new token.
    cxTokenizerMapData  m_tmdFPRules;

    // WI (within) rules: may also appear within other tokens.
    cxTokenizerMapData  m_tmdWIRules;

    // NOTE(review): judging by the name, rules that this map is responsible
    // for deleting - confirm against the destructor.
    ttrdel_vec_type     m_vecRulesToDelete;

    // ----- Internal helpers -----------------------------------------------

    // Registers a character-based rule in the given rule map.
    // pptmdEnd is optional (defaults to NULL).
    // NOTE(review): presumably receives the map node at which the rule ends
    // - confirm in the implementation file.
    bool fRegisterCharRule(
        cxTokenizerMapData&         maptRules,
        cxTokenizerCharTokenRule*   pRule,
        cxTokenizerMapData**        pptmdEnd = NULL);

    // Registers a computed (non character-based) rule in the given rule map.
    bool fRegisterComputedRule(
        cxTokenizerMapData&     maptRules,
        cxTokenizerTokenRule*   pRule);

public:
    // ----- ctkCheckValid interface ----------------------------------------

#ifdef _DEBUG
    // Debug builds: validity check and diagnostics are defined out of line.
    // fRunDiagnostics() is declared in _DEBUG builds only.
    virtual bool fCheckValid() const;
    static bool fRunDiagnostics();
#else
    // Non-debug builds: the validity check is a no-op that reports success.
    virtual bool fCheckValid() const { return true; }
#endif

    // ----- Public operations ----------------------------------------------

    // Registers a token rule with this map.
    bool fRegisterRule(cxTokenizerTokenRule* pRule);

    // Read-only access to the 'first position' rule map.
    const cxTokenizerMapData* ptmdGetFPRules() const { return &m_tmdFPRules; }

    // Read-only access to the 'within' rule map.
    const cxTokenizerMapData* ptmdGetWIRules() const { return &m_tmdWIRules; }

    // Looks up a rule by its ID value.
    // NOTE(review): result for an unknown ID is not visible here (presumably
    // NULL) - confirm in the implementation file.
    const cxTokenizerTokenRule* pttrGetRuleForID(int nIDValue) const;

    // Looks up a rule by its token string.
    // NOTE(review): result for an unknown string is not visible here
    // (presumably NULL) - confirm in the implementation file.
    const cxTokenizerTokenRule* pttrGetRuleForString(
        const std::tstring& strToken) const;

    // Builds a rule from an ID, a token string, an init string and a
    // separator flag.
    // NOTE(review): strToken is taken by value here; switching to a const
    // reference would require changing the out-of-line definition in step.
    bool fBuildRule(
        int                 nIDValue,
        const std::tstring  strToken,
        const std::tstring& strInitString,
        bool                fSeperator);

    // Same parameters as fBuildRule plus an existing rule object.
    // NOTE(review): how pRule is combined with the built rule, and who owns
    // it afterwards, is not visible here - confirm in the implementation.
    bool fBuildHybridRule(
        int                     nIDValue,
        const std::tstring      strToken,
        const std::tstring&     strInitString,
        bool                    fSeperator,
        cxTokenizerTokenRule*   pRule);

    // Loads the rule definitions from a character stream.
    void vLoadFromStream(std::basic_istream<TCHAR>& input);

    // Init notification taking an init flag and the tokenizer concerned.
    // NOTE(review): presumably invoked by cxTokenizer when it attaches to or
    // detaches from this map - confirm against the caller.
    void vInitNotify(bool fInit, cxTokenizer* pxTokenizer);

    // The tokenizer reads the rule maps directly.
    friend class cxTokenizer;
};
#endif // !defined(AFX_CXTOKENIZERMAP_H__F4E70A9E_72B3_4707_85A1_C5FDB8CE26C3__INCLUDED_)