/*********************************************************************
Copyright (C) 2001 by
Alexander Berthold, alexander-berthold@web.de.
Hoegestr. 54
79108 Freiburg i. Breisgau
Germany
-- This file is part of cxTokenizer --
"cxTokenizer" is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2 of the License, or any later version.
"cxTokenizer" is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with "cxTokenizer"; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330,
Boston, MA 02111-1307 USA
---------------------------------------------------------------
If you find any bugs or if you make other corrections/
enhancements, i'd appreciate if you'd let me know about
that. My email is
alexander-berthold@web.de
If you share this code, do not remove this text.
---------------------------------------------------------------
Class: cxTokenizer
Author: Alexander Berthold
Copyright: Alexander Berthold
Date: 2001/12/19
Version: 0.2.01
Purpose: - Main class for the lexical analyzer.
- Scans the input character stream for tokens.
- Calls a call-back class ('cxTokenizerListener') for each
recognized token.
- Uses an incremental scan; should be pretty fast.
- Can also parse computed tokens, like "text" and 1234.
Version history:
- 2001/04/19
First functional version implemented and released (0.1.13).
- 2001/05/11
Minor allocation bug fixed.
- 2001/05/19
Renamed project from 'cpLexxer' to 'cxTokenizer'.
- 2001/06/01
Added vSetInputStream() to dynamically change the source
input stream.
- 2001/06/02
Current source labeled version 0.1.14
- 2001/06/04
Modified cxTokenizerListener interface; added parameter
'ptisStream' allowing to query the current tokens' position
within the input stream.
- 2001/06/05
Current source labeled version 0.1.15
- 2001/06/12
Current source labeled version 0.1.16
- 2001/12/08
Fixed bug when leaving exclusive mode.
Produces expected results now.
- 2001/12/19
Labeled 0.2.01
*********************************************************************/
// cxTokenizer.h: interface for the cxTokenizer class.
//
//////////////////////////////////////////////////////////////////////
#if !defined(AFX_CXTOKENIZER_H__D79EE6C7_40E0_4452_A7A4_20278169F438__INCLUDED_)
#define AFX_CXTOKENIZER_H__D79EE6C7_40E0_4452_A7A4_20278169F438__INCLUDED_
#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
// NOTE(review): this header contains no #include directives of its own, yet it
// uses std::stack, std::set, std::tstring, TCHAR, ctkCheckValid,
// ctkExternalObjectPointer, cxTokenizerContext, cxTokenizerTokenRule,
// cxTokenizerMapData and PURE_VIRTUAL. Presumably those are made visible by
// whatever is included before this file -- confirm the required include order.

// Number of slots in the substitution-listener array held by cxTokenizer
// (cxTokenizer::m_aptslSubstitutors).
#define MAX_SUBSTITUTORS 16
// Forward declarations: within this header these types are only used
// through pointers, so their full definitions are not required here.
class cxTokenizerMap;
class cxTokenizerInputStream;
class cxTokenizerSubstitutionListener;
// Callback interface through which a cxTokenizer reports recognized tokens.
// Derive from this class, implement vRegisterToken() and pass the object to
// a cxTokenizer constructor as 'ptlReceiver'.
class cxTokenizerListener :
    public ctkCheckValid,
    public ctkExternalObjectPointer
{
// Construction/Destruction
public:
    // Virtual destructor so that an implementation can be destroyed safely
    // through a pointer to this interface. Harmless if a base class already
    // declares a virtual destructor.
    virtual ~cxTokenizerListener() {}

// Operations
public:
    // Called by the tokenizer for each recognized token.
    //   strTokenText - text of the recognized token
    //   pltrRule     - token rule associated with the token
    //                  (NOTE(review): whether this may be NULL is not
    //                  visible from this header -- confirm)
    //   ptisStream   - input stream the token came from; allows querying the
    //                  current token's position within the input stream
    virtual void vRegisterToken(const std::tstring& strTokenText,
                                const cxTokenizerTokenRule* pltrRule,
                                const cxTokenizerInputStream* ptisStream) PURE_VIRTUAL;
};
// Lexical analyzer: scans an input character stream for tokens and reports
// each recognized token to a cxTokenizerListener.
//
// NOTE(review): the class declares a destructor and stores raw pointers
// (token map, listener, input streams), but declares neither a copy
// constructor nor a copy assignment operator. A copy would share the same
// pointers -- confirm whether copying is intended to be supported.
class cxTokenizer
{
// ---- Construction / destruction ----
public:
    // Creates a tokenizer from a token map and a listener only
    // (no input stream argument).
    cxTokenizer(cxTokenizerMap* ptmLexxerMap,
                cxTokenizerListener* ptlReceiver);

    // Same as above, additionally taking an input stream 'ptisData'.
    cxTokenizer(cxTokenizerInputStream* ptisData,
                cxTokenizerMap* ptmLexxerMap,
                cxTokenizerListener* ptlReceiver);

    virtual ~cxTokenizer();

// ---- Type aliases ----
protected:
    typedef std::stack<cxTokenizerInputStream*> stktis_type;
    typedef std::set<cxTokenizerInputStream*>   settis_type;

// ---- Data members ----
protected:
    // State of the lexer.
    cxTokenizerContext m_tcContextInfo;

    // The 'tree' that contains the tokens.
    cxTokenizerMap* m_ptmLexxerMap;

    // Stack of input streams.
    stktis_type m_stkPtisData;

    // Set of the distinct input streams that have been used; kept for cleanup.
    settis_type m_setPtisData;

    // Receiver of the token output.
    cxTokenizerListener* m_ptlReceiver;

    // Listeners acting as substitutors for unknown tokens
    // (fixed capacity: MAX_SUBSTITUTORS entries).
    cxTokenizerSubstitutionListener* m_aptslSubstitutors[MAX_SUBSTITUTORS];

// ---- Validity checking (ctkCheckValid-style operations) ----
public:
#ifdef _DEBUG
    // Debug builds: implemented out of line.
    virtual bool fCheckValid() const;
    static bool fRunDiagnostics();
#else
    // Non-debug builds: always reports the object as valid.
    virtual bool fCheckValid() const
    {
        return true;
    }
#endif

// ---- Internal operations ----
protected:
    // -- Token recognition --

    // A delimiting token has been recognized.
    void vDelimTokenRecognized();

    // A non-delimiting token has been recognized.
    bool fRecognizeNonDelimTokens(std::tstring strTokenText);

    // -- Post-recognition handling --

    // Invoked when a token could not be resolved (i.e. a literal).
    // NOTE(review): 'strSubstitute' is a non-const reference and presumably
    // receives the replacement text -- confirm in the implementation.
    bool fSubstituteUnknownToken(const std::tstring& strToken,
                                 const std::tstring& strDelim,
                                 cxTokenizerInputStream* ptisStream,
                                 std::tstring& strSubstitute);

    // Invoked by the lexer once a token has been recognized.
    void vRegisterToken(const std::tstring& strTokenText,
                        cxTokenizerTokenRule* pttrRule,
                        const cxTokenizerInputStream* ptisStream);

    // -- Helpers --

    // Finds the longest currently active (completed) rule.
    // NOTE(review): the function name says "earliest" while this description
    // (carried over from the original comment) says "longest" -- confirm.
    cxTokenizerContext::cxListEntry* pteFindEarliestCompletedToken();

    // Finds the longest rule that has been applied and not yet invalidated.
    // NOTE(review): same "earliest" vs. "longest" wording mismatch as above.
    cxTokenizerContext::cxListEntry* pteFindEarliestAppliedToken();

    // Marks every active rule except 'pEntry' for deletion.
    void vMarkAllForDeletionExcept(const cxTokenizerContext::cxListEntry* pEntry);

    // Exclusive mode has been entered.
    bool fExclusiveModeEntered(bool fHybrid,
                               cxTokenizerMapData* ptmdNode,
                               TCHAR tcChar,
                               std::tstring& strCurrentText);

// ---- Public operations ----
public:
    // Completely resets the internal tokenizer state: cleans up the input
    // streams and resets the state. The token map is NOT cleared and the
    // listener members are NOT reset.
    void vReset();

    // Cleans up the input streams that have been used. May only be called
    // while the stack of currently open input streams is empty.
    void vCleanupInputStreams();

    // Pushes 'pxtisInput' onto the input stream stack.
    void vPushInputStream(cxTokenizerInputStream* pxtisInput);

    // Detaches an input stream, or all of them when 'pxtisInput' is NULL.
    // To succeed, 'pxtisInput' must be m_stkPtisData.top() or NULL.
    void vDetachInputStream(cxTokenizerInputStream* pxtisInput = NULL);

    // Adds a listener for token substitution.
    void vAddSubstitutorListener(cxTokenizerSubstitutionListener* pxtslSubstitutor);

    // Removes a listener for token substitution.
    void vRemoveSubstitutorListener(cxTokenizerSubstitutionListener* pxtslSubstitutor);

    // Returns the input stream on top of the stack, or NULL when the stack
    // is empty.
    cxTokenizerInputStream* ptiGetInputStream() const
    {
        if (m_stkPtisData.empty())
            return NULL;
        return m_stkPtisData.top();
    }

    // Access to the tokenizer context (mutable and read-only variants).
    cxTokenizerContext* ptcGetContext()
    {
        return &m_tcContextInfo;
    }
    const cxTokenizerContext* ptcGetContext() const
    {
        return &m_tcContextInfo;
    }

    // Parses the next character. 'fOverrideIsLastChar' tells whether that
    // character is to be treated as if it were the last one of the stream.
    void vParseCharacter(bool fOverrideIsLastChar = false);

    // Dumps the given list (presumably a diagnostic aid -- confirm).
    void dumpList(cxTokenizerContext::tc_list_type* plist);
};
#endif // !defined(AFX_CXTOKENIZER_H__D79EE6C7_40E0_4452_A7A4_20278169F438__INCLUDED_)