/*********************************************************************
Copyright (C) 2001 by
Alexander Berthold, alexander-berthold@web.de.
Hoegestr. 54
79108 Freiburg i. Breisgau
Germany
-- This file is part of cxTokenizer --
"cxTokenizer" is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2 of the License, or any later version.
"cxTokenizer" is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with "cxTokenizer"; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330,
Boston, MA 02111-1307 USA
---------------------------------------------------------------
If you find any bugs or if you make other corrections/
enhancements, i'd appreciate if you'd let me know about
that. My email is
alexander-berthold@web.de
If you share this code, do not remove this text.
---------------------------------------------------------------
*********************************************************************/
// cxTokenizer.cpp: implementation of the cxTokenizer class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"

#include <climits>

#include "cxTokenizerTokenRule.h"
#include "cxTokenizerCharTokenRule.h"
#include "cxTokenizerContextCookie.h"
#include "cxTokenizerContext.h"
#include "cxTokenizerMapData.h"
#include "cxTokenizerMap.h"
#include "cxTokenizerInputStream.h"
#include "cxTokenizer.h"
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
#include "cxTokenizerDiags.cpp"
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
cxTokenizer::cxTokenizer(cxTokenizerMap *ptmLexxerMap,
                         cxTokenizerListener *ptlReceiver)
{
	// Validate each collaborator (pointer and internal state)
	// before storing it.
	ASSERT(ptmLexxerMap!=NULL);
	ASSERT(ptmLexxerMap->fCheckValid());
	ASSERT(ptlReceiver!=NULL);
	ASSERT(ptlReceiver->fCheckValid());

	// No input stream yet; the caller must supply one via
	// vSetInputStream() before parsing can start.
	m_ptisData = NULL;
	// Rule map (the 'path' tree towards the tokens) and the
	// listener that receives the output token stream.
	m_ptmLexxerMap = ptmLexxerMap;
	m_ptlReceiver = ptlReceiver;
	// Begin with a clean parser context.
	m_tcContextInfo.vClearAllFlags();
}
cxTokenizer::cxTokenizer(cxTokenizerInputStream *ptisData,
                         cxTokenizerMap *ptmLexxerMap,
                         cxTokenizerListener *ptlReceiver)
{
	// Validate each collaborator (pointer and internal state)
	// before storing it.
	ASSERT(ptisData!=NULL);
	ASSERT(ptisData->fCheckValid());
	ASSERT(ptmLexxerMap!=NULL);
	ASSERT(ptmLexxerMap->fCheckValid());
	ASSERT(ptlReceiver!=NULL);
	ASSERT(ptlReceiver->fCheckValid());

	// Input stream, rule map (the 'path' tree towards the tokens)
	// and the listener that receives the output token stream.
	m_ptisData = ptisData;
	m_ptmLexxerMap = ptmLexxerMap;
	m_ptlReceiver = ptlReceiver;
	// Begin with a clean parser context.
	m_tcContextInfo.vClearAllFlags();
}
cxTokenizer::~cxTokenizer()
{
	// Dispose of the input stream, but only if it has asked to be
	// deleted by its owner (fShouldDelete() policy).
	if(m_ptisData)
	{
		ASSERT(m_ptisData->fCheckValid());
		if(m_ptisData->fShouldDelete())
		{
			delete m_ptisData;
			m_ptisData = NULL;
		}
	}
	// The token listener follows the same ownership policy.
	if(m_ptlReceiver)
	{
		ASSERT(m_ptlReceiver->fCheckValid());
		if(m_ptlReceiver->fShouldDelete())
		{
			delete m_ptlReceiver;
			m_ptlReceiver = NULL;
		}
	}
}
//////////////////////////////////////////////////////////////////////
// Protected internal token recognition operations
//////////////////////////////////////////////////////////////////////
/*********************************************************************
	FUNCTION:	cxTokenizer::vRecognizeNonDelimTokens
	PURPOSE:	Internally called for the stretch of text between
			delimiting tokens. Walks the non-delimiting ("FP")
			rule tree character by character; if a rule matches
			the whole stretch, the text is registered as that
			token, otherwise as an untyped token (NULL rule).
	RETURNS:	- void -
*********************************************************************/
void cxTokenizer::vRecognizeNonDelimTokens(std::tstring strTokenText)
{
// Nothing to analyze for an empty stretch.
if(strTokenText.length()==0)
return;
// TODO: Extend this function to correctly handle the flag
// ooptrf_intermediate (currently, only one token is recognized)
// Start at the root of the non-delimiting rule tree
// (m_tmdFPRules -- presumably "full phrase" rules; TODO confirm).
cxTokenizerMapData *ptmdCurRule = &m_ptmLexxerMap->m_tmdFPRules;
cxTokenizerMapData *ptmdNextRule = NULL;
int i,len;
TCHAR tcChar;
bool fComplete = false;
// len is captured once; the loop appends to the shared context
// text but never to strTokenText, so this is safe.
len =strTokenText.length();
for(i=0;i<len;i++)
{
tcChar =strTokenText[i];
// Mirror the character into the context's current text so the
// rules see the same running text as during normal parsing.
m_tcContextInfo.strGetCurrentText()+=tcChar;
// Descend one level in the rule tree; NULL means no rule
// matches this character anymore.
ptmdNextRule =ptmdCurRule->ptmdDoesApply(&m_tcContextInfo,tcChar,&fComplete,0,i);
if(ptmdNextRule==NULL)
break;
ptmdNextRule->vApplied(&m_tcContextInfo,tcChar,fComplete,0,i);
ptmdCurRule =ptmdNextRule;
}
// Only if a rule survived the whole stretch AND reported completion
// is the text registered as a typed token.
if(ptmdNextRule!=NULL && fComplete)
{
std::tstring strTemp;
// The rule may substitute its own result string for the raw text.
if(ptmdNextRule->pttrGetRule()->fGetResultString(&m_tcContextInfo,strTemp))
strTokenText=strTemp;
vRegisterToken(strTokenText,ptmdNextRule->pttrGetRule(),m_ptisData);
}
else
// Otherwise register the raw text as an untyped token.
vRegisterToken(strTokenText,NULL,m_ptisData);
}
/*********************************************************************
	FUNCTION:	cxTokenizer::vDelimTokenRecognized
	PURPOSE:	Internally called when a delimiting token has been
			recognized. Calls vRecognizeNonDelimTokens() to
			analyze the part between the last delimiting token
			and this one. Afterwards registers the delimiting
			token via vRegisterToken() and pushes any trailing
			characters back onto the input stream.
	THROWS:		cxTokenizerException(ERR_UNEXPECTED_TOKEN) if the
			rule list does not contain exactly one entry;
			cxTokenizerException(ERR_COMPILER_UNSPECIFIED) if
			the surviving rule is not a leaf.
	RETURNS:	- void -
*********************************************************************/
void cxTokenizer::vDelimTokenRecognized()
{
	// Token has been recognized
	cxTokenizerContext::tc_list_type
			*plstRuleList;
	cxTokenizerContext::cxListEntry
			*pteCur;
	cxTokenizerMapData *ptmdRule;
	cxTokenizerTokenRule *pttrDelimTokenRule;
	std::tstring strDelimToken;
	std::tstring strBetweenToken;
	std::tstring strPutBack;
	// (Removed unused iterator locals 'it'/'cit' from the original.)

	// There must be exactly one entry in the list of tokens
	plstRuleList =m_tcContextInfo.plstGetTokenRuleList();
	ASSERT(plstRuleList!=NULL);
	ASSERT(plstRuleList->size()==1);
	if(plstRuleList->size()!=1)
		throw cxTokenizerException(ERR_UNEXPECTED_TOKEN);

	// This one token is the delimiting token; it must be a leaf rule.
	pteCur =*plstRuleList->begin();
	ptmdRule =pteCur->ptmdGetLastValidRule();
	if(ptmdRule->fIsLeaf()==false)
		throw cxTokenizerException(ERR_COMPILER_UNSPECIFIED);
	pttrDelimTokenRule =ptmdRule->pttrGetRule();

	// Split the accumulated context text into three parts:
	// the delimiting token itself, the text before it, and the
	// remainder that has to be re-parsed later.
	strDelimToken =m_tcContextInfo.strGetCurrentText().substr(
						pteCur->nGetStartPosition(),
						pteCur->nGetEndPosition()-pteCur->nGetStartPosition());
	strBetweenToken =m_tcContextInfo.strGetCurrentText().substr(
						0,
						pteCur->nGetStartPosition());
	strPutBack =m_tcContextInfo.strGetCurrentText().substr(
						pteCur->nGetEndPosition());

	// The rule may substitute its own result string for the raw text.
	std::tstring strTemp;
	if(pttrDelimTokenRule->fGetResultString(&m_tcContextInfo,strTemp))
		strDelimToken =strTemp;

	// Now parse the text before the delimiting token
	// (can consist of multiple tokens)
	m_tcContextInfo.vCleanUpAfterTokenRecognition();
	vRecognizeNonDelimTokens(strBetweenToken);
	// Register the delimiting token
	vRegisterToken(strDelimToken,pttrDelimTokenRule,m_ptisData);
	// Clean up the parser context
	m_tcContextInfo.vCleanUpAfterTokenRecognition();
	// Put back the remaining characters. c_str() instead of data():
	// before C++11, basic_string::data() is not guaranteed to be
	// NUL-terminated, while c_str() always is.
	m_ptisData->fPutBack(strPutBack.c_str());
}
//////////////////////////////////////////////////////////////////////
// Protected post token recognition operations
//////////////////////////////////////////////////////////////////////
void cxTokenizer::vRegisterToken(const std::tstring& strTokenText,
                                 cxTokenizerTokenRule* pttrRule,
                                 const cxTokenizerInputStream* ptisStream)
{
	// Forward the recognized token to the registered listener;
	// the listener must still be in a valid state.
	ASSERT(m_ptlReceiver->fCheckValid());
	m_ptlReceiver->vRegisterToken(strTokenText, pttrRule, ptisStream);
}
//////////////////////////////////////////////////////////////////////
// Protected helper operations
//////////////////////////////////////////////////////////////////////
/*********************************************************************
	FUNCTION:	cxTokenizer::pteFindEarliestCompletedToken
	PURPOSE:	Searches for the rule(token) with the status
			'completed' and which started earliest
			(minimal nStartPosition). Entries marked for
			deletion are skipped.
	RETURNS:	cxTokenizerContext::cxListEntry* -- the earliest
			completed entry, or NULL if none qualifies.
*********************************************************************/
cxTokenizerContext::cxListEntry
	*cxTokenizer::pteFindEarliestCompletedToken()
{
	cxTokenizerContext::tc_list_type
			*plstRuleList;
	cxTokenizerContext::rulelist_iterator
			cit;
	plstRuleList =m_tcContextInfo.plstGetTokenRuleList();
	// Search for the rule which started earliest.
	// INT_MAX instead of the original 0x7fff sentinel: start
	// positions can exceed 32767 for long inputs, which would have
	// made every entry "too late" and returned NULL incorrectly.
	int nMRPos = INT_MAX;
	cxTokenizerContext::cxListEntry
			*pteMREntry = NULL;
	for(cit=plstRuleList->begin();cit!=plstRuleList->end();cit++)
	{
		cxTokenizerContext::cxListEntry
				*pteCur = *cit;
		if(pteCur->fIsMarkedForDeletion())
			continue;
		if(pteCur->fIsCompleted())
		{
			if(pteCur->nGetStartPosition()<nMRPos)
			{
				nMRPos =pteCur->nGetStartPosition();
				pteMREntry =pteCur;
			}
		}
	}
	return pteMREntry;
}
/*********************************************************************
	FUNCTION:	cxTokenizer::pteFindEarliestAppliedToken
	PURPOSE:	Searches for the rule(token) which started earliest
			(minimal nStartPosition) among the entries for which
			fIsNotApplying() is true. NOTE(review): despite the
			function name, the predicate tested is
			fIsNotApplying() -- i.e. completed entries the
			current character no longer fits -- confirm this is
			the intended selection.
	RETURNS:	cxTokenizerContext::cxListEntry* -- the earliest
			matching entry, or NULL if none qualifies.
*********************************************************************/
cxTokenizerContext::cxListEntry
	*cxTokenizer::pteFindEarliestAppliedToken()
{
	cxTokenizerContext::tc_list_type
			*plstRuleList;
	cxTokenizerContext::rulelist_iterator
			cit;
	plstRuleList =m_tcContextInfo.plstGetTokenRuleList();
	// Search for the rule which started earliest.
	// INT_MAX instead of the original 0x7fff sentinel: start
	// positions can exceed 32767 for long inputs, which would have
	// made every entry "too late" and returned NULL incorrectly.
	int nMRPos = INT_MAX;
	cxTokenizerContext::cxListEntry
			*pteMREntry = NULL;
	for(cit=plstRuleList->begin();cit!=plstRuleList->end();cit++)
	{
		cxTokenizerContext::cxListEntry
				*pteCur = *cit;
		if(pteCur->fIsMarkedForDeletion())
			continue;
		if(pteCur->fIsNotApplying())
		{
			if(pteCur->nGetStartPosition()<nMRPos)
			{
				nMRPos =pteCur->nGetStartPosition();
				pteMREntry =pteCur;
			}
		}
	}
	return pteMREntry;
}
/*********************************************************************
	FUNCTION:	cxTokenizer::vMarkAllForDeletionExcept
	PURPOSE:	Flags every rule entry currently in use for
			deletion, leaving only the given survivor untouched.
	RETURNS:	- void -
*********************************************************************/
void cxTokenizer::vMarkAllForDeletionExcept(const cxTokenizerContext::cxListEntry* pEntry)
{
	cxTokenizerContext::tc_list_type *plstRules =
			m_tcContextInfo.plstGetTokenRuleList();
	cxTokenizerContext::rulelist_iterator itRule;
	// Walk the whole list; everything except pEntry is doomed.
	for(itRule=plstRules->begin();itRule!=plstRules->end();itRule++)
	{
		cxTokenizerContext::cxListEntry *pteCandidate = *itRule;
		if(pteCandidate==pEntry)
			continue;
		pteCandidate->vMarkForDeletion();
	}
}
//////////////////////////////////////////////////////////////////////
// Operations
//////////////////////////////////////////////////////////////////////
/*********************************************************************
	FUNCTION:	cxTokenizer::vSetInputStream
	PURPOSE:	Changes/sets the input stream. Any previously held
			stream is deleted first if it asks to be
			(fShouldDelete()). Passing NULL detaches the stream.
	RETURNS:	- void -
*********************************************************************/
void cxTokenizer::vSetInputStream(cxTokenizerInputStream *pxtisInput)
{
#ifdef _DEBUG
	if(pxtisInput!=NULL)
	{
		ASSERT(m_ptisData != pxtisInput);
	}
#endif
	// Self-assignment guard: in the original, a release build
	// (where ASSERT is a no-op) would delete the stream and then
	// store the now-dangling pointer. Bail out instead.
	if(m_ptisData==pxtisInput)
		return;
	if(m_ptisData!=NULL)
	{
		if(m_ptisData->fShouldDelete())
			delete m_ptisData;
	}
	m_ptisData =pxtisInput;
}
/*********************************************************************
	FUNCTION:	cxTokenizer::vParseCharacter
	PURPOSE:	Parses the next character from the input stream:
			advances every active rule, possibly starts a new
			rule, and -- once a delimiting token can be decided --
			hands off to vDelimTokenRecognized(). May recurse
			once (via vParseCharacter(true)) when the exclusive
			flag forces completion of an open token.
	PARAMS:		fOverrideIsLastChar -- treat this character as the
			last one even if EOF has not been reached yet.
	RETURNS:	- void -
*********************************************************************/
void cxTokenizer::vParseCharacter(bool fOverrideIsLastChar)
{
TCHAR tcChar;
bool fIsLastChar;
cxTokenizerContext::tc_list_type
*plstRuleList;
cxTokenizerContext::rulelist_iterator
it,cit,dit;
// Reference to the context's running text -- appends below are
// visible to all rules through the context.
std::tstring& strCurrentText = m_tcContextInfo.strGetCurrentText();
tcChar =m_ptisData->tcGetNextCharacter();
// EOF always forces "last char"; otherwise the caller decides.
fIsLastChar =m_ptisData->fIsEofReached()?true:fOverrideIsLastChar;
strCurrentText +=tcChar;
plstRuleList =m_tcContextInfo.plstGetTokenRuleList();
it =plstRuleList->begin();
// The approach is like this:
// Find the next 'within' token (ignore all characters until this
// token is found). Then, determine the type of the token in between
// the starting position and the position where the 'within' token is
// found
// 1. Check if already found rules still apply
// Counters classifying every list entry; they must stay consistent
// with the list because the VERIFY at the end cross-checks them.
int nCntTotal = 0;
int nCntToDelete = 0;
int nCntCompleted = 0;
int nCntNotApplying = 0;
for(cit=it;cit!=plstRuleList->end();cit++)
{
bool fSkip = false;
bool fComplete = false;
cxTokenizerContext::cxListEntry
*pteCur = *cit;
cxTokenizerMapData *ptmdRule = pteCur->ptmdGetRule(),
*ptmdRuleNext = NULL;
nCntTotal++;
// Entries already known not to apply are counted but skipped.
if(pteCur->fIsNotApplying())
nCntNotApplying++,fSkip=true;
if(fSkip)
continue;
ptmdRuleNext =ptmdRule->ptmdDoesApply(&m_tcContextInfo,tcChar,&fComplete,pteCur->nGetStartPosition(),strCurrentText.length());
if(ptmdRuleNext!=NULL)
{
// The rule still applies: advance it one level in the tree.
ptmdRule->vApplied(&m_tcContextInfo,tcChar,fComplete,pteCur->nGetStartPosition(),strCurrentText.length());
if(fComplete)
{
pteCur->vSetCompleted( strCurrentText.length(),
ptmdRuleNext);
nCntCompleted++;
}
pteCur->vSetRule(ptmdRuleNext);
}
else
{
// NULL -> this char doesn't fit for this rule anymore
if(pteCur->fIsCompleted())
{
// Already completed earlier: keep it, but stop advancing it.
pteCur->vSetNotApplying();
nCntNotApplying++;
}
else
{
// Never completed: the entry is dead.
pteCur->vMarkForDeletion();
nCntToDelete++;
}
}
}
// Check for new tokens only if in non-exclusive mode
if(!m_tcContextInfo.fIsFlagSet(tctx_exclusive))
{
// 2. Check if any new rules apply for that character
// (m_tmdWIRules -- presumably the 'within' rules; TODO confirm)
cxTokenizerMapData *ptmdNewRule;
bool fComplete = false;
ptmdNewRule =m_ptmLexxerMap->m_tmdWIRules.ptmdDoesApply(
&m_tcContextInfo,tcChar,&fComplete,strCurrentText.length(),strCurrentText.length());
if(ptmdNewRule!=NULL)
{
ptmdNewRule->vApplied(&m_tcContextInfo,tcChar,fComplete,strCurrentText.length(),strCurrentText.length());
// Has the 'exclusive access' flag been set?
if(m_tcContextInfo.fIsFlagSet(tctx_exclusive))
{
// Is this the first character?
// If yes, we can proceed. If not, we have to complete
// any open token
int len;
len =strCurrentText.length();
if(len!=1)
{
// Reset exclusive flag temporarily
// (will be set again next time)
m_tcContextInfo.vSetFlag(tctx_exclusive,false);
// Push the previous character and the current one back
// onto the stream (in reverse order), shrink the running
// text accordingly, then re-parse with fIsLastChar forced
// so the open token gets completed first.
TCHAR tcPB;
tcPB =strCurrentText[len-2];
strCurrentText =strCurrentText.substr(0,strCurrentText.length()-2);
m_ptisData->fPutBack(tcChar);
m_ptisData->fPutBack(tcPB);
vParseCharacter(true);
return;
}
}
// Register the freshly started rule as a new list entry,
// starting at the position of the current character.
cxTokenizerContext::cxListEntry
*pteNewEntry = new cxTokenizerContext::cxListEntry(
strCurrentText.length()-1,
ptmdNewRule);
if(fComplete)
{
pteNewEntry->vSetCompleted( strCurrentText.length(),
ptmdNewRule);
nCntCompleted++;
}
plstRuleList->push_front(pteNewEntry);
nCntTotal++;
}
}
bool fRecognized = false;
// More than one rule has been tested?
if(nCntTotal!=0)
{
// Is at least one rule valid?
if(nCntToDelete!=nCntTotal)
{
// Last character found?
if(fIsLastChar)
{
// Search for the rule which started earliest
// NOTE(review): pteFindEarliestCompletedToken() can return
// NULL (no completed entry); vMarkAllForDeletionExcept(NULL)
// then marks the whole list -- confirm this is intended.
cxTokenizerContext::cxListEntry
*pteMREntry = pteFindEarliestCompletedToken();
vMarkAllForDeletionExcept(pteMREntry);
nCntToDelete =plstRuleList->size()-1;
fRecognized =true;
}
// Are all rules which are still valid not applying anymore?
// (hence completed, but the new character doesn't fit to any
// of the remaining rules)?
if( (nCntTotal-nCntToDelete)==nCntNotApplying )
{
if( nCntNotApplying==1 )
{
// The remaining rule is the result
fRecognized =true;
}
else
{
// Search for the rule which started earliest
cxTokenizerContext::cxListEntry
*pteMREntry = pteFindEarliestAppliedToken();
vMarkAllForDeletionExcept(pteMREntry);
nCntToDelete =plstRuleList->size()-1;
fRecognized =true;
}
}
}
}
// Delete the rules in the list which are marked for deletion
// (VERIFY cross-checks the counter bookkeeping above).
VERIFY(m_tcContextInfo.nDeleteMarkedListEntries()==nCntToDelete);
// Has a rule been recognized?
if(fRecognized)
vDelimTokenRecognized();
else
{
if(fIsLastChar)
{
// End of input without a delimiting token: treat everything
// accumulated so far as non-delimiting text.
std::tstring strTemp = m_tcContextInfo.strGetCurrentTextConst();
m_tcContextInfo.vCleanUpAfterTokenRecognition();
vRecognizeNonDelimTokens(strTemp);
m_tcContextInfo.vCleanUpAfterTokenRecognition();
}
}
}