// Copyright 2000 John R. Bandela
// Please feel free to copy, distribute, and modify this code, provided this notice is retained.
#pragma warning(disable: 4786)
#pragma warning(disable: 4503)
#ifndef JRB_STL_EXTENSIONS_TOKENIZER
#define JRB_STL_EXTENSIONS_TOKENIZER
#include <assert.h>
#include <algorithm>
#include <iterator>
#include <locale>
// JRB_ITER(a,b,c) names an iterator base class built from an iterator
// category tag `a`, a value type `b` and a third template argument `c`.
// By default it expands to std::iterator<a,b,c>.
// NOTE(review): no expansion of JRB_ITER is visible in this file - confirm
// whether it is still needed.
#define JRB_ITER(a,b,c) std::iterator<a,b,c>
// The override below is active only when both __SGI_STL_ITERATOR and
// _MSC_VER are defined (presumably the SGI STL used with Visual C++, where
// std::iterator is not available - TODO confirm).
#ifdef __SGI_STL_ITERATOR
#ifdef _MSC_VER
// Map each "<category>_tag_CLASS_JRB" name onto the class template of the
// same category (input_iterator, forward_iterator, ...).
#define input_iterator_tag_CLASS_JRB input_iterator
#define forward_iterator_tag_CLASS_JRB forward_iterator
#define bidirectional_iterator_tag_CLASS_JRB bidirectional_iterator
#define random_access_iterator_tag_CLASS_JRB random_access_iterator
#define output_iterator_tag_CLASS_JRB output_iterator
// Redefine JRB_ITER so that the tag `a` is token-pasted with _CLASS_JRB,
// selecting one of the mappings above, and instantiated with <b,c>.
#undef JRB_ITER
#define JRB_ITER(a,b,c) a##_CLASS_JRB<b,c>
#endif
#endif
namespace jrb_stl_extensions{
// Default delimiter sets used by PunctSpaceTokenizer.
//
// The pointers themselves are const (const char* const): this gives each
// constant internal linkage, so the header can be included from several
// translation units without multiple-definition link errors, and prevents
// the defaults from being reseated at run time.

// The standard whitespace: space, tab and newline
const char* const WT_Whitespace = " \t\n";
// All the punctuation on a keyboard
const char* const WT_Punctuation1 = "/?.>,<\'\";:\\|]}[{=+-_)(*&^%$#@!`~";
// Same as WT_Punctuation1 except for ' and - because they are sometimes
// used as apostrophe and hyphen. Use this when you want to get just words
const char* const WT_Punctuation2 = "/?.>,<\";:\\|]}[{=+_)(*&^%$#@!`~";
// This is the default tokenizer. It separates a string using whitespace
// or punctuation as delimiters. The difference between punctuation and
// whitespace is that you have the option of returning punctuation as a token
// (each punctuation character becomes its own one-character token). This can
// be useful if you are building some type of parser.
//
// StringType must provide const_iterator, value_type, begin()/end() and
// assign(first,last), and must be constructible from the delimiter arguments.
template <class StringType>
struct PunctSpaceTokenizer{
    // Typedefs
    typedef StringType TokenType;
    // `typename` is required because const_iterator is a dependent name
    typedef typename StringType::const_iterator iter;
private:
    // Data
    StringType whitespace;   // characters that are always skipped
    StringType punctuation;  // characters that are skipped or returned as tokens
    bool bReturnPunct;       // true: punctuation characters are returned as tokens
public:
    // Constructor
    //   returnPunct - when true every punctuation character is returned as a token
    //   p           - the set of punctuation characters
    //   w           - the set of whitespace characters
    // WARNING: do not use 0 for p or w, if you want it blank, use ""
    PunctSpaceTokenizer(bool returnPunct = false, StringType p = WT_Punctuation1, StringType w = WT_Whitespace)
        :whitespace(w),punctuation(p),bReturnPunct(returnPunct){} // same order as the declarations
private:
    // Internal Utility Functions
    // Returns true if E occurs in the character set `set`
    static bool Contains(const StringType& set,typename StringType::value_type E){
        return std::find(set.begin(),set.end(),E)!=set.end();
    }
    // Returns true if the character was found in punctuation
    bool IsPunct(typename StringType::value_type E)const{
        return Contains(punctuation,E);
    }
    // Returns true if the character was found in whitespace
    bool IsSpace(typename StringType::value_type E)const{
        return Contains(whitespace,E);
    }
public:
    // The functor
    //   pTokEnd - in: where scanning starts; out: one past the end of the
    //             returned token (i.e. where the next call should start)
    //   end     - end of the string being tokenized
    //   tToken  - receives the token text; left empty when no token was found
    // Returns an iterator pointing to the beginning of the current token,
    // or `end` when only delimiters (or nothing) remained.
    iter operator()(iter* pTokEnd,iter end,TokenType& tToken){
        // Initialize the token to its default (empty) value
        tToken = TokenType();
        iter& pCur = *pTokEnd;
        // Skip past all whitespace; skip past punctuation only if we will
        // not return the punctuation
        while (pCur!=end && ( IsSpace(*pCur) || (!bReturnPunct && IsPunct(*pCur)) ) ){
            ++pCur;
        }
        // Nothing but delimiters was left
        if(pCur == end){
            return end;
        }
        // Store the beginning position of the current token
        const iter pBegin = pCur;
        if(bReturnPunct && IsPunct(*pCur)){
            // A punctuation character is a complete token by itself
            ++pCur;
        }
        else{
            // Move pCur past all the non-space, non-punctuation characters
            while(pCur!=end && !IsSpace(*pCur) && !IsPunct(*pCur)){
                ++pCur;
            }
        }
        // Assign the current token
        tToken.assign(pBegin,pCur);
        return pBegin;
    }
};
// An iterator that walks over the tokens of a string.
// StringType is the string type.
// TokenizerFunc is a default-constructible, copyable functor type that
// provides a TokenType typedef and
//   const_iterator operator()(const_iterator* pTokEnd, const_iterator end, TokenType& tToken)
// which stores the next token in tToken, advances *pTokEnd past it and
// returns the start of the token (or `end` when there are no more tokens).
//
// A default-constructed TokenIterator is the end iterator. Copies share the
// tokenized string through a reference count (not thread-safe).
template <class StringType, class TokenizerFunc=PunctSpaceTokenizer<StringType> >
class TokenIterator:public std::iterator<std::forward_iterator_tag,typename TokenizerFunc::TokenType>
{
public:
    // Typedefs
    // (`typename` is required because TokenType is a dependent name)
    typedef typename TokenizerFunc::TokenType TokenType;
private:
    // Utility Class
    // A class to hold the string data.
    // This is so that when we copy or assign, we don't have to copy the string
    class StringData{
        unsigned int nRefCount;
    public:
        const StringType s;
        StringData(const StringType& _s):nRefCount(1),s(_s){}
        void IncRef(){++nRefCount;}
        // Destroys the object when the last reference is released
        void DecRef(){--nRefCount; if(nRefCount==0){delete this;} }
    };
    // Data
    // A flag to tell if we are at the end
    bool bAtEnd;
    // iterator that points to the beginning of the current token
    typename StringType::const_iterator pCurStrData;
    // iterator that points to the end of the current token
    typename StringType::const_iterator pTokEnd;
    // A pointer to hold the ref counted string data (0 for an end iterator)
    StringData* pData;
    // The function object that will perform tokenization
    TokenizerFunc Func;
    // The current token
    TokenType tToken;
    // Internal Utility Functions
    // A utility function that increments the reference count on pData, checking for NULL
    void SafeIncData()const{
        if(pData){
            pData->IncRef();
        }
    }
    // A utility function that decrements the reference count on pData, checking for NULL
    void SafeDecData()const{
        if(pData){
            pData->DecRef();
        }
    }
public:
    // Constructors
    // Creates the end iterator using a default-constructed tokenizer.
    TokenIterator()
        :bAtEnd(true),pCurStrData(),pTokEnd(),pData(0),Func(),tToken()
    {
    }
    // Creates the end iterator, storing a copy of tokenizer F.
    // F is deliberately taken by non-const reference (as before): accepting
    // temporaries here would make TokenIterator("literal") ambiguous whenever
    // the literal is also implicitly convertible to TokenizerFunc.
    TokenIterator(TokenizerFunc& F)
        :bAtEnd(true),pCurStrData(),pTokEnd(),pData(0),Func(F),tToken()
    {
    }
    // Creates an iterator positioned on the first token of `input`
    // (or at the end when `input` contains no token).
    // F is a const reference so that the temporary default argument can bind.
    TokenIterator(StringType input,const TokenizerFunc& F=TokenizerFunc())
        :bAtEnd(false),pCurStrData(),pTokEnd(),pData(0),Func(F),tToken()
    {
        pData = new StringData(input);
        pTokEnd=pCurStrData=pData->s.begin();
        try{
            // This will put us on the first token
            ++(*this);
        }
        catch(...){
            // The destructor does not run for a half-built object,
            // so release the string data here
            pData->DecRef();
            throw;
        }
    }
    // Copy constructor: shares the string data with `other`
    TokenIterator(const TokenIterator& other)
        :bAtEnd(other.bAtEnd),
        pCurStrData(other.pCurStrData),
        pTokEnd(other.pTokEnd),
        pData(other.pData),
        Func(other.Func),
        tToken(other.tToken)
    {
        // Update the ref count on pData
        SafeIncData();
    }
    // Operator =
    TokenIterator& operator=(const TokenIterator& other){
        // Increment the ref count on the data we will be receiving first,
        // so that self-assignment cannot destroy the data
        other.SafeIncData();
        // Dec the refcount on our current pData
        SafeDecData();
        bAtEnd=other.bAtEnd;
        pData = other.pData;
        pCurStrData=other.pCurStrData;
        pTokEnd=other.pTokEnd;
        Func = other.Func;
        tToken = other.tToken;
        return *this;
    }
    // Public Operations
    // The iterator pre-increment operator: advances to the next token
    TokenIterator& operator++(){
        // If we have already reached the end, then do nothing except return this
        if(bAtEnd){return *this;}
        // pData should never be 0
        assert(pData);
        // Get the next Token
        pCurStrData = Func(&pTokEnd,pData->s.end(),tToken);
        // Update our end status
        if(pCurStrData==pData->s.end()){
            bAtEnd = true;// If pCurStrData is at the end then we are at the end
        }
        return *this;
    }
    // Iterator post-increment: advances and returns the previous state
    TokenIterator operator++(int){
        TokenIterator OldState(*this);
        ++(*this);
        return OldState;
    }
    // Iterator dereference operator: the current token.
    // Must not be called on an end iterator.
    const TokenType& operator*()const{
        assert(bAtEnd!=true);
        return tToken;
    }
    // Gets the rest of the string, starting at the beginning of the current
    // token. Returns an empty string for an end iterator.
    const StringType GetRemaining()const{
        if(!pData){
            // Default-constructed end iterator: there is no string at all
            return StringType();
        }
        return StringType(pCurStrData,pData->s.end());
    }
    // Relational Operators
    // Two iterators compare equal when both are at the end or both are not,
    // so these are only meaningful for comparing against an end iterator.
    bool operator==(const TokenIterator& other)const{return bAtEnd==other.bAtEnd;}
    bool operator!=(const TokenIterator& other)const{return bAtEnd!=other.bAtEnd;}
    // Destructor: releases our reference on the shared string data
    ~TokenIterator(){SafeDecData();}
};
// A tokenizer that will break apart C separated value lines
// (C defaults to ',').
//
// Rules applied while scanning a field:
//   - cSpecial (default '\') escapes the next character
//   - an unescaped '"' toggles quoting and is not part of the token
//   - an escaped '"' and an escaped cSpecial are appended literally
//   - an escaped 'n' or 'N' is appended as a newline
//   - the separator C is appended when inside quotes, otherwise it ends
//     the field (an escape in front of it is dropped)
//   - any other character is appended as is
template <class StringType, unsigned short C = ','>
struct CSVTokenizer{
    typedef StringType TokenType;
    typedef typename StringType::const_iterator iter;
    typedef typename StringType::value_type char_type;
    // The escape character
    char_type cSpecial;
    CSVTokenizer(char_type _cS = '\\'):cSpecial(_cS){}
    // The functor
    //   pTokEnd - in: where scanning starts; out: one past the separator
    //             that ended the field, or `end`
    //   end     - end of the line being tokenized
    //   tToken  - receives the unescaped, unquoted field text
    // Returns an iterator pointing to the beginning of the current field,
    // or `end` (with tToken empty) when *pTokEnd was already at the end.
    iter operator()(iter* pTokEnd,iter end,TokenType& tToken){
        tToken = TokenType();
        iter& pCur = *pTokEnd;
        if(pCur == end){
            return end;
        }
        const iter pBegin = pCur;
        bool bEscaped = false;   // previous character was an unescaped cSpecial
        bool bInQuote = false;   // currently between a pair of quotes
        bool bDone = false;      // an unquoted separator was consumed
        while(pCur != end && !bDone){
            // Consume the character up front: the separator that ends a
            // field must be skipped as well
            const char_type ch = *pCur;
            ++pCur;
            if(ch == cSpecial){
                if(bEscaped){
                    // An escaped escape character is a literal
                    tToken += cSpecial;
                }
                bEscaped = !bEscaped;
                continue;
            }
            switch(ch){
            case '\"':
                if(bEscaped){
                    tToken += static_cast<char_type>('\"');
                }
                else{
                    bInQuote = !bInQuote;
                }
                break;
            case C:
                if(bInQuote){
                    tToken += static_cast<char_type>(C);
                }
                else{
                    bDone = true;
                }
                break;
            case 'N':
            case 'n':
                // Escaped: newline. Unescaped: the letter itself, keeping
                // its case (an unescaped 'N' used to be turned into 'n')
                tToken += bEscaped ? static_cast<char_type>('\n') : ch;
                break;
            default:
                tToken += ch;
            }
            // Whatever followed an escape character has now been handled
            bEscaped = false;
        }
        return pBegin;
    }
};
};
#endif