Click here to Skip to main content
15,883,996 members
Articles / Programming Languages / C++

The Token Iterator

Rate me:
Please Sign up or sign in to vote.
3.50/5 (2 votes)
3 May 2000 124.5K   675   19  
Token Iterator provides an easy-to-use, familiar, and customizable way to iterate over the tokens contained in a string.
// Copyright 2000 John R. Bandela
// Please feel free to copy, distribute, and modify this code, provided this notice is retained.

#pragma warning(disable: 4786) 
#pragma warning(disable: 4503) 
#ifndef JRB_STL_EXTENSIONS_TOKENIZER
#define JRB_STL_EXTENSIONS_TOKENIZER

#include <assert.h>

#include <algorithm>
#include <iterator>
#include <locale>

#define JRB_ITER(a,b,c) std::iterator<a,b,c>

#ifdef __SGI_STL_ITERATOR
#ifdef _MSC_VER

#define input_iterator_tag_CLASS_JRB input_iterator
#define forward_iterator_tag_CLASS_JRB forward_iterator
#define bidirectional_iterator_tag_CLASS_JRB bidirectional_iterator
#define random_access_iterator_tag_CLASS_JRB random_access_iterator
#define output_iterator_tag_CLASS_JRB output_iterator
#undef JRB_ITER
#define JRB_ITER(a,b,c) a##_CLASS_JRB<b,c>

#endif
#endif


namespace jrb_stl_extensions{
	
	// Predefined delimiter sets for use with PunctSpaceTokenizer.
	//
	// The pointers themselves are const (const char* const). A const object at
	// namespace scope has internal linkage, so this header can be included from
	// several translation units without multiple-definition link errors, and
	// the sets cannot be reseated by accident.

	// The standard whitespace: space, tab and newline
	const char* const WT_Whitespace = " \t\n";

	// All the punctuation on a keyboard
	const char* const WT_Punctuation1 = "/?.>,<\'\";:\\|]}[{=+-_)(*&^%$#@!`~";

	// Same as WT_Punctuation1 except for ' and -, because they are sometimes
	// used as an apostrophe and a hyphen. Use this when you want to get just words
	const char* const WT_Punctuation2 = "/?.>,<\";:\\|]}[{=+_)(*&^%$#@!`~";
	
	// This is the default tokenizer. It separates a string using whitespace
	// or punctuation as delimiters. The difference between punctuation and whitespace
	// is that punctuation can optionally be returned as a token of its own
	// (one character per token). This can be useful if you are building some
	// type of parser.
	//
	// StringType must provide const_iterator, value_type, begin()/end(), and
	// assign(first,last), and must be constructible from a const char*
	// (needed by the default arguments of the constructor).
	template <class StringType>
	struct PunctSpaceTokenizer{
		
	// Typedefs		
		typedef StringType TokenType;
		// "typename" is required here: const_iterator is a dependent type name
		typedef	typename StringType::const_iterator iter;
	private:	
	// Data
		// Characters that are skipped and never returned
		StringType whitespace;
		// Characters that end a token; returned as tokens only if bReturnPunct is set
		StringType punctuation;
		// True if each punctuation character should be returned as its own token
		bool bReturnPunct;
		
	public:
	// Constructor
		// returnPunct - if true, every punctuation character becomes a one-character token
		// p           - the set of punctuation characters
		// w           - the set of whitespace characters
		// WARNING: do not use 0 for p or w, if you want it blank, use ""
		PunctSpaceTokenizer(bool returnPunct = false, StringType p = WT_Punctuation1, StringType w = WT_Whitespace)
			:whitespace(w),punctuation(p),bReturnPunct(returnPunct){} // same order as the declarations above
	private:
	// Internal Utility Functions

		// Returns true if the character was found in punctuation
		bool IsPunct(typename StringType::value_type E)const{
			return std::find(punctuation.begin(),punctuation.end(),E) != punctuation.end();
		}
		// Returns true if the character was found in whitespace
		bool IsSpace(typename StringType::value_type E)const{
			return std::find(whitespace.begin(),whitespace.end(),E) != whitespace.end();
		}
	public:
	// The functor 

		// Extracts the next token from the range [*pTokEnd, end).
		//
		// pTokEnd - in: where to start scanning; out: one past the last character
		//           of the extracted token (or end if no token was found)
		// end     - the end of the string being tokenized
		// tToken  - receives the token; reset to an empty TokenType if there is none
		//
		// Returns an iterator pointing to the beginning of the extracted token,
		// or end if only delimiters were left.
		iter operator()(iter* pTokEnd,iter end,TokenType& tToken)const{
			// Initialize the token type to default
			tToken = TokenType();
			iter& pCur = *pTokEnd;
			// Skip past all white space; skip past punctuation only if we
			// will not return the punctuation
			while (pCur!=end && ( IsSpace(*pCur) || (!bReturnPunct && IsPunct(*pCur)) ) ){
				++pCur;
			}
			// Nothing but delimiters was left
			if(pCur == end){
				return end;
			}
			// Store the beginning position of the current token
			const iter pBegin = pCur;
			
			if(bReturnPunct && IsPunct(*pCur)){
				// A punctuation character is a complete token by itself
				++pCur;
			}
			else{
				// Move pCur past all the non-space non-punctuation characters
				while(pCur!=end && !IsSpace(*pCur) && !IsPunct(*pCur)){
					++pCur;
				}
			}
			
			// Assign the current token
			tToken.assign(pBegin,pCur);
			return pBegin;
		}
	};
	
	// An iterator over the tokens contained in a string.
	//
	// StringType is the string type.
	// TokenizerFunc is a function object type that provides
	//     typedef ... TokenType;
	//     StringType::const_iterator operator()(StringType::const_iterator* pTokEnd,
	//                                           StringType::const_iterator end,
	//                                           TokenType& tToken);
	// The call starts scanning at *pTokEnd, stores the extracted token in tToken,
	// advances *pTokEnd past it and returns the beginning of the token, or end
	// when there are no more tokens.
	//
	// A default constructed TokenIterator is the "end" iterator. Two iterators
	// compare equal when both are at the end or both are not at the end.
	template <class StringType, class TokenizerFunc=PunctSpaceTokenizer<StringType> >
		class TokenIterator:public 	std::iterator<std::forward_iterator_tag,typename TokenizerFunc::TokenType>
	{
	public:
	// Typedefs
		// "typename" is required here: TokenType is a dependent type name
		typedef typename TokenizerFunc::TokenType TokenType;

	private:
	// Utility Class

		// A class to hold the string data.
		// This is so that when we copy or assign, we don't have to copy the string.
		// The object deletes itself when the last reference is released.
		class StringData{
			unsigned int nRefCount;
		public:
			const StringType s;
			StringData(const StringType& _s):nRefCount(1),s(_s){}
			void IncRef(){++nRefCount;}
			void DecRef(){--nRefCount; if(nRefCount==0){delete this;} }
		};

	// Data

		// A flag to tell if we are at the end 
		bool bAtEnd;

		// iterator that points to the beginning of the current token
		typename StringType::const_iterator pCurStrData;
		
		// iterator that points to the end of the current token
		typename StringType::const_iterator pTokEnd;
		
		// A pointer to hold the ref counted string data (0 for an end iterator
		// that was default constructed)
		StringData* pData;
		
		// The function object that will perform tokenization
		TokenizerFunc Func;
		
		// The current token
		TokenType tToken;
	// Internal Utility Functions

		// A utility function that increments the reference count on pData, checking for NULL
		void SafeIncData()const{
			if(pData){
				pData->IncRef();
			}
		}

		// A utility function that decrements the reference count on pData, checking for NULL
		void SafeDecData()const{
			if(pData){
				pData->DecRef();
			}
		}

	public:
	// Constructors

		// Creates an end iterator using a default constructed tokenizer
		TokenIterator():bAtEnd(true),pCurStrData(),pTokEnd(),pData(0),Func(),tToken()
		{
		}
		// Creates an end iterator using a copy of the tokenizer F.
		// F is deliberately taken by non-const reference: a const reference would
		// allow implicit conversions to TokenizerFunc and make a call such as
		// TokenIterator<std::string>("text") ambiguous with the constructor below.
		TokenIterator(TokenizerFunc& F):bAtEnd(true),pCurStrData(),pTokEnd(),pData(0),Func(F),tToken()
		{
		}
		// Creates an iterator positioned on the first token of input.
		// F is taken by const reference so that the temporary used as the
		// default argument (or passed by the caller) can be bound to it.
		TokenIterator(StringType input,const TokenizerFunc& F=TokenizerFunc())
			:bAtEnd(false),pCurStrData(),pTokEnd(),pData(0),Func(F),tToken()
		{	
			pData = new StringData(input);
			pTokEnd=pCurStrData=pData->s.begin();
			// This will put us on the first token
			++(*this);
		}
		// Copies share the string data; only the reference count is updated
		TokenIterator(const TokenIterator& other)
			:bAtEnd(other.bAtEnd),
			pCurStrData(other.pCurStrData),
			pTokEnd(other.pTokEnd),
			pData(other.pData),
			Func(other.Func),tToken(other.tToken)
		{	
			// Update the ref count on pData
			SafeIncData();
		}
	// Operator =
		TokenIterator& operator=(const TokenIterator& other){
			// Increment the ref count on the data we will be receiving.
			// This is done before the decrement so that self assignment is safe.
			other.SafeIncData();
			// Dec the refcount on the data we are letting go of
			SafeDecData();
			bAtEnd=other.bAtEnd;
			pData = other.pData;
			pCurStrData=other.pCurStrData;
			pTokEnd=other.pTokEnd;
			Func = other.Func;
			tToken = other.tToken;
			return *this;
		}
	// Public Operations

		// The iterator pre-increment operator: moves to the next token
		TokenIterator& operator++(){
			// If we have already reached the end, then do nothing except return this
			if(bAtEnd){return *this;}

			// pData should never be 0
			assert(pData);
			// Get the next Token
			pCurStrData = Func(&pTokEnd,pData->s.end(),tToken);
			// Update our end status
			if(pCurStrData==pData->s.end()){ 
				bAtEnd = true;// The tokenizer found no further token
			}
			return *this;
		}

		// Iterator post-increment: moves to the next token and returns the old state
		TokenIterator operator++(int){
			TokenIterator OldState(*this);
			++(*this);
			return OldState;
		}
		
		// Iterator dereference operator: returns the current token.
		// Must not be called on an end iterator.
		const TokenType& operator*()const{
			assert(bAtEnd!=true);
			return tToken;
		}

		// Gets the rest of the string, starting at the beginning of the
		// current token. Returns an empty string for an iterator that holds
		// no string data (default constructed).
		const StringType GetRemaining()const{
			if(!pData){
				return StringType();
			}
			return StringType(pCurStrData,pData->s.end());
		}

	// Relational Operators
		// Only the end status is compared, which is all that is needed for
		// the usual "it != end" loop condition
		bool operator==(const TokenIterator& other)const{return bAtEnd==other.bAtEnd;}
		bool operator!=(const TokenIterator& other)const{return bAtEnd!=other.bAtEnd;}
	
	// Destructors
		~TokenIterator(){SafeDecData();}
	};


	// A tokenizer that will break apart C separated value lines
	// (C is a comma by default).
	//
	// Rules implemented by operator():
	//  - A double quote toggles "quoted" mode and is not part of the token;
	//    a separator inside quotes is part of the token.
	//  - cSpecial (a backslash by default) is the escape character:
	//      escape escape    -> a literal escape character
	//      escape "         -> a literal double quote
	//      escape n / N     -> a newline
	//      escape other     -> the other character (the escape is dropped)
	//  - An escaped separator outside of quotes still ends the token.
	//  - Nothing is returned for an empty field after a trailing separator.
	template <class StringType, unsigned short C = ','>
	struct CSVTokenizer{
		typedef StringType TokenType;
		typedef	typename StringType::const_iterator iter;
		typedef typename StringType::value_type char_type;
		// The escape character
		char_type cSpecial;
		CSVTokenizer(char_type _cS = '\\'):cSpecial(_cS){}
		
		// Extracts the next field from the range [*pTokEnd, end).
		//
		// pTokEnd - in: where to start scanning; out: the start of the next
		//           field (one past the separator), or end
		// end     - the end of the line being tokenized
		// tToken  - receives the field with quotes and escapes processed;
		//           reset to an empty TokenType if *pTokEnd == end
		//
		// Returns an iterator pointing to the beginning of the extracted field,
		// or end if *pTokEnd was already at the end.
		iter operator()(iter* pTokEnd,iter end,TokenType& tToken)const{
			tToken = TokenType();
			iter& pCur = *pTokEnd;
			if(pCur == end){
				return end;
			}
			const iter pBegin = pCur;

			bool bEscaped = false;  // the previous character was an unconsumed escape
			bool bInQuote = false;  // we are between two unescaped double quotes
			bool bContinue = true;  // cleared when the separator that ends the field is seen
			TokenType curToken;

			// Note: the ++pCur also runs after the separator was seen, so pCur
			// ends up pointing one past the separator.
			for(;(pCur != end) && bContinue;++pCur){
				const char_type ch = *pCur;

				if(ch == cSpecial){
					// A doubled escape character stands for itself
					if(bEscaped){
						curToken+=cSpecial;
					}
					bEscaped = !bEscaped;
					continue;
				}

				switch(ch){
				case '\"':
					if(bEscaped){
						curToken+=static_cast<char_type>('\"');
					}
					else{
						bInQuote = !bInQuote;
					}
					break;
				case C:
					if(bInQuote){
						curToken+=static_cast<char_type>(C);
					}
					else{
						bContinue = false;
					}
					break;
				case 'N':
				case 'n':
					if(bEscaped){
						curToken+=static_cast<char_type>('\n');
					}
					else{
						// Keep the character as written; an unescaped 'N'
						// must not be turned into a lowercase 'n'
						curToken+=ch;
					}
					break;
				default:
					curToken+=ch;
				}
				// Any character other than the escape character consumes a pending escape
				bEscaped = false;
			}
			tToken = curToken;
			return pBegin;
		}
	};


};

#endif

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
United States United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions