Click here to Skip to main content
15,879,326 members
Articles / Programming Languages / C++

A simple STL based XML parser

Rate me:
Please Sign up or sign in to vote.
4.21/5 (22 votes)
16 May 2000 250.8K   4.6K   70  
This is a small non-validating XML parser based purely on STL
#if !defined(XmlParser_H)
#define XmlParser_H

#include "XmlUtil.h"

#include <algorithm>
#include <string>
#include <sstream>
#include <iostream>
using namespace std;



//////////////////////////////////////////////////////////////////////////////
// XmlParserr
//
// Purpose: used to parse a buffer that has xml formatted data

class XmlParser
{
public:

	static long _parseid;
	
	// id of parser (for debugging)
	long		_id;

	// provides string interface to buffer
	string		_strParse;

	// parse buffer and offsets of where we are parsing
	char *		_buffer;
	long		_parseLength;

	// current position in parse buffer
	long		_current;

	// tag, attributes and data position information
	long		_firstTagStart;
	long		_firstTagEnd;

	long		_lastTagStart;
	long		_lastTagEnd;

	long		_nameStart;
	long		_nameEnd;
	long		_attrStart;
	long		_attrEnd;
	long		_valueStart;
	long		_valueEnd;


	XmlParser () :
		_strParse()
	{ 
		clear(); 

		// get parser id
		_id = _parseid;

		// next id
		_parseid++;
	}

	// create/release
	bool create ( const char * buffer, long parseStart, long parseEnd )
	{
		// if invalid stop
		if ( !buffer || parseStart < 0 || parseEnd < parseStart )
			return false;

		return create( buffer + parseStart, getLength(parseStart,parseEnd) );
	}	



	bool create ( const char * buffer, long length )
	{
		// if invalid stop
		if ( !buffer || length <= 0 )
			return false;

		// parse buffer and offsets of where we are parsing
		_buffer      = (char *) buffer;
		_parseLength = length;

		// current position in parse buffer
		_current    = 0;

		return true;
	}	

	void release ()
	{

	}

	// cur position methods
	long getCurrent ()
	{
		return _current;
	}

	long getParseLength ()
	{
		return _parseLength;
	}

	long getCurLength ()
	{
		return getOffsetLength(_current);
	}

	long getOffsetLength ( long offset )
	{
		return getLength(offset,_parseLength - 1);
	}


	char * getBufferPos ()
	{
		return _buffer;
	}

	char * getLastBufferPos ()
	{
		return _buffer + _parseLength;
	}

	char * getCurPos ()
	{
		if ( isValid() )
			return _buffer + _current;
		else
			return NULL;
	}

	char * getParseState ( long & parseLength )
	{
		// if not valid stop
		if ( !isValid() )
		{
			parseLength = 0;
			return getCurPos();
		}

		/*
		// if value has length
		if ( getValueLength() > 0 )
		{
			// if value has a tag it must be parsed
			if ( valueHasTag() )
			{
				// get parse length
				parseLength = getValueLength();

				// return ptr to start of value
				return _buffer + _valueStart;
			}
		}
		*/

		// get parse state
		parseLength = getCurLength();

		// get current buffer position
		char * buffer = getCurPos();

		// if last tag is valid
		/*
		if ( hasLastTag() )
		{
			// if current position is in last tag
			// pos then show stop
			if ( getCurPos() >= getLastTagPos() )
			{
				parseLength = 0;
				return getLastBufferPos();
			}
		}
		*/

		return getCurPos();

	}

	// get ref to parse buffer
	string & str ()
	{
		return _strParse;
	}

	// state methods
	void reset ()
	{
		resetTagPositions();
	}

	bool isValid ()
	{
		// if buffer state not valid
		if ( !_buffer || _parseLength <= 0 )
			return false;

		// if cur position not valid
		if ( _current < 0 || _current > _parseLength )
			return false;

		return true;
	}

	void resetTagPositions ( long start=-1 )
	{
		// set tag positions
		_firstTagStart  = start;
		_firstTagEnd	= start;

		_lastTagStart   = start;
		_lastTagEnd	    = start;

		_nameStart	    = start;
		_nameEnd		= start;

		_attrStart      = start;
		_attrEnd        = start;

		_valueStart     = start;
		_valueEnd       = start;
	}
	
	void clear ()
	{
		// parse buffer and offsets of where we are parsing
		_buffer		 = 0;
		_parseLength = 0;

		// current position in parse buffer
		_current	= 0;

		// reset tag positions
		reset();
	}

	// parse methods
	bool parse ( const char * buffer, long parseStart, long parseEnd )
	{
		// if create fails stop
		if ( !create(buffer,parseStart,parseEnd) )
			return false;

		return parse();
	}

	bool parse ( const char * buffer, long parseLength )
	{
		// if create fails stop
		if ( !create(buffer,parseLength) )
			return false;

		return parse();
	}

	bool parse () 
	{
		// init tag position
		_firstTagStart = _current;
		_firstTagEnd   = _current;

		_lastTagStart  = _current;
		_lastTagEnd    = _current;

				     
		// find first tag
		long first = find( idTagLeft, _current );
		if ( first == -1 )
			return false;


		// if find right tag
		long last  = find( idTagRight, first );
		if ( last == -1  )
			return false;

		// set first tag start/end
		_firstTagStart = first;
		_firstTagEnd   = last;

		// now parse name
		if ( !parseName() )
			return false;

		// parse attributes
		parseAttributes();

		// if null tag no data or last tag
		if ( hasNullTag() )
		{
			// update cur position
			_current  = _firstTagEnd + idTagRightLength;

			// done so show success
			return true;
		}		

		// form end tag
		string endTag;
		endTag  = idTagEnd;
		endTag += getName();
		endTag += idTagRight;

		// find last tag
		first = find( endTag, last );
		if ( first == -1 )
			return false;

		// set last tag start/end
		_lastTagStart = first;
		_lastTagEnd   = first + endTag.size();

		// parse the value if not a null tag
		if ( !hasNullTag() )
			parseValue();

		// update cur position
		// we have parsed a tag so look for the start
		// of a new tag, if found set current position
		// to it, else set to last tag
		long pos = find( idTagLeft, _lastTagEnd );
		if ( pos != -1 )
			_current = pos;
		else
			_current = _lastTagEnd;


		return true;
	}

	bool parse ( string & name,
				 string & value,
				 string & attributes,
				 long & current )
	{

		return true;
	}



	// tag search methods

	bool hasNullTag ()
	{
		// get end of first tag
		char * buffer = _buffer + _firstTagEnd - 1;

		// if null tag marker
		if ( *buffer == '/' && *(buffer+1) == '>' )
			return true;
		else
			return false;
	}

	/////////////////////////////////////////////////////////
	// these methods are protected because the state
	// of parsing might not be properly setup, and
	// if that were so then calling these methods
	// would cause errors

protected:

	// name search methods
	bool parseName ()
	{
		// if first tag search failed show failed
		if ( _firstTagStart < 0 ||  _firstTagEnd < 0 ||
			 _firstTagEnd <= _firstTagStart )
		{
			_nameStart = -1;
			_nameEnd   = -1;
			return false;
		}

		// init name start/end position
		_nameStart = _firstTagStart + idTagLeftLength;
		_nameEnd   = _firstTagEnd - 1;

		// if null tag then backup before
		// null tag marker
		if ( hasNullTag() )
			_nameEnd -= 1;

		// check for separator (i.e. there are attributes)
		long last = find(' ',_nameStart, getNameLength());
		if ( last != -1 )
		{
			// there are attributes so backup
			// before attributes
			_nameEnd = last - 1;
		}

		return true;
	}

	bool parseName ( string & name )
	{
		// set name state
		if ( !parseName() )
			return false;

		name = getName();

		return true;
	}

	// attribute search methods
	bool parseAttributes ()
	{
		// init name start/end position
		_attrStart = -1;
		_attrEnd   = -1;

		// if tag or name length invalid stop
		long tagLength  = getTagLength();
		long nameLength = getNameLength();
		if ( tagLength <= 0 || nameLength <= 0 )
			return 0;

		// if the difference in the lengths is
		// less than the length of the left/right marker
		// then no attributes
		long diff = getTagLength() - getNameLength();

		switch ( diff )
		{
			// no attribute case
			case 0:
			case 1:
			case 2:
				return false;

			// no attribute case but has null tag
			case 3:
				return false;
		}

		// init attributes start, move past space after name
		_attrStart = _nameEnd + 2;

		// init attribute end move before right tag marker
		// if null tag move before null tag marker
		_attrEnd = _firstTagEnd - 1;
		if ( hasNullTag() )
			_attrEnd -= -1;

		return true;
	}

	bool parseAttributes ( string & attributes )
	{
		// set name state
		if ( !parseAttributes() )
			return false;

		attributes = getAttributes();

		return true;
	}


	// data search methods
	bool parseValue ()
	{
		// if first tag search failed show failed
		if ( _firstTagStart < 0 ||  _lastTagEnd < 0 ||
			 _lastTagEnd <= _firstTagStart )
		{
			_valueStart = -1;
			_valueEnd   = -1;
			return false;
		}

		// init value start/end positions
		_valueStart = _firstTagEnd + 1;
		_valueEnd   = _lastTagStart - 1;

		return true;
	}


	bool parseValue ( string & value )
	{
		// set name state
		if ( !parseValue() )
			return false;

		value = getValue();

		return true;
	}

public:

	// name access methods
	char * getNamePos ()
	{
		if ( hasName() )
			return _buffer + _nameStart;
		else
			return NULL;
	}

	bool hasName ()
	{
		if ( getNameLength() > 0 )
			return true;
		else
			return false;
	}

	long getNameLength ()
	{
		long length = getLength(_nameStart,_nameEnd);
		return length;
	}


	string getName ()
	{
		// get name length
		long length = getNameLength();

		// if length invalid show null string
		// else get string
		if ( length <= 0 )
			return string("");
		else
			return substr(_nameStart,length);
	}


	// attribute access methods
	char * getAttributesPos ()
	{
		if ( hasAttributes() )
			return _buffer + _attrStart;
		else
			return NULL;
	}

	bool hasAttributes ()
	{
		if ( getValueLength() > 0 )
			return true;
		else
			return false;
	}

	long getAttributesLength ()
	{
		long length = getLength(_attrStart,_attrEnd);
		return length;
	}

	string getAttributes ()
	{
		// get attribute length
		long length = getAttributesLength();

		// if length invalid show null string
		// else get string
		if ( length <= 0 )
			return string("");
		else
			return substr(_attrStart,length);
	}

	// value access methods
	char * getValuePos ()
	{
		if ( hasValue() )
			return _buffer + _valueStart;
		else
			return NULL;
	}

	bool hasValue ()
	{
		if ( getValueLength() > 0 )
			return true;
		else
			return false;
	}

	long getValueLength ()
	{
		long length = getLength(_valueStart,_valueEnd);
		return length;
	}

	string getValue ()
	{
		// get tag data length
		long length = getValueLength();

		// if length invalid show null string
		// else get string
		if ( length <= 0 )
			return string("");
		else
			return substr(_valueStart,length);
	}

	char * getValueState ( long & valueLength )
	{
		// get value state
		valueLength = getValueLength();

		// return value buffer pos
		return _buffer + _valueStart;
	}

	bool valueHasTag ()
	{
		// if find end tag
		long pos = find( idTagLeft, _valueStart, getValueLength() );

		// if found tag
		if ( pos != -1 )
			return true;
		else
			return false;
	}

	// tag access methods
	long getTagLength ()
	{
		long length = getLength( _firstTagStart, _firstTagEnd );
		return length;
	}

	long getLastTagLength ()
	{
		long length = getLength( _lastTagStart, _lastTagEnd );
		return length;
	}

	bool hasTag ()
	{
		if ( getTagLength() > 0 )
			return true;
		else
			return false;
	}

	bool hasLastTag ()
	{
		if ( getLastTagLength() > 0 )
			return true;
		else
			return false;
	}

	char * getTagPos ()
	{
		if ( hasTag() )
			return _buffer + _firstTagStart;
		else
			return NULL;
	}

	char * getLastTagPos ()
	{
		if ( hasTag() )
			return _buffer + _lastTagStart;
		else
			return NULL;
	}

	string getTag ()
	{
		// get tag data length
		long length = getTagLength();
		return substr(_firstTagStart,length);
	}

	// string utility methods
	long getLength ( long startPos,
			         long endPos )
	{
		// if positions invalid show no length
		if ( startPos < 0 || endPos < 0 ||
			 endPos < startPos )
			return 0;

		// get length
		long length = endPos - startPos + 1;
		return length;
	}

	string ::iterator begin ()
	{
		string::iterator buf = _buffer;
		return string::iterator(buf);
	}

	string ::iterator end ()
	{
		string::iterator buf = _buffer + _parseLength;
		return string::iterator(buf);
	}

	long find ( char srchChar, long offset, long length = -1 )
	{
		// if no length set to length to
		// end of parse buffer
		if ( length == -1 )
			length = getOffsetLength(offset);

		// set start and end of search 
		string::iterator start = _buffer + offset;
		string::iterator end   = _buffer + (offset + length);

		// search for it
		string::iterator found = std::find( start, end, srchChar );

		// if at end did not find it
		if ( found >= end )
		{
			return -1;
		}
		else
		{
			// as a last check make sure found is valid
			if ( found < start )
				return -1;

			// set position
			long pos = (found - start);
			pos += offset;

			return pos;
		}
	}

	long find ( char * srchStr, long offset, long length = -1 )
	{
		// if no length set to length to
		// end of parse buffer
		if ( length == -1 )
			length = getOffsetLength(offset);

		// set start and end of search 
		string::iterator start = _buffer + offset;
		string::iterator end   = _buffer + (offset + length);


		string::iterator srchStart = srchStr;
		string::iterator srchEnd   = srchStr + strlen(srchStr);

		// search for it
		string::iterator found = std::search( start, end, srchStart, srchEnd );

		// if at end did not find it
		if ( found >= end )
		{
			return -1;
		}
		else
		{
			// as a last check make sure found is valid
			if ( found < start )
				return -1;

			// set position
			long pos = (found - start);
			pos += offset;

			return pos;
		}
	}

	long find ( string & srchStr, long offset, long length = -1 )
	{
		// if no length set to length to
		// end of parse buffer
		if ( length == -1 )
			length = getOffsetLength(offset);

		// set start and end of search 
		string::iterator start = _buffer + offset;
		string::iterator end   = _buffer + (offset + length);


		string::iterator srchStart = srchStr.begin();
		string::iterator srchEnd   = srchStr.end();

		// search for it
		string::iterator found = std::search( start, end, srchStart, srchEnd );

		// if at end did not find it
		if ( found >= end )
		{
			return -1;
		}
		else
		{
			// as a last check make sure found is valid
			if ( found < start )
				return -1;

			// set position
			long pos = (found - start);
			pos += offset;

			return pos;
		}
	}

	long rfind ( char srchChar, long offset, long length )
	{
		// setup srch string
		char srchStr[2];
		srchStr[0] = srchChar;
		srchStr[1] = '\0';

		return rfind(srchStr,offset,length);
	}

	long rfind ( char * srchStr, long offset, long length )
	{
		/*
		// set start and end of search 
		string::reverse_iterator revStart = _buffer + (offset + length)
		string::reverse_iterator revEnd   = _buffer + offset;

		// search for it
		string::reverse_iterator found = std::find( start, end, srchStr );

		// get position 
		long pos = found - revStart;
		*/
		long pos = 0;

		return pos;
	}


	string substr ( long offset, long length )
	{
		// get start of sub string
		char * ptr = _buffer + offset;

		// create string for it
		string str;
		str.assign( ptr, length );

		return string(str);
	}


};


#endif

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
United States United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions