Click here to Skip to main content
15,891,567 members
Articles / Programming Languages / XML

UTF-8 encoded XML file/stream processing

Rate me:
Please Sign up or sign in to vote.
3.58/5 (9 votes)
5 Jan 2007 · 2 min read · 84.6K   6K   31
Process a UTF-8 encoded XML file or stream: read group and attribute values; write and delete groups, attributes, values and comments.
// ============================================================================	//
//				X M 8 _ U T F s b s . c p p			//
// ----------------------------------------------------------------------------	//
// Contains the UTF-8 routines							//
// ----------------------------------------------------------------------------	//
// static				U T F t o U C S				//
// ----------------------------------------------------------------------------	//
//					u t f U C S				//
// ----------------------------------------------------------------------------	//
//					U T F l e n				//
// ----------------------------------------------------------------------------	//
//				X M 8 _ U T F t o U C S				//
// ----------------------------------------------------------------------------	//
//				X M 8 _ U T F 8 t o U T F 1 6			//
// ============================================================================	//
#include "XM8_Main.h"


/* Merge the six payload bits of continuation byte utf[x] into lC as bits	\
   25-30 (lowest bit counted as bit 1). Relies on 'utf' and 'lC' being	\
   visible where the macro is expanded. */
#define bitsUTF2530(x) do { lC |= ((long)(utf[(x)] & 0x3F)) << 24; } while (0)

/* Merge the six payload bits of continuation byte utf[x] into lC as bits	\
   19-24 (lowest bit counted as bit 1). Relies on 'utf' and 'lC' being	\
   visible where the macro is expanded. */
#define bitsUTF1924(x) do { lC |= ((long)(utf[(x)] & 0x3F)) << 18; } while (0)

/* Merge the six payload bits of continuation byte utf[x] into lC as bits	\
   13-18 (lowest bit counted as bit 1). Relies on 'utf' and 'lC' being	\
   visible where the macro is expanded. */
#define bitsUTF1318(x) do { lC |= ((long)(utf[(x)] & 0x3F)) << 12; } while (0)

/* Merge the six payload bits of continuation byte utf[x] into lC as bits	\
   7-12 (lowest bit counted as bit 1). Relies on 'utf' and 'lC' being	\
   visible where the macro is expanded. */
#define bitsUTF0712(x) do { lC |= ((long)(utf[(x)] & 0x3F)) << 6; } while (0)

// ============================================================================	//
//				U T F t o U C S					//
// ----------------------------------------------------------------------------	//
// Decodes one utf-8 train (1-6 bytes) into a single UCS character.		//
//										//
// Inputs:  utBf	pointer to pointer to the first byte of the train	//
// Output:  lChar	The UCS character; written only when the whole train	//
//			has been read						//
// Returns: True/False	Success or Failure, N.B. if fail, utBf stepped on:	//
//			the lead byte and every accepted continuation byte	//
//			have been consumed, but a byte that is not a		//
//			continuation byte (the string terminator included)	//
//			is left unread						//
//										//
// A train is refused when							//
//   - it begins 10xxxxxx (stray continuation byte) or 1111111x		//
//   - a continuation byte is not of the form 10xxxxxx (train cut short)	//
//   - the value would have fitted a shorter train (over-long encoding)	//
//   - UNICODE is set and the value lies above 0x10FFFF			//
// ============================================================================	//
static bool UTFtoUCS (byte**utBf, long*lChar)
{
    // minUCS[n] is the smallest value that genuinely needs an n byte train;
    // a lower value in an n byte train is an over-long encoding
    static const long minUCS[7] =
    {	0, 0, 0x00000080, 0x00000800, 0x00010000, 0x00200000, 0x04000000 };

    byte bC = *(*utBf)++;			// lead byte is always consumed
    byte cC;
    long lC;
    int  nBytes, n;

    // The lead byte fixes the train length and supplies the top payload bits
    if	    (!(bC & 0x80)) {nBytes = 1; lC = bC;}	 // 0xxxxxxx
    else if (!(bC & 0x40)) return false;		 // 10xxxxxx, invalid lead
    else if (!(bC & 0x20)) {nBytes = 2; lC = bC & 0x1F;} // 110xxxxx
    else if (!(bC & 0x10)) {nBytes = 3; lC = bC & 0x0F;} // 1110xxxx
    else if (!(bC & 0x08)) {nBytes = 4; lC = bC & 0x07;} // 11110xxx
    else if (!(bC & 0x04)) {nBytes = 5; lC = bC & 0x03;} // 111110xx
    else if (!(bC & 0x02)) {nBytes = 6; lC = bC & 0x01;} // 1111110x
    else return false;					 // 1111111x, invalid lead

    // Each continuation byte contributes six further bits
    for (n = 1; n < nBytes; n++)
    {
	cC = **utBf;
	if ((cC & 0xC0) != 0x80) return false;	// not 10xxxxxx: leave it unread so
						// a terminator is never skipped
	(*utBf)++;
	lC = (lC << 6) | (cC & 0x3F);
    }

    *lChar = lC;

    if (lC < minUCS[nBytes]) return false;		// over-long encoding

    if (UNICODE && lC > 0x0010FFFF) return false;	// beyond the Unicode range

    return true;
}

// ============================================================================	//
//				u t f U C S					//
// ----------------------------------------------------------------------------	//
// Converts a zero terminated utf-8 train into a zero terminated UCS train.	//
//										//
// Input:   aBf	convert the supplied utf-8 train into a UCS train		//
// Output:  oBf	an address for a UCS train; one long is written per		//
//		character, followed by a terminating 0				//
// Returns: True/False	on failure L.err holds the message and the output	//
//			is zero terminated after the last good character	//
// ============================================================================	//
extern bool utfUCS (char*aBf,long*oBf)
{
    // A single loop deals with 1 byte and multi-byte trains alike; the
    // terminator is tested as a plain zero so the outcome does not depend on
    // whether 'char' is a signed type.
    while(*aBf != '\0')
    {
	if (!UTFtoUCS((byte**)&aBf,oBf))
	{
	    sprintf(L.err,"Supplied utf-8 train invalid UNICODE");

	    *oBf = 0; // zero terminate over the slot of the failed character

	    return false;
	}

	oBf++;	// step on only once the slot holds a good character
    }

    *oBf = 0; // zero terminate

    return true;
}


// ============================================================================	//
//				U T F l e n					//
// ----------------------------------------------------------------------------	//
// Counts the characters of a zero terminated utf-8 train, validating each	//
// one with UTFtoUCS on the way.						//
//										//
// Input:   aBf	a utf-8 train							//
// Returns: number of CHARACTERS in train or -ive train point of failure	//
//	    (minus the count of bytes consumed when the bad train was met)	//
// ============================================================================	//
extern long UTFlen (char*aBf)
{
    long nChars = 0, lC;

    char*Bf = aBf;

    // The terminator is tested as a plain zero, so bytes of 0x80 and above are
    // always handed to UTFtoUCS whether or not 'char' is a signed type.
    while(*Bf != '\0')
    {
	// UTFtoUCS consumes at least the lead byte, so the difference is < 0
	if(!UTFtoUCS((byte**)&Bf,&lC)) return (long)(aBf-Bf);

	nChars++;
    }

    return nChars;
}


// ============================================================================	//
//				X M 8 _ U T F t o U C S				//
// ----------------------------------------------------------------------------	//
// Validates a utf-8 train with UTFlen, then converts it with utfUCS.		//
//										//
// Inputs:  aBf	a utf-8 train							//
// Outputs: lBf	an address for a UCS train					//
// Returns: True/False	on failure L.err holds the message; if validation	//
//			failed the output is an empty (zero terminated) train	//
// ============================================================================	//
extern "C" bool _stdcall XM8_UTFtoUCS (char*aBf,long*lBf)
{
    long i = UTFlen(aBf);
    if (i < 0)
    {
	// %ld prints the position; the former %n made sprintf store through
	// the integer argument as if it were a pointer
	sprintf(L.err,"Supplied utf-8 train invalid at position %ld",-i);

	*lBf = 0; // zero terminate

	return false;
    }

    return utfUCS(aBf,lBf);
}

// ============================================================================	//
//				X M 8 _ U T F 8 t o U T F 1 6			//
// ----------------------------------------------------------------------------	//
// Inputs:  aBf	a utf-8 train							//
// Outputs: sBf	an address for a utf-16 train (wide char)			//
// 1st chars of output buffer 0xFE,0xFF ->    BOM &    bigEndian		//
// 1st chars of output buffer 0xFF,0xFE ->    BOM & littleEndian		//
// otherwise				-> no BOM &    bigEndian		//
// ============================================================================	//
extern "C" bool _stdcall XM8_UTF8toUTF16 (char*aBf,ushort*sBf)
{
    bool bigEndian = true; byte*bsBf = (byte*)sBf; char*Bf = aBf;

    if	    (*bsBf++ == 0xFF && *bsBf == 0xFE) {sBf++; bigEndian = false;}
    else if (*bsBf == 0xFF && *--bsBf == 0xFE)  sBf++;

    long lC; ushort wC, xC; byte bwCu, bwCl, bxCu, bxCl;

    bsBf = (byte*)sBf;// begining of train buffer; after BOM, if it exists

    long i = UTFlen(aBf);
    if (i < 0)
    {
	sprintf(L.err,"Supplied utf-8 train invalid at position %n",i);

	*bsBf++ = 0; *bsBf = 0; // zero terminate
	
	return false;
    }

    while(*aBf != 0)
    {
	UTFtoUCS((byte**)&aBf,&lC);

	if	(lC < 0x010000)
	{
	    wC = (ushort)lC; bwCu = wC>>8; bwCl = wC & 0xFF;

	    if (bigEndian)	// ?
	    {			// yes
		*bsBf++ = bwCu; *bsBf++ = bwCl;
	    }
	    else		// littleEndian
	    {   *bsBf++ = bwCl; *bsBf++ = bwCu;
	    }
	}
	else if (lC < 0x110000)
	{
	    lC -= 0x10000;	// reduce to 20 bits

	    wC = 0xD800 | ((lC>>10) & 0x3FF); bwCu = wC>>8; bwCl = wC & 0xFF;
	    xC = 0xDC00 | ( lC      & 0x3FF); bxCu = xC>>8; bxCl = xC & 0xFF;

	    if (bigEndian)	// ?
	    {			// yes
		*bsBf++ = bxCu; *bsBf++ = bxCl; *bsBf++ = bwCu; *bsBf++ = bwCl;
	    }
	    else		// littleEndian
	    {   *bsBf++ = bwCl; *bsBf++ = bwCu; *bsBf++ = bxCl; *bsBf++ = bxCu;
	    }
	}
	else
	{   sprintf(L.err,"Supplied utf-8 train yielded invalid utf16 train at position %d",aBf-Bf);

	    *bsBf++ = 0; *bsBf = 0; // zero terminate
	    
	    return false;
	}
    }

    *bsBf++ = 0; *bsBf = 0; // zero terminate

    return true;
}


By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
United Kingdom United Kingdom
BSc (St.Andrews(1963-67))
MSCE
Systems Programmer 39+yrs
Married to first wife 35yrs & counting, four grown-up children
Religious opinions similar to MelG's
It is not the gnosis, but the praxis must be the fruit. (Aristotle)

Comments and Discussions