Click here to Skip to main content
15,896,201 members
Articles / Programming Languages / C++

UTF-8 UTILITY FUNCTIONS IN C++ (Platform Independent Code)

Rate me:
Please Sign up or sign in to vote.
3.09/5 (14 votes)
6 Apr 2007 | CPOL | 2 min read | 83K | 1.7K | 33
This article describes the basics of UTF-8 and provides some utility functions for handling UTF-8. The code can be compiled for Windows as well as Linux.
/*
* Written by Boby Thomas Pazheparampil. (March 2007.)
* Platform independent code.  (I hope so)
* Tested with Windows 2000, XP, Cygwin and Linux Debian.
*/

#include "utf_functions.h"

/*************************************************************************
* @f Fnct			: EatupWhiteSpace
* @r Return			: Copy of sInput without the blanks, tabs and newlines
						at both ends; "" when nothing else is left.
* Description       : Trims ' ', '\t' and '\n' from both ends of the string.
						Any other character (for example '\r') is kept.
* @author			: Boby thomas
**************************************************************************/
string EatupWhiteSpace(string sInput)
{
	static const char acTrimmed[] = " \t\n";

	// Positions stay in size_type so the npos sentinel is never narrowed to int.
	const string::size_type iStart = sInput.find_first_not_of(acTrimmed);
	if(iStart == string::npos) // Empty, or made only of trimmed characters
		return "";

	// A first kept character exists, therefore a last one exists as well.
	const string::size_type iEnd = sInput.find_last_not_of(acTrimmed);

	return sInput.substr(iStart, iEnd - iStart + 1);
}

/*************************************************************************
* @f Fnct			: hexchar2binary
* @r Return			: Four character binary string ("0000" .. "1111").
* Description       : Convert one hexadecimal digit to its 4 bit binary
						string. Both lower case ('a'-'f') and upper case
						('A'-'F') digits are accepted. Any character that is
						not a hex digit yields "0000".
* @author			: Boby thomas
**************************************************************************/

string hexchar2binary(char c)
{
	// Indexed by the numeric value of the hex digit.
	static const char * const apcNibble[16] =
	{
		"0000", "0001", "0010", "0011",
		"0100", "0101", "0110", "0111",
		"1000", "1001", "1010", "1011",
		"1100", "1101", "1110", "1111"
	};

	int iValue = -1;
	if(c >= '0' && c <= '9')
		iValue = c - '0';
	else if(c >= 'a' && c <= 'f')
		iValue = c - 'a' + 10;
	else if(c >= 'A' && c <= 'F')
		iValue = c - 'A' + 10;

	// Not a hex digit: keep the historical fallback value.
	if(iValue < 0)
		return "0000";

	return apcNibble[iValue];
}

/*************************************************************************
* @f Fnct			: hex2binary
* @r Return			: Binary string without leading zeros; "0" when the
						value is zero or the input is empty.
* Description       : Convert stl string of hex values to a binary string.
						Every input character is expanded with
						hexchar2binary(), then the leading zeros of the
						whole result are removed. The result length is not
						limited.
* @author			: Boby thomas
**************************************************************************/

string hex2binary(string sAscii)
{
	string sBinary;
	sBinary.reserve(sAscii.length() * 4); // four bits per hex digit

	for(string::size_type iCnt = 0; iCnt < sAscii.length(); ++iCnt)
		sBinary += hexchar2binary(sAscii[iCnt]);

	// Position of the most significant set bit (npos when all bits are 0).
	const string::size_type iFirstOne = sBinary.find_first_not_of('0');
	if(iFirstOne == string::npos)
		return "0";

	// Keep everything up to the end, however long the string is.
	return sBinary.substr(iFirstOne);
}


/*************************************************************************
* @f Fnct			: binary8bit2hex
* @r Return			: Lower case hexadecimal string, at least two digits
						long ("00" for an empty or all zero input).
* Description       : Convert stl binary string to a hex value string.
						Meant for binary strings of up to 8 bits.
						A character other than '0' or '1' is reported on
						cout and counted as a 0 bit.
* @author			: Boby thomas
**************************************************************************/

string binary8bit2hex(string sBinary)
{
	static const char acHexDigit[] = "0123456789abcdef";

	// Accumulate most significant bit first with integer arithmetic only.
	unsigned long ulValue = 0;
	for(string::size_type iPos = 0; iPos < sBinary.length(); ++iPos)
	{
		ulValue <<= 1;
		switch(sBinary[iPos])
		{
		case '1':
			ulValue |= 1;
			break;

		case '0':
			break;

		default:
			cout<<"\nInvalid binary data";
		}
	}

	// Emit the hex digits, least significant nibble first.
	string sHex;
	do
	{
		sHex.insert(sHex.begin(), acHexDigit[ulValue & 0xf]);
		ulValue >>= 4;
	} while(ulValue != 0);

	// A byte is always written with two digits.
	if(sHex.length() == 1)
		sHex = "0" + sHex;

	return sHex;
}

/*************************************************************************
* @f Fnct			: binary2hex
* @r Return			: Hexadecimal string, two digits per byte; "00" when
						the input contains no '1'.
* Description       : Convert stl binary string to hex value string.
						Accept binary string of any length. Everything in
						front of the first '1' is dropped, then the string is
						converted 8 bits at a time starting from the right
						with binary8bit2hex().
* @author			: Boby thomas
**************************************************************************/
string binary2hex(string sBinary)
{
	const string::size_type iStart = sBinary.find('1');
	if(iStart == string::npos)
		return "00";

	// Drop the leading part; the rest is kept whatever its length.
	sBinary.erase(0, iStart);

	string sHex;
	while(sBinary.length() > 8)
	{
		// Peel off the lowest remaining octet and put it in front of the result.
		const string::size_type iCut = sBinary.length() - 8;
		sHex = binary8bit2hex(sBinary.substr(iCut)) + sHex;
		sBinary.erase(iCut);
	}

	// What is left (1 to 8 bits) is the most significant byte.
	return binary8bit2hex(sBinary) + sHex;
}


/*************************************************************************
* @f Fnct			: convertHex2UTF
* @r Return			: Hex string of the UTF-8 bytes of a single character,
						or "error" when the value needs more than 31 bits.
* Description     : Convert stl hex character string to corresponding
						UTF character string. Do not misunderstand this function
                  with a stream converter. This function converts only one 
                  character. 
                  For example 
                  "7f" return "7f"
                  "80" return "c280"
                  "fffd" return "efbfbd"

                  Only the leading run of lower case hex digits of the
                  trimmed input is used. Values of up to 31 bits are
                  encoded (lead byte plus at most five continuation
                  bytes); anything larger prints "Too long input..." on
                  cout and returns "error".

* @author			: Boby thomas
**************************************************************************/

string convertHex2UTF(string sHex)
{
	// Keep only the leading run of lower case hex digits.
	sHex = EatupWhiteSpace(sHex);
	const string::size_type iBad = sHex.find_first_not_of("0123456789abcdef");
	if(iBad != string::npos)
		sHex.erase(iBad);

	// The value as a bit string without leading zeros ("0" for zero).
	string sBits = hex2binary(sHex);

	// Up to 7 bits fit in a single byte of the form 0xxxxxxx.
	if(sBits.length() <= 7)
		return binary2hex(sBits);

	// Five continuation bytes (30 bits) plus one bit in the lead byte is the
	// most this scheme can hold. Checked up front instead of relying on the
	// bit counter wrapping around.
	if(sBits.length() > 31)
	{
		cout<<"Too long input...";
		return "error";
	}

	// Number of value bits that still fit in the lead byte. Every
	// continuation byte that is produced costs the lead byte one bit.
	unsigned int iLeadBits = 6;
	string sTail;

	while(iLeadBits < sBits.length())
	{
		// Fewer than six bits left: pad so a whole continuation byte can be cut.
		while(sBits.length() < 6)
			sBits = "0" + sBits;

		// Continuation byte: "10" followed by the lowest six remaining bits.
		sTail = "10" + sBits.substr(sBits.length() - 6) + sTail;
		sBits.erase(sBits.length() - 6);

		iLeadBits--;
	}

	// Lead byte: ones, a single 0 separator, then the remaining value bits.
	while(sBits.length() <= iLeadBits)
		sBits = "0" + sBits;

	while(sBits.length() < 8)
		sBits = "1" + sBits;

	return binary2hex(sBits + sTail);
}

/*************************************************************************
* @f Fnct			: findLengthUTF
* @r Return			: Number of bytes (1 to 6) of the UTF character that
						starts with the given byte.
						-1 for invalid UTF entry, that is a byte which can not
						start a character: a continuation byte (10xxxxxx),
						0xfe or 0xff.
* Description       : Takes the first byte of a UTF character as a hex
						string and returns the length of the character.
						Say for example 0xc2  will return 2 since one more byte 
						following this will constitute the UTF character.
						Only the leading run of lower case hex digits of the
						trimmed input is used. If more than one byte is given,
						the first 8 bits are examined.
* @author			: Boby thomas
**************************************************************************/
long findLengthUTF(string sUTFFirstByte)
{
	// Keep only the leading run of lower case hex digits.
	sUTFFirstByte = EatupWhiteSpace(sUTFFirstByte);
	const string::size_type iBad = sUTFFirstByte.find_first_not_of("0123456789abcdef");
	if(iBad != string::npos)
		sUTFFirstByte.erase(iBad);

	// hex2binary() strips leading zeros, so pad back to a full byte.
	string sBits = hex2binary(sUTFFirstByte);
	while(sBits.length() < 8)
		sBits = "0" + sBits;

	// The number of leading 1 bits of the first byte encodes the length.
	long iOnes = 0;
	while(iOnes < 8 && sBits[iOnes] == '1')
		iOnes++;

	// 0xxxxxxx : single byte character.
	if(iOnes == 0)
		return 1;

	// 10xxxxxx is a continuation byte, 0xfe / 0xff never occur in UTF-8:
	// none of them can start a character.
	if(iOnes == 1 || iOnes > 6)
		return -1;

	return iOnes;
}


/*************************************************************************
* @f Fnct			: convertUTF2Hex
* @r Return			: Hex value corresponding to the UTF character.
						"error" on invalid character.
* Description       : Returns the hex value corresponding to a UTF character.
						Do not misunderstand this function with a stream converter.
                  This function converts only one UTF-8 character. 
                  For example 
                  "7f" return "7f"
                  "c280" return "80"
                  "efbfbd" return "fffd"

                  Only the leading run of lower case hex digits of the
                  trimmed input is used. "error" is returned when the
                  first byte can not start a character (continuation
                  byte, 0xfe, 0xff), when fewer bytes are supplied than
                  the first byte announces, or when a following byte is
                  not of the form 10xxxxxx. Bytes after the first
                  complete character are ignored.

* @author			: Boby thomas
**************************************************************************/
string convertUTF2Hex(string sUTF)
{
	// Keep only the leading run of lower case hex digits.
	sUTF = EatupWhiteSpace(sUTF);
	const string::size_type iBad = sUTF.find_first_not_of("0123456789abcdef");
	if(iBad != string::npos)
		sUTF.erase(iBad);

	// hex2binary() strips leading zeros, so pad back to whole bytes.
	string sBits = hex2binary(sUTF);
	while(sBits.length()%8 != 0)
		sBits = "0" + sBits;

	// The number of leading 1 bits of the first byte encodes the length.
	const string sLead = sBits.substr(0,8);
	unsigned int iOnes = 0;
	while(iOnes < 8 && sLead[iOnes] == '1')
		iOnes++;

	// 0xxxxxxx : the seven low bits are the value.
	if(iOnes == 0)
		return binary2hex(sLead.substr(1));

	// 10xxxxxx is a continuation byte, 0xfe / 0xff never occur in UTF-8:
	// none of them can start a character.
	if(iOnes == 1 || iOnes > 6)
		return "error";

	// The lead byte announces iOnes bytes; all of them must be present.
	if(sBits.length() < (iOnes*8))
		return "error";

	// Value bits of the lead byte follow the run of ones and the 0 separator.
	string sPayload = sLead.substr(iOnes + 1);

	for(unsigned int iByte = 1; iByte < iOnes; ++iByte)
	{
		const string sOctet = sBits.substr(iByte*8,8);

		// Every following byte must look like 10xxxxxx.
		if((sOctet[0] != '1') || (sOctet[1] != '0') )
			return "error";

		sPayload += sOctet.substr(2,6);
	}

	return binary2hex(sPayload);
}


/*************************************************************************
* @f Fnct			: generateUTFFileDetails
* @r Return			: true - file could be a UTF file. 
						(No invalid UTF character in the file)
						false - a file could not be opened, or at least one
						invalid or incomplete UTF character was found.
* Description       : This function evaluates a file for validity. Returns false
						if there is a single occurrence of an impossible character.
						Writes a file utfdetails_<filename> with all the utf 
						character details. The prefix is put in front of
						sFileName exactly as it was passed in.
* @author			: Boby thomas
**************************************************************************/
bool generateUTFFileDetails(string sFileName)
{
	FILE * fpInput = fopen(sFileName.c_str(),"rb");
	if(fpInput == NULL)
	{
		cout<<"Failed to open file "<<sFileName.c_str()<<"\n";
		return false;
	}

	const string sOutput = "utfdetails_" + sFileName;

	FILE * fpOutput = fopen(sOutput.c_str(),"wb");
	if(fpOutput == NULL)
	{
		cout<<"Failed to open output file "<<sOutput.c_str()<<"\n";
		fclose(fpInput); // do not leak the input handle on this path
		return false;
	}

	fprintf(fpOutput,"=================================================================\n");
	fprintf(fpOutput,"        ASCII       ||       BINARY       ||        UTF         \n");
	fprintf(fpOutput,"=================================================================\n");

	bool bSuccess = true;
	long lRemaining = 0;        // bytes still missing from the current character
	char HexBuffer[25] = {0};   // hex text of the byte just read
	string UTFString;           // hex text of the character collected so far
	int iChar;

	while((iChar = fgetc(fpInput)) != EOF)
	{
		// Always two digits, so that concatenated bytes stay unambiguous.
		sprintf(HexBuffer,"%02x",iChar);

		if(lRemaining == 0)
		{
			// This byte starts a new character.
			UTFString = "";
			lRemaining = findLengthUTF(HexBuffer);
			if(lRemaining == -1)
			{
				lRemaining = 0;
				bSuccess = false;
				fprintf(fpOutput,"Invalid UTF character. Not a possible first byte. Binary value:%s\n",HexBuffer);
				continue;
			}
		}

		UTFString += HexBuffer;
		lRemaining--;
		if(lRemaining > 0)
			continue; // character not complete yet

		const string sResult = convertUTF2Hex(UTFString);

		if(sResult.compare("error") == 0)
		{
			bSuccess = false;
			fprintf(fpOutput,"Invalid UTF character. Binary data:%s\n",UTFString.c_str());
		}
		else if(convertHex2UTF(sResult).compare(UTFString) != 0)
		{
			// Encoding the decoded value again must give back the same bytes.
			bSuccess = false;
			fprintf(fpOutput,"Invalid UTF character. Binary data:%s\n",UTFString.c_str());
		}
		else
		{
			// The first column shows the last byte that was read for this character.
			fprintf(fpOutput,"%18c  ||%18s  ||%18s  \n",(char)iChar,UTFString.c_str(),sResult.c_str());
		}
	}

	// End of file in the middle of a character.
	if(lRemaining > 0)
	{
		bSuccess = false;
		fprintf(fpOutput,"Invalid UTF character. Binary data:%s\n",UTFString.c_str());
	}

	fclose(fpInput);
	fclose(fpOutput);

	return bSuccess;

}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer (Senior) DWS
Australia Australia

Comments and Discussions