|
/*
* Written by Boby Thomas Pazheparampil. (March 2007.)
* Platform independent code. (I hope so)
* Tested with Windows 2000, XP, Cygwin and Linux Debian.
*/
#include "utf_functions.h"
/*************************************************************************
* @f Fnct : EatupWhiteSpace
* @r Return : Copy of sInput without leading/trailing blanks; an empty
*             string when sInput contains nothing but blanks.
* Description : Trims spaces, tabs and newlines (' ', '\t', '\n') from
*             both ends of the string. Any other character, including
*             '\r', is treated as content and kept.
* @author : Boby thomas
**************************************************************************/
string EatupWhiteSpace(string sInput)
{
    static const char kBlanks[] = " \t\n";

    // Keep the search result as size_type: narrowing npos to int and
    // comparing against -1 only works by accident of the conversion.
    const string::size_type first = sInput.find_first_not_of(kBlanks);
    if (first == string::npos) // No non-spaces
        return "";

    // A non-blank exists, so the reverse search cannot fail here.
    const string::size_type last = sInput.find_last_not_of(kBlanks);
    return sInput.substr(first, last - first + 1);
}
/*************************************************************************
* @f Fnct : hexchar2binary
* @r Return : Four-character string of '0'/'1' for the given hex digit.
* Description : Converts one hexadecimal digit to its 4-bit binary text.
*             Digits '0'-'9', 'a'-'f' and 'A'-'F' are accepted; any
*             other character yields "0000".
* @author : Boby thomas
**************************************************************************/
string hexchar2binary(char c)
{
    static const char * const kNibbles[16] =
    {
        "0000", "0001", "0010", "0011",
        "0100", "0101", "0110", "0111",
        "1000", "1001", "1010", "1011",
        "1100", "1101", "1110", "1111"
    };

    int iValue;
    if (c >= '0' && c <= '9')
        iValue = c - '0';
    else if (c >= 'a' && c <= 'f')
        iValue = c - 'a' + 10;
    else if (c >= 'A' && c <= 'F')
        iValue = c - 'A' + 10; // upper-case digits used to fall through to "0000"
    else
        return "0000";         // not a hex digit

    return kNibbles[iValue];
}
/*************************************************************************
* @f Fnct : hex2binary
* @r Return : Binary string without leading zeros; "0" when the value is
*             zero or the input is empty.
* Description : Converts a string of hex digits to a string of '0'/'1'
*             characters by expanding every character with
*             hexchar2binary() and then dropping leading zeros.
* @author : Boby thomas
**************************************************************************/
string hex2binary(string sAscii)
{
    string sBinary;
    sBinary.reserve(sAscii.length() * 4); // four bits per hex digit

    for (string::size_type i = 0; i < sAscii.length(); ++i)
        sBinary += hexchar2binary(sAscii[i]);

    const string::size_type firstOne = sBinary.find_first_not_of('0');
    if (firstOne == string::npos)
        return "0";

    // Take everything up to the end; a fixed count such as 9999 would
    // silently truncate long inputs.
    return sBinary.substr(firstOne);
}
/*************************************************************************
* @f Fnct : binary8bit2hex
* @r Return : Lower-case hexadecimal string, at least two digits long.
* Description : Converts a binary string ('0'/'1' characters, most
*             significant bit first) to hex. Intended for one octet,
*             i.e. at most 8 characters, but the length is not enforced;
*             longer input simply produces more hex digits.
*             Any character other than '0' or '1' is reported on cout
*             and counted as a 0 bit.
* @author : Boby thomas
**************************************************************************/
string binary8bit2hex(string sBinary)
{
    // Accumulate with integer shifts instead of floating-point pow():
    // exact, and well defined even if the caller passes a long string
    // (unsigned arithmetic wraps instead of overflowing).
    unsigned long ulValue = 0;
    for (string::size_type i = 0; i < sBinary.length(); ++i)
    {
        ulValue <<= 1;
        switch (sBinary[i])
        {
        case '1':
            ulValue |= 1UL;
            break;
        case '0':
            break;
        default:
            cout<<"\nInvalid binary data";
        }
    }

    // Emit hex digits from least to most significant, prepending each.
    static const char kHexDigits[] = "0123456789abcdef";
    string sHex;
    do
    {
        sHex.insert(sHex.begin(), kHexDigits[ulValue & 0xFUL]);
        ulValue >>= 4;
    } while (ulValue != 0);

    if (sHex.length() == 1)
        sHex = "0" + sHex; // always at least a full byte, e.g. "05"
    return sHex;
}
/*************************************************************************
* @f Fnct : binary2hex
* @r Return : Lower-case hexadecimal string with two digits per octet;
*             "00" when the input contains no '1'.
* Description : Converts a binary string of any length to hex. Everything
*             before the first '1' is discarded, then the rest is cut
*             into octets starting from the right-hand end and each
*             octet is converted with binary8bit2hex().
* @author : Boby thomas
**************************************************************************/
string binary2hex(string sBinary)
{
    const string::size_type firstOne = sBinary.find_first_of('1');
    if (firstOne == string::npos)
        return "00";

    // Drop the leading zeros; no fixed length cap, so long strings are
    // not truncated.
    sBinary = sBinary.substr(firstOne);

    string sHex;
    string::size_type remaining = sBinary.length();
    while (remaining > 8)
    {
        // Peel complete octets off the low end and prepend their hex.
        sHex = binary8bit2hex(sBinary.substr(remaining - 8, 8)) + sHex;
        remaining -= 8;
    }

    // Whatever is left (1..8 bits) forms the most significant byte.
    sHex = binary8bit2hex(sBinary.substr(0, remaining)) + sHex;
    return sHex;
}
/*************************************************************************
* @f Fnct : convertHex2UTF
* @r Return : Hex string of the UTF-8 byte sequence for one character,
*             or "error" when the value needs more than 36 bits.
* Description : Converts the hex code of ONE character to the hex text of
*             its UTF-8 encoding. This is not a stream converter.
*             The input is trimmed with EatupWhiteSpace() and cut at the
*             first character that is not a lower-case hex digit.
*             For example
*             "7f" return "7f"
*             "80" return "c280"
*             "fffd" return "efbfbd"
*             Values of up to 7 bits are returned as a single byte.
*             Larger values use k continuation bytes (k = 1..6), which
*             together with the lead byte carry 5*k + 6 payload bits.
*             Nothing restricts the value to the Unicode range, so
*             sequences longer than four bytes can be produced.
* @author : Boby thomas
**************************************************************************/
string convertHex2UTF(string sHex)
{
    sHex = EatupWhiteSpace(sHex);
    const string::size_type badPos = sHex.find_first_not_of("0123456789abcdef");
    if (badPos != string::npos)
        sHex = sHex.substr(0, badPos);

    // Code point as a bit string without leading zeros.
    string sBits = hex2binary(sHex);

    // Up to 7 bits: the character is its own single-byte encoding.
    if (sBits.length() <= 7)
        return binary2hex(sBits);

    // Pick the smallest number of continuation bytes whose capacity
    // (6 bits each, plus 6-k bits in the lead byte) holds the value.
    const string::size_type kMaxTrailing = 6;
    string::size_type trailing = 1;
    while (trailing <= kMaxTrailing && sBits.length() > 5 * trailing + 6)
        ++trailing;
    if (trailing > kMaxTrailing)
    {
        cout<<"Too long input...";
        return "error";
    }

    // Left-pad to exactly the capacity so the slices below line up.
    const string::size_type capacity = 5 * trailing + 6;
    sBits = string(capacity - sBits.length(), '0') + sBits;

    // Lead byte: (trailing + 1) one bits, a zero, then the top payload bits.
    const string::size_type leadBits = 6 - trailing;
    string sUTFBinary = string(trailing + 1, '1') + "0" + sBits.substr(0, leadBits);

    // Continuation bytes: "10" followed by the next six payload bits.
    for (string::size_type i = 0; i < trailing; ++i)
        sUTFBinary += "10" + sBits.substr(leadBits + 6 * i, 6);

    return binary2hex(sUTFBinary);
}
/*************************************************************************
* @f Fnct : findLengthUTF
* @r Return : Number of bytes in the UTF-8 sequence started by the given
*             byte, or -1 when the byte cannot start a sequence.
* Description : Takes the hex text of the first byte of a UTF-8 character
*             (e.g. "c2") and derives the sequence length from its
*             leading 1 bits:
*             0xxxxxxx           -> 1
*             110xxxxx           -> 2, 1110xxxx -> 3, and so on up to 7
*             10xxxxxx, 11111111 -> -1 (not a possible first byte)
*             The input is trimmed with EatupWhiteSpace() and cut at the
*             first character that is not a lower-case hex digit; only
*             the first 8 bits of the result are examined.
*             Lengths above 4 are not valid in current UTF-8 (RFC 3629)
*             but are still reported, as before.
* @author : Boby thomas
**************************************************************************/
long findLengthUTF(string sUTFFirstByte)
{
    sUTFFirstByte = EatupWhiteSpace(sUTFFirstByte);
    const string::size_type badPos = sUTFFirstByte.find_first_not_of("0123456789abcdef");
    if (badPos != string::npos)
        sUTFFirstByte = sUTFFirstByte.substr(0, badPos);

    // hex2binary() strips leading zeros, so restore a full octet.
    string sBits = hex2binary(sUTFFirstByte);
    if (sBits.length() < 8)
        sBits = string(8 - sBits.length(), '0') + sBits;

    long lLeadingOnes = 0;
    while (lLeadingOnes < 8 && sBits[lLeadingOnes] == '1')
        ++lLeadingOnes;

    if (lLeadingOnes == 8)
        return -1;  // 0xff: no terminating zero bit
    if (lLeadingOnes == 1)
        return -1;  // 10xxxxxx is a continuation byte, never a first byte
    if (lLeadingOnes == 0)
        return 1;   // plain 7-bit character
    return lLeadingOnes;
}
/*************************************************************************
* @f Fnct : convertUTF2Hex
* @r Return : Hex value of the character encoded by the UTF-8 sequence,
*             or "error" when the sequence is malformed.
* Description : Converts the hex text of ONE UTF-8 character to the hex
*             text of its code. This is not a stream converter.
*             The input is trimmed with EatupWhiteSpace() and cut at the
*             first character that is not a lower-case hex digit.
*             For example
*             "7f" return "7f"
*             "c280" return "80"
*             "efbfbd" return "fffd"
*             "error" is returned when
*             - the first byte is 0xff or a continuation byte (10xxxxxx),
*             - fewer bytes are supplied than the first byte announces,
*             - a following byte does not start with the bits "10".
*             Bytes after the announced length are ignored. Overlong
*             encodings are not rejected here.
* @author : Boby thomas
**************************************************************************/
string convertUTF2Hex(string sUTF)
{
    sUTF = EatupWhiteSpace(sUTF);
    const string::size_type badPos = sUTF.find_first_not_of("0123456789abcdef");
    if (badPos != string::npos)
        sUTF = sUTF.substr(0, badPos);

    // hex2binary() strips leading zeros; pad back to whole octets.
    string sBits = hex2binary(sUTF);
    const string::size_type partial = sBits.length() % 8;
    if (partial != 0)
        sBits = string(8 - partial, '0') + sBits;

    // The run of leading 1 bits in the first byte gives the sequence length.
    string::size_type leadingOnes = 0;
    while (leadingOnes < 8 && sBits[leadingOnes] == '1')
        ++leadingOnes;
    if (leadingOnes == 8)
        return "error";     // 0xff
    if (leadingOnes == 1)
        return "error";     // a continuation byte cannot start a character

    const string::size_type byteCount = (leadingOnes == 0) ? 1 : leadingOnes;
    if (sBits.length() < byteCount * 8)
        return "error";     // sequence is cut short

    // Payload of the first byte: everything after the terminating 0 bit.
    string sBinary = sBits.substr(leadingOnes + 1, 8 - (leadingOnes + 1));

    // Each following byte must look like 10xxxxxx and adds six bits.
    for (string::size_type iByte = 1; iByte < byteCount; ++iByte)
    {
        const string sOctet = sBits.substr(iByte * 8, 8);
        if (sOctet[0] != '1' || sOctet[1] != '0')
            return "error";
        sBinary += sOctet.substr(2, 6);
    }

    return binary2hex(sBinary);
}
/*************************************************************************
* @f Fnct : generateUTFFileDetails
* @r Return : true  - every byte sequence in the file decoded as a UTF-8
*                     character and re-encoded to the same bytes.
*             false - a file could not be opened, or at least one invalid
*                     or incomplete sequence was found.
* Description : Reads sFileName byte by byte, groups the bytes into UTF-8
*             sequences and writes one line per character (last byte as
*             a char, hex of the sequence, hex of the decoded value) to
*             the file "utfdetails_" + sFileName. Invalid first bytes,
*             sequences that fail to decode or do not round-trip through
*             convertHex2UTF(), and a sequence cut off by the end of the
*             file are logged to the output file and make the function
*             return false.
* @author : Boby thomas
**************************************************************************/
bool generateUTFFileDetails(string sFileName)
{
    FILE * fpInput = fopen(sFileName.c_str(),"rb");
    if(fpInput == NULL)
    {
        cout<<"Failed to open file "<<sFileName.c_str()<<"\n";
        return false;
    }

    string sOutput = "utfdetails_" + sFileName;
    FILE * fpOutput = fopen(sOutput.c_str(),"wb");
    if(fpOutput == NULL)
    {
        cout<<"Failed to open output file "<<sOutput.c_str()<<"\n";
        fclose(fpInput); // do not leak the input handle on this path
        return false;
    }

    fprintf(fpOutput,"=================================================================\n");
    fprintf(fpOutput," ASCII || BINARY || UTF \n");
    fprintf(fpOutput,"=================================================================\n");

    bool bSuccess = true;
    long lLength = 0;           // bytes still missing from the current character
    char HexBuffer[25] = {0};   // ample room for an int printed with %x
    string UTFString;           // hex text of the bytes collected so far
    int lChar;

    while((lChar = fgetc(fpInput)) != EOF)
    {
        sprintf(HexBuffer,"%x",lChar);

        if(lLength == 0)
        {
            // Start of a new character: the first byte fixes its length.
            UTFString = "";
            lLength = findLengthUTF(HexBuffer);
            if(lLength == -1)
            {
                lLength = 0;
                bSuccess = false;
                fprintf(fpOutput,"Invalid UTF character. Not a possible first byte. Binary value:%s\n",HexBuffer);
                continue;
            }
        }

        UTFString += HexBuffer;
        if(--lLength != 0)
            continue;           // more bytes of this character to come

        // Sequence complete: decode it, then re-encode to catch forms
        // that decode but are not what the encoder would produce.
        const string sResult = convertUTF2Hex(UTFString);
        bool bValid = (sResult.compare("error") != 0);
        if(bValid)
        {
            // Bytes below 0x10 were printed as one hex digit, hence the
            // second comparison against "0" + UTFString.
            const string sRoundTrip = convertHex2UTF(sResult);
            bValid = (sRoundTrip.compare(UTFString) == 0) || (sRoundTrip.compare("0"+UTFString) == 0);
        }

        if(!bValid)
        {
            bSuccess = false;   // an invalid character must fail the whole file
            fprintf(fpOutput,"Invalid UTF character. Binary data:%s\n",UTFString.c_str());
        }
        else
            fprintf(fpOutput,"%18c ||%18s ||%18s \n",(char)lChar,UTFString.c_str(),sResult.c_str());
    }

    if(lLength > 0)
    {
        // End of file in the middle of a multi-byte character.
        bSuccess = false;
        fprintf(fpOutput,"Invalid UTF character. Binary data:%s\n",UTFString.c_str());
    }

    fclose(fpInput);
    fclose(fpOutput);
    return bSuccess;
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.