// ============================================================================ //
// X M 8 _ U T F s b s . c p p //
// ---------------------------------------------------------------------------- //
// Contains the UTF-8 routines //
// ---------------------------------------------------------------------------- //
// static U T F t o U C S //
// ---------------------------------------------------------------------------- //
// u t f U C S //
// ---------------------------------------------------------------------------- //
// U T F l e n //
// ---------------------------------------------------------------------------- //
// X M 8 _ U T F t o U C S //
// ---------------------------------------------------------------------------- //
// X M 8 _ U T F 8 t o U T F 1 6 //
// ============================================================================ //
#include "XM8_Main.h"
// Each macro below takes the 6 payload bits (mask 0x3F) of continuation byte
// utf[x] and ORs them into the accumulator lC at a fixed position.  Both `utf`
// and `lC` must be in scope at the point of use.  The bit numbers in the
// comments are 1-based, counted from the least significant bit of lC.
#define bitsUTF2530(x) { lC |= ((long)(utf[x] & 0x3F)) << 24; } /* payload -> bits 25-30 */
#define bitsUTF1924(x) { lC |= ((long)(utf[x] & 0x3F)) << 18; } /* payload -> bits 19-24 */
#define bitsUTF1318(x) { lC |= ((long)(utf[x] & 0x3F)) << 12; } /* payload -> bits 13-18 */
#define bitsUTF0712(x) { lC |= ((long)(utf[x] & 0x3F)) <<  6; } /* payload -> bits  7-12 */
// ============================================================================ //
// U T F t o U C S //
// ---------------------------------------------------------------------------- //
// Inputs: utBf pointer to pointer to 1-6 bytes for conversion //
// Output: lChar The UCS character //
// Returns: True/False Success or Failure, N.B. if fail, utBf stepped on //
// ============================================================================ //
// Decodes ONE utf-8 train (1-6 bytes) starting at *utBf into a UCS value.
//   utBf   in/out: advanced past every byte that was accepted. The lead byte is
//          always consumed (so a failure never leaves *utBf where it started);
//          an invalid continuation byte is NOT consumed, so a NUL terminator
//          that cuts a train short is never stepped over.
//   lChar  out: the decoded value; written only once a structurally valid,
//          shortest-form train has been read.
// Returns false when:
//   - the lead byte is a stray continuation byte (10xxxxxx) or 1111111x,
//   - a continuation byte is not of the form 10xxxxxx (includes truncation),
//   - the train is overlong (the value would fit in a shorter train),
//   - UNICODE is non-zero and the value exceeds 0x10FFFF (value is still stored).
// UNICODE is defined outside this block; it is only tested for truth here.
static bool UTFtoUCS (byte**utBf, long*lChar)
{
    // smallest value that legitimately needs a train of N bytes (index = N)
    static const long minUCS[7] = { 0, 0, 0x00000080, 0x00000800, 0x00010000, 0x00200000, 0x04000000 };

    byte bC = *(*utBf)++;                              // lead byte, always consumed
    int  nB;                                           // total bytes in this train
    long lC;                                           // accumulator for the payload bits

    if      (!(bC & 0x80)) { nB = 1; lC = bC;        } // 0xxxxxxx  1 byte train
    else if (!(bC & 0x40)) return false;               // 10xxxxxx  continuation byte cannot lead
    else if (!(bC & 0x20)) { nB = 2; lC = bC & 0x1F; } // 110xxxxx
    else if (!(bC & 0x10)) { nB = 3; lC = bC & 0x0F; } // 1110xxxx
    else if (!(bC & 0x08)) { nB = 4; lC = bC & 0x07; } // 11110xxx
    else if (!(bC & 0x04)) { nB = 5; lC = bC & 0x03; } // 111110xx
    else if (!(bC & 0x02)) { nB = 6; lC = bC & 0x01; } // 1111110x
    else return false;                                 // 1111111x  invalid

    for (int k = 1; k < nB; k++)
    {
        byte cC = **utBf;                              // peek, do not consume yet
        if ((cC & 0xC0) != 0x80) return false;         // not 10xxxxxx (e.g. terminator), invalid
        ++*utBf;
        lC = (lC << 6) | (cC & 0x3F);                  // max result is 0x7FFFFFFF, no overflow
    }

    if (lC < minUCS[nB]) return false;                 // overlong (non-shortest) form

    *lChar = lC;
    if (UNICODE && lC > 0x0010FFFF) return false;      // beyond the last Unicode code point
    return true;
}
// ============================================================================ //
// u t f U C S //
// ---------------------------------------------------------------------------- //
// Input: aBf convert the supplied utf-8 train into a UCS train //
// Output: oBf an address for a UCS train //
// ============================================================================ //
// Converts the NUL-terminated utf-8 train aBf into a zero-terminated train of
// UCS values at oBf (one long per character). The caller supplies oBf; its
// size is not checked here.
// Returns true on success. On the first train UTFtoUCS rejects, an error text
// is written to L.err, the output is zero-terminated directly after the last
// successfully converted character, and false is returned.
extern bool utfUCS (char*aBf,long*oBf)
{
    while (*aBf != '\0')
    {
        // UTFtoUCS also handles plain single-byte characters, so no separate
        // fast path is needed (the old one relied on plain char being signed).
        if (!UTFtoUCS((byte**)&aBf,oBf))
        {
            sprintf(L.err,"Supplied utf-8 train invalid UNICODE");
            *oBf = 0;                                   // zero terminate over the failed slot
            return false;
        }
        ++oBf;                                          // advance only after a good conversion
    }
    *oBf = 0;                                           // zero terminate
    return true;
}
// ============================================================================ //
// U T F l e n //
// ---------------------------------------------------------------------------- //
// Input: aBf a utf-8 train //
// Returns: number of CHARACTERS in train, or the negated byte offset reached when decoding failed //
// ============================================================================ //
// Counts the characters (not bytes) in the NUL-terminated utf-8 train aBf.
// Returns the character count, or, when UTFtoUCS rejects a train, the
// difference (aBf - current position), which is negative because the decoder
// has already moved forward from aBf.
extern long UTFlen (char*aBf)
{
    char*cur   = aBf;
    long count = 0;
    long ucs;                                           // decoded value, not used further

    // leading run of characters that compare greater than '\0' as plain char
    for (; *cur > '\0'; ++cur) ++count;

    // from the first other byte onwards every character goes through the decoder
    while (*cur != '\0')
    {
        if (!UTFtoUCS((byte**)&cur,&ucs)) return (aBf-cur);
        ++count;
    }
    return count;
}
// ============================================================================ //
// X M 8 _ U T F t o U C S //
// ---------------------------------------------------------------------------- //
// Inputs: aBf a utf-8 train //
// Outputs: lBf an address for a UCS train //
// ============================================================================ //
// Exported wrapper: validates the utf-8 train aBf with UTFlen, then converts
// it to a zero-terminated UCS train at lBf via utfUCS.
// Returns false (with a message in L.err and an empty output train) when
// UTFlen reports a failure, otherwise whatever utfUCS returns.
extern "C" bool _stdcall XM8_UTFtoUCS (char*aBf,long*lBf)
{
    long i = UTFlen(aBf);
    if (i < 0)                                          // negative => invalid train
    {
        // %ld prints the position; the former %n expects a pointer to write
        // through and is undefined behaviour with a long argument.
        sprintf(L.err,"Supplied utf-8 train invalid at position %ld",-i);
        *lBf = 0;                                       // zero terminate
        return false;
    }
    return utfUCS(aBf,lBf);
}
// ============================================================================ //
// X M 8 _ U T F 8 t o U T F 1 6 //
// ---------------------------------------------------------------------------- //
// Inputs: aBf a utf-8 train //
// Outputs: sBf an address for a utf-16 train (wide char) //
// 1st chars of output buffer 0xFE,0xFF -> BOM & bigEndian //
// 1st chars of output buffer 0xFF,0xFE -> BOM & littleEndian //
// otherwise -> no BOM & bigEndian //
// ============================================================================ //
// Exported: converts the NUL-terminated utf-8 train aBf to a utf-16 train at sBf.
// The first two bytes ALREADY in the output buffer select the byte order:
//   0xFF,0xFE -> BOM kept, output is little-endian
//   0xFE,0xFF -> BOM kept, output is big-endian
//   otherwise -> no BOM, output is big-endian, written from the buffer start
// The output is terminated with a 16-bit zero. The caller supplies sBf; its
// size is not checked here.
// Returns false (message in L.err, output zero-terminated) if UTFlen rejects
// the input or a decoded value is 0x110000 or above.
extern "C" bool _stdcall XM8_UTF8toUTF16 (char*aBf,ushort*sBf)
{
    bool  bigEndian = true;
    char* Bf        = aBf;                              // start of input, for error positions
    byte* bsBf      = (byte*)sBf;

    // probe the caller-supplied BOM; if present, keep it and write after it
    if      (bsBf[0] == 0xFF && bsBf[1] == 0xFE) { sBf++; bigEndian = false; }
    else if (bsBf[0] == 0xFE && bsBf[1] == 0xFF) { sBf++; }

    long lC = 0; ushort wC, xC; byte bwCu, bwCl, bxCu, bxCl;
    bsBf = (byte*)sBf;                                  // beginning of train buffer; after BOM, if it exists

    long i = UTFlen(aBf);
    if (i < 0)
    {
        // %ld with the negated result, matching XM8_UTFtoUCS (was %n, undefined behaviour)
        sprintf(L.err,"Supplied utf-8 train invalid at position %ld",-i);
        *bsBf++ = 0; *bsBf = 0;                         // zero terminate
        return false;
    }
    while(*aBf != 0)
    {
        // result deliberately not tested: UTFlen has just accepted this train,
        // and the range of lC is checked below
        UTFtoUCS((byte**)&aBf,&lC);
        if (lC < 0x010000)                              // fits one 16-bit unit
        {
            wC = (ushort)lC; bwCu = (byte)(wC>>8); bwCl = (byte)(wC & 0xFF);
            if (bigEndian) { *bsBf++ = bwCu; *bsBf++ = bwCl; }
            else           { *bsBf++ = bwCl; *bsBf++ = bwCu; }
        }
        else if (lC < 0x110000)                         // needs a surrogate pair
        {
            lC -= 0x10000;                              // reduce to 20 bits
            wC = (ushort)(0xD800 | ((lC>>10) & 0x3FF)); bwCu = (byte)(wC>>8); bwCl = (byte)(wC & 0xFF); // high surrogate
            xC = (ushort)(0xDC00 | ( lC      & 0x3FF)); bxCu = (byte)(xC>>8); bxCl = (byte)(xC & 0xFF); // low surrogate
            // the high surrogate always comes first; only the byte order
            // inside each 16-bit unit depends on endianness
            if (bigEndian) { *bsBf++ = bwCu; *bsBf++ = bwCl; *bsBf++ = bxCu; *bsBf++ = bxCl; }
            else           { *bsBf++ = bwCl; *bsBf++ = bwCu; *bsBf++ = bxCl; *bsBf++ = bxCu; }
        }
        else
        {
            sprintf(L.err,"Supplied utf-8 train yielded invalid utf16 train at position %ld",(long)(aBf-Bf));
            *bsBf++ = 0; *bsBf = 0;                     // zero terminate
            return false;
        }
    }
    *bsBf++ = 0; *bsBf = 0;                             // zero terminate
    return true;
}