Because UTF-8 encoded characters have a variable length, you have to check each byte as it is read. A possible solution (using a file handle opened in binary mode) would be:
/* One UTF-8 encoded character, kept as its raw bytes. */
typedef struct {
    int nLen;                /* number of bytes of cByte that are in use */
    unsigned char cByte[6];  /* the encoded bytes; the first byte is at index 0 */
} utf8char_t;
/*
 * Read one UTF-8 encoded character from f into tChar.
 *
 * f      stream to read from; it should be opened in binary mode.
 * tChar  receives the raw bytes in cByte and their count in nLen.
 *
 * Returns the number of bytes in the character (1 to 6) on success,
 * 0 if no byte could be read (end of file or read error; use feof()/ferror()
 * to tell them apart), and -1 if the bytes read do not form a structurally
 * valid sequence: a continuation byte where a lead byte is expected, a lead
 * byte announcing more bytes than cByte can hold, a missing or wrong
 * continuation byte, or a sequence cut short by the end of the file.
 * Whenever 0 or -1 is returned, tChar.nLen is 0.
 *
 * Only the structure of the sequence is checked. Overlong encodings,
 * surrogates and code points outside the Unicode range are not rejected, and
 * the legacy 5- and 6-byte forms are accepted because cByte has room for them.
 */
int read_utf8_char(FILE *f, utf8char_t& tChar)
{
    tChar.nLen = 0;

    /* Keep the int result of fgetc(): EOF must be tested before narrowing,
       otherwise it is indistinguishable from a real 0xFF byte. */
    const int first = fgetc(f);
    if (first == EOF)
        return 0;

    const unsigned char lead = static_cast<unsigned char>(first);
    tChar.cByte[0] = lead;

    /* High bit clear: a single-byte (ASCII) character. */
    if ((lead & 0x80) == 0)
    {
        tChar.nLen = 1;
        return 1;
    }

    /* The number of leading 1 bits in the lead byte is the sequence length. */
    int nExpected = 0;
    for (unsigned char bits = lead; bits & 0x80;
         bits = static_cast<unsigned char>(bits << 1))
    {
        ++nExpected;
    }

    /* Exactly one leading 1 bit marks a continuation byte, which cannot start
       a character; more than the buffer size cannot be stored. Reject both
       before consuming any further input. */
    const int nMax = static_cast<int>(sizeof tChar.cByte);
    if (nExpected < 2 || nExpected > nMax)
        return -1;

    for (int i = 1; i < nExpected; ++i)
    {
        const int next = fgetc(f);
        /* Every following byte must look like 10xxxxxx; running out of input
           in the middle of a sequence is treated as malformed data. */
        if (next == EOF || (next & 0xC0) != 0x80)
            return -1;
        tChar.cByte[i] = static_cast<unsigned char>(next);
    }

    tChar.nLen = nExpected;
    return nExpected;
}
Please note that this example does not check for all possible invalid UTF-8 sequences.