Click here to Skip to main content
15,879,326 members
Articles / Desktop Programming / MFC
Article

UTF-8 Encoding and Decoding

Rate me:
Please Sign up or sign in to vote.
2.45/5 (28 votes)
22 Oct 20031 min read 602.8K   4.5K   59   30
How to Encode and Decode Text to/from UTF-8

Image 1

Figure : Dialog-based test application showing text encoded and decoded

Introduction

This article shows you how to encode and decode text to/from the UTF-8 encoding format. There are various situations where text characters must be limited to ASCII 127 or lower. The most common is in sending email or using XML. If you send an HTML email to yourself containing characters at ASCII 128 or higher, such as ® (ALT-0174), and you examine the message source, you will see it replaced with =AE (at least in MS Outlook Express). That =XX notation is the quoted-printable transfer encoding, which writes a byte as an equals sign followed by two hexadecimal digits; the functions below first convert each character to its UTF-8 byte sequence and then write every byte of that sequence in =XX form. This allows all characters, including Unicode, to be stored using ASCII 127 or lower. For more information on UTF-8, visit http://www1.tip.nl/~t876506/utf8tbl.html. I hope the link doesn't disappear anytime soon because it is very good, but I don't have time to add any details.

Using the code

The example code is a simple MFC dialog application showing encoding and decoding. The same code works for both ANSI and Unicode builds. The function to encode is:

// Encodes text as UTF-8 bytes written in "=XX" (quoted-printable style) form.
//
// Each character of szSource is treated as a code point:
//   - '=' itself is escaped as "=3D" so the output can be decoded unambiguously;
//   - other values below 128 are copied through unchanged;
//   - values 128..2047 become two escaped bytes, 2048..65535 three.
//
// szSource: NUL-terminated input text; a null pointer yields an empty string.
// Returns:  the encoded text, containing only characters below 128.
//
// In ANSI builds each input byte is used directly as the code point (values
// 0..255). In Unicode builds each UTF-16 code unit is encoded on its own, so
// a surrogate pair comes out as two three-byte sequences.
// NOTE(review): combining a surrogate pair into one four-byte sequence would
// be strict UTF-8, but it changes what consumers of this output must accept,
// so it is left as-is here.
CString EncodeToUTF8(LPCTSTR szSource)
{
  CString sFinal, sTemp;

  if (!szSource)
    return sFinal;

  // Width of one input character. Masking with it undoes the sign extension
  // that happens when plain char is signed (e.g. 0xAE would otherwise turn
  // into 0xFFAE and be written as three bytes instead of two).
  const unsigned int nUnitMask = (sizeof(szSource[0]) == 1) ? 0xFFu : 0xFFFFu;

  const int nMax = static_cast<int>(_tcslen(szSource));

  for (int n = 0; n < nMax; ++n)
  {
    const unsigned int ch = static_cast<unsigned int>(szSource[n]) & nUnitMask;

    if (ch == static_cast<unsigned int>(_T('=')))
    {
      // The escape character must itself be escaped.
      sTemp.Format(_T("=%02X"), ch);
      sFinal += sTemp;
    }
    else if (ch < 0x80)
    {
      sFinal += szSource[n];
    }
    else if (ch <= 0x7FF)
    {
      // 110xxxxx 10xxxxxx
      sTemp.Format(_T("=%02X=%02X"),
                   0xC0u | (ch >> 6),
                   0x80u | (ch & 0x3Fu));
      sFinal += sTemp;
    }
    else
    {
      // 1110xxxx 10xxxxxx 10xxxxxx
      // ch cannot exceed 0xFFFF because of nUnitMask, so no longer form
      // is reachable from a single input character.
      sTemp.Format(_T("=%02X=%02X=%02X"),
                   0xE0u | (ch >> 12),
                   0x80u | ((ch >> 6) & 0x3Fu),
                   0x80u | (ch & 0x3Fu));
      sFinal += sTemp;
    }
  }

  return sFinal;
}

The function to decode is:

CString DecodeFromUTF8(LPCTSTR szSource)
{
  TRACE(_T("\n"));

  int n, nMax = _tcslen(szSource);
  WORD ch;

  CString sFinal, sTemp;

  BYTE z, y, x, w, v, u;
  
  for (n = 0; n < nMax; ++n)
  {
    ch = (WORD)szSource[n];

    if (ch != _T('='))
    {
      sFinal += (TCHAR)ch;
      continue;
    }

    if (n >= nMax - 2) break; // something is wrong
    z = MakeByte(szSource[n+1], szSource[n+2]);

    if (z < 127)
    {
      sFinal += (TCHAR)z;
      n = n + 2;
    }
    else if (z >= 192 && z <= 223)
    {
      // character is two bytes
      if (n >= nMax - 5) break; // something is wrong
      y = MakeByte(szSource[n+4], szSource[n+5]);
      sFinal += (TCHAR)( (z-192)*64 + (y-128) );
      n = n + 5;
    }
    else if (z >= 224 && z <= 239)
    {
      // character is three bytes
      if (n >= nMax - 8) break; // something is wrong
      y = MakeByte(szSource[n+4], szSource[n+5]);
      x = MakeByte(szSource[n+7], szSource[n+8]);
      sFinal += (TCHAR)( (z-224)*4096 + (y-128)*64 + (x-128) );
      n = n + 8;
    }
    else if (z >= 240 && z <= 247)
    {
      // character is four bytes
      if (n >= nMax - 11) break; // something is wrong
      y = MakeByte(szSource[n+4], szSource[n+5]);
      x = MakeByte(szSource[n+7], szSource[n+8]);
      w = MakeByte(szSource[n+10], szSource[n+11]);
      sFinal += (TCHAR)( (z-240)*262144 + (y-128)*4096 + 
             (x-128)*64 + (w-128) );
      n = n + 11;
    }
    else if (z >= 248 && z <= 251)
    {
      // character is four bytes
      if (n >= nMax - 14) break; // something is wrong
      y = MakeByte(szSource[n+4], szSource[n+5]);
      x = MakeByte(szSource[n+7], szSource[n+8]);
      w = MakeByte(szSource[n+10], szSource[n+11]);
      v = MakeByte(szSource[n+13], szSource[n+14]);
      sFinal += (TCHAR)( (z-248)*16777216 + (y-128)*262144 + 
           (x-128)*4096 + (w-128)*64 + (v-128) );
      n = n + 14;
    }
    else if (z >= 252 && z <= 253)
    {
      // character is four bytes
      if (n >= nMax - 17) break; // something is wrong
      y = MakeByte(szSource[n+4], szSource[n+5]);
      x = MakeByte(szSource[n+7], szSource[n+8]);
      w = MakeByte(szSource[n+10], szSource[n+11]);
      v = MakeByte(szSource[n+13], szSource[n+14]);
      u = MakeByte(szSource[n+16], szSource[n+17]);
      sFinal += (TCHAR)( (z-252)*1073741824 + (y-128)*16777216 + 
          (x-128)*262144 + (w-128)*4096 + (v-128)*64 + (u-128) );
      n = n + 17;
    }
    
  }

  return sFinal;
}

The Decode function also requires this helper function MakeByte to convert a two-character pair into the appropriate byte.

// Converts one hexadecimal digit character ('0'-'9', 'A'-'F' or 'a'-'f')
// to its numeric value 0..15. Any other character yields 0.
static BYTE HexDigitValue(TCHAR ch)
{
  if (ch >= _T('0') && ch <= _T('9'))
    return static_cast<BYTE>(ch - _T('0'));
  if (ch >= _T('A') && ch <= _T('F'))
    return static_cast<BYTE>(ch - _T('A') + 10);
  if (ch >= _T('a') && ch <= _T('f'))
    return static_cast<BYTE>(ch - _T('a') + 10);
  return 0;
}

// Helper function for decoding: combines two hexadecimal digit characters
// into one byte.
//
// ch1: digit for the high four bits.
// ch2: digit for the low four bits.
// Returns the byte (ch1 << 4) | ch2. A character that is not a hexadecimal
// digit contributes 0 for its half; both upper- and lowercase letters are
// accepted.
BYTE MakeByte(TCHAR ch1, TCHAR ch2)
{
  return static_cast<BYTE>((HexDigitValue(ch1) << 4) | HexDigitValue(ch2));
}

I apologize that this code was written relatively quickly, and as such could probably be rewritten to run much faster. But that is all there is to it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
President Starpoint Software Inc.
United States United States
Bob Pittenger is founder and President of Starpoint Software Inc. He holds a B.A. degree from Miami University, M.S. and Ph.D. degrees from Purdue University, and an MBA from Xavier University. He has been programming since 1993, starting with Windows application development in C++/MFC and moving to C# and .NET around 2005 and is a .NET Microsoft Certified Professional Developer.

Bob is the author of two books:
Billionaire: How the Ultra-Rich Built Their Fortunes Through Good and Evil and What You Can Learn from Them
and
Wealthonomics: The Most Important Economic and Financial Concepts that Can Make You Rich Fast.
Visit http://www.billionairebook.net for more information.

Comments and Discussions

 
GeneralMy vote of 4 Pin
karnikjain8728-Dec-10 22:33
karnikjain8728-Dec-10 22:33 
GeneralWarning: this is not what UTF-8 looks like Pin
Jeff Roe14-Sep-10 13:11
Jeff Roe14-Sep-10 13:11 
Generalproject problem Pin
Christophe_Pichaud20-Nov-08 22:16
professionalChristophe_Pichaud20-Nov-08 22:16 
QuestionHow can i read data in ANSI format and create with UTF8 ? Pin
suwat110-Nov-08 5:24
suwat110-Nov-08 5:24 
GeneralIn VC++, how do i represent menu items in chinese letters.. Pin
ganesa moorthy27-Mar-08 21:29
ganesa moorthy27-Mar-08 21:29 
Generalfatal error occur Pin
ShilpiP2-Oct-06 23:09
ShilpiP2-Oct-06 23:09 
when i use ur project fatal error occur and it is not working and this project is really very nice and is of my use ....can u help me for this

Yes U Can ...If U Can ,Dream it , U can do it ...ICAN

GeneralRe: fatal error occur Pin
James, Lu Zuheng6-Mar-07 7:11
James, Lu Zuheng6-Mar-07 7:11 
GeneralVersion for visual studio 2005 (regular and express) non MFC Pin
Zulu_kj13-Jul-06 22:17
Zulu_kj13-Jul-06 22:17 
GeneralProblem with UTF8 data Pin
Veera Raghavendra29-Sep-05 23:03
Veera Raghavendra29-Sep-05 23:03 
GeneralI am confused Pin
Ali Tavakol10-Aug-05 2:19
Ali Tavakol10-Aug-05 2:19 
GeneralA more elegant encoding implementation Pin
Anonymous16-Jul-04 0:29
Anonymous16-Jul-04 0:29 
AnswerRe: A more elegant encoding implementation Pin
Dmitriy Sinyagin17-Mar-06 14:47
Dmitriy Sinyagin17-Mar-06 14:47 
Generalvietnamese characters Pin
chinnivenkat14-Mar-04 19:19
chinnivenkat14-Mar-04 19:19 
GeneralDecode nothing Pin
Dominique Aigroz19-Jan-04 23:19
Dominique Aigroz19-Jan-04 23:19 
GeneralRe: Decode nothing Pin
ldaoust7-Jul-04 5:52
ldaoust7-Jul-04 5:52 
GeneralRe: Decode nothing Pin
Dominique Aigroz11-Jul-04 21:46
Dominique Aigroz11-Jul-04 21:46 
GeneralRe: Decode nothing Pin
Dominique Aigroz11-Jul-04 22:25
Dominique Aigroz11-Jul-04 22:25 
GeneralWarning: This UTF-8 Encoding is wrong! Pin
I18n Guy30-Dec-03 21:42
I18n Guy30-Dec-03 21:42 
GeneralRe: Warning: This UTF-8 Encoding is wrong! Pin
Krishnakumar G15-Jul-04 0:50
Krishnakumar G15-Jul-04 0:50 
GeneralRe: Warning: This UTF-8 Encoding is wrong! Pin
I18n Guy15-Jul-04 3:01
I18n Guy15-Jul-04 3:01 
QuestionUTF-8? Quoted printable? Pin
immo28-Oct-03 10:30
immo28-Oct-03 10:30 
AnswerRe: UTF-8? Quoted printable? Pin
daveice31-Mar-06 23:43
daveice31-Mar-06 23:43 
GeneralSimple Way to count UTF-8 bytes Pin
Gregor Brandt23-Oct-03 16:16
Gregor Brandt23-Oct-03 16:16 
GeneralICU Pin
Uwe Keim23-Oct-03 5:31
sitebuilderUwe Keim23-Oct-03 5:31 
GeneralWideCharToMultiByte Pin
Ian Prest23-Oct-03 2:27
sussIan Prest23-Oct-03 2:27 

General General    News News    Suggestion Suggestion    Question Question    Bug Bug    Answer Answer    Joke Joke    Praise Praise    Rant Rant    Admin Admin   

Use Ctrl+Left/Right to switch messages, Ctrl+Up/Down to switch threads, Ctrl+Shift+Left/Right to switch pages.