Click here to Skip to main content
Click here to Skip to main content

Convert Between std::string and std::wstring, UTF-8 and UTF-16

By rh_, 20 May 2007
 

Introduction

I needed to convert between UTF-8 encoded std::string and UTF-16 encoded std::wstring. I found some conversion functions for native C strings, but these leave the memory handling to the caller, which is inconvenient and error-prone in modern C++.

The best converter is probably the one from unicode.org. Here is a wrapper around this one which converts the STL strings.

Unlike other articles, this one has no other dependencies, does not introduce yet another string class, only converts the STL strings, and that's it. And it's better than the widely found...

std::wstring widestring(sourcestring.begin(), sourcestring.end()); 

... which only works for ASCII text.

Source

The header goes like this:

#ifndef UTFCONVERTER__H__
#define UTFCONVERTER__H__

// Pulled in here so the header compiles on its own, no matter what the
// including file happened to include before it.
#include <string>

// Conversions between UTF-8 encoded std::string and wide std::wstring.
// The wide encoding follows the platform's wchar_t: UTF-16 when wchar_t is
// 2 bytes, UTF-32 when it is 4 bytes.
namespace UtfConverter
{
    // Decodes the UTF-8 bytes of utf8string and returns them as a wide string.
    std::wstring FromUtf8(const std::string& utf8string);

    // Encodes widestring as UTF-8 and returns the resulting byte string.
    std::string ToUtf8(const std::wstring& widestring);
}

#endif

I guess this is simple and easy enough to use.

Here is the source code:

#include "stdafx.h"
#include "UtfConverter.h"
#include "ConvertUTF.h"

namespace UtfConverter
{
    std::wstring FromUtf8(const std::string& utf8string)
    {
        size_t widesize = utf8string.length();
        if (sizeof(wchar_t) == 2)
        {
            wchar_t* widestringnative = new wchar_t[widesize+1];
            const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
            const UTF8* sourceend = sourcestart + widesize;
            UTF16* targetstart = reinterpret_cast<UTF16*>(widestringnative);
            UTF16* targetend = targetstart + widesize+1;
            ConversionResult res = ConvertUTF8toUTF16
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                delete [] widestringnative;
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            std::wstring resultstring(widestringnative);
            delete [] widestringnative;
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4)
        {
            wchar_t* widestringnative = new wchar_t[widesize];
            const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
            const UTF8* sourceend = sourcestart + widesize;
            UTF32* targetstart = reinterpret_cast<UTF32*>(widestringnative);
            UTF32* targetend = targetstart + widesize;
            ConversionResult res = ConvertUTF8toUTF32
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                delete [] widestringnative;
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            std::wstring resultstring(widestringnative);
            delete [] widestringnative;
            return resultstring;
        }
        else
        {
            throw std::exception("La falla!");
        }
        return L"";
    }

    std::string ToUtf8(const std::wstring& widestring)
    {
        size_t widesize = widestring.length();

        if (sizeof(wchar_t) == 2)
        {
            size_t utf8size = 3 * widesize + 1;
            char* utf8stringnative = new char[utf8size];
            const UTF16* sourcestart = 
		reinterpret_cast<const UTF16*>(widestring.c_str());
            const UTF16* sourceend = sourcestart + widesize;
            UTF8* targetstart = reinterpret_cast<UTF8*>(utf8stringnative);
            UTF8* targetend = targetstart + utf8size;
            ConversionResult res = ConvertUTF16toUTF8
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                delete [] utf8stringnative;
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            std::string resultstring(utf8stringnative);
            delete [] utf8stringnative;
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4)
        {
            size_t utf8size = 4 * widesize + 1;
            char* utf8stringnative = new char[utf8size];
            const UTF32* sourcestart = 
		reinterpret_cast<const UTF32*>(widestring.c_str());
            const UTF32* sourceend = sourcestart + widesize;
            UTF8* targetstart = reinterpret_cast<UTF8*>(utf8stringnative);
            UTF8* targetend = targetstart + utf8size;
            ConversionResult res = ConvertUTF32toUTF8
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                delete [] utf8stringnative;
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            std::string resultstring(utf8stringnative);
            delete [] utf8stringnative;
            return resultstring;
        }
        else
        {
            throw std::exception("La falla!");
        }
        return "";
    }
} 

How To Do It Better

Here's another version that avoids using new and delete, by writing directly into the string buffer. Does anyone know whether this is okay?

#include "stdafx.h"
#include "UtfConverter.h"
#include "ConvertUTF.h"

namespace UtfConverter
{
    std::wstring FromUtf8(const std::string& utf8string)
    {
        size_t widesize = utf8string.length();
        if (sizeof(wchar_t) == 2)
        {
            std::wstring resultstring;
            resultstring.resize(widesize+1, L'\0');
            const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
            const UTF8* sourceend = sourcestart + widesize;
            UTF16* targetstart = reinterpret_cast<UTF16*>(&resultstring[0]);
            UTF16* targetend = targetstart + widesize;
            ConversionResult res = ConvertUTF8toUTF16
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4)
        {
            std::wstring resultstring;
            resultstring.resize(widesize+1, L'\0');
            const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
            const UTF8* sourceend = sourcestart + widesize;
            UTF32* targetstart = reinterpret_cast<UTF32*>(&resultstring[0]);
            UTF32* targetend = targetstart + widesize;
            ConversionResult res = ConvertUTF8toUTF32
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            return resultstring;
        }
        else
        {
            throw std::exception("La falla!");
        }
        return L"";
    }

    std::string ToUtf8(const std::wstring& widestring)
    {
        size_t widesize = widestring.length();

        if (sizeof(wchar_t) == 2)
        {
            size_t utf8size = 3 * widesize + 1;
            std::string resultstring;
            resultstring.resize(utf8size, '\0');
            const UTF16* sourcestart = 
		reinterpret_cast<const UTF16*>(widestring.c_str());
            const UTF16* sourceend = sourcestart + widesize;
            UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]);
            UTF8* targetend = targetstart + utf8size;
            ConversionResult res = ConvertUTF16toUTF8
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            return resultstring;
        }
        else if (sizeof(wchar_t) == 4)
        {
            size_t utf8size = 4 * widesize + 1;
            std::string resultstring;
            resultstring.resize(utf8size, '\0');
            const UTF32* sourcestart = 
		reinterpret_cast<const UTF32*>(widestring.c_str());
            const UTF32* sourceend = sourcestart + widesize;
            UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]);
            UTF8* targetend = targetstart + utf8size;
            ConversionResult res = ConvertUTF32toUTF8
		(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
            if (res != conversionOK)
            {
                throw std::exception("La falla!");
            }
            *targetstart = 0;
            return resultstring;
        }
        else
        {
            throw std::exception("La falla!");
        }
        return "";
    }
}

How to Use It

Just add it to your project. Download the Unicode converter (ConvertUTF.h and its accompanying source file) from unicode.org and add that to the project, too. It should just work.

Of course, you can throw whatever exceptions you like upon failure.

I must admit I tried it only for 2-byte wchar_t.

Comments are welcome.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

About the Author

rh_
Germany Germany
Member
No Biography provided

Sign Up to vote   Poor Excellent
Add a reason or comment to your vote: x
Votes of 3 or less require a comment

Comments and Discussions

 
Hint: For improved responsiveness ensure Javascript is enabled and choose 'Normal' from the Layout dropdown and hit 'Update'.
You must Sign In to use this message board.
Search this forum  
    Spacing  Noise  Layout  Per page   
GeneralRe: The easiest way to do the same conversionmemberkurt.griffiths9 Nov '11 - 9:09 
Generaltrouble appending string to conversionmemberMember 457434818 Oct '08 - 2:53 
GeneralIncorrect size set in ToUTF8memberDEmberton19 Jun '08 - 2:55 
GeneralRe: Incorrect size set in ToUTF8memberpeterchen21 Aug '08 - 4:47 
GeneralRe: Incorrect size set in ToUTF8memberVite Falcon22 Apr '11 - 10:19 
Generalthank youmemberwipehindy15 Feb '08 - 12:59 
Questionwhat is “L” in:resultstring.resize(widesize+1, L'\0');memberEva ranee6 Jan '08 - 20:34 
GeneralRe: what is “L” in:resultstring.resize(widesize+1, L'\0');memberMircea Puiu6 Jan '08 - 23:41 
GeneralUNICODE is not the same as UTF16memberchristophe.hermier@quickfds.com30 Sep '07 - 21:55 
GeneralCA2T, CA2Wmemberkpnut30 Aug '07 - 4:35 
QuestionIs this OK?mvpStephen Hewitt20 May '07 - 21:12 
QuestionBug in the code?member_ema_10 May '07 - 13:51 
AnswerRe: Bug in the code?memberrh_20 May '07 - 20:32 
GeneralSimplest waymembertracker200213 Feb '07 - 4:06 
GeneralRe: Simplest waymemberrh_13 Feb '07 - 20:04 
GeneralUTF-8 and Multibyte are not the samememberTed Dunlop12 Feb '07 - 5:59 
GeneralRe: UTF-8 and Multibyte are not the samememberrh_13 Feb '07 - 0:21 
GeneralMultiByteToWideChar and WideCharToMultiByte...memberJohann Gerell11 Feb '07 - 19:27 
GeneralRe: MultiByteToWideChar and WideCharToMultiByte...memberrh_11 Feb '07 - 22:37 
NewsRe: MultiByteToWideChar and WideCharToMultiByte...memberJohann Gerell11 Feb '07 - 22:54 
GeneralRe: MultiByteToWideChar and WideCharToMultiByte...memberrh_12 Feb '07 - 1:32 
GeneralRe: MultiByteToWideChar and WideCharToMultiByte...memberJohann Gerell12 Feb '07 - 1:49 
GeneralRe: MultiByteToWideChar and WideCharToMultiByte...memberrh_12 Feb '07 - 2:34 
GeneralRe: MultiByteToWideChar and WideCharToMultiByte...memberBjornar19 Feb '07 - 3:44 
GeneralRe: MultiByteToWideChar and WideCharToMultiByte...memberkonmel7 Nov '07 - 6:53 
GeneralRe: MultiByteToWideChar and WideCharToMultiByte...memberchipmunk7 Aug '08 - 11:25 
GeneralRe: MultiByteToWideChar and WideCharToMultiByte...memberjohnny longxzy26 Jun '09 - 11:33 
AnswerRe: MultiByteToWideChar and WideCharToMultiByte...memberJohann Gerell26 Jun '09 - 12:34 

General General    News News    Suggestion Suggestion    Question Question    Bug Bug    Answer Answer    Joke Joke    Rant Rant    Admin Admin   

Permalink | Advertise | Privacy | Mobile
Web02 | 2.6.130516.1 | Last Updated 21 May 2007
Article Copyright 2007 by rh_
Everything else Copyright © CodeProject, 1999-2013
Terms of Use
Layout: fixed | fluid