|
|||||||||||||||||||||||
|
|||||||||||||||||||||||
|
Announcements
Chapters
Services
Feature Zones
|
Note: This is an unedited contribution. If this article is inappropriate,
needs attention or copies someone else's work without reference then please
Report This Article
Introduction

I needed to convert between a UTF-8 encoded std::string and a UTF-16 encoded std::wstring. Here and there I found conversion functions for native C strings, but these leave the memory handling to the caller, which is not nice in modern times. The best converter is probably the one from unicode.org. Here is a wrapper around it that converts STL strings. Unlike other articles, this one has no other dependencies and does not introduce yet another string class; it only converts the STL strings, and that's it. It is also better than the widely found std::wstring widestring(sourcestring.begin(), sourcestring.end()); which only works for ASCII text.

Source

The header goes like this:

#ifndef UTFCONVERTER__H__
#define UTFCONVERTER__H__
// Conversion helpers between UTF-8 encoded std::string and std::wstring.
// NOTE(review): this header names std::string / std::wstring but includes
// nothing itself, so <string> must already be visible to whoever includes it
// (presumably via the precompiled header) - confirm.
namespace UtfConverter
{
// Returns the wide-string equivalent of the UTF-8 encoded utf8string.
std::wstring FromUtf8(const std::string& utf8string);
// Returns the UTF-8 encoded equivalent of widestring.
std::string ToUtf8(const std::wstring& widestring);
}
#endif
I guess this is simple and easy enough to use. Here is the source code:

#include "stdafx.h"
#include "UtfConverter.h"
#include "ConvertUTF.h"
namespace UtfConverter
{
std::wstring FromUtf8(const std::string& utf8string)
{
size_t widesize = utf8string.length();
if (sizeof(wchar_t) == 2)
{
wchar_t* widestringnative = new wchar_t[widesize+1];
const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
const UTF8* sourceend = sourcestart + widesize;
UTF16* targetstart = reinterpret_cast<UTF16*>(widestringnative);
UTF16* targetend = targetstart + widesize+1;
ConversionResult res = ConvertUTF8toUTF16(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
if (res != conversionOK)
{
delete [] widestringnative;
throw std::exception("La falla!");
}
*targetstart = 0;
std::wstring resultstring(widestringnative);
delete [] widestringnative;
return resultstring;
}
else if (sizeof(wchar_t) == 4)
{
wchar_t* widestringnative = new wchar_t[widesize];
const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
const UTF8* sourceend = sourcestart + widesize;
UTF32* targetstart = reinterpret_cast<UTF32*>(widestringnative);
UTF32* targetend = targetstart + widesize;
ConversionResult res = ConvertUTF8toUTF32(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
if (res != conversionOK)
{
delete [] widestringnative;
throw std::exception("La falla!");
}
*targetstart = 0;
std::wstring resultstring(widestringnative);
delete [] widestringnative;
return resultstring;
}
else
{
throw std::exception("La falla!");
}
return L"";
}
std::string ToUtf8(const std::wstring& widestring)
{
size_t widesize = widestring.length();
if (sizeof(wchar_t) == 2)
{
size_t utf8size = 3 * widesize + 1;
char* utf8stringnative = new char[utf8size];
const UTF16* sourcestart = reinterpret_cast<const UTF16*>(widestring.c_str());
const UTF16* sourceend = sourcestart + widesize;
UTF8* targetstart = reinterpret_cast<UTF8*>(utf8stringnative);
UTF8* targetend = targetstart + utf8size;
ConversionResult res = ConvertUTF16toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
if (res != conversionOK)
{
delete [] utf8stringnative;
throw std::exception("La falla!");
}
*targetstart = 0;
std::string resultstring(utf8stringnative);
delete [] utf8stringnative;
return resultstring;
}
else if (sizeof(wchar_t) == 4)
{
size_t utf8size = 4 * widesize + 1;
char* utf8stringnative = new char[utf8size];
const UTF32* sourcestart = reinterpret_cast<const UTF32*>(widestring.c_str());
const UTF32* sourceend = sourcestart + widesize;
UTF8* targetstart = reinterpret_cast<UTF8*>(utf8stringnative);
UTF8* targetend = targetstart + utf8size;
ConversionResult res = ConvertUTF32toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
if (res != conversionOK)
{
delete [] utf8stringnative;
throw std::exception("La falla!");
}
*targetstart = 0;
std::string resultstring(utf8stringnative);
delete [] utf8stringnative;
return resultstring;
}
else
{
throw std::exception("La falla!");
}
return "";
}
}
How to do it better

Here's another version that avoids using new and delete by writing directly into the string buffer. Does anyone know whether this is okay?

#include "stdafx.h"
#include "UtfConverter.h"
#include "ConvertUTF.h"
namespace UtfConverter
{
std::wstring FromUtf8(const std::string& utf8string)
{
size_t widesize = utf8string.length();
if (sizeof(wchar_t) == 2)
{
std::wstring resultstring;
resultstring.resize(widesize+1, L'\0');
const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
const UTF8* sourceend = sourcestart + widesize;
UTF16* targetstart = reinterpret_cast<UTF16*>(&resultstring[0]);
UTF16* targetend = targetstart + widesize;
ConversionResult res = ConvertUTF8toUTF16(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
if (res != conversionOK)
{
throw std::exception("La falla!");
}
*targetstart = 0;
return resultstring;
}
else if (sizeof(wchar_t) == 4)
{
std::wstring resultstring;
resultstring.resize(widesize+1, L'\0');
const UTF8* sourcestart = reinterpret_cast<const UTF8*>(utf8string.c_str());
const UTF8* sourceend = sourcestart + widesize;
UTF32* targetstart = reinterpret_cast<UTF32*>(&resultstring[0]);
UTF32* targetend = targetstart + widesize;
ConversionResult res = ConvertUTF8toUTF32(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
if (res != conversionOK)
{
throw std::exception("La falla!");
}
*targetstart = 0;
return resultstring;
}
else
{
throw std::exception("La falla!");
}
return L"";
}
std::string ToUtf8(const std::wstring& widestring)
{
size_t widesize = widestring.length();
if (sizeof(wchar_t) == 2)
{
size_t utf8size = 3 * widesize + 1;
std::string resultstring;
resultstring.resize(utf8size, '\0');
const UTF16* sourcestart = reinterpret_cast<const UTF16*>(widestring.c_str());
const UTF16* sourceend = sourcestart + widesize;
UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]);
UTF8* targetend = targetstart + utf8size;
ConversionResult res = ConvertUTF16toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
if (res != conversionOK)
{
throw std::exception("La falla!");
}
*targetstart = 0;
return resultstring;
}
else if (sizeof(wchar_t) == 4)
{
size_t utf8size = 4 * widesize + 1;
std::string resultstring;
resultstring.resize(utf8size, '\0');
const UTF32* sourcestart = reinterpret_cast<const UTF32*>(widestring.c_str());
const UTF32* sourceend = sourcestart + widesize;
UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]);
UTF8* targetend = targetstart + utf8size;
ConversionResult res = ConvertUTF32toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion);
if (res != conversionOK)
{
throw std::exception("La falla!");
}
*targetstart = 0;
return resultstring;
}
else
{
throw std::exception("La falla!");
}
return "";
}
}
How to use it

Just add the files to your project. Download the Unicode converter from http://www.unicode.org/Public/PROGRAMS/CVTUTF/ and add that to the project, too. It should just work. Of course, you can throw whatever exception you like upon failure. I must admit I tried it only with a 2-byte wchar_t. Comments are welcome.
|
||||||||||||||||||||||