/************************************************************************
Xport: XHTML Parsing & Objective Reporting Toolkit
Copyright (C) 2007 Mitchel Haas
This file is part of Xport.
Xport is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Xport is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Xport. If not, see <http://www.gnu.org/licenses/>.
For complete documentation on this library and alternative
licensing options, visit http://www.xportpro.com
Email questions, comments or suggestions to mitchel.haas@xportpro.com
************************************************************************/
#pragma once
#include <string>
#include <locale>
#include <cctype>
#include <cwctype>
#include <sstream>
/************************************************************************/
/* Character type support */
/************************************************************************/
namespace Xport
{
// Text encodings a caller can request when a byte-order mark is produced.
enum encoding
{
    default_text,   // no specific Unicode encoding requested
    utf_8,
    utf_16,
    utf_32
};

// Result of inspecting the leading units of an input stream for a BOM.
enum bom_encoding
{
    bom_undetermined,   // no recognizable byte-order mark was found
    bom_utf_8,
    bom_utf_16be,
    bom_utf_16le,
    bom_utf_32be,
    bom_utf_32le
};

// Byte order of the machine the code is running on.
enum endianness
{
    little_endian,
    big_endian
};

// NOTE(review): presumably selects BOM emission in the formatter; its use is
// not visible in this header -- confirm against the formatter code.
enum formatter_encoding_option
{
    bom
};
// --- byte-order conversion of stream contents ------------------------------
// Narrow-character overload: deliberately a no-op, conv_str is left untouched.
inline void convert_endiness(std::istream* /*input*/, std::string& /*conv_str*/) {}
// Wide-character overload: appends every remaining unit of the stream,
// byte-swapped, to conv_str.
inline void convert_endiness(std::wistream* input, std::wstring& conv_str);

// --- byte-order mark detection ----------------------------------------------
inline bom_encoding determine_bom(std::istream& input);
inline bom_encoding determine_bom(std::wistream& input);
inline endianness determine_system_endianness();

// --- in-place byte swapping -------------------------------------------------
inline void endian_swap(wchar_t& x);
inline void endian_swap(unsigned int& x);

// --- byte-order mark generation ---------------------------------------------
inline void get_bom_string(encoding bom_enc, std::string& bom_str);
inline void get_bom_string(encoding bom_enc, std::wstring& bom_str);

// Default BOM encoding for a character type: utf_16 for any character type
// wider than one byte, default_text for single-byte characters.
template<typename CT>
inline encoding get_default_bom()
{
    if (sizeof(CT) > 1) {
        return utf_16;
    }
    return default_text;
}

// --- narrow -> CT conversion back ends (used by typed_char / typed_string) --
inline void typed_char_impl(const char ch, char& output);
inline void typed_char_impl(const char ch, wchar_t& output);
inline void typed_string_impl(const std::string& str, std::wstring& output);
inline void typed_string_impl(const std::string& str, std::string& output);
template<typename CT>
inline void process_bom(std::basic_istream<CT>** ppInput, bool created_input)
{
// determining bom will strip it
bom_encoding enc = determine_bom(**ppInput);
endianness sys_en = determine_system_endianness();
if (sys_en == little_endian && enc == bom_utf_16be || sys_en == big_endian && enc == bom_utf_16le) {
// need to swap bytes in characters
std::basic_string<CT> conv_str;
convert_endiness(*ppInput, conv_str);
if (created_input) {
delete *ppInput;
}
*ppInput = new std::basic_istringstream<CT>(conv_str);
created_input = true;
}
}
// Converts a narrow character to the character type CT by dispatching to the
// typed_char_impl overload that matches CT.
template<typename CT>
inline CT typed_char(const char ch)
{
    CT converted;
    typed_char_impl(ch, converted);   // overload chosen by the type of 'converted'
    return converted;
}
// Converts a narrow string to std::basic_string<CT> by dispatching to the
// typed_string_impl overload that matches CT.
template<typename CT>
inline std::basic_string<CT> typed_string(const std::string& str)
{
    std::basic_string<CT> converted;
    typed_string_impl(str, converted);   // overload chosen by the type of 'converted'
    return converted;
}
}
// Reads every remaining unit from *input, byte-swaps it with endian_swap()
// and appends the result to conv_str. Existing contents of conv_str are kept.
inline void Xport::convert_endiness(std::basic_istream<wchar_t>* input, std::basic_string<wchar_t>& conv_str)
{
    typedef std::istreambuf_iterator<wchar_t> unit_iterator;

    const unit_iterator finish;        // default-constructed == end of stream
    unit_iterator cursor(*input);
    while (cursor != finish) {
        wchar_t swapped = *cursor;
        endian_swap(swapped);
        conv_str += swapped;
        ++cursor;
    }
}
// Detects and strips a byte-order mark at the current position of a byte
// stream.
//
// Returns the encoding announced by the mark, or bom_undetermined when the
// stream is not readable or does not start with a known mark. Only the bytes
// of a recognized mark are removed from the stream; every other byte that had
// to be examined is handed back, so the caller sees the stream exactly as it
// was (minus the mark).
//
// Recognized marks:
//   EF BB BF      UTF-8
//   FE FF         UTF-16BE
//   FF FE         UTF-16LE
//   00 00 FE FF   UTF-32BE
//   FF FE 00 00   UTF-32LE  (preferred over UTF-16LE when all four bytes match)
inline Xport::bom_encoding Xport::determine_bom(std::basic_istream<char>& input)
{
    typedef std::char_traits<char> traits;

    struct bom_signature
    {
        unsigned char bytes[4];
        int length;
        bom_encoding encoding;
    };
    static const bom_signature signatures[] = {
        { { 0xEF, 0xBB, 0xBF, 0x00 }, 3, bom_utf_8    },
        { { 0xFE, 0xFF, 0x00, 0x00 }, 2, bom_utf_16be },
        { { 0xFF, 0xFE, 0x00, 0x00 }, 2, bom_utf_16le },
        { { 0x00, 0x00, 0xFE, 0xFF }, 4, bom_utf_32be },
        { { 0xFF, 0xFE, 0x00, 0x00 }, 4, bom_utf_32le }
    };
    const int signature_count = static_cast<int>(sizeof(signatures) / sizeof(signatures[0]));
    const int max_bom_length = 4;

    unsigned char seen[4] = { 0, 0, 0, 0 };
    int consumed = 0;                       // bytes taken from the stream so far
    bom_encoding found = bom_undetermined;  // longest complete mark matched so far
    int found_length = 0;

    while (consumed < max_bom_length && input.good()) {
        // peek() is unformatted: it neither skips whitespace nor consumes
        const traits::int_type next = input.peek();
        if (traits::eq_int_type(next, traits::eof())) {
            break;
        }
        // compare as unsigned bytes so 0x80..0xFF match where char is signed
        seen[consumed] = static_cast<unsigned char>(traits::to_char_type(next));

        // is seen[0..consumed] still the beginning of some mark?
        bool viable = false;
        for (int s = 0; s < signature_count; ++s) {
            const bom_signature& sig = signatures[s];
            if (consumed >= sig.length) {
                continue;   // this mark is shorter than what we already hold
            }
            bool prefix_matches = true;
            for (int i = 0; i <= consumed; ++i) {
                if (seen[i] != sig.bytes[i]) {
                    prefix_matches = false;
                    break;
                }
            }
            if (!prefix_matches) {
                continue;
            }
            viable = true;
            if (consumed + 1 == sig.length) {
                found = sig.encoding;
                found_length = sig.length;
            }
        }
        if (!viable) {
            break;          // the peeked byte is ordinary content; leave it
        }
        input.get();
        ++consumed;
    }

    if (consumed > found_length) {
        // Bytes beyond the recognized mark (or all of them when nothing
        // matched) belong to the document: hand them back. A peek at end of
        // input may have set eofbit, which would make unget() fail on older
        // library implementations.
        input.clear(input.rdstate() & ~std::ios_base::eofbit);
        for (int i = consumed; i > found_length; --i) {
            input.unget();
        }
    }
    return found;
}
// Detects and strips a byte-order mark at the current position of a wide
// character stream, interpreting the leading units according to the host
// byte order.
//
// Returns the encoding announced by the mark, or bom_undetermined when the
// stream is not readable or does not start with a recognized unit. Units are
// examined with unformatted reads (no whitespace skipping) and are removed
// from the stream only when they are part of a recognized mark.
//
// NOTE(review): the unit values tested here assume the stream delivers 16-bit
// units. For UTF-8 only the unit holding the first two mark bytes (EF BB) is
// checked and consumed; the third byte (BF) is not examined.
inline Xport::bom_encoding Xport::determine_bom(std::basic_istream<wchar_t>& input)
{
    typedef std::char_traits<wchar_t> traits;

    const endianness sys_endian = determine_system_endianness();
    if (!input.good()) {
        return bom_undetermined;
    }

    const traits::int_type first = input.peek();
    if (traits::eq_int_type(first, traits::eof())) {
        return bom_undetermined;    // empty stream: nothing to inspect
    }

    switch (traits::to_char_type(first))
    {
    case L'\xBBEF': // UTF-8 on little endian system
        if (sys_endian == little_endian) {
            input.get();
            return bom_utf_8;
        }
        break;
    case L'\xEFBB': // UTF-8 on big endian system
        if (sys_endian == big_endian) {
            input.get();
            return bom_utf_8;
        }
        break;
    case L'\xFEFF': // UTF-16BE on big endian system, UTF-16LE on little endian system
        input.get();
        if (sys_endian == little_endian &&
            traits::eq_int_type(input.peek(), traits::to_int_type(L'\x0000'))) {
            // a zero unit right after the mark means UTF-32LE
            input.get();
            return bom_utf_32le;
        }
        return (sys_endian == big_endian ? bom_utf_16be : bom_utf_16le);
    case L'\xFFFE': // UTF-16LE on a big endian system, UTF-16BE on a little endian system
        input.get();
        if (sys_endian == big_endian &&
            traits::eq_int_type(input.peek(), traits::to_int_type(L'\x0000'))) {
            // a zero unit right after the mark means UTF-32LE
            input.get();
            return bom_utf_32le;
        }
        return (sys_endian == big_endian ? bom_utf_16le : bom_utf_16be);
    case L'\x0000': // possible UTF-32BE
        {
            input.get();
            const wchar_t second_unit = (sys_endian == big_endian) ? L'\xFEFF' : L'\xFFFE';
            if (traits::eq_int_type(input.peek(), traits::to_int_type(second_unit))) {
                input.get();
                return bom_utf_32be;
            }
            // Not a mark: give the leading zero unit back. The peek may have
            // set eofbit, which would make unget() fail on older libraries.
            input.clear(input.rdstate() & ~std::ios_base::eofbit);
            input.unget();
        }
        break;
    default:
        // ordinary content; nothing was consumed
        break;
    }
    return bom_undetermined;
}
// Reports the host byte order by checking which byte of a two-byte integer
// holding the value 1 is stored first in memory.
inline Xport::endianness Xport::determine_system_endianness()
{
    const short int probe = 0x0001;
    const char* const first_byte = reinterpret_cast<const char*>(&probe);
    if (*first_byte != 0) {
        return little_endian;   // low-order byte comes first
    }
    return big_endian;
}
// endian_swap courtesy of Kevin Hall
// Swaps the two low-order bytes of a 16-bit code unit held in a wchar_t.
//
// Both bytes are masked before being recombined, so the result is a clean
// 16-bit value even when wchar_t is wider than 16 bits or is a signed type
// (an unmasked shift would leave stray bits above bit 15 and would shift a
// possibly negative value). Any bits above bit 15 in x are discarded.
inline void Xport::endian_swap(wchar_t& x)
{
    const unsigned long unit = static_cast<unsigned long>(x);
    const unsigned long low_byte = unit & 0xFFul;
    const unsigned long high_byte = (unit >> 8) & 0xFFul;
    x = static_cast<wchar_t>((low_byte << 8) | high_byte);
}
// Reverses the byte order of a 32-bit value in place.
inline void Xport::endian_swap(unsigned int& x)
{
    const unsigned int top_to_bottom   = x >> 24;
    const unsigned int low_mid_to_high = (x << 8) & 0x00FF0000;
    const unsigned int high_mid_to_low = (x >> 8) & 0x0000FF00;
    const unsigned int bottom_to_top   = x << 24;
    x = top_to_bottom | low_mid_to_high | high_mid_to_low | bottom_to_top;
}
// Stores the byte-order mark for 'enc' in bom_str as narrow characters.
// Only utf_8 has a mark here (EF BB BF); every other encoding yields an
// empty string. Previous contents of bom_str are replaced.
inline void Xport::get_bom_string(encoding enc, std::basic_string<char>& bom_str)
{
    std::basic_string<char> mark;
    if (enc == utf_8) {
        mark += '\xEF';
        mark += '\xBB';
        mark += '\xBF';
    }
    bom_str = mark;
}
// Stores the byte-order mark for 'bom_enc' in bom_str as wide characters:
// the single unit FEFF for utf_16, the units 0000 FEFF for utf_32, and an
// empty string for every other encoding. Previous contents are replaced.
inline void Xport::get_bom_string(encoding bom_enc, std::basic_string<wchar_t>& bom_str)
{
    std::basic_string<wchar_t> mark;
    if (bom_enc == utf_16) {
        mark += L'\xFEFF';
    } else if (bom_enc == utf_32) {
        mark += L'\x0000';
        mark += L'\xFEFF';
    }
    bom_str = mark;
}
// Narrow-to-narrow overload behind typed_char<char>: the character is
// copied to 'output' unchanged.
inline void Xport::typed_char_impl(const char ch, char& output)
{
output = ch;
}
// Narrow-to-wide overload behind typed_char<wchar_t>: widens 'ch' with the
// std::ctype<wchar_t> facet of the current global locale.
inline void Xport::typed_char_impl(const char ch, wchar_t& output)
{
    const std::locale current;   // default-constructed: copy of the global locale
#if defined(_MSC_VER) && _MSC_VER < 1300
    // old MSVC libraries expose facets through the _USE macro
    const std::ctype<wchar_t>& facet = std::_USE(current, std::ctype<wchar_t>);
#else
    const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t> >(current);
#endif
    output = facet.widen(ch);
}
// Narrow-to-wide overload behind typed_string<wchar_t>: widens every
// character of 'str' with the std::ctype<wchar_t> facet of the current
// global locale and stores the result in 'output' (previous contents are
// replaced).
//
// An empty input is handled up front: taking the address of element 0 of an
// empty non-const string is undefined on pre-C++11 library implementations.
inline void Xport::typed_string_impl(const std::string& str, std::basic_string<wchar_t>& output)
{
    if (str.empty()) {
        output = std::basic_string<wchar_t>();
        return;
    }

    typedef std::ctype<wchar_t> CTP;
    std::wstring wide(str.size(), L'\0');
#if defined(_MSC_VER) && _MSC_VER < 1300
    CTP const& ct = std::_USE(std::locale(), CTP);
#else
    CTP const& ct = std::use_facet<CTP>(std::locale());
#endif
    const char* const first = str.data();
    ct.widen(first, first + str.size(), &wide[0]);
    output = wide;
}
// Narrow-to-narrow overload behind typed_string<char>: the string is copied
// to 'output' unchanged.
inline void Xport::typed_string_impl(const std::string& str, std::basic_string<char>& output)
{
output = str;
}