Xport: XHTML Parsing and Objective Reporting Toolkit

Mitchel Haas
Rate me:
4.73/5 (10 votes)
4 May 2008 · GPLv3 · 13 min read
60K
682
Open source C++ class template library for generating and parsing xhtml documents.
/************************************************************************
Xport: XHTML Parsing & Objective Reporting Toolkit
Copyright (C) 2007  Mitchel Haas

This file is part of Xport.

Xport is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Xport is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Xport.  If not, see <http://www.gnu.org/licenses/>.

For complete documentation on this library and alternative
licensing options, visit http://www.xportpro.com
Email questions, comments or suggestions to mitchel.haas@xportpro.com
************************************************************************/
#pragma once
#include <string>
#include <locale>
#include <cctype> 
#include <cwctype>
#include <sstream>


/************************************************************************/
/* Character type support                                               */
/************************************************************************/

namespace Xport 
{
  // Encodings a caller can ask for when a byte-order mark is to be emitted
  // (see get_bom_string / get_default_bom).
  enum encoding {
    default_text,
    utf_8,
    utf_16,
    utf_32
  };

  // Result of sniffing the start of an input stream for a byte-order mark.
  enum bom_encoding { 
    bom_undetermined,
    bom_utf_8, 
    bom_utf_16be, 
    bom_utf_16le, 
    bom_utf_32be, 
    bom_utf_32le 
  };

  enum endianness { little_endian, big_endian };
  enum formatter_encoding_option {bom};

  // Narrow-stream overload: intentionally does nothing (conv_str is left untouched).
  inline void convert_endiness(std::basic_istream<char>* input, std::basic_string<char>& conv_str) {}
  // Wide-stream overload: drains *input, byte-swapping every character into conv_str.
  inline void convert_endiness(std::basic_istream<wchar_t>* input, std::basic_string<wchar_t>& conv_str);
  inline bom_encoding determine_bom(std::basic_istream<char>& input);
  inline bom_encoding determine_bom(std::basic_istream<wchar_t>& input);
  inline endianness determine_system_endianness();
  inline void endian_swap(wchar_t& x);
  inline void endian_swap(unsigned int& x);
  inline void get_bom_string(encoding bom_enc, std::basic_string<char>& bom_str);
  inline void get_bom_string(encoding bom_enc, std::basic_string<wchar_t>& bom_str);
  // Default BOM choice for a character type: utf_16 for multi-byte character
  // types, default_text (no BOM) for single-byte ones.
  template<typename CT> inline encoding get_default_bom() { return sizeof(CT) > 1 ? utf_16 : default_text; }
  inline void typed_char_impl(const char ch, char& output);
  inline void typed_char_impl(const char ch, wchar_t& output);
  inline void typed_string_impl(const std::string& str, std::basic_string<wchar_t>& output);
  inline void typed_string_impl(const std::string& str, std::basic_string<char>& output);

  // Strips a byte-order mark from **ppInput and, when the mark says the data
  // is UTF-16 in the opposite byte order to this machine, replaces *ppInput
  // with a heap-allocated string stream holding the byte-swapped content.
  //
  //   ppInput        in/out pointer to the stream being read
  //   created_input  true when the caller heap-allocated *ppInput; in that
  //                  case the old stream is deleted once it has been replaced
  //
  // NOTE(review): created_input is received by value, so this function cannot
  // tell the caller that *ppInput now points at a stream allocated here. A
  // caller has to compare the pointer before and after the call to find out
  // whether it has become responsible for deleting the replacement.
  // NOTE(review): for CT == char convert_endiness is a no-op, so a narrow
  // stream that needs swapping is replaced by an empty stream.
  template<typename CT> 
  inline void process_bom(std::basic_istream<CT>** ppInput, bool created_input)
  {
    // determining bom will strip it
    const bom_encoding enc = determine_bom(**ppInput);
    const endianness sys_en = determine_system_endianness();

    const bool needs_swap =
      (sys_en == little_endian && enc == bom_utf_16be) ||
      (sys_en == big_endian && enc == bom_utf_16le);
    if (!needs_swap) {
      return;
    }

    // need to swap bytes in characters
    std::basic_string<CT> conv_str;
    convert_endiness(*ppInput, conv_str);

    // Build the replacement before releasing the old stream so that *ppInput
    // never dangles if the allocation throws.
    std::basic_istream<CT>* const swapped = new std::basic_istringstream<CT>(conv_str);
    if (created_input) {
      delete *ppInput;
    }
    *ppInput = swapped;
  }


  // Converts a narrow character to character type CT via typed_char_impl.
  template<typename CT> 
  inline CT typed_char(const char ch)
  {
    CT temp;
    typed_char_impl(ch, temp);
    return temp;
  }

  // Converts a narrow string to a basic_string<CT> via typed_string_impl.
  template<typename CT> 
  inline std::basic_string<CT> typed_string(const std::string& str)
  {
    std::basic_string<CT> temp;
    typed_string_impl(str, temp);
    return temp;
  }


}

// Reads every remaining wide character from *input, byte-swaps it with
// endian_swap, and appends the swapped character to conv_str.
// conv_str is appended to, not cleared.
inline void Xport::convert_endiness(std::basic_istream<wchar_t>* input, std::basic_string<wchar_t>& conv_str)
{
  typedef std::istreambuf_iterator<wchar_t> wide_reader;

  wide_reader finish;
  wide_reader cursor(*input);
  while (cursor != finish) {
    wchar_t unit = *cursor;
    endian_swap(unit);
    conv_str.push_back(unit);
    ++cursor;
  }
}


// Detects a byte-order mark at the current position of a narrow stream.
//
// Recognised marks (raw bytes):
//   EF BB BF     -> bom_utf_8
//   FF FE 00 00  -> bom_utf_32le
//   00 00 FE FF  -> bom_utf_32be
//   FE FF        -> bom_utf_16be
//   FF FE        -> bom_utf_16le
//
// When a mark is found, exactly its bytes are consumed. When none is found,
// bom_undetermined is returned and no bytes are consumed. A stream that is
// not good() on entry is left untouched.
//
// Bytes are read unformatted: operator>> would skip whitespace, which is
// document content, and would leave its target unset on an empty stream.
// Bytes read beyond the mark are handed back with unget(), which assumes the
// stream buffer can un-read up to four characters that were just read.
inline Xport::bom_encoding Xport::determine_bom(std::basic_istream<char>& input)
{
  typedef std::char_traits<char> traits;

  if (!input.good()) {
    return bom_undetermined;
  }

  // Cheap rejection: only four byte values can start a mark, so ordinary
  // text is recognised without consuming anything.
  const traits::int_type lead = input.peek();
  if (traits::eq_int_type(lead, traits::eof())) {
    return bom_undetermined;
  }
  switch (traits::to_char_type(lead))
  {
  case '\xEF':
  case '\xFE':
  case '\xFF':
  case '\x00':
    break;

  default:
    return bom_undetermined;
  }

  // Pull in up to four raw bytes; peek() first so a short stream does not
  // raise failbit.
  char bytes[4] = { 0, 0, 0, 0 };
  int available = 0;
  while (available < 4 && !traits::eq_int_type(input.peek(), traits::eof())) {
    input.get(bytes[available]);
    ++available;
  }

  bom_encoding found = bom_undetermined;
  int bom_length = 0;

  // The four-byte UTF-32LE mark is tested before UTF-16LE because it starts
  // with the same two bytes.
  if (available >= 3 && bytes[0] == '\xEF' && bytes[1] == '\xBB' && bytes[2] == '\xBF') {
    found = bom_utf_8;
    bom_length = 3;
  } else if (available == 4 && bytes[0] == '\xFF' && bytes[1] == '\xFE' && bytes[2] == '\x00' && bytes[3] == '\x00') {
    found = bom_utf_32le;
    bom_length = 4;
  } else if (available == 4 && bytes[0] == '\x00' && bytes[1] == '\x00' && bytes[2] == '\xFE' && bytes[3] == '\xFF') {
    found = bom_utf_32be;
    bom_length = 4;
  } else if (available >= 2 && bytes[0] == '\xFE' && bytes[1] == '\xFF') {
    found = bom_utf_16be;
    bom_length = 2;
  } else if (available >= 2 && bytes[0] == '\xFF' && bytes[1] == '\xFE') {
    found = bom_utf_16le;
    bom_length = 2;
  }

  // Hand back every byte that is not part of the mark.
  if (available > bom_length) {
    if (input.eof()) {
      // peek() set eofbit on a short stream; unget() needs a good stream
      input.clear(input.rdstate() & ~std::ios_base::eofbit);
    }
    for (int surplus = available - bom_length; surplus > 0; --surplus) {
      input.unget();
    }
  }

  return found;
}

// Detects a byte-order mark at the current position of a wide stream.
//
// The first wide character is compared against the mark patterns below; the
// interpretation of each pattern depends on the byte order reported by
// determine_system_endianness(). For the UTF-8 patterns only the first wide
// character is examined and consumed.
//
// When a mark is recognised its characters are consumed. Otherwise
// bom_undetermined is returned and the first character is put back, so the
// stream content is not changed. A stream that is not good() on entry is
// left untouched.
//
// Characters are read unformatted: operator>> would skip leading whitespace
// and would leave its target unset on an empty stream. The second character
// is only inspected with peek(), so at most one character ever needs to be
// put back.
inline Xport::bom_encoding Xport::determine_bom(std::basic_istream<wchar_t>& input)
{
  typedef std::char_traits<wchar_t> traits;

  const endianness sys_endian = determine_system_endianness();

  wchar_t first;
  if (!input.good() || !input.get(first)) {
    return bom_undetermined;
  }

  const traits::int_type zero_unit = traits::to_int_type(L'\x0000');

  switch (first) 
  {
  case L'\xBBEF': // UTF-8 on little endian system
    if (sys_endian == little_endian) {
      return bom_utf_8;
    }
    break;

  case L'\xEFBB':  // UTF-8 on big endian system
    if (sys_endian == big_endian) {
      return bom_utf_8;
    }
    break;

  case L'\xFEFF': // UTF-16BE on big endian system, UTF-16LE on little endian system
    // a following zero character on a little endian system means UTF-32LE
    if (sys_endian == little_endian && traits::eq_int_type(input.peek(), zero_unit)) {
      input.get();
      return bom_utf_32le;
    }
    return (sys_endian == big_endian ? bom_utf_16be : bom_utf_16le);

  case L'\xFFFE': // UTF-16LE on a big endian system, UTF-16BE on a little endian system
    // a following zero character on a big endian system means UTF-32LE
    if (sys_endian == big_endian && traits::eq_int_type(input.peek(), zero_unit)) {
      input.get();
      return bom_utf_32le;
    }
    return (sys_endian == big_endian ? bom_utf_16le : bom_utf_16be);

  case 0x00: // possible UTF-32BE
    {
      const traits::int_type second = input.peek();
      const bool is_utf_32be =
        (sys_endian == big_endian && traits::eq_int_type(second, traits::to_int_type(L'\xFEFF'))) ||
        (sys_endian == little_endian && traits::eq_int_type(second, traits::to_int_type(L'\xFFFE')));
      if (is_utf_32be) {
        input.get();
        return bom_utf_32be;
      }
    }
    break;

  default:
    break;
  }

  // Not a mark after all: give the first character back to the stream.
  if (input.eof()) {
    // peek() may have set eofbit; putback() needs a good stream
    input.clear(input.rdstate() & ~std::ios_base::eofbit);
  }
  input.putback(first);

  return bom_undetermined;
}


// Reports the byte order of the machine by checking which byte of a short
// holding the value 1 comes first in memory.
inline Xport::endianness Xport::determine_system_endianness()
{
  short int probe = 0x0001;
  const char* const lowest_address = reinterpret_cast<const char*>(&probe);

  // a non-zero first byte means the least significant byte is stored first
  if (lowest_address[0]) {
    return little_endian;
  }
  return big_endian;
}

// endian_swap courtesy of Kevin Hall
// Swaps the two bytes of a 16-bit code unit held in a wchar_t.
//
// Only the low 16 bits of x take part and the result is confined to 16 bits.
// Without the masks the swap is only right where wchar_t is itself 16 bits
// wide: on a wider wchar_t the left shift is not truncated (0xFFFE would
// become 0xFFFEFF rather than 0xFEFF), and on a signed wchar_t the right
// shift of a negative value is implementation-defined.
// (original endian_swap courtesy of Kevin Hall)
inline void Xport::endian_swap(wchar_t& x)
{
  const unsigned long unit = static_cast<unsigned long>(x) & 0xFFFFUL;
  x = static_cast<wchar_t>(((unit >> 8) | (unit << 8)) & 0xFFFFUL);
}

// Reverses the byte order of x in place; the shifts and masks treat x as a
// 32-bit value.
inline void Xport::endian_swap(unsigned int& x)
{
  const unsigned int lowest_to_top    = x << 24;
  const unsigned int second_to_third  = (x << 8) & 0x00FF0000;
  const unsigned int third_to_second  = (x >> 8) & 0x0000FF00;
  const unsigned int top_to_lowest    = x >> 24;

  x = top_to_lowest | second_to_third | third_to_second | lowest_to_top;
}


// Stores in bom_str the byte-order mark to write in front of narrow output:
// the three bytes EF BB BF for utf_8, an empty string for every other encoding.
inline void Xport::get_bom_string(encoding enc, std::basic_string<char>& bom_str)
{
  if (enc == utf_8) {
    const char utf8_mark[] = { '\xEF', '\xBB', '\xBF' };
    bom_str.assign(utf8_mark, sizeof(utf8_mark));
  } else {
    bom_str.erase();
  }
}

// Stores in bom_str the byte-order mark to write in front of wide output:
//   utf_16 -> the single character 0xFEFF
//   utf_32 -> the two characters 0x0000, 0xFEFF
//   other  -> an empty string
// NOTE(review): the utf_32 sequence is emitted as two wchar_t values in this
// fixed order regardless of wchar_t width or machine byte order; confirm
// this is what the consumers of the output expect.
inline void Xport::get_bom_string(encoding bom_enc, std::basic_string<wchar_t>& bom_str)
{
  std::basic_string<wchar_t> mark;

  if (bom_enc == utf_16) {
    mark += L'\xFEFF';
  } else if (bom_enc == utf_32) {
    mark += L'\x0000';
    mark += L'\xFEFF';
  }

  bom_str = mark;
}


// Narrow-to-narrow case of typed_char: the character is copied unchanged.
inline void Xport::typed_char_impl(const char ch, char& output)
{
  const char same_char = ch;
  output = same_char;
}


// Narrow-to-wide case of typed_char: widens ch with the ctype<wchar_t> facet
// of the current global locale and stores the result in output.
inline void Xport::typed_char_impl(const char ch, wchar_t& output)
{
  typedef std::ctype<wchar_t> CTP;

  // A named copy of the global locale: the facet reference below is only
  // guaranteed to stay valid while some locale object refers to the facet,
  // so it must not be obtained from a temporary.
  const std::locale loc;

  #if defined(_MSC_VER) && _MSC_VER < 1300
    CTP const& ct = std::_USE(loc, CTP);
  #else
    CTP const& ct = std::use_facet<CTP>(loc);
  #endif
  output = ct.widen(ch);
}



// Narrow-to-wide case of typed_string: widens every character of str with
// the ctype<wchar_t> facet of the current global locale and stores the
// result in output (previous content of output is replaced).
inline void Xport::typed_string_impl(const std::string& str, std::basic_string<wchar_t>& output)
{
  typedef std::ctype<wchar_t> CTP;

  // Nothing to widen; also avoids taking &wide[0] of an empty string, which
  // is not defined before C++11.
  if (str.empty()) {
    output.erase();
    return;
  }

  // A named copy of the global locale: the facet reference below is only
  // guaranteed to stay valid while some locale object refers to the facet,
  // so it must not be obtained from a temporary.
  const std::locale loc;

  #if defined(_MSC_VER) && _MSC_VER < 1300
    CTP const& ct = std::_USE(loc, CTP);
  #else
    CTP const& ct = std::use_facet<CTP>(loc);
  #endif

  std::wstring wide;
  wide.resize( str.length() );

  const char* const first = str.data();
  ct.widen(first, first + str.size(), &wide[0]);

  output.swap(wide);
}

// Narrow-to-narrow case of typed_string: output becomes a copy of str.
inline void Xport::typed_string_impl(const std::string& str, std::basic_string<char>& output)
{
  output.assign(str);
}
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.
License

This article, along with any associated source code and files, is licensed under The GNU General Public License (GPLv3)
Xport: XHTML Parsing and Objective Reporting Toolkit

License

Comments and Discussions