Click here to Skip to main content
Click here to Skip to main content
Add your own
alternative version

Xport: XHTML Parsing and Objective Reporting Toolkit

, 4 May 2008 GPL3
Open source C++ class template library for generating and parsing xhtml documents.
xport_1.6.1.zip
descendant_markup_iterator.inl
stylesheet.inl
stylesheet_formatter.inl
stylesheet_rule.inl
tag.inl
tag_trait.inl
tag_traits.inl
xhtml_comment.inl
xhtml_doc.inl
xhtml_element.inl
xhtml_formatter.inl
xhtml_frameset.inl
xhtml_markup.inl
xhtml_parser.inl
xhtml_processing_instruction.inl
xhtml_strict.inl
xhtml_transitional.inl
xport_1.6.5.zip
descendant_markup_iterator.inl
stylesheet.inl
stylesheet_formatter.inl
stylesheet_rule.inl
tag.inl
tag_trait.inl
tag_traits.inl
xhtml_comment.inl
xhtml_doc.inl
xhtml_element.inl
xhtml_formatter.inl
xhtml_frameset.inl
xhtml_markup.inl
xhtml_parser.inl
xhtml_processing_instruction.inl
xhtml_strict.inl
xhtml_transitional.inl
xport_1.6.7.zip
descendant_markup_iterator.inl
stylesheet.inl
stylesheet_formatter.inl
stylesheet_rule.inl
tag.inl
tag_trait.inl
tag_traits.inl
xhtml_comment.inl
xhtml_doc.inl
xhtml_element.inl
xhtml_formatter.inl
xhtml_frameset.inl
xhtml_markup.inl
xhtml_parser.inl
xhtml_processing_instruction.inl
xhtml_strict.inl
xhtml_transitional.inl
xport_documentation.zip
Xport.chm
xport_htmlhelp.zip
Xport.chm
/************************************************************************
Xport: XHTML Parsing & Objective Reporting Toolkit
Copyright (C) 2007  Mitchel Haas

This file is part of Xport.

Xport is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Xport is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Xport.  If not, see <http://www.gnu.org/licenses/>.

For complete documentation on this library and alternative
licensing options, visit http://www.xportpro.com
Email questions, comments or suggestions to mitchel.haas@xportpro.com
************************************************************************/
#pragma once
#include <string>
#include <locale>
#include <cctype> 
#include <cwctype>
#include <sstream>


/************************************************************************/
/* Character type support                                               */
/************************************************************************/

namespace Xport 
{
  // Encoding selector accepted by get_bom_string() and returned by
  // get_default_bom().
  enum encoding {
    default_text,
    utf_8,
    utf_16,
    utf_32
  };

  // Result of determine_bom(): which byte order mark, if any, was found at
  // the start of a stream.
  enum bom_encoding { 
    bom_undetermined,
    bom_utf_8, 
    bom_utf_16be, 
    bom_utf_16le, 
    bom_utf_32be, 
    bom_utf_32le 
  };

  // Byte order of the host, as reported by determine_system_endianness().
  enum endianness { little_endian, big_endian };
  // NOTE(review): not used anywhere in this file; presumably an option tag
  // consumed by the formatter classes -- confirm against its users.
  enum formatter_encoding_option {bom};

  // Narrow streams: deliberately a no-op, conv_str is left untouched.
  inline void convert_endiness(std::basic_istream<char>* input, std::basic_string<char>& conv_str) {}
  // Wide streams: appends every remaining character of *input to conv_str
  // after passing it through endian_swap().
  inline void convert_endiness(std::basic_istream<wchar_t>* input, std::basic_string<wchar_t>& conv_str);
  // Inspect the start of a stream for a byte order mark.
  inline bom_encoding determine_bom(std::basic_istream<char>& input);
  inline bom_encoding determine_bom(std::basic_istream<wchar_t>& input);
  // Runtime probe of the host byte order.
  inline endianness determine_system_endianness();
  // In-place byte reversal helpers.
  inline void endian_swap(wchar_t& x);
  inline void endian_swap(unsigned int& x);
  // Store the byte order mark for the requested encoding in bom_str
  // (empty when the overload has no mark for that encoding).
  inline void get_bom_string(encoding bom_enc, std::basic_string<char>& bom_str);
  inline void get_bom_string(encoding bom_enc, std::basic_string<wchar_t>& bom_str);
  // Default BOM encoding for a character type: utf_16 when CT is wider than
  // one byte, default_text otherwise.
  template<typename CT> inline encoding get_default_bom() { return sizeof(CT) > 1 ? utf_16 : default_text; }
  // Overload targets behind typed_char<CT>() and typed_string<CT>().
  inline void typed_char_impl(const char ch, char& output);
  inline void typed_char_impl(const char ch, wchar_t& output);
  inline void typed_string_impl(const std::string& str, std::basic_string<wchar_t>& output);
  inline void typed_string_impl(const std::string& str, std::basic_string<char>& output);

  template<typename CT> 
  inline void process_bom(std::basic_istream<CT>** ppInput, bool created_input)
  {
    // determining bom will strip it
    bom_encoding enc = determine_bom(**ppInput);

    endianness sys_en = determine_system_endianness();

    if (sys_en == little_endian && enc == bom_utf_16be || sys_en == big_endian && enc == bom_utf_16le) {
      // need to swap bytes in characters
      std::basic_string<CT> conv_str;
      convert_endiness(*ppInput, conv_str);
      if (created_input) {
        delete *ppInput;
      }
      *ppInput = new std::basic_istringstream<CT>(conv_str);
      created_input = true;
    }
  }


  // Returns ch converted to character type CT. The conversion itself is
  // done by the typed_char_impl overload selected for CT.
  template<typename CT>
  inline CT typed_char(const char ch)
  {
    CT converted;
    typed_char_impl(ch, converted);
    return converted;
  }

  // Returns str converted to a std::basic_string<CT>. The conversion itself
  // is done by the typed_string_impl overload selected for CT.
  template<typename CT>
  inline std::basic_string<CT> typed_string(const std::string& str)
  {
    std::basic_string<CT> converted;
    typed_string_impl(str, converted);
    return converted;
  }


}

// Reads every remaining character of *input straight from its stream buffer,
// byte-swaps it with endian_swap() and appends it to conv_str. conv_str is
// not cleared first.
inline void Xport::convert_endiness(std::basic_istream<wchar_t>* input, std::basic_string<wchar_t>& conv_str)
{
  typedef std::istreambuf_iterator<wchar_t> unit_iterator;

  const unit_iterator end_of_input;
  unit_iterator cursor(*input);

  while (cursor != end_of_input) {
    wchar_t unit = *cursor;
    endian_swap(unit);
    conv_str += unit;
    ++cursor;
  }
}


// Detects and strips a byte order mark at the start of a narrow stream.
//
// Up to four leading bytes are read and compared against the UTF-32LE,
// UTF-32BE, UTF-8, UTF-16BE and UTF-16LE marks. When a mark is found the
// stream is left positioned just past it; every byte that is not part of a
// recognised mark is pushed back, so a stream without a mark is left where
// it started.
//
// input   stream to inspect. Nothing is read unless input.good().
// returns the detected mark, or bom_undetermined when there is none.
//
// Pushing back up to four bytes relies on the stream buffer supporting
// multi-character putback; if it does not, putback() sets badbit on input.
inline Xport::bom_encoding Xport::determine_bom(std::basic_istream<char>& input)
{
  if (!input.good()) {
    return bom_undetermined;
  }

  // Unformatted get() is used on purpose: operator>> would skip (and lose)
  // leading whitespace. Bytes are kept as unsigned char so that values
  // above 0x7F compare correctly whether or not plain char is signed.
  unsigned char lead[4] = { 0, 0, 0, 0 };
  int lead_count = 0;
  char ch = '\0';
  while (lead_count < 4 && input.get(ch)) {
    lead[lead_count] = static_cast<unsigned char>(ch);
    ++lead_count;
  }

  if (lead_count == 0) {
    // empty stream: leave its eof/fail state as the read left it
    return bom_undetermined;
  }

  bom_encoding detected = bom_undetermined;
  int bom_length = 0;

  // The four-byte marks are tested first because the UTF-32LE mark
  // (FF FE 00 00) begins with the UTF-16LE mark (FF FE).
  if (lead_count == 4 && lead[0] == 0xFF && lead[1] == 0xFE && lead[2] == 0x00 && lead[3] == 0x00) {
    detected = bom_utf_32le;
    bom_length = 4;
  } else if (lead_count == 4 && lead[0] == 0x00 && lead[1] == 0x00 && lead[2] == 0xFE && lead[3] == 0xFF) {
    detected = bom_utf_32be;
    bom_length = 4;
  } else if (lead_count >= 3 && lead[0] == 0xEF && lead[1] == 0xBB && lead[2] == 0xBF) {
    detected = bom_utf_8;
    bom_length = 3;
  } else if (lead_count >= 2 && lead[0] == 0xFE && lead[1] == 0xFF) {
    detected = bom_utf_16be;
    bom_length = 2;
  } else if (lead_count >= 2 && lead[0] == 0xFF && lead[1] == 0xFE) {
    detected = bom_utf_16le;
    bom_length = 2;
  }

  // Running into end of input during the look-ahead sets eofbit/failbit.
  // Drop those (but keep badbit) so putback() works and the stream can
  // still be read by the caller.
  input.clear(input.rdstate() & std::ios_base::badbit);

  // Return the look-ahead bytes that are not part of the mark, last first.
  for (int i = lead_count; i > bom_length; --i) {
    input.putback(static_cast<char>(lead[i - 1]));
  }

  return detected;
}

// Detects and strips a byte order mark at the start of a wide stream.
//
// The first one or two code units are interpreted according to the host
// byte order (see the case comments). When a mark is recognised the stream
// is left positioned after the code units that were consumed for it; in
// every other case the code units that were read are pushed back.
//
// input   stream to inspect. Nothing is read unless input.good().
// returns the detected mark, or bom_undetermined when there is none.
//
// NOTE(review): in the UTF-8 cases only one code unit (the bytes EF BB) is
// consumed; the third mark byte BF is not examined or removed here.
inline Xport::bom_encoding Xport::determine_bom(std::basic_istream<wchar_t>& input)
{
  if (!input.good()) {
    return bom_undetermined;
  }

  const endianness sys_endian = determine_system_endianness();

  // Unformatted get() is used on purpose: operator>> would skip leading
  // whitespace and would leave the variable unset on a failed read.
  wchar_t first = L'\0';
  if (!input.get(first)) {
    // empty stream: leave its eof/fail state as the read left it
    return bom_undetermined;
  }

  wchar_t second = L'\0';

  switch (first)
  {
  case L'\xBBEF': // UTF-8 on little endian system
    if (sys_endian == little_endian) {
      return bom_utf_8;
    }
    break;

  case L'\xEFBB': // UTF-8 on big endian system
    if (sys_endian == big_endian) {
      return bom_utf_8;
    }
    break;

  case L'\xFEFF': // bytes FF FE on a little endian system, FE FF on a big endian system
  case L'\xFFFE': // bytes FF FE on a big endian system, FE FF on a little endian system
    {
      // host order on which this code unit corresponds to the bytes FF FE
      const endianness ff_fe_host = (first == L'\xFEFF') ? little_endian : big_endian;
      if (sys_endian != ff_fe_host) {
        return bom_utf_16be;
      }

      // FF FE is also the first half of the UTF-32LE mark FF FE 00 00
      if (input.get(second)) {
        if (second == L'\x0000') {
          return bom_utf_32le;
        }
        input.putback(second);
      } else {
        // nothing follows the mark: drop eof/fail (keep badbit) so the
        // stream is simply positioned at its end
        input.clear(input.rdstate() & std::ios_base::badbit);
      }
      return bom_utf_16le;
    }

  case L'\x0000': // possible UTF-32BE (00 00 FE FF)
    if (input.get(second)) {
      if ((sys_endian == big_endian && second == L'\xFEFF') ||
          (sys_endian == little_endian && second == L'\xFFFE')) {
        return bom_utf_32be;
      }
      input.putback(second);
    } else {
      // look-ahead hit end of input: drop eof/fail so putback below works
      input.clear(input.rdstate() & std::ios_base::badbit);
    }
    break;

  default:
    break;
  }

  // no mark recognised: give back the first code unit as well
  input.putback(first);
  return bom_undetermined;
}


// Probes the host byte order at run time: stores the value 1 in a short and
// looks at the byte with the lowest address. A non-zero byte there means
// the least significant byte comes first.
inline Xport::endianness Xport::determine_system_endianness()
{
  const short int probe = 0x0001;
  const char* lowest_byte = reinterpret_cast<const char*>(&probe);

  if (*lowest_byte != 0) {
    return little_endian;
  }
  return big_endian;
}

// endian_swap courtesy of Kevin Hall
//
// Swaps the two bytes of the 16-bit code unit held in x.
// Only the low 16 bits of x are used and the result always fits in 16 bits,
// so the swap is also correct where wchar_t is wider than 16 bits or signed
// (an unmasked (x>>8)|(x<<8) would turn 0xFFFE into 0xFFFEFF there).
inline void Xport::endian_swap(wchar_t& x)
{
  const unsigned long unit = static_cast<unsigned long>(x) & 0xFFFFul;
  x = static_cast<wchar_t>(((unit >> 8) | (unit << 8)) & 0xFFFFul);
}

// Reverses the byte order of x in place, using 32-bit shift and mask
// constants.
inline void Xport::endian_swap(unsigned int& x)
{
  const unsigned int top_to_bottom  = x >> 24;
  const unsigned int low_mid_to_high = (x << 8) & 0x00FF0000;
  const unsigned int high_mid_to_low = (x >> 8) & 0x0000FF00;
  const unsigned int bottom_to_top  = x << 24;

  x = top_to_bottom | low_mid_to_high | high_mid_to_low | bottom_to_top;
}


// Stores the narrow-character byte order mark for enc in bom_str.
// Only utf_8 has one here (EF BB BF); every other encoding yields an empty
// string. Any previous content of bom_str is replaced.
inline void Xport::get_bom_string(encoding enc, std::basic_string<char>& bom_str)
{
  if (enc == utf_8) {
    bom_str.assign("\xEF\xBB\xBF", 3);
  } else {
    bom_str.clear();
  }
}

// Stores the wide-character byte order mark for bom_enc in bom_str:
//   utf_16 -> the single code unit 0xFEFF
//   utf_32 -> the two code units 0x0000, 0xFEFF
// Every other encoding yields an empty string. Any previous content of
// bom_str is replaced.
inline void Xport::get_bom_string(encoding bom_enc, std::basic_string<wchar_t>& bom_str)
{
  std::basic_string<wchar_t> mark;

  if (bom_enc == utf_16) {
    mark.push_back(L'\xFEFF');
  } else if (bom_enc == utf_32) {
    mark.push_back(L'\x0000');
    mark.push_back(L'\xFEFF');
  }

  bom_str.swap(mark);
}


// Narrow target: no conversion is needed, ch is copied to output as is.
inline void Xport::typed_char_impl(const char ch, char& output)
{
  output = ch;
}


// Wide target: widens ch with the std::ctype<wchar_t> facet of a
// default-constructed std::locale and stores the result in output.
inline void Xport::typed_char_impl(const char ch, wchar_t& output)
{
  typedef std::ctype<wchar_t> wide_ctype;

  // compilers reporting _MSC_VER below 1300 obtain the facet through _USE
  #if defined(_MSC_VER) && _MSC_VER < 1300
    wide_ctype const& facet = std::_USE(std::locale(), wide_ctype);
  #else
    wide_ctype const& facet = std::use_facet<wide_ctype>(std::locale());
  #endif

  const wchar_t widened = facet.widen(ch);
  output = widened;
}



// Wide target: widens every character of str with the std::ctype<wchar_t>
// facet of a default-constructed std::locale and stores the result in
// output, replacing its previous content. The result has exactly
// str.length() characters.
inline void Xport::typed_string_impl(const std::string& str, std::basic_string<wchar_t>& output)
{
  typedef std::ctype<wchar_t> wide_ctype;

  // destination buffer, one wide character per narrow character
  std::wstring widened(str.length(), L'\0');

  // compilers reporting _MSC_VER below 1300 obtain the facet through _USE
  #if defined(_MSC_VER) && _MSC_VER < 1300
    wide_ctype const& facet = std::_USE(std::locale(), wide_ctype);
  #else
    wide_ctype const& facet = std::use_facet<wide_ctype>(std::locale());
  #endif

  const char* const narrow_begin = &str[0];
  const char* const narrow_end = narrow_begin + str.size();
  facet.widen(narrow_begin, narrow_end, &widened[0]);

  output.swap(widened);
}

// Narrow target: no conversion is needed, str is copied to output as is.
inline void Xport::typed_string_impl(const std::string& str, std::basic_string<char>& output)
{
  output = str;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The GNU General Public License (GPLv3)

Share

About the Author

Mitchel Haas
Software Developer Datasoft Solutions
United States United States
I'm a C++ programmer in the Midwest, now using VC7 at work and at home. I enjoy creating generic libraries and template-based programming.
 
I also enjoy web development (xhtml, css, javascript, php).

| Advertise | Privacy | Terms of Use | Mobile
Web02 | 2.8.141220.1 | Last Updated 4 May 2008
Article Copyright 2008 by Mitchel Haas
Everything else Copyright © CodeProject, 1999-2014
Layout: fixed | fluid