Click here to Skip to main content
15,896,375 members
Articles / Programming Languages / C++

Wave: a Standard conformant C++ preprocessor library

Rate me:
Please Sign up or sign in to vote.
4.96/5 (58 votes)
10 Jan 2004 · 13 min read   403.1K   4.4K   81  
Describes a free and fully Standard conformant C++ preprocessor library
/*=============================================================================
    Wave: A Standard compliant C++ preprocessor

    Grammar for universal character validation (see C++ standard: Annex E)
    
    Copyright (c) 2001-2003 Hartmut Kaiser
    http://spirit.sourceforge.net/

    Permission to copy, use, modify, sell and distribute this software
    is granted provided this copyright notice appears in all copies.
    This software is provided "as is" without express or implied
    warranty, and with no claim as to its suitability for any purpose.

    See Copyright.txt for full copyright notices and acknowledgements.
=============================================================================*/
#if !defined(VALIDATE_UNIVERSAL_CHAR_HPP_55F1B811_CD76_4C72_8344_CBC69CF3B339_INCLUDED)
#define VALIDATE_UNIVERSAL_CHAR_HPP_55F1B811_CD76_4C72_8344_CBC69CF3B339_INCLUDED

#include <boost/spirit/core.hpp>

#include "wave/util/file_position.hpp"
#include "wave/cpplexer/cpplexer_exceptions.hpp"

///////////////////////////////////////////////////////////////////////////////
namespace wave {
namespace cpplexer {
namespace impl {

///////////////////////////////////////////////////////////////////////////////
//
//  universal_char_type
//
//      Result codes of the classification of a universal character value
//      (see classify_universal_char). The numeric values are fixed 
//      explicitly.
//
//      Note: there is intentionally no comma after the last enumerator, a 
//      trailing comma is not accepted by C++98/C++03 compilers.
//
///////////////////////////////////////////////////////////////////////////////
enum universal_char_type {
    // the value may be used inside identifiers
    universal_char_type_valid = 0,
    // the value must not be written as a universal character at all
    universal_char_type_invalid = 1,
    // the value designates a character from the base character set
    universal_char_type_base_charset = 2,
    // the value is not allowed inside identifiers
    universal_char_type_not_allowed_for_identifiers = 3
};

namespace {

    ///////////////////////////////////////////////////////////////////////////
    // 
    //  in_range tests, whether the value ch lies inside the closed interval 
    //  [l, u] (both bounds are part of the interval). It is the helper 
    //  function for the classification by brute force below.
    //
    ///////////////////////////////////////////////////////////////////////////
    inline bool 
    in_range(unsigned long ch, unsigned long l, unsigned long u)
    {
        if (ch < l)
            return false;   // below the lower bound
        return !(u < ch);   // inside, as long as not above the upper bound
    }
}

///////////////////////////////////////////////////////////////////////////////
//
//  classify_universal_char
//
//      This function classifies a universal character value into 4 subranges:
//      universal_char_type_valid
//          the universal character value is valid for identifiers, i.e. it
//          is contained in one of the ranges listed in the C++ Standard: 
//          Annex E
//      universal_char_type_invalid
//          the universal character value is not valid at all (it is not 
//          above 0x0020 or lies in the range 0x007f-0x009f, see C++ Standard: 
//          2.2.2 [lex.charset])
//      universal_char_type_base_charset
//          the universal character value designates a character from the base
//          character set (0x0021-0x005f or 0x0061-0x007e)
//      universal_char_type_not_allowed_for_identifiers
//          the universal character value is none of the above, it is not 
//          allowed in an identifier
//
//      The tests are executed in the order given below, the first matching 
//      test determines the result.
//
//  Implementation note:
//      This classification isn't implemented very effectively here. This
//      function should be rewritten with some range run matching algorithm.
//      
///////////////////////////////////////////////////////////////////////////////
inline universal_char_type 
classify_universal_char (unsigned long ch)
{
// test for invalid characters
    if (ch <= 0x0020 || in_range(ch, 0x007f, 0x009f))
        return universal_char_type_invalid;
    
// test for characters in the range of the base character set
    if (in_range(ch, 0x0021, 0x005f) || in_range(ch, 0x0061, 0x007e))
        return universal_char_type_base_charset;

// test for additional valid character values (see C++ Standard: Annex E)
    if (in_range(ch, 0x00c0, 0x00d6) || in_range(ch, 0x00d8, 0x00f6) ||
        in_range(ch, 0x00f8, 0x01f5) || in_range(ch, 0x01fa, 0x0217) ||
        in_range(ch, 0x0250, 0x02a8) || in_range(ch, 0x1e00, 0x1e9a) ||
        in_range(ch, 0x1ea0, 0x1ef9))
    {
        return universal_char_type_valid;   // Latin
    }
    
    if (0x0384 == ch || in_range(ch, 0x0388, 0x038a) ||
        0x038c == ch || in_range(ch, 0x038e, 0x03a1) ||
        in_range(ch, 0x03a3, 0x03ce) || in_range(ch, 0x03d0, 0x03d6) ||
        0x03da == ch || 0x03dc == ch || 0x03de == ch || 0x03e0 == ch ||
        in_range(ch, 0x03e2, 0x03f3) || in_range(ch, 0x1f00, 0x1f15) ||
        in_range(ch, 0x1f18, 0x1f1d) || in_range(ch, 0x1f20, 0x1f45) ||
        in_range(ch, 0x1f48, 0x1f4d) || in_range(ch, 0x1f50, 0x1f57) ||
        0x1f59 == ch || 0x1f5b == ch || 0x1f5d == ch || 
        in_range(ch, 0x1f5f, 0x1f7d) || in_range(ch, 0x1f80, 0x1fb4) ||
        in_range(ch, 0x1fb6, 0x1fbc) || in_range(ch, 0x1fc2, 0x1fc4) ||
        in_range(ch, 0x1fc6, 0x1fcc) || in_range(ch, 0x1fd0, 0x1fd3) ||
        in_range(ch, 0x1fd6, 0x1fdb) || in_range(ch, 0x1fe0, 0x1fec) ||
        in_range(ch, 0x1ff2, 0x1ff4) || in_range(ch, 0x1ff6, 0x1ffc))
    {
        return universal_char_type_valid;   // Greek
    }
    
    if (in_range(ch, 0x0401, 0x040d) || in_range(ch, 0x040f, 0x044f) ||
        in_range(ch, 0x0451, 0x045c) || in_range(ch, 0x045e, 0x0481) ||
        in_range(ch, 0x0490, 0x04c4) || in_range(ch, 0x04c7, 0x04c8) ||
        in_range(ch, 0x04cb, 0x04cc) || in_range(ch, 0x04d0, 0x04eb) ||
        in_range(ch, 0x04ee, 0x04f5) || in_range(ch, 0x04f8, 0x04f9))
    {
        return universal_char_type_valid;   // Cyrillic
    }
    
    if (in_range(ch, 0x0531, 0x0556) || in_range(ch, 0x0561, 0x0587))
        return universal_char_type_valid;   // Armenian

    if (in_range(ch, 0x05d0, 0x05ea) || in_range(ch, 0x05f0, 0x05f4))
        return universal_char_type_valid;   // Hebrew
        
    if (in_range(ch, 0x0621, 0x063a) || in_range(ch, 0x0640, 0x0652) ||
        in_range(ch, 0x0670, 0x06b7) || in_range(ch, 0x06ba, 0x06be) ||
        in_range(ch, 0x06c0, 0x06ce) || in_range(ch, 0x06e5, 0x06e7))
    {
        return universal_char_type_valid;   // Arabic
    }
    
    if (in_range(ch, 0x0905, 0x0939) || in_range(ch, 0x0958, 0x0962))
        return universal_char_type_valid;   // Devanagari

    if (in_range(ch, 0x0985, 0x098c) || in_range(ch, 0x098f, 0x0990) ||
        in_range(ch, 0x0993, 0x09a8) || in_range(ch, 0x09aa, 0x09b0) ||
        0x09b2 == ch || in_range(ch, 0x09b6, 0x09b9) ||
        in_range(ch, 0x09dc, 0x09dd) || in_range(ch, 0x09df, 0x09e1) ||
        in_range(ch, 0x09f0, 0x09f1))
    {
        return universal_char_type_valid;   // Bengali
    }
    
    if (in_range(ch, 0x0a05, 0x0a0a) || in_range(ch, 0x0a0f, 0x0a10) ||
        in_range(ch, 0x0a13, 0x0a28) || in_range(ch, 0x0a2a, 0x0a30) ||
        in_range(ch, 0x0a32, 0x0a33) || in_range(ch, 0x0a35, 0x0a36) ||
        in_range(ch, 0x0a38, 0x0a39) || in_range(ch, 0x0a59, 0x0a5c) ||
        0x0a5e == ch)
    {
        return universal_char_type_valid;   // Gurmukhi
    }

    if (in_range(ch, 0x0a85, 0x0a8b) || 0x0a8d == ch ||
        in_range(ch, 0x0a8f, 0x0a91) || in_range(ch, 0x0a93, 0x0aa8) ||
        in_range(ch, 0x0aaa, 0x0ab0) || in_range(ch, 0x0ab2, 0x0ab3) ||
        in_range(ch, 0x0ab5, 0x0ab9) || 0x0ae0 == ch)
    {
        return universal_char_type_valid;   // Gujarati
    }

    if (in_range(ch, 0x0b05, 0x0b0c) || in_range(ch, 0x0b0f, 0x0b10) ||
        in_range(ch, 0x0b13, 0x0b28) || in_range(ch, 0x0b2a, 0x0b30) ||
        in_range(ch, 0x0b32, 0x0b33) || in_range(ch, 0x0b36, 0x0b39) ||
        in_range(ch, 0x0b5c, 0x0b5d) || in_range(ch, 0x0b5f, 0x0b61))
    {
        return universal_char_type_valid;   // Oriya
    }

    if (in_range(ch, 0x0b85, 0x0b8a) || in_range(ch, 0x0b8e, 0x0b90) ||
        in_range(ch, 0x0b92, 0x0b95) || in_range(ch, 0x0b99, 0x0b9a) ||
        0x0b9c == ch || in_range(ch, 0x0b9e, 0x0b9f) ||
        in_range(ch, 0x0ba3, 0x0ba4) || in_range(ch, 0x0ba8, 0x0baa) ||
        in_range(ch, 0x0bae, 0x0bb5) || in_range(ch, 0x0bb7, 0x0bb9))
    {
        return universal_char_type_valid;   // Tamil
    }

    if (in_range(ch, 0x0c05, 0x0c0c) || in_range(ch, 0x0c0e, 0x0c10) ||
        in_range(ch, 0x0c12, 0x0c28) || in_range(ch, 0x0c2a, 0x0c33) ||
        in_range(ch, 0x0c35, 0x0c39) || in_range(ch, 0x0c60, 0x0c61))
    {
        return universal_char_type_valid;   // Telugu
    }

    if (in_range(ch, 0x0c85, 0x0c8c) || in_range(ch, 0x0c8e, 0x0c90) ||
        in_range(ch, 0x0c92, 0x0ca8) || in_range(ch, 0x0caa, 0x0cb3) ||
        in_range(ch, 0x0cb5, 0x0cb9) || in_range(ch, 0x0ce0, 0x0ce1))
    {
        return universal_char_type_valid;   // Kannada
    }

    if (in_range(ch, 0x0d05, 0x0d0c) || in_range(ch, 0x0d0e, 0x0d10) ||
        in_range(ch, 0x0d12, 0x0d28) || in_range(ch, 0x0d2a, 0x0d39) ||
        in_range(ch, 0x0d60, 0x0d61))
    {
        return universal_char_type_valid;   // Malayalam
    }

    if (in_range(ch, 0x0e01, 0x0e30) || in_range(ch, 0x0e32, 0x0e33) ||
        in_range(ch, 0x0e40, 0x0e46) || in_range(ch, 0x0e4f, 0x0e5b))
    {
        return universal_char_type_valid;   // Thai
    }

// the remaining scripts listed in Annex E, these were missing from the 
// classification before, which rejected otherwise valid identifiers
    if (in_range(ch, 0x0e81, 0x0e82) || 0x0e84 == ch ||
        in_range(ch, 0x0e87, 0x0e88) || 0x0e8a == ch || 0x0e8d == ch ||
        in_range(ch, 0x0e94, 0x0e97) || in_range(ch, 0x0e99, 0x0e9f) ||
        in_range(ch, 0x0ea1, 0x0ea3) || 0x0ea5 == ch || 0x0ea7 == ch ||
        in_range(ch, 0x0eaa, 0x0eab) || in_range(ch, 0x0ead, 0x0eb0) ||
        in_range(ch, 0x0eb2, 0x0eb3) || 0x0ebd == ch ||
        in_range(ch, 0x0ec0, 0x0ec4) || 0x0ec6 == ch)
    {
        return universal_char_type_valid;   // Lao
    }

    if (in_range(ch, 0x10a0, 0x10c5) || in_range(ch, 0x10d0, 0x10f6))
        return universal_char_type_valid;   // Georgian

    if (in_range(ch, 0x3041, 0x3094) || in_range(ch, 0x309b, 0x309e))
        return universal_char_type_valid;   // Hiragana

    if (in_range(ch, 0x30a1, 0x30fe))
        return universal_char_type_valid;   // Katakana

    if (in_range(ch, 0x3105, 0x312c))
        return universal_char_type_valid;   // Bopomofo

    if (in_range(ch, 0x1100, 0x1159) || in_range(ch, 0x1161, 0x11a2) ||
        in_range(ch, 0x11a8, 0x11f9))
    {
        return universal_char_type_valid;   // Hangul
    }

    if (in_range(ch, 0xf900, 0xfa2d) || in_range(ch, 0xfb1f, 0xfb36) ||
        in_range(ch, 0xfb38, 0xfb3c) || 0xfb3e == ch ||
        in_range(ch, 0xfb40, 0xfb44) || in_range(ch, 0xfb46, 0xfbb1) ||
        in_range(ch, 0xfbd3, 0xfd3f) || in_range(ch, 0xfd50, 0xfd8f) ||
        in_range(ch, 0xfd92, 0xfdc7) || in_range(ch, 0xfdf0, 0xfdfb) ||
        in_range(ch, 0xfe70, 0xfe72) || 0xfe74 == ch ||
        in_range(ch, 0xfe76, 0xfefc) || in_range(ch, 0xff21, 0xff3a) ||
        in_range(ch, 0xff41, 0xff5a) || in_range(ch, 0xff66, 0xffbe) ||
        in_range(ch, 0xffc2, 0xffc7) || in_range(ch, 0xffca, 0xffcf) ||
        in_range(ch, 0xffd2, 0xffd7) || in_range(ch, 0xffda, 0xffdc) ||
        in_range(ch, 0x4e00, 0x9fa5))
    {
        return universal_char_type_valid;   // CJK Unified Ideographs
    }

// everything else may not be used inside an identifier
    return universal_char_type_not_allowed_for_identifiers;
}

///////////////////////////////////////////////////////////////////////////////
//
//  validate_identifier_name
//
//      The validate_identifier_name function looks at every backslash 
//      contained in the given identifier name. Each backslash is expected to 
//      start a universal character (\uXXXX or \UXXXXXXXX), whose value is 
//      classified by the function classify_universal_char above.
//
//      For the first universal character, which isn't classified as valid
//      for identifiers, a corresponding lexing_exception is thrown 
//      (universal_char_invalid, universal_char_base_charset or 
//      universal_char_not_allowed). The parameters line, column and file_name
//      are handed over to the exception only.
//
///////////////////////////////////////////////////////////////////////////////
template <typename StringT>
inline void 
validate_identifier_name (StringT const &name, int line, int column, 
    StringT const &file_name)
{
    using namespace std;    // some systems have strtoul in namespace std::

    typedef typename StringT::size_type size_type;

    for (size_type pos = name.find_first_of('\\'); StringT::npos != pos;
         pos = name.find_first_of('\\', pos+2))
    {
    // inside an identifier a backslash must introduce a universal char
        BOOST_SPIRIT_ASSERT('u' == name[pos+1] || 'U' == name[pos+1]);

    // \u is followed by 4 hex digits, \U is followed by 8 hex digits
        size_type const digits = ('u' == name[pos+1]) ? 4 : 8;
        StringT hex_value(name.substr(pos+2, digits));
        universal_char_type const type = 
            classify_universal_char(strtoul(hex_value.c_str(), 0, 16));

        if (universal_char_type_valid == type)
            continue;   // this one is fine, go on with the next backslash

    // the error message contains the whole sequence: backslash, u/U, digits
        StringT bad_uchar(name.substr(pos, digits+2));

        if (universal_char_type_invalid == type) {
            CPPLEXER_THROW(lexing_exception, universal_char_invalid, 
                bad_uchar, line, column, file_name.c_str());
        }
        else if (universal_char_type_base_charset == type) {
            CPPLEXER_THROW(lexing_exception, universal_char_base_charset, 
                bad_uchar, line, column, file_name.c_str());
        }
        else {
            CPPLEXER_THROW(lexing_exception, universal_char_not_allowed, 
                bad_uchar, line, column, file_name.c_str());
        }
    }
}

///////////////////////////////////////////////////////////////////////////////
//
//  validate_literal
//
//      The validate_literal function looks at every backslash contained in 
//      the given string or character literal. If a backslash starts a 
//      universal character (\uXXXX or \UXXXXXXXX), its value is classified by 
//      the function classify_universal_char above. All other escape 
//      sequences are skipped.
//
//      Inside literals universal characters are accepted, even if these are 
//      not allowed for identifiers. For the first universal character, which
//      is classified as invalid or as a member of the base character set, a 
//      corresponding lexing_exception is thrown (universal_char_invalid or 
//      universal_char_base_charset). The parameters line, column and 
//      file_name are handed over to the exception only.
//
///////////////////////////////////////////////////////////////////////////////
template <typename StringT>
inline void 
validate_literal (StringT const &name, int line, int column, 
    StringT const &file_name)
{
    using namespace std;    // some systems have strtoul in namespace std::

    typedef typename StringT::size_type size_type;

    for (size_type pos = name.find_first_of('\\'); StringT::npos != pos;
         pos = name.find_first_of('\\', pos+2))
    {
    // the literal contains a backslash, only \u and \U are of interest here
        if ('u' != name[pos+1] && 'U' != name[pos+1])
            continue;

    // \u is followed by 4 hex digits, \U is followed by 8 hex digits
        size_type const digits = ('u' == name[pos+1]) ? 4 : 8;
        StringT hex_value(name.substr(pos+2, digits));
        universal_char_type const type = 
            classify_universal_char(strtoul(hex_value.c_str(), 0, 16));

    // literals may contain chars, which aren't allowed for identifiers
        if (universal_char_type_valid == type || 
            universal_char_type_not_allowed_for_identifiers == type) 
        {
            continue;
        }

    // the error message contains the whole sequence: backslash, u/U, digits
        StringT bad_uchar(name.substr(pos, digits+2));

        if (universal_char_type_invalid == type) {
            CPPLEXER_THROW(lexing_exception, universal_char_invalid, 
                bad_uchar, line, column, file_name.c_str());
        }
        else {
            CPPLEXER_THROW(lexing_exception, universal_char_base_charset, 
                bad_uchar, line, column, file_name.c_str());
        }
    }
}

///////////////////////////////////////////////////////////////////////////////
}   // namespace impl           
}   // namespace cpplexer
}   // namespace wave

#endif // !defined(VALIDATE_UNIVERSAL_CHAR_HPP_55F1B811_CD76_4C72_8344_CBC69CF3B339_INCLUDED)

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
United States United States
Actively involved in Boost and the development of the Spirit parser construction framework.

Comments and Discussions