Click here to Skip to main content
13,402,795 members (41,024 online)
Click here to Skip to main content


37 bookmarked
Posted 29 Jun 2009

REG file parser using the Boost Spirit Parser Framework

This article describes a sample parser for .reg files built with the Boost Spirit Parser Framework. We'll discuss why we use certain libraries and why we choose one solution or algorithm over another, and we'll also cover tests.

Simple schema of .reg file data structure:
- KEY_NAME may consist of alphabetical symbols and '"','\\','[',']'
- The number of values under one key can be anything from 0 upward
- VALUE_NAME can be:
   - the symbol '@' - it means the default value
   - "text"     - this "text" may contain the symbols '\n','"','\\','[',']'
- DATA can be:
 - "text" - this "text" may contain any symbols, but it always ends with "\n"
 - binary:
   - hex(N):XX
   - hex:XX 
   - dword:XX
     XX can consist of pairs of digit symbols separated by a blank, and
     it can end with the '\' symbol, which means that the data continues on the next line

You can also read a short boost::spirit summary at the end of this file

#include <boost/spirit/core.hpp>
#include <boost/spirit.hpp>
#include <boost/bind.hpp>

namespace reg_parser

template<class charT>
struct IResultProcessor
    virtual ~IResultProcessor(){}

    virtual void OnKeyFound(const charT* begin, const charT* end)=0;
    virtual void OnValueNameFound(const charT* begin, const charT* end)=0;
    virtual void OnValueDataFound(const charT* begin, const charT* end)=0;    

template<class charT>
inline bool ParseRegFileImpl(const charT* buffer,
                                    IResultProcessor<charT>* resultProc)
    using namespace boost::spirit;
    typedef rule<scanner<const charT*> > RuleType;
    typedef chlit<charT> ch_t;  // Single character
     typedef chset<charT> chs_t; // Character set
     typedef IResultProcessor<charT> ResProcT;

    chs_t anychar_t(anychar_p); // Pattern to char set
    chs_t eol_CR('\r'); // End of line CR
    chs_t eol_LF('\n'); // End of line LF
    chs_t eol_t(eol_CR);// CR or LF end of line
    eol_t |= eol_LF;
/* ------------------------------------------------------------------------------------- */
/* Help rules*/

    // Matches spaces or tabs
    RuleType blanks   = * blank_p;
    // Symbols ']'and '[' - separate Key Name
    RuleType not_name_separator = ~ch_t(']') & ~ch_t('[');
    // empty data
    RuleType empty_data = blanks >> (eol_t | ch_t('\0'));
    // Data in what we don't interested
    // @ - it's default value name, " - require additional processing
    RuleType other_data = *(anychar_t & not_name_separator & ~ch_t('@')& ~ch_t('"'));     

/* ------------------------------------------------------------------------------------- */
/* Rules that describe identifier of key name */

    RuleType ident_kname_continue = ch_t(']') >> ~eol_t;
    RuleType ident_key_name = *(anychar_t & ~ch_t(']')) || ident_kname_continue >> ident_key_name;
/* ------------------------------------------------------------------------------------- */
/* Rules that describe identifier of value name */

    // Skip \" sequence
    RuleType ident_vname_sz_skip = ch_t('\\') >> ch_t('"');
    RuleType ident_vname_sz_impl = *(anychar_t & ~ch_t('"') || ident_vname_sz_skip );  
    // Add trailing symbols to match pattern
    RuleType ident_vname_sz = ch_t('"') >> ident_vname_sz_impl >> +ch_t('"');
    // Rule for default value name     
    RuleType ident_vname_def = ch_t('@');
    // "text" or DEFAULT
    RuleType ident_value_name = ident_vname_def | ident_vname_sz;

/* ------------------------------------------------------------------------------------- */
/* Rules that describe value data */

    // if value data is binary
    // Data can be any character except '\\' and '\n' in sequence
    RuleType vdata_bin_body =*(anychar_t & ~ch_t('\0') & ~eol_t & ~ch_t('\\'));
    RuleType vdata_bin_continue = ch_t('\\') >> eol_t;
    // if '\\' and '\n' in sequence data continue on the next line
    RuleType vdata_bin = vdata_bin_body >> *(vdata_bin_continue >> vdata_bin_body);

    // if value data is string
    // String data always end by '"' and '\n' symbols in sequence
    RuleType vdata_sz_continue = +ch_t('"') >> ~eol_t;
    RuleType vdata_sz_body_impl = *(anychar_t & ~ch_t('"'));
    RuleType vdata_sz_body = vdata_sz_body_impl >> *(vdata_sz_continue >> vdata_sz_body_impl);
    // String data always starts and end with '"' symbol
    RuleType ident_vdata_sz = ch_t('"') >> vdata_sz_body >> +ch_t('"');
    // Check value data format    
    RuleType ident_vdata = ident_vdata_sz | vdata_bin;

/* ------------------------------------------------------------------------------------- */
/* Put all rules together */

    // line with key name     
    RuleType l_key = 
        other_data>> // Can be comments or start title 
        ch_t('[') >> // starts key name
        // Call OnKeyFound function if rule succeed
        ident_key_name   [bind(&ResProcT::OnKeyFound, resultProc, _1,_2) ] >>
        blanks    >> // can be blanks
        ch_t(']') >> // end key name
        blanks    >> // can be blanks
        *eol_t;      // one or more end of line symbols

    // lines with value name and data
    RuleType l_values =             
        other_data>> // Can be comments or start title
        // Call OnValueNameFound function if rule succeed
        ident_value_name   [bind(&ResProcT::OnValueNameFound, resultProc, _1,_2) ] >>   
        blanks    >> // can be blanks                         
        ch_t('=') >> // always separate value name and value data
        blanks    >> // can be blanks
        // Call OnValueDataFound function if rule succeed
        ident_vdata  [bind(&ResProcT::OnValueDataFound, resultProc, _1,_2) ]  >>
        blanks    >> // can be blanks
        *eol_t;      // one or more end of line symbols

    // Any line can satisfy one of three rules
    RuleType lines = l_key | l_values | empty_data;

    // Do lexeme_d pars that compare also additional symbols
    // if do just *lines, symbols ' ','\t','\n' not be compared
    RuleType reg_file =  lexeme_d [*lines] ;
    // Execute parse
    return (parse(buffer, reg_file).full);

} //namespace reg_parser


boost::spirit short info
Full info is available in the official Boost.Spirit documentation.

Set operators:
 a | b     Union             Match a or b. Also referred to as alternative
 a & b     Intersection     Match a and b
 a - b     Difference         Match a but not b. If both match and b's matched text
                        is shorter than a's matched text, a successful match is made
 a ^ b     XOR                 Match a or b, but not both

Sequencing Operators:
 a >> b     Sequence         Match a and b in sequence
 a && b     Sequential-and     Sequential-and. Same as above, match a and b in sequence
 a || b     Sequential-or     Match a or b in sequence

Optional and Loops:
 *a         -     Match a zero (0) or more times
 +a         -     Match a one (1) or more times
 !a         -     Match a zero (0) or one (1) time
 a % b     -     Match a list of one or more repetitions of a separated by occurrences of b. 
            This is the same as a >> *(b >> a). Note that a must not also match b

Single character parsers:
 anychar_p     Matches any single character (including the null terminator: '\0')
 alnum_p     Matches alpha-numeric characters
 alpha_p     Matches alphabetic characters
 blank_p     Matches spaces or tabs
 cntrl_p     Matches control characters
 digit_p     Matches numeric digits
 graph_p     Matches non-space printing characters
 lower_p     Matches lower case letters
 print_p     Matches printable characters
 punct_p     Matches punctuation symbols
 space_p     Matches spaces, tabs, returns, and newlines
 upper_p     Matches upper case letters
 xdigit_p     Matches hexadecimal digits

Other comments:
 negation ~
 Example: ~ch_t('x') - matches any character except 'x'

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.


This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


About the Authors

Apriorit Inc
Apriorit Inc.
Hungary Hungary
This member doesn't quite have enough reputation to be able to display their biography and homepage.
Group type: Organisation

32 members

Ivan Romanenko
Software Developer Codedgers Inc
Ukraine Ukraine
No Biography provided

You may also be interested in...

Permalink | Advertise | Privacy | Terms of Use | Mobile
Web02 | 2.8.1802120.2 | Last Updated 30 Jun 2009
Article Copyright 2009 by Apriorit Inc, Ivan Romanenko
Everything else Copyright © CodeProject, 1999-2018
Layout: fixed | fluid