Click here to Skip to main content
Click here to Skip to main content
Add your own
alternative version
Go to top

Parsing XML in C++ using the YARD Parser

Christopher Diggins, 21 Dec 2004
Provides a set of tools for building XML parsers in C++ using the YARD recursive descent parser.
// released into the public domain 
// by Christopher Diggins 2004 
// http://www.cdiggins.com 

#ifndef XML_RULES_HPP_INCLUDED
#define XML_RULES_HPP_INCLUDED

#include "re_ops.hpp"
#include "rules.hpp"
#include <string>

namespace xml_grammar
{    
  using namespace yard;  
  
  // ============================================================
  // string representations 
  
  // Supplies the literal that opens a CDATA section.
  struct CDStart_string
  {
    static const char* GetString()
    {
      return "<![CDATA[";
    }
  };
  // Supplies the literal that closes a CDATA section.
  struct CDEnd_string
  {
    static const char* GetString()
    {
      return "]]>";
    }
  };
  // Supplies the literal that opens a comment.
  struct CDComment_begin_string
  {
    static const char* GetString()
    {
      return "<!--";
    }
  };
  // Supplies the literal that closes a comment.
  struct CDComment_end_string
  {
    static const char* GetString()
    {
      return "-->";
    }
  };
  // Supplies the literal that opens the XML declaration.
  struct XMLDecl_begin_string
  {
    static const char* GetString()
    {
      return "<?xml";
    }
  };
  // Supplies the literal that closes the XML declaration.
  struct XMLDecl_end_string
  {
    static const char* GetString()
    {
      return "?>";
    }
  };
  // Supplies the literal that opens a document type declaration.
  struct doctypedecl_string
  {
    static const char* GetString()
    {
      return "<!DOCTYPE";
    }
  };
  // Supplies the literal that opens a processing instruction.
  struct PI_begin_string
  {
    static const char* GetString()
    {
      return "<?";
    }
  };
  // Supplies the literal that closes a processing instruction.
  struct PI_end_string
  {
    static const char* GetString()
    {
      return "?>";
    }
  };
  
  // ============================================================
  // forward function declarations
  
  // Forward declarations of the rule wrappers that content::Accept calls.
  // content has to invoke the element, Comment, CDSect and PI rules, but
  // element is only defined after content (and itself depends on content),
  // so the calls are routed through these functions, which are defined at the
  // end of this namespace.
  //
  // Each takes the input stream to parse from and returns the bool produced
  // by the corresponding rule's Accept.
  //
  // They are declared inline because their definitions live in this header:
  // without inline, including the header from more than one translation unit
  // produces multiple definitions of the same external-linkage function.
  inline bool AcceptElement(ParserInputStream<char>& stream); 
  inline bool AcceptComment(ParserInputStream<char>& stream);
  inline bool AcceptCDSect(ParserInputStream<char>& stream);
  inline bool AcceptPI(ParserInputStream<char>& stream);     
  
  // ============================================================
  // the grammar productions
  
  // A single character: tab (0x9), line feed (0xA), carriage return (0xD),
  // or a code in the range 0x20..127.
  // NOTE(review): assumes MatchCharRange bounds are inclusive -- confirm in rules.hpp.
  struct Char
    : re_or<
        re_or<MatchChar<0x9>, MatchChar<0xA> >,
        re_or<MatchChar<0xD>, MatchCharRange<0x20, 127> >
      >
  { };
    
  // White space: a re_plus repetition over space (0x20), tab (0x9),
  // carriage return (0xD) and line feed (0xA).
  struct S
    : re_plus<
        re_or<
          re_or<MatchChar<0x20>, MatchChar<0x9> >,
          re_or<MatchChar<0xD>, MatchChar<0xA> >
        >
      >
  { };
    
  // A character allowed inside a name: whatever MatchIdentOtherChar accepts
  // (defined outside this file), or one of '-', ':' and '.'.
  struct NameChar
    : re_or<
        MatchIdentOtherChar,
        re_or3<MatchChar<'-'>, MatchChar<':'>, MatchChar<'.'> >
      >
  { };
    
  // A name: a first character (MatchIdentFirstChar or ':') combined with a
  // re_star repetition of NameChar.
  struct Name
    : re_and<
        re_or<MatchIdentFirstChar, MatchChar<':'> >,
        re_star<NameChar>
      >
  { };
    
  // A list of names: one Name, then a re_star repetition of
  // (single space 0x20, Name).
  struct Names
    : re_and<
        Name,
        re_star< re_and<MatchChar<0x20>, Name> >
      >
  { };
    
  // A name token: a re_plus repetition of NameChar (no restriction on the
  // first character, unlike Name).
  struct NMToken : re_plus<NameChar>
  { };
    
  // A list of name tokens: one NMToken, then a re_star repetition of
  // (single space 0x20, NMToken).
  struct NMTokens
    : re_and<
        NMToken,
        re_star< re_and<MatchChar<0x20>, NMToken> >
      >
  { };
    
  // The CDATA opening delimiter, i.e. the text from CDStart_string.
  struct CDStart
    : MatchString<CDStart_string>
  { };

  // The CDATA closing delimiter, i.e. the text from CDEnd_string.
  struct CDEnd
    : MatchString<CDEnd_string>
  { };

  // A CDATA section: CDStart combined with re_until<CDEnd>.
  // NOTE(review): presumably re_until consumes input through the terminator --
  // confirm in re_ops.hpp.
  struct CDSect
    : re_and<
        CDStart,
        re_until<CDEnd>
      >
  { };
          
  // An attribute value in either quoting style: a double quote paired with
  // re_until of a double quote, or a single quote paired with re_until of a
  // single quote. No constraint is placed on the characters in between.
  struct AttValue
    : re_or<
        re_and< MatchChar<'"'>, re_until< MatchChar<'"'> > >,
        re_and< MatchChar<'\''>, re_until< MatchChar<'\''> > >
      >
  { };
    
  // The '=' between an attribute name and its value, with optional white
  // space (re_opt<S>) on each side.
  struct Eq
    : re_and3< re_opt<S>, MatchChar<'='>, re_opt<S> >
  { };
    
  // One attribute: Name, Eq, AttValue.
  struct Attribute
    : re_and3<Name, Eq, AttValue>
  { };

  // An attribute list: a re_star repetition of (S, Attribute), so every
  // attribute must be preceded by white space.
  struct Attributes
    : re_star< re_and<S, Attribute> >
  { };
          
  // A comment: the CDComment_begin_string text combined with re_until of the
  // CDComment_end_string text.
  struct Comment
    : re_and<
        MatchString<CDComment_begin_string>,
        re_until< MatchString<CDComment_end_string> >
      >
  { };
        
  // What sits between '<' and '>' in a start tag: Name, Attributes, and
  // optional trailing white space.
  struct TagContents
    : re_and3< Name, Attributes, re_opt<S> >
  { };
    
  // A start tag: '<', TagContents, '>'.
  struct STag
    : re_and3< MatchChar<'<'>, TagContents, MatchChar<'>'> >
  { };
    
  // An end tag: '<', '/', Name, then optional white space and '>'.
  // The rule itself does not check that Name equals the name used in the
  // corresponding start tag.
  struct ETag
    : re_and<
        re_and3< MatchChar<'<'>, MatchChar<'/'>, Name >,
        re_and< re_opt<S>, MatchChar<'>'> >
      >
  { };
                        
  // A self-closing tag: '<', Name, Attributes, optional white space, then
  // '/' immediately followed by '>'.
  struct EmptyElemTag
    : re_and<
        re_and< MatchChar<'<'>, Name >,
        re_and3<
          Attributes,
          re_opt<S>,
          re_and< MatchChar<'/'>, MatchChar<'>'> >
        >
      >
  { };

  // Character data between markup.
  struct CharData 
  {
    // Advances `stream` until it is at its end or its current element is '<'
    // (the '<' itself is not consumed). Always returns true, including when
    // nothing was skipped.
    static bool Accept(ParserInputStream<char>& stream) 
    {
      for (;;) {
        if (stream.AtEnd()) {
          break;
        }
        if (stream.GetElem() == '<') {
          break;
        }
        stream.GotoNext();
      }
      return true;
    }      
  };

  // The content of an element: character data interleaved with nested
  // elements, comments, CDATA sections and processing instructions.
  struct content 
  {
    // Repeats: skip character data, then try (in this order) an element, a
    // comment, a CDATA section, a processing instruction. Stops when the
    // stream is at its end or none of the four alternatives is accepted.
    // Always returns true; an empty content is valid.
    static bool Accept(ParserInputStream<char>& stream) 
    {
      bool acceptedMarkup = true;
      while (acceptedMarkup && !stream.AtEnd()) {
        CharData::Accept(stream);
        // Goes through the Accept* wrappers because the rules they call are
        // only defined further down in the file.
        acceptedMarkup = AcceptElement(stream)
          || AcceptComment(stream)
          || AcceptCDSect(stream)
          || AcceptPI(stream);
      }
      return true;
    }
  };

  // An element with separate start and end tags: STag, content, ETag.
  struct NonEmptyElemTag
    : re_and3<STag, content, ETag>
  { };

  // An element: either a self-closing tag (listed first) or a start tag /
  // content / end tag triple.
  struct element
    : re_or<EmptyElemTag, NonEmptyElemTag>
  { };

  // The XML declaration: the XMLDecl_begin_string text combined with re_until
  // of the XMLDecl_end_string text. The declaration's internals (version,
  // encoding, ...) are not examined by this rule.
  struct XMLDecl
    : re_and<
        MatchString<XMLDecl_begin_string>,
        re_until< MatchString<XMLDecl_end_string> >
      >
  { };
    
  // A document type declaration: the doctypedecl_string text combined with
  // re_until of a single '>'.
  // NOTE(review): the terminator is the first '>' encountered, so a DOCTYPE
  // with an internal subset containing '>' would presumably end early --
  // confirm against re_until's behaviour.
  struct doctypedecl
    : re_and<
        MatchString<doctypedecl_string>,
        re_until< MatchChar<'>'> >
      >
  { };
        
  // A processing instruction: the PI_begin_string text combined with re_until
  // of the PI_end_string text.
  struct PI
    : re_and<
        MatchString<PI_begin_string>,
        re_until< MatchString<PI_end_string> >
      >
  { };

  // Miscellaneous markup allowed around the root element: a Comment, a PI,
  // or white space (tried in that listed order).
  struct Misc
    : re_or3<Comment, PI, S>
  { };

  // The document prolog: an optional XMLDecl, any number of Misc items, and
  // optionally a doctypedecl followed by any number of Misc items.
  struct prolog
    : re_and3<
        re_opt<XMLDecl>,
        re_star<Misc>,
        re_opt< re_and< doctypedecl, re_star<Misc> > >
      >
  { };
    
  // The top-level rule: prolog, exactly one root element, then any number of
  // trailing Misc items.
  struct document
    : re_and3< prolog, element, re_star<Misc> >
  { };
    
  // ============================================================
  // function definitions 
          
  // Applies the element rule to `stream` and returns its result.
  // content::Accept calls this wrapper because element is defined after
  // content in this file.
  // Marked inline: this definition lives in a header, and a non-inline
  // definition would be emitted by every translation unit that includes it,
  // causing duplicate-symbol link errors.
  inline bool AcceptElement(ParserInputStream<char>& stream) {
    return element::Accept(stream);  
  } 
  
  // Applies the Comment rule to `stream` and returns its result.
  // Marked inline: this definition lives in a header, and a non-inline
  // definition would be emitted by every translation unit that includes it,
  // causing duplicate-symbol link errors.
  inline bool AcceptComment(ParserInputStream<char>& stream) {
    return Comment::Accept(stream);
  }
  
  // Applies the CDSect rule to `stream` and returns its result.
  // Marked inline: this definition lives in a header, and a non-inline
  // definition would be emitted by every translation unit that includes it,
  // causing duplicate-symbol link errors.
  inline bool AcceptCDSect(ParserInputStream<char>& stream) {
    return CDSect::Accept(stream);
  }
  
  // Applies the PI rule to `stream` and returns its result.
  // Marked inline: this definition lives in a header, and a non-inline
  // definition would be emitted by every translation unit that includes it,
  // causing duplicate-symbol link errors.
  inline bool AcceptPI(ParserInputStream<char>& stream) {
    return PI::Accept(stream);
  }
};

#endif // #ifndef XML_RULES_HPP_INCLUDED    
  

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here

Share

About the Author

Christopher Diggins
Software Developer Autodesk
Canada Canada
This article was written by Christopher Diggins, a computer science nerd who currently works at Autodesk as an SDK specialist.
Follow on   Twitter   Google+   LinkedIn

| Advertise | Privacy | Mobile
Web01 | 2.8.140916.1 | Last Updated 21 Dec 2004
Article Copyright 2004 by Christopher Diggins
Everything else Copyright © CodeProject, 1999-2014
Terms of Service
Layout: fixed | fluid