Click here to Skip to main content
15,892,298 members
Articles / Programming Languages / C++

Parsing XML in C++ using the YARD Parser

Rate me:
Please Sign up or sign in to vote.
4.79/5 (23 votes)
21 Dec 2004 | 6 min read | 87.6K   1.2K   39
Provides a set of tools for building XML parsers in C++ using the YARD recursive descent parser.
// released into the public domain 
// by Christopher Diggins 2004 
// http://www.cdiggins.com 

#ifndef XML_RULES_HPP_INCLUDED
#define XML_RULES_HPP_INCLUDED

#include "re_ops.hpp"
#include "rules.hpp"
#include <string>

namespace xml_grammar
{    
  using namespace yard;  
  
  // ============================================================
  // string representations 
  
  // Supplies the literal text that opens a CDATA section.
  struct CDStart_string {
    static const char* GetString() {
      const char* const text = "<![CDATA[";
      return text;
    }
  };
  // Supplies the literal text that closes a CDATA section.
  struct CDEnd_string {
    static const char* GetString() {
      const char* const text = "]]>";
      return text;
    }
  };
  // Supplies the literal text that opens a comment.
  struct CDComment_begin_string {
    static const char* GetString() {
      const char* const text = "<!--";
      return text;
    }
  };
  // Supplies the literal text that closes a comment.
  struct CDComment_end_string {
    static const char* GetString() {
      const char* const text = "-->";
      return text;
    }
  };
  // Supplies the literal text that opens the XML declaration.
  struct XMLDecl_begin_string {
    static const char* GetString() {
      const char* const text = "<?xml";
      return text;
    }
  };
  // Supplies the literal text that closes the XML declaration.
  struct XMLDecl_end_string {
    static const char* GetString() {
      const char* const text = "?>";
      return text;
    }
  };
  // Supplies the literal text that opens a document type declaration.
  struct doctypedecl_string {
    static const char* GetString() {
      const char* const text = "<!DOCTYPE";
      return text;
    }
  };
  // Supplies the literal text that opens a processing instruction.
  struct PI_begin_string {
    static const char* GetString() {
      const char* const text = "<?";
      return text;
    }
  };
  // Supplies the literal text that closes a processing instruction.
  struct PI_end_string {
    static const char* GetString() {
      const char* const text = "?>";
      return text;
    }
  };
  
  // ============================================================
  // forward function declarations
  
  // Forward declarations of the wrapper functions defined at the bottom of
  // this namespace. content::Accept calls them; AcceptElement in particular
  // must be declared up front because `element` is only defined after
  // `content` (the grammar is recursive: element -> NonEmptyElemTag ->
  // content -> element).
  //
  // They are declared `inline` because their definitions live in this header:
  // a non-inline function defined in a header gets one external definition per
  // translation unit that includes it, which violates the one-definition rule
  // (typically reported as a duplicate-symbol error at link time).
  inline bool AcceptElement(ParserInputStream<char>& stream);
  inline bool AcceptComment(ParserInputStream<char>& stream);
  inline bool AcceptCDSect(ParserInputStream<char>& stream);
  inline bool AcceptPI(ParserInputStream<char>& stream);
  
  // ============================================================
  // the grammar productions
  
  // Char: a single character from the set TAB (0x9), LF (0xA), CR (0xD), or
  // the range 0x20..127. Not referenced by the other rules visible in this
  // header.
  // NOTE(review): assumes yard's re_or means "either alternative" and that
  // MatchCharRange is an inclusive range -- confirm in re_ops.hpp / rules.hpp.
  // NOTE(review): with an upper bound of 127, characters above ASCII are not
  // matched by this rule -- confirm that restriction is intended.
  struct Char : public 
    re_or<
      re_or<
        MatchChar<0x9>, 
        MatchChar<0xA> 
      >, 
      re_or<
        MatchChar<0xD>, 
        MatchCharRange<0x20, 127> 
      > 
    > 
    { };
    
  // S: whitespace. Built as re_plus over the four characters space (0x20),
  // TAB (0x9), CR (0xD) and LF (0xA).
  // NOTE(review): assumes re_plus means "one or more repetitions" -- confirm
  // in re_ops.hpp.
  struct S : public 
    re_plus<
      re_or<
        re_or<
          MatchChar<0x20>, 
          MatchChar<0x9> 
        >, 
        re_or<
          MatchChar<0xD>, 
          MatchChar<0xA> 
        > 
      > 
    >
    { };
    
  // NameChar: one character allowed inside a name -- whatever yard's
  // MatchIdentOtherChar accepts, or one of '-', ':' and '.'.
  // NOTE(review): MatchIdentOtherChar is defined outside this file;
  // presumably it covers identifier-continuation characters -- confirm.
  struct NameChar : public 
    re_or<
      MatchIdentOtherChar, 
      re_or3<
        MatchChar<'-'>, 
        MatchChar<':'>, 
        MatchChar<'.'> 
      > 
    >
    { };
    
  // Name: a tag or attribute name. The first character must satisfy
  // MatchIdentFirstChar or be ':'; it is followed by re_star<NameChar>.
  // NOTE(review): assumes re_and is sequencing and re_star is "zero or more"
  // -- confirm in re_ops.hpp.
  struct Name : public 
    re_and<
      re_or<
        MatchIdentFirstChar, 
        MatchChar<':'> 
      >, 
      re_star<NameChar> 
    > 
    { };
    
  // Names: a Name followed by any number of further Names, each preceded by
  // exactly one space character (0x20). Other whitespace is not accepted as a
  // separator here.
  struct Names : public 
    re_and<
      Name, 
      re_star<
        re_and<
          MatchChar<0x20>, 
          Name
        > 
      > 
    >
    { };
    
  // NMToken: a run of NameChar characters (re_plus, so presumably at least
  // one). Unlike Name, there is no separate constraint on the first character.
  struct NMToken : public 
    re_plus<NameChar> 
    { };
    
  // NMTokens: an NMToken followed by any number of further NMTokens, each
  // preceded by exactly one space character (0x20).
  struct NMTokens : public 
    re_and<
      NMToken, 
      re_star<
        re_and<
          MatchChar<0x20>, 
          NMToken 
        > 
      >
    >
    { };
    
  // CDStart: matches the CDATA opening marker supplied by CDStart_string
  // ("<![CDATA[").
  struct CDStart : public MatchString<CDStart_string> 
    { };

  // CDEnd: matches the CDATA closing marker supplied by CDEnd_string ("]]>").
  struct CDEnd : public MatchString<CDEnd_string>
    { };   

  // CDSect: a whole CDATA section -- CDStart, then re_until<CDEnd>.
  // NOTE(review): presumably re_until consumes input up to and including the
  // first match of its argument -- confirm in re_ops.hpp.
  struct CDSect : public re_and<CDStart, re_until<CDEnd> > 
    { };
          
  // AttValue: a quoted attribute value. Two alternatives: a double quote
  // followed by re_until the next double quote, or a single quote followed by
  // re_until the next single quote. The text between the quotes is not
  // inspected by this rule.
  // NOTE(review): assumes re_until scans forward to the first occurrence of
  // its argument -- confirm in re_ops.hpp.
  struct AttValue : public 
    re_or<
      re_and<
        MatchChar<'"'>, 
        re_until<
          MatchChar<'"'>
        >
      >,
      re_and<
        MatchChar<'\''>,
        re_until<
          MatchChar<'\''>
        >
      >
    >
    { };
    
  // Eq: the '=' between an attribute name and its value, with re_opt<S>
  // (presumably optional whitespace) on either side.
  struct Eq : public 
    re_and3<
      re_opt<S>, 
      MatchChar<'='>, 
      re_opt<S> 
    >
    { };
    
  // Attribute: one name="value" pair, i.e. Name, then Eq, then AttValue.
  struct Attribute : public
    re_and3<
      Name, 
      Eq, 
      AttValue
    >
    { };

  // Attributes: the attribute list of a tag -- re_star over (S, Attribute),
  // so each attribute must be preceded by whitespace and the list may be
  // empty (assuming re_star is "zero or more").
  struct Attributes : public 
    re_star<
      re_and<
        S, 
        Attribute 
      >
    >
    { };
          
  // Comment: the opening marker from CDComment_begin_string ("<!--") followed
  // by re_until the closing marker from CDComment_end_string ("-->").
  // The comment body itself is not validated by this rule.
  struct Comment : public 
    re_and<
      MatchString<CDComment_begin_string>, 
      re_until<
        MatchString<CDComment_end_string>
      > 
    >
    { }; 
        
  // TagContents: what sits between '<' and '>' in a start tag -- Name,
  // Attributes, then re_opt<S> (presumably optional trailing whitespace).
  // Used by STag.
  struct TagContents : public 
    re_and3<
      Name, 
      Attributes, 
      re_opt<S> 
    >
    { };
    
  // STag: a start tag -- '<', TagContents, '>'.
  struct STag : public 
    re_and3<
      MatchChar<'<'>, 
      TagContents, 
      MatchChar<'>'>
    >
    { };
    
  // ETag: an end tag -- '<', '/', Name, re_opt<S>, '>'.
  // This rule only checks the tag's shape; nothing in it compares the name
  // against the name of the corresponding start tag.
  struct ETag : public 
    re_and<
      re_and3<
        MatchChar<'<'>, 
        MatchChar<'/'>, 
        Name
      >, 
      re_and<
        re_opt<S>, 
        MatchChar<'>'> 
      > 
    >
    { };
                        
  // EmptyElemTag: a self-closing tag -- '<', Name, Attributes, re_opt<S>,
  // then '/' immediately followed by '>'.
  struct EmptyElemTag : public 
    re_and<
      re_and<
        MatchChar<'<'>, 
        Name
      >, 
      re_and3<
        Attributes, 
        re_opt<S>, 
        re_and<
          MatchChar<'/'>,
          MatchChar<'>'>
        >
      > 
    >
    { };

  // CharData: skips over raw character data, stopping at the next '<' or at
  // the end of the input, whichever comes first. Always reports success, even
  // when nothing was consumed.
  struct CharData
  {
    static bool Accept(ParserInputStream<char>& stream) {
      for (;;) {
        if (stream.AtEnd()) {
          break;  // ran out of input
        }
        if (stream.GetElem() == '<') {
          break;  // markup starts here; leave it for the caller
        }
        stream.GotoNext();
      }
      return true;
    }
  };

  // content: the body of a non-empty element. Repeatedly skips character data
  // and then tries, in order, a nested element, a comment, a CDATA section and
  // a processing instruction. The loop ends when the input is exhausted or
  // none of those four matches. Always reports success.
  struct content
  {
    static bool Accept(ParserInputStream<char>& stream) {
      for (;;) {
        if (stream.AtEnd()) {
          break;
        }
        CharData::Accept(stream);
        // Same order as a short-circuited ||: the first match wins and the
        // remaining alternatives are not attempted.
        if (AcceptElement(stream)) continue;
        if (AcceptComment(stream)) continue;
        if (AcceptCDSect(stream)) continue;
        if (AcceptPI(stream)) continue;
        break;  // nothing recognised at this position
      }
      return true;
    }
  };

  // NonEmptyElemTag: an element with separate start and end tags -- STag,
  // content, ETag.
  struct NonEmptyElemTag : public 
    re_and3<
      STag, 
      content, 
      ETag
    >
    { };

  // element: either an EmptyElemTag or a NonEmptyElemTag.
  // NOTE(review): both alternatives start with the same '<' Name Attributes
  // prefix, so this presumably relies on re_or trying them in the listed
  // order and restoring the stream position when the first fails -- confirm
  // in re_ops.hpp.
  struct element : public 
    re_or<
      EmptyElemTag, 
      NonEmptyElemTag
    >    
    { };

  // XMLDecl: the XML declaration -- the opening marker from
  // XMLDecl_begin_string ("<?xml") followed by re_until the closing marker
  // from XMLDecl_end_string ("?>"). The declaration's contents are not
  // inspected by this rule.
  struct XMLDecl : public
    re_and<
      MatchString<XMLDecl_begin_string>, 
      re_until<
        MatchString<XMLDecl_end_string>
      > 
    >
    { };
    
  // doctypedecl: a document type declaration -- the marker from
  // doctypedecl_string ("<!DOCTYPE") followed by re_until a '>' character.
  // NOTE(review): the terminator is a bare '>', so a DOCTYPE whose body
  // itself contains '>' (e.g. an internal subset with markup declarations)
  // would presumably be cut short at the first one -- confirm whether such
  // input needs to be supported.
  struct doctypedecl : public 
    re_and<
      MatchString<doctypedecl_string>, 
      re_until<
        MatchChar<'>'> 
      >
    >
    { };
        
  // PI: a processing instruction -- the opening marker from PI_begin_string
  // ("<?") followed by re_until the closing marker from PI_end_string ("?>").
  struct PI : public 
    re_and<
      MatchString<PI_begin_string>, 
      re_until<
        MatchString<PI_end_string> 
      > 
    >
    { };

  // Misc: one item that may appear around the root element -- a Comment, a
  // PI, or whitespace (S).
  struct Misc : public 
    re_or3<
      Comment, 
      PI, 
      S
    > 
    { };

  // prolog: everything before the root element -- re_opt<XMLDecl>, then
  // re_star<Misc>, then optionally a doctypedecl followed by re_star<Misc>.
  // Every part is optional or repeatable from zero (assuming the usual
  // meaning of re_opt / re_star), so an empty prolog is accepted.
  struct prolog : public 
    re_and3<
      re_opt<XMLDecl>, 
      re_star<Misc>, 
      re_opt<
        re_and<
          doctypedecl, 
          re_star<Misc> 
        > 
      >
    >
    { };
    
  // document: the top-level rule -- prolog, then a single root element, then
  // re_star<Misc> for any trailing comments, processing instructions or
  // whitespace.
  struct document : public 
    re_and3<
      prolog, 
      element, 
      re_star<Misc> 
    > 
    { };
    
  // ============================================================
  // function definitions 
          
  // Forwards to element::Accept and returns its result. This wrapper exists
  // so that content::Accept can call into the `element` rule before that
  // struct is defined (element -> NonEmptyElemTag -> content -> element is
  // recursive).
  //
  // Marked `inline` because it is defined in a header: without it, every
  // translation unit including this file would emit its own external
  // definition, violating the one-definition rule.
  inline bool AcceptElement(ParserInputStream<char>& stream) {
    return element::Accept(stream);
  }
  
  // Forwards to Comment::Accept and returns its result; called from
  // content::Accept.
  //
  // Marked `inline` because it is defined in a header: without it, every
  // translation unit including this file would emit its own external
  // definition, violating the one-definition rule.
  inline bool AcceptComment(ParserInputStream<char>& stream) {
    return Comment::Accept(stream);
  }
  
  // Forwards to CDSect::Accept and returns its result; called from
  // content::Accept.
  //
  // Marked `inline` because it is defined in a header: without it, every
  // translation unit including this file would emit its own external
  // definition, violating the one-definition rule.
  inline bool AcceptCDSect(ParserInputStream<char>& stream) {
    return CDSect::Accept(stream);
  }
  
  // Forwards to PI::Accept and returns its result; called from
  // content::Accept.
  //
  // Marked `inline` because it is defined in a header: without it, every
  // translation unit including this file would emit its own external
  // definition, violating the one-definition rule.
  inline bool AcceptPI(ParserInputStream<char>& stream) {
    return PI::Accept(stream);
  }
};

#endif // #ifndef XML_RULES_HPP_INCLUDED    
  

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Software Developer Ara 3D
Canada
I am the designer of the Plato programming language and I am the founder of Ara 3D. I can be reached via email at cdiggins@gmail.com

Comments and Discussions