Click here to Skip to main content
15,878,945 members
Articles / Containers / Virtual Machine

Parsing Expression Grammar Support for C# 3.0 Part 1 - PEG Lib and Parser Generator

Rate me:
Please Sign up or sign in to vote.
4.95/5 (49 votes)
7 Oct 2008 | CPOL | 40 min read | 202.3K | 2.1K | 118
Introduction to the parsing method PEG with library and parser generator
<<Grammar Name="python_2_5_2_i"  
          encoding_class="ascii" 
          reference="http://www.python.org/doc/2.5.2/ref/grammar.txt"
          comment="'readable', but not optimized, heavily backtracking version of python grammar"
>>
{
      // Semantic function placed at the start of the top-level rules
      // (file_input, eval_input, input_input). It puts the parser state into
      // its initial condition: raw-string mode off, the indentation stack
      // holding only the base level 0, and indentLength_ at -1, the value the
      // indentation handlers treat as "no measured indentation pending".
      // Always returns true, so it never makes the enclosing rule fail.
      bool init_()
      {
         indentLength_ = -1;
         in_raw_mode = false;

         indentStack.Clear();
         indentStack.Push(0);

         return true;
      }
      #region raw string handling
      // Set by set_raw_ (called after a raw string prefix such as r/ur has been
      // matched) and cleared by unset_raw_ (called at the end of rule STRING).
      bool in_raw_mode;

      // Turns raw mode on; always succeeds.
      bool set_raw_()
      {
         in_raw_mode = true;
         return true;
      }

      // Turns raw mode off; always succeeds.
      bool unset_raw_()
      {
         in_raw_mode = false;
         return true;
      }

      // Succeeds exactly when raw mode is currently on.
      bool in_raw_()
      {
         return in_raw_mode;
      }
      #endregion raw string handling
      #region implicit line joining
      // While true, rule S accepts a line break as ordinary white space
      // (S contains the alternative "NL in_implicit_line_joining_").
      bool in_impl_line_join_;

      // Succeeds exactly when implicit line joining is switched on.
      bool in_implicit_line_joining_()
      {
         return in_impl_line_join_;
      }

      // Switches implicit line joining to bOn and reports the setting that
      // was active before the call in prev. Always returns true.
      bool set_implicit_line_joining_(bool bOn, out bool prev)
      {
         prev = in_impl_line_join_;
         in_impl_line_join_ = bOn;
         return true;
      }

      // Switches implicit line joining to bOn without reporting the previous
      // setting. Always returns true.
      bool set_implicit_line_joining_(bool bOn)
      {
         in_impl_line_join_ = bOn;
         return true;
      }
      #endregion implicit line joining
      // Indentation text most recently captured by the grammar (rules INDENT and
      // KEEPINDENT use "S:indent_"); measured by NormalizedIndentLength().
      string indent_;
      // Measured indentation width that has not been assigned to a block yet;
      // -1 means no such measurement is pending.
      int indentLength_;//accounts for tabs
      // Copy of indentLength_ taken by continue_after_dedent_() and written back
      // by RESTOREINDENT_().
      int indentLengthToRestore_;
      // Indentation widths of the currently open blocks, innermost on top;
      // init_() seeds it with the base level 0.
      System.Collections.Generic.Stack<int> indentStack= new System.Collections.Generic.Stack<int>();
      // Returns the column width of the captured indentation text indent_.
      // Every character counts as one column except a tab, which advances to
      // the next multiple of 8 columns (Python's rule for comparing
      // indentation that mixes tabs and spaces).
      //
      // The tab stop is derived from the column reached so far, not from the
      // character index: the two differ as soon as an earlier tab has been
      // expanded, e.g. "\t\t" is 16 columns wide, not 15.
      int NormalizedIndentLength()
      {
         const int tabWidth = 8;
         int indentLength = 0;
         foreach (char c in indent_)
         {
            if (c == '\t') indentLength += tabWidth - indentLength % tabWidth;
            else           indentLength++;
         }
         return indentLength;
      }
      // First alternative of rule KEEPINDENT. Saves the current indentLength_
      // in indentLengthToRestore_ (so RESTOREINDENT_ can put it back), then
      // succeeds only if a measured indentation is pending (not -1) and equals
      // the indentation on top of the stack; in that case the pending value is
      // consumed by resetting indentLength_ to -1.
      bool continue_after_dedent_()
      {
         int pending = indentLength_;
         indentLengthToRestore_ = pending;

         if (pending != -1 && pending == indentStack.Peek())
         {
            indentLength_ = -1;
            return true;
         }
         return false;
      }
      // Writes the value saved by continue_after_dedent_() back into
      // indentLength_. Always returns false, so a grammar alternative that
      // consists of this call never matches; it is used for its side effect.
      bool RESTOREINDENT_()
      {
         indentLength_ = indentLengthToRestore_;
         return false;
      }
      // Second alternative of rule KEEPINDENT, run after indent_ was captured.
      // Fails immediately when a measured indentation is still pending,
      // because that indentation belongs to another block. Otherwise measures
      // indent_ and succeeds when the width equals the top of the indentation
      // stack. On success indentLength_ is reset to -1; on a mismatch the
      // measured width is left in indentLength_.
      bool keep_handler_()
      {
          if (indentLength_ != -1)
          {
              return false;
          }

          indentLength_ = NormalizedIndentLength();
          if (indentStack.Peek() != indentLength_)
          {
              return false;
          }

          indentLength_ = -1;
          return true;
      }
      // Used by rule INDENT after indent_ was captured. Fails when a measured
      // indentation is still pending (it belongs to another block) or when the
      // newly measured width is not strictly greater than the top of the
      // indentation stack. Otherwise pushes the new width as the innermost
      // block level, resets indentLength_ to -1 and succeeds.
      bool indent_handler_()
      {
          if (indentLength_ != -1) return false;

          int measured = NormalizedIndentLength();
          if (measured > indentStack.Peek())
          {
              indentStack.Push(measured);
              indentLength_ = -1;
              return true;
          }
          return false;
      }
      // Used by rule DEDENT. Succeeds when the pending indentation
      // indentLength_ is smaller than the innermost block level and matches
      // one of the levels on the stack; the innermost level is then popped.
      // When only the base level remains afterwards, the pending indentation
      // is cleared (set to -1). Expects at least one block above the base
      // level to be open (debug assertion).
      bool dedent_handler_()
      {
         System.Diagnostics.Debug.Assert(indentStack.Count>1);

         bool closesBlock = indentLength_ < indentStack.Peek()
                            && indentStack.Contains(indentLength_);
         if (!closesBlock) return false;

         indentStack.Pop();
         if (indentStack.Count == 1)
         {
            indentLength_ = -1;
         }
         return true;
      }
}
Line_join_sem_{
      // Semantic block attached to rules declared "using Line_join_sem_"
      // (the bracketed constructs). The constructor switches implicit line
      // joining on and remembers the previous setting in prev_; the
      // destructor puts that previous setting back.
      bool prev_;

      Line_join_sem_()
      {
         set_implicit_line_joining_(true, out prev_);
      }

      ~Line_join_sem_()
      {
         set_implicit_line_joining_(prev_);
      }
}
[1] ^^interactive_input: (compound_stmt/stmt_list?)
                        ( NEWLINE / FATAL<"syntax error: line break expected">);
[2] ^^file_input:       init_ BLANKLINE* (NEWLINE / statement)* ENDMARKER
                        (!./FATAL<"wrong indent or non-python text here">);
[3] ^^eval_input:       init_ expression_list NEWLINE*  ENDMARKER;
[4] ^^input_input:      init_ expression_list NEWLINE  ENDMARKER;

// The prefix of a string literal (r'..', u"..", ur'..', ...) is itself a valid
// identifier. Because PEG choice is ordered and 'identifier' is tried first,
// the prefix used to be consumed as an identifier and the literal could never
// be parsed. The negative lookahead rejects the identifier alternative when
// an optional u/U and optional r/R is directly followed by a quote.
[5] ^^atom:              !([uU]?[rR]?['"]) identifier S / literal / enclosure;
[6]   enclosure:         generator_expression / dict_display
                       / string_conversion / yield_atom
                       / parenth_form / list_display;
[7]   literal :          (STRING S)+ / NUMBER S ;
[8]   parenth_form:      '(' parenth_form_content @')' S;
[9]   parenth_form_content 
using Line_join_sem_:    S expression_list?;
[10]^^list_display:      '[' list_display_content @']' S;
[11]  list_display_content
using Line_join_sem_:   S (list_comprehension / expression_list )?;		
[12]^^list_comprehension:expression list_for;

[13]^^list_for:          'for' S2 target_list @'in' S2 old_expression_list list_iter?;
[14]^^old_expression_list:
                         old_expression ((',' S old_expression)+ (','S)?)?; 
[15]^^list_iter:         list_for / list_if;
[16]^^list_if:           'if' S2 old_expression list_iter?;
[17]^^generator_expression: 
                         '(' generator_expression_content @')' S;
[18]generator_expression_content
using Line_join_sem_:    S expression genexpr_for;
[19]^^genexpr_for:       'for' S2 target_list @'in' S2 or_test genexpr_iter?;
[20]^^genexpr_iter:      genexpr_for / genexpr_if;
[21]^^genexpr_if:        'if' S2 old_expression genexpr_iter?;
[22]^^dict_display:      '{' dict_display_content @'}' S ;
[23] dict_display_content
using Line_join_sem_:    S key_datum_list?;
[24]^^key_datum_list:    key_datum (',' S key_datum)* (',' S)? ;
[25]^^key_datum:         expression @':' S expression;
[26]^^string_conversion: '`' S expression_list @'`' S;
[27]^^yield_atom:        '(' yield_atom_content @')' S;
[28]yield_atom_content
using Line_join_sem_:    S yield_expression;
[29]^^yield_expression:  'yield' S2 expression_list?;
[30] ^primary:           atom (attributeref /  slicing  / call_args)*;
[31]^^attributeref:      '.' S @identifier S;
[32]^^slicing:           '[' slicing_content @']' S;
[33]slicing_content
using Line_join_sem_:    S slice_list;
[34]^^slice_list:        slice_item (',' S slice_item)* (','S)?;
[35]^^slice_item:        ellipsis / slice / expression ;
[36]^^slice:             lower_bound? ':' S upper_bound? stride?;
[37]^^stride:            ':' S expression?;
[38]^^lower_bound:       expression;
[39]^^upper_bound:       expression;
[40]^^ellipsis:          '.' S '.' S '.' S; // or '...' ?
[41]^^call_args:         '(' call_args_content @')' S;
[42] call_args_content
using Line_join_sem_      :S (expression genexpr_for / argument_list (',' S)? )?;
// seq_arg ("*expr") and dict_arg ("**expr") no longer contain the separating
// comma; argument_list supplies it where an argument precedes them. With the
// comma inside seq_arg/dict_arg the calls f(*args) and f(**kw) were rejected,
// because the last two alternatives demanded a leading comma.
[43]^^argument_list:     keyword_arguments (',' S seq_arg)? (',' S dict_arg)?
                       / positional_arguments ((',' S) keyword_arguments)? (',' S seq_arg)? (',' S dict_arg)?
                       / seq_arg (',' S dict_arg)?
                       / dict_arg;
[44]^^seq_arg:           '*' S expression;
[45]^^dict_arg:          '**' S expression;
[46]^^positional_arguments: 
                         expression (',' S !keyword_item expression)*;
[47]^^keyword_arguments: keyword_item (',' S keyword_item)*;
[48]^^keyword_item:      identifier S '=' S expression;
[49] ^power:             primary ('**' S @u_expr)?;
[50] ^u_expr:            power / ^'-' S @u_expr / ^'+'  S @u_expr / ^'~' S @u_expr;
[51] ^m_expr:            u_expr (^('*'/'//'/'/'/'%') S u_expr)*;
[52] ^a_expr:            m_expr ( ^[+-] S  @m_expr)*;
[53] ^shift_expr:        a_expr (^('<<' / '>>' ) S @a_expr)*;
[54] ^and_expr:          shift_expr ('&' S @shift_expr)*;
[55] ^xor_expr:          and_expr ( '^' S @and_expr)*;
[56] ^or_expr:           xor_expr ( '|' S @xor_expr)*;
[57] ^comparison:        or_expr ( comp_operator S @or_expr )*;
[58]^^comp_operator:     '>=' / '<=' / '<>' / '<' / '>' / '==' / '!=' / 
                         is_operator / in_operator;
[59]^^is_operator:	 'is' S2 (not_symbol S2)? ;
// 'in' must end at a word boundary (as 'is', 'and', 'or', 'not' already do via
// S2); otherwise "a inb" is accepted as "a in b". The white space after the
// operator is consumed by the S that follows comp_operator in 'comparison'.
[60]^^in_operator:	 (not_symbol S2)? 'in' ![A-Za-z_0-9];
[61]  expression:        conditional_expression / lambda_form;
[62]^^old_expression:    or_test / old_lambda_form;
[63]^^conditional_expression: 
                         or_test ('if' S2 @or_test @(^'else') S2 expression)?;
[64] ^or_test:           and_test ('or' S2 @and_test)*;
[65] ^and_test:          not_test ( 'and' S2 @not_test)*;
[66] ^not_test:          (not_symbol S2)* comparison;
[67]^^not_symbol:        'not';
[68]^^lambda_form:       'lambda' S2 parameter_list? @':' S expression;
[69]^^old_lambda_form:   'lambda' S2 parameter_list? @':' S old_expression;
[70]  expression_list:   expression ( ',' S expression )* (',' S)?;
[71]  simple_stmt:
                         assert_stmt
                       / pass_stmt
                       / del_stmt
                       / print_stmt
                       / return_stmt
                       / yield_stmt
                       / raise_stmt
                       / break_stmt
                       / continue_stmt
                       / import_stmt
                       / global_stmt
                       / exec_stmt
                       / augmented_assignment_stmt
                       / assignment_stmt
                       / expression_stmt;
[72]^^expression_stmt:   expression_list;
[73]^^assert_stmt:       'assert' S2 @expression (',' S @expression)?;
[74]^^assignment_stmt:   (target_list '='!'=' S)+ @(expression_list / yield_expression);
[75]^^target_list:       target (',' S target)* (',' S)?;
  
[76]^^target:            '(' target_contents ')' S
                       / '[' target_contents ']' S
                       / assignable_primary;
[77]target_contents
using Line_join_sem_:    S target_list;

[78]^^assignable_primary:
                         (identifier S/ enclosure !targetlist_end)    
                         (attributeref /  slicing  / call_args !targetlist_end )*;
[79]^^targetlist_end:    S ([,=] S / 'in' S2);

[80]^^augmented_assignment_stmt: 
                         target augop S @(expression_list / yield_expression);
  
// Augmented assignment operators. '|=' was missing ('/=' was listed twice
// instead). '//=' is not in the referenced grammar.txt and was added.
[81]^^augop: 	         '+=' / '-=' / '*=' / '/=' / '%=' / '**='
                       / '>>=' / '<<=' / '&=' / '^=' / '|=' / '//=';	// '//=' added by me

[82]^^pass_stmt:         'pass' S2;
[83]^^del_stmt:          'del' S2 @target_list;
// print [expr (, expr)* [,]]  or  print >> file [(, expr)+ [,]]
// The expressions after ">> file" are optional: "print >>f" alone is valid
// Python 2 and writes just a newline to f.
[84]^^print_stmt:        'print' S2 (  '>>' S expression ((',' S expression)+ (',' S)?)?
                          / expression (',' S expression)* (',' S)?
                                   )?;
[85]^^return_stmt:       'return' S2 expression_list?;
[86]^^yield_stmt:        yield_expression;
[87]^^raise_stmt:        'raise' S2 (expression (',' S expression (',' S expression)?)?)?;
[88]^^break_stmt:        'break' S2;
[89]^^continue_stmt:     'continue' S2;
[90]^^import_stmt:       import_name / import_from;
[91]^^import_name:       'import' S2 module import_as? ( ',' S module import_as? )*;
[92]^^import_from:       'from' S2 module @'import' S '*' S
                       / 'from' S2 relative_module @'import' S2  
                                   (imports_in_parens / imports_list);
[93]^^import_as:         'as' S2 @name;
[94]^^imports_in_parens: '(' imports_in_parens_content @')' S;
// Content of a parenthesized import list. Like the other bracketed contents
// it runs under Line_join_sem_, so the list may span several lines. It starts
// with plain S: the former S2 demanded a non-identifier character right after
// '(' and therefore rejected "from m import (a, b)".
[95] imports_in_parens_content
using Line_join_sem_:    S identifier  S import_as? rest_import_list (',' S)?;
[96]^^imports_list:      identifier S import_as? rest_import_list;
[97]^^rest_import_list:  (',' S identifier S import_as? )*;
[98]^^module:            (identifier S '.' S)* identifier S;
[99]^^relative_module:   ('.' S)* module / ('.' S)+;
  
[100]^^name:             identifier S;
[101]^^global_stmt:      'global' S2 @identifier S (',' S @identifier S)*;
[102]^^exec_stmt:        'exec' S2 or_expr ('in' S2 expression (',' S expression)?)?;
[103]^^compound_stmt:    if_stmt / while_stmt / for_stmt / try_stmt / with_stmt 
                       / funcdef / classdef;
[104]^^suite:            stmt_list @(NEWLINE/ENDMARKER) / 
                         NEWLINE INDENT @statement (KEEPINDENT @statement)* DEDENT;
[105]  statement:        compound_stmt / stmt_list (NEWLINE/ENDMARKER);
[106]  stmt_list:        simple_stmt (';' S simple_stmt)* (';' S)? ;
// if / elif* / else?. Each continuation is tried after KEEPINDENT; when it
// does not match, RESTOREINDENT_ (which always fails) puts the consumed
// indentation state back. 'else' is followed by S2 so that it only matches as
// a whole word: with plain S a following statement such as "else_value = 1"
// matched 'else' and then hit the mandatory @':'.
[107]^^if_stmt:          'if' S2 @expression @':' S suite
                         (KEEPINDENT ^'elif'  S2 @expression @':' S suite / RESTOREINDENT_ )*
                         (KEEPINDENT ^'else' S2 @':' S suite / RESTOREINDENT_ )?;
// while loop with optional else part. 'else' is followed by S2 so that it
// only matches as a whole word; with plain S a following statement such as
// "else_value = 1" matched 'else' and then hit the mandatory @':'.
[108]^^while_stmt:        'while' S2 @expression @':' S suite
                         (KEEPINDENT 'else' S2 @':' S suite / RESTOREINDENT_ )?;
// for loop with optional else part. 'else' is followed by S2 so that it only
// matches as a whole word; with plain S a following statement such as
// "else_value = 1" matched 'else' and then hit the mandatory @':'.
[109]^^for_stmt:          'for' S2 target_list @'in' S2 expression_list
                          @':' S suite (KEEPINDENT 'else' S2 @':' S suite / RESTOREINDENT_ )?;
// 'try' is followed by S2 so that it only matches as a whole word; with plain
// S any statement starting with an identifier such as "try_count" matched
// 'try' and then hit the mandatory @':'.
[110]^^try_stmt:          'try' S2 @':' S suite ( except_handler / finally_only_handler );
// finally clause. 'finally' is followed by S2 so that it only matches as a
// whole word (an identifier like "finally_done" must not trigger the
// mandatory @':'). RESTOREINDENT_ always fails; it only restores the
// indentation state consumed by KEEPINDENT.
[111]^^finally_only_handler: 
                         KEEPINDENT 'finally' S2 @':' S suite / RESTOREINDENT_ ;
// One or more except clauses, an optional else clause and an optional finally
// clause. 'else' is followed by S2 so that it only matches as a whole word
// (an identifier like "else_value" must not trigger the mandatory @':').
[112]^^except_handler:   (KEEPINDENT ^'except' S2 (expression (',' S target)?)? @':' S suite / RESTOREINDENT_ )+
                         (KEEPINDENT ^'else' S2 @':' S suite / RESTOREINDENT_ )? finally_only_handler?;
[113]^^with_stmt:        'with' S2 expression ('as' S2 @target)? @':' S suite;
[114]^^funcdef:          decorators? 'def' S2 @funcname S '(' parameter_list_in @')' S @':' S suite;
[115] parameter_list_in
using Line_join_sem_:    S parameter_list?;
[116]^^decorators:       decorator+;
[117]^^decorator:        '@' S dotted_name ('(' decorater_in @')' S)? NEWLINE 
                         (KEEPINDENT/FATAL<"decorator continuation incorrectly indented">);
[118] decorater_in
using Line_join_sem_:    S (argument_list (','S)?)?;
[119]^^dotted_name:      identifier S('.' S identifier S)*;

[120]^^parameter_list:   defparameter  (',' S defparameter )* (',' S seq_dict_pars?)? / seq_dict_pars;
[121]^^seq_dict_pars:    ('*' S identifier S (',' S '**' S identifier S)? / '**' S identifier S);
[122]^^defparameter:     parameter ('=' S @expression)?;
// Parenthesized parameter sublist. The optional trailing comma now skips
// following white space like every other comma in this grammar; otherwise
// "(a, b, )" failed at the mandatory ')' of rule 'parameter'.
[123]^^sublist:          parameter (',' S parameter)* (',' S)?;
[124]^^parameter:        identifier S / '('  parameter_in  @')' S;
[125] parameter_in:      S @sublist;
[126]^^funcname:         identifier;
[127]^^classdef:         'class' S classname S inheritance? ':' S suite;
[128]^^inheritance
using Line_join_sem_:    '('  S expression_list?  @')' S;
[129]^^classname:        identifier;

//token definitions added for PEG following the docs at python.org
[130]  S:                ( [ \t\v\f]+ /
                           COMMENT     / 
                           '\\' NL  / 
                          NL in_implicit_line_joining_
                         )* ;
[131]S2:                 ![A-Za-z_0-9] S;
[132]^^identifier:       !(KEYWORD S2) [A-Za-z_][A-Za-z0-9_]*;
// INDENT: start of a deeper block. Matches S with the "S:indent_" notation
// (presumably storing the matched text in the semantic variable indent_ -
// confirm against the PEG generator documentation), then indent_handler_ must
// accept the indentation; otherwise a fatal "indentation error" is raised.
[133]INDENT:             S:indent_ (indent_handler_/FATAL<"indentation error">);
// KEEPINDENT: next line of the current block. Either a pending indentation
// already matches the current block level (continue_after_dedent_), or the
// indentation is matched with S:indent_ and checked by keep_handler_.
[134]KEEPINDENT:         continue_after_dedent_ / S:indent_ keep_handler_;
// DEDENT: end of the current block. Sequence binds tighter than choice, so
// this reads (S dedent_handler_) / FATAL<...>.
[135]DEDENT:             S dedent_handler_/FATAL<"indentation error">;
[136]NEWLINE:            NL BLANKLINE*;
// Line break: "\r\n", "\n" or a lone "\r". The former middle alternative '\n'
// could never be selected because '\r'?'\n' already matches a bare "\n".
[137]NL:                 '\r'?'\n' / '\r' ;
[138]BLANKLINE:          [ \t\v\f]* COMMENT? NL/ [ \t\v\f]+ COMMENT? !. / COMMENT !. ;
[139]NAME:               !(KEYWORD S2) [A-Za-z_][A-Za-z0-9_]* S;
[140]IDENT:              [A-Za-z_][A-Za-z0-9_]*;
// Reserved words. PEG choice is ordered and does not retry a later alternative
// once an earlier one has matched, so a keyword that is a prefix of another
// one must come after it: 'assert' is listed before 'as'. With 'as' first,
// "KEYWORD S2" matched only "as" of "assert", S2 then failed, and 'assert'
// was accepted as an identifier.
[141]KEYWORD: 
                         ('and' /      'del' /       'from' /      'not' /       'while' /    
                          'assert' /    'elif' /      'global' /    'or' /        'with' /     
                          'as' /        'else' /      'if' /        'pass' /      'yield' /    
                          'break' /     'except' /    'import' /    'print' /  
                          'class' /     'exec' /      'in' /        'raise' /  
                          'continue' /  'finally' /   'is' /        'return' / 
                          'def' /       'for' /       'lambda' /    'try') ;
[142]COMMENT:            '#' (!NL.)*;
[143]ENDMARKER:          !./WARNING<"file end expected">;
[144]NUMBER:           imagnumber / floatnumber / integer ^([lL]?) ;
[145]STRING:           stringprefix? (longstring/shortstring) unset_raw_;
[146]^^integer:          [1-9][0-9]* / '0'[xX] [0-9a-fA-F]+ / '0'[0-7]*;
[147]^^floatnumber:      pointfloat exponent? / [0-9]+ exponent;
[148]pointfloat:         [0-9]* '.' [0-9]+ / [0-9]+ '.';
[149]exponent:           [eE][+-]?[0-9]+;
[150]^^imagnumber:       (floatnumber / [0-9]+) [jJ];
[151]  stringprefix:     raw_indicating_stringprefix / unicodeonly_stringprefix;
[152]^^raw_indicating_stringprefix: 
                         ('r' /   'ur' /  'R' /   'UR' /  'Ur' /  'uR') set_raw_;
[153]^^unicodeonly_stringprefix:   
                         'u' /  'U' ;
[154]shortstring:        '"' shortstringitem1_content 
                         ('"' / FATAL<"string not terminated before input end">) / 
                         ['] shortstringitem2_content 
                         ([']/ FATAL<"string not terminated before input end">);
[155]^^shortstringitem1_content: 
                         (!'"' shortstringitem)*;
[156]^^shortstringitem2_content: 
                         (!['] shortstringitem)*;
longstring:              '"""' longstring1_content 
                         ('"""' / FATAL<"triple quoted string not terminated">) 
                       / ['][']['] longstring2_content 
                         (['][']['] / FATAL<"triple quoted string not terminated">);
[157]^^longstring1_content: (!'"""' longstringitem)* ;
[158]^^longstring2_content: (!(['][']['])longstringitem)* ;
[159]shortstringitem:    escapeseq 
                       / '\\' NL
                       / '\\' check_unrecognized_escape  . 
                       / [^\n] 
                       / FATAL<"new line in string not allowed">;
[160]longstringitem:     escapeseq / '\\' check_unrecognized_escape . /  .  ;
[161]check_unrecognized_escape: 
                         !in_raw_ WARNING<"unrecognized escape"> / &.;
// Escape sequences inside string literals. In raw mode only the \u / \U forms
// are treated as escapes; otherwise the usual single-character, named,
// unicode, octal and hex escapes. Octal escapes consist of one to three
// digits 0-7 (the class previously read [0-8], which silently accepted "\8").
[162]escapeseq:          in_raw_ '\\' ( unicode_4digits  / unicode_8digits ) 
                       / !in_raw_ '\\' 
                         ( [\\'"abfnrtv]
                         / 'N{' NAME '}' 
                         / unicode_4digits 
                         / unicode_8digits 
                         / [0-7]{1,3}
                         / hex_2digits );
[163]^^unicode_4digits:  'u'([0-9A-Fa-f]{4}/WARNING<"u must be followed by 4 hex digits">);
[164]^^unicode_8digits:  'U'([0-9A-Fa-f]{8}/WARNING<"U must be followed by 8 hex digits">);
[165]^^hex_2digits:      'x'([0-9A-Fa-f]{2} /WARNING<"x must be followed by 2 hex digits">);
set_explicit_line_joining_:;


<</Grammar>>

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Switzerland Switzerland
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions