Click here to Skip to main content
15,891,607 members
Articles / Programming Languages / C#

Irony - .NET Compiler Construction Kit

Rate me:
Please Sign up or sign in to vote.
4.97/5 (86 votes)
4 Jan 2008MIT19 min read 295.1K   3.2K   201  
Introduction to Irony - a new technology of parser/compiler construction for .NET.
#region License
/* **********************************************************************************
 * Copyright (c) Roman Ivantsov
 * This source code is subject to terms and conditions of the MIT License
 * for Irony. A copy of the license can be found in the License.txt file
 * at the root of this distribution. 
 * By using this source code in any fashion, you are agreeing to be bound by the terms of the 
 * MIT License.
 * You must not remove this notice from this software.
 * **********************************************************************************/
#endregion

using System;
using System.Collections.Generic;
using System.Text;

namespace Irony.Compiler {

  #region Grammar class
  public abstract class Grammar {

    #region properties: CaseSensitive, WhitespaceChars, Root, TokenFilters
    public bool CaseSensitive = true;
    public string WhitespaceChars = " \t\r\n\v";

    //Terminals not reachable from the Root (Comment terminal is usually one of them)
    public readonly TerminalList ExtraTerminals = new TerminalList();

    //Punctuation symbols are those that are excluded from arguments of node constructors
    public readonly KeyList PunctuationSymbols = new KeyList();

    //Default node type; if null then GenericNode type is used. 
    public Type DefaultNodeType;

    #region Comments
    //  If the following flag is true, the scanner removes all keyword-like terminals (those that start with a letter)
    //  from the list of terminals found in grammar rules. These symbols are treated 
    //  as something else in the input stream  and the grammar should include a terminal (usuall Identifier)
    //  that would match these keywords. 
    //  For ex., if there is a symbol "begin" used somewhere in grammar rules, it will not be included 
    //  into final list of terminals, and word "begin" in input stream will be represented by the Identifier token.  
    //  The parser will match it to the "begin" symbol specified in the expression 
    //   thus recognizing it as a keyword. 
    //  This is a pure optimization option, to improve scanner performance (terminal lookup by char returns less terminals). 
    //  It is recommended to set it to true for most languages. 
    //  For some languages like PHP that has all variable IDs start with "$" it should be set to false.
    #endregion
    public bool NoKeywordTerminals = true;

    public NonTerminal Root  {
      get {return _root;}
      set { _root = value;  }
    } NonTerminal _root;

    public TokenFilterList TokenFilters = new TokenFilterList();

    #endregion 

    #region Operators handling
    public readonly OperatorInfoTable Operators = new OperatorInfoTable();

    public void RegisterOperators(int precedence, params string[] opSymbols) {
      RegisterOperators(precedence, Associativity.Left, opSymbols);
    }
    public void RegisterOperators(int precedence, Associativity associativity, params string[] opSymbols) {
      foreach (string op in opSymbols) {
        if (Operators.ContainsKey(op))
          throw new ApplicationException("Operator '" + op + "' is registered more than once.");
        Operators[op] = new OperatorInfo(op, precedence, associativity);
      }
    }//method
    #endregion

    #region virtual methods: CreateNode
    // Override this method in language grammar if you want a custom node creation mechanism.
    public virtual AstNode CreateNode(CompilerContext context, ActionRecord reduceAction, 
                                      SourceLocation location, AstNodeList childNodes) {
      AstNode node;
      //First check and try custom NodeCreator method attached to non-terminal
      if (reduceAction.NonTerminal.NodeCreator != null) {
        node = reduceAction.NonTerminal.NodeCreator(context, reduceAction, location, childNodes);
        if (node != null) return node;
      }
      //General node-creation path
      Type nodeType = reduceAction.NonTerminal.NodeType ?? this.DefaultNodeType;
      if (childNodes.Count == 0) {
        //Create NULL node
        node = null; // new AstNode(reduceAction.NonTerminal, location, true);
      } else if (nodeType == null || nodeType == typeof(GenericNode)) {
        //GenericNode
        if (childNodes.Count == 1) 
          //Node bubbling. For the default case, if no node type is specified (meaning "use GenericNode"),
          // and new node has just one child node (meaning reduce on identity-like production A->B),
          // then we do not create new node but simply return the child node itself. So nodes "bubble-up" to higher
          // levels. This simplifies the default syntax tree. 
          node = childNodes[0];
        else
          node = new GenericNode(context, reduceAction.NonTerminal, location, childNodes);
      } else {
        //Custom node type
        node = (AstNode)Activator.CreateInstance(nodeType, context, reduceAction.NonTerminal, location, childNodes);
      }
      return node;      
    }
    #endregion

    #region Static utility methods used in custom grammars: Symbol(), ToElement, WithStar, WithPlus, WithQ
    protected static SymbolTerminal Symbol(string symbol) {
      return SymbolTerminal.GetSymbol(symbol);
    }
    protected static SymbolTerminal Symbol(string symbol, string name) {
      return SymbolTerminal.GetSymbol(symbol, name);
    }
    protected static BnfElement ToElement(BnfExpression expression) {
      string name = expression.ToString();
      return new NonTerminal(name, expression);
    }
    protected static BnfElement WithStar(BnfExpression expression) {
      return ToElement(expression).Star();
    }
    protected static BnfElement WithPlus(BnfExpression expression) {
      return ToElement(expression).Plus();
    }
    protected static BnfElement WithQ(BnfExpression expression) {
      return ToElement(expression).Q();
    }
    public static Token CreateErrorToken(SourceLocation location, string message, params object[] args) {
      if (args != null && args.Length > 0)
        message = string.Format(message, args);
      return new Token(Grammar.Error, location, message);
    }
    #endregion

    #region Standard terminals: EOF, Empty, NewLine, Indent, Dedent
    // Identifies end of file
    // Note: do not use this symbol in grammar rules. Parser automatically add this symbol 
    // as a lookahead to Root non-terminal
    public readonly static Terminal Eof = new Terminal("_EOF", TokenCategory.Outline);
    // Empty object is used to identify optional element: 
    //    elem.Rule = elem1 | Empty;
    public readonly static Terminal Empty = new Terminal("_Empty", TokenCategory.Outline);
    public readonly static Terminal Error = new Terminal("_Error", TokenCategory.Error);
    // The following terminals are used in indent-sensitive languages like Python 
    public readonly static Terminal NewLine = new Terminal("_LF", TokenCategory.Outline);
    public readonly static Terminal Indent = new Terminal("_Indent", TokenCategory.Outline);
    public readonly static Terminal Dedent = new Terminal("_Dedent", TokenCategory.Outline);

    // Special terminal for ReservedWord token produced by IdentifierTerminal when lexeme matches one of reserved words. 
    // It is sometimes(not always) necessary to distinguish reserved words from free identifiers in the input stream.
    public readonly static Terminal ReservedWord = new Terminal("ReservedWord", TokenMatchMode.ByValue);
    
    #endregion


    #region Optimization 
    //Not sure if this is useful ----------------------------------------------
    public void FlattenNestedORs() {
      NonTerminalList listInProgress = new NonTerminalList();
      FlattenNestedORs(this.Root, listInProgress);
    }
    private void FlattenNestedORs(NonTerminal nonTerminal, NonTerminalList listInProgress) {
    
      if (nonTerminal == null)
        return;
      if (listInProgress.Contains(nonTerminal))
        return; //prevent endless looping due to recurrency
      listInProgress.Add(nonTerminal);
      //process all children
      foreach (IBnfExpression  ichild in nonTerminal.Expression.Operands)
        FlattenNestedORs(ichild as NonTerminal, listInProgress);
      listInProgress.Remove(nonTerminal);
      //See if we need to process this NonTerminal
      if (nonTerminal.Expression.ExpressionType != BnfExpressionType.Alternative)
        return;

      //Go through children and see if we can embed grandchildren directly 
      BnfExpressionList args = nonTerminal.Expression.Operands;
      for (int i = 0; i < args.Count; i++) {
        NonTerminal arg = args[i] as NonTerminal;
        if (arg == null || arg == nonTerminal  //avoid infinite loops 
          || arg.Expression.ExpressionType != BnfExpressionType.Alternative
          || arg.Expression.Operands.Count == 0)
          continue;
        //Insert child's alternatives directly here
        args.RemoveAt(i);
        args.InsertRange(i, arg.Expression.Operands);
      }//for i
       
    }//method

    #endregion
        
  }//class
  #endregion


}//namespace

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The MIT License


Written By
Software Developer (Senior) Microsoft
United States United States
25 years of professional experience. .NET/c#, databases, security.
Currently Senior Security Engineer, Cloud Security, Microsoft

Comments and Discussions