/// Public domain code by Christopher Diggins
/// http://www.cat-language.com
using System;
using System.Collections.Generic;
using System.Text;
namespace Cat
{
/// <summary>
/// The different kinds of nodes in a parse tree are distinguished
/// using a NodeType identifier. This allows any processor of an
/// abstract syntax tree (AST) to use efficient switch statements
/// to process nodes.
/// </summary>
public enum NodeType
{
Definition,
Namespace, // not used in the current version
StackState,
SimpleType,
FxnType,
Root,
Quotation,
Word,
Number,
String,
ParanGroup,
BraceGroup,
LineComment,
Reference,
VarDecl,
Undefined
};
/// <summary>
/// An AstNode is a part of the abstract syntax tree. It can represent an identifier,
/// a definition, a symbol, a type, a part of a type, and so forth. For example
/// in languages which support expressions an AstNode might be an expression,
/// a sub-expression, an operator, and values.
///
/// For example in the following expression: 5 * (2 + 3), if I was to put square
/// brackets around each node and label them, I would get:
///
/// [5]=number [*]=operator [ ( 2 + 3 ) ]=expression
///
/// Then the paranthesized sub-expression itself would also be broken down further
/// into nodes:
///
/// [2]=number [+]=operator [3]=number
///
/// However, this example does not apply as-is to the Cat language. A sequence of
/// contiguous symbols (like arithmetic operators) are called identifiers, and
/// can be defined as functions. Cat also doesn't support paranethese except for
/// within function types. This is because Cat programs are unambiguous
/// </summary>
public class AstNode
{
#region Constructors
public AstNode(Parser p, AstNode parent_node, NodeType nt)
{
parser = p;
parent = parent_node;
node_type = nt;
marker = parser.cur_index;
line = parser.GetLine();
col = parser.GetCol();
}
#endregion
#region Public Functions
public void Complete()
{
text = parser.input.Substring(marker, parser.cur_index - marker);
}
public void AddChild(AstNode n)
{
children.Add(n);
}
public AstNode GetLastChild()
{
if (children.Count == 0) return null;
return children[children.Count - 1];
}
public void DeleteLastChildIfEmpty()
{
if (children.Count == 0) return;
if (GetLastChild().children.Count != 0) return;
children.RemoveAt(children.Count - 1);
}
public AstNode GetSibling()
{
int n = parent.children.IndexOf(this);
if (n > parent.children.Count) return null;
return parent.children[n];
}
public override string ToString()
{
return text;
}
#endregion
#region Fields
public Parser parser = null;
public AstNode parent = null;
public NodeType node_type = NodeType.Undefined;
public List<AstNode> children = new List<AstNode>();
public int marker = -1;
public string text = "";
public int line = -1;
public int col = -1;
/// <summary>
/// Used to signal that the node parsing was cut-short.
/// </summary>
public bool is_incomplete = false;
#endregion
public bool IsComment()
{
return (node_type == NodeType.LineComment);
}
}
/// <summary>
/// The Parser creates an abstract syntax tree (AST) which represents the input
/// source code in a more easily managed format. Programs, expressions, and types
/// provide constructors for creating nodes from the parse tree.
///
/// Because the language is so simple, and computers are so fast, there
/// is no need for a lexing phase in modern compilers. All of my compilers and
/// interpreters merge the lexing phase and parse phase.
///
/// Lexing, or lexical analysis is the transformation of source code into tokens.
/// </summary>
public class Parser
{
#region Parsing Error and Validation functions
public static string GetStringFromLine(Parser p, int nLine)
{
string[] lines = p.input.Split(new char[] { '\n' });
return lines[nLine];
}
public static string GetPointerFromCol(Parser p, int nCol)
{
string s = new string(' ', nCol);
return s + '^';
}
public static void Error(AstNode node, String msg)
{
Console.WriteLine("Parsing error at line " + (node.line + 1) + " : " + msg);
Console.WriteLine(GetStringFromLine(node.parser, node.line));
Console.WriteLine(GetPointerFromCol(node.parser, node.col));
throw new Exception("Handled parsing error");
}
public static void AssertNodeType(AstNode node, NodeType nt)
{
if (node.node_type != nt)
{
Error(node, "unexpected node-type " + node.node_type.ToString() + ", expected " + nt.ToString());
}
}
public static void AssertNonEmptyName(AstNode node)
{
AssertNodeType(node, NodeType.Word);
string s = node.ToString();
if (s.Length < 1) Error(node, "zero length name");
}
public static void AssertNodeChildCount(AstNode node, int cnt)
{
if (node.children.Count != cnt) Error(node, "expected " + cnt + " child-nodes, instead found " + node.children.Count);
}
#endregion
#region Fields
public string input;
public int cur_index = 0;
public AstNode root;
public AstNode cur_node;
private int cur_line = 0;
private int cur_col = 0;
#endregion
public Parser()
{
root = new AstNode(this, null, NodeType.Root);
cur_node = root;
}
public bool IsIncomplete()
{
return cur_node.is_incomplete;
}
public AstNode GetRoot()
{
return root;
}
public void Clear()
{
GetChildren().Clear();
cur_node = root;
}
public List<AstNode> GetChildren()
{
return root.children;
}
// The parsing utility functions contain things like is_letter, at_number, etc.
#region Parsing Utility Functions
public bool is_letter(char c)
{
return ((c >= 'a') && (c <= 'z')) ||
((c >= 'A') && (c <= 'Z')) ||
(c == '_');
}
public bool is_number(char c)
{
return ((c >= '0') && (c <= '9'));
}
public bool is_alphanum(char c)
{
return is_letter(c) || is_number(c);
}
public bool is_space(char c)
{
return Char.IsWhiteSpace(c);
}
public bool is_newline(char c)
{
return (c == '\r') || (c == '\n') || (c == '\u000D') || (c == '\u000A') || (c == '\u0085') || (c == '\u2028');
}
public bool is_symbol(char c)
{
switch (c)
{
case (':'): return true;
case ('~'): return true;
case ('`'): return true;
case ('!'): return true;
case ('#'): return true;
case ('$'): return true;
case ('%'): return true;
case ('^'): return true;
case ('&'): return true;
case ('*'): return true;
case ('-'): return true;
case ('/'): return true;
case ('+'): return true;
case ('='): return true;
case ('|'): return true;
case ('\''): return true;
case ('<'): return true;
case (','): return true;
case ('>'): return true;
case ('.'): return true;
case ('?'): return true;
case ('\\'): return true;
default: return false;
}
}
public bool at_end()
{
return cur_index >= input.Length;
}
public bool at_number()
{
if (at_end()) return false;
return is_number(peek());
}
public bool at_letter()
{
if (at_end()) return false;
return is_letter(peek());
}
public bool at_space()
{
if (at_end()) return false;
return is_space(peek());
}
public bool at_newline()
{
if (at_end()) return true;
return is_newline(peek());
}
public bool next_char_is(char c)
{
if (cur_index >= input.Length - 1) return false;
return (peek_next() == c) ;
}
public bool at_alphanum()
{
if (at_end()) return false;
return is_alphanum(peek());
}
public bool at_char(char c)
{
if (at_end()) return false;
return peek() == c;
}
public bool at_symbol()
{
if (at_end()) return false;
return is_symbol(peek());
}
public char peek()
{
return input[cur_index];
}
public char peek_next()
{
return input[cur_index + 1];
}
public char next()
{
++cur_col;
return input[++cur_index];
}
public void goto_next()
{
++cur_col;
if (is_newline(input[cur_index++]))
{
cur_line++;
cur_col = 0;
}
}
#endregion
public void Load(string s)
{
input += s;
}
public bool Parse(string s)
{
Load(s);
try
{
if (IsIncomplete())
{
Resume();
}
ParseLoop();
}
catch (Exception e)
{
cur_node.col = cur_col;
cur_node.line = cur_line;
Error(cur_node, e.Message);
}
return cur_node == root;
}
public void SetInput(string s)
{
cur_index = 0;
cur_col = 0;
cur_line = 0;
input = s;
}
/// <summary>
/// Takes the oldest parse node from the parser, and uses it to create
/// an instruction, or a definition.
/// </summary>
public AstNode ExtractNode()
{
if (root.children.Count == 0) return null;
AstNode result = root.children[0];
if (result.is_incomplete) return null;
root.children.RemoveAt(0);
return result;
}
// The parse_loop function is a recursive version of the Parse function
#region Parse Loop
private AstNode CreateNewNode(NodeType nt)
{
// it is VERY important that nodes only get added to the parent once
// they are fully parsed
cur_node = new AstNode(this, cur_node, nt);
return cur_node;
}
private void CompleteCurrentNode()
{
cur_node.Complete();
// it is VERY important that nodes only get added to the parent once
// they are fully parsed
cur_node.parent.AddChild(cur_node);
cur_node.is_incomplete = false;
cur_node = cur_node.parent;
if (cur_node == null)
{
throw new Exception("parsing error");
}
}
private void EatWS()
{
while (!at_end() && at_space()) goto_next();
}
private bool ParseText(string s)
{
int old_index = cur_index;
int old_col = cur_col;
int i = 0;
while (i < s.Length)
{
if (at_end() || s[i] != peek())
{
cur_index = old_index;
cur_col = old_col;
return false;
}
++i;
++cur_index;
++cur_col;
}
if (!at_space())
{
cur_index = old_index;
cur_col = old_col;
return false;
}
return true;
}
private void advance_char(char c)
{
if (at_end() || peek() != c)
{
Error(cur_node, "expected " + c);
}
goto_next();
}
private void ParseBraceGroup()
{
CreateNewNode(NodeType.BraceGroup);
advance_char('{');
CompleteBraceGroup();
}
private void CompleteBraceGroup()
{
ParseLoop();
if (at_end())
{
cur_node.is_incomplete = true;
}
else
{
advance_char('}');
CompleteCurrentNode();
// Complete the owner function as well.
CompleteCurrentNode();
}
}
private void ParseQuote()
{
CreateNewNode(NodeType.Quotation);
advance_char('[');
CompleteQuote();
}
private void CompleteQuote()
{
ParseLoop();
if (at_end())
{
cur_node.is_incomplete = true;
}
else
{
advance_char(']');
CompleteCurrentNode();
}
}
/// <summary>
/// Generates a node representing a type from a string.
/// </summary>
public AstNode ParseType(string s)
{
Clear();
SetInput(s);
ParseType();
return root.children[0];
}
private void ParseType()
{
if (at_char('('))
{
ParseFxnType();
}
else if (at_letter())
{
ParseSimpleTypeOrVarDecl();
}
else
{
throw new Exception("Unrecognized type");
}
}
private void ParseSimpleTypeOrVarDecl()
{
CreateNewNode(NodeType.SimpleType);
ParseWord();
if (at_char(':'))
{
// Change the node type, we know this is actually a Variable declaration
cur_node.node_type = NodeType.VarDecl;
advance_char(':');
ParseWord();
EatWS();
if (at_char('*'))
ParseWord();
}
CompleteCurrentNode();
EatWS();
}
private void ParseStackState()
{
CreateNewNode(NodeType.StackState);
advance_char('(');
EatWS();
while (!at_char(')'))
{
ParseType();
EatWS();
}
advance_char(')');
CompleteCurrentNode();
EatWS();
}
private void ParseFxnType()
{
CreateNewNode(NodeType.FxnType);
ParseStackState();
if (at_char('('))
{
ParseStackState();
ParseWord(); // This is either "->" or "~>"
EatWS();
ParseStackState();
ParseStackState();
}
else
{
ParseWord(); // This is either "->" or "~>"
EatWS();
ParseStackState();
}
CompleteCurrentNode();
EatWS();
}
private void ParseNumber()
{
CreateNewNode(NodeType.Number);
while (at_number())
{
goto_next();
}
CompleteCurrentNode();
}
private void ParseWord()
{
CreateNewNode(NodeType.Word);
if (at_letter())
{
while (at_alphanum())
goto_next();
}
else if (at_symbol())
{
while (at_symbol())
goto_next();
}
CompleteCurrentNode();
}
private void ParseString()
{
CreateNewNode(NodeType.String);
goto_next();
while (!at_char('\"'))
goto_next();
goto_next();
CompleteCurrentNode();
}
private void ParseTerm()
{
if (at_letter() || at_symbol())
{
ParseWord();
EatWS();
}
else if (at_number())
{
ParseNumber();
EatWS();
}
else if (at_char('"'))
{
ParseString();
EatWS();
}
else {
throw new Exception("Expected a term");
}
}
private void Resume()
{
if (cur_node.node_type == NodeType.Quotation)
{
CompleteQuote();
}
else if (cur_node.node_type == NodeType.BraceGroup)
{
CompleteBraceGroup();
}
else
{
throw new Exception("only quotations and brace groups can be left in an incomplete state by the parser");
}
}
private void ParseLoop()
{
while (!at_end())
{
if (at_char('['))
{
ParseQuote();
}
else if (at_char(']'))
{
return;
}
else if (at_char('('))
{
ParseType();
}
else if (at_char(')'))
{
return;
}
else if (at_char('{'))
{
ParseBraceGroup();
}
else if (at_char('}'))
{
return;
}
else if (at_char('\"'))
{
ParseString();
EatWS();
}
else if (at_char('@'))
{
CreateNewNode(NodeType.Reference);
goto_next();
EatWS();
ParseTerm();
CompleteCurrentNode();
}
else if (at_letter() || at_symbol())
{
if (ParseText("define"))
{
CreateNewNode(NodeType.Definition);
cur_col += "define".Length;
EatWS();
ParseWord();
EatWS();
if (at_char(':'))
{
goto_next();
EatWS();
ParseType();
EatWS();
}
ParseBraceGroup();
}
else
{
ParseWord();
EatWS();
}
}
else if (at_space())
{
EatWS();
}
else if (at_number())
{
ParseNumber();
EatWS();
}
else if (at_char('/') && next_char_is('/'))
{
CreateNewNode(NodeType.LineComment);
goto_next();
while (!at_end() && !at_newline())
{
goto_next();
}
CompleteCurrentNode();
}
else
{
throw new Exception("failed to parse input string");
}
}
}
#endregion
public int GetLine()
{
return cur_line;
}
public int GetCol()
{
return cur_col;
}
/// <summary>
/// This function must be used instead of int.ParseInt due to bugs in the
/// current version of Mono.
/// </summary>
public static int StringToInt(string s)
{
int ret = 0;
int nMul = 1;
for (int i = s.Length - 1; i >= 0; --i)
{
char c = s[i];
if ((c < '0') || (c > '9'))
throw new Exception("expected integer " + s);
int nDigit = c - '0';
ret += nDigit * nMul;
nMul *= 10;
}
return ret;
}
}
}