Click here to Skip to main content
15,896,557 members
Articles / Programming Languages / Visual Basic

A Tiny Parser Generator v1.2

Rate me:
Please Sign up or sign in to vote.
4.94/5 (201 votes)
21 Sep 2010 · CPOL · 25 min read · 675.7K   17.5K   465  
TinyPG is a utility that makes it easier to write and try out your own parser/compiler.
' Generated by TinyPG v1.2 available at www.codeproject.com

Imports System
Imports System.Collections.Generic
Imports System.Text.RegularExpressions


Namespace TinyPG
#Region "Scanner"

    Partial Public Class Scanner
        ' Text being scanned; assigned by Init.
        Public Input As String
        ' Current scan window. Init resets both to 0; Scan moves both to the end of the token it returns.
        Public StartPos As Integer = 0
        Public EndPos As Integer = 0
        ' Position counters. Init resets them to 0; nothing else in this part of the
        ' Partial class updates them -- presumably maintained elsewhere, verify.
        Public CurrentLine As Integer
        Public CurrentColumn As Integer
        Public CurrentPosition As Integer
        Public Skipped As List(Of Token) ' tokens that were skipped; the list is created by Init, so it is Nothing until Init runs

        ' Token cached by LookAhead so repeated calls do not rescan; cleared by Scan and Init.
        Private LookAheadToken As Token
        ' Parallel lists filled by the constructor: Patterns(i) is the regex that recognizes Tokens(i).
        Private Patterns As List(Of Regex)
        Private Tokens As List(Of TokenType)
        Private SkipList As List(Of TokenType) ' tokens to be skipped

        ''' <summary>
        ''' Builds the scanner tables: the list of token types to skip and the
        ''' parallel Patterns/Tokens lists that pair each regular expression
        ''' with the terminal token type it recognizes.
        ''' </summary>
        Public Sub New()
            LookAheadToken = Nothing

            ' token types that LookAhead records in Skipped instead of returning
            SkipList = New List(Of TokenType)()
            SkipList.Add(TokenType.WHITESPACE)
            SkipList.Add(TokenType.COMMENTLINE)
            SkipList.Add(TokenType.COMMENTBLOCK)

            ' Patterns(i) recognizes Tokens(i). Registration order matters:
            ' LookAhead only replaces its candidate on a strictly longer match,
            ' so on equal-length matches the pattern registered first wins.
            Patterns = New List(Of Regex)()
            Tokens = New List(Of TokenType)()

            Patterns.Add(New Regex("\(", RegexOptions.Compiled))
            Tokens.Add(TokenType.BRACKETOPEN)

            Patterns.Add(New Regex("\)", RegexOptions.Compiled))
            Tokens.Add(TokenType.BRACKETCLOSE)

            Patterns.Add(New Regex("\{[^\}]*\}([^};][^}]*\}+)*;", RegexOptions.Compiled))
            Tokens.Add(TokenType.CODEBLOCK)

            Patterns.Add(New Regex(",", RegexOptions.Compiled))
            Tokens.Add(TokenType.COMMA)

            Patterns.Add(New Regex("\[", RegexOptions.Compiled))
            Tokens.Add(TokenType.SQUAREOPEN)

            Patterns.Add(New Regex("\]", RegexOptions.Compiled))
            Tokens.Add(TokenType.SQUARECLOSE)

            Patterns.Add(New Regex("=", RegexOptions.Compiled))
            Tokens.Add(TokenType.ASSIGN)

            Patterns.Add(New Regex("\|", RegexOptions.Compiled))
            Tokens.Add(TokenType.PIPE)

            Patterns.Add(New Regex(";", RegexOptions.Compiled))
            Tokens.Add(TokenType.SEMICOLON)

            Patterns.Add(New Regex("(\*|\+|\?)", RegexOptions.Compiled))
            Tokens.Add(TokenType.UNARYOPER)

            Patterns.Add(New Regex("[a-zA-Z_][a-zA-Z0-9_]*", RegexOptions.Compiled))
            Tokens.Add(TokenType.IDENTIFIER)

            Patterns.Add(New Regex("[0-9]+", RegexOptions.Compiled))
            Tokens.Add(TokenType.CINTEGER)

            Patterns.Add(New Regex("[0-9]*\.[0-9]+", RegexOptions.Compiled))
            Tokens.Add(TokenType.CDOUBLE)

            Patterns.Add(New Regex("(0x[0-9a-fA-F]{6})", RegexOptions.Compiled))
            Tokens.Add(TokenType.HEX)

            Patterns.Add(New Regex("->", RegexOptions.Compiled))
            Tokens.Add(TokenType.ARROW)

            Patterns.Add(New Regex("<%\s*@", RegexOptions.Compiled))
            Tokens.Add(TokenType.DIRECTIVEOPEN)

            Patterns.Add(New Regex("%>", RegexOptions.Compiled))
            Tokens.Add(TokenType.DIRECTIVECLOSE)

            ' matches only when the remaining input is empty
            Patterns.Add(New Regex("^$", RegexOptions.Compiled))
            Tokens.Add(TokenType.EOF)

            Patterns.Add(New Regex("@?\""(\""\""|[^\""])*\""", RegexOptions.Compiled))
            Tokens.Add(TokenType.CSTRING)

            Patterns.Add(New Regex("\s+", RegexOptions.Compiled))
            Tokens.Add(TokenType.WHITESPACE)

            Patterns.Add(New Regex("//[^\n]*\n?", RegexOptions.Compiled))
            Tokens.Add(TokenType.COMMENTLINE)

            Patterns.Add(New Regex("/\*[^*]*\*+(?:[^/*][^*]*\*+)*/", RegexOptions.Compiled))
            Tokens.Add(TokenType.COMMENTBLOCK)
        End Sub

        ''' <summary>
        ''' Loads a new input string and rewinds the scanner: all positions go
        ''' back to 0, the skipped-token list is replaced by a fresh empty list
        ''' and any cached lookahead token is discarded.
        ''' </summary>
        ''' <param name="input">the text to be scanned</param>
        Public Sub Init(ByVal input As String)
            ' drop whatever a previous scan left behind
            LookAheadToken = Nothing
            Skipped = New List(Of Token)()

            ' rewind every position counter
            CurrentPosition = 0
            CurrentColumn = 0
            CurrentLine = 0
            EndPos = 0
            StartPos = 0

            ' Me. is required: the parameter name hides the field (VB is case-insensitive)
            Me.Input = input
        End Sub

        ''' <summary>
        ''' Creates a token of the requested type that spans the scanner's
        ''' current StartPos..EndPos and whose text is the name of the type.
        ''' </summary>
        ''' <param name="type">the token type to assign</param>
        ''' <returns>the newly created token</returns>
        Public Function GetToken(ByVal type As TokenType) As Token
            Dim result As Token = New Token(StartPos, EndPos)
            With result
                .Type = type
                .Text = type.ToString()
            End With
            Return result
        End Function

        ''' <summary>
        ''' Consumes the next token: obtains it through LookAhead, clears the
        ''' cached lookahead and moves the scan position to the end of that token.
        ''' </summary>
        ''' <returns>the token that was consumed</returns>
        Public Function Scan() As Token
            ' fetch the upcoming token (served from the cache when one exists)
            Dim consumed As Token = LookAhead()

            ' forget the cached token so the next LookAhead scans fresh input
            LookAheadToken = Nothing

            ' continue scanning right behind the consumed token
            Dim resumeAt As Integer = consumed.EndPos
            StartPos = resumeAt
            EndPos = resumeAt

            Return consumed
        End Function

        ''' <summary>
        ''' Returns the next token that is not in the skip list, without
        ''' consuming it. Every pattern is tried against the remaining input and
        ''' the longest match anchored at the current position wins; on equal
        ''' lengths the pattern registered first is kept. Tokens whose type is in
        ''' the skip list are appended to Skipped and scanning continues behind
        ''' them. The result is cached until Scan or Init clears it.
        ''' </summary>
        ''' <returns>
        ''' the next token; when no pattern matches, a token of type
        ''' _UNDETERMINED_ whose Text is the single offending character
        ''' </returns>
        Public Function LookAhead() As Token
            ' this prevents double scanning and matching
            ' increased performance
            If LookAheadToken IsNot Nothing Then
                Return LookAheadToken
            End If

            Dim start As Integer = StartPos
            Dim tok As Token = Nothing
            Dim skip As Boolean

            Do
                Dim len As Integer = -1
                Dim index As Integer = -1

                ' Patterns are matched against the remaining text so that a hit
                ' at the current position is reported with Index = 0.
                Dim m_input As String = Input.Substring(start)

                tok = New Token(start, EndPos)

                For i As Integer = 0 To Patterns.Count - 1
                    Dim m As Match = Patterns(i).Match(m_input)
                    ' strict ">" keeps the earliest pattern among equally long matches
                    If m.Success AndAlso m.Index = 0 AndAlso m.Length > len Then
                        len = m.Length
                        index = i
                    End If
                Next

                If index >= 0 AndAlso len >= 0 Then
                    tok.EndPos = start + len
                    tok.Text = Input.Substring(tok.StartPos, len)
                    tok.Type = Tokens(index)
                ElseIf tok.StartPos < Input.Length Then
                    ' Nothing matched: report the offending character. The guard
                    ' must test StartPos because that is the index Substring
                    ' reads from; EndPos may lag behind after skipped tokens.
                    tok.Text = Input.Substring(tok.StartPos, 1)
                End If

                skip = SkipList.Contains(tok.Type)
                If skip Then
                    ' remember the skipped token and resume right behind it
                    start = tok.EndPos
                    Skipped.Add(tok)
                End If
            Loop While skip

            LookAheadToken = tok
            Return tok
        End Function
    End Class
#End Region

#Region "Token"

    ' Identifies the kind of a Token. The terminal values are the ones the
    ' Scanner constructor registers a regular expression for.
    Public Enum TokenType

        'Special values: _UNDETERMINED_ is the type every new Token starts with
        '(assigned in the Token constructor). _NONE_ is not referenced in this
        'file -- presumably a "no token" marker; verify against the parser.
        _NONE_      = 0
        _UNDETERMINED_= 1

        'Non terminal tokens (no scanner pattern exists for these; presumably
        'they name the grammar rules used by the parser -- verify):
        Start       = 2
        Directive   = 3
        NameValue   = 4
        ExtProduction= 5
        Attribute   = 6
        Params      = 7
        Param       = 8
        Production  = 9
        Rule        = 10
        Subrule     = 11
        ConcatRule  = 12
        Symbol      = 13

        'Terminal tokens (each one has a matching pattern in Scanner.New;
        'WHITESPACE, COMMENTLINE and COMMENTBLOCK are on the scanner's skip list):
        BRACKETOPEN = 14
        BRACKETCLOSE= 15
        CODEBLOCK   = 16
        COMMA       = 17
        SQUAREOPEN  = 18
        SQUARECLOSE = 19
        ASSIGN      = 20
        PIPE        = 21
        SEMICOLON   = 22
        UNARYOPER   = 23
        IDENTIFIER  = 24
        CINTEGER    = 25
        CDOUBLE     = 26
        HEX         = 27
        ARROW       = 28
        DIRECTIVEOPEN= 29
        DIRECTIVECLOSE= 30
        EOF         = 31
        CSTRING     = 32
        WHITESPACE  = 33
        COMMENTLINE = 34
        COMMENTBLOCK= 35
    End Enum

    ''' <summary>
    ''' A piece of the input: its start/end offsets, the text it covers, its
    ''' token type and an optional value object.
    ''' </summary>
    Public Class Token
        Private startIndex As Integer
        Private endIndex As Integer
        Private tokenText As String
        Private tokenValue As Object

        ''' <summary>Kind of token; a new token starts as _UNDETERMINED_.</summary>
        Public Type As TokenType

        ''' <summary>Creates an empty token positioned at 0..0.</summary>
        Public Sub New()
            Me.New(0, 0)
        End Sub

        ''' <summary>
        ''' Creates an undetermined token covering the given range, with empty
        ''' text and no value.
        ''' </summary>
        ''' <param name="start">offset where the token begins</param>
        ''' <param name="endPos">offset where the token ends</param>
        Public Sub New(ByVal start As Integer, ByVal endPos As Integer)
            startIndex = start
            endIndex = endPos
            Type = TokenType._UNDETERMINED_
            ' text must start out as an empty string, otherwise consumers may
            ' run into null reference exceptions
            tokenText = ""
            tokenValue = Nothing
        End Sub

        ''' <summary>Offset where the token begins.</summary>
        Public Property StartPos() As Integer
            Get
                Return startIndex
            End Get
            Set(ByVal value As Integer)
                startIndex = value
            End Set
        End Property

        ''' <summary>Offset where the token ends.</summary>
        Public Property EndPos() As Integer
            Get
                Return endIndex
            End Get
            Set(ByVal value As Integer)
                endIndex = value
            End Set
        End Property

        ''' <summary>Difference between the end and the start offset.</summary>
        Public ReadOnly Property Length() As Integer
            Get
                Return endIndex - startIndex
            End Get
        End Property

        ''' <summary>Text associated with the token.</summary>
        Public Property Text() As String
            Get
                Return tokenText
            End Get
            Set(ByVal value As String)
                tokenText = value
            End Set
        End Property

        ''' <summary>Arbitrary object attached to the token.</summary>
        Public Property Value() As Object
            Get
                Return tokenValue
            End Get
            Set(ByVal value As Object)
                tokenValue = value
            End Set
        End Property

        ''' <summary>
        ''' Widens this token's range so that it also covers the range of the
        ''' given token; the range never shrinks.
        ''' </summary>
        ''' <param name="token">token whose range must be included</param>
        Public Sub UpdateRange(ByVal token As Token)
            startIndex = Math.Min(startIndex, token.StartPos)
            endIndex = Math.Max(endIndex, token.EndPos)
        End Sub

        ''' <summary>
        ''' Returns the type name, followed by the quoted text when the token
        ''' has any text.
        ''' </summary>
        Public Overloads Overrides Function ToString() As String
            Dim typeName As String = Type.ToString()

            ' VB treats Nothing as "" in string comparisons, so this is true
            ' for both a missing and an empty text
            If Text = Nothing Then
                Return typeName
            End If

            Return typeName & " '" & Text & "'"
        End Function
    End Class
#End Region
End Namespace

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Architect Rubicon
Netherlands
Currently Herre Kuijpers is employed at Rubicon. During his career he developed skills with all kinds of technologies, methodologies and programming languages such as C#, ASP.NET, .NET Core, VC++, JavaScript, SQL, Agile, Scrum, DevOps, ALM. Currently he fulfills the role of software architect in various projects.

Herre Kuijpers is a very experienced software architect with deep knowledge of software design and development on the Microsoft .NET platform. He has a broad knowledge of Microsoft products and knows how these, in combination with custom software, can be optimally implemented in the often complex environment of the customer.

Comments and Discussions