More doc updates

David Beazley 2016-09-14 14:21:33 -05:00
parent 5c3083712f
commit 0a17f78d2e
5 changed files with 423 additions and 103 deletions

192
README.md

@@ -1,4 +1,192 @@
# SLY (Sly Lex Yacc)
SLY (Sly Lex-Yacc) Version 0.0
Copyright (C) 2016
David M. Beazley (Dabeaz LLC)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of David Beazley nor Dabeaz LLC may be used to
endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Requirements
============
SLY requires the use of Python 3.5 or greater. Older versions
of Python are not supported.
Introduction
============
SLY is a 100% Python implementation of the lex and yacc tools
commonly used to write parsers and compilers. Parsing is
based on the same LALR(1) algorithm used by many yacc tools.
Here are a few notable features:
- SLY provides *very* extensive error reporting and diagnostic
information to assist in parser construction. The original
implementation was developed for instructional purposes. As
a result, the system tries to identify the most common types
of errors made by novice users.
- SLY provides full support for empty productions, error recovery,
precedence specifiers, and moderately ambiguous grammars.
- SLY uses various Python metaprogramming features to specify
lexers and parsers. There are no generated files or extra
steps involved. You simply write Python code and run it.
- SLY can be used to build parsers for "real" programming languages.
Although it is not ultra-fast due to its Python implementation,
SLY can be used to parse grammars consisting of several hundred
rules (as might be found for a language like C).
An Example
==========
SLY is probably best illustrated by an example. Here's what it
looks like to write a parser that can evaluate simple arithmetic
expressions and store variables:
# -----------------------------------------------------------------------------
# calc.py
# -----------------------------------------------------------------------------

from sly import Lexer, Parser

class CalcLexer(Lexer):
    tokens = {
        'NAME', 'NUMBER',
    }
    ignore = ' \t'
    literals = { '=', '+', '-', '*', '/', '(', ')' }

    # Tokens
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t

    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, value):
        print("Illegal character '%s'" % value[0])
        self.index += 1

class CalcParser(Parser):
    tokens = CalcLexer.tokens

    precedence = (
        ('left', '+', '-'),
        ('left', '*', '/'),
        ('right', 'UMINUS'),
    )

    def __init__(self):
        self.names = { }

    @_('NAME "=" expr')
    def statement(self, p):
        self.names[p.NAME] = p.expr

    @_('expr')
    def statement(self, p):
        print(p.expr)

    @_('expr "+" expr')
    def expr(self, p):
        return p.expr0 + p.expr1

    @_('expr "-" expr')
    def expr(self, p):
        return p.expr0 - p.expr1

    @_('expr "*" expr')
    def expr(self, p):
        return p.expr0 * p.expr1

    @_('expr "/" expr')
    def expr(self, p):
        return p.expr0 / p.expr1

    @_('"-" expr %prec UMINUS')
    def expr(self, p):
        return -p.expr

    @_('"(" expr ")"')
    def expr(self, p):
        return p.expr

    @_('NUMBER')
    def expr(self, p):
        return p.NUMBER

    @_('NAME')
    def expr(self, p):
        try:
            return self.names[p.NAME]
        except LookupError:
            print("Undefined name '%s'" % p.NAME)
            return 0

if __name__ == '__main__':
    lexer = CalcLexer()
    parser = CalcParser()
    while True:
        try:
            text = input('calc > ')
        except EOFError:
            break
        if text:
            parser.parse(lexer.tokenize(text))
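If you run this program, an interactive session looks roughly like this
(an illustrative transcript, not captured output):

    calc > 3 + 4 * 5
    23
    calc > x = 10
    calc > x * (2 + 3)
    50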
Resources
=========
For a detailed overview of parsing theory, consult the excellent
book "Compilers: Principles, Techniques, and Tools" by Aho, Sethi, and
Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown
may also be useful.
The GitHub page for SLY can be found at:
https://github.com/dabeaz/sly
Please direct bug reports and pull requests to the GitHub page.
To contact me directly, send email to dave@dabeaz.com or contact
me on Twitter (@dabeaz).
-- Dave
The name says it all.

docs/sly.rst

@@ -638,35 +638,35 @@ SLY::
    # Grammar rules and actions
    @_('expr PLUS term')
    def expr(self, p):
        return p[0] + p[2]
        return p.expr + p.term

    @_('expr MINUS term')
    def expr(self, p):
        return p[0] - p[2]
        return p.expr - p.term

    @_('term')
    def expr(self, p):
        return p[0]
        return p.term

    @_('term TIMES factor')
    def term(self, p):
        return p[0] * p[2]
        return p.term * p.factor

    @_('term DIVIDE factor')
    def term(self, p):
        return p[0] / p[2]
        return p.term / p.factor

    @_('factor')
    def term(self, p):
        return p[0]
        return p.factor

    @_('NUMBER')
    def factor(self, p):
        return p[0]
        return p.NUMBER

    @_('LPAREN expr RPAREN')
    def factor(self, p):
        return p[1]
        return p.expr

if __name__ == '__main__':
    lexer = CalcLexer()
@@ -697,37 +697,45 @@ becomes a method like this::
The method is triggered when that grammar rule is recognized on the
input. As an argument, the method receives a sequence of grammar symbol
values ``p`` that is accessed as an array of symbols. The mapping between
elements of ``p`` and the grammar rule is as shown here::
values in ``p``. There are two ways to access these symbols. First, you
can use symbol names as shown::
    #   p[0] p[1] p[2]
    #    |    |    |
    @_('expr PLUS term')
    def expr(self, p):
        ...
        return p.expr + p.term
For tokens, the value of the corresponding ``p[i]`` is the *same* as
the ``p.value`` attribute assigned to tokens in the lexer module. For
non-terminals, the value is whatever was returned by the methods
defined for that rule.
Within each rule, you return a value that becomes associated with that
grammar symbol elsewhere. In the example shown, rules are carrying out
the evaluation of an arithmetic expression::
Alternatively, you can also index ``p`` like an array::
    @_('expr PLUS term')
    def expr(self, p):
        return p[0] + p[2]
For tokens, the value of the corresponding ``p.symbol`` or ``p[i]`` is
the *same* as the ``p.value`` attribute assigned to tokens in the
lexer module. For non-terminals, the value is whatever was returned
by the methods defined for that rule.
If a grammar rule includes the same symbol name more than once, you
need to append a numeric suffix to disambiguate the symbol name when
you're accessing values. For example::
    @_('expr PLUS expr')
    def expr(self, p):
        return p.expr0 + p.expr1
Finally, within each rule, you always return a value that becomes
associated with that grammar symbol elsewhere. This is how values
propagate within the grammar.
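For instance, here is a small sketch (added for illustration, not part
of the original text) showing how a value returned for ``factor``
becomes ``p.factor`` in any rule that uses that symbol::

    @_('NUMBER')
    def factor(self, p):
        return p.NUMBER            # This value is attached to 'factor'

    @_('term TIMES factor')
    def term(self, p):
        return p.term * p.factor   # Consumes the value returned above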
There are many other kinds of things that might happen in a rule
though. For example, a rule might construct part of a parse tree
instead::
    @_('expr PLUS term')
    def expr(self, p):
        return ('+', p[0], p[2])
        return ('+', p.expr, p.term)
or perhaps create an instance related to an abstract syntax tree::
or it might create an instance related to an abstract syntax tree::
    class BinOp(object):
        def __init__(self, op, left, right):
@@ -737,7 +745,7 @@ or perhaps create an instance related to an abstract syntax tree::
    @_('expr PLUS term')
    def expr(self, p):
        return BinOp('+', p[0], p[2])
        return BinOp('+', p.expr, p.term)
The key thing is that the method returns the value that's going to
be attached to the symbol "expr" in this case. This is the propagation
@@ -751,25 +759,29 @@ For example, suppose you had two rules that were constructing a parse tree::
    @_('expr PLUS term')
    def expr(self, p):
        return ('+', p[0], p[2])
        return ('+', p.expr, p.term)

    @_('expr MINUS term')
    def expr(self, p):
        return ('-', p[0], p[2])
        return ('-', p.expr, p.term)

Instead of writing two functions, you might write a single function like this::

    @_('expr PLUS term',
       'expr MINUS term')
    def expr(self, p):
        return (p[1], p[0], p[2])
        return (p[1], p.expr, p.term)
In this example, the operator could be ``PLUS`` or ``MINUS``. Thus,
we can't use the symbolic name to refer to its value. Instead, use the array
index ``p[1]`` to get it as shown.
In general, the ``@_()`` decorator for any given method can list
multiple grammar rules. When combining grammar rules into a single
function though, it is usually a good idea for all of the rules to
have a similar structure (e.g., the same number of terms). Otherwise,
the corresponding action code may end up being more complicated than
necessary.
function though, all of the rules should have a similar structure
(e.g., the same number of terms and consistent symbol names).
Otherwise, the corresponding action code may end up being more
complicated than necessary.
Character Literals
^^^^^^^^^^^^^^^^^^
@@ -779,11 +791,11 @@ literals. For example::
@_('expr "+" term')
def expr(self, p):
return p[0] + p[2]
return p.expr + p.term
@_('expr "-" term')
def expr(self, p):
return p[0] - p[2]
return p.expr - p.term
A character literal must be enclosed in quotes such as ``"+"``. In
addition, if literals are used, they must be declared in the
@@ -898,16 +910,33 @@ like this::
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
    )

    # Rules where precedence is applied
    @_('expr PLUS expr')
    def expr(self, p):
        return p.expr0 + p.expr1

    @_('expr MINUS expr')
    def expr(self, p):
        return p.expr0 - p.expr1

    @_('expr TIMES expr')
    def expr(self, p):
        return p.expr0 * p.expr1

    @_('expr DIVIDE expr')
    def expr(self, p):
        return p.expr0 / p.expr1

    ...
This declaration specifies that ``PLUS``/``MINUS`` have the
same precedence level and are left-associative and that
This ``precedence`` declaration specifies that ``PLUS``/``MINUS`` have
the same precedence level and are left-associative and that
``TIMES``/``DIVIDE`` have the same precedence and are
left-associative. Within the ``precedence`` declaration, tokens
are ordered from lowest to highest precedence. Thus, this declaration
specifies that ``TIMES``/``DIVIDE`` have higher precedence
than ``PLUS``/``MINUS`` (since they appear later in the
precedence specification).
left-associative. Within the ``precedence`` declaration, tokens are
ordered from lowest to highest precedence. Thus, this declaration
specifies that ``TIMES``/``DIVIDE`` have higher precedence than
``PLUS``/``MINUS`` (since they appear later in the precedence
specification).
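As a concrete illustration (an added example, not from the original
text), this table makes an input such as ``3 + 4 * 5`` group as
``3 + (4 * 5)`` and evaluate to ``23``, because ``TIMES`` sits on a
higher (later) precedence level than ``PLUS``.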
The precedence specification works by associating a numerical
precedence level value and associativity direction to the listed
@@ -977,7 +1006,7 @@ Now, in the grammar file, you write the unary minus rule like this::
    @_('MINUS expr %prec UMINUS')
    def expr(self, p):
        p[0] = -p[2]
        return -p.expr
In this case, ``%prec UMINUS`` overrides the default rule precedence--setting it to that
of ``UMINUS`` in the precedence specifier.
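For reference, the ``precedence`` table being referred to would declare
``UMINUS`` as a fictitious token with the highest precedence (a sketch
reconstructed for context; it is not shown in this hunk)::

    precedence = (
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
        ('right', 'UMINUS'),        # Highest precedence, unary minus
    )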
@@ -1310,15 +1339,15 @@ like this::
       'expr TIMES expr',
       'expr DIVIDE expr')
    def expr(self, p):
        return ('binary-expression', p[1], p[0], p[2])
        return ('binary-expression', p[1], p.expr0, p.expr1)

    @_('LPAREN expr RPAREN')
    def expr(self, p):
        return ('group-expression', p[1])
        return ('group-expression', p.expr)

    @_('NUMBER')
    def expr(self, p):
        return ('number-expression', p[0])
        return ('number-expression', p.NUMBER)
Another approach is to create a set of data structures for different
kinds of abstract syntax tree nodes and create different node types
@@ -1342,15 +1371,15 @@ in each rule::
       'expr TIMES expr',
       'expr DIVIDE expr')
    def expr(self, p):
        return BinOp(p[1], p[0], p[2])
        return BinOp(p[1], p.expr0, p.expr1)

    @_('LPAREN expr RPAREN')
    def expr(self, p):
        return p[1]
        return p.expr

    @_('NUMBER')
    def expr(self, p):
        return Number(p[0])
        return Number(p.NUMBER)
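The ``BinOp`` and ``Number`` node classes used in these rules are
defined outside the excerpted hunk; a minimal sketch of what they might
look like (an assumption for context, not text from this commit)::

    class BinOp(object):
        def __init__(self, op, left, right):
            self.op = op
            self.left = left
            self.right = right

    class Number(object):
        def __init__(self, value):
            self.value = value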
The advantage to this approach is that it may make it easier to attach
more complicated semantics, type checking, code generation, and other
@@ -1385,7 +1414,7 @@ at the end of a rule. For example, suppose you have a rule like this::
    @_('A B C D')
    def foo(self, p):
        print("Parsed a foo", p[0],p[1],p[2],p[3])
        print("Parsed a foo", p.A, p.B, p.C, p.D)
In this case, the supplied action code only executes after all of the
symbols ``A``, ``B``, ``C``, and ``D`` have been
@@ -1396,8 +1425,8 @@ been parsed. To do this, write an empty rule like this::
    @_('A seen_A B C D')
    def foo(self, p):
        print("Parsed a foo", p[0],p[2],p[3],p[4])
        print("seen_A returned", p[1])
        print("Parsed a foo", p.A, p.B, p.C, p.D)
        print("seen_A returned", p.seen_A)

    @_('')
    def seen_A(self, p):

example/calc/calc.py

@@ -8,14 +8,31 @@ sys.path.insert(0, "../..")
from sly import Lexer, Parser
class CalcLexer(Lexer):
    tokens = (
        'NAME', 'NUMBER',
    )
    ignore = ' \t'
    literals = ['=', '+', '-', '*', '/', '(', ')']

    # Set of token names. This is always required
    tokens = {
        'ID',
        'NUMBER',
        'PLUS',
        'MINUS',
        'TIMES',
        'DIVIDE',
        'ASSIGN',
        'LPAREN',
        'RPAREN',
    }

    # Tokens
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    # String containing ignored characters between tokens
    ignore = ' \t'

    # Regular expression rules for tokens
    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
    PLUS = r'\+'
    MINUS = r'-'
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    LPAREN = r'\('
    RPAREN = r'\)'

    @_(r'\d+')
    def NUMBER(self, t):
@@ -31,66 +48,50 @@ class CalcLexer(Lexer):
        self.index += 1

class CalcParser(Parser):
    # Get the token list from the lexer (required)
    tokens = CalcLexer.tokens

    precedence = (
        ('left', '+', '-'),
        ('left', '*', '/'),
        ('right', 'UMINUS'),
    )

    # Grammar rules and actions
    @_('expr PLUS term')
    def expr(self, p):
        return p.expr + p.term

    def __init__(self):
        self.names = { }

    @_('expr MINUS term')
    def expr(self, p):
        return p.expr - p.term

    @_('NAME "=" expression')
    def statement(self, p):
        self.names[p.NAME] = p.expression

    @_('term')
    def expr(self, p):
        return p.term

    @_('expression')
    def statement(self, p):
        print(p.expression)

    @_('term TIMES factor')
    def term(self, p):
        return p.term * p.factor

    @_('expression "+" expression',
       'expression "-" expression',
       'expression "*" expression',
       'expression "/" expression')
    def expression(self, p):
        if p[1] == '+':
            return p.expression0 + p.expression1
        elif p[1] == '-':
            return p.expression0 - p.expression1
        elif p[1] == '*':
            return p.expression0 * p.expression1
        elif p[1] == '/':
            return p.expression0 / p.expression1

    @_('term DIVIDE factor')
    def term(self, p):
        return p.term / p.factor

    @_('"-" expression %prec UMINUS')
    def expression(self, p):
        return -p.expression

    @_('"(" expression ")"')
    def expression(self, p):
        return p.expression

    @_('factor')
    def term(self, p):
        return p.factor

    @_('NUMBER')
    def expression(self, p):
    def factor(self, p):
        return p.NUMBER

    @_('NAME')
    def expression(self, p):
        try:
            return self.names[p.NAME]
        except LookupError:
            print("Undefined name '%s'" % p.NAME)
            return 0

    @_('LPAREN expr RPAREN')
    def factor(self, p):
        return p.expr

if __name__ == '__main__':
    lexer = CalcLexer()
    parser = CalcParser()

    while True:
        try:
            text = input('calc > ')
            result = parser.parse(lexer.tokenize(text))
            print(result)
        except EOFError:
            break
        if text:
            parser.parse(lexer.tokenize(text))

98
example/calc_prec/calc.py Normal file

@@ -0,0 +1,98 @@
# -----------------------------------------------------------------------------
# calc.py
# -----------------------------------------------------------------------------

import sys
sys.path.insert(0, "../..")

from sly import Lexer, Parser

class CalcLexer(Lexer):
    tokens = {
        'NAME', 'NUMBER',
    }
    ignore = ' \t'
    literals = { '=', '+', '-', '*', '/', '(', ')' }

    # Tokens
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t

    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, value):
        print("Illegal character '%s'" % value[0])
        self.index += 1

class CalcParser(Parser):
    tokens = CalcLexer.tokens

    precedence = (
        ('left', '+', '-'),
        ('left', '*', '/'),
        ('right', 'UMINUS'),
    )

    def __init__(self):
        self.names = { }

    @_('NAME "=" expr')
    def statement(self, p):
        self.names[p.NAME] = p.expr

    @_('expr')
    def statement(self, p):
        print(p.expr)

    @_('expr "+" expr')
    def expr(self, p):
        return p.expr0 + p.expr1

    @_('expr "-" expr')
    def expr(self, p):
        return p.expr0 - p.expr1

    @_('expr "*" expr')
    def expr(self, p):
        return p.expr0 * p.expr1

    @_('expr "/" expr')
    def expr(self, p):
        return p.expr0 / p.expr1

    @_('"-" expr %prec UMINUS')
    def expr(self, p):
        return -p.expr

    @_('"(" expr ")"')
    def expr(self, p):
        return p.expr

    @_('NUMBER')
    def expr(self, p):
        return p.NUMBER

    @_('NAME')
    def expr(self, p):
        try:
            return self.names[p.NAME]
        except LookupError:
            print("Undefined name '%s'" % p.NAME)
            return 0

if __name__ == '__main__':
    lexer = CalcLexer()
    parser = CalcParser()
    while True:
        try:
            text = input('calc > ')
        except EOFError:
            break
        if text:
            parser.parse(lexer.tokenize(text))
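A brief note on this example (added commentary): unlike the
expr/term/factor grammar now used in example/calc/calc.py, this version
writes every binary operation as an ambiguous ``expr OP expr`` rule and
relies entirely on the ``precedence`` table to resolve the resulting
shift/reduce conflicts.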

sly/yacc.py

@@ -126,6 +126,8 @@ class YaccProduction:
    @property
    def lineno(self):
        for tok in self._slice:
            if isinstance(tok, YaccSymbol):
                continue
            lineno = getattr(tok, 'lineno', None)
            if lineno:
                return lineno
@@ -134,6 +136,8 @@ class YaccProduction:
    @property
    def index(self):
        for tok in self._slice:
            if isinstance(tok, YaccSymbol):
                continue
            index = getattr(tok, 'index', None)
            if index:
                return index
@@ -1680,7 +1684,7 @@ class Parser(metaclass=ParserMeta):
        undefined_symbols = grammar.undefined_symbols()
        for sym, prod in undefined_symbols:
            cls.log.error('%s:%d: Symbol %r used, but not defined as a token or a rule', prod.file, prod.line, sym)
            fai = True
            fail = True

        unused_terminals = grammar.unused_terminals()
        for term in unused_terminals: