More doc updates

David Beazley 2016-09-14 14:21:33 -05:00
parent 5c3083712f
commit 0a17f78d2e
5 changed files with 423 additions and 103 deletions

192
README.md

@@ -1,4 +1,192 @@
# SLY (Sly Lex Yacc)
SLY (Sly Lex-Yacc) Version 0.0
Copyright (C) 2016
David M. Beazley (Dabeaz LLC)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of David Beazley nor Dabeaz LLC may be used to
endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Requirements
============
SLY requires the use of Python 3.5 or greater. Older versions
of Python are not supported.
Introduction
============
SLY is a 100% Python implementation of the lex and yacc tools
commonly used to write parsers and compilers. Parsing is
based on the same LALR(1) algorithm used by many yacc tools.
Here are a few notable features:
- SLY provides *very* extensive error reporting and diagnostic
information to assist in parser construction. The original
implementation was developed for instructional purposes. As
a result, the system tries to identify the most common types
of errors made by novice users.
- SLY provides full support for empty productions, error recovery,
precedence specifiers, and moderately ambiguous grammars.
- SLY uses various Python metaprogramming features to specify
lexers and parsers. There are no generated files or extra
steps involved. You simply write Python code and run it.
- SLY can be used to build parsers for "real" programming languages.
Although it is not ultra-fast due to its Python implementation,
SLY can be used to parse grammars consisting of several hundred
rules (as might be found for a language like C).
An Example
==========
SLY is probably best illustrated by an example. Here's what it
looks like to write a parser that can evaluate simple arithmetic
expressions and store variables:
# -----------------------------------------------------------------------------
# calc.py
# -----------------------------------------------------------------------------

from sly import Lexer, Parser

class CalcLexer(Lexer):
    tokens = {
        'NAME', 'NUMBER',
    }
    ignore = ' \t'
    literals = { '=', '+', '-', '*', '/', '(', ')' }

    # Tokens
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t

    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, value):
        print("Illegal character '%s'" % value[0])
        self.index += 1

class CalcParser(Parser):
    tokens = CalcLexer.tokens

    precedence = (
        ('left', '+', '-'),
        ('left', '*', '/'),
        ('right', 'UMINUS'),
    )

    def __init__(self):
        self.names = { }

    @_('NAME "=" expr')
    def statement(self, p):
        self.names[p.NAME] = p.expr

    @_('expr')
    def statement(self, p):
        print(p.expr)

    @_('expr "+" expr')
    def expr(self, p):
        return p.expr0 + p.expr1

    @_('expr "-" expr')
    def expr(self, p):
        return p.expr0 - p.expr1

    @_('expr "*" expr')
    def expr(self, p):
        return p.expr0 * p.expr1

    @_('expr "/" expr')
    def expr(self, p):
        return p.expr0 / p.expr1

    @_('"-" expr %prec UMINUS')
    def expr(self, p):
        return -p.expr

    @_('"(" expr ")"')
    def expr(self, p):
        return p.expr

    @_('NUMBER')
    def expr(self, p):
        return p.NUMBER

    @_('NAME')
    def expr(self, p):
        try:
            return self.names[p.NAME]
        except LookupError:
            print("Undefined name '%s'" % p.NAME)
            return 0

if __name__ == '__main__':
    lexer = CalcLexer()
    parser = CalcParser()
    while True:
        try:
            text = input('calc > ')
        except EOFError:
            break
        if text:
            parser.parse(lexer.tokenize(text))
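If you run this program, an interactive session looks roughly like this
(an illustrative transcript, not captured output):

    calc > 3 + 4 * 5
    23
    calc > x = 10
    calc > x * (2 + 3)
    50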
Resources
=========
For a detailed overview of parsing theory, consult the excellent
book "Compilers: Principles, Techniques, and Tools" by Aho, Sethi, and
Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown
may also be useful.
The GitHub page for SLY can be found at:
https://github.com/dabeaz/sly
Please direct bug reports and pull requests to the GitHub page.
To contact me directly, send email to dave@dabeaz.com or contact
me on Twitter (@dabeaz).
-- Dave
The name says it all.

docs/sly.rst

@@ -638,35 +638,35 @@ SLY::
    # Grammar rules and actions
    @_('expr PLUS term')
    def expr(self, p):
        return p[0] + p[2]
        return p.expr + p.term

    @_('expr MINUS term')
    def expr(self, p):
        return p[0] - p[2]
        return p.expr - p.term

    @_('term')
    def expr(self, p):
        return p[0]
        return p.term

    @_('term TIMES factor')
    def term(self, p):
        return p[0] * p[2]
        return p.term * p.factor

    @_('term DIVIDE factor')
    def term(self, p):
        return p[0] / p[2]
        return p.term / p.factor

    @_('factor')
    def term(self, p):
        return p[0]
        return p.factor

    @_('NUMBER')
    def factor(self, p):
        return p[0]
        return p.NUMBER

    @_('LPAREN expr RPAREN')
    def factor(self, p):
        return p[1]
        return p.expr

if __name__ == '__main__':
    lexer = CalcLexer()
@@ -697,37 +697,45 @@ becomes a method like this::
The method is triggered when that grammar rule is recognized on the
input. As an argument, the method receives a sequence of grammar symbol
values ``p`` that is accessed as an array of symbols. The mapping between
elements of ``p`` and the grammar rule is as shown here::
values in ``p``. There are two ways to access these symbols. First, you
can use symbol names as shown::
    #   p[0] p[1] p[2]
    #    |    |    |
    @_('expr PLUS term')
    def expr(self, p):
        ...
        return p.expr + p.term
For tokens, the value of the corresponding ``p[i]`` is the *same* as
the ``p.value`` attribute assigned to tokens in the lexer module. For
non-terminals, the value is whatever was returned by the methods
defined for that rule.
Within each rule, you return a value that becomes associated with that
grammar symbol elsewhere. In the example shown, rules are carrying out
the evaluation of an arithmetic expression::
Alternatively, you can also index ``p`` like an array::
    @_('expr PLUS term')
    def expr(self, p):
        return p[0] + p[2]
For tokens, the value of the corresponding ``p.symbol`` or ``p[i]`` is
the *same* as the ``p.value`` attribute assigned to tokens in the
lexer module. For non-terminals, the value is whatever was returned
by the methods defined for that rule.
If a grammar rule includes the same symbol name more than once, you
need to append a numeric suffix to disambiguate the symbol name when
you're accessing values. For example::
    @_('expr PLUS expr')
    def expr(self, p):
        return p.expr0 + p.expr1
Finally, within each rule, you always return a value that becomes
associated with that grammar symbol elsewhere. This is how values
propagate within the grammar.
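For instance, here is a small sketch (added for illustration, not part
of the original text) showing how a value returned for ``factor``
becomes ``p.factor`` in any rule that uses that symbol::

    @_('NUMBER')
    def factor(self, p):
        return p.NUMBER            # This value is attached to 'factor'

    @_('term TIMES factor')
    def term(self, p):
        return p.term * p.factor   # Consumes the value returned above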
There are many other kinds of things that might happen in a rule
though. For example, a rule might construct part of a parse tree
instead::
    @_('expr PLUS term')
    def expr(self, p):
        return ('+', p[0], p[2])
        return ('+', p.expr, p.term)
or perhaps create an instance related to an abstract syntax tree::
or it might create an instance related to an abstract syntax tree::
    class BinOp(object):
        def __init__(self, op, left, right):
@@ -737,7 +745,7 @@ or perhaps create an instance related to an abstract syntax tree::
    @_('expr PLUS term')
    def expr(self, p):
        return BinOp('+', p[0], p[2])
        return BinOp('+', p.expr, p.term)
The key thing is that the method returns the value that's going to
be attached to the symbol "expr" in this case. This is the propagation
@@ -751,25 +759,29 @@ For example, suppose you had two rules that were constructing a parse tree::
    @_('expr PLUS term')
    def expr(self, p):
        return ('+', p[0], p[2])
        return ('+', p.expr, p.term)

    @_('expr MINUS term')
    def expr(self, p):
        return ('-', p[0], p[2])
        return ('-', p.expr, p.term)

Instead of writing two functions, you might write a single function like this::

    @_('expr PLUS term',
       'expr MINUS term')
    def expr(self, p):
        return (p[1], p[0], p[2])
        return (p[1], p.expr, p.term)
In this example, the operator could be ``PLUS`` or ``MINUS``. Thus,
we can't use the symbolic name to refer to its value. Instead, use the array
index ``p[1]`` to get it as shown.
In general, the ``@_()`` decorator for any given method can list
multiple grammar rules. When combining grammar rules into a single
function though, it is usually a good idea for all of the rules to
have a similar structure (e.g., the same number of terms). Otherwise,
the corresponding action code may end up being more complicated than
necessary.
function though, all of the rules should have a similar structure
(e.g., the same number of terms and consistent symbol names).
Otherwise, the corresponding action code may end up being more
complicated than necessary.
Character Literals
^^^^^^^^^^^^^^^^^^
@@ -779,11 +791,11 @@ literals. For example::
@_('expr "+" term')
def expr(self, p):
return p[0] + p[2]
return p.expr + p.term
@_('expr "-" term')
def expr(self, p):
return p[0] - p[2]
return p.expr - p.term
A character literal must be enclosed in quotes such as ``"+"``. In
addition, if literals are used, they must be declared in the
@@ -898,16 +910,33 @@ like this::
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
    )

    # Rules where precedence is applied
    @_('expr PLUS expr')
    def expr(self, p):
        return p.expr0 + p.expr1

    @_('expr MINUS expr')
    def expr(self, p):
        return p.expr0 - p.expr1

    @_('expr TIMES expr')
    def expr(self, p):
        return p.expr0 * p.expr1

    @_('expr DIVIDE expr')
    def expr(self, p):
        return p.expr0 / p.expr1

    ...
This declaration specifies that ``PLUS``/``MINUS`` have the
same precedence level and are left-associative and that
This ``precedence`` declaration specifies that ``PLUS``/``MINUS`` have
the same precedence level and are left-associative and that
``TIMES``/``DIVIDE`` have the same precedence and are
left-associative. Within the ``precedence`` declaration, tokens
are ordered from lowest to highest precedence. Thus, this declaration
specifies that ``TIMES``/``DIVIDE`` have higher precedence
than ``PLUS``/``MINUS`` (since they appear later in the
precedence specification).
left-associative. Within the ``precedence`` declaration, tokens are
ordered from lowest to highest precedence. Thus, this declaration
specifies that ``TIMES``/``DIVIDE`` have higher precedence than
``PLUS``/``MINUS`` (since they appear later in the precedence
specification).
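As a concrete illustration (an added example, not from the original
text), this table makes an input such as ``3 + 4 * 5`` group as
``3 + (4 * 5)`` and evaluate to ``23``, because ``TIMES`` sits on a
higher (later) precedence level than ``PLUS``.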
The precedence specification works by associating a numerical
precedence level value and associativity direction to the listed
@@ -977,7 +1006,7 @@ Now, in the grammar file, you write the unary minus rule like this::
    @_('MINUS expr %prec UMINUS')
    def expr(self, p):
        p[0] = -p[2]
        return -p.expr
In this case, ``%prec UMINUS`` overrides the default rule precedence--setting it to that
of ``UMINUS`` in the precedence specifier.
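For reference, the ``precedence`` table being referred to would declare
``UMINUS`` as a fictitious token with the highest precedence (a sketch
reconstructed for context; it is not shown in this hunk)::

    precedence = (
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
        ('right', 'UMINUS'),        # Highest precedence, unary minus
    )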
@@ -1310,15 +1339,15 @@ like this::
       'expr TIMES expr',
       'expr DIVIDE expr')
    def expr(self, p):
        return ('binary-expression', p[1], p[0], p[2])
        return ('binary-expression', p[1], p.expr0, p.expr1)

    @_('LPAREN expr RPAREN')
    def expr(self, p):
        return ('group-expression', p[1])
        return ('group-expression', p.expr)

    @_('NUMBER')
    def expr(self, p):
        return ('number-expression', p[0])
        return ('number-expression', p.NUMBER)
Another approach is to create a set of data structures for different
kinds of abstract syntax tree nodes and create different node types
@@ -1342,15 +1371,15 @@ in each rule::
       'expr TIMES expr',
       'expr DIVIDE expr')
    def expr(self, p):
        return BinOp(p[1], p[0], p[2])
        return BinOp(p[1], p.expr0, p.expr1)

    @_('LPAREN expr RPAREN')
    def expr(self, p):
        return p[1]
        return p.expr

    @_('NUMBER')
    def expr(self, p):
        return Number(p[0])
        return Number(p.NUMBER)
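The ``BinOp`` and ``Number`` node classes used in these rules are
defined outside the excerpted hunk; a minimal sketch of what they might
look like (an assumption for context, not text from this commit)::

    class BinOp(object):
        def __init__(self, op, left, right):
            self.op = op
            self.left = left
            self.right = right

    class Number(object):
        def __init__(self, value):
            self.value = value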
The advantage to this approach is that it may make it easier to attach
more complicated semantics, type checking, code generation, and other
@@ -1385,7 +1414,7 @@ at the end of a rule. For example, suppose you have a rule like this::
    @_('A B C D')
    def foo(self, p):
        print("Parsed a foo", p[0],p[1],p[2],p[3])
        print("Parsed a foo", p.A, p.B, p.C, p.D)
In this case, the supplied action code only executes after all of the
symbols ``A``, ``B``, ``C``, and ``D`` have been
@@ -1396,8 +1425,8 @@ been parsed. To do this, write an empty rule like this::
    @_('A seen_A B C D')
    def foo(self, p):
        print("Parsed a foo", p[0],p[2],p[3],p[4])
        print("seen_A returned", p[1])
        print("Parsed a foo", p.A, p.B, p.C, p.D)
        print("seen_A returned", p.seen_A)

    @_('')
    def seen_A(self, p):

example/calc/calc.py

@@ -8,14 +8,31 @@ sys.path.insert(0, "../..")
from sly import Lexer, Parser
class CalcLexer(Lexer):
    tokens = (
        'NAME', 'NUMBER',
    )
    ignore = ' \t'
    literals = ['=', '+', '-', '*', '/', '(', ')']

    # Set of token names. This is always required
    tokens = {
        'ID',
        'NUMBER',
        'PLUS',
        'MINUS',
        'TIMES',
        'DIVIDE',
        'ASSIGN',
        'LPAREN',
        'RPAREN',
    }

    # Tokens
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    # String containing ignored characters between tokens
    ignore = ' \t'

    # Regular expression rules for tokens
    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
    PLUS = r'\+'
    MINUS = r'-'
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    LPAREN = r'\('
    RPAREN = r'\)'

    @_(r'\d+')
    def NUMBER(self, t):
@@ -31,66 +48,50 @@ class CalcLexer(Lexer):
        self.index += 1

class CalcParser(Parser):
    # Get the token list from the lexer (required)
    tokens = CalcLexer.tokens

    precedence = (
        ('left', '+', '-'),
        ('left', '*', '/'),
        ('right', 'UMINUS'),
    )

    # Grammar rules and actions
    @_('expr PLUS term')
    def expr(self, p):
        return p.expr + p.term

    def __init__(self):
        self.names = { }

    @_('expr MINUS term')
    def expr(self, p):
        return p.expr - p.term

    @_('NAME "=" expression')
    def statement(self, p):
        self.names[p.NAME] = p.expression

    @_('term')
    def expr(self, p):
        return p.term

    @_('expression')
    def statement(self, p):
        print(p.expression)

    @_('term TIMES factor')
    def term(self, p):
        return p.term * p.factor

    @_('expression "+" expression',
       'expression "-" expression',
       'expression "*" expression',
       'expression "/" expression')
    def expression(self, p):
        if p[1] == '+':
            return p.expression0 + p.expression1
        elif p[1] == '-':
            return p.expression0 - p.expression1
        elif p[1] == '*':
            return p.expression0 * p.expression1
        elif p[1] == '/':
            return p.expression0 / p.expression1

    @_('term DIVIDE factor')
    def term(self, p):
        return p.term / p.factor

    @_('"-" expression %prec UMINUS')
    def expression(self, p):
        return -p.expression

    @_('"(" expression ")"')
    def expression(self, p):
        return p.expression

    @_('factor')
    def term(self, p):
        return p.factor

    @_('NUMBER')
    def expression(self, p):
    def factor(self, p):
        return p.NUMBER

    @_('NAME')
    def expression(self, p):
        try:
            return self.names[p.NAME]
        except LookupError:
            print("Undefined name '%s'" % p.NAME)
            return 0

    @_('LPAREN expr RPAREN')
    def factor(self, p):
        return p.expr

if __name__ == '__main__':
    lexer = CalcLexer()
    parser = CalcParser()

    while True:
        try:
            text = input('calc > ')
            result = parser.parse(lexer.tokenize(text))
            print(result)
        except EOFError:
            break
        if text:
            parser.parse(lexer.tokenize(text))

98
example/calc_prec/calc.py Normal file

@@ -0,0 +1,98 @@
# -----------------------------------------------------------------------------
# calc.py
# -----------------------------------------------------------------------------

import sys
sys.path.insert(0, "../..")

from sly import Lexer, Parser

class CalcLexer(Lexer):
    tokens = {
        'NAME', 'NUMBER',
    }
    ignore = ' \t'
    literals = { '=', '+', '-', '*', '/', '(', ')' }

    # Tokens
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t

    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, value):
        print("Illegal character '%s'" % value[0])
        self.index += 1

class CalcParser(Parser):
    tokens = CalcLexer.tokens

    precedence = (
        ('left', '+', '-'),
        ('left', '*', '/'),
        ('right', 'UMINUS'),
    )

    def __init__(self):
        self.names = { }

    @_('NAME "=" expr')
    def statement(self, p):
        self.names[p.NAME] = p.expr

    @_('expr')
    def statement(self, p):
        print(p.expr)

    @_('expr "+" expr')
    def expr(self, p):
        return p.expr0 + p.expr1

    @_('expr "-" expr')
    def expr(self, p):
        return p.expr0 - p.expr1

    @_('expr "*" expr')
    def expr(self, p):
        return p.expr0 * p.expr1

    @_('expr "/" expr')
    def expr(self, p):
        return p.expr0 / p.expr1

    @_('"-" expr %prec UMINUS')
    def expr(self, p):
        return -p.expr

    @_('"(" expr ")"')
    def expr(self, p):
        return p.expr

    @_('NUMBER')
    def expr(self, p):
        return p.NUMBER

    @_('NAME')
    def expr(self, p):
        try:
            return self.names[p.NAME]
        except LookupError:
            print("Undefined name '%s'" % p.NAME)
            return 0

if __name__ == '__main__':
    lexer = CalcLexer()
    parser = CalcParser()
    while True:
        try:
            text = input('calc > ')
        except EOFError:
            break
        if text:
            parser.parse(lexer.tokenize(text))
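A brief note on this example (added commentary): unlike the
expr/term/factor grammar now used in example/calc/calc.py, this version
writes every binary operation as an ambiguous ``expr OP expr`` rule and
relies entirely on the ``precedence`` table to resolve the resulting
shift/reduce conflicts.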

sly/yacc.py

@@ -126,6 +126,8 @@ class YaccProduction:
    @property
    def lineno(self):
        for tok in self._slice:
            if isinstance(tok, YaccSymbol):
                continue
            lineno = getattr(tok, 'lineno', None)
            if lineno:
                return lineno
@@ -134,6 +136,8 @@ class YaccProduction:
    @property
    def index(self):
        for tok in self._slice:
            if isinstance(tok, YaccSymbol):
                continue
            index = getattr(tok, 'index', None)
            if index:
                return index
@@ -1680,7 +1684,7 @@ class Parser(metaclass=ParserMeta):
        undefined_symbols = grammar.undefined_symbols()
        for sym, prod in undefined_symbols:
            cls.log.error('%s:%d: Symbol %r used, but not defined as a token or a rule', prod.file, prod.line, sym)
            fai = True
            fail = True

        unused_terminals = grammar.unused_terminals()
        for term in unused_terminals: