diff --git a/README.md b/README.md index f5c4167..d449860 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,192 @@ -# SLY (Sly Lex Yacc) +SLY (Sly Lex-Yacc) Version 0.0 + +Copyright (C) 2016 +David M. Beazley (Dabeaz LLC) +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of the David Beazley or Dabeaz LLC may be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Requirements +============ + +SLY requires the use of Python 3.5 or greater. Older versions +of Python are not supported. + +Introduction +============ + +SLY is a 100% Python implementation of the lex and yacc tools +commonly used to write parsers and compilers. 
Parsing is +based on the same LALR(1) algorithm used by many yacc tools. +Here are a few notable features: + + - SLY provides *very* extensive error reporting and diagnostic + information to assist in parser construction. The original + implementation was developed for instructional purposes. As + a result, the system tries to identify the most common types + of errors made by novice users. + + - SLY provides full support for empty productions, error recovery, + precedence specifiers, and moderately ambiguous grammars. + + - SLY uses various Python metaprogramming features to specify + lexers and parsers. There are no generated files or extra + steps involved. You simply write Python code and run it. + + - SLY can be used to build parsers for "real" programming languages. + Although it is not ultra-fast due to its Python implementation, + SLY can be used to parse grammars consisting of several hundred + rules (as might be found for a language like C). + +An Example +========== + +SLY is probably best illustrated by an example. 
Here's what it +looks like to write a parser that can evaluate simple arithmetic +expressions and store variables: + + # ----------------------------------------------------------------------------- + # calc.py + # ----------------------------------------------------------------------------- + + from sly import Lexer, Parser + + class CalcLexer(Lexer): + tokens = { + 'NAME', 'NUMBER', + } + ignore = ' \t' + literals = { '=', '+', '-', '*', '/', '(', ')' } + + # Tokens + NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + @_(r'\d+') + def NUMBER(self, t): + t.value = int(t.value) + return t + + @_(r'\n+') + def newline(self, t): + self.lineno += t.value.count('\n') + + def error(self, value): + print("Illegal character '%s'" % value[0]) + self.index += 1 + + class CalcParser(Parser): + tokens = CalcLexer.tokens + + precedence = ( + ('left', '+', '-'), + ('left', '*', '/'), + ('right', 'UMINUS'), + ) + + def __init__(self): + self.names = { } + + @_('NAME "=" expr') + def statement(self, p): + self.names[p.NAME] = p.expr + + @_('expr') + def statement(self, p): + print(p.expr) + + @_('expr "+" expr') + def expr(self, p): + return p.expr0 + p.expr1 + + @_('expr "-" expr') + def expr(self, p): + return p.expr0 - p.expr1 + + @_('expr "*" expr') + def expr(self, p): + return p.expr0 * p.expr1 + + @_('expr "/" expr') + def expr(self, p): + return p.expr0 / p.expr1 + + @_('"-" expr %prec UMINUS') + def expr(self, p): + return -p.expr + + @_('"(" expr ")"') + def expr(self, p): + return p.expr + + @_('NUMBER') + def expr(self, p): + return p.NUMBER + + @_('NAME') + def expr(self, p): + try: + return self.names[p.NAME] + except LookupError: + print("Undefined name '%s'" % p.NAME) + return 0 + + if __name__ == '__main__': + lexer = CalcLexer() + parser = CalcParser() + while True: + try: + text = input('calc > ') + except EOFError: + break + if text: + parser.parse(lexer.tokenize(text)) + +Resources +========= + +For a detailed overview of parsing theory, consult the excellent +book 
"Compilers : Principles, Techniques, and Tools" by Aho, Sethi, and +Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown +may also be useful. + +The GitHub page for SLY can be found at: + + https://github.com/dabeaz/sly + +Please direct bug reports and pull requests to the GitHub page. +To contact me directly, send email to dave@dabeaz.com or contact +me on Twitter (@dabeaz). + +-- Dave + + + + + + + -The name says it all. diff --git a/docs/sly.rst b/docs/sly.rst index 4ca544a..a1bf5d5 100644 --- a/docs/sly.rst +++ b/docs/sly.rst @@ -638,35 +638,35 @@ SLY:: # Grammar rules and actions @_('expr PLUS term') def expr(self, p): - return p[0] + p[2] + return p.expr + p.term @_('expr MINUS term') def expr(self, p): - return p[0] - p[2] + return p.expr - p.term @_('term') def expr(self, p): - return p[0] + return p.term @_('term TIMES factor') def term(self, p): - return p[0] * p[2] + return p.term * p.factor @_('term DIVIDE factor') def term(self, p): - return p[0] / p[2] + return p.term / p.factor @_('factor') def term(self, p): - return p[0] + return p.factor @_('NUMBER') def factor(self, p): - return p[0] + return p.NUMBER @_('LPAREN expr RPAREN') def factor(self, p): - return p[1] + return p.expr if __name__ == '__main__': lexer = CalcLexer() @@ -697,37 +697,45 @@ becomes a method like this:: The method is triggered when that grammar rule is recognized on the input. As an argument, the method receives a sequence of grammar symbol -values ``p`` that is accessed as an array of symbols. The mapping between -elements of ``p`` and the grammar rule is as shown here:: +values in ``p``. There are two ways to access these symbols. First, you +can use symbol names as shown:: - # p[0] p[1] p[2] - # | | | @_('expr PLUS term') def expr(self, p): - ... + return p.expr + p.term -For tokens, the value of the corresponding ``p[i]`` is the *same* as -the ``p.value`` attribute assigned to tokens in the lexer module. 
For -non-terminals, the value is whatever was returned by the methods -defined for that rule. - -Within each rule, you return a value that becomes associated with that -grammar symbol elsewhere. In the example shown, rules are carrying out -the evaluation of an arithmetic expression:: +Alternatively, you can also index ``p`` like an array:: @_('expr PLUS term') def expr(self, p): return p[0] + p[2] +For tokens, the value of the corresponding ``p.symbol`` or ``p[i]`` is +the *same* as the ``p.value`` attribute assigned to tokens in the +lexer module. For non-terminals, the value is whatever was returned +by the methods defined for that rule. + +If a grammar rule includes the same symbol name more than once, you +need to append a numeric suffix to disambiguate the symbol name when +you're accessing values. For example:: + + @_('expr PLUS expr') + def expr(self, p): + return p.expr0 + p.expr1 + +Finally, within each rule, you always return a value that becomes +associated with that grammar symbol elsewhere. This is how values +propagate within the grammar. + There are many other kinds of things that might happen in a rule though. For example, a rule might construct part of a parse tree instead:: @_('expr PLUS term') def expr(self, p): - return ('+', p[0], p[2]) + return ('+', p.expr, p.term) -or perhaps create an instance related to an abstract syntax tree:: +or it might create an instance related to an abstract syntax tree:: class BinOp(object): def __init__(self, op, left, right): @@ -737,7 +745,7 @@ or perhaps create an instance related to an abstract syntax tree:: @_('expr PLUS term') def expr(self, p): - return BinOp('+', p[0], p[2]) + return BinOp('+', p.expr, p.term) The key thing is that the method returns the value that's going to be attached to the symbol "expr" in this case. 
This is the propagation @@ -751,25 +759,29 @@ For example, suppose you had two rules that were constructing a parse tree:: @_('expr PLUS term') def expr(self, p): - return ('+', p[0], p[2]) + return ('+', p.expr, p.term) @_('expr MINUS term') def expr(self, p): - return ('-', p[0], p[2]) + return ('-', p.expr, p.term) Instead of writing two functions, you might write a single function like this:: @_('expr PLUS term', 'expr MINUS term') def expr(self, p): - return (p[1], p[0], p[2]) + return (p[1], p.expr, p.term) + +In this example, the operator could be ``PLUS`` or ``MINUS``. Thus, +we can't use the symbolic name to refer to its value. Instead, use the array +index ``p[1]`` to get it as shown. In general, the ``@_()`` decorator for any given method can list multiple grammar rules. When combining grammar rules into a single -function though, it is usually a good idea for all of the rules to -have a similar structure (e.g., the same number of terms). Otherwise, -the corresponding action code may end up being more complicated than -necessary. +function though, all of the rules should have a similar structure +(e.g., the same number of terms and consistent symbol names). +Otherwise, the corresponding action code may end up being more +complicated than necessary. Character Literals ^^^^^^^^^^^^^^^^^^ @@ -779,11 +791,11 @@ literals. For example:: @_('expr "+" term') def expr(self, p): - return p[0] + p[2] + return p.expr + p.term @_('expr "-" term') def expr(self, p): - return p[0] - p[2] + return p.expr - p.term A character literal must be enclosed in quotes such as ``"+"``. 
In addition, if literals are used, they must be declared in the @@ -898,16 +910,33 @@ like this:: ('left', 'PLUS', 'MINUS'), ('left', 'TIMES', 'DIVIDE'), ) + + # Rules where precedence is applied + @_('expr PLUS expr') + def expr(self, p): + return p.expr0 + p.expr1 + + @_('expr MINUS expr') + def expr(self, p): + return p.expr0 - p.expr1 + + @_('expr TIMES expr') + def expr(self, p): + return p.expr0 * p.expr1 + + @_('expr DIVIDE expr') + def expr(self, p): + return p.expr0 / p.expr1 ... -This declaration specifies that ``PLUS``/``MINUS`` have the -same precedence level and are left-associative and that +This ``precedence`` declaration specifies that ``PLUS``/``MINUS`` have +the same precedence level and are left-associative and that ``TIMES``/``DIVIDE`` have the same precedence and are -left-associative. Within the ``precedence`` declaration, tokens -are ordered from lowest to highest precedence. Thus, this declaration -specifies that ``TIMES``/``DIVIDE`` have higher precedence -than ``PLUS``/``MINUS`` (since they appear later in the -precedence specification). +left-associative. Within the ``precedence`` declaration, tokens are +ordered from lowest to highest precedence. Thus, this declaration +specifies that ``TIMES``/``DIVIDE`` have higher precedence than +``PLUS``/``MINUS`` (since they appear later in the precedence +specification). The precedence specification works by associating a numerical precedence level value and associativity direction to the listed @@ -977,7 +1006,7 @@ Now, in the grammar file, you write the unary minus rule like this:: @_('MINUS expr %prec UMINUS') def expr(p): - p[0] = -p[2] + return -p.expr In this case, ``%prec UMINUS`` overrides the default rule precedence--setting it to that of ``UMINUS`` in the precedence specifier. 
@@ -1310,15 +1339,15 @@ like this:: 'expr TIMES expr', 'expr DIVIDE expr') def expr(self, p): - return ('binary-expression', p[1], p[0], p[2]) + return ('binary-expression', p[1], p.expr0, p.expr1) @_('LPAREN expr RPAREN') def expr(self, p): - return ('group-expression',p[1]) + return ('group-expression',p.expr) @_('NUMBER') def expr(self, p): - return ('number-expression', p[0]) + return ('number-expression', p.NUMBER) Another approach is to create a set of data structure for different kinds of abstract syntax tree nodes and create different node types @@ -1342,15 +1371,15 @@ in each rule:: 'expr TIMES expr', 'expr DIVIDE expr') def expr(self, p): - return BinOp(p[1], p[0], p[2]) + return BinOp(p[1], p.expr0, p.expr1) @_('LPAREN expr RPAREN') def expr(self, p): - return p[1] + return p.expr @_('NUMBER') def expr(self, p): - return Number(p[0]) + return Number(p.NUMBER) The advantage to this approach is that it may make it easier to attach more complicated semantics, type checking, code generation, and other @@ -1385,7 +1414,7 @@ at the end of a rule. For example, suppose you have a rule like this:: @_('A B C D') def foo(self, p): - print("Parsed a foo", p[0],p[1],p[2],p[3]) + print("Parsed a foo", p.A, p.B, p.C, p.D) In this case, the supplied action code only executes after all of the symbols ``A``, ``B``, ``C``, and ``D`` have been @@ -1396,8 +1425,8 @@ been parsed. 
To do this, write an empty rule like this:: @_('A seen_A B C D') def foo(self, p): - print("Parsed a foo", p[0],p[2],p[3],p[4]) - print("seen_A returned", p[1]) + print("Parsed a foo", p.A, p.B, p.C, p.D) + print("seen_A returned", p.seen_A) @_('') def seen_A(self, p): diff --git a/example/calc/calc.py b/example/calc/calc.py index d8bd7e0..6a47ca9 100644 --- a/example/calc/calc.py +++ b/example/calc/calc.py @@ -8,14 +8,31 @@ sys.path.insert(0, "../..") from sly import Lexer, Parser class CalcLexer(Lexer): - tokens = ( - 'NAME', 'NUMBER', - ) - ignore = ' \t' - literals = ['=', '+', '-', '*', '/', '(', ')'] + # Set of token names. This is always required + tokens = { + 'ID', + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'ASSIGN', + 'LPAREN', + 'RPAREN', + } - # Tokens - NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + # String containing ignored characters between tokens + ignore = ' \t' + + # Regular expression rules for tokens + ID = r'[a-zA-Z_][a-zA-Z0-9_]*' + PLUS = r'\+' + MINUS = r'-' + TIMES = r'\*' + DIVIDE = r'/' + ASSIGN = r'=' + LPAREN = r'\(' + RPAREN = r'\)' @_(r'\d+') def NUMBER(self, t): @@ -31,66 +48,50 @@ class CalcLexer(Lexer): self.index += 1 class CalcParser(Parser): + # Get the token list from the lexer (required) tokens = CalcLexer.tokens - - precedence = ( - ('left', '+', '-'), - ('left', '*', '/'), - ('right', 'UMINUS'), - ) - def __init__(self): - self.names = { } + # Grammar rules and actions + @_('expr PLUS term') + def expr(self, p): + return p.expr + p.term - @_('NAME "=" expression') - def statement(self, p): - self.names[p.NAME] = p.expression + @_('expr MINUS term') + def expr(self, p): + return p.expr - p.term - @_('expression') - def statement(self, p): - print(p.expression) + @_('term') + def expr(self, p): + return p.term - @_('expression "+" expression', - 'expression "-" expression', - 'expression "*" expression', - 'expression "/" expression') - def expression(self, p): - if p[1] == '+': - return p.expression0 + p.expression1 - 
elif p[1] == '-': - return p.expression0 - p.expression1 - elif p[1] == '*': - return p.expression0 * p.expression1 - elif p[1] == '/': - return p.expression0 / p.expression1 + @_('term TIMES factor') + def term(self, p): + return p.term * p.factor - @_('"-" expression %prec UMINUS') - def expression(self, p): - return -p.expression + @_('term DIVIDE factor') + def term(self, p): + return p.term / p.factor - @_('"(" expression ")"') - def expression(self, p): - return p.expression + @_('factor') + def term(self, p): + return p.factor @_('NUMBER') - def expression(self, p): + def factor(self, p): return p.NUMBER - @_('NAME') - def expression(self, p): - try: - return self.names[p.NAME] - except LookupError: - print("Undefined name '%s'" % p.NAME) - return 0 + @_('LPAREN expr RPAREN') + def factor(self, p): + return p.expr if __name__ == '__main__': lexer = CalcLexer() parser = CalcParser() + while True: try: text = input('calc > ') + result = parser.parse(lexer.tokenize(text)) + print(result) except EOFError: break - if text: - parser.parse(lexer.tokenize(text)) diff --git a/example/calc_prec/calc.py b/example/calc_prec/calc.py new file mode 100644 index 0000000..24b51fc --- /dev/null +++ b/example/calc_prec/calc.py @@ -0,0 +1,98 @@ +# ----------------------------------------------------------------------------- +# calc.py +# ----------------------------------------------------------------------------- + +import sys +sys.path.insert(0, "../..") + +from sly import Lexer, Parser + +class CalcLexer(Lexer): + tokens = { + 'NAME', 'NUMBER', + } + ignore = ' \t' + literals = { '=', '+', '-', '*', '/', '(', ')' } + + # Tokens + NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + @_(r'\d+') + def NUMBER(self, t): + t.value = int(t.value) + return t + + @_(r'\n+') + def newline(self, t): + self.lineno += t.value.count('\n') + + def error(self, value): + print("Illegal character '%s'" % value[0]) + self.index += 1 + +class CalcParser(Parser): + tokens = CalcLexer.tokens + + precedence = ( 
+ ('left', '+', '-'), + ('left', '*', '/'), + ('right', 'UMINUS'), + ) + + def __init__(self): + self.names = { } + + @_('NAME "=" expr') + def statement(self, p): + self.names[p.NAME] = p.expr + + @_('expr') + def statement(self, p): + print(p.expr) + + @_('expr "+" expr') + def expr(self, p): + return p.expr0 + p.expr1 + + @_('expr "-" expr') + def expr(self, p): + return p.expr0 - p.expr1 + + @_('expr "*" expr') + def expr(self, p): + return p.expr0 * p.expr1 + + @_('expr "/" expr') + def expr(self, p): + return p.expr0 / p.expr1 + + @_('"-" expr %prec UMINUS') + def expr(self, p): + return -p.expr + + @_('"(" expr ")"') + def expr(self, p): + return p.expr + + @_('NUMBER') + def expr(self, p): + return p.NUMBER + + @_('NAME') + def expr(self, p): + try: + return self.names[p.NAME] + except LookupError: + print("Undefined name '%s'" % p.NAME) + return 0 + +if __name__ == '__main__': + lexer = CalcLexer() + parser = CalcParser() + while True: + try: + text = input('calc > ') + except EOFError: + break + if text: + parser.parse(lexer.tokenize(text)) diff --git a/sly/yacc.py b/sly/yacc.py index f2a303e..6f9e894 100644 --- a/sly/yacc.py +++ b/sly/yacc.py @@ -126,6 +126,8 @@ class YaccProduction: @property def lineno(self): for tok in self._slice: + if isinstance(tok, YaccSymbol): + continue lineno = getattr(tok, 'lineno', None) if lineno: return lineno @@ -134,6 +136,8 @@ class YaccProduction: @property def index(self): for tok in self._slice: + if isinstance(tok, YaccSymbol): + continue index = getattr(tok, 'index', None) if index: return index @@ -1680,7 +1684,7 @@ class Parser(metaclass=ParserMeta): undefined_symbols = grammar.undefined_symbols() for sym, prod in undefined_symbols: cls.log.error('%s:%d: Symbol %r used, but not defined as a token or a rule', prod.file, prod.line, sym) - fai = True + fail = True unused_terminals = grammar.unused_terminals() for term in unused_terminals: