diff --git a/docs/sly.rst b/docs/sly.rst index 8d56f41..aa72daa 100644 --- a/docs/sly.rst +++ b/docs/sly.rst @@ -403,7 +403,11 @@ Writing a Parser The ``Parser`` class is used to parse language syntax. Before showing an example, there are a few important bits of background that must be -mentioned. First, *syntax* is usually specified in terms of a BNF +mentioned. + +Parsing Background +^^^^^^^^^^^^^^^^^^ +When writing a parser, *syntax* is usually specified in terms of a BNF grammar. For example, if you wanted to parse simple arithmetic expressions, you might first write an unambiguous grammar specification like this:: @@ -427,7 +431,7 @@ identifiers are known as *non-terminals*. The semantic behavior of a language is often specified using a technique known as syntax directed translation. In syntax directed -translation, attributes are attached to each symbol in a given grammar +translation, values are attached to each symbol in a given grammar rule along with an action. Whenever a particular grammar rule is recognized, the action describes what to do. For example, given the expression grammar above, you might write the specification for a @@ -456,17 +460,17 @@ values. SLY uses a parsing technique known as LR-parsing or shift-reduce parsing. LR parsing is a bottom up technique that tries to recognize the right-hand-side of various grammar rules. Whenever a valid -right-hand-side is found in the input, the appropriate action code is -triggered and the grammar symbols are replaced by the grammar symbol -on the left-hand-side. +right-hand-side is found in the input, the appropriate action method +is triggered and the grammar symbols on right hand side are replaced +by the grammar symbol on the left-hand-side. LR parsing is commonly implemented by shifting grammar symbols onto a stack and looking at the stack and the next input token for patterns that match one of the grammar rules. The details of the algorithm can be found in a compiler textbook, but the following example illustrates -the steps that are performed if you wanted to parse the expression ``3 -+ 5 * (10 - 20)`` using the grammar defined above. In the example, -the special symbol ``$`` represents the end of input:: +the steps that are performed when parsing the expression ``3 + 5 * (10 +- 20)`` using the grammar defined above. In the example, the special +symbol ``$`` represents the end of input:: Step Symbol Stack Input Tokens Action ---- --------------------- --------------------- ------------------------------- @@ -519,9 +523,9 @@ rule ``expr : expr + term``. Parsing Example ^^^^^^^^^^^^^^^ -Suppose you wanted to make a grammar for simple arithmetic expressions as previously described. -Here is how you would do it with SLY:: - +Suppose you wanted to make a grammar for evaluating simple arithmetic +expressions as previously described. 
Here is how you would do it with +SLY:: from sly import Parser from calclex import CalcLexer @@ -533,35 +537,35 @@ Here is how you would do it with SLY:: # Grammar rules and actions @_('expr PLUS term') def expr(self, p): - p[0] = p[1] + p[3] + return p[0] + p[2] @_('expr MINUS term') def expr(self, p): - p[0] = p[1] - p[3] + return p[0] - p[2] @_('term') def expr(self, p): - p[0] = p[1] + return p[0] @_('term TIMES factor') def term(self, p): - p[0] = p[1] * p[3] + return p[0] * p[2] @_('term DIVIDE factor') def term(self, p): - p[0] = p[1] / p[3] + return p[0] / p[2] @_('factor') def term(self, p): - p[0] = p[1] + return p[0] @_('NUMBER') def factor(self, p): - p[0] = p[1] + return p[0] @_('LPAREN expr RPAREN') def factor(self, p): - p[0] = p[2] + return p[1] # Error rule for syntax errors def error(self, p): @@ -579,110 +583,98 @@ Here is how you would do it with SLY:: except EOFError: break -In this example, each grammar rule is defined by a Python function -where the docstring to that function contains the appropriate -context-free grammar specification. The statements that make up the -function body implement the semantic actions of the rule. Each function -accepts a single argument ``p`` that is a sequence containing the -values of each grammar symbol in the corresponding rule. The values -of ``p[i]`` are mapped to grammar symbols as shown here:: +In this example, each grammar rule is defined by a method that's been +decorated by ``@_(rule)`` decorator. The very first grammar rule +defines the top of the parse. The name of each method should match +the name of the grammar rule being parsed. The argument to the +``@_()`` decorator is a string describing the right-hand-side of the +grammar. Thus, a grammar rule like this:: - # p[1] p[2] p[3] + expr : expr PLUS term + +becomes a method like this:: + + @_('expr PLUS term') + def expr(self, p): + ... + +The method is triggered when that grammar rule is recognized on the +input. As an argument, the method receives a sequence of grammar symbol +values ``p`` that is accessed as an array. The mapping between +elements of ``p`` and the grammar rule is as shown here:: + + # p[0] p[1] p[2] # | | | @_('expr PLUS term') def expr(self, p): - p[0] = p[1] + p[3] + ... -For tokens, the "value" of the corresponding ``p[i]`` is the -*same* as the ``p.value`` attribute assigned in the lexer -module. For non-terminals, the value is determined by whatever is -placed in ``p[0]`` when rules are reduced. This value can be -anything at all. However, it probably most common for the value to be -a simple Python type, a tuple, or an instance. In this example, we -are relying on the fact that the ``NUMBER`` token stores an -integer value in its value field. All of the other rules simply -perform various types of integer operations and propagate the result. +For tokens, the value of the corresponding ``p[i]`` is the *same* as +the ``p.value`` attribute assigned to tokens in the lexer module. For +non-terminals, the value is whatever was returned by the methods +defined for that rule. -Note: The use of negative indices have a special meaning in -yacc---specially ``p[-1]`` does not have the same value -as ``p[3]`` in this example. Please see the section on "Embedded -Actions" for further details. +Within each rule, you return a value that becomes associated with that +grammar symbol elsewhere. 
In the example shown, rules are carrying out +the evaluation of an arithmetic expression:: -The first rule defined in the yacc specification determines the -starting grammar symbol (in this case, a rule for ``expr`` -appears first). Whenever the starting rule is reduced by the parser -and no more input is available, parsing stops and the final value is -returned (this value will be whatever the top-most rule placed -in ``p[0]``). Note: an alternative starting symbol can be -specified using the ``start`` attribute in the class. + @_('expr PLUS term') + def expr(self, p): + return p[0] + p[2] -The ``error()`` method is defined to catch syntax errors. +There are many other kinds of things that might happen in a rule +though. For example, a rule might construct part of a parse tree +instead:: + + @_('expr PLUS term') + def expr(self, p): + return ('+', p[0], p[2]) + +or perhaps create an instance related to an abstract syntax tree:: + + class BinOp(object): + def __init__(self, op, left, right): + self.op = op + self.left = left + self.right = right + + @_('expr PLUS term') + def expr(self, p): + return BinOp('+', p[0], p[2]) + +The key thing is that the method returns the value that's going to +be attached to the symbol "expr" in this case. + +The ``error()`` method is defined to handle syntax errors (if any). See the error handling section below for more detail. -If any errors are detected in your grammar specification, SLY will -produce diagnostic messages and possibly raise an exception. Some of -the errors that can be detected include: - -- Duplicated grammar rules -- Shift/reduce and reduce/reduce conflicts generated by ambiguous grammars. -- Badly specified grammar rules. -- Infinite recursion (rules that can never terminate). -- Unused rules and tokens -- Undefined rules and tokens - -The final part of the example shows how to actually run the parser. -To run the parser, you simply have to call the ``parse()`` method with -a sequence of the input tokens. This will run all of the grammar -rules and return the result of the entire parse. This result return -is the value assigned to ``p[0]`` in the starting grammar rule. - Combining Grammar Rule Functions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When grammar rules are similar, they can be combined into a single function. -For example, consider the two rules in our earlier example:: +When grammar rules are similar, they can be combined into a single method. +For example, suppose you had two rules that were constructing a parse tree:: @_('expr PLUS term') def expr(self, p): - p[0] = p[1] + p[3] + return ('+', p[0], p[2]) @_('expr MINUS term') def expr(self, p): - p[0] = p[1] - p[3] + return ('-', p[0], p[2]) -Instead of writing two functions, you might write a single function like this: +Instead of writing two functions, you might write a single function like this:: @_('expr PLUS term', 'expr MINUS term') def expr(self, p): - if p[2] == '+': - p[0] = p[1] + p[3] - elif p[2] == '-': - p[0] = p[1] - p[3] + return (p[1], p[0], p[2]) In general, the ``@_()`` decorator for any given method can list multiple grammar rules. When combining grammar rules into a single -function though, it is usually a good idea for all of the rules to have a -similar structure (e.g., the same number of terms). Otherwise, the -corresponding action code may be more complicated than necessary. -However, it is possible to handle simple cases using len(). 
For -example: - - @_('expr MINUS expr', - 'MINUS expression') - def expr(self, p): - if (len(p) == 4): - p[0] = p[1] - p[3] - elif (len(p) == 3): - p[0] = -p[2] - -If parsing performance is a concern, you should resist the urge to put -too much conditional processing into a single grammar rule as shown in -these examples. When you add checks to see which grammar rule is -being handled, you are actually duplicating the work that the parser -has already performed (i.e., the parser already knows exactly what rule it -matched). You can eliminate this overhead by using a -separate method for each grammar rule. +function though, it is usually a good idea for all of the rules to +have a similar structure (e.g., the same number of terms). Otherwise, +the corresponding action code may end up being more complicated than +necessary. Character Literals ^^^^^^^^^^^^^^^^^^ @@ -692,14 +684,16 @@ literals. For example:: @_('expr "+" term') def expr(self, p): - p[0] = p[1] + p[3] + return p[0] + p[2] @_('expr "-" term') def expr(self, p): - p[0] = p[1] - p[3] + return p[0] - p[2] -A character literal must be enclosed in quotes such as ``"+"``. In addition, if literals are used, they must be declared in the -corresponding lexer class through the use of a special ``literals`` declaration:: +A character literal must be enclosed in quotes such as ``"+"``. In +addition, if literals are used, they must be declared in the +corresponding lexer class through the use of a special ``literals`` +declaration:: class CalcLexer(Lexer): ... @@ -707,9 +701,8 @@ corresponding lexer class through the use of a special ``literals`` declaration: ... Character literals are limited to a single character. Thus, it is not -legal to specify literals such as ``<=`` or ``==``. -For this, use the normal lexing rules (e.g., define a rule such as -``EQ = r'=='``). +legal to specify literals such as ``<=`` or ``==``. For this, use the +normal lexing rules (e.g., define a rule such as ``EQ = r'=='``). Empty Productions ^^^^^^^^^^^^^^^^^ @@ -720,7 +713,7 @@ If you need an empty production, define a special rule like this:: def empty(self, p): pass -Now to use the empty production, simply use 'empty' as a symbol. For +Now to use the empty production elsewhere, use the name 'empty' as a symbol. For example:: @_('item') @@ -732,16 +725,16 @@ example:: ... Note: You can write empty rules anywhere by simply specifying an empty -string. However, I personally find that writing an "empty" -rule and using "empty" to denote an empty production is easier to read -and more clearly states your intentions. +string. However,writing an "empty" rule and using "empty" to denote an +empty production may be easier to read and more clearly state your +intention. Changing the starting symbol ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Normally, the first rule found in a yacc specification defines the -starting grammar rule (top level rule). To change this, supply -a ``start`` specifier in your file. For example:: +Normally, the first rule found in a parser class defines the starting +grammar rule (top level rule). To change this, supply a ``start`` +specifier in your class. For example:: class CalcParser(Parser): start = 'foo' @@ -766,12 +759,12 @@ situations, it is extremely difficult or awkward to write grammars in this format. 
A much more natural way to express the grammar is in a more compact form like this:: -expr : expr PLUS expr - | expr MINUS expr - | expr TIMES expr - | expr DIVIDE expr - | LPAREN expr RPAREN - | NUMBER + expr : expr PLUS expr + | expr MINUS expr + | expr TIMES expr + | expr DIVIDE expr + | LPAREN expr RPAREN + | NUMBER Unfortunately, this grammar specification is ambiguous. For example, if you are parsing the string "3 * 4 + 5", there is no way to tell how @@ -801,17 +794,17 @@ perfectly legal from the rules of the context-free-grammar. By default, all shift/reduce conflicts are resolved in favor of shifting. Therefore, in the above example, the parser will always -shift the ``+`` instead of reducing. Although this strategy -works in many cases (for example, the case of -"if-then" versus "if-then-else"), it is not enough for arithmetic expressions. In fact, -in the above example, the decision to shift ``+`` is completely -wrong---we should have reduced ``expr * expr`` since -multiplication has higher mathematical precedence than addition. +shift the ``+`` instead of reducing. Although this strategy works in +many cases (for example, the case of "if-then" versus "if-then-else"), +it is not enough for arithmetic expressions. In fact, in the above +example, the decision to shift ``+`` is completely wrong---we should +have reduced ``expr * expr`` since multiplication has higher +mathematical precedence than addition. -To resolve ambiguity, especially in expression -grammars, SLY allows individual tokens to be assigned a -precedence level and associativity. This is done by adding a variable -``precedence`` to the grammar file like this:: +To resolve ambiguity, especially in expression grammars, SLY allows +individual tokens to be assigned a precedence level and associativity. +This is done by adding a variable ``precedence`` to the parser class +like this:: class CalcParser(Parser): ... @@ -865,12 +858,12 @@ rule is reduced for left associativity, whereas the token is shifted for right a 4. If nothing is known about the precedence, shift/reduce conflicts are resolved in favor of shifting (the default). -For example, if "expr PLUS expr" has been parsed and the -next token is "TIMES", the action is going to be a shift because -"TIMES" has a higher precedence level than "PLUS". On the other hand, -if "expr TIMES expr" has been parsed and the next token is -"PLUS", the action is going to be reduce because "PLUS" has a lower -precedence than "TIMES." +For example, if ``expr PLUS expr`` has been parsed and the +next token is ``TIMES``, the action is going to be a shift because +``TIMES`` has a higher precedence level than ``PLUS``. On the other hand, +if ``expr TIMES expr`` has been parsed and the next token is +``PLUS``, the action is going to be reduce because ``PLUS`` has a lower +precedence than ``TIMES.`` When shift/reduce conflicts are resolved using the first three techniques (with the help of precedence rules), SLY will @@ -878,10 +871,10 @@ report no errors or conflicts in the grammar. One problem with the precedence specifier technique is that it is sometimes necessary to change the precedence of an operator in certain -contexts. For example, consider a unary-minus operator in "3 + 4 * --5". Mathematically, the unary minus is normally given a very high +contexts. For example, consider a unary-minus operator in ``3 + 4 * +-5``. Mathematically, the unary minus is normally given a very high precedence--being evaluated before the multiply. 
However, in our -precedence specifier, MINUS has a lower precedence than TIMES. To +precedence specifier, ``MINUS`` has a lower precedence than ``TIMES``. To deal with this, precedence rules can be given for so-called "fictitious tokens" like this:: @@ -893,19 +886,19 @@ like this:: ('right', 'UMINUS'), # Unary minus operator ) -Now, in the grammar file, we can write our unary minus rule like this:: +Now, in the grammar file, you write the unary minus rule like this:: @_('MINUS expr %prec UMINUS') def expr(p): p[0] = -p[2] In this case, ``%prec UMINUS`` overrides the default rule precedence--setting it to that -of UMINUS in the precedence specifier. +of ``UMINUS`` in the precedence specifier. -At first, the use of UMINUS in this example may appear very confusing. -UMINUS is not an input token or a grammar rule. Instead, you should +At first, the use of ``UMINUS`` in this example may appear very confusing. +``UMINUS`` is not an input token or a grammar rule. Instead, you should think of it as the name of a special marker in the precedence table. -When you use the ``%prec`` qualifier, you're simply telling SLY +When you use the ``%prec`` qualifier, you're telling SLY that you want the precedence of the expression to be the same as for this special marker instead of the usual precedence. @@ -934,7 +927,6 @@ rule that appears first in the grammar file. Reduce/reduce conflicts are almost always caused when different sets of grammar rules somehow generate the same set of symbols. For example:: - assignment : ID EQUALS NUMBER | ID EQUALS expr @@ -950,7 +942,7 @@ In this case, a reduce/reduce conflict exists between these two rules:: assignment : ID EQUALS NUMBER expr : NUMBER -For example, if you wrote "a = 5", the parser can't figure out if this +For example, if you're parsing ``a = 5``, the parser can't figure out if this is supposed to be reduced as ``assignment : ID EQUALS NUMBER`` or whether it's supposed to reduce the 5 as an expression and then reduce the rule ``assignment : ID EQUALS expr``. @@ -975,1200 +967,401 @@ Parser Debugging Tracking down shift/reduce and reduce/reduce conflicts is one of the finer pleasures of using an LR parsing algorithm. To assist in -debugging, SLY creates a debugging file called 'parser.out' when it -generates the parsing table. The contents of this file look like the -following: - -
-
-Unused terminals:
-
-
-Grammar
-
-Rule 1     expression -> expression PLUS expression
-Rule 2     expression -> expression MINUS expression
-Rule 3     expression -> expression TIMES expression
-Rule 4     expression -> expression DIVIDE expression
-Rule 5     expression -> NUMBER
-Rule 6     expression -> LPAREN expression RPAREN
-
-Terminals, with rules where they appear
-
-TIMES                : 3
-error                : 
-MINUS                : 2
-RPAREN               : 6
-LPAREN               : 6
-DIVIDE               : 4
-PLUS                 : 1
-NUMBER               : 5
-
-Nonterminals, with rules where they appear
-
-expression           : 1 1 2 2 3 3 4 4 6 0
-
-
-Parsing method: LALR
-
-
-state 0
-
-    S' -> . expression
-    expression -> . expression PLUS expression
-    expression -> . expression MINUS expression
-    expression -> . expression TIMES expression
-    expression -> . expression DIVIDE expression
-    expression -> . NUMBER
-    expression -> . LPAREN expression RPAREN
-
-    NUMBER          shift and go to state 3
-    LPAREN          shift and go to state 2
-
-
-state 1
-
-    S' -> expression .
-    expression -> expression . PLUS expression
-    expression -> expression . MINUS expression
-    expression -> expression . TIMES expression
-    expression -> expression . DIVIDE expression
-
-    PLUS            shift and go to state 6
-    MINUS           shift and go to state 5
-    TIMES           shift and go to state 4
-    DIVIDE          shift and go to state 7
-
-
-state 2
-
-    expression -> LPAREN . expression RPAREN
-    expression -> . expression PLUS expression
-    expression -> . expression MINUS expression
-    expression -> . expression TIMES expression
-    expression -> . expression DIVIDE expression
-    expression -> . NUMBER
-    expression -> . LPAREN expression RPAREN
-
-    NUMBER          shift and go to state 3
-    LPAREN          shift and go to state 2
-
-
-state 3
-
-    expression -> NUMBER .
-
-    $               reduce using rule 5
-    PLUS            reduce using rule 5
-    MINUS           reduce using rule 5
-    TIMES           reduce using rule 5
-    DIVIDE          reduce using rule 5
-    RPAREN          reduce using rule 5
-
-
-state 4
-
-    expression -> expression TIMES . expression
-    expression -> . expression PLUS expression
-    expression -> . expression MINUS expression
-    expression -> . expression TIMES expression
-    expression -> . expression DIVIDE expression
-    expression -> . NUMBER
-    expression -> . LPAREN expression RPAREN
-
-    NUMBER          shift and go to state 3
-    LPAREN          shift and go to state 2
-
-
-state 5
-
-    expression -> expression MINUS . expression
-    expression -> . expression PLUS expression
-    expression -> . expression MINUS expression
-    expression -> . expression TIMES expression
-    expression -> . expression DIVIDE expression
-    expression -> . NUMBER
-    expression -> . LPAREN expression RPAREN
-
-    NUMBER          shift and go to state 3
-    LPAREN          shift and go to state 2
-
-
-state 6
-
-    expression -> expression PLUS . expression
-    expression -> . expression PLUS expression
-    expression -> . expression MINUS expression
-    expression -> . expression TIMES expression
-    expression -> . expression DIVIDE expression
-    expression -> . NUMBER
-    expression -> . LPAREN expression RPAREN
-
-    NUMBER          shift and go to state 3
-    LPAREN          shift and go to state 2
-
-
-state 7
-
-    expression -> expression DIVIDE . expression
-    expression -> . expression PLUS expression
-    expression -> . expression MINUS expression
-    expression -> . expression TIMES expression
-    expression -> . expression DIVIDE expression
-    expression -> . NUMBER
-    expression -> . LPAREN expression RPAREN
-
-    NUMBER          shift and go to state 3
-    LPAREN          shift and go to state 2
-
-
-state 8
-
-    expression -> LPAREN expression . RPAREN
-    expression -> expression . PLUS expression
-    expression -> expression . MINUS expression
-    expression -> expression . TIMES expression
-    expression -> expression . DIVIDE expression
-
-    RPAREN          shift and go to state 13
-    PLUS            shift and go to state 6
-    MINUS           shift and go to state 5
-    TIMES           shift and go to state 4
-    DIVIDE          shift and go to state 7
-
-
-state 9
-
-    expression -> expression TIMES expression .
-    expression -> expression . PLUS expression
-    expression -> expression . MINUS expression
-    expression -> expression . TIMES expression
-    expression -> expression . DIVIDE expression
-
-    $               reduce using rule 3
-    PLUS            reduce using rule 3
-    MINUS           reduce using rule 3
-    TIMES           reduce using rule 3
-    DIVIDE          reduce using rule 3
-    RPAREN          reduce using rule 3
-
-  ! PLUS            [ shift and go to state 6 ]
-  ! MINUS           [ shift and go to state 5 ]
-  ! TIMES           [ shift and go to state 4 ]
-  ! DIVIDE          [ shift and go to state 7 ]
-
-state 10
-
-    expression -> expression MINUS expression .
-    expression -> expression . PLUS expression
-    expression -> expression . MINUS expression
-    expression -> expression . TIMES expression
-    expression -> expression . DIVIDE expression
-
-    $               reduce using rule 2
-    PLUS            reduce using rule 2
-    MINUS           reduce using rule 2
-    RPAREN          reduce using rule 2
-    TIMES           shift and go to state 4
-    DIVIDE          shift and go to state 7
-
-  ! TIMES           [ reduce using rule 2 ]
-  ! DIVIDE          [ reduce using rule 2 ]
-  ! PLUS            [ shift and go to state 6 ]
-  ! MINUS           [ shift and go to state 5 ]
-
-state 11
-
-    expression -> expression PLUS expression .
-    expression -> expression . PLUS expression
-    expression -> expression . MINUS expression
-    expression -> expression . TIMES expression
-    expression -> expression . DIVIDE expression
-
-    $               reduce using rule 1
-    PLUS            reduce using rule 1
-    MINUS           reduce using rule 1
-    RPAREN          reduce using rule 1
-    TIMES           shift and go to state 4
-    DIVIDE          shift and go to state 7
-
-  ! TIMES           [ reduce using rule 1 ]
-  ! DIVIDE          [ reduce using rule 1 ]
-  ! PLUS            [ shift and go to state 6 ]
-  ! MINUS           [ shift and go to state 5 ]
-
-state 12
-
-    expression -> expression DIVIDE expression .
-    expression -> expression . PLUS expression
-    expression -> expression . MINUS expression
-    expression -> expression . TIMES expression
-    expression -> expression . DIVIDE expression
-
-    $               reduce using rule 4
-    PLUS            reduce using rule 4
-    MINUS           reduce using rule 4
-    TIMES           reduce using rule 4
-    DIVIDE          reduce using rule 4
-    RPAREN          reduce using rule 4
-
-  ! PLUS            [ shift and go to state 6 ]
-  ! MINUS           [ shift and go to state 5 ]
-  ! TIMES           [ shift and go to state 4 ]
-  ! DIVIDE          [ shift and go to state 7 ]
-
-state 13
-
-    expression -> LPAREN expression RPAREN .
-
-    $               reduce using rule 6
-    PLUS            reduce using rule 6
-    MINUS           reduce using rule 6
-    TIMES           reduce using rule 6
-    DIVIDE          reduce using rule 6
-    RPAREN          reduce using rule 6
-
-
- -The different states that appear in this file are a representation of -every possible sequence of valid input tokens allowed by the grammar. -When receiving input tokens, the parser is building up a stack and -looking for matching rules. Each state keeps track of the grammar -rules that might be in the process of being matched at that point. Within each -rule, the "." character indicates the current location of the parse -within that rule. In addition, the actions for each valid input token -are listed. When a shift/reduce or reduce/reduce conflict arises, -rules not selected are prefixed with an !. For example: - -
-
-  ! TIMES           [ reduce using rule 2 ]
-  ! DIVIDE          [ reduce using rule 2 ]
-  ! PLUS            [ shift and go to state 6 ]
-  ! MINUS           [ shift and go to state 5 ]
-
-
- -By looking at these rules (and with a little practice), you can usually track down the source -of most parsing conflicts. It should also be stressed that not all shift-reduce conflicts are -bad. However, the only way to be sure that they are resolved correctly is to look at parser.out. +debugging, you can have SLY produce a debugging file when it +constructs the parsing tables. Add a ``debugfile`` attribute to your +class like this:: + + class CalcParser(Parser): + debugfile = 'parser.out' + ... + +When present, this will write the entire grammar along with all parsing +states to the file you specify. Each state of the parser is shown +as output that looks something like this:: + + state 2 + + (7) factor -> LPAREN . expr RPAREN + (1) expr -> . term + (2) expr -> . expr MINUS term + (3) expr -> . expr PLUS term + (4) term -> . factor + (5) term -> . term DIVIDE factor + (6) term -> . term TIMES factor + (7) factor -> . LPAREN expr RPAREN + (8) factor -> . NUMBER + LPAREN shift and go to state 2 + NUMBER shift and go to state 3 + + factor shift and go to state 1 + term shift and go to state 4 + expr shift and go to state 6 + +Each state keeps track of the grammar rules that might be in the +process of being matched at that point. Within each rule, the "." +character indicates the current location of the parse within that +rule. In addition, the actions for each valid input token are listed. +By looking at these rules (and with a little practice), you can +usually track down the source of most parsing conflicts. It should +also be stressed that not all shift-reduce conflicts are bad. +However, the only way to be sure that they are resolved correctly is +to look at the debugging file. -

6.8 Syntax Error Handling

- +Syntax Error Handling +^^^^^^^^^^^^^^^^^^^^^ If you are creating a parser for production use, the handling of syntax errors is important. As a general rule, you don't want a parser to simply throw up its hands and stop at the first sign of -trouble. Instead, you want it to report the error, recover if possible, and -continue parsing so that all of the errors in the input get reported -to the user at once. This is the standard behavior found in compilers -for languages such as C, C++, and Java. +trouble. Instead, you want it to report the error, recover if +possible, and continue parsing so that all of the errors in the input +get reported to the user at once. This is the standard behavior found +in compilers for languages such as C, C++, and Java. -In PLY, when a syntax error occurs during parsing, the error is immediately +In SLY, when a syntax error occurs during parsing, the error is immediately detected (i.e., the parser does not read any more tokens beyond the source of the error). However, at this point, the parser enters a recovery mode that can be used to try and continue further parsing. As a general rule, error recovery in LR parsers is a delicate topic that involves ancient rituals and black-magic. The recovery mechanism -provided by yacc.py is comparable to Unix yacc so you may want +provided by SLY is comparable to Unix yacc so you may want consult a book like O'Reilly's "Lex and Yacc" for some of the finer details. -

-When a syntax error occurs, yacc.py performs the following steps: +When a syntax error occurs, SLY performs the following steps: -

    -
  1. On the first occurrence of an error, the user-defined p_error() function -is called with the offending token as an argument. However, if the syntax error is due to -reaching the end-of-file, p_error() is called with an - argument of None. -Afterwards, the parser enters -an "error-recovery" mode in which it will not make future calls to p_error() until it -has successfully shifted at least 3 tokens onto the parsing stack. +1. On the first occurrence of an error, the user-defined ``error()`` + method is called with the offending token as an argument. However, if + the syntax error is due to reaching the end-of-file, an argument of + ``None`` is passed. Afterwards, the parser enters an "error-recovery" + mode in which it will not make future calls to ``error()`` until it + has successfully shifted at least 3 tokens onto the parsing stack. -

    -

  2. If no recovery action is taken in p_error(), the offending lookahead token is replaced -with a special error token. +2. If no recovery action is taken in ``error()``, the offending + lookahead token is replaced with a special ``error`` token. -

    -

  3. If the offending lookahead token is already set to error, the top item of the parsing stack is -deleted. +3. If the offending lookahead token is already set to ``error``, + the top item of the parsing stack is deleted. -

    -

  4. If the entire parsing stack is unwound, the parser enters a restart state and attempts to start -parsing from its initial state. +4. If the entire parsing stack is unwound, the parser enters a restart + state and attempts to start parsing from its initial state. -

    -

  5. If a grammar rule accepts error as a token, it will be -shifted onto the parsing stack. +5. If a grammar rule accepts ``error`` as a token, it will be + shifted onto the parsing stack. -

    -

  6. If the top item of the parsing stack is error, lookahead tokens will be discarded until the -parser can successfully shift a new symbol or reduce a rule involving error. -
+6. If the top item of the parsing stack is ``error``, lookahead tokens will be discarded until the +parser can successfully shift a new symbol or reduce a rule involving ``error``. -

6.8.1 Recovery and resynchronization with error rules

+Recovery and resynchronization with error rules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The most well-behaved approach for handling syntax errors is to write +grammar rules that include the ``error`` token. For example, +suppose your language had a grammar rule for a print statement like +this:: -The most well-behaved approach for handling syntax errors is to write grammar rules that include the error -token. For example, suppose your language had a grammar rule for a print statement like this: + @_('PRINT expr SEMI') + def statement(self, p): + ... -
-
-def p_statement_print(p):
-     'statement : PRINT expr SEMI'
-     ...
-
-
+To account for the possibility of a bad expression, you might write an +additional grammar rule like this:: -To account for the possibility of a bad expression, you might write an additional grammar rule like this: + @_('PRINT error SEMI') + def statement(self, p): + print("Syntax error in print statement. Bad expression") -
-
-def p_statement_print_error(p):
-     'statement : PRINT error SEMI'
-     print("Syntax error in print statement. Bad expression")
-
-
-
- -In this case, the error token will match any sequence of +In this case, the ``error`` token will match any sequence of tokens that might appear up to the first semicolon that is encountered. Once the semicolon is reached, the rule will be -invoked and the error token will go away. +invoked and the ``error`` token will go away. -

This type of recovery is sometimes known as parser resynchronization. -The error token acts as a wildcard for any bad input text and -the token immediately following error acts as a +The ``error`` token acts as a wildcard for any bad input text and +the token immediately following ``error`` acts as a synchronization token. -
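+
+To see how the synchronization plays out in a larger grammar, consider
+this sketch (the ``statements`` wrapper rule here is hypothetical, not
+part of the earlier examples).  A malformed print statement consumes
+tokens only up to its terminating semicolon, so the statements that
+follow it still get parsed::
+
+    @_('statements statement',
+       'statement')
+    def statements(self, p):
+        ...
+
+    @_('PRINT expr SEMI')
+    def statement(self, p):
+        ...
+
+    @_('PRINT error SEMI')
+    def statement(self, p):
+        print("Syntax error in print statement. Bad expression")
+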

-It is important to note that the error token usually does not appear as the last token -on the right in an error rule. For example: +It is important to note that the ``error`` token usually does not +appear as the last token on the right in an error rule. For example:: -

-
-def p_statement_print_error(p):
-    'statement : PRINT error'
-    print("Syntax error in print statement. Bad expression")
-
-
+ @_('PRINT error') + def statement(self, p): + print("Syntax error in print statement. Bad expression") This is because the first bad token encountered will cause the rule to be reduced--which may make it difficult to recover if more bad tokens immediately follow. -

6.8.2 Panic mode recovery

+Panic mode recovery +~~~~~~~~~~~~~~~~~~~ +An alternative error recovery scheme is to enter a panic mode recovery +in which tokens are discarded to a point where the parser might be +able to recover in some sensible manner. -An alternative error recovery scheme is to enter a panic mode recovery in which tokens are -discarded to a point where the parser might be able to recover in some sensible manner. +Panic mode recovery is implemented entirely in the ``error()`` +function. For example, this function starts discarding tokens until +it reaches a closing '}'. Then, it restarts the parser in its initial +state:: -

-Panic mode recovery is implemented entirely in the p_error() function. For example, this -function starts discarding tokens until it reaches a closing '}'. Then, it restarts the -parser in its initial state. + def error(self, p): + print("Whoa. You are seriously hosed.") + if not p: + print("End of File!") + return -

-
-def p_error(p):
-    print("Whoa. You are seriously hosed.")
-    if not p:
-        print("End of File!")
-        return
+        # Read ahead looking for a closing '}'
+        while True:
+            tok = next(self.tokens, None)
+            if not tok or tok.type == 'RBRACE': 
+                break
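+
+        # Once the scan loop has found the '}' (or hit end-of-input),
+        # reset the parser back to its starting state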
+        self.restart()
 
-    # Read ahead looking for a closing '}'
-    while True:
-        tok = parser.token()             # Get the next token
-        if not tok or tok.type == 'RBRACE': 
-            break
-    parser.restart()
-
-
+This function simply discards the bad token and tells the parser that +the error was ok:: -

-This function simply discards the bad token and tells the parser that the error was ok. + def error(self, p): + if p: + print("Syntax error at token", p.type) + # Just discard the token and tell the parser it's okay. + self.errok() + else: + print("Syntax error at EOF") -

-
-def p_error(p):
-    if p:
-         print("Syntax error at token", p.type)
-         # Just discard the token and tell the parser it's okay.
-         parser.errok()
-    else:
-         print("Syntax error at EOF")
-
-
+A few additional details about some of the attributes and methods being used: -

-More information on these methods is as follows: -

+- ``self.errok()``. This resets the parser state so it doesn't think + it's in error-recovery mode. This will prevent an ``error`` token + from being generated and will reset the internal error counters so + that the next syntax error will call ``error()`` again. -

-

+To supply the next lookahead token to the parser, ``error()`` can return a token. This might be +useful if trying to synchronize on special characters. For example:: -

-To supply the next lookahead token to the parser, p_error() can return a token. This might be -useful if trying to synchronize on special characters. For example: + def error(self, tok): + # Read ahead looking for a terminating ";" + while True: + tok = next(self.tokens, None) # Get the next token + if not tok or tok.type == 'SEMI': + break + self.errok() -

-
-def p_error(p):
-    # Read ahead looking for a terminating ";"
-    while True:
-        tok = parser.token()             # Get the next token
-        if not tok or tok.type == 'SEMI': break
-    parser.errok()
+        # Return SEMI to the parser as the next lookahead token
+        return tok  
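+
+In practice these handlers are exercised just like a normal parse.  As
+a quick sketch (assuming the ``CalcLexer`` and ``CalcParser`` classes
+from the earlier example, with an ``error()`` method like one of those
+above), feeding in deliberately malformed input triggers the handler
+with the offending token::
+
+    lexer = CalcLexer()
+    parser = CalcParser()
+
+    # The stray ')' causes error() to be called; what happens next
+    # depends on the recovery strategy the handler implements.
+    result = parser.parse(lexer.tokenize('3 + ) * 4'))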
 
-    # Return SEMI to the parser as the next lookahead token
-    return tok  
-
-
+When Do Syntax Errors Get Reported? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -

-Keep in mind in that the above error handling functions, -parser is an instance of the parser created by -yacc(). You'll need to save this instance someplace in your -code so that you can refer to it during error handling. -

+In most cases, SLY will handle errors as soon as a bad input token is +detected on the input. However, be aware that SLY may choose to delay +error handling until after it has reduced one or more grammar rules +first. This behavior might be unexpected, but it's related to special +states in the underlying parsing table known as "defaulted states." A +defaulted state is parsing condition where the same grammar rule will +be reduced regardless of what valid token comes next on the input. +For such states, SLY chooses to go ahead and reduce the grammar rule +*without reading the next input token*. If the next token is bad, SLY +will eventually get around to reading it and report a syntax error. +It's just a little unusual in that you might see some of your grammar +rules firing immediately prior to the syntax error. -

6.8.3 Signalling an error from a production

+General comments on error handling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For normal types of languages, error recovery with error rules and +resynchronization characters is probably the most reliable +technique. This is because you can instrument the grammar to catch +errors at selected places where it is relatively easy to recover and +continue parsing. Panic mode recovery is really only useful in +certain specialized applications where you might want to discard huge +portions of the input text to find a valid restart point. -If necessary, a production rule can manually force the parser to enter error recovery. This -is done by raising the SyntaxError exception like this: - -
-
-def p_production(p):
-    'production : some production ...'
-    raise SyntaxError
-
-
- -The effect of raising SyntaxError is the same as if the last symbol shifted onto the -parsing stack was actually a syntax error. Thus, when you do this, the last symbol shifted is popped off -of the parsing stack and the current lookahead token is set to an error token. The parser -then enters error-recovery mode where it tries to reduce rules that can accept error tokens. -The steps that follow from this point are exactly the same as if a syntax error were detected and -p_error() were called. - -

-One important aspect of manually setting an error is that the p_error() function will NOT be -called in this case. If you need to issue an error message, make sure you do it in the production that -raises SyntaxError. - -

-Note: This feature of PLY is meant to mimic the behavior of the YYERROR macro in yacc. - -

6.8.4 When Do Syntax Errors Get Reported

- - -

-In most cases, yacc will handle errors as soon as a bad input token is -detected on the input. However, be aware that yacc may choose to -delay error handling until after it has reduced one or more grammar -rules first. This behavior might be unexpected, but it's related to -special states in the underlying parsing table known as "defaulted -states." A defaulted state is parsing condition where the same -grammar rule will be reduced regardless of what valid token -comes next on the input. For such states, yacc chooses to go ahead -and reduce the grammar rule without reading the next input -token. If the next token is bad, yacc will eventually get around to reading it and -report a syntax error. It's just a little unusual in that you might -see some of your grammar rules firing immediately prior to the syntax -error. -

- -

-Usually, the delayed error reporting with defaulted states is harmless -(and there are other reasons for wanting PLY to behave in this way). -However, if you need to turn this behavior off for some reason. You -can clear the defaulted states table like this: -

- -
-
-parser = yacc.yacc()
-parser.defaulted_states = {}
-
-
- -

-Disabling defaulted states is not recommended if your grammar makes use -of embedded actions as described in Section 6.11.

- -

6.8.5 General comments on error handling

- - -For normal types of languages, error recovery with error rules and resynchronization characters is probably the most reliable -technique. This is because you can instrument the grammar to catch errors at selected places where it is relatively easy -to recover and continue parsing. Panic mode recovery is really only useful in certain specialized applications where you might want -to discard huge portions of the input text to find a valid restart point. - -

6.9 Line Number and Position Tracking

- +Line Number and Position Tracking +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Position tracking is often a tricky problem when writing compilers. -By default, PLY tracks the line number and position of all tokens. -This information is available using the following functions: +By default, SLY tracks the line number and position of all tokens. +The following attributes may be useful in a production method: - +- ``p.lineno``. Line number of the left-most terminal in a production. +- ``p.index``. Lexing index of the left-most terminal in a production. -For example: +For example:: -
-
-def p_expression(p):
-    'expression : expression PLUS expression'
-    line   = p.lineno(2)        # line number of the PLUS token
-    index  = p.lexpos(2)        # Position of the PLUS token
-
-
- -As an optional feature, yacc.py can automatically track line -numbers and positions for all of the grammar symbols as well. -However, this extra tracking requires extra processing and can -significantly slow down parsing. Therefore, it must be enabled by -passing the -tracking=True option to yacc.parse(). For example: - -
-
-yacc.parse(data,tracking=True)
-
-
- -Once enabled, the lineno() and lexpos() methods work -for all grammar symbols. In addition, two additional methods can be -used: - - - -For example: - -
-
-def p_expression(p):
-    'expression : expression PLUS expression'
-    p.lineno(1)        # Line number of the left expression
-    p.lineno(2)        # line number of the PLUS operator
-    p.lineno(3)        # line number of the right expression
-    ...
-    start,end = p.linespan(3)    # Start,end lines of the right expression
-    starti,endi = p.lexspan(3)   # Start,end positions of right expression
-
-
-
- -Note: The lexspan() function only returns the range of values up to the start of the last grammar symbol. - -

-Although it may be convenient for PLY to track position information on -all grammar symbols, this is often unnecessary. For example, if you -are merely using line number information in an error message, you can -often just key off of a specific token in the grammar rule. For -example: - -

-
-def p_bad_func(p):
-    'funccall : fname LPAREN error RPAREN'
-    # Line number reported from LPAREN token
-    print("Bad function call at line", p.lineno(2))
-
-
- -

-Similarly, you may get better parsing performance if you only -selectively propagate line number information where it's needed using -the p.set_lineno() method. For example: - -

-
-def p_fname(p):
-    'fname : ID'
-    p[0] = p[1]
-    p.set_lineno(0,p.lineno(1))
-
-
- -PLY doesn't retain line number information from rules that have already been -parsed. If you are building an abstract syntax tree and need to have line numbers, -you should make sure that the line numbers appear in the tree itself. - -

6.10 AST Construction

+ @_('expr PLUS expr') + def expr(self, p): + line = p.lineno # line number of the PLUS token + index = p.index # Index of the PLUS token in input text -yacc.py provides no special functions for constructing an -abstract syntax tree. However, such construction is easy enough to do -on your own. +SLY doesn't propagate line number information to non-terminals. If you need +this, you'll need to store line number information yourself and propagate it +in AST nodes or some other data structure. -

A minimal way to construct a tree is to simply create and +AST Construction +^^^^^^^^^^^^^^^^ + +SLY provides no special functions for constructing an abstract syntax +tree. However, such construction is easy enough to do on your own. + +A minimal way to construct a tree is to simply create and propagate a tuple or list in each grammar rule function. There are many possible ways to do this, but one example would be something -like this: +like this:: -

-
-def p_expression_binop(p):
-    '''expression : expression PLUS expression
-                  | expression MINUS expression
-                  | expression TIMES expression
-                  | expression DIVIDE expression'''
+    @_('expr PLUS expr',
+       'expr MINUS expr', 
+       'expr TIMES expr',
+       'expr DIVIDE expr')
+    def expr(self, p):
+        return ('binary-expression', p[1], p[0], p[2])
 
-    p[0] = ('binary-expression',p[2],p[1],p[3])
+    @_('LPAREN expr RPAREN')
+    def expr(self, p):
+        return ('group-expression',p[1])
 
-def p_expression_group(p):
-    'expression : LPAREN expression RPAREN'
-    p[0] = ('group-expression',p[2])
+    @_('NUMBER')
+    def expr(self, p):
+        return ('number-expression', p[0])
 
-def p_expression_number(p):
-    'expression : NUMBER'
-    p[0] = ('number-expression',p[1])
-
-
- -

Another approach is to create a set of data structure for different -kinds of abstract syntax tree nodes and assign nodes to p[0] -in each rule. For example: +kinds of abstract syntax tree nodes and create different node types +in each rule:: -

-
-class Expr: pass
+    class Expr:
+        pass
 
-class BinOp(Expr):
-    def __init__(self,left,op,right):
-        self.type = "binop"
-        self.left = left
-        self.right = right
-        self.op = op
+    class BinOp(Expr):
+        def __init__(self, op, left, right):
+            self.op = op
+            self.left = left
+            self.right = right
 
-class Number(Expr):
-    def __init__(self,value):
-        self.type = "number"
-        self.value = value
+    class Number(Expr):
+        def __init__(self, value):
+            self.value = value
 
-def p_expression_binop(p):
-    '''expression : expression PLUS expression
-                  | expression MINUS expression
-                  | expression TIMES expression
-                  | expression DIVIDE expression'''
+    @_('expr PLUS expr',
+       'expr MINUS expr', 
+       'expr TIMES expr',
+       'expr DIVIDE expr')
+    def expr(self, p):
+        return BinOp(p[1], p[0], p[2])
 
-    p[0] = BinOp(p[1],p[2],p[3])
+    @_('LPAREN expr RPAREN')
+    def expr(self, p):
+        return p[1]
 
-def p_expression_group(p):
-    'expression : LPAREN expression RPAREN'
-    p[0] = p[2]
+    @_('NUMBER')
+    def expr(self, p):
+        return Number(p[0])
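+
+To see how such a tree might be consumed, here is a minimal evaluator
+sketch (the ``evaluate`` function is illustrative, not part of SLY; it
+assumes the operator token values are the literal characters ``'+'``,
+``'-'``, ``'*'``, and ``'/'`` as in the earlier lexer)::
+
+    def evaluate(node):
+        # Walk the AST bottom-up, computing a numeric result
+        if isinstance(node, Number):
+            return node.value
+        elif isinstance(node, BinOp):
+            left = evaluate(node.left)
+            right = evaluate(node.right)
+            if node.op == '+':
+                return left + right
+            elif node.op == '-':
+                return left - right
+            elif node.op == '*':
+                return left * right
+            elif node.op == '/':
+                return left / right
+        raise TypeError(f'Unexpected node: {node!r}')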
 
-def p_expression_number(p):
-    'expression : NUMBER'
-    p[0] = Number(p[1])
-
-
+The advantage to this approach is that it may make it easier to attach +more complicated semantics, type checking, code generation, and other +features to the node classes. -The advantage to this approach is that it may make it easier to attach more complicated -semantics, type checking, code generation, and other features to the node classes. +Embedded Actions +^^^^^^^^^^^^^^^^ -

-To simplify tree traversal, it may make sense to pick a very generic -tree structure for your parse tree nodes. For example: +The parsing technique used by SLY only allows actions to be executed +at the end of a rule. For example, suppose you have a rule like this:: -

-
-class Node:
-    def __init__(self,type,children=None,leaf=None):
-         self.type = type
-         if children:
-              self.children = children
-         else:
-              self.children = [ ]
-         self.leaf = leaf
-	 
-def p_expression_binop(p):
-    '''expression : expression PLUS expression
-                  | expression MINUS expression
-                  | expression TIMES expression
-                  | expression DIVIDE expression'''
+    @_('A B C D')
+    def foo(self, p):
+        print("Parsed a foo", p[0],p[1],p[2],p[3])
 
-    p[0] = Node("binop", [p[1],p[3]], p[2])
-
-
- -

6.11 Embedded Actions

- - -The parsing technique used by yacc only allows actions to be executed at the end of a rule. For example, -suppose you have a rule like this: - -
-
-def p_foo(p):
-    "foo : A B C D"
-    print("Parsed a foo", p[1],p[2],p[3],p[4])
-
-
- -

In this case, the supplied action code only executes after all of the -symbols A, B, C, and D have been +symbols ``A``, ``B``, ``C``, and ``D`` have been parsed. Sometimes, however, it is useful to execute small code fragments during intermediate stages of parsing. For example, suppose -you wanted to perform some action immediately after A has -been parsed. To do this, write an empty rule like this: +you wanted to perform some action immediately after ``A`` has +been parsed. To do this, write an empty rule like this:: -

-
-def p_foo(p):
-    "foo : A seen_A B C D"
-    print("Parsed a foo", p[1],p[3],p[4],p[5])
-    print("seen_A returned", p[2])
+    @_('A seen_A B C D')
+    def foo(self, p):
+        print("Parsed a foo", p[0],p[2],p[3],p[4])
+        print("seen_A returned", p[1])
 
-def p_seen_A(p):
-    "seen_A :"
-    print("Saw an A = ", p[-1])   # Access grammar symbol to left
-    p[0] = some_value            # Assign value to seen_A
+    @_('')
+    def seen_A(self, p):
+        print("Saw an A = ", p[-1])   # Access grammar symbol to the left
+        return 'some_value'           # Assign value to seen_A
 
-
-
+In this example, the empty ``seen_A`` rule executes immediately after +``A`` is shifted onto the parsing stack. Within this rule, ``p[-1]`` +refers to the symbol on the stack that appears immediately to the left +of the ``seen_A`` symbol. In this case, it would be the value of +``A`` in the ``foo`` rule immediately above. Like other rules, a +value can be returned from an embedded action by returning it. -

-In this example, the empty seen_A rule executes immediately -after A is shifted onto the parsing stack. Within this -rule, p[-1] refers to the symbol on the stack that appears -immediately to the left of the seen_A symbol. In this case, -it would be the value of A in the foo rule -immediately above. Like other rules, a value can be returned from an -embedded action by simply assigning it to p[0] +The use of embedded actions can sometimes introduce extra shift/reduce +conflicts. For example, this grammar has no conflicts:: -

-The use of embedded actions can sometimes introduce extra shift/reduce conflicts. For example, -this grammar has no conflicts: + @_('abcd', + 'abcx') + def foo(self, p): + pass -

-
-def p_foo(p):
-    """foo : abcd
-           | abcx"""
+    @_('A B C D')
+    def abcd(self, p):
+        pass
 
-def p_abcd(p):
-    "abcd : A B C D"
+    @_('A B C X')
+    def abcx(self, p):
+        pass
 
-def p_abcx(p):
-    "abcx : A B C X"
-
-
+However, if you insert an embedded action into one of the rules like this:: -However, if you insert an embedded action into one of the rules like this, + @_('abcd', + 'abcx') + def foo(self, p): + pass -
-
-def p_foo(p):
-    """foo : abcd
-           | abcx"""
+    @_('A B C D')
+    def abcd(self, p):
+        pass
 
-def p_abcd(p):
-    "abcd : A B C D"
+    @_('A B seen_AB C X')
+    def abcx(self, p):
+        pass
 
-def p_abcx(p):
-    "abcx : A B seen_AB C X"
-
-def p_seen_AB(p):
-    "seen_AB :"
-
-
+ @_('') + def seen_AB(self, p): + pass an extra shift-reduce conflict will be introduced. This conflict is -caused by the fact that the same symbol C appears next in -both the abcd and abcx rules. The parser can either -shift the symbol (abcd rule) or reduce the empty -rule seen_AB (abcx rule). +caused by the fact that the same symbol ``C`` appears next in +both the ``abcd`` and ``abcx`` rules. The parser can either +shift the symbol (``abcd`` rule) or reduce the empty +rule ``seen_AB`` (``abcx`` rule). -

A common use of embedded rules is to control other aspects of parsing -such as scoping of local variables. For example, if you were parsing C code, you might -write code like this: +such as scoping of local variables. For example, if you were parsing +C code, you might write code like this:: -

-
-def p_statements_block(p):
-    "statements: LBRACE new_scope statements RBRACE"""
-    # Action code
-    ...
-    pop_scope()        # Return to previous scope
+    @_('LBRACE new_scope statements RBRACE')
+    def statements(self, p):
+        # Action code
+        ...
+        pop_scope()        # Return to previous scope
+    
+    @_('')
+    def new_scope(self, p):
+        # Create a new scope for local variables
+        create_scope()
+        ...
 
-def p_new_scope(p):
-    "new_scope :"
-    # Create a new scope for local variables
-    s = new_scope()
-    push_scope(s)
-    ...
-
-
- -In this case, the embedded action new_scope executes -immediately after a LBRACE ({) symbol is parsed. +In this case, the embedded action ``new_scope`` executes +immediately after a ``LBRACE`` (``{``) symbol is parsed. This might adjust internal symbol tables and other aspects of the -parser. Upon completion of the rule statements_block, code -might undo the operations performed in the embedded action -(e.g., pop_scope()). - -

6.12 Miscellaneous Yacc Notes

- - - -

- - -

7. Multiple Parsers and Lexers

- - -In advanced parsing applications, you may want to have multiple -parsers and lexers. - -

-As a general rules this isn't a problem. However, to make it work, -you need to carefully make sure everything gets hooked up correctly. -First, make sure you save the objects returned by lex() and -yacc(). For example: - -

-
-lexer  = lex.lex()       # Return lexer object
-parser = yacc.yacc()     # Return parser object
-
-
- -Next, when parsing, make sure you give the parse() function a reference to the lexer it -should be using. For example: - -
-
-parser.parse(text,lexer=lexer)
-
-
- -If you forget to do this, the parser will use the last lexer -created--which is not always what you want. - -

-Within lexer and parser rule functions, these objects are also -available. In the lexer, the "lexer" attribute of a token refers to -the lexer object that triggered the rule. For example: - -

-
-def t_NUMBER(t):
-   r'\d+'
-   ...
-   print(t.lexer)           # Show lexer object
-
-
- -In the parser, the "lexer" and "parser" attributes refer to the lexer -and parser objects respectively. - -
-
-def p_expr_plus(p):
-   'expr : expr PLUS expr'
-   ...
-   print(p.parser)          # Show parser object
-   print(p.lexer)           # Show lexer object
-
-
- -If necessary, arbitrary attributes can be attached to the lexer or parser object. -For example, if you wanted to have different parsing modes, you could attach a mode -attribute to the parser object and look at it later. - -

8. Using Python's Optimized Mode

- - -Because PLY uses information from doc-strings, parsing and lexing -information must be gathered while running the Python interpreter in -normal mode (i.e., not with the -O or -OO options). However, if you -specify optimized mode like this: - -
-
-lex.lex(optimize=1)
-yacc.yacc(optimize=1)
-
-
- -then PLY can later be used when Python runs in optimized mode. To make this work, -make sure you first run Python in normal mode. Once the lexing and parsing tables -have been generated the first time, run Python in optimized mode. PLY will use -the tables without the need for doc strings. - -

-Beware: running PLY in optimized mode disables a lot of error -checking. You should only do this when your project has stabilized -and you don't need to do any debugging. One of the purposes of -optimized mode is to substantially decrease the startup time of -your compiler (by assuming that everything is already properly -specified and works). - -

9. Advanced Debugging

- - -

-Debugging a compiler is typically not an easy task. PLY provides some -advanced diagostic capabilities through the use of Python's -logging module. The next two sections describe this: - -

9.1 Debugging the lex() and yacc() commands

- - -

-Both the lex() and yacc() commands have a debugging -mode that can be enabled using the debug flag. For example: - -

-
-lex.lex(debug=True)
-yacc.yacc(debug=True)
-
-
- -Normally, the output produced by debugging is routed to either -standard error or, in the case of yacc(), to a file -parser.out. This output can be more carefully controlled -by supplying a logging object. Here is an example that adds -information about where different debugging messages are coming from: - -
-
-# Set up a logging object
-import logging
-logging.basicConfig(
-    level = logging.DEBUG,
-    filename = "parselog.txt",
-    filemode = "w",
-    format = "%(filename)10s:%(lineno)4d:%(message)s"
-)
-log = logging.getLogger()
-
-lex.lex(debug=True,debuglog=log)
-yacc.yacc(debug=True,debuglog=log)
-
-
- -If you supply a custom logger, the amount of debugging -information produced can be controlled by setting the logging level. -Typically, debugging messages are either issued at the DEBUG, -INFO, or WARNING levels. - -

-PLY's error messages and warnings are also produced using the logging -interface. This can be controlled by passing a logging object -using the errorlog parameter. - -

-
-lex.lex(errorlog=log)
-yacc.yacc(errorlog=log)
-
-
- -If you want to completely silence warnings, you can either pass in a -logging object with an appropriate filter level or use the NullLogger -object defined in either lex or yacc. For example: - -
-
-yacc.yacc(errorlog=yacc.NullLogger())
-
-
- -

9.2 Run-time Debugging

- - -

-To enable run-time debugging of a parser, use the debug option to parse. This -option can either be an integer (which simply turns debugging on or off) or an instance -of a logger object. For example: - -

-
-log = logging.getLogger()
-parser.parse(input,debug=log)
-
-
- -If a logging object is passed, you can use its filtering level to control how much -output gets generated. The INFO level is used to produce information -about rule reductions. The DEBUG level will show information about the -parsing stack, token shifts, and other details. The ERROR level shows information -related to parsing errors. - -

-For very complicated problems, you should pass in a logging object that -redirects to a file where you can more easily inspect the output after -execution. - -

11. Where to go from here?

-

-The examples directory of the PLY distribution contains several simple examples.  Please consult a
-compilers textbook for the theory and underlying implementation details or LR parsing.

-
-
-
-
+parser.  Upon completion of the rule ``statements``, code
+undoes the operations performed in the embedded action
+(e.g., ``pop_scope()``).

diff --git a/sly/yacc.py b/sly/yacc.py
index 61d8c30..f2a303e 100644
--- a/sly/yacc.py
+++ b/sly/yacc.py
@@ -105,8 +105,8 @@ class YaccSymbol:
 class YaccProduction:
     def __init__(self, s, stack=None):
         self._slice = s
-        self._stack = stack
         self._namemap = { }
+        self._stack = stack

     def __getitem__(self, n):
         if n >= 0:
@@ -115,19 +115,29 @@ class YaccProduction:
             return self._stack[n].value

     def __setitem__(self, n, v):
-        self._slice[n].value = v
+        if n >= 0:
+            self._slice[n].value = v
+        else:
+            self._stack[n].value = v

     def __len__(self):
         return len(self._slice)

-    def lineno(self, n):
-        return getattr(self._slice[n], 'lineno', 0)
+    @property
+    def lineno(self):
+        for tok in self._slice:
+            lineno = getattr(tok, 'lineno', None)
+            if lineno:
+                return lineno
+        raise AttributeError('No line number found')

-    def set_lineno(self, n, lineno):
-        self._slice[n].lineno = lineno
-
-    def index(self, n):
-        return getattr(self._slice[n], 'index', 0)
+    @property
+    def index(self):
+        for tok in self._slice:
+            index = getattr(tok, 'index', None)
+            if index is not None:
+                return index
+        raise AttributeError('No index attribute found')

     def __getattr__(self, name):
         return self._slice[self._namemap[name]].value
@@ -1821,9 +1831,10 @@ class Parser(metaclass=ParserMeta):
         errorcount = 0                        # Used during error recovery

         # Set up the state and symbol stacks
+        self.tokens = tokens
         self.statestack = statestack = []     # Stack of parsing states
         self.symstack = symstack = []         # Stack of grammar symbols
-        pslice.stack = symstack               # Put in the production
+        pslice._stack = symstack              # Associate the stack with the production
         self.restart()

         errtoken = None                       # Err token
@@ -1870,15 +1881,15 @@ class Parser(metaclass=ParserMeta):
                     # Call the production function
                     pslice._slice = symstack[-plen:] if plen else []

-                    if plen:
-                        del symstack[-plen:]
-                        del statestack[-plen:]
                     sym = YaccSymbol()
                     sym.type = pname
                     sym.value = p.func(self, pslice)
-                    symstack.append(sym)

+                    if plen:
+                        del symstack[-plen:]
+                        del statestack[-plen:]

+                    symstack.append(sym)
                     self.state = goto[statestack[-1]][pname]
                     statestack.append(self.state)
                     continue