Added EBNF choice handling

2020-05-09 12:24:33 -05:00
parent ab75a58b10
commit 1f87ddaf39
5 changed files with 322 additions and 6 deletions
--- a/18
+++ b/18
@@ -1,5 +1,19 @@
 Version 0.4
 -----------
 05/09/2020 Experimental support for EBNF choices.  For example:
 	      @_('term { PLUS|MINUS term }')
              def expr(self, p):
                  lterm = p.term0
                  for op, rterm in p[1]:
 		      lterm = BinOp(op, lterm, rterm)
           One issue here is just how one refers to the choice
           of values.  There is no unified name to pick. So,
           you basically have to do it using a numeric index like p[1].
           In this case, p[1] is a list of all of the repeated items
           (represented as tuples).
 05/09/2020 Changed the internal names used for EBNF rules to make them
           a bit easier to debug in the parser.out file.
@@ -8,7 +22,7 @@ Version 0.4
           expression list:
               @_('expr { COMMA expr }')
-               def exprlist(p):
+               def exprlist(self, p):
                   return [ p.expr0 ] + p.expr1
           In this code, the { ... } means zero-or-more repetitions.
@@ -19,7 +33,7 @@ Version 0.4
           An optional value can be enclosed in brackets like this:
              @_('VAR NAME [ EQUAL expr ] SEMI')
-              def variable_declaration(p):
+              def variable_declaration(self, p):
                  print(f"Defining {p.NAME}. Initial value={p.expr}")
           In this case, all symbols inside [ ... ] either have a value
--- a/example/calc/calc.py
+++ b/example/calc/calc.py
@@ -3,7 +3,7 @@
 # -----------------------------------------------------------------------------
 import sys
-sys.path.append('../..')
+sys.path.insert(0, '../..')
 from sly import Lexer, Parser
--- a/example/calc_ebnf/calc.py
+++ b/example/calc_ebnf/calc.py
@@ -0,0 +1,101 @@
 # -----------------------------------------------------------------------------
 # calc.py
 # -----------------------------------------------------------------------------
 import sys
 sys.path.insert(0, '../..')
 from sly import Lexer, Parser
 class CalcLexer(Lexer):
    # Lexer for the calculator example: identifiers, integers, the four
    # arithmetic operators, assignment and parentheses.
    # Names of the tokens this lexer produces; each has a pattern below.
    tokens = { NAME, NUMBER, PLUS, TIMES, MINUS, DIVIDE, ASSIGN, LPAREN, RPAREN }
    # Spaces and tabs are skipped between tokens.
    ignore = ' \t'
    # Tokens
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
    # NUMBER has no action here, so its value is the matched text; the parser
    # converts it with int().
    NUMBER = r'\d+'
    # Special symbols
    PLUS = r'\+'
    MINUS = r'-'
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    LPAREN = r'\('
    RPAREN = r'\)'
    # Ignored pattern
    ignore_newline = r'\n+'
    # Extra action for newlines
    # NOTE(review): the method reuses the name of the pattern defined just
    # above; presumably SLY pairs the two -- confirm against the SLY docs.
    def ignore_newline(self, t):
        # Advance the line counter by the number of newlines just matched.
        self.lineno += t.value.count('\n')
    def error(self, t):
        # Report the first character of the unmatched text, then step over
        # exactly one character so lexing can continue.
        print("Illegal character '%s'" % t.value[0])
        self.index += 1
 class CalcParser(Parser):
    # Parser for the calculator example.  Every rule evaluates its result
    # directly; variables set by assignment statements live in self.names.
    tokens = CalcLexer.tokens
    def __init__(self):
        # Maps a variable name to the value last assigned to it.
        self.names = { }
    # Assignment statement: store the expression value under the name.
    # Nothing is returned.
    @_('NAME ASSIGN expr')
    def statement(self, p):
        self.names[p.NAME] = p.expr
    # Bare expression statement: print the computed value.
    @_('expr')
    def statement(self, p):
        print(p.expr)
    # A term followed by zero or more repetitions of (PLUS or MINUS, term).
    @_('term { PLUS|MINUS term }')
    def expr(self, p):
        lval = p.term0
        # p[1] holds the repeated items; each one unpacks as
        # (operator, right operand).  Applying them in order folds the
        # chain from the left.
        for op, rval in p[1]:
            if op == '+':
                lval = lval + rval
            elif op == '-':
                lval = lval - rval
        return lval
    # A factor followed by zero or more repetitions of
    # (TIMES or DIVIDE, factor).
    @_('factor { TIMES|DIVIDE factor }')
    def term(self, p):
        lval = p.factor0
        # Same left-to-right folding as in expr(), for '*' and '/'.
        for op, rval in p[1]:
            if op == '*':
                lval = lval * rval
            elif op == '/':
                lval = lval / rval
        return lval
    # Unary minus applied to a factor.
    @_('MINUS factor')
    def factor(self, p):
        return -p.factor
    # Parenthesised sub-expression: pass the inner value through.
    @_('LPAREN expr RPAREN')
    def factor(self, p):
        return p.expr
    # Integer literal: convert the token value with int().
    @_('NUMBER')
    def factor(self, p):
        return int(p.NUMBER)
    # Variable reference: look the name up in self.names.
    @_('NAME')
    def factor(self, p):
        try:
            return self.names[p.NAME]
        except LookupError:
            # An unknown name is reported and evaluates to 0 instead of
            # aborting the parse.
            print(f'Undefined name {p.NAME!r}')
            return 0
 if __name__ == '__main__':
    # Interactive loop: read a line, evaluate it, repeat until end of input.
    lexer = CalcLexer()
    parser = CalcParser()
    while True:
        try:
            line = input('calc > ')
        except EOFError:
            # End of input terminates the session.
            break
        if not line:
            # Nothing typed; prompt again without invoking the parser.
            continue
        parser.parse(lexer.tokenize(line))
--- a/sly/yacc.py
+++ b/sly/yacc.py
@@ -1582,6 +1582,10 @@ def _collect_grammar_rules(func):
                        syms, prod = _replace_ebnf_repeat(syms)
                        ebnf_prod.extend(prod)
                        break
                    elif '|' in s:
                        syms, prod = _replace_ebnf_choice(syms)
                        ebnf_prod.extend(prod)
                        break
            if syms[1:2] == [':'] or syms[1:2] == ['::=']:
                grammar.append((func, filename, lineno, syms[0], syms[2:]))
@@ -1598,9 +1602,17 @@ def _replace_ebnf_repeat(syms):
    syms = list(syms)
    first = syms.index('{')
    end = syms.index('}', first)
-    symname, prods = _generate_repeat_rules(syms[first+1:end])
+
    # Look for choices inside
    repeated_syms = syms[first+1:end]
    if any('|' in sym for sym in repeated_syms):
        repeated_syms, prods = _replace_ebnf_choice(repeated_syms)
    else:
        prods = []
    symname, moreprods = _generate_repeat_rules(repeated_syms)
    syms[first:end+1] = [symname]
-    return syms, prods
+    return syms, prods + moreprods
 def _replace_ebnf_optional(syms):
    syms = list(syms)
@@ -1610,6 +1622,18 @@ def _replace_ebnf_optional(syms):
    syms[first:end+1] = [symname]
    return syms, prods
 def _replace_ebnf_choice(syms):
    # Replace every symbol containing '|' (e.g. 'PLUS|MINUS') with the name
    # of a generated choice rule.  Returns a new symbol list together with
    # the productions generated for those rules; the input is not modified.
    replaced = list(syms)
    generated = [ ]
    for index, symbol in enumerate(replaced):
        if '|' not in symbol:
            continue
        choice_name, choice_prods = _generate_choice_rules(symbol.split('|'))
        replaced[index] = choice_name
        generated.extend(choice_prods)
    return replaced, generated
 # Generate grammar rules for repeated items
 _gencount = 0
@@ -1729,6 +1753,31 @@ def _generate_optional_rules(symbols):
    productions.extend(_collect_grammar_rules(optional2))
    return name, productions
 def _generate_choice_rules(symbols):
    '''
    Build the grammar rule for a choice between alternative symbols.

    symbols is a list of grammar symbols such as [ 'PLUS', 'MINUS' ].
    The generated rule is equivalent to writing:

    @_('PLUS', 'MINUS')
    def choice(self, p):
        return p[0]

    Returns a (name, productions) pair: the unique name given to the
    generated rule and the list of productions collected for it.
    '''
    global _gencount
    # The running counter keeps every generated rule name unique.
    _gencount += 1
    sanitized = _sanitize_symbols(symbols)
    name = f'_{_gencount}_' + '_'.join(sanitized) + '_choice'

    def choice(self, p):
        return p[0]

    # The function name is overwritten with the generated rule name before
    # one alternative per symbol is attached to it.
    choice.__name__ = name
    choice = _decorator(*symbols)(choice)
    return name, list(_collect_grammar_rules(choice))
 class ParserMetaDict(dict):
    '''
    Dictionary that allows decorated grammar rule functions to be overloaded
--- a/tests/test_ebnf.py
+++ b/tests/test_ebnf.py
@@ -0,0 +1,152 @@
 import pytest
 from sly import Lexer, Parser
 class CalcLexer(Lexer):
    # Lexer used by the EBNF parser tests.  Lexing errors are collected in
    # self.errors rather than printed.
    # Set of token names.   This is always required
    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }
    # Parentheses are declared as literals, not named tokens; the grammar
    # refers to them in quoted form.
    literals = { '(', ')' }
    # String containing ignored characters between tokens
    ignore = ' \t'
    # Regular expression rules for tokens
    ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
    PLUS    = r'\+'
    MINUS   = r'-'
    TIMES   = r'\*'
    DIVIDE  = r'/'
    ASSIGN  = r'='
    COMMA   = r','
    # NUMBER carries an action: the matched digits are converted to int
    # before the token is returned.
    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t
    # Ignored text
    ignore_comment = r'\#.*'
    # Newlines only update the line counter; the method returns nothing.
    # NOTE(review): presumably SLY discards the match for that reason -- confirm.
    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')
    def error(self, t):
        # Record the first character of the unmatched text and step over one
        # character so lexing can continue.
        self.errors.append(t.value[0])
        self.index += 1
    def __init__(self):
        # Characters that could not be tokenized, in the order encountered.
        self.errors = []
 class CalcParser(Parser):
    # Parser used by the EBNF tests.  It exercises optional [ ... ],
    # repetition { ... } and choice A|B syntax in grammar rules.
    tokens = CalcLexer.tokens
    def __init__(self):
        # Variable name -> value, filled in by assignment statements.
        self.names = { }
        # Tokens passed to error(), in the order they were reported.
        self.errors = [ ]
    # Assignment statement: store the value; nothing is returned.
    @_('ID ASSIGN expr')
    def statement(self, p):
        self.names[p.ID] = p.expr
    # Call-like statement with an optional argument list.  test_ebnf in this
    # file expects p.arglist to be None when the bracketed part is absent.
    @_('ID "(" [ arglist ] ")"')
    def statement(self, p):
        return (p.ID, p.arglist)
    # One expression followed by zero or more ", expr" repetitions.
    @_('expr { COMMA expr }')
    def arglist(self, p):
        # p.expr0 is the first expression; p.expr1 is unpacked as the
        # sequence of the repeated ones.
        return [p.expr0, *p.expr1]
    # Bare expression statement: return the computed value.
    @_('expr')
    def statement(self, p):
        return p.expr
    # A term followed by zero or more repetitions of (PLUS or MINUS, term).
    @_('term { PLUS|MINUS term }')
    def expr(self, p):
        lval = p.term0
        # p[1] holds the repeated items; each one unpacks as
        # (operator, right operand) and is applied left to right.
        for op, rval in p[1]:
            if op == '+':
                lval = lval + rval
            elif op == '-':
                lval = lval - rval
        return lval
    # A factor followed by zero or more repetitions of
    # (TIMES or DIVIDE, factor).
    @_('factor { TIMES|DIVIDE factor }')
    def term(self, p):
        lval = p.factor0
        # Same left-to-right folding as in expr(), for '*' and '/'.
        for op, rval in p[1]:
            if op == '*':
                lval = lval * rval
            elif op == '/':
                lval = lval / rval
        return lval
    # Unary minus applied to a factor.
    @_('MINUS factor')
    def factor(self, p):
        return -p.factor
    # Parenthesised sub-expression, using the quoted literal form of the
    # parentheses.
    @_("'(' expr ')'")
    def factor(self, p):
        return p.expr
    # Numeric literal.  The lexer's NUMBER action already converts the value
    # to int; int() is applied again here.
    @_('NUMBER')
    def factor(self, p):
        return int(p.NUMBER)
    # Variable reference: look the name up in self.names.
    @_('ID')
    def factor(self, p):
        try:
            return self.names[p.ID]
        except LookupError:
            # An unknown name is reported and evaluates to 0.
            print(f'Undefined name {p.ID!r}')
            return 0
    def error(self, tok):
        # Collect the reported token so tests can inspect it afterwards.
        self.errors.append(tok)
 # Test basic recognition of various tokens and literals
 def test_simple():
    """Check that assignment and bare-expression statements evaluate correctly."""
    lexer = CalcLexer()
    parser = CalcParser()
    # The assignment rule returns nothing; the value is stored in parser.names.
    result = parser.parse(lexer.tokenize('a = 3 + 4 * (5 + 6)'))
    # Identity comparison is the correct check against the None singleton.
    assert result is None
    assert parser.names['a'] == 47
    # A bare expression returns its computed value.
    result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
    assert result == 47
 def test_ebnf():
    """Check optional and repeated argument lists in call-like statements."""
    lexer = CalcLexer()
    parser = CalcParser()
    # Each case pairs an input with the (name, arglist) result expected for it.
    cases = [
        ('a()', ('a', None)),
        ('a(2+3)', ('a', [5])),
        ('a(2+3, 4+5)', ('a', [5, 9])),
    ]
    for text, expected in cases:
        result = parser.parse(lexer.tokenize(text))
        assert result == expected
 def test_parse_error():
    """Check that one syntax error is reported and parsing still yields 9."""
    lexer = CalcLexer()
    parser = CalcParser()
    tokens = lexer.tokenize('a 123 4 + 5')
    result = parser.parse(tokens)
    assert result == 9
    # Exactly one token is reported: the NUMBER 123.
    assert len(parser.errors) == 1
    reported = parser.errors[0]
    assert reported.type == 'NUMBER'
    assert reported.value == 123
 # TO DO:  Add tests
 # - error productions
 # - embedded actions
 # - lineno tracking
 # - various error cases caught during parser construction