diff --git a/CHANGES b/CHANGES
index 8eba463..5becbb0 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,19 @@
 Version 0.4
 -----------
+05/09/2020 Experimental support for EBNF choices.  For example:
+
+               @_('term { PLUS|MINUS term }')
+               def expr(self, p):
+                   lterm = p.term0
+                   for op, rterm in p[1]:
+                       lterm = BinOp(op, lterm, rterm)
+
+           One issue here is how to refer to the value matched by
+           a choice.  There is no unified symbol name to pick, so
+           you have to use a numeric index such as p[1].  In this
+           case, p[1] is a list of all of the repeated items, each
+           represented as a tuple.
+
 05/09/2020 Changed the internal names used for EBNF rules to make
            them a bit easier to debug in the parser.out file.
 
@@ -8,7 +22,7 @@ Version 0.4
            expression list:
 
                @_('expr { COMMA expr }')
-               def exprlist(p):
+               def exprlist(self, p):
                    return [ p.expr0 ] + p.expr1
 
            In this code, the { ... } means zero-or-more repetitions.
@@ -19,7 +33,7 @@ Version 0.4
            An optional value can be enclosed in brackets like this:
 
                @_('VAR NAME [ EQUAL expr ] SEMI')
-               def variable_declaration(p):
+               def variable_declaration(self, p):
                    print(f"Defining {p.NAME}. Initial value={p.expr}")
 
            In this case, all symbols inside [ ... ] either have a value
diff --git a/example/calc/calc.py b/example/calc/calc.py
index 80c192b..cec81e0 100644
--- a/example/calc/calc.py
+++ b/example/calc/calc.py
@@ -3,7 +3,7 @@
 # -----------------------------------------------------------------------------
 
 import sys
-sys.path.append('../..')
+sys.path.insert(0, '../..')
 
 from sly import Lexer, Parser
 
diff --git a/example/calc_ebnf/calc.py b/example/calc_ebnf/calc.py
new file mode 100644
index 0000000..c4cfcd7
--- /dev/null
+++ b/example/calc_ebnf/calc.py
@@ -0,0 +1,101 @@
+# -----------------------------------------------------------------------------
+# calc.py
+# -----------------------------------------------------------------------------
+
+import sys
+sys.path.insert(0, '../..')
+
+from sly import Lexer, Parser
+
+class CalcLexer(Lexer):
+    tokens = { NAME, NUMBER, PLUS, TIMES, MINUS, DIVIDE, ASSIGN, LPAREN, RPAREN }
+    ignore = ' \t'
+
+    # Tokens
+    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
+    NUMBER = r'\d+'
+
+    # Special symbols
+    PLUS = r'\+'
+    MINUS = r'-'
+    TIMES = r'\*'
+    DIVIDE = r'/'
+    ASSIGN = r'='
+    LPAREN = r'\('
+    RPAREN = r'\)'
+
+    # Ignored pattern
+    ignore_newline = r'\n+'
+
+    # Extra action for newlines
+    def ignore_newline(self, t):
+        self.lineno += t.value.count('\n')
+
+    def error(self, t):
+        print("Illegal character '%s'" % t.value[0])
+        self.index += 1
+
+class CalcParser(Parser):
+    tokens = CalcLexer.tokens
+
+    def __init__(self):
+        self.names = { }
+
+    @_('NAME ASSIGN expr')
+    def statement(self, p):
+        self.names[p.NAME] = p.expr
+
+    @_('expr')
+    def statement(self, p):
+        print(p.expr)
+
+    @_('term { PLUS|MINUS term }')
+    def expr(self, p):
+        lval = p.term0
+        for op, rval in p[1]:
+            if op == '+':
+                lval = lval + rval
+            elif op == '-':
+                lval = lval - rval
+        return lval
+
+    @_('factor { TIMES|DIVIDE factor }')
+    def term(self, p):
+        lval = p.factor0
+        for op, rval in p[1]:
+            if op == '*':
+                lval = lval * rval
+            elif op == '/':
+                lval = lval / rval
+        return lval
+
+    @_('MINUS factor')
+    def factor(self, p):
+        return -p.factor
+
+    @_('LPAREN expr RPAREN')
+    def factor(self, p):
+        return p.expr
+
+    @_('NUMBER')
+    def factor(self, p):
+        return int(p.NUMBER)
+
+    @_('NAME')
+    def factor(self, p):
+        try:
+            return self.names[p.NAME]
+        except LookupError:
+            print(f'Undefined name {p.NAME!r}')
+            return 0
+
+if __name__ == '__main__':
+    lexer = CalcLexer()
+    parser = CalcParser()
+    while True:
+        try:
+            text = input('calc > ')
+        except EOFError:
+            break
+        if text:
+            parser.parse(lexer.tokenize(text))
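
Aside on the p[1] value used above: for a rule such as
'term { PLUS|MINUS term }', p[1] is a list of (operator, value) tuples,
one per repetition, so both expr and term reduce to the same left-fold.
A minimal standalone sketch of that fold (OPS and fold are hypothetical
helpers for illustration, not part of sly):

    # Left-fold a list of (op, value) pairs, the shape that p[1]
    # holds for 'term { PLUS|MINUS term }'.
    OPS = {
        '+': lambda a, b: a + b,
        '-': lambda a, b: a - b,
        '*': lambda a, b: a * b,
        '/': lambda a, b: a / b,
    }

    def fold(first, rest):
        for op, val in rest:
            first = OPS[op](first, val)
        return first

    assert fold(1, [('+', 2), ('-', 3), ('+', 4)]) == 4
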
diff --git a/sly/yacc.py b/sly/yacc.py
index c7f0d99..085ed6b 100644
--- a/sly/yacc.py
+++ b/sly/yacc.py
@@ -1582,6 +1582,10 @@ def _collect_grammar_rules(func):
                         syms, prod = _replace_ebnf_repeat(syms)
                         ebnf_prod.extend(prod)
                         break
+                    elif '|' in s:
+                        syms, prod = _replace_ebnf_choice(syms)
+                        ebnf_prod.extend(prod)
+                        break
 
             if syms[1:2] == [':'] or syms[1:2] == ['::=']:
                 grammar.append((func, filename, lineno, syms[0], syms[2:]))
@@ -1598,9 +1602,17 @@ def _replace_ebnf_repeat(syms):
     syms = list(syms)
     first = syms.index('{')
     end = syms.index('}', first)
-    symname, prods = _generate_repeat_rules(syms[first+1:end])
+
+    # Look for choices inside the repetition
+    repeated_syms = syms[first+1:end]
+    if any('|' in sym for sym in repeated_syms):
+        repeated_syms, prods = _replace_ebnf_choice(repeated_syms)
+    else:
+        prods = []
+
+    symname, moreprods = _generate_repeat_rules(repeated_syms)
     syms[first:end+1] = [symname]
-    return syms, prods
+    return syms, prods + moreprods
 
 def _replace_ebnf_optional(syms):
     syms = list(syms)
@@ -1609,7 +1621,19 @@ def _replace_ebnf_optional(syms):
     symname, prods = _generate_optional_rules(syms[first+1:end])
     syms[first:end+1] = [symname]
     return syms, prods
-
+
+def _replace_ebnf_choice(syms):
+    syms = list(syms)
+    newprods = [ ]
+    n = 0
+    while n < len(syms):
+        if '|' in syms[n]:
+            symname, prods = _generate_choice_rules(syms[n].split('|'))
+            syms[n] = symname
+            newprods.extend(prods)
+        n += 1
+    return syms, newprods
+
 # Generate grammar rules for repeated items
 _gencount = 0
 
@@ -1728,6 +1752,31 @@
     productions.extend(_collect_grammar_rules(optional))
     productions.extend(_collect_grammar_rules(optional2))
     return name, productions
+
+def _generate_choice_rules(symbols):
+    '''
+    Symbols is a list of grammar symbols such as [ 'PLUS', 'MINUS' ].
+    This generates code corresponding to the following construction:
+
+        @_('PLUS', 'MINUS')
+        def choice(self, p):
+            return p[0]
+    '''
+    global _gencount
+    _gencount += 1
+    basename = f'_{_gencount}_' + '_'.join(_sanitize_symbols(symbols))
+    name = f'{basename}_choice'
+
+    _ = _decorator
+    productions = [ ]
+
+    def choice(self, p):
+        return p[0]
+
+    choice.__name__ = name
+    choice = _(*symbols)(choice)
+    productions.extend(_collect_grammar_rules(choice))
+    return name, productions
 
 class ParserMetaDict(dict):
     '''
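
Aside on the rewrite implemented above: _replace_ebnf_choice scans a
rule's symbol list and replaces any symbol containing '|' with a
generated nonterminal that gets one production per alternative.  A
minimal standalone sketch of the idea (replace_choice is a hypothetical
stand-in, not sly's actual API; the mutable default mimics _gencount):

    def replace_choice(syms, counter=[0]):
        # ['term', '{', 'PLUS|MINUS', 'term', '}'] becomes
        # ['term', '{', '_1_PLUS_MINUS_choice', 'term', '}'] plus the
        # productions _1_PLUS_MINUS_choice -> PLUS and -> MINUS.
        out, prods = [], []
        for s in syms:
            if '|' in s:
                counter[0] += 1
                name = f"_{counter[0]}_{'_'.join(s.split('|'))}_choice"
                prods.extend((name, [alt]) for alt in s.split('|'))
                out.append(name)
            else:
                out.append(s)
        return out, prods
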
diff --git a/tests/test_ebnf.py b/tests/test_ebnf.py
new file mode 100644
index 0000000..b89e0b7
--- /dev/null
+++ b/tests/test_ebnf.py
@@ -0,0 +1,152 @@
+import pytest
+from sly import Lexer, Parser
+
+class CalcLexer(Lexer):
+    # Set of token names.  This is always required
+    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }
+    literals = { '(', ')' }
+
+    # String containing ignored characters between tokens
+    ignore = ' \t'
+
+    # Regular expression rules for tokens
+    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+    PLUS = r'\+'
+    MINUS = r'-'
+    TIMES = r'\*'
+    DIVIDE = r'/'
+    ASSIGN = r'='
+    COMMA = r','
+
+    @_(r'\d+')
+    def NUMBER(self, t):
+        t.value = int(t.value)
+        return t
+
+    # Ignored text
+    ignore_comment = r'\#.*'
+
+    @_(r'\n+')
+    def newline(self, t):
+        self.lineno += t.value.count('\n')
+
+    def error(self, t):
+        self.errors.append(t.value[0])
+        self.index += 1
+
+    def __init__(self):
+        self.errors = []
+
+
+class CalcParser(Parser):
+    tokens = CalcLexer.tokens
+
+    def __init__(self):
+        self.names = { }
+        self.errors = [ ]
+
+    @_('ID ASSIGN expr')
+    def statement(self, p):
+        self.names[p.ID] = p.expr
+
+    @_('ID "(" [ arglist ] ")"')
+    def statement(self, p):
+        return (p.ID, p.arglist)
+
+    @_('expr { COMMA expr }')
+    def arglist(self, p):
+        return [p.expr0, *p.expr1]
+
+    @_('expr')
+    def statement(self, p):
+        return p.expr
+
+    @_('term { PLUS|MINUS term }')
+    def expr(self, p):
+        lval = p.term0
+        for op, rval in p[1]:
+            if op == '+':
+                lval = lval + rval
+            elif op == '-':
+                lval = lval - rval
+        return lval
+
+    @_('factor { TIMES|DIVIDE factor }')
+    def term(self, p):
+        lval = p.factor0
+        for op, rval in p[1]:
+            if op == '*':
+                lval = lval * rval
+            elif op == '/':
+                lval = lval / rval
+        return lval
+
+    @_('MINUS factor')
+    def factor(self, p):
+        return -p.factor
+
+    @_("'(' expr ')'")
+    def factor(self, p):
+        return p.expr
+
+    @_('NUMBER')
+    def factor(self, p):
+        return int(p.NUMBER)
+
+    @_('ID')
+    def factor(self, p):
+        try:
+            return self.names[p.ID]
+        except LookupError:
+            print(f'Undefined name {p.ID!r}')
+            return 0
+
+    def error(self, tok):
+        self.errors.append(tok)
+
+
+# Test basic parsing and evaluation of expressions
+def test_simple():
+    lexer = CalcLexer()
+    parser = CalcParser()
+
+    result = parser.parse(lexer.tokenize('a = 3 + 4 * (5 + 6)'))
+    assert result is None
+    assert parser.names['a'] == 47
+
+    result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
+    assert result == 47
+
+def test_ebnf():
+    lexer = CalcLexer()
+    parser = CalcParser()
+
+    result = parser.parse(lexer.tokenize('a()'))
+    assert result == ('a', None)
+
+    result = parser.parse(lexer.tokenize('a(2+3)'))
+    assert result == ('a', [5])
+
+    result = parser.parse(lexer.tokenize('a(2+3, 4+5)'))
+    assert result == ('a', [5, 9])
+
+def test_parse_error():
+    lexer = CalcLexer()
+    parser = CalcParser()
+
+    result = parser.parse(lexer.tokenize('a 123 4 + 5'))
+    assert result == 9
+    assert len(parser.errors) == 1
+    assert parser.errors[0].type == 'NUMBER'
+    assert parser.errors[0].value == 123
+
+# TO DO: Add tests
+# - error productions
+# - embedded actions
+# - lineno tracking
+# - various error cases caught during parser construction
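
One more test that could accompany the TO DO list above (a sketch,
assuming the CalcLexer/CalcParser classes defined in this test file):
mix both alternatives of the '{ PLUS|MINUS term }' choice in a single
input to check that the generated choice rule folds left-to-right.

    def test_ebnf_choice_mixed():
        lexer = CalcLexer()
        parser = CalcParser()

        # Left-associative fold: ((1 + 2) - 3) + 4
        result = parser.parse(lexer.tokenize('1 + 2 - 3 + 4'))
        assert result == 4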