Added EBNF choice handling

David Beazley 2020-05-09 12:24:33 -05:00
parent ab75a58b10
commit 1f87ddaf39
5 changed files with 322 additions and 6 deletions

CHANGES

@@ -1,5 +1,19 @@
Version 0.4
-----------
05/09/2020 Experimental support for EBNF choices.  For example:

               @_('term { PLUS|MINUS term }')
               def expr(self, p):
                   lterm = p.term0
                   for op, rterm in p[1]:
                       lterm = BinOp(op, lterm, rterm)

           One issue here is how to refer to the matched choice value.
           There is no unified symbol name to pick, so you have to
           access it with a numeric index such as p[1].  In this case,
           p[1] is a list of all of the repeated items (represented
           as tuples).

05/09/2020 Changed the internal names used for EBNF rules to make them
           a bit easier to debug in the parser.out file.
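
To make the p[1] shape concrete, here is a small worked illustration
(not part of the commit; values assume the expr rule quoted above).
For the input 1 + 2 - 3 the repetition matches twice:

    # Hypothetical trace for 'term { PLUS|MINUS term }' on '1 + 2 - 3'
    # p.term0 is 1; p[1] is [('+', 2), ('-', 3)]
    lterm = 1
    for op, rterm in [('+', 2), ('-', 3)]:
        lterm = BinOp(op, lterm, rterm)   # builds BinOp('-', BinOp('+', 1, 2), 3)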
@@ -8,7 +22,7 @@ Version 0.4
            expression list:

                @_('expr { COMMA expr }')
-               def exprlist(p):
+               def exprlist(self, p):
                    return [ p.expr0 ] + p.expr1

            In this code, the { ... } means zero-or-more repetitions.
@@ -19,7 +33,7 @@ Version 0.4
            An optional value can be enclosed in brackets like this:

                @_('VAR NAME [ EQUAL expr ] SEMI')
-               def variable_declaration(p):
+               def variable_declaration(self, p):
                    print(f"Defining {p.NAME}. Initial value={p.expr}")

            In this case, all symbols inside [ ... ] either have a value
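
For reference (standard SLY behavior, cut off by this hunk boundary):
a symbol inside [ ... ] is assigned None when the optional part is
absent, so the action can test it directly:

    # Illustration of optional values for 'VAR NAME [ EQUAL expr ] SEMI'
    # 'var x;'      -> p.expr is None
    # 'var x = 42;' -> p.expr holds the value computed for expr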


@@ -3,7 +3,7 @@
 # -----------------------------------------------------------------------------
 import sys
-sys.path.append('../..')
+sys.path.insert(0, '../..')
 from sly import Lexer, Parser

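Presumably the switch from append to insert(0, ...) ensures the in-tree
copy of sly is found ahead of any version already installed in
site-packages when running the examples.
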
example/calc_ebnf/calc.py Normal file

@@ -0,0 +1,101 @@
# -----------------------------------------------------------------------------
# calc.py
# -----------------------------------------------------------------------------

import sys
sys.path.insert(0, '../..')

from sly import Lexer, Parser

class CalcLexer(Lexer):
    tokens = { NAME, NUMBER, PLUS, TIMES, MINUS, DIVIDE, ASSIGN, LPAREN, RPAREN }
    ignore = ' \t'

    # Tokens
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
    NUMBER = r'\d+'

    # Special symbols
    PLUS = r'\+'
    MINUS = r'-'
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    LPAREN = r'\('
    RPAREN = r'\)'

    # Ignored pattern
    ignore_newline = r'\n+'

    # Extra action for newlines
    def ignore_newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, t):
        print("Illegal character '%s'" % t.value[0])
        self.index += 1

class CalcParser(Parser):
    tokens = CalcLexer.tokens

    def __init__(self):
        self.names = { }

    @_('NAME ASSIGN expr')
    def statement(self, p):
        self.names[p.NAME] = p.expr

    @_('expr')
    def statement(self, p):
        print(p.expr)

    @_('term { PLUS|MINUS term }')
    def expr(self, p):
        lval = p.term0
        for op, rval in p[1]:
            if op == '+':
                lval = lval + rval
            elif op == '-':
                lval = lval - rval
        return lval

    @_('factor { TIMES|DIVIDE factor }')
    def term(self, p):
        lval = p.factor0
        for op, rval in p[1]:
            if op == '*':
                lval = lval * rval
            elif op == '/':
                lval = lval / rval
        return lval

    @_('MINUS factor')
    def factor(self, p):
        return -p.factor

    @_('LPAREN expr RPAREN')
    def factor(self, p):
        return p.expr

    @_('NUMBER')
    def factor(self, p):
        return int(p.NUMBER)

    @_('NAME')
    def factor(self, p):
        try:
            return self.names[p.NAME]
        except LookupError:
            print(f'Undefined name {p.NAME!r}')
            return 0

if __name__ == '__main__':
    lexer = CalcLexer()
    parser = CalcParser()

    while True:
        try:
            text = input('calc > ')
        except EOFError:
            break
        if text:
            parser.parse(lexer.tokenize(text))

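A plausible session with this example (illustrative output, not part of
the commit):

    $ python calc.py
    calc > 2 + 3 * 4
    14
    calc > x = 10 - 2 - 3
    calc > x
    5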

@@ -1582,6 +1582,10 @@ def _collect_grammar_rules(func):
                 syms, prod = _replace_ebnf_repeat(syms)
                 ebnf_prod.extend(prod)
                 break
+            elif '|' in s:
+                syms, prod = _replace_ebnf_choice(syms)
+                ebnf_prod.extend(prod)
+                break

     if syms[1:2] == [':'] or syms[1:2] == ['::=']:
         grammar.append((func, filename, lineno, syms[0], syms[2:]))
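
To sketch what this dispatch sees (values illustrative): the rule spec
is split into a list of symbol strings, and a bare '|' inside one of
them marks a choice:

    # Hypothetical syms for @_('term { PLUS|MINUS term }') on rule expr
    syms = ['expr', ':', 'term', '{', 'PLUS|MINUS', 'term', '}']
    # Here '{' is seen first, so the repeat branch runs (and now handles
    # the embedded choice itself); a brace-free spec such as
    # 'term PLUS|MINUS term' would take the new elif '|' in s branch.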
@@ -1598,9 +1602,17 @@ def _replace_ebnf_repeat(syms):
     syms = list(syms)
     first = syms.index('{')
     end = syms.index('}', first)
-    symname, prods = _generate_repeat_rules(syms[first+1:end])
+
+    # Look for choices inside
+    repeated_syms = syms[first+1:end]
+    if any('|' in sym for sym in repeated_syms):
+        repeated_syms, prods = _replace_ebnf_choice(repeated_syms)
+    else:
+        prods = []
+    symname, moreprods = _generate_repeat_rules(repeated_syms)
     syms[first:end+1] = [symname]
-    return syms, prods
+    return syms, prods + moreprods

 def _replace_ebnf_optional(syms):
     syms = list(syms)
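
A rough trace of the new path for @_('factor { TIMES|DIVIDE factor }')
(generated names are illustrative):

    # repeated_syms starts as ['TIMES|DIVIDE', 'factor']
    # _replace_ebnf_choice turns it into ['_1_times_divide_choice', 'factor']
    # and returns the productions for that new choice nonterminal;
    # _generate_repeat_rules then wraps the rewritten pair in the usual
    # zero-or-more machinery, and both sets of productions are returned.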
@@ -1609,7 +1621,19 @@ def _replace_ebnf_optional(syms):
     symname, prods = _generate_optional_rules(syms[first+1:end])
     syms[first:end+1] = [symname]
     return syms, prods

+def _replace_ebnf_choice(syms):
+    syms = list(syms)
+    newprods = [ ]
+    n = 0
+    while n < len(syms):
+        if '|' in syms[n]:
+            symname, prods = _generate_choice_rules(syms[n].split('|'))
+            syms[n] = symname
+            newprods.extend(prods)
+        n += 1
+    return syms, newprods

 # Generate grammar rules for repeated items
 _gencount = 0
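
In isolation the helper only rewrites the symbols containing '|' and
leaves everything else alone (values illustrative):

    syms, prods = _replace_ebnf_choice(['term', 'PLUS|MINUS', 'term'])
    # syms  -> ['term', '_1_plus_minus_choice', 'term']
    # prods -> the grammar rules produced by _generate_choice_rules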
@@ -1728,6 +1752,31 @@ def _generate_optional_rules(symbols):
     productions.extend(_collect_grammar_rules(optional))
     productions.extend(_collect_grammar_rules(optional2))
     return name, productions

+def _generate_choice_rules(symbols):
+    '''
+    Symbols is a list of grammar symbols such as [ 'PLUS', 'MINUS' ].
+    This generates code corresponding to the following construction:
+
+        @_('PLUS', 'MINUS')
+        def choice(self, p):
+            return p[0]
+    '''
+    global _gencount
+    _gencount += 1
+    basename = f'_{_gencount}_' + '_'.join(_sanitize_symbols(symbols))
+    name = f'{basename}_choice'
+
+    _ = _decorator
+    productions = [ ]
+
+    def choice(self, p):
+        return p[0]
+
+    choice.__name__ = name
+    choice = _(*symbols)(choice)
+    productions.extend(_collect_grammar_rules(choice))
+    return name, productions

 class ParserMetaDict(dict):
     '''

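Written out by hand, the synthesized rule would look roughly like this
(the mangled name is illustrative):

    @_('PLUS', 'MINUS')
    def _1_plus_minus_choice(self, p):
        return p[0]

Returning p[0] is what makes each repeated item in the earlier examples
carry the matched token's value as op.
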
tests/test_ebnf.py Normal file

@@ -0,0 +1,152 @@
import pytest
from sly import Lexer, Parser

class CalcLexer(Lexer):
    # Set of token names. This is always required
    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }
    literals = { '(', ')' }

    # String containing ignored characters between tokens
    ignore = ' \t'

    # Regular expression rules for tokens
    ID     = r'[a-zA-Z_][a-zA-Z0-9_]*'
    PLUS   = r'\+'
    MINUS  = r'-'
    TIMES  = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    COMMA  = r','

    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t

    # Ignored text
    ignore_comment = r'\#.*'

    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, t):
        self.errors.append(t.value[0])
        self.index += 1

    def __init__(self):
        self.errors = []

class CalcParser(Parser):
    tokens = CalcLexer.tokens

    def __init__(self):
        self.names = { }
        self.errors = [ ]

    @_('ID ASSIGN expr')
    def statement(self, p):
        self.names[p.ID] = p.expr

    @_('ID "(" [ arglist ] ")"')
    def statement(self, p):
        return (p.ID, p.arglist)

    @_('expr { COMMA expr }')
    def arglist(self, p):
        return [p.expr0, *p.expr1]

    @_('expr')
    def statement(self, p):
        return p.expr

    @_('term { PLUS|MINUS term }')
    def expr(self, p):
        lval = p.term0
        for op, rval in p[1]:
            if op == '+':
                lval = lval + rval
            elif op == '-':
                lval = lval - rval
        return lval

    @_('factor { TIMES|DIVIDE factor }')
    def term(self, p):
        lval = p.factor0
        for op, rval in p[1]:
            if op == '*':
                lval = lval * rval
            elif op == '/':
                lval = lval / rval
        return lval

    @_('MINUS factor')
    def factor(self, p):
        return -p.factor

    @_("'(' expr ')'")
    def factor(self, p):
        return p.expr

    @_('NUMBER')
    def factor(self, p):
        return int(p.NUMBER)

    @_('ID')
    def factor(self, p):
        try:
            return self.names[p.ID]
        except LookupError:
            print(f'Undefined name {p.ID!r}')
            return 0

    def error(self, tok):
        self.errors.append(tok)

# Test basic recognition of various tokens and literals
def test_simple():
    lexer = CalcLexer()
    parser = CalcParser()

    result = parser.parse(lexer.tokenize('a = 3 + 4 * (5 + 6)'))
    assert result == None
    assert parser.names['a'] == 47

    result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
    assert result == 47

def test_ebnf():
    lexer = CalcLexer()
    parser = CalcParser()

    result = parser.parse(lexer.tokenize('a()'))
    assert result == ('a', None)

    result = parser.parse(lexer.tokenize('a(2+3)'))
    assert result == ('a', [5])

    result = parser.parse(lexer.tokenize('a(2+3, 4+5)'))
    assert result == ('a', [5, 9])

def test_parse_error():
    lexer = CalcLexer()
    parser = CalcParser()

    result = parser.parse(lexer.tokenize('a 123 4 + 5'))
    assert result == 9
    assert len(parser.errors) == 1
    assert parser.errors[0].type == 'NUMBER'
    assert parser.errors[0].value == 123

# TO DO: Add tests
# - error productions
# - embedded actions
# - lineno tracking
# - various error cases caught during parser construction
# - various error cases caught during parser construction