Experimental EBNF features added

David Beazley 2020-03-06 20:58:48 -06:00
parent 9944b6239c
commit a2cdf52d0f
4 changed files with 238 additions and 10 deletions

CHANGES

@@ -1,3 +1,32 @@
Version 0.5
-----------
03/06/2020 Added experimental support for EBNF repetition and optional
           syntax. For example, here is a rule for a comma-separated
           expression list:

               @_('expr { COMMA expr }')
               def exprlist(p):
                   return [ p.expr ] + [e.expr for e in p[1]]

           In this code, the { ... } means zero-or-more repetitions.
           It produces a list of matches that must be accessed by
           position index (p[1] in this example; p[0] is 'expr').
           The elements of the list are named tuples with attribute
           names that match the enclosed grammar symbols (e.g., e.expr
           in the example).
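
           As a further sketch (not from this commit), a repetition
           that encloses several symbols yields one named tuple per
           match, with one field per enclosed symbol. NAME and ASSIGN
           are hypothetical tokens here:

               @_('NAME ASSIGN expr { COMMA NAME ASSIGN expr }')
               def assignments(p):
                   # each element e of p[1] has fields
                   # (COMMA, NAME, ASSIGN, expr)
                   return [(p.NAME, p.expr)] + [(e.NAME, e.expr) for e in p[1]]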
           An optional value can be enclosed in brackets like this:

               @_('NAME LPAREN [ exprlist ] RPAREN')
               def function_call(p):
                   args = p[2] if p[2] else []
                   name = p.NAME
                   print('Calling:', name, args)

           In this case, p[2] contains the optional value. If not present,
           the value is None. If present, it is a tuple of values
           or a single value (if only one symbol).
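
           For example (a hypothetical rule, assuming IF, THEN, and
           ELSE tokens and a stmt rule are defined), an optional group
           with two enclosed symbols produces a named tuple:

               @_('IF expr THEN stmt [ ELSE stmt ]')
               def if_stmt(p):
                   # p[4] is None if the else-clause is absent;
                   # otherwise it is a named tuple with fields
                   # (ELSE, stmt)
                   if p[4] is None:
                       return ('if', p.expr, p.stmt)
                   else:
                       return ('if-else', p.expr, p.stmt, p[4].stmt)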
Version 0.4
-----------
04/09/2019 Fixed very mysterious error message that resulted if you

sly/lex.py

@@ -360,6 +360,7 @@ class Lexer(metaclass=LexerMeta):
    def tokenize(self, text, lineno=1, index=0):
        _ignored_tokens = _master_re = _ignore = _token_funcs = _literals = _remapping = None

        # --- Support for state changes
        def _set_state(cls):
            nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping
            _ignored_tokens = cls._ignored_tokens
@@ -371,8 +372,26 @@
        self.__set_state = _set_state
        _set_state(type(self))
        self.text = text

        # --- Support for backtracking
        _mark_stack = []
        def _mark():
            _mark_stack.append((type(self), index, lineno))
        self.mark = _mark

        def _accept():
            _mark_stack.pop()
        self.accept = _accept

        def _reject():
            nonlocal index, lineno
            cls, index, lineno = _mark_stack[-1]
            _set_state(cls)
        self.reject = _reject

        # --- Main tokenization function
        self.text = text
        try:
            while True:
                try:
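
The mark/accept/reject closures are attached to the lexer instance while
tokenize() is running, which enables speculative lexing. A minimal usage
sketch (hypothetical, not part of this commit; note that the generator
must have yielded at least once before the hooks exist):

    from sly import Lexer

    class ToyLexer(Lexer):
        tokens = { NAME, NUMBER }
        ignore = ' '
        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NUMBER = r'\d+'

    lexer = ToyLexer()
    stream = lexer.tokenize('abc 12 def')

    print(next(stream).value)   # 'abc' -- generator is now running
    lexer.mark()                # remember lexer state and input position
    print(next(stream).value)   # '12' (speculative)
    lexer.reject()              # roll back to the mark
    print(next(stream).value)   # '12' again, re-lexed from the mark
    lexer.accept()              # commit: pop the mark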

sly/yacc.py

@@ -33,7 +33,7 @@
import sys
import inspect
from collections import OrderedDict, defaultdict
from collections import OrderedDict, defaultdict, namedtuple
__all__ = [ 'Parser' ]
@@ -1551,14 +1551,166 @@ def _collect_grammar_rules(func):
        lineno = unwrapped.__code__.co_firstlineno
        for rule, lineno in zip(func.rules, range(lineno+len(func.rules)-1, 0, -1)):
            syms = rule.split()
            ebnf_prod = []
            while ('{' in syms) or ('[' in syms):
                for s in syms:
                    if s == '[':
                        syms, prod = _replace_ebnf_optional(syms)
                        ebnf_prod.extend(prod)
                        break
                    elif s == '{':
                        syms, prod = _replace_ebnf_repeat(syms)
                        ebnf_prod.extend(prod)
                        break
            if syms[1:2] == [':'] or syms[1:2] == ['::=']:
                grammar.append((func, filename, lineno, syms[0], syms[2:]))
            else:
                grammar.append((func, filename, lineno, prodname, syms))
            grammar.extend(ebnf_prod)
        func = getattr(func, 'next_func', None)
    return grammar
# Replace EBNF repetition
def _replace_ebnf_repeat(syms):
    syms = list(syms)
    first = syms.index('{')
    end = syms.index('}', first)
    symname, prods = _generate_repeat_rules(syms[first+1:end])
    syms[first:end+1] = [symname]
    return syms, prods

def _replace_ebnf_optional(syms):
    syms = list(syms)
    first = syms.index('[')
    end = syms.index(']', first)
    symname, prods = _generate_optional_rules(syms[first+1:end])
    syms[first:end+1] = [symname]
    return syms, prods
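
To make the rewriting concrete, here is a hypothetical trace (the
generated symbol names assume _gencount starts at zero):

    syms = 'exprlist : expr { COMMA expr }'.split()
    # ['exprlist', ':', 'expr', '{', 'COMMA', 'expr', '}']

    syms, prods = _replace_ebnf_repeat(syms)
    # syms is now ['exprlist', ':', 'expr', '_1_repeat'] and prods
    # holds the generated productions:
    #
    #   _1_repeat : _1_items
    #   _1_repeat :
    #   _1_items  : _1_items _1_item
    #   _1_items  : _1_item
    #   _1_item   : COMMA expr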
# Generate grammar rules for repeated items
_gencount = 0

def _unique_names(names):
    from collections import defaultdict, Counter
    counts = Counter(names)
    indices = defaultdict(int)
    newnames = []
    for name in names:
        if counts[name] == 1:
            newnames.append(name)
        else:
            newnames.append(f'{name}{indices[name]}')
            indices[name] += 1
    return newnames
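
For instance (a quick illustration, not from the commit):

    _unique_names(['COMMA', 'expr'])          # ['COMMA', 'expr']
    _unique_names(['expr', 'COMMA', 'expr'])  # ['expr0', 'COMMA', 'expr1']

Disambiguating duplicated symbols this way keeps the namedtuple field
names below valid and distinct.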
def _generate_repeat_rules(symbols):
    '''
    Symbols is a list of grammar symbols [ symbols ]. This
    generates code corresponding to the following grammar construction:

       @_('repeat : many')
       def repeat(self, p):
           return p.many

       @_('repeat :')
       def repeat(self, p):
           return []

       @_('many : many symbols')
       def many(self, p):
           p.many.append(symbols)
           return p.many

       @_('many : symbols')
       def many(self, p):
           return [ p.symbols ]
    '''
    global _gencount
    _gencount += 1
    name = f'_{_gencount}_repeat'
    oname = f'_{_gencount}_items'
    iname = f'_{_gencount}_item'
    symtext = ' '.join(symbols)

    productions = [ ]
    _ = _decorator

    @_(f'{name} : {oname}')
    def repeat(self, p):
        return getattr(p, oname)

    @_(f'{name} : ')
    def repeat2(self, p):
        return []
    productions.extend(_collect_grammar_rules(repeat))
    productions.extend(_collect_grammar_rules(repeat2))

    @_(f'{oname} : {oname} {iname}')
    def many(self, p):
        items = getattr(p, oname)
        items.append(getattr(p, iname))
        return items

    @_(f'{oname} : {iname}')
    def many2(self, p):
        return [ getattr(p, iname) ]
    productions.extend(_collect_grammar_rules(many))
    productions.extend(_collect_grammar_rules(many2))

    utuple = namedtuple('syms', _unique_names(symbols))

    @_(f'{iname} : {symtext}')
    def item(self, p):
        if len(p) == 1:
            return p[0]
        else:
            return utuple(*p)
    productions.extend(_collect_grammar_rules(item))
    return name, productions
def _generate_optional_rules(symbols):
    '''
    Symbols is a list of grammar symbols [ symbols ]. This
    generates code corresponding to the following grammar construction:

       @_('optional : symbols')
       def optional(self, p):
           return p.symbols

       @_('optional :')
       def optional(self, p):
           return None
    '''
    global _gencount
    _gencount += 1
    name = f'_{_gencount}_optional'
    symtext = ' '.join(symbols)

    productions = [ ]
    _ = _decorator

    utuple = namedtuple('syms', _unique_names(symbols))

    @_(f'{name} : {symtext}')
    def optional(self, p):
        if len(p) == 1:
            return p[0]
        else:
            return utuple(*p)

    @_(f'{name} : ')
    def optional2(self, p):
        return None
    productions.extend(_collect_grammar_rules(optional))
    productions.extend(_collect_grammar_rules(optional2))
    return name, productions
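
A corresponding sketch for the optional case (hypothetical trace,
continuing the numbering from the repetition example above):

    name, prods = _generate_optional_rules(['exprlist'])
    # name == '_2_optional'; the generated rules reduce either to the
    # exprlist value itself (a single enclosed symbol) or to None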
class ParserMetaDict(dict):
    '''
    Dictionary that allows decorated grammar rule functions to be overloaded
@@ -1576,17 +1728,24 @@ class ParserMetaDict(dict):
        else:
            return super().__getitem__(key)

def _decorator(rule, *extra):
    rules = [rule, *extra]
    def decorate(func):
        func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        return func
    return decorate
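
Hoisting the rule decorator to module level appears to be what lets the
generated helper rules in _generate_repeat_rules and
_generate_optional_rules reuse it (via _ = _decorator) outside of a
Parser class body, where ParserMetaDict would normally supply _.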
class ParserMeta(type):
    @classmethod
    def __prepare__(meta, *args, **kwargs):
        d = ParserMetaDict()
        def _(rule, *extra):
            rules = [rule, *extra]
            def decorate(func):
                func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
                return func
            return decorate
        d['_'] = _
        # def _(rule, *extra):
        #     rules = [rule, *extra]
        #     def decorate(func):
        #         func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        #         return func
        #     return decorate
        d['_'] = _decorator
        return d

    def __new__(meta, clsname, bases, attributes):

tests/test_parser.py

@@ -3,7 +3,7 @@ from sly import Lexer, Parser
class CalcLexer(Lexer):
    # Set of token names. This is always required
    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN }
    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }

    literals = { '(', ')' }

    # String containing ignored characters between tokens
@@ -16,6 +16,7 @@ class CalcLexer(Lexer):
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    COMMA = r','

    @_(r'\d+')
    def NUMBER(self, t):
@@ -53,6 +54,14 @@ class CalcParser(Parser):
    def statement(self, p):
        self.names[p.ID] = p.expr

    @_('ID "(" [ arglist ] ")"')
    def statement(self, p):
        return (p.ID, p[2])

    @_('expr { COMMA expr }')
    def arglist(self, p):
        return [p.expr, *[e.expr for e in p[1]]]

    @_('expr')
    def statement(self, p):
        return p.expr
@@ -109,6 +118,18 @@ def test_simple():
    result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
    assert result == 47

def test_ebnf():
    lexer = CalcLexer()
    parser = CalcParser()
    result = parser.parse(lexer.tokenize('a()'))
    assert result == ('a', None)

    result = parser.parse(lexer.tokenize('a(2+3)'))
    assert result == ('a', [5])

    result = parser.parse(lexer.tokenize('a(2+3, 4+5)'))
    assert result == ('a', [5, 9])
def test_parse_error():
    lexer = CalcLexer()
    parser = CalcParser()