Experimental EBNF features added

David Beazley 2020-03-06 20:58:48 -06:00
parent 9944b6239c
commit a2cdf52d0f
4 changed files with 238 additions and 10 deletions

CHANGES

@@ -1,3 +1,32 @@
Version 0.5
-----------
03/06/2020 Added experimental support for EBNF repetition and optional
           syntax.  For example, here is a rule for a comma-separated
           expression list:

               @_('expr { COMMA expr }')
               def exprlist(self, p):
                   return [ p.expr ] + [e.expr for e in p[1]]

           In this code, the { ... } means zero-or-more repetitions.
           It produces a list of matches that must be accessed by
           position index (p[1] in this example; p[0] is 'expr').
           The elements of the list are named tuples with attribute
           names that match the enclosed grammar symbols (e.g., e.expr
           in the example).

           An optional value can be enclosed in brackets like this:

               @_('NAME LPAREN [ exprlist ] RPAREN')
               def function_call(self, p):
                   args = p[2] if p[2] else []
                   name = p.NAME
                   print('Calling:', name, args)

           In this case, p[2] contains the optional value.  If not
           present, the value is None.  If present, it is a named
           tuple of values, or a single value if only one symbol
           is enclosed.
Version 0.4
-----------
04/09/2019 Fixed very mysterious error message that resulted if you
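
As a minimal end-to-end sketch of the 0.5 syntax described in the CHANGES
entry above, here is a complete parser that uses both { ... } and [ ... ].
The lexer, token names, and grammar are illustrative only and are not part
of this commit:

    from sly import Lexer, Parser

    class FuncLexer(Lexer):
        # Hypothetical lexer used only for this illustration
        tokens = { NAME, NUMBER, LPAREN, RPAREN, COMMA }
        ignore = ' \t'
        NAME   = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NUMBER = r'\d+'
        LPAREN = r'\('
        RPAREN = r'\)'
        COMMA  = r','

    class FuncParser(Parser):
        tokens = FuncLexer.tokens

        # [ exprlist ] is optional; p[2] is None when no arguments are given
        @_('NAME LPAREN [ exprlist ] RPAREN')
        def call(self, p):
            return (p.NAME, p[2] if p[2] else [])

        # { COMMA expr } matches zero or more times; p[1] is a list of
        # named tuples with COMMA and expr attributes
        @_('expr { COMMA expr }')
        def exprlist(self, p):
            return [p.expr, *[e.expr for e in p[1]]]

        @_('NUMBER')
        def expr(self, p):
            return int(p.NUMBER)

    # FuncParser().parse(FuncLexer().tokenize('f(1, 2, 3)')) -> ('f', [1, 2, 3])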


@@ -360,6 +360,7 @@ class Lexer(metaclass=LexerMeta):
    def tokenize(self, text, lineno=1, index=0):
        _ignored_tokens = _master_re = _ignore = _token_funcs = _literals = _remapping = None

        # --- Support for state changes
        def _set_state(cls):
            nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping
            _ignored_tokens = cls._ignored_tokens
@@ -371,8 +372,26 @@ class Lexer(metaclass=LexerMeta):
        self.__set_state = _set_state
        _set_state(type(self))

        # --- Support for backtracking
        _mark_stack = []
        def _mark():
            _mark_stack.append((type(self), index, lineno))
        self.mark = _mark

        def _accept():
            _mark_stack.pop()
        self.accept = _accept

        def _reject():
            nonlocal index, lineno
            cls, index, lineno = _mark_stack[-1]
            _set_state(cls)
        self.reject = _reject

        # --- Main tokenization function
        self.text = text
        try:
            while True:
                try:
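
The new mark()/accept()/reject() hooks let a caller checkpoint the tokenizer
and roll back both the input position and the current lexer state.  The
commit itself contains no consumer of this API, so the following is only a
sketch of how speculative scanning might use it; the WordLexer class and the
input string are hypothetical:

    from sly import Lexer

    class WordLexer(Lexer):
        # Hypothetical lexer used only for this illustration
        tokens = { NAME, NUMBER }
        ignore = ' '
        NAME   = r'[a-zA-Z_]+'
        NUMBER = r'\d+'

    lexer = WordLexer()
    tokens = lexer.tokenize('alpha 1 2 beta')

    first = next(tokens)      # mark/accept/reject are installed once tokenize() starts running
    lexer.mark()              # checkpoint: input index, line number, and lexer state
    ahead = [next(tokens), next(tokens)]   # speculatively scan the next two tokens

    if all(t.type == 'NUMBER' for t in ahead):
        lexer.accept()        # keep the speculative scan; discard the checkpoint
    else:
        lexer.reject()        # roll back; the next token is re-scanned from the checkpoint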


@@ -33,7 +33,7 @@
import sys
import inspect
from collections import OrderedDict, defaultdict, namedtuple

__all__ = [ 'Parser' ]
@@ -1551,14 +1551,166 @@ def _collect_grammar_rules(func):
        lineno = unwrapped.__code__.co_firstlineno
        for rule, lineno in zip(func.rules, range(lineno+len(func.rules)-1, 0, -1)):
            syms = rule.split()
            ebnf_prod = []
            while ('{' in syms) or ('[' in syms):
                for s in syms:
                    if s == '[':
                        syms, prod = _replace_ebnf_optional(syms)
                        ebnf_prod.extend(prod)
                        break
                    elif s == '{':
                        syms, prod = _replace_ebnf_repeat(syms)
                        ebnf_prod.extend(prod)
                        break
            if syms[1:2] == [':'] or syms[1:2] == ['::=']:
                grammar.append((func, filename, lineno, syms[0], syms[2:]))
            else:
                grammar.append((func, filename, lineno, prodname, syms))
            grammar.extend(ebnf_prod)
        func = getattr(func, 'next_func', None)
    return grammar
# Replace EBNF repetition
def _replace_ebnf_repeat(syms):
    syms = list(syms)
    first = syms.index('{')
    end = syms.index('}', first)
    symname, prods = _generate_repeat_rules(syms[first+1:end])
    syms[first:end+1] = [symname]
    return syms, prods

def _replace_ebnf_optional(syms):
    syms = list(syms)
    first = syms.index('[')
    end = syms.index(']', first)
    symname, prods = _generate_optional_rules(syms[first+1:end])
    syms[first:end+1] = [symname]
    return syms, prods
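
# For example (the '_1_...' name is illustrative; the actual prefix comes
# from a global counter), rewriting a split rule works like this:
#
#     _replace_ebnf_repeat(['exprlist', ':', 'expr', '{', 'COMMA', 'expr', '}'])
#     # -> (['exprlist', ':', 'expr', '_1_repeat'], [generated productions])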
# Generate grammar rules for repeated items
_gencount = 0

def _unique_names(names):
    from collections import defaultdict, Counter
    counts = Counter(names)
    indices = defaultdict(int)
    newnames = []
    for name in names:
        if counts[name] == 1:
            newnames.append(name)
        else:
            newnames.append(f'{name}{indices[name]}')
            indices[name] += 1
    return newnames
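
# For example, _unique_names(['expr', 'COMMA', 'expr']) returns
# ['expr0', 'COMMA', 'expr1'], so repeated grammar symbols get distinct
# attribute names in the generated named tuple.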
def _generate_repeat_rules(symbols):
    '''
    Symbols is a list of grammar symbols [ symbols ].  This
    generates code corresponding to these grammar constructions:

       @('repeat : many')
       def repeat(self, p):
           return p.many

       @('repeat :')
       def repeat(self, p):
           return []

       @('many : many symbols')
       def many(self, p):
           p.many.append(symbols)
           return p.many

       @('many : symbols')
       def many(self, p):
           return [ p.symbols ]
    '''
    global _gencount
    _gencount += 1
    name = f'_{_gencount}_repeat'
    oname = f'_{_gencount}_items'
    iname = f'_{_gencount}_item'
    symtext = ' '.join(symbols)

    productions = [ ]
    _ = _decorator

    @_(f'{name} : {oname}')
    def repeat(self, p):
        return getattr(p, oname)

    @_(f'{name} : ')
    def repeat2(self, p):
        return []
    productions.extend(_collect_grammar_rules(repeat))
    productions.extend(_collect_grammar_rules(repeat2))

    @_(f'{oname} : {oname} {iname}')
    def many(self, p):
        items = getattr(p, oname)
        items.append(getattr(p, iname))
        return items

    @_(f'{oname} : {iname}')
    def many2(self, p):
        return [ getattr(p, iname) ]
    productions.extend(_collect_grammar_rules(many))
    productions.extend(_collect_grammar_rules(many2))

    utuple = namedtuple('syms', _unique_names(symbols))

    @_(f'{iname} : {symtext}')
    def item(self, p):
        if len(p) == 1:
            return p[0]
        else:
            return utuple(*p)
    productions.extend(_collect_grammar_rules(item))
    return name, productions
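
# As a concrete illustration (the '_1_...' names depend on the counter),
# _generate_repeat_rules(['COMMA', 'expr']) returns '_1_repeat' plus
# productions equivalent to:
#
#     _1_repeat : _1_items
#               | <empty>
#     _1_items  : _1_items _1_item
#               | _1_item
#     _1_item   : COMMA expr
#
# Each _1_item is a named tuple with COMMA and expr attributes.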
def _generate_optional_rules(symbols):
    '''
    Symbols is a list of grammar symbols [ symbols ].  This
    generates code corresponding to these grammar constructions:

       @('optional : symbols')
       def optional(self, p):
           return p.symbols

       @('optional :')
       def optional(self, p):
           return None
    '''
    global _gencount
    _gencount += 1
    name = f'_{_gencount}_optional'
    symtext = ' '.join(symbols)

    productions = [ ]
    _ = _decorator

    utuple = namedtuple('syms', _unique_names(symbols))

    @_(f'{name} : {symtext}')
    def optional(self, p):
        if len(p) == 1:
            return p[0]
        else:
            return utuple(*p)

    @_(f'{name} : ')
    def optional2(self, p):
        return None
    productions.extend(_collect_grammar_rules(optional))
    productions.extend(_collect_grammar_rules(optional2))
    return name, productions
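
# For instance (again with an illustrative counter value),
# _generate_optional_rules(['exprlist']) returns '_2_optional' plus
# productions equivalent to:
#
#     _2_optional : exprlist
#                 | <empty>
#
# The empty alternative yields None, which is why a rule using [ ... ]
# sees None when the optional part is absent.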
class ParserMetaDict(dict):
    '''
    Dictionary that allows decorated grammar rule functions to be overloaded
@@ -1576,17 +1728,24 @@ class ParserMetaDict(dict):
        else:
            return super().__getitem__(key)

def _decorator(rule, *extra):
    rules = [rule, *extra]
    def decorate(func):
        func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        return func
    return decorate

class ParserMeta(type):
    @classmethod
    def __prepare__(meta, *args, **kwargs):
        d = ParserMetaDict()
        # def _(rule, *extra):
        #     rules = [rule, *extra]
        #     def decorate(func):
        #         func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        #         return func
        #     return decorate
        d['_'] = _decorator
        return d

    def __new__(meta, clsname, bases, attributes):


@@ -3,7 +3,7 @@ from sly import Lexer, Parser
class CalcLexer(Lexer):
    # Set of token names.  This is always required
    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }

    literals = { '(', ')' }

    # String containing ignored characters between tokens
@@ -16,6 +16,7 @@ class CalcLexer(Lexer):
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    COMMA = r','

    @_(r'\d+')
    def NUMBER(self, t):
@@ -53,6 +54,14 @@ class CalcParser(Parser):
    def statement(self, p):
        self.names[p.ID] = p.expr

    @_('ID "(" [ arglist ] ")"')
    def statement(self, p):
        return (p.ID, p[2])

    @_('expr { COMMA expr }')
    def arglist(self, p):
        return [p.expr, *[e.expr for e in p[1]]]

    @_('expr')
    def statement(self, p):
        return p.expr
@@ -109,6 +118,18 @@ def test_simple():
    result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
    assert result == 47

def test_ebnf():
    lexer = CalcLexer()
    parser = CalcParser()

    result = parser.parse(lexer.tokenize('a()'))
    assert result == ('a', None)

    result = parser.parse(lexer.tokenize('a(2+3)'))
    assert result == ('a', [5])

    result = parser.parse(lexer.tokenize('a(2+3, 4+5)'))
    assert result == ('a', [5, 9])

def test_parse_error():
    lexer = CalcLexer()
    parser = CalcParser()