diff --git a/CHANGES b/CHANGES
index e76da56..46bf92a 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,32 @@
+Version 0.5
+-----------
+03/06/2020 Added experimental support for EBNF repetition and optional
+           syntax. For example, here is a rule for a comma-separated
+           expression list:
+
+               @_('expr { COMMA expr }')
+               def exprlist(self, p):
+                   return [ p.expr ] + [e.expr for e in p[1]]
+
+           In this code, { ... } means zero or more repetitions.
+           It produces a list of matches that must be accessed by
+           position index (p[1] in this example; p[0] is 'expr').
+           The elements of the list are named tuples with attribute
+           names that match the enclosed grammar symbols (e.g., e.expr
+           in the example).
+
+           An optional value can be enclosed in brackets like this:
+
+               @_('NAME LPAREN [ exprlist ] RPAREN')
+               def function_call(self, p):
+                   args = p[2] if p[2] else []
+                   name = p.NAME
+                   print('Calling:', name, args)
+
+           In this case, p[2] contains the optional value. If not present,
+           the value is None. If present, it is a tuple of values or a
+           single value (if only one symbol is enclosed).
+
 Version 0.4
 -----------
 04/09/2019 Fixed very mysterious error message that resulted if you
diff --git a/sly/lex.py b/sly/lex.py
index 246dd9e..2f3a345 100644
--- a/sly/lex.py
+++ b/sly/lex.py
@@ -360,6 +360,7 @@ class Lexer(metaclass=LexerMeta):
     def tokenize(self, text, lineno=1, index=0):
         _ignored_tokens = _master_re = _ignore = _token_funcs = _literals = _remapping = None
 
+        # --- Support for state changes
        def _set_state(cls):
             nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping
             _ignored_tokens = cls._ignored_tokens
@@ -371,8 +372,26 @@ class Lexer(metaclass=LexerMeta):
         self.__set_state = _set_state
         _set_state(type(self))
 
-        self.text = text
+        # --- Support for backtracking
+        _mark_stack = []
+        def _mark():
+            _mark_stack.append((type(self), index, lineno))
+        self.mark = _mark
+
+        def _accept():
+            _mark_stack.pop()
+        self.accept = _accept
+
+        def _reject():
+            nonlocal index, lineno
+            cls, index, lineno = _mark_stack[-1]
+            _set_state(cls)
+        self.reject = _reject
+
+
+        # --- Main tokenization function
+        self.text = text
         try:
             while True:
                 try:
diff --git a/sly/yacc.py b/sly/yacc.py
index c30f13c..00a9c8d 100644
--- a/sly/yacc.py
+++ b/sly/yacc.py
@@ -33,7 +33,7 @@
 import sys
 import inspect
 
-from collections import OrderedDict, defaultdict
+from collections import OrderedDict, defaultdict, namedtuple
 
 __all__ = [ 'Parser' ]
 
@@ -1551,14 +1551,166 @@ def _collect_grammar_rules(func):
             lineno = unwrapped.__code__.co_firstlineno
             for rule, lineno in zip(func.rules, range(lineno+len(func.rules)-1, 0, -1)):
                 syms = rule.split()
+                ebnf_prod = []
+                while ('{' in syms) or ('[' in syms):
+                    for s in syms:
+                        if s == '[':
+                            syms, prod = _replace_ebnf_optional(syms)
+                            ebnf_prod.extend(prod)
+                            break
+                        elif s == '{':
+                            syms, prod = _replace_ebnf_repeat(syms)
+                            ebnf_prod.extend(prod)
+                            break
+
                 if syms[1:2] == [':'] or syms[1:2] == ['::=']:
                     grammar.append((func, filename, lineno, syms[0], syms[2:]))
                 else:
                     grammar.append((func, filename, lineno, prodname, syms))
+                grammar.extend(ebnf_prod)
+
         func = getattr(func, 'next_func', None)
 
     return grammar
 
+# Replace EBNF repetition
+def _replace_ebnf_repeat(syms):
+    syms = list(syms)
+    first = syms.index('{')
+    end = syms.index('}', first)
+    symname, prods = _generate_repeat_rules(syms[first+1:end])
+    syms[first:end+1] = [symname]
+    return syms, prods
+
+def _replace_ebnf_optional(syms):
+    syms = list(syms)
+    first = syms.index('[')
+    end = syms.index(']', first)
+    symname, prods = _generate_optional_rules(syms[first+1:end])
+    syms[first:end+1] = [symname]
+    return syms, prods
+
+# Generate grammar rules for repeated items
+_gencount = 0
+
+def _unique_names(names):
+    from collections import defaultdict, Counter
+    counts = Counter(names)
+    indices = defaultdict(int)
+    newnames = []
+    for name in names:
+        if counts[name] == 1:
+            newnames.append(name)
+        else:
+            newnames.append(f'{name}{indices[name]}')
+            indices[name] += 1
+    return newnames
+
+def _generate_repeat_rules(symbols):
+    '''
+    symbols is a list of grammar symbols [ symbols ]. This
+    generates code corresponding to this grammar construction:
+
+       @('repeat : many')
+       def repeat(self, p):
+           return p.many
+
+       @('repeat :')
+       def repeat(self, p):
+           return []
+
+       @('many : many symbols')
+       def many(self, p):
+           p.many.append(symbols)
+           return p.many
+
+       @('many : symbols')
+       def many(self, p):
+           return [ p.symbols ]
+    '''
+    global _gencount
+    _gencount += 1
+    name = f'_{_gencount}_repeat'
+    oname = f'_{_gencount}_items'
+    iname = f'_{_gencount}_item'
+    symtext = ' '.join(symbols)
+
+    productions = [ ]
+    _ = _decorator
+
+    @_(f'{name} : {oname}')
+    def repeat(self, p):
+        return getattr(p, oname)
+
+    @_(f'{name} : ')
+    def repeat2(self, p):
+        return []
+    productions.extend(_collect_grammar_rules(repeat))
+    productions.extend(_collect_grammar_rules(repeat2))
+
+    @_(f'{oname} : {oname} {iname}')
+    def many(self, p):
+        items = getattr(p, oname)
+        items.append(getattr(p, iname))
+        return items
+
+    @_(f'{oname} : {iname}')
+    def many2(self, p):
+        return [ getattr(p, iname) ]
+
+    productions.extend(_collect_grammar_rules(many))
+    productions.extend(_collect_grammar_rules(many2))
+
+    utuple = namedtuple('syms', _unique_names(symbols))
+
+    @_(f'{iname} : {symtext}')
+    def item(self, p):
+        if len(p) == 1:
+            return p[0]
+        else:
+            return utuple(*p)
+
+    productions.extend(_collect_grammar_rules(item))
+    return name, productions
+
+def _generate_optional_rules(symbols):
+    '''
+    symbols is a list of grammar symbols [ symbols ]. This
+    generates code corresponding to this grammar construction:
+
+       @('optional : symbols')
+       def optional(self, p):
+           return p.symbols
+
+       @('optional :')
+       def optional(self, p):
+           return None
+    '''
+    global _gencount
+    _gencount += 1
+    name = f'_{_gencount}_optional'
+    symtext = ' '.join(symbols)
+
+    productions = [ ]
+    _ = _decorator
+
+    utuple = namedtuple('syms', _unique_names(symbols))
+
+    @_(f'{name} : {symtext}')
+    def optional(self, p):
+        if len(p) == 1:
+            return p[0]
+        else:
+            return utuple(*p)
+
+    @_(f'{name} : ')
+    def optional2(self, p):
+        return None
+
+    productions.extend(_collect_grammar_rules(optional))
+    productions.extend(_collect_grammar_rules(optional2))
+    return name, productions
+
 class ParserMetaDict(dict):
     '''
     Dictionary that allows decorated grammar rule functions to be overloaded
@@ -1576,17 +1728,24 @@ class ParserMetaDict(dict):
         else:
             return super().__getitem__(key)
 
+def _decorator(rule, *extra):
+    rules = [rule, *extra]
+    def decorate(func):
+        func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
+        return func
+    return decorate
+
 class ParserMeta(type):
     @classmethod
     def __prepare__(meta, *args, **kwargs):
         d = ParserMetaDict()
-        def _(rule, *extra):
-            rules = [rule, *extra]
-            def decorate(func):
-                func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
-                return func
-            return decorate
-        d['_'] = _
+#        def _(rule, *extra):
+#            rules = [rule, *extra]
+#            def decorate(func):
+#                func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
+#                return func
+#            return decorate
+        d['_'] = _decorator
         return d
 
     def __new__(meta, clsname, bases, attributes):
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 2661448..5751666 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -3,7 +3,7 @@ from sly import Lexer, Parser
 
 class CalcLexer(Lexer):
     # Set of token names. This is always required
-    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN }
+    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }
     literals = { '(', ')' }
 
     # String containing ignored characters between tokens
@@ -16,6 +16,7 @@ class CalcLexer(Lexer):
     TIMES = r'\*'
     DIVIDE = r'/'
     ASSIGN = r'='
+    COMMA = r','
 
     @_(r'\d+')
     def NUMBER(self, t):
@@ -53,6 +54,14 @@ class CalcParser(Parser):
     def statement(self, p):
         self.names[p.ID] = p.expr
 
+    @_('ID "(" [ arglist ] ")"')
+    def statement(self, p):
+        return (p.ID, p[2])
+
+    @_('expr { COMMA expr }')
+    def arglist(self, p):
+        return [p.expr, *[e.expr for e in p[1]]]
+
     @_('expr')
     def statement(self, p):
         return p.expr
@@ -109,6 +118,18 @@ def test_simple():
     result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
     assert result == 47
 
+def test_ebnf():
+    lexer = CalcLexer()
+    parser = CalcParser()
+    result = parser.parse(lexer.tokenize('a()'))
+    assert result == ('a', None)
+
+    result = parser.parse(lexer.tokenize('a(2+3)'))
+    assert result == ('a', [5])
+
+    result = parser.parse(lexer.tokenize('a(2+3, 4+5)'))
+    assert result == ('a', [5, 9])
+
 def test_parse_error():
     lexer = CalcLexer()
     parser = CalcParser()
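Note: as a quick end-to-end illustration of the new EBNF syntax, here is a sketch (not part of the patch) that reuses the CalcLexer and CalcParser classes from tests/test_parser.py above; the function name 'f' and the input strings are made up for illustration.

    lexer = CalcLexer()
    parser = CalcParser()

    # '[ arglist ]' was omitted, so p[2] in the 'statement' rule is None
    parser.parse(lexer.tokenize('f()'))          # -> ('f', None)

    # '{ COMMA expr }' matched once; arglist collects [3, 12]
    parser.parse(lexer.tokenize('f(1+2, 3*4)'))  # -> ('f', [3, 12])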