Experimental EBNF features added

David Beazley 2020-03-06 20:58:48 -06:00
parent 9944b6239c
commit a2cdf52d0f
4 changed files with 238 additions and 10 deletions

CHANGES

@@ -1,3 +1,32 @@
Version 0.5
-----------
03/06/2020 Added experimental support for EBNF repetition and optional
           syntax.  For example, here is a rule for a comma-separated
           expression list:

               @_('expr { COMMA expr }')
               def exprlist(self, p):
                   return [ p.expr ] + [e.expr for e in p[1]]

           In this code, the { ... } means zero-or-more repetitions.
           It produces a list of matches that must be accessed by
           position index (p[1] in this example; p[0] is 'expr').
           The elements of the list are named tuples with attribute
           names that match the enclosed grammar symbols (e.g., e.expr
           in the example).

           An optional value can be enclosed in brackets like this:

               @_('NAME LPAREN [ exprlist ] RPAREN')
               def function_call(self, p):
                   args = p[2] if p[2] else []
                   name = p.NAME
                   print('Calling:', name, args)

           In this case, p[2] contains the optional value.  If not
           present, the value is None.  If present, it is a named
           tuple of values, or a single value if only one symbol
           is enclosed.
Version 0.4
-----------
04/09/2019 Fixed very mysterious error message that resulted if you
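
As a minimal end-to-end sketch of the 0.5 syntax described in the CHANGES
entry above, here is a complete parser that uses both { ... } and [ ... ].
The lexer, token names, and grammar are illustrative only and are not part
of this commit:

    from sly import Lexer, Parser

    class FuncLexer(Lexer):
        # Hypothetical lexer used only for this illustration
        tokens = { NAME, NUMBER, LPAREN, RPAREN, COMMA }
        ignore = ' \t'
        NAME   = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NUMBER = r'\d+'
        LPAREN = r'\('
        RPAREN = r'\)'
        COMMA  = r','

    class FuncParser(Parser):
        tokens = FuncLexer.tokens

        # [ exprlist ] is optional; p[2] is None when no arguments are given
        @_('NAME LPAREN [ exprlist ] RPAREN')
        def call(self, p):
            return (p.NAME, p[2] if p[2] else [])

        # { COMMA expr } matches zero or more times; p[1] is a list of
        # named tuples with COMMA and expr attributes
        @_('expr { COMMA expr }')
        def exprlist(self, p):
            return [p.expr, *[e.expr for e in p[1]]]

        @_('NUMBER')
        def expr(self, p):
            return int(p.NUMBER)

    # FuncParser().parse(FuncLexer().tokenize('f(1, 2, 3)')) -> ('f', [1, 2, 3])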


@@ -360,6 +360,7 @@ class Lexer(metaclass=LexerMeta):
    def tokenize(self, text, lineno=1, index=0):
        _ignored_tokens = _master_re = _ignore = _token_funcs = _literals = _remapping = None

        # --- Support for state changes
        def _set_state(cls):
            nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping
            _ignored_tokens = cls._ignored_tokens
@@ -371,8 +372,26 @@ class Lexer(metaclass=LexerMeta):
        self.__set_state = _set_state
        _set_state(type(self))

        # --- Support for backtracking
        _mark_stack = []
        def _mark():
            _mark_stack.append((type(self), index, lineno))
        self.mark = _mark

        def _accept():
            _mark_stack.pop()
        self.accept = _accept

        def _reject():
            nonlocal index, lineno
            cls, index, lineno = _mark_stack[-1]
            _set_state(cls)
        self.reject = _reject

        # --- Main tokenization function
        self.text = text
        try:
            while True:
                try:
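
The new mark()/accept()/reject() hooks let a caller checkpoint the tokenizer
and roll back both the input position and the current lexer state.  The
commit itself contains no consumer of this API, so the following is only a
sketch of how speculative scanning might use it; the WordLexer class and the
input string are hypothetical:

    from sly import Lexer

    class WordLexer(Lexer):
        # Hypothetical lexer used only for this illustration
        tokens = { NAME, NUMBER }
        ignore = ' '
        NAME   = r'[a-zA-Z_]+'
        NUMBER = r'\d+'

    lexer = WordLexer()
    tokens = lexer.tokenize('alpha 1 2 beta')

    first = next(tokens)      # mark/accept/reject are installed once tokenize() starts running
    lexer.mark()              # checkpoint: input index, line number, and lexer state
    ahead = [next(tokens), next(tokens)]   # speculatively scan the next two tokens

    if all(t.type == 'NUMBER' for t in ahead):
        lexer.accept()        # keep the speculative scan; discard the checkpoint
    else:
        lexer.reject()        # roll back; the next token is re-scanned from the checkpoint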


@@ -33,7 +33,7 @@
import sys
import inspect
from collections import OrderedDict, defaultdict, namedtuple

__all__ = [ 'Parser' ]
@@ -1551,14 +1551,166 @@ def _collect_grammar_rules(func):
        lineno = unwrapped.__code__.co_firstlineno
        for rule, lineno in zip(func.rules, range(lineno+len(func.rules)-1, 0, -1)):
            syms = rule.split()
            ebnf_prod = []
            while ('{' in syms) or ('[' in syms):
                for s in syms:
                    if s == '[':
                        syms, prod = _replace_ebnf_optional(syms)
                        ebnf_prod.extend(prod)
                        break
                    elif s == '{':
                        syms, prod = _replace_ebnf_repeat(syms)
                        ebnf_prod.extend(prod)
                        break
            if syms[1:2] == [':'] or syms[1:2] == ['::=']:
                grammar.append((func, filename, lineno, syms[0], syms[2:]))
            else:
                grammar.append((func, filename, lineno, prodname, syms))
            grammar.extend(ebnf_prod)
        func = getattr(func, 'next_func', None)
    return grammar
# Replace EBNF repetition
def _replace_ebnf_repeat(syms):
    syms = list(syms)
    first = syms.index('{')
    end = syms.index('}', first)
    symname, prods = _generate_repeat_rules(syms[first+1:end])
    syms[first:end+1] = [symname]
    return syms, prods

def _replace_ebnf_optional(syms):
    syms = list(syms)
    first = syms.index('[')
    end = syms.index(']', first)
    symname, prods = _generate_optional_rules(syms[first+1:end])
    syms[first:end+1] = [symname]
    return syms, prods
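
# For example (the '_1_...' name is illustrative; the actual prefix comes
# from a global counter), rewriting a split rule works like this:
#
#     _replace_ebnf_repeat(['exprlist', ':', 'expr', '{', 'COMMA', 'expr', '}'])
#     # -> (['exprlist', ':', 'expr', '_1_repeat'], [generated productions])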
# Generate grammar rules for repeated items
_gencount = 0

def _unique_names(names):
    from collections import defaultdict, Counter
    counts = Counter(names)
    indices = defaultdict(int)
    newnames = []
    for name in names:
        if counts[name] == 1:
            newnames.append(name)
        else:
            newnames.append(f'{name}{indices[name]}')
            indices[name] += 1
    return newnames
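
# For example, _unique_names(['expr', 'COMMA', 'expr']) returns
# ['expr0', 'COMMA', 'expr1'], so repeated grammar symbols get distinct
# attribute names in the generated named tuple.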
def _generate_repeat_rules(symbols):
    '''
    Symbols is a list of grammar symbols [ symbols ].  This
    generates code corresponding to these grammar constructions:

       @('repeat : many')
       def repeat(self, p):
           return p.many

       @('repeat :')
       def repeat(self, p):
           return []

       @('many : many symbols')
       def many(self, p):
           p.many.append(symbols)
           return p.many

       @('many : symbols')
       def many(self, p):
           return [ p.symbols ]
    '''
    global _gencount
    _gencount += 1
    name = f'_{_gencount}_repeat'
    oname = f'_{_gencount}_items'
    iname = f'_{_gencount}_item'
    symtext = ' '.join(symbols)

    productions = [ ]
    _ = _decorator

    @_(f'{name} : {oname}')
    def repeat(self, p):
        return getattr(p, oname)

    @_(f'{name} : ')
    def repeat2(self, p):
        return []
    productions.extend(_collect_grammar_rules(repeat))
    productions.extend(_collect_grammar_rules(repeat2))

    @_(f'{oname} : {oname} {iname}')
    def many(self, p):
        items = getattr(p, oname)
        items.append(getattr(p, iname))
        return items

    @_(f'{oname} : {iname}')
    def many2(self, p):
        return [ getattr(p, iname) ]
    productions.extend(_collect_grammar_rules(many))
    productions.extend(_collect_grammar_rules(many2))

    utuple = namedtuple('syms', _unique_names(symbols))

    @_(f'{iname} : {symtext}')
    def item(self, p):
        if len(p) == 1:
            return p[0]
        else:
            return utuple(*p)
    productions.extend(_collect_grammar_rules(item))
    return name, productions
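
# As a concrete illustration (the '_1_...' names depend on the counter),
# _generate_repeat_rules(['COMMA', 'expr']) returns '_1_repeat' plus
# productions equivalent to:
#
#     _1_repeat : _1_items
#               | <empty>
#     _1_items  : _1_items _1_item
#               | _1_item
#     _1_item   : COMMA expr
#
# Each _1_item is a named tuple with COMMA and expr attributes.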
def _generate_optional_rules(symbols):
    '''
    Symbols is a list of grammar symbols [ symbols ].  This
    generates code corresponding to these grammar constructions:

       @('optional : symbols')
       def optional(self, p):
           return p.symbols

       @('optional :')
       def optional(self, p):
           return None
    '''
    global _gencount
    _gencount += 1
    name = f'_{_gencount}_optional'
    symtext = ' '.join(symbols)

    productions = [ ]
    _ = _decorator

    utuple = namedtuple('syms', _unique_names(symbols))

    @_(f'{name} : {symtext}')
    def optional(self, p):
        if len(p) == 1:
            return p[0]
        else:
            return utuple(*p)

    @_(f'{name} : ')
    def optional2(self, p):
        return None
    productions.extend(_collect_grammar_rules(optional))
    productions.extend(_collect_grammar_rules(optional2))
    return name, productions
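
# For instance (again with an illustrative counter value),
# _generate_optional_rules(['exprlist']) returns '_2_optional' plus
# productions equivalent to:
#
#     _2_optional : exprlist
#                 | <empty>
#
# The empty alternative yields None, which is why a rule using [ ... ]
# sees None when the optional part is absent.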
class ParserMetaDict(dict):
    '''
    Dictionary that allows decorated grammar rule functions to be overloaded
@@ -1576,17 +1728,24 @@ class ParserMetaDict(dict):
        else:
            return super().__getitem__(key)

def _decorator(rule, *extra):
    rules = [rule, *extra]
    def decorate(func):
        func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        return func
    return decorate

class ParserMeta(type):
    @classmethod
    def __prepare__(meta, *args, **kwargs):
        d = ParserMetaDict()
        # def _(rule, *extra):
        #     rules = [rule, *extra]
        #     def decorate(func):
        #         func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        #         return func
        #     return decorate
        d['_'] = _decorator
        return d

    def __new__(meta, clsname, bases, attributes):


@@ -3,7 +3,7 @@ from sly import Lexer, Parser
class CalcLexer(Lexer):
    # Set of token names.  This is always required
    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }

    literals = { '(', ')' }

    # String containing ignored characters between tokens
@@ -16,6 +16,7 @@ class CalcLexer(Lexer):
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    COMMA = r','

    @_(r'\d+')
    def NUMBER(self, t):
@@ -53,6 +54,14 @@ class CalcParser(Parser):
    def statement(self, p):
        self.names[p.ID] = p.expr

    @_('ID "(" [ arglist ] ")"')
    def statement(self, p):
        return (p.ID, p[2])

    @_('expr { COMMA expr }')
    def arglist(self, p):
        return [p.expr, *[e.expr for e in p[1]]]

    @_('expr')
    def statement(self, p):
        return p.expr
@@ -109,6 +118,18 @@ def test_simple():
    result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
    assert result == 47

def test_ebnf():
    lexer = CalcLexer()
    parser = CalcParser()

    result = parser.parse(lexer.tokenize('a()'))
    assert result == ('a', None)

    result = parser.parse(lexer.tokenize('a(2+3)'))
    assert result == ('a', [5])

    result = parser.parse(lexer.tokenize('a(2+3, 4+5)'))
    assert result == ('a', [5, 9])

def test_parse_error():
    lexer = CalcLexer()
    parser = CalcParser()