Experimental EBNF features added

David Beazley 2020-03-06 20:58:48 -06:00
parent 9944b6239c
commit a2cdf52d0f
4 changed files with 238 additions and 10 deletions

CHANGES

@@ -1,3 +1,32 @@
Version 0.5
-----------
03/06/2020 Added experimental support for EBNF repetition and optional
           syntax. For example, here is a rule for a comma-separated
           expression list:

               @_('expr { COMMA expr }')
               def exprlist(p):
                   return [ p.expr ] + [e.expr for e in p[1]]

           In this code, the { ... } means zero-or-more repetitions.
           It produces a list of matches that must be accessed by
           position index (p[1] in this example; p[0] is 'expr').
           The elements of the list are named tuples with attribute
           names that match the enclosed grammar symbols (e.g., e.expr
           in the example).
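
           As a further sketch (not from this commit), a repetition
           that encloses several symbols yields one named tuple per
           match, with one field per enclosed symbol. NAME and ASSIGN
           are hypothetical tokens here:

               @_('NAME ASSIGN expr { COMMA NAME ASSIGN expr }')
               def assignments(p):
                   # each element e of p[1] has fields
                   # (COMMA, NAME, ASSIGN, expr)
                   return [(p.NAME, p.expr)] + [(e.NAME, e.expr) for e in p[1]]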
           An optional value can be enclosed in brackets like this:

               @_('NAME LPAREN [ exprlist ] RPAREN')
               def function_call(p):
                   args = p[2] if p[2] else []
                   name = p.NAME
                   print('Calling:', name, args)

           In this case, p[2] contains the optional value. If not present,
           the value is None. If present, it is a tuple of values
           or a single value (if only one symbol).
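
           For example (a hypothetical rule, assuming IF, THEN, and
           ELSE tokens and a stmt rule are defined), an optional group
           with two enclosed symbols produces a named tuple:

               @_('IF expr THEN stmt [ ELSE stmt ]')
               def if_stmt(p):
                   # p[4] is None if the else-clause is absent;
                   # otherwise it is a named tuple with fields
                   # (ELSE, stmt)
                   if p[4] is None:
                       return ('if', p.expr, p.stmt)
                   else:
                       return ('if-else', p.expr, p.stmt, p[4].stmt)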
Version 0.4
-----------
04/09/2019 Fixed very mysterious error message that resulted if you

sly/lex.py

@@ -360,6 +360,7 @@ class Lexer(metaclass=LexerMeta):
    def tokenize(self, text, lineno=1, index=0):
        _ignored_tokens = _master_re = _ignore = _token_funcs = _literals = _remapping = None

        # --- Support for state changes
        def _set_state(cls):
            nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping
            _ignored_tokens = cls._ignored_tokens
@@ -371,8 +372,26 @@
        self.__set_state = _set_state
        _set_state(type(self))
        self.text = text

        # --- Support for backtracking
        _mark_stack = []
        def _mark():
            _mark_stack.append((type(self), index, lineno))
        self.mark = _mark

        def _accept():
            _mark_stack.pop()
        self.accept = _accept

        def _reject():
            nonlocal index, lineno
            cls, index, lineno = _mark_stack[-1]
            _set_state(cls)
        self.reject = _reject

        # --- Main tokenization function
        self.text = text
        try:
            while True:
                try:
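
The mark/accept/reject closures are attached to the lexer instance while
tokenize() is running, which enables speculative lexing. A minimal usage
sketch (hypothetical, not part of this commit; note that the generator
must have yielded at least once before the hooks exist):

    from sly import Lexer

    class ToyLexer(Lexer):
        tokens = { NAME, NUMBER }
        ignore = ' '
        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NUMBER = r'\d+'

    lexer = ToyLexer()
    stream = lexer.tokenize('abc 12 def')

    print(next(stream).value)   # 'abc' -- generator is now running
    lexer.mark()                # remember lexer state and input position
    print(next(stream).value)   # '12' (speculative)
    lexer.reject()              # roll back to the mark
    print(next(stream).value)   # '12' again, re-lexed from the mark
    lexer.accept()              # commit: pop the mark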

sly/yacc.py

@@ -33,7 +33,7 @@
import sys
import inspect
from collections import OrderedDict, defaultdict
from collections import OrderedDict, defaultdict, namedtuple
__all__ = [ 'Parser' ]
@@ -1551,14 +1551,166 @@ def _collect_grammar_rules(func):
        lineno = unwrapped.__code__.co_firstlineno
        for rule, lineno in zip(func.rules, range(lineno+len(func.rules)-1, 0, -1)):
            syms = rule.split()
            ebnf_prod = []
            while ('{' in syms) or ('[' in syms):
                for s in syms:
                    if s == '[':
                        syms, prod = _replace_ebnf_optional(syms)
                        ebnf_prod.extend(prod)
                        break
                    elif s == '{':
                        syms, prod = _replace_ebnf_repeat(syms)
                        ebnf_prod.extend(prod)
                        break
            if syms[1:2] == [':'] or syms[1:2] == ['::=']:
                grammar.append((func, filename, lineno, syms[0], syms[2:]))
            else:
                grammar.append((func, filename, lineno, prodname, syms))
            grammar.extend(ebnf_prod)
        func = getattr(func, 'next_func', None)
    return grammar
# Replace EBNF repetition
def _replace_ebnf_repeat(syms):
    syms = list(syms)
    first = syms.index('{')
    end = syms.index('}', first)
    symname, prods = _generate_repeat_rules(syms[first+1:end])
    syms[first:end+1] = [symname]
    return syms, prods

def _replace_ebnf_optional(syms):
    syms = list(syms)
    first = syms.index('[')
    end = syms.index(']', first)
    symname, prods = _generate_optional_rules(syms[first+1:end])
    syms[first:end+1] = [symname]
    return syms, prods
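
To make the rewriting concrete, here is a hypothetical trace (the
generated symbol names assume _gencount starts at zero):

    syms = 'exprlist : expr { COMMA expr }'.split()
    # ['exprlist', ':', 'expr', '{', 'COMMA', 'expr', '}']

    syms, prods = _replace_ebnf_repeat(syms)
    # syms is now ['exprlist', ':', 'expr', '_1_repeat'] and prods
    # holds the generated productions:
    #
    #   _1_repeat : _1_items
    #   _1_repeat :
    #   _1_items  : _1_items _1_item
    #   _1_items  : _1_item
    #   _1_item   : COMMA expr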
# Generate grammar rules for repeated items
_gencount = 0

def _unique_names(names):
    from collections import defaultdict, Counter
    counts = Counter(names)
    indices = defaultdict(int)
    newnames = []
    for name in names:
        if counts[name] == 1:
            newnames.append(name)
        else:
            newnames.append(f'{name}{indices[name]}')
            indices[name] += 1
    return newnames
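
For instance (a quick illustration, not from the commit):

    _unique_names(['COMMA', 'expr'])          # ['COMMA', 'expr']
    _unique_names(['expr', 'COMMA', 'expr'])  # ['expr0', 'COMMA', 'expr1']

Disambiguating duplicated symbols this way keeps the namedtuple field
names below valid and distinct.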
def _generate_repeat_rules(symbols):
    '''
    Symbols is a list of grammar symbols [ symbols ]. This
    generates code corresponding to the following grammar construction:

       @_('repeat : many')
       def repeat(self, p):
           return p.many

       @_('repeat :')
       def repeat(self, p):
           return []

       @_('many : many symbols')
       def many(self, p):
           p.many.append(symbols)
           return p.many

       @_('many : symbols')
       def many(self, p):
           return [ p.symbols ]
    '''
    global _gencount
    _gencount += 1
    name = f'_{_gencount}_repeat'
    oname = f'_{_gencount}_items'
    iname = f'_{_gencount}_item'
    symtext = ' '.join(symbols)

    productions = [ ]
    _ = _decorator

    @_(f'{name} : {oname}')
    def repeat(self, p):
        return getattr(p, oname)

    @_(f'{name} : ')
    def repeat2(self, p):
        return []
    productions.extend(_collect_grammar_rules(repeat))
    productions.extend(_collect_grammar_rules(repeat2))

    @_(f'{oname} : {oname} {iname}')
    def many(self, p):
        items = getattr(p, oname)
        items.append(getattr(p, iname))
        return items

    @_(f'{oname} : {iname}')
    def many2(self, p):
        return [ getattr(p, iname) ]
    productions.extend(_collect_grammar_rules(many))
    productions.extend(_collect_grammar_rules(many2))

    utuple = namedtuple('syms', _unique_names(symbols))

    @_(f'{iname} : {symtext}')
    def item(self, p):
        if len(p) == 1:
            return p[0]
        else:
            return utuple(*p)
    productions.extend(_collect_grammar_rules(item))
    return name, productions
def _generate_optional_rules(symbols):
    '''
    Symbols is a list of grammar symbols [ symbols ]. This
    generates code corresponding to the following grammar construction:

       @_('optional : symbols')
       def optional(self, p):
           return p.symbols

       @_('optional :')
       def optional(self, p):
           return None
    '''
    global _gencount
    _gencount += 1
    name = f'_{_gencount}_optional'
    symtext = ' '.join(symbols)

    productions = [ ]
    _ = _decorator

    utuple = namedtuple('syms', _unique_names(symbols))

    @_(f'{name} : {symtext}')
    def optional(self, p):
        if len(p) == 1:
            return p[0]
        else:
            return utuple(*p)

    @_(f'{name} : ')
    def optional2(self, p):
        return None
    productions.extend(_collect_grammar_rules(optional))
    productions.extend(_collect_grammar_rules(optional2))
    return name, productions
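
A corresponding sketch for the optional case (hypothetical trace,
continuing the numbering from the repetition example above):

    name, prods = _generate_optional_rules(['exprlist'])
    # name == '_2_optional'; the generated rules reduce either to the
    # exprlist value itself (a single enclosed symbol) or to None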
class ParserMetaDict(dict):
    '''
    Dictionary that allows decorated grammar rule functions to be overloaded
@@ -1576,17 +1728,24 @@ class ParserMetaDict(dict):
        else:
            return super().__getitem__(key)

def _decorator(rule, *extra):
    rules = [rule, *extra]
    def decorate(func):
        func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        return func
    return decorate
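
Hoisting the rule decorator to module level appears to be what lets the
generated helper rules in _generate_repeat_rules and
_generate_optional_rules reuse it (via _ = _decorator) outside of a
Parser class body, where ParserMetaDict would normally supply _.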
class ParserMeta(type):
    @classmethod
    def __prepare__(meta, *args, **kwargs):
        d = ParserMetaDict()
        def _(rule, *extra):
            rules = [rule, *extra]
            def decorate(func):
                func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
                return func
            return decorate
        d['_'] = _
        # def _(rule, *extra):
        #     rules = [rule, *extra]
        #     def decorate(func):
        #         func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        #         return func
        #     return decorate
        d['_'] = _decorator
        return d

    def __new__(meta, clsname, bases, attributes):

tests/test_parser.py

@@ -3,7 +3,7 @@ from sly import Lexer, Parser
class CalcLexer(Lexer):
    # Set of token names. This is always required
    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN }
    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }

    literals = { '(', ')' }

    # String containing ignored characters between tokens
@@ -16,6 +16,7 @@ class CalcLexer(Lexer):
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    COMMA = r','

    @_(r'\d+')
    def NUMBER(self, t):
@@ -53,6 +54,14 @@ class CalcParser(Parser):
    def statement(self, p):
        self.names[p.ID] = p.expr

    @_('ID "(" [ arglist ] ")"')
    def statement(self, p):
        return (p.ID, p[2])

    @_('expr { COMMA expr }')
    def arglist(self, p):
        return [p.expr, *[e.expr for e in p[1]]]

    @_('expr')
    def statement(self, p):
        return p.expr
@@ -109,6 +118,18 @@ def test_simple():
    result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
    assert result == 47

def test_ebnf():
    lexer = CalcLexer()
    parser = CalcParser()
    result = parser.parse(lexer.tokenize('a()'))
    assert result == ('a', None)

    result = parser.parse(lexer.tokenize('a(2+3)'))
    assert result == ('a', [5])

    result = parser.parse(lexer.tokenize('a(2+3, 4+5)'))
    assert result == ('a', [5, 9])
def test_parse_error():
    lexer = CalcLexer()
    parser = CalcParser()