Work in progress. Changes to parser production API

parent 9a1899fa69
commit 05a709aaea

docs/sly.rst | 1800
(full file diff suppressed because it is too large; one hunk is shown below)
@@ -44,46 +44,45 @@ class CalcParser(Parser):
 
     @_('NAME "=" expression')
     def statement(self, p):
-        self.names[p[1]] = p[3]
+        self.names[p.NAME] = p.expression
 
     @_('expression')
     def statement(self, p):
-        print(p[1])
+        print(p.expression)
 
     @_('expression "+" expression',
        'expression "-" expression',
        'expression "*" expression',
        'expression "/" expression')
     def expression(self, p):
-        if p[2] == '+':
-            p[0] = p[1] + p[3]
-        elif p[2] == '-':
-            p[0] = p[1] - p[3]
-        elif p[2] == '*':
-            p[0] = p[1] * p[3]
-        elif p[2] == '/':
-            p[0] = p[1] / p[3]
+        if p[1] == '+':
+            return p.expression0 + p.expression1
+        elif p[1] == '-':
+            return p.expression0 - p.expression1
+        elif p[1] == '*':
+            return p.expression0 * p.expression1
+        elif p[1] == '/':
+            return p.expression0 / p.expression1
 
     @_('"-" expression %prec UMINUS')
     def expression(self, p):
-        p[0] = -p[2]
+        return -p.expression
 
     @_('"(" expression ")"')
     def expression(self, p):
-        p[0] = p[2]
+        return p.expression
 
     @_('NUMBER')
     def expression(self, p):
-        p[0] = p[1]
+        return p.NUMBER
 
     @_('NAME')
     def expression(self, p):
         try:
-            p[0] = self.names[p[1]]
+            return self.names[p.NAME]
         except LookupError:
-            print("Undefined name '%s'" % p[1])
-            p[0] = 0
+            print("Undefined name '%s'" % p.NAME)
+            return 0
 
-
 if __name__ == '__main__':
     lexer = CalcLexer()
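Taken together, this hunk moves grammar rules from positional access (p[1], p[3], assignment through p[0]) to named attribute access with plain return values. A minimal sketch of a parser written against the new API; the CalcLexer/CalcParser names follow the example in docs/sly.rst, and details may still shift since this commit is marked work in progress:

    from sly import Lexer, Parser

    class CalcLexer(Lexer):
        tokens = {'NAME', 'NUMBER'}
        ignore = ' \t'
        literals = {'=', '+'}
        NAME = r'[a-zA-Z_]\w*'

        @_(r'\d+')
        def NUMBER(self, t):
            t.value = int(t.value)
            return t

    class CalcParser(Parser):
        tokens = CalcLexer.tokens
        precedence = (('left', '+'),)

        def __init__(self):
            self.names = {}

        @_('NAME "=" expression')
        def statement(self, p):
            self.names[p.NAME] = p.expression    # was: self.names[p[1]] = p[3]

        @_('expression "+" expression')
        def expression(self, p):
            # repeated symbols get numbered names: expression0, expression1
            return p.expression0 + p.expression1

        @_('NUMBER')
        def expression(self, p):
            return p.NUMBER                      # was: p[0] = p[1]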
sly/lex.py | 70
@@ -68,9 +68,9 @@ class Token(object):
     def __repr__(self):
         return 'Token(%s, %r, %d, %d)' % (self.type, self.value, self.lineno, self.index)
 
-class NoDupeDict(OrderedDict):
+class LexerMetaDict(OrderedDict):
     '''
-    Special dictionary that prohibits duplicate definitions.
+    Special dictionary that prohibits duplicate definitions in lexer specifications.
     '''
     def __setitem__(self, key, value):
         if key in self and not isinstance(value, property):
@@ -83,17 +83,15 @@ class LexerMeta(type):
     '''
     @classmethod
     def __prepare__(meta, *args, **kwargs):
-        d = NoDupeDict()
-        def _(*patterns):
+        d = LexerMetaDict()
+        def _(pattern, *extra):
+            patterns = [pattern, *extra]
             def decorate(func):
-                for pattern in patterns:
-                    if hasattr(func, 'pattern'):
-                        if isinstance(pattern, str):
-                            func.pattern = ''.join(['(', pattern, ')|(', func.pattern, ')'])
-                        else:
-                            func.pattern = b''.join([b'(', pattern, b')|(', func.pattern, b')'])
-                    else:
-                        func.pattern = pattern
+                pattern = '|'.join('(%s)' % pat for pat in patterns)
+                if hasattr(func, 'pattern'):
+                    func.pattern = pattern + '|' + func.pattern
+                else:
+                    func.pattern = pattern
                 return func
             return decorate
         d['_'] = _
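With this change, _() no longer loops over patterns at decoration time; it folds all of the supplied patterns into a single alternation up front. Roughly (the hex/decimal rule below is an illustrative assumption, not part of this commit):

    patterns = [r'0x[0-9a-fA-F]+', r'\d+']
    merged = '|'.join('(%s)' % pat for pat in patterns)
    # merged == '(0x[0-9a-fA-F]+)|(\\d+)'

    # so a multi-pattern token rule such as:
    #
    #     @_(r'0x[0-9a-fA-F]+', r'\d+')
    #     def NUMBER(self, t):
    #         t.value = int(t.value, 0)
    #         return t
    #
    # now carries one combined regex in NUMBER.pattern.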
@@ -109,7 +107,7 @@ class Lexer(metaclass=LexerMeta):
     # These attributes may be defined in subclasses
     tokens = set()
     literals = set()
-    ignore = None
+    ignore = ''
     reflags = 0
 
     # These attributes are constructed automatically by the associated metaclass
@@ -118,7 +116,6 @@ class Lexer(metaclass=LexerMeta):
     _literals = set()
     _token_funcs = { }
     _ignored_tokens = set()
-    _input_type = str
 
     @classmethod
     def _collect_rules(cls, definitions):
@@ -151,7 +148,7 @@ class Lexer(metaclass=LexerMeta):
                 tokname = tokname[7:]
                 cls._ignored_tokens.add(tokname)
 
-            if isinstance(value, (str, bytes)):
+            if isinstance(value, str):
                 pattern = value
 
             elif callable(value):
@@ -159,10 +156,7 @@ class Lexer(metaclass=LexerMeta):
                 cls._token_funcs[tokname] = value
 
             # Form the regular expression component
-            if isinstance(pattern, str):
-                part = '(?P<%s>%s)' % (tokname, pattern)
-            else:
-                part = b'(?P<%s>%s)' % (tokname.encode('ascii'), pattern)
+            part = '(?P<%s>%s)' % (tokname, pattern)
 
             # Make sure the individual regex compiles properly
             try:
@@ -171,38 +165,24 @@ class Lexer(metaclass=LexerMeta):
                 raise PatternError('Invalid regex for token %s' % tokname) from e
 
             # Verify that the pattern doesn't match the empty string
-            if cpat.match(type(pattern)()):
+            if cpat.match(''):
                 raise PatternError('Regex for token %s matches empty input' % tokname)
 
             parts.append(part)
 
-        # If no parts collected, then no rules to process
         if not parts:
             return
 
-        # Verify that all of the patterns are of the same type
-        if not all(type(part) == type(parts[0]) for part in parts):
-            raise LexerBuildError('Tokens are specified using both bytes and strings.')
-
         # Form the master regular expression
-        if parts and isinstance(parts[0], bytes):
-            previous = (b'|' + cls._master_re.pattern) if cls._master_re else b''
-            cls._master_re = re.compile(b'|'.join(parts) + previous, cls.reflags)
-            cls._input_type = bytes
-        else:
-            previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
-            cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
-            cls._input_type = str
+        previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
+        cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
 
         # Verify that the ignore and literals specifiers match the input type
-        if cls.ignore is not None and not isinstance(cls.ignore, cls._input_type):
-            raise LexerBuildError("ignore specifier type doesn't match token types (%s)" %
-                                  cls._input_type.__name__)
+        if not isinstance(cls.ignore, str):
+            raise LexerBuildError('ignore specifier must be a string')
 
-        if not all(isinstance(lit, cls._input_type) for lit in cls.literals):
-            raise LexerBuildError("literals specifier not using same type as tokens (%s)" %
-                                  cls._input_type.__name__)
+        if not all(isinstance(lit, str) for lit in cls.literals):
+            raise LexerBuildError("literals must be specified as strings")
-
 
     def tokenize(self, text, lineno=1, index=0):
         # Local copies of frequently used values
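The surviving str-only path builds one master regex out of per-token named groups, so the group name of a match identifies which token rule fired. A standalone sketch of the technique (the token names here are illustrative):

    import re

    parts = [r'(?P<NAME>[a-zA-Z_]\w*)', r'(?P<NUMBER>\d+)']
    master = re.compile('|'.join(parts))

    m = master.match('count')
    print(m.lastgroup)   # 'NAME' -- the named group that matched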
@@ -220,11 +200,6 @@ class Lexer(metaclass=LexerMeta):
                     index += 1
                     continue
             except IndexError:
-                if self.eof:
-                    text = self.eof()
-                    if text:
-                        index = 0
-                        continue
                 break
 
             tok = Token()
@@ -270,9 +245,6 @@ class Lexer(metaclass=LexerMeta):
         self.index = index
         self.lineno = lineno
 
-    # Default implementations of methods that may be subclassed by users
+    # Default implementation of the error handler. May be changed in subclasses
     def error(self, value):
         raise LexError("Illegal character %r at index %d" % (value[0], self.index), value)
-
-    def eof(self):
-        pass
sly/yacc.py | 148
@@ -33,7 +33,7 @@
 
 import sys
 import inspect
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 
 __version__ = '0.0'
 __all__ = [ 'Parser' ]
@@ -104,31 +104,39 @@ class YaccSymbol:
 
 class YaccProduction:
     def __init__(self, s, stack=None):
-        self.slice = s
-        self.stack = stack
+        self._slice = s
+        self._stack = stack
+        self._namemap = { }
 
     def __getitem__(self, n):
-        if isinstance(n, slice):
-            return [s.value for s in self.slice[n]]
-        elif n >= 0:
-            return self.slice[n].value
+        if n >= 0:
+            return self._slice[n].value
         else:
-            return self.stack[n].value
+            return self._stack[n].value
 
     def __setitem__(self, n, v):
-        self.slice[n].value = v
+        self._slice[n].value = v
 
     def __len__(self):
-        return len(self.slice)
+        return len(self._slice)
 
     def lineno(self, n):
-        return getattr(self.slice[n], 'lineno', 0)
+        return getattr(self._slice[n], 'lineno', 0)
 
     def set_lineno(self, n, lineno):
-        self.slice[n].lineno = lineno
+        self._slice[n].lineno = lineno
 
     def index(self, n):
-        return getattr(self.slice[n], 'index', 0)
+        return getattr(self._slice[n], 'index', 0)
 
+    def __getattr__(self, name):
+        return self._slice[self._namemap[name]].value
+
+    def __setattr__(self, name, value):
+        if name[0:1] == '_' or name not in self._namemap:
+            super().__setattr__(name, value)
+        else:
+            self._slice[self._namemap[name]].value = value
 
 # -----------------------------------------------------------------------------
 # === Grammar Representation ===
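The new __getattr__/__setattr__ pair routes attribute access through _namemap, which maps a symbol name to a position in the slice of the parsing stack. A stripped-down sketch of just the lookup (Sym and Prod are stand-ins, not the real classes):

    class Sym:
        def __init__(self, value):
            self.value = value

    class Prod:
        def __init__(self, slice_, namemap):
            self._slice = slice_       # leading '_' keeps these out of the remapping
            self._namemap = namemap

        def __getattr__(self, name):
            # only called for names not found normally, e.g. p.expression0
            return self._slice[self._namemap[name]].value

    p = Prod([Sym(2), Sym('+'), Sym(3)], {'expression0': 0, 'expression1': 2})
    print(p.expression0, p.expression1)   # 2 3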
@@ -171,17 +179,29 @@ class Production(object):
         self.file = file
         self.line = line
         self.prec = precedence
 
         # Internal settings used during table construction
 
         self.len = len(self.prod)   # Length of the production
 
         # Create a list of unique production symbols used in the production
         self.usyms = []
-        for s in self.prod:
+        symmap = defaultdict(list)
+        for n, s in enumerate(self.prod):
+            symmap[s].append(n)
             if s not in self.usyms:
                 self.usyms.append(s)
 
+        # Create a dict mapping symbol names to indices
+        m = {}
+        for key, indices in symmap.items():
+            if len(indices) == 1:
+                m[key] = indices[0]
+            else:
+                for n, index in enumerate(indices):
+                    m[key+str(n)] = index
+
+        self.namemap = m
 
         # List of all LR items for the production
         self.lr_items = []
         self.lr_next = None
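A worked example of the namemap construction above, for a rule like expression : expression "+" expression (the exact spelling of the literal symbol is illustrative): a symbol that appears once maps under its own name, while repeated symbols get numbered suffixes.

    from collections import defaultdict

    prod = ['expression', '"+"', 'expression']    # right-hand-side symbols

    symmap = defaultdict(list)
    for n, s in enumerate(prod):
        symmap[s].append(n)

    m = {}
    for key, indices in symmap.items():
        if len(indices) == 1:
            m[key] = indices[0]
        else:
            for n, index in enumerate(indices):
                m[key + str(n)] = index

    print(m)   # {'expression0': 0, 'expression1': 2, '"+"': 1}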
@@ -1512,9 +1532,10 @@ def _collect_grammar_rules(func):
         else:
             grammar.append((func, filename, lineno, prodname, syms))
         func = getattr(func, 'next_func', None)
 
     return grammar
 
-class OverloadDict(OrderedDict):
+
+class ParserMetaDict(OrderedDict):
     '''
     Dictionary that allows decorated grammar rule functions to be overloaded
     '''
@@ -1526,13 +1547,11 @@ class OverloadDict(OrderedDict):
 class ParserMeta(type):
     @classmethod
     def __prepare__(meta, *args, **kwargs):
-        d = OverloadDict()
-        def _(*rules):
+        d = ParserMetaDict()
+        def _(rule, *extra):
+            rules = [rule, *extra]
             def decorate(func):
-                if hasattr(func, 'rules'):
-                    func.rules.extend(rules[::-1])
-                else:
-                    func.rules = list(rules[::-1])
+                func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
                 return func
             return decorate
         d['_'] = _
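The parser-side _() gets the same reshaping as the lexer version, and the new one-liner still lets stacked decorations accumulate rules on the function. A quick illustration of the accumulation order, extracted outside of any class:

    def _(rule, *extra):
        rules = [rule, *extra]
        def decorate(func):
            func.rules = [*getattr(func, 'rules', []), *rules[::-1]]
            return func
        return decorate

    @_('expression "+" expression')
    @_('expression "-" expression')
    def expression(p):
        pass

    # decorators apply bottom-up, and each batch is reversed:
    print(expression.rules)
    # ['expression "-" expression', 'expression "+" expression']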
@@ -1788,9 +1807,9 @@ class Parser(metaclass=ParserMeta):
         self.statestack.append(0)
         self.state = 0
 
-    def parse(self, lexer):
+    def parse(self, tokens):
         '''
-        Parse the given input text. lexer is a Lexer object that produces tokens
+        Parse the given input tokens.
         '''
         lookahead = None                         # Current lookahead symbol
         lookaheadstack = []                      # Stack of lookahead symbols
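Under the new signature, parse() consumes a token stream instead of holding a reference to the lexer itself. Presumably usage becomes something like the following (CalcLexer/CalcParser as in the docs example):

    lexer = CalcLexer()
    parser = CalcParser()

    # Old: parser.parse(lexer), with parse() calling iter() internally.
    # New: pass the token stream explicitly.
    result = parser.parse(lexer.tokenize('x = 2 + 3'))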
@@ -1801,10 +1820,6 @@ class Parser(metaclass=ParserMeta):
         pslice = YaccProduction(None)            # Production object passed to grammar rules
         errorcount = 0                           # Used during error recovery
 
-        # Save a local reference of the lexer being used
-        self.lexer = lexer
-        tokens = iter(self.lexer)
-
         # Set up the state and symbol stacks
         self.statestack = statestack = []        # Stack of parsing states
         self.symstack = symstack = []            # Stack of grammar symbols
@@ -1816,7 +1831,6 @@ class Parser(metaclass=ParserMeta):
             # Get the next symbol on the input.  If a lookahead symbol
             # is already set, we just use that. Otherwise, we'll pull
             # the next token off of the lookaheadstack or from the lexer
-
             if self.state not in defaulted_states:
                 if not lookahead:
                     if not lookaheadstack:
@@ -1852,74 +1866,22 @@ class Parser(metaclass=ParserMeta):
                 self.production = p = prod[-t]
                 pname = p.name
                 plen  = p.len
+                pslice._namemap = p.namemap
 
                 # Call the production function
-                sym = YaccSymbol()
-                sym.type = pname       # Production name
-                sym.value = None
-
+                pslice._slice = symstack[-plen:] if plen else []
                 if plen:
-                    targ = symstack[-plen-1:]
-                    targ[0] = sym
-
-                    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-                    # The code enclosed in this section is duplicated
-                    # below as a performance optimization. Make sure
-                    # changes get made in both locations.
-
-                    pslice.slice = targ
-
-                    try:
-                        # Call the grammar rule with our special slice object
-                        del symstack[-plen:]
-                        p.func(self, pslice)
-                        del statestack[-plen:]
-                        symstack.append(sym)
-                        self.state = goto[statestack[-1]][pname]
-                        statestack.append(self.state)
-                    except SyntaxError:
-                        # If an error was set. Enter error recovery state
-                        lookaheadstack.append(lookahead)
-                        symstack.extend(targ[1:-1])
-                        statestack.pop()
-                        self.state = statestack[-1]
-                        sym.type = 'error'
-                        sym.value = 'error'
-                        lookahead = sym
-                        errorcount = ERROR_COUNT
-                        self.errorok = False
-                    continue
-                    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                else:
-
-                    targ = [sym]
-
-                    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-                    # The code enclosed in this section is duplicated
-                    # above as a performance optimization. Make sure
-                    # changes get made in both locations.
-
-                    pslice.slice = targ
-
-                    try:
-                        # Call the grammar rule with our special slice object
-                        p.func(self, pslice)
-                        symstack.append(sym)
-                        self.state = goto[statestack[-1]][pname]
-                        statestack.append(self.state)
-                    except SyntaxError:
-                        # If an error was set. Enter error recovery state
-                        lookaheadstack.append(lookahead)
-                        statestack.pop()
-                        self.state = statestack[-1]
-                        sym.type = 'error'
-                        sym.value = 'error'
-                        lookahead = sym
-                        errorcount = ERROR_COUNT
-                        self.errorok = False
-                    continue
-                    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+                    del symstack[-plen:]
+                    del statestack[-plen:]
+
+                sym = YaccSymbol()
+                sym.type = pname
+                sym.value = p.func(self, pslice)
+                symstack.append(sym)
+                self.state = goto[statestack[-1]][pname]
+                statestack.append(self.state)
+                continue
 
                 if t == 0:
                     n = symstack[-1]