Work in progress. Changes to parser production API

David Beazley 2016-09-08 15:05:03 -05:00
parent 9a1899fa69
commit 05a709aaea
4 changed files with 1869 additions and 182 deletions

File diff suppressed because it is too large


@@ -44,46 +44,45 @@ class CalcParser(Parser):
@_('NAME "=" expression')
def statement(self, p):
self.names[p[1]] = p[3]
self.names[p.NAME] = p.expression
@_('expression')
def statement(self, p):
print(p[1])
print(p.expression)
@_('expression "+" expression',
'expression "-" expression',
'expression "*" expression',
'expression "/" expression')
def expression(self, p):
if p[2] == '+':
p[0] = p[1] + p[3]
elif p[2] == '-':
p[0] = p[1] - p[3]
elif p[2] == '*':
p[0] = p[1] * p[3]
elif p[2] == '/':
p[0] = p[1] / p[3]
if p[1] == '+':
return p.expression0 + p.expression1
elif p[1] == '-':
return p.expression0 - p.expression1
elif p[1] == '*':
return p.expression0 * p.expression1
elif p[1] == '/':
return p.expression0 / p.expression1
@_('"-" expression %prec UMINUS')
def expression(self, p):
p[0] = -p[2]
return -p.expression
@_('"(" expression ")"')
def expression(self, p):
p[0] = p[2]
return p.expression
@_('NUMBER')
def expression(self, p):
p[0] = p[1]
return p.NUMBER
@_('NAME')
def expression(self, p):
try:
p[0] = self.names[p[1]]
return self.names[p.NAME]
except LookupError:
print("Undefined name '%s'" % p[1])
p[0] = 0
print("Undefined name '%s'" % p.NAME)
return 0
if __name__ == '__main__':
lexer = CalcLexer()
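
Two things change for rule bodies in the example above: the production slice is now zero-based over the right-hand side only (as the reduction changes further down in this commit show), and a rule's result is its return value rather than an assignment to p[0]. A sketch of how the positional and named forms line up for the binary-operator rule (illustrative comments only):

# For  expression : expression "+" expression  under the revised API:
#   p[0]  is the left operand, also reachable as p.expression0
#   p[1]  is the operator literal, e.g. '+'
#   p[2]  is the right operand, also reachable as p.expression1
# Symbols that appear only once in a rule (NAME, NUMBER in the other rules
# above) are reachable by their plain name, without a numeric suffix.
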


@@ -68,9 +68,9 @@ class Token(object):
def __repr__(self):
return 'Token(%s, %r, %d, %d)' % (self.type, self.value, self.lineno, self.index)
class NoDupeDict(OrderedDict):
class LexerMetaDict(OrderedDict):
'''
Special dictionary that prohibits duplicate definitions.
Special dictionary that prohibits duplicate definitions in lexer specifications.
'''
def __setitem__(self, key, value):
if key in self and not isinstance(value, property):
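
A rough illustration of what the duplicate check above guards against. The class below is hypothetical and not part of this commit; the second NAME assignment is what LexerMetaDict.__setitem__ is meant to reject at class-definition time:

from sly import Lexer   # assuming the usual package entry point

class BadLexer(Lexer):
    tokens = { 'NAME' }
    NAME = r'[a-z]+'
    NAME = r'[A-Z]+'    # duplicate key in the class body, rejected by the check above
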
@@ -83,17 +83,15 @@ class LexerMeta(type):
'''
@classmethod
def __prepare__(meta, *args, **kwargs):
d = NoDupeDict()
def _(*patterns):
d = LexerMetaDict()
def _(pattern, *extra):
patterns = [pattern, *extra]
def decorate(func):
for pattern in patterns:
if hasattr(func, 'pattern'):
if isinstance(pattern, str):
func.pattern = ''.join(['(', pattern, ')|(', func.pattern, ')'])
else:
func.pattern = b''.join([b'(', pattern, b')|(', func.pattern, b')'])
else:
func.pattern = pattern
pattern = '|'.join('(%s)' % pat for pat in patterns )
if hasattr(func, 'pattern'):
func.pattern = pattern + '|' + func.pattern
else:
func.pattern = pattern
return func
return decorate
d['_'] = _
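
A rough sketch of what the rewritten decorator above does with multiple patterns: each alternative is wrapped in its own group and the pieces are joined with '|', with any pattern already attached to the function appended last. The helper below is illustrative only and lives outside the metaclass machinery:

def combine_patterns(patterns, existing=None):
    # Mirrors the pattern-combining logic of _() above.
    pattern = '|'.join('(%s)' % pat for pat in patterns)
    return pattern if existing is None else pattern + '|' + existing

combine_patterns([r'\+', r'-'])            # -> r'(\+)|(-)'
combine_patterns([r'<='], r'(\+)|(-)')     # -> r'(<=)|(\+)|(-)'
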
@@ -109,7 +107,7 @@ class Lexer(metaclass=LexerMeta):
# These attributes may be defined in subclasses
tokens = set()
literals = set()
ignore = None
ignore = ''
reflags = 0
# These attributes are constructed automatically by the associated metaclass
@@ -118,7 +116,6 @@ class Lexer(metaclass=LexerMeta):
_literals = set()
_token_funcs = { }
_ignored_tokens = set()
_input_type = str
@classmethod
def _collect_rules(cls, definitions):
@@ -151,7 +148,7 @@ class Lexer(metaclass=LexerMeta):
tokname = tokname[7:]
cls._ignored_tokens.add(tokname)
if isinstance(value, (str, bytes)):
if isinstance(value, str):
pattern = value
elif callable(value):
@@ -159,10 +156,7 @@ class Lexer(metaclass=LexerMeta):
cls._token_funcs[tokname] = value
# Form the regular expression component
if isinstance(pattern, str):
part = '(?P<%s>%s)' % (tokname, pattern)
else:
part = b'(?P<%s>%s)' % (tokname.encode('ascii'), pattern)
part = '(?P<%s>%s)' % (tokname, pattern)
# Make sure the individual regex compiles properly
try:
@@ -171,38 +165,24 @@ class Lexer(metaclass=LexerMeta):
raise PatternError('Invalid regex for token %s' % tokname) from e
# Verify that the pattern doesn't match the empty string
if cpat.match(type(pattern)()):
if cpat.match(''):
raise PatternError('Regex for token %s matches empty input' % tokname)
parts.append(part)
# If no parts collected, then no rules to process
if not parts:
return
# Verify that all of the patterns are of the same type
if not all(type(part) == type(parts[0]) for part in parts):
raise LexerBuildError('Tokens are specified using both bytes and strings.')
# Form the master regular expression
if parts and isinstance(parts[0], bytes):
previous = (b'|' + cls._master_re.pattern) if cls._master_re else b''
cls._master_re = re.compile(b'|'.join(parts) + previous, cls.reflags)
cls._input_type = bytes
else:
previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
cls._input_type = str
previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
# Verify that the ignore and literals specifiers match the input type
if cls.ignore is not None and not isinstance(cls.ignore, cls._input_type):
raise LexerBuildError("ignore specifier type doesn't match token types (%s)" %
cls._input_type.__name__)
if not all(isinstance(lit, cls._input_type) for lit in cls.literals):
raise LexerBuildError("literals specifier not using same type as tokens (%s)" %
cls._input_type.__name__)
if not isinstance(cls.ignore, str):
raise LexerBuildError('ignore specifier must be a string')
if not all(isinstance(lit, str) for lit in cls.literals):
raise LexerBuildError("literals must be specified as strings")
def tokenize(self, text, lineno=1, index=0):
# Local copies of frequently used values
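
For illustration, the shape of the master regular expression this now always produces: each token rule becomes a named group, and the groups are joined with '|' so the winning rule can be read back from the match object's lastgroup. The token names and patterns below are hypothetical:

import re

# Hypothetical token rules; real ones come from a Lexer subclass.
parts = [
    r'(?P<NUMBER>\d+)',
    r'(?P<NAME>[a-zA-Z_][a-zA-Z0-9_]*)',
    r'(?P<PLUS>\+)',
]
master_re = re.compile('|'.join(parts))
m = master_re.match('42 + x')
print(m.lastgroup, m.group())   # NUMBER 42
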
@@ -220,11 +200,6 @@ class Lexer(metaclass=LexerMeta):
index += 1
continue
except IndexError:
if self.eof:
text = self.eof()
if text:
index = 0
continue
break
tok = Token()
@@ -270,9 +245,6 @@ class Lexer(metaclass=LexerMeta):
self.index = index
self.lineno = lineno
# Default implementations of methods that may be subclassed by users
# Default implementations of the error handler. May be changed in subclasses
def error(self, value):
raise LexError("Illegal character %r at index %d" % (value[0], self.index), value)
def eof(self):
pass


@@ -33,7 +33,7 @@
import sys
import inspect
from collections import OrderedDict
from collections import OrderedDict, defaultdict
__version__ = '0.0'
__all__ = [ 'Parser' ]
@@ -104,31 +104,39 @@ class YaccSymbol:
class YaccProduction:
def __init__(self, s, stack=None):
self.slice = s
self.stack = stack
self._slice = s
self._stack = stack
self._namemap = { }
def __getitem__(self, n):
if isinstance(n, slice):
return [s.value for s in self.slice[n]]
elif n >= 0:
return self.slice[n].value
if n >= 0:
return self._slice[n].value
else:
return self.stack[n].value
return self._stack[n].value
def __setitem__(self, n, v):
self.slice[n].value = v
self._slice[n].value = v
def __len__(self):
return len(self.slice)
return len(self._slice)
def lineno(self, n):
return getattr(self.slice[n], 'lineno', 0)
return getattr(self._slice[n], 'lineno', 0)
def set_lineno(self, n, lineno):
self.slice[n].lineno = lineno
self._slice[n].lineno = lineno
def index(self, n):
return getattr(self.slice[n], 'index', 0)
return getattr(self._slice[n], 'index', 0)
def __getattr__(self, name):
return self._slice[self._namemap[name]].value
def __setattr__(self, name, value):
if name[0:1] == '_' or name not in self._namemap:
super().__setattr__(name, value)
else:
self._slice[self._namemap[name]].value = value
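
A self-contained sketch of how the new attribute access resolves. Prod and Sym below are stand-ins that mirror the __getattr__/__setattr__ logic above, not the real YaccProduction/YaccSymbol classes; _namemap is the symbol-name-to-index mapping built by Production further down:

class Sym:
    def __init__(self, value):
        self.value = value

class Prod:
    def __init__(self, slice_, namemap):
        self.__dict__['_slice'] = slice_
        self.__dict__['_namemap'] = namemap
    def __getattr__(self, name):
        # Named access goes through the name map to a positional slot.
        return self._slice[self._namemap[name]].value
    def __setattr__(self, name, value):
        self._slice[self._namemap[name]].value = value

p = Prod([Sym('x'), Sym('='), Sym(42)], {'NAME': 0, 'expression': 2})
print(p.NAME)        # 'x'
p.expression = 99    # writes through to the underlying symbol
print(p.expression)  # 99
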
# -----------------------------------------------------------------------------
# === Grammar Representation ===
@@ -173,15 +181,27 @@ class Production(object):
self.prec = precedence
# Internal settings used during table construction
self.len = len(self.prod) # Length of the production
# Create a list of unique production symbols used in the production
self.usyms = []
for s in self.prod:
symmap = defaultdict(list)
for n, s in enumerate(self.prod):
symmap[s].append(n)
if s not in self.usyms:
self.usyms.append(s)
# Create a dict mapping symbol names to indices
m = {}
for key, indices in symmap.items():
if len(indices) == 1:
m[key] = indices[0]
else:
for n, index in enumerate(indices):
m[key+str(n)] = index
self.namemap = m
# List of all LR items for the production
self.lr_items = []
self.lr_next = None
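
The effect of the mapping built above, as a standalone sketch: a symbol that appears once in the right-hand side maps to its index directly, while repeated symbols get numeric suffixes (this is what makes p.expression0 and p.expression1 in the calculator example work). The helper and the literal spelling of '+' are illustrative:

from collections import defaultdict

def build_namemap(prod):
    # Mirrors the namemap construction in Production.__init__ above.
    symmap = defaultdict(list)
    for n, s in enumerate(prod):
        symmap[s].append(n)
    m = {}
    for key, indices in symmap.items():
        if len(indices) == 1:
            m[key] = indices[0]
        else:
            for n, index in enumerate(indices):
                m[key + str(n)] = index
    return m

print(build_namemap(['expression', '+', 'expression']))
# {'expression0': 0, 'expression1': 2, '+': 1}
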
@@ -1512,9 +1532,10 @@ def _collect_grammar_rules(func):
else:
grammar.append((func, filename, lineno, prodname, syms))
func = getattr(func, 'next_func', None)
return grammar
class OverloadDict(OrderedDict):
class ParserMetaDict(OrderedDict):
'''
Dictionary that allows decorated grammar rule functions to be overloaded
'''
@@ -1526,13 +1547,11 @@ class OverloadDict(OrderedDict):
class ParserMeta(type):
@classmethod
def __prepare__(meta, *args, **kwargs):
d = OverloadDict()
def _(*rules):
d = ParserMetaDict()
def _(rule, *extra):
rules = [rule, *extra]
def decorate(func):
if hasattr(func, 'rules'):
func.rules.extend(rules[::-1])
else:
func.rules = list(rules[::-1])
func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
return func
return decorate
d['_'] = _
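
A standalone sketch of what the revised rule decorator accumulates: each application appends its rule strings, in reverse order, after whatever is already attached to the function. The example rule names are illustrative:

def _(rule, *extra):
    # Illustrative copy of the decorator logic above, outside the metaclass.
    rules = [rule, *extra]
    def decorate(func):
        func.rules = [*getattr(func, 'rules', []), *rules[::-1]]
        return func
    return decorate

@_('expression "+" expression',
   'expression "-" expression')
def expression(p):
    pass

print(expression.rules)
# ['expression "-" expression', 'expression "+" expression']
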
@@ -1788,9 +1807,9 @@ class Parser(metaclass=ParserMeta):
self.statestack.append(0)
self.state = 0
def parse(self, lexer):
def parse(self, tokens):
'''
Parse the given input text. lexer is a Lexer object that produces tokens
Parse the given input tokens.
'''
lookahead = None # Current lookahead symbol
lookaheadstack = [] # Stack of lookahead symbols
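
In practice the call site changes accordingly: instead of handing the parser a lexer object, the caller passes a token stream, for example the generator returned by Lexer.tokenize(). A short sketch using the calculator classes from the example above:

lexer = CalcLexer()
parser = CalcParser()
parser.parse(lexer.tokenize('x = 3 + 4 * 5'))   # parse() now consumes tokens directly
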
@@ -1801,10 +1820,6 @@ class Parser(metaclass=ParserMeta):
pslice = YaccProduction(None) # Production object passed to grammar rules
errorcount = 0 # Used during error recovery
# Save a local reference of the lexer being used
self.lexer = lexer
tokens = iter(self.lexer)
# Set up the state and symbol stacks
self.statestack = statestack = [] # Stack of parsing states
self.symstack = symstack = [] # Stack of grammar symbols
@@ -1816,7 +1831,6 @@ class Parser(metaclass=ParserMeta):
# Get the next symbol on the input. If a lookahead symbol
# is already set, we just use that. Otherwise, we'll pull
# the next token off of the lookaheadstack or from the lexer
if self.state not in defaulted_states:
if not lookahead:
if not lookaheadstack:
@@ -1852,74 +1866,22 @@ class Parser(metaclass=ParserMeta):
self.production = p = prod[-t]
pname = p.name
plen = p.len
pslice._namemap = p.namemap
# Call the production function
sym = YaccSymbol()
sym.type = pname # Production name
sym.value = None
pslice._slice = symstack[-plen:] if plen else []
if plen:
targ = symstack[-plen-1:]
targ[0] = sym
del symstack[-plen:]
del statestack[-plen:]
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# The code enclosed in this section is duplicated
# below as a performance optimization. Make sure
# changes get made in both locations.
sym = YaccSymbol()
sym.type = pname
sym.value = p.func(self, pslice)
symstack.append(sym)
pslice.slice = targ
try:
# Call the grammar rule with our special slice object
del symstack[-plen:]
p.func(self, pslice)
del statestack[-plen:]
symstack.append(sym)
self.state = goto[statestack[-1]][pname]
statestack.append(self.state)
except SyntaxError:
# If an error was set. Enter error recovery state
lookaheadstack.append(lookahead)
symstack.extend(targ[1:-1])
statestack.pop()
self.state = statestack[-1]
sym.type = 'error'
sym.value = 'error'
lookahead = sym
errorcount = ERROR_COUNT
self.errorok = False
continue
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
else:
targ = [sym]
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# The code enclosed in this section is duplicated
# above as a performance optimization. Make sure
# changes get made in both locations.
pslice.slice = targ
try:
# Call the grammar rule with our special slice object
p.func(self, pslice)
symstack.append(sym)
self.state = goto[statestack[-1]][pname]
statestack.append(self.state)
except SyntaxError:
# If an error was set. Enter error recovery state
lookaheadstack.append(lookahead)
statestack.pop()
self.state = statestack[-1]
sym.type = 'error'
sym.value = 'error'
lookahead = sym
errorcount = ERROR_COUNT
self.errorok = False
continue
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
self.state = goto[statestack[-1]][pname]
statestack.append(self.state)
continue
if t == 0:
n = symstack[-1]