Cleanup. Improvements to lexer pattern decorator

David Beazley 2016-09-06 14:38:40 -05:00
parent 36cf652eae
commit 7bfadaaab3
4 changed files with 132 additions and 61 deletions
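The headline change: the `_` pattern decorator used inside lexer class bodies can now be stacked. When the decorated function already carries a `pattern` attribute, the new pattern is OR-ed onto it (see the `decorate` helper in the lex.py hunk below). A minimal sketch of the resulting idiom; the token name and patterns here are hypothetical, not part of this commit:

from sly import Lexer

class NumberLexer(Lexer):
    tokens = { 'NUMBER' }   # assumed token set for illustration
    ignore = ' \t'

    # Stacked decorators combine into the single regex
    # (0x[0-9a-fA-F]+)|(\d+), tried left to right
    @_(r'0x[0-9a-fA-F]+')
    @_(r'\d+')
    def NUMBER(self, t):
        base = 16 if t.value.startswith('0x') else 10
        t.value = int(t.value, base)
        return t

The `_` name itself is injected by the metaclass's `__prepare__` (shown below), so it is available in the class body without an import.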

View File

@@ -86,11 +86,12 @@ class CalcParser(Parser):
if __name__ == '__main__':
lexer = CalcLexer()
parser = CalcParser()
while True:
try:
s = input('calc > ')
text = input('calc > ')
except EOFError:
break
if s:
parser.parse(CalcLexer(s))
if text:
parser.parse(lexer.tokenize(text))
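The example now builds one lexer up front and feeds each input line through `tokenize()`, which (per the lex.py changes below) is a generator method, replacing the old pattern of constructing a fresh `CalcLexer(text)` per line. A quick sketch of driving it directly; the token names are hypothetical, and the output format follows `Token.__repr__` shown later in this diff:

lexer = CalcLexer()
for tok in lexer.tokenize('3 + 4'):
    print(tok)
# Possible output, assuming NUMBER/PLUS tokens with no conversion functions:
#   Token(NUMBER, '3', 1, 0)
#   Token(PLUS, '+', 1, 2)
#   Token(NUMBER, '4', 1, 4)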

setup.py (new executable file, 28 additions)
View File

@@ -0,0 +1,28 @@
try:
from setuptools import setup
except ImportError:
from distutils.core import setup
tests_require = ['pytest']
setup(name = "sly",
description="SLY - Sly Lex Yacc",
long_description = """
SLY is an implementation of lex and yacc for Python 3.
""",
license="""BSD""",
version = "0.0",
author = "David Beazley",
author_email = "dave@dabeaz.com",
maintainer = "David Beazley",
maintainer_email = "dave@dabeaz.com",
url = "https://github.com/dabeaz/sly",
packages = ['sly'],
tests_require = tests_require,
extras_require = {
'test': tests_require,
},
classifiers = [
'Programming Language :: Python :: 3',
]
)

View File

@@ -7,7 +7,7 @@ class NoDupeDict(OrderedDict):
super().__setitem__(key, value)
class RuleMeta(type):
@staticmethod
@classmethod
def __prepare__(meta, *args, **kwargs):
d = NoDupeDict()
def _(rule):
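The `@staticmethod` to `@classmethod` fix matters because Python invokes the hook as `meta.__prepare__(name, bases, **kwds)`; as a classmethod the `meta` parameter correctly binds to the metaclass instead of swallowing the class name. A self-contained illustration of the protocol (names are illustrative):

class Meta(type):
    @classmethod
    def __prepare__(meta, name, bases, **kwargs):
        # Runs before the class body executes; the mapping returned
        # here becomes the namespace the body assigns into.
        print('preparing', name, 'via', meta.__name__)
        return {}

class Demo(metaclass=Meta):
    pass
# prints: preparing Demo via Meta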

View File

@@ -37,8 +37,6 @@ __all__ = ['Lexer']
import re
from collections import OrderedDict
from ._meta import RuleMeta
class LexError(Exception):
'''
Exception raised if an invalid character is encountered and no default
@@ -70,12 +68,43 @@ class Token(object):
def __repr__(self):
return 'Token(%s, %r, %d, %d)' % (self.type, self.value, self.lineno, self.index)
class Lexer(metaclass=RuleMeta):
class NoDupeDict(OrderedDict):
'''
Representation of a single lexing state. This class is automatically constructed
by the RuleDict during class definition.
Special dictionary that prohibits duplicate definitions.
'''
def __setitem__(self, key, value):
if key in self and not isinstance(value, property):
raise AttributeError('Name %s redefined' % (key))
super().__setitem__(key, value)
class LexerMeta(type):
'''
Metaclass for collecting lexing rules
'''
@classmethod
def __prepare__(meta, *args, **kwargs):
d = NoDupeDict()
def _(pattern):
def decorate(func):
if hasattr(func, 'pattern'):
if isinstance(pattern, str):
func.pattern = ''.join(['(', pattern, ')|(', func.pattern, ')'])
else:
func.pattern = b''.join([b'(', pattern, b')|(', func.pattern, b')'])
else:
func.pattern = pattern
return func
return decorate
d['_'] = _
return d
def __new__(meta, clsname, bases, attributes):
del attributes['_']
cls = super().__new__(meta, clsname, bases, attributes)
cls._build(list(attributes.items()))
return cls
class Lexer(metaclass=LexerMeta):
# These attributes may be defined in subclasses
tokens = set()
literals = set()
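A consequence of routing the class body through `NoDupeDict`: redefining a rule name is now an error at class-definition time instead of a silent shadowing. A hedged sketch, with a hypothetical token name:

try:
    class BrokenLexer(Lexer):
        tokens = { 'NAME' }
        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NAME = r'\w+'          # second assignment of the same name
except AttributeError as e:
    print(e)                   # -> Name NAME redefined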
@@ -90,11 +119,6 @@ class Lexer(metaclass=RuleMeta):
_ignored_tokens = set()
_input_type = str
def __init__(self, text, lineno=1, index=0):
self.text = text
self.lineno = lineno
self.index = index
@classmethod
def _collect_rules(cls, definitions):
'''
@@ -102,7 +126,7 @@ class Lexer(metaclass=RuleMeta):
'''
rules = []
for key, value in definitions:
if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'rule'):
if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'pattern'):
rules.append((key, value))
return rules
@@ -130,7 +154,7 @@ class Lexer(metaclass=RuleMeta):
pattern = value
elif callable(value):
pattern = value.rule
pattern = value.pattern
cls._token_funcs[tokname] = value
# Form the regular expression component
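For context on what happens after this point: each per-token component is wrapped into a named group and the groups are joined into one master regex, which is why the tokenizing loop below can recover the token type from `m.lastgroup`. A standalone sketch of that idea (not SLY's literal `_build` code):

import re

parts = [('NUMBER', r'\d+'), ('PLUS', r'\+')]
master = re.compile('|'.join('(?P<%s>%s)' % (name, pat) for name, pat in parts))
m = master.match('42 + 1')
print(m.lastgroup, m.group())   # -> NUMBER 42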
@@ -178,46 +202,55 @@ class Lexer(metaclass=RuleMeta):
raise LexerBuildError("literals specifier not using same type as tokens (%s)" %
cls._input_type.__name__)
def __iter__(self):
text = self.text
index = self.index
def tokenize(self, text, lineno=1, index=0):
# Local copies of frequently used values
_ignored_tokens = self._ignored_tokens
_master_re = self._master_re
_ignore = self.ignore
_token_funcs = self._token_funcs
_literals = self._literals
self.text = text
try:
while True:
try:
if text[index] in self.ignore:
if text[index] in _ignore:
index += 1
continue
except IndexError:
if self.eof:
text = self.eof()
if text is not None:
self.text = text
self.index = 0
index = 0
continue
break
tok = Token()
tok.lineno = self.lineno
tok.lineno = lineno
tok.index = index
m = self._master_re.match(text, index)
m = _master_re.match(text, index)
if m:
index = m.end()
tok.value = m.group()
tok.type = m.lastgroup
if tok.type in self._token_funcs:
if tok.type in _token_funcs:
self.index = index
tok = self._token_funcs[tok.type](self, tok)
self.lineno = lineno
tok = _token_funcs[tok.type](self, tok)
index = self.index
lineno = self.lineno
if not tok:
continue
if tok.type in self._ignored_tokens:
if tok.type in _ignored_tokens:
continue
yield tok
else:
# No match, see if the character is in literals
if text[index] in self._literals:
if text[index] in _literals:
tok.value = text[index]
tok.type = tok.value
index += 1
@@ -225,9 +258,18 @@ class Lexer(metaclass=RuleMeta):
else:
# A lexing error
self.index = index
self.error(self.text[self.index:])
self.lineno = lineno
self.error(text[index:])
index = self.index
lineno = self.lineno
# Set the final state of the lexer before exiting (even if exception)
finally:
self.text = text
self.index = index
self.lineno = lineno
# Default implementations of methods that may be subclassed by users
def error(self, value):
raise LexError("Illegal character %r at index %d" % (value[0], self.index), value)
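Since `tokenize()` now writes its position into `self.index` and `self.lineno` before calling `error()` and reads both back afterward, a subclass can recover by advancing past the bad character rather than letting the default `LexError` escape. A hedged sketch, reusing the `CalcLexer` from the first file in this commit:

class ForgivingCalcLexer(CalcLexer):
    def error(self, value):
        print('Skipping bad character %r at index %d' % (value[0], self.index))
        self.index += 1    # tokenize() resumes from the updated position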