Cleanup. Improvements to lexer pattern decorator
parent 36cf652eae
commit 7bfadaaab3
@@ -86,11 +86,12 @@ class CalcParser(Parser):
 
 if __name__ == '__main__':
+    lexer = CalcLexer()
     parser = CalcParser()
     while True:
         try:
-            s = input('calc > ')
+            text = input('calc > ')
         except EOFError:
             break
-        if s:
-            parser.parse(CalcLexer(s))
+        if text:
+            parser.parse(lexer.tokenize(text))
 
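Not part of the diff, but a hedged sketch of the reworked API this hunk adopts: one lexer instance is reused and its tokenize() method yields Token objects (CalcLexer is assumed to define its tokens as in the example file; the attribute names match Token.__repr__ below):

lexer = CalcLexer()
for tok in lexer.tokenize('3 + 4 * 5'):
    # Each tok carries .type, .value, .lineno, .index
    print(tok.type, tok.value)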
setup.py (new executable file, 28 lines)
@@ -0,0 +1,28 @@
+try:
+    from setuptools import setup
+except ImportError:
+    from distutils.core import setup
+
+tests_require = ['pytest']
+
+setup(name = "sly",
+      description="SLY - Sly Lex Yacc",
+      long_description = """
+SLY is an implementation of lex and yacc for Python 3.
+""",
+      license="""BSD""",
+      version = "0.0",
+      author = "David Beazley",
+      author_email = "dave@dabeaz.com",
+      maintainer = "David Beazley",
+      maintainer_email = "dave@dabeaz.com",
+      url = "https://github.com/dabeaz/sly",
+      packages = ['sly'],
+      tests_require = tests_require,
+      extras_require = {
+          'test': tests_require,
+          },
+      classifiers = [
+          'Programming Language :: Python :: 3',
+          ]
+      )
sly/_meta.py
@@ -7,7 +7,7 @@ class NoDupeDict(OrderedDict):
         super().__setitem__(key, value)
 
 class RuleMeta(type):
-    @staticmethod
+    @classmethod
     def __prepare__(meta, *args, **kwargs):
         d = NoDupeDict()
         def _(rule):
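For readers unfamiliar with __prepare__: Python looks it up on the metaclass and calls it before the class body executes, and the mapping it returns becomes the namespace the body populates — which is how RuleMeta injects '_' and rejects duplicates. A standalone sketch (illustrative names, not sly's API):

class LoggingDict(dict):
    def __setitem__(self, key, value):
        print('defining %s' % key)      # observe every binding in the body
        super().__setitem__(key, value)

class Meta(type):
    @classmethod
    def __prepare__(meta, name, bases, **kwargs):
        return LoggingDict()            # used while the class body executes

    def __new__(meta, name, bases, namespace):
        # __new__ then receives the custom namespace built above
        return super().__new__(meta, name, bases, dict(namespace))

class Demo(metaclass=Meta):
    x = 1    # prints 'defining x' (plus the implicit dunder entries)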
sly/lex.py (92 changes)
@@ -37,8 +37,6 @@ __all__ = ['Lexer']
 import re
 from collections import OrderedDict
 
-from ._meta import RuleMeta
-
 class LexError(Exception):
     '''
     Exception raised if an invalid character is encountered and no default
@@ -70,12 +68,43 @@ class Token(object):
     def __repr__(self):
         return 'Token(%s, %r, %d, %d)' % (self.type, self.value, self.lineno, self.index)
 
-class Lexer(metaclass=RuleMeta):
+class NoDupeDict(OrderedDict):
     '''
-    Representation of a single lexing state. This class is automatically constructed
-    by the RuleDict during class definition.
+    Special dictionary that prohibits duplicate definitions.
     '''
+    def __setitem__(self, key, value):
+        if key in self and not isinstance(value, property):
+            raise AttributeError('Name %s redefined' % (key))
+        super().__setitem__(key, value)
+
+class LexerMeta(type):
+    '''
+    Metaclass for collecting lexing rules
+    '''
+    @classmethod
+    def __prepare__(meta, *args, **kwargs):
+        d = NoDupeDict()
+        def _(pattern):
+            def decorate(func):
+                if hasattr(func, 'pattern'):
+                    if isinstance(pattern, str):
+                        func.pattern = ''.join(['(', pattern, ')|(', func.pattern, ')'])
+                    else:
+                        func.pattern = b''.join([b'(', pattern, b')|(', func.pattern, b')'])
+                else:
+                    func.pattern = pattern
+                return func
+            return decorate
+        d['_'] = _
+        return d
+
+    def __new__(meta, clsname, bases, attributes):
+        del attributes['_']
+        cls = super().__new__(meta, clsname, bases, attributes)
+        cls._build(list(attributes.items()))
+        return cls
+
+class Lexer(metaclass=LexerMeta):
     # These attributes may be defined in subclasses
     tokens = set()
     literals = set()
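The '_' decorator above is the commit's headline change: applying it more than once to the same function now folds the patterns into a single regex alternation (with a parallel bytes branch for byte patterns). A standalone sketch of the string case — the NUMBER function and its patterns are invented for illustration:

def _(pattern):
    def decorate(func):
        if hasattr(func, 'pattern'):
            func.pattern = ''.join(['(', pattern, ')|(', func.pattern, ')'])
        else:
            func.pattern = pattern
        return func
    return decorate

@_(r'0x[0-9a-fA-F]+')    # applied second: wrapped around the existing pattern
@_(r'\d+')               # applied first: sets the initial pattern
def NUMBER(tok):
    return tok

print(NUMBER.pattern)    # (0x[0-9a-fA-F]+)|(\d+)

Since decorators apply bottom-up, the pattern listed last in the source is matched last in the alternation.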
@@ -90,11 +119,6 @@ class Lexer(metaclass=RuleMeta):
     _ignored_tokens = set()
     _input_type = str
 
-    def __init__(self, text, lineno=1, index=0):
-        self.text = text
-        self.lineno = lineno
-        self.index = index
-
     @classmethod
     def _collect_rules(cls, definitions):
         '''
@@ -102,7 +126,7 @@ class Lexer(metaclass=RuleMeta):
         '''
         rules = []
         for key, value in definitions:
-            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'rule'):
+            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'pattern'):
                 rules.append((key, value))
         return rules
 
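As _collect_rules() now shows, a class-body name qualifies as a rule in one of three ways. A hedged illustration (the lexer, token names, and patterns are invented; only the three qualifying conditions come from the diff):

class MyLexer(Lexer):
    tokens = {'NUMBER', 'PLUS'}
    ignore = ' \t'

    NUMBER = r'\d+'             # collected: the name appears in tokens
    ignore_comment = r'\#.*'    # collected: the name starts with 'ignore_'

    @_(r'\+')
    def PLUS(self, t):          # collected: the function carries a .pattern
        return t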
@@ -130,7 +154,7 @@ class Lexer(metaclass=RuleMeta):
             pattern = value
 
         elif callable(value):
-            pattern = value.rule
+            pattern = value.pattern
             cls._token_funcs[tokname] = value
 
         # Form the regular expression component
@@ -178,46 +202,55 @@ class Lexer(metaclass=RuleMeta):
             raise LexerBuildError("literals specifier not using same type as tokens (%s)" %
                                   cls._input_type.__name__)
 
-    def __iter__(self):
-        text = self.text
-        index = self.index
+    def tokenize(self, text, lineno=1, index=0):
+        # Local copies of frequently used values
+        _ignored_tokens = self._ignored_tokens
+        _master_re = self._master_re
+        _ignore = self.ignore
+        _token_funcs = self._token_funcs
+        _literals = self._literals
+
+        self.text = text
+        try:
             while True:
                 try:
-                    if text[index] in self.ignore:
+                    if text[index] in _ignore:
                         index += 1
                         continue
                 except IndexError:
                     if self.eof:
                         text = self.eof()
                         if text is not None:
-                            self.text = text
-                            self.index = 0
                             index = 0
                             continue
                     break
 
                 tok = Token()
-                tok.lineno = self.lineno
+                tok.lineno = lineno
                 tok.index = index
-                m = self._master_re.match(text, index)
+                m = _master_re.match(text, index)
                 if m:
                     index = m.end()
                     tok.value = m.group()
                     tok.type = m.lastgroup
-                    if tok.type in self._token_funcs:
+                    if tok.type in _token_funcs:
                         self.index = index
-                        tok = self._token_funcs[tok.type](self, tok)
+                        self.lineno = lineno
+                        tok = _token_funcs[tok.type](self, tok)
                         index = self.index
+                        lineno = self.lineno
                         if not tok:
                             continue
 
-                    if tok.type in self._ignored_tokens:
+                    if tok.type in _ignored_tokens:
                         continue
 
                     yield tok
 
                 else:
                     # No match, see if the character is in literals
-                    if text[index] in self._literals:
+                    if text[index] in _literals:
                         tok.value = text[index]
                         tok.type = tok.value
                         index += 1
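The rewritten loop calls self.eof() whenever the text runs out and, if it returns more text, resumes lexing from index 0. A hedged sketch of a lexer that feeds input in chunks this way (ChunkLexer and its attributes are invented; the import path assumes the package layout from setup.py above, and the eof hook behaves as the loop shows):

from sly.lex import Lexer

class ChunkLexer(Lexer):
    tokens = {'NUMBER'}
    ignore = ' \t\n'
    NUMBER = r'\d+'

    def __init__(self, chunks):
        self._chunks = iter(chunks)

    def eof(self):
        # Called by tokenize() when text is exhausted; returning a string
        # resumes lexing on it, returning None ends the token stream.
        return next(self._chunks, None)

lexer = ChunkLexer(['12 34 ', '5 678'])
for tok in lexer.tokenize(''):
    print(tok.type, tok.value)    # NUMBER 12, NUMBER 34, NUMBER 5, NUMBER 678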
@@ -225,9 +258,18 @@ class Lexer(metaclass=RuleMeta):
                     else:
                         # A lexing error
                         self.index = index
-                        self.error(self.text[self.index:])
+                        self.lineno = lineno
+                        self.error(text[index:])
                         index = self.index
+                        lineno = self.lineno
+
+        # Set the final state of the lexer before exiting (even if exception)
+        finally:
+            self.text = text
+            self.index = index
+            self.lineno = lineno
 
+    # Default implementations of methods that may be subclassed by users
     def error(self, value):
         raise LexError("Illegal character %r at index %d" % (value[0], self.index), value)
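Because tokenize() now re-reads self.index (and self.lineno) after calling error(), a subclass can recover from bad input instead of raising, e.g. by skipping past the offending character. A hedged sketch (ForgivingLexer and its token are invented):

class ForgivingLexer(Lexer):
    tokens = {'NUMBER'}
    ignore = ' \t'
    NUMBER = r'\d+'

    def error(self, value):
        print('Skipping illegal character %r' % value[0])
        self.index += 1    # tokenize() resumes from the updated index

With input like '12 $ 34' this yields two NUMBER tokens and prints one skip message for the '$'.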