Improvements to lexer inheritance

parent c5659a4465
commit 1251da034a

CHANGES (78 lines changed)
@@ -1,5 +1,83 @@
Version 0.3
-----------

4/1/2018 Support for Lexer inheritance added. For example:

    from sly import Lexer

    class BaseLexer(Lexer):
        tokens = { NAME, NUMBER }
        ignore = ' \t'

        NAME = r'[a-zA-Z]+'
        NUMBER = r'\d+'

    class ChildLexer(BaseLexer):
        tokens = { PLUS, MINUS }

        PLUS = r'\+'
        MINUS = r'-'

In this example, the ChildLexer class gets all of the tokens
from the parent class (BaseLexer) in addition to the new
definitions it adds of its own.

One quirk of Lexer inheritance is that definition order has
an impact on the low-level regular expression matching. By
default, new definitions are always processed AFTER any previous
definitions. You can change this using the before() function
like this:

    class GrandChildLexer(ChildLexer):
        tokens = { PLUSPLUS, MINUSMINUS }

        PLUSPLUS = before(PLUS, r'\+\+')
        MINUSMINUS = before(MINUS, r'--')

In this example, the PLUSPLUS token is checked before the
PLUS token in the base class. Thus, an input text of '++'
will be tokenized as a single PLUSPLUS token, not as two PLUS tokens.

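(Illustrative usage, not part of the commit: a minimal sketch of driving the inherited lexer above, assuming the BaseLexer/ChildLexer/GrandChildLexer definitions from this entry and SLY's usual tokenize() API.)

    lexer = GrandChildLexer()
    for tok in lexer.tokenize('1 ++ 2 -- 3'):
        print(tok.type, tok.value)

    # Expected token stream (PLUSPLUS/MINUSMINUS win over PLUS/MINUS because of before()):
    #   NUMBER '1', PLUSPLUS '++', NUMBER '2', MINUSMINUS '--', NUMBER '3'
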
4/1/2018 Better support for lexing states. Each lexing state can be defined
as a separate class. Use the begin(cls) method to switch to a
different state. For example:

    from sly import Lexer

    class LexerA(Lexer):
        tokens = { NAME, NUMBER, LBRACE }

        ignore = ' \t'

        NAME = r'[a-zA-Z]+'
        NUMBER = r'\d+'
        LBRACE = r'\{'

        def LBRACE(self, t):
            self.begin(LexerB)
            return t

    class LexerB(Lexer):
        tokens = { PLUS, MINUS, RBRACE }

        ignore = ' \t'

        PLUS = r'\+'
        MINUS = r'-'
        RBRACE = r'\}'

        def RBRACE(self, t):
            self.begin(LexerA)
            return t

In this example, LexerA switches to a new state LexerB when
a left brace ({) is encountered. The begin() method causes
the state transition. LexerB switches back to state LexerA
when a right brace (}) is encountered.

As an alternative to the begin() method, you can also use the
push_state(cls) and pop_state() methods. These manage the lexing
states as a stack. The pop_state() method returns to the previous
lexing state.

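(Illustrative sketch, not part of the commit: a stack-based variant of the example above using push_state()/pop_state(); the StackLexerA/StackLexerB names are made up for illustration.)

    from sly import Lexer

    class StackLexerA(Lexer):
        tokens = { NAME, LBRACE }
        ignore = ' \t'

        NAME = r'[a-zA-Z]+'
        LBRACE = r'\{'

        def LBRACE(self, t):
            self.push_state(StackLexerB)   # enter StackLexerB, remembering this state
            return t

    class StackLexerB(Lexer):
        tokens = { NUMBER, RBRACE }
        ignore = ' \t'

        NUMBER = r'\d+'
        RBRACE = r'\}'

        def RBRACE(self, t):
            self.pop_state()               # return to whichever state pushed us
            return t
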
1/27/2018 Tokens no longer have to be specified as strings. For example, you
can now write:

sly/lex.py (151 lines changed)
@@ -80,48 +80,64 @@ class Token(object):

 class TokenStr(str):
     @staticmethod
-    def __new__(cls, value):
+    def __new__(cls, value, key=None, remap=None):
         self = super().__new__(cls, value)
-        if isinstance(value, TokenStr):
-            self.remap = dict(value.remap)
-            self.before = value.before
-        else:
-            self.remap = { }
-            self.before = None
+        self.key = key
+        self.remap = remap
         return self

     # Implementation of TOKEN[value] = NEWTOKEN
     def __setitem__(self, key, value):
-        self.remap[key] = value
+        if self.remap is not None:
+            self.remap[self.key, key] = value

     # Implementation of del TOKEN[value]
     def __delitem__(self, key):
-        del self.remap[key]
+        if self.remap is not None:
+            self.remap[self.key, key] = self.key

+class _Before:
+    def __init__(self, tok, pattern):
+        self.tok = tok
+        self.pattern = pattern

 class LexerMetaDict(dict):
     '''
     Special dictionary that prohibits duplicate definitions in lexer specifications.
     '''
+    def __init__(self):
+        self.before = { }
+        self.delete = [ ]
+        self.remap = { }
+
     def __setitem__(self, key, value):
         if isinstance(value, str):
-            value = TokenStr(value)
+            value = TokenStr(value, key, self.remap)
+
+        if isinstance(value, _Before):
+            self.before[key] = value.tok
+            value = TokenStr(value.pattern, key, self.remap)

         if key in self and not isinstance(value, property):
             prior = self[key]
             if isinstance(prior, str):
                 if callable(value):
                     value.pattern = prior
-                    value.remap = getattr(prior, 'remap', None)
                 else:
-                    pass
-                    # raise AttributeError(f'Name {key} redefined')
+                    raise AttributeError(f'Name {key} redefined')

         super().__setitem__(key, value)

+    def __delitem__(self, key):
+        self.delete.append(key)
+        if key not in self and key.isupper():
+            pass
+        else:
+            return super().__delitem__(key)
+
     def __getitem__(self, key):
         if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
-            return key
+            return TokenStr(key, key, self.remap)
         else:
             return super().__getitem__(key)

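(Illustrative sketch, not part of the diff: the new __delitem__/delete bookkeeping above suggests that a subclass can drop an inherited rule with a plain del statement, which _collect_rules() later filters out. The NoNumberLexer name is hypothetical; BaseLexer comes from the CHANGES example.)

    class NoNumberLexer(BaseLexer):
        tokens = { NAME }
        del NUMBER        # recorded in the metadict's delete list; the NUMBER rule is dropped
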
@@ -144,22 +160,24 @@ class LexerMeta(type):
                 return func
             return decorate

-        def before(tok, pattern):
-            value = TokenStr(pattern)
-            value.before = tok
-            return value
-
         d['_'] = _
-        d['before'] = before
+        d['before'] = _Before
         return d

     def __new__(meta, clsname, bases, attributes):
         del attributes['_']
-        clsattributes = { key: str(val) if isinstance(val, TokenStr) else val
-                          for key, val in attributes.items() }
-        cls = super().__new__(meta, clsname, bases, clsattributes)
-        # Record the original definition environment
-        cls._attributes = attributes
+        del attributes['before']
+
+        # Create attributes for use in the actual class body
+        cls_attributes = { str(key): str(val) if isinstance(val, TokenStr) else val
+                           for key, val in attributes.items() }
+        cls = super().__new__(meta, clsname, bases, cls_attributes)
+
+        # Attach various metadata to the class
+        cls._attributes = dict(attributes)
+        cls._remap = attributes.remap
+        cls._before = attributes.before
+        cls._delete = attributes.delete
         cls._build()
         return cls

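(Illustrative note, not part of the diff: after class creation, the metadata attached here can be inspected on the class; internal attributes shown purely for orientation, using the GrandChildLexer example from CHANGES.)

    print(GrandChildLexer._before)   # roughly {'PLUSPLUS': 'PLUS', 'MINUSMINUS': 'MINUS'}
    print(GrandChildLexer._delete)   # []  (no 'del NAME' statements were used)
    print(GrandChildLexer._remap)    # {}  (no TOKEN['value'] = NEWTOKEN remappings)
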
@@ -170,9 +188,12 @@ class Lexer(metaclass=LexerMeta):
     ignore = ''
     reflags = 0

+    _token_names = set()
     _token_funcs = {}
     _ignored_tokens = set()
     _remapping = {}
+    _delete = {}
+    _remap = {}

     # Internal attributes
     __state_stack = None

@@ -180,36 +201,63 @@ class Lexer(metaclass=LexerMeta):

     @classmethod
     def _collect_rules(cls):
-        '''
-        Collect all of the rules from class definitions that look like tokens
-        '''
-        definitions = list(cls._attributes.items())
+        # Collect all of the rules from class definitions that look like token
+        # information.  There are a few things that govern this:
+        #
+        # 1. Any definition of the form NAME = str is a token if NAME is
+        #    defined in the tokens set.
+        #
+        # 2. Any definition of the form ignore_NAME = str is a rule for an ignored
+        #    token.
+        #
+        # 3. Any function defined with a 'pattern' attribute is treated as a rule.
+        #    Such functions can be created with the @_ decorator or by defining
+        #    a function with the same name as a previously defined string.
+        #
+        # This function is responsible for keeping rules in order.
+
+        # Collect all previous rules from base classes
         rules = []

-        # Collect all of the previous rules from base classes
         for base in cls.__bases__:
             if isinstance(base, LexerMeta):
-                rules.extend(base._collect_rules())
+                rules.extend(base._rules)

+        # Dictionary of previous rules
         existing = dict(rules)

-        for key, value in definitions:
-            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'pattern'):
+        for key, value in cls._attributes.items():
+            if (key in cls._token_names) or key.startswith('ignore_') or hasattr(value, 'pattern'):
+                if callable(value) and not hasattr(value, 'pattern'):
+                    raise LexerBuildError(f"function {value} doesn't have a regex pattern")
+
                 if key in existing:
+                    # The definition matches something that already existed in the base class.
+                    # We replace it, but keep the original ordering
                     n = rules.index((key, existing[key]))
                     rules[n] = (key, value)
                     existing[key] = value
-                elif isinstance(value, TokenStr) and value.before in existing:
-                    n = rules.index((value.before, existing[value.before]))
-                    rules.insert(n, (key, value))
+
+                elif isinstance(value, TokenStr) and key in cls._before:
+                    before = cls._before[key]
+                    if before in existing:
+                        # Position the token before another specified token
+                        n = rules.index((before, existing[before]))
+                        rules.insert(n, (key, value))
+                    else:
+                        # Put at the end of the rule list
+                        rules.append((key, value))
                     existing[key] = value
                 else:
                     rules.append((key, value))
                     existing[key] = value
-            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore'}:
+
+            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore', 'literals'}:
                 raise LexerBuildError(f'{key} does not match a name in tokens')

-        return rules
+        # Apply deletion rules
+        rules = [ (key, value) for key, value in rules if key not in cls._delete ]
+        cls._rules = rules

     @classmethod
     def _build(cls):

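(Illustrative sketch, not part of the diff: one way to see the rule ordering this method produces is to peek at the internal _rules list it now stores, again using the GrandChildLexer example from CHANGES.)

    order = [name for name, _ in GrandChildLexer._rules]
    print(order)
    # Roughly: ['NAME', 'NUMBER', 'PLUSPLUS', 'PLUS', 'MINUSMINUS', 'MINUS']
    # PLUSPLUS and MINUSMINUS were inserted ahead of PLUS and MINUS by the before() handling.
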
@@ -220,24 +268,30 @@ class Lexer(metaclass=LexerMeta):
         if 'tokens' not in vars(cls):
             raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')

+        # Pull definitions created for any parent classes
+        cls._token_names = cls._token_names | set(cls.tokens)
         cls._ignored_tokens = set(cls._ignored_tokens)
         cls._token_funcs = dict(cls._token_funcs)
         cls._remapping = dict(cls._remapping)
-        cls._remapping.update({ key: val.remap for key, val in cls._attributes.items()
-                                if getattr(val, 'remap', None) })

-        # Build a set of all remapped tokens
-        remapped_tokens = set()
-        for toks in cls._remapping.values():
-            remapped_tokens.update(toks.values())
+        for (key, val), newtok in cls._remap.items():
+            if key not in cls._remapping:
+                cls._remapping[key] = {}
+            cls._remapping[key][val] = newtok

-        undefined = remapped_tokens - set(cls.tokens)
+        remapped_toks = set()
+        for d in cls._remapping.values():
+            remapped_toks.update(d.values())
+
+        undefined = remapped_toks - set(cls._token_names)
         if undefined:
             missing = ', '.join(undefined)
             raise LexerBuildError(f'{missing} not included in token(s)')

+        cls._collect_rules()
+
         parts = []
-        for tokname, value in cls._collect_rules():
+        for tokname, value in cls._rules:
             if tokname.startswith('ignore_'):
                 tokname = tokname[7:]
                 cls._ignored_tokens.add(tokname)

@@ -247,9 +301,7 @@ class Lexer(metaclass=LexerMeta):

             elif callable(value):
                 cls._token_funcs[tokname] = value
-                pattern = getattr(value, 'pattern', None)
-                if not pattern:
-                    continue
+                pattern = getattr(value, 'pattern')

             # Form the regular expression component
             part = f'(?P<{tokname}>{pattern})'

@@ -338,6 +390,7 @@ class Lexer(metaclass=LexerMeta):
                     index = m.end()
                     tok.value = m.group()
                     tok.type = m.lastgroup
+
                     if tok.type in _remapping:
                         tok.type = _remapping[tok.type].get(tok.value, tok.type)

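(Illustrative sketch, not part of the diff: the remapping applied above is populated by TOKEN['value'] = NEWTOKEN assignments in a lexer class body. A minimal end-to-end example using that documented syntax; the KeywordLexer name is made up.)

    from sly import Lexer

    class KeywordLexer(Lexer):
        tokens = { NAME, IF, ELSE }
        ignore = ' \t'

        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NAME['if'] = IF        # remap the matched value 'if' to token type IF
        NAME['else'] = ELSE

    for tok in KeywordLexer().tokenize('if x else y'):
        print(tok.type, tok.value)
    # Expected: IF 'if', NAME 'x', ELSE 'else', NAME 'y'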