Improvements to lexer inheritance
CHANGES | 78
@@ -1,5 +1,83 @@
Version 0.3
-----------
4/1/2018   Support for Lexer inheritance added.  For example:

            from sly import Lexer

            class BaseLexer(Lexer):
                tokens = { NAME, NUMBER }
                ignore = ' \t'

                NAME = r'[a-zA-Z]+'
                NUMBER = r'\d+'

            class ChildLexer(BaseLexer):
                tokens = { PLUS, MINUS }
                PLUS = r'\+'
                MINUS = r'-'

           In this example, the ChildLexer class gets all of the tokens
           from the parent class (BaseLexer) in addition to the new
           definitions it adds of its own.
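           For instance (a usage sketch, not part of this commit; the input
           string is made up), tokenizing with ChildLexer yields both the
           inherited and the newly added token types:

            lexer = ChildLexer()
            for tok in lexer.tokenize('abc 123 + 45'):
                print(tok.type, tok.value)
            # Prints NAME abc, NUMBER 123, PLUS +, NUMBER 45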

           One quirk of Lexer inheritance is that definition order has
           an impact on the low-level regular expression parsing.  By
           default new definitions are always processed AFTER any previous
           definitions.  You can change this using the before() function
           like this:

            class GrandChildLexer(ChildLexer):
                tokens = { PLUSPLUS, MINUSMINUS }
                PLUSPLUS = before(PLUS, r'\+\+')
                MINUSMINUS = before(MINUS, r'--')

           In this example, the PLUSPLUS token is checked before the
           PLUS token in the base class.  Thus, an input text of '++'
           will be parsed as a single token PLUSPLUS, not two PLUS tokens.
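           To make the ordering concrete (again a sketch, not part of the
           commit):

            lexer = GrandChildLexer()
            print([tok.type for tok in lexer.tokenize('++ --')])
            # ['PLUSPLUS', 'MINUSMINUS'], not ['PLUS', 'PLUS', 'MINUS', 'MINUS']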

4/1/2018   Better support for lexing states.  Each lexing state can be defined
           as a separate class.  Use the begin(cls) method to switch to a
           different state.  For example:

            from sly import Lexer

            class LexerA(Lexer):
                tokens = { NAME, NUMBER, LBRACE }

                ignore = ' \t'

                NAME = r'[a-zA-Z]+'
                NUMBER = r'\d+'
                LBRACE = r'\{'

                def LBRACE(self, t):
                    self.begin(LexerB)
                    return t

            class LexerB(Lexer):
                tokens = { PLUS, MINUS, RBRACE }

                ignore = ' \t'

                PLUS = r'\+'
                MINUS = r'-'
                RBRACE = r'\}'

                def RBRACE(self, t):
                    self.begin(LexerA)
                    return t

           In this example, LexerA switches to a new state LexerB when
           a left brace ({) is encountered.  The begin() method causes
           the state transition.  LexerB switches back to state LexerA
           when a right brace (}) is encountered.
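           A usage sketch (not part of the commit; the input string is made
           up) showing the state switch in action:

            lexer = LexerA()
            types = [tok.type for tok in lexer.tokenize('abc 123 { + - } def')]
            # ['NAME', 'NUMBER', 'LBRACE', 'PLUS', 'MINUS', 'RBRACE', 'NAME']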

           As an alternative to the begin() method, you can also use the
           push_state(cls) and pop_state() methods.  These manage the lexing
           states as a stack.  The pop_state() method returns to the previous
           lexing state.
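           A sketch of the stacked form (an illustration, not part of the
           commit; the class and token names are made up):

            from sly import Lexer

            class OuterLexer(Lexer):
                tokens = { NAME, LBRACE }
                ignore = ' \t'
                NAME = r'[a-zA-Z]+'
                LBRACE = r'\{'

                def LBRACE(self, t):
                    # Enter InnerLexer, remembering OuterLexer on the stack
                    self.push_state(InnerLexer)
                    return t

            class InnerLexer(Lexer):
                tokens = { NUMBER, RBRACE }
                ignore = ' \t'
                NUMBER = r'\d+'
                RBRACE = r'\}'

                def RBRACE(self, t):
                    # Return to whichever state pushed InnerLexer
                    self.pop_state()
                    return t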

1/27/2018  Tokens no longer have to be specified as strings.   For example, you
           can now write:
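           (The example itself falls outside the hunk shown here; judging from
           the examples above, it presumably refers to writing bare token names,
           e.g.  tokens = { NAME, NUMBER }  rather than  tokens = { 'NAME', 'NUMBER' }.)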

sly/lex.py | 151
@@ -80,48 +80,64 @@ class Token(object):
 
 class TokenStr(str):
     @staticmethod
-    def __new__(cls, value):
+    def __new__(cls, value, key=None, remap=None):
         self = super().__new__(cls, value)
-        if isinstance(value, TokenStr):
-            self.remap = dict(value.remap)
-            self.before = value.before
-        else:
-            self.remap = { }
-            self.before = None
+        self.key = key
+        self.remap = remap
         return self
 
     # Implementation of TOKEN[value] = NEWTOKEN
     def __setitem__(self, key, value):
-        self.remap[key] = value
+        if self.remap is not None:
+            self.remap[self.key, key] = value
 
     # Implementation of del TOKEN[value]
     def __delitem__(self, key):
-        del self.remap[key]
+        if self.remap is not None:
+            self.remap[self.key, key] = self.key
 
+class _Before:
+    def __init__(self, tok, pattern):
+        self.tok = tok
+        self.pattern = pattern
+
 class LexerMetaDict(dict):
     '''
     Special dictionary that prohibits duplicate definitions in lexer specifications.
     '''
+    def __init__(self):
+        self.before = { }
+        self.delete = [ ]
+        self.remap = { }
 
     def __setitem__(self, key, value):
         if isinstance(value, str):
-            value = TokenStr(value)
+            value = TokenStr(value, key, self.remap)
 
+        if isinstance(value, _Before):
+            self.before[key] = value.tok
+            value = TokenStr(value.pattern, key, self.remap)
+
         if key in self and not isinstance(value, property):
             prior = self[key]
             if isinstance(prior, str):
                 if callable(value):
                     value.pattern = prior
+                    value.remap = getattr(prior, 'remap', None)
                 else:
-                    pass
-                    # raise AttributeError(f'Name {key} redefined')
+                    raise AttributeError(f'Name {key} redefined')
 
         super().__setitem__(key, value)
 
     def __delitem__(self, key):
+        self.delete.append(key)
         if key not in self and key.isupper():
             pass
         else:
             return super().__delitem__(key)
 
     def __getitem__(self, key):
         if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
-            return key
+            return TokenStr(key, key, self.remap)
         else:
             return super().__getitem__(key)
@@ -144,22 +160,24 @@ class LexerMeta(type):
                 return func
             return decorate
 
-        def before(tok, pattern):
-            value = TokenStr(pattern)
-            value.before = tok
-            return value
-
         d['_'] = _
-        d['before'] = before
+        d['before'] = _Before
         return d
 
     def __new__(meta, clsname, bases, attributes):
         del attributes['_']
-        clsattributes = { key: str(val) if isinstance(val, TokenStr) else val
-                       for key, val in attributes.items() }
-        cls = super().__new__(meta, clsname, bases, clsattributes)
-        # Record the original definition environment
-        cls._attributes = attributes
+        del attributes['before']
+
+        # Create attributes for use in the actual class body
+        cls_attributes = { str(key): str(val) if isinstance(val, TokenStr) else val
+                           for key, val in attributes.items() }
+        cls = super().__new__(meta, clsname, bases, cls_attributes)
+
+        # Attach various metadata to the class
+        cls._attributes = dict(attributes)
+        cls._remap = attributes.remap
+        cls._before = attributes.before
+        cls._delete = attributes.delete
         cls._build()
         return cls
@@ -170,9 +188,12 @@ class Lexer(metaclass=LexerMeta):
     ignore = ''
     reflags = 0
 
+    _token_names = set()
     _token_funcs = {}
     _ignored_tokens = set()
     _remapping = {}
+    _delete = {}
+    _remap = {}
 
     # Internal attributes
     __state_stack = None
@@ -180,36 +201,63 @@ class Lexer(metaclass=LexerMeta):
 
     @classmethod
     def _collect_rules(cls):
-        '''
-        Collect all of the rules from class definitions that look like tokens
-        '''
-        definitions = list(cls._attributes.items())
+        # Collect all of the rules from class definitions that look like token
+        # information.   There are a few things that govern this:
+        #
+        # 1.  Any definition of the form NAME = str is a token if NAME is
+        #     is defined in the tokens set.
+        #
+        # 2.  Any definition of the form ignore_NAME = str is a rule for an ignored
+        #     token.
+        #
+        # 3.  Any function defined with a 'pattern' attribute is treated as a rule.
+        #     Such functions can be created with the @_ decorator or by defining
+        #     function with the same name as a previously defined string.
+        #
+        # This function is responsible for keeping rules in order.
+
+        # Collect all previous rules from base classes
         rules = []
 
-        # Collect all of the previous rules from base classes
         for base in cls.__bases__:
             if isinstance(base, LexerMeta):
-                rules.extend(base._collect_rules())
+                rules.extend(base._rules)
 
         # Dictionary of previous rules
         existing = dict(rules)
 
-        for key, value in definitions:
-            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'pattern'):
+        for key, value in cls._attributes.items():
+            if (key in cls._token_names) or key.startswith('ignore_') or hasattr(value, 'pattern'):
                 if callable(value) and not hasattr(value, 'pattern'):
                     raise LexerBuildError(f"function {value} doesn't have a regex pattern")
 
                 if key in existing:
                     # The definition matches something that already existed in the base class.
                     # We replace it, but keep the original ordering
                     n = rules.index((key, existing[key]))
                     rules[n] = (key, value)
                     existing[key] = value
-                elif isinstance(value, TokenStr) and value.before in existing:
-                    n = rules.index((value.before, existing[value.before]))
-                    rules.insert(n, (key, value))
+
+                elif isinstance(value, TokenStr) and key in cls._before:
+                    before = cls._before[key]
+                    if before in existing:
+                        # Position the token before another specified token
+                        n = rules.index((before, existing[before]))
+                        rules.insert(n, (key, value))
+                    else:
+                        # Put at the end of the rule list
+                        rules.append((key, value))
                     existing[key] = value
                 else:
                     rules.append((key, value))
                     existing[key] = value
-            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore'}:
+
+            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore', 'literals'}:
                 raise LexerBuildError(f'{key} does not match a name in tokens')
 
-        return rules
+        # Apply deletion rules
+        rules = [ (key, value) for key, value in rules if key not in cls._delete ]
+        cls._rules = rules
 
     @classmethod
     def _build(cls):
@@ -220,24 +268,30 @@ class Lexer(metaclass=LexerMeta):
         if 'tokens' not in vars(cls):
             raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')
 
         # Pull definitions created for any parent classes
+        cls._token_names = cls._token_names | set(cls.tokens)
         cls._ignored_tokens = set(cls._ignored_tokens)
         cls._token_funcs = dict(cls._token_funcs)
         cls._remapping = dict(cls._remapping)
-        cls._remapping.update({ key: val.remap for key, val in cls._attributes.items()
-                           if getattr(val, 'remap', None) })
 
-        # Build a set of all remapped tokens
-        remapped_tokens = set()
-        for toks in cls._remapping.values():
-            remapped_tokens.update(toks.values())
+        for (key, val), newtok in cls._remap.items():
+            if key not in cls._remapping:
+                cls._remapping[key] = {}
+            cls._remapping[key][val] = newtok
 
-        undefined = remapped_tokens - set(cls.tokens)
+        remapped_toks = set()
+        for d in cls._remapping.values():
+            remapped_toks.update(d.values())
+
+        undefined = remapped_toks - set(cls._token_names)
         if undefined:
             missing = ', '.join(undefined)
             raise LexerBuildError(f'{missing} not included in token(s)')
 
+        cls._collect_rules()
+
         parts = []
-        for tokname, value in cls._collect_rules():
+        for tokname, value in cls._rules:
             if tokname.startswith('ignore_'):
                 tokname = tokname[7:]
                 cls._ignored_tokens.add(tokname)
@@ -247,9 +301,7 @@ class Lexer(metaclass=LexerMeta):
 
             elif callable(value):
                 cls._token_funcs[tokname] = value
-                pattern = getattr(value, 'pattern', None)
-                if not pattern:
-                    continue
+                pattern = getattr(value, 'pattern')
 
             # Form the regular expression component
             part = f'(?P<{tokname}>{pattern})'
@@ -338,6 +390,7 @@ class Lexer(metaclass=LexerMeta):
                     index = m.end()
                     tok.value = m.group()
                     tok.type = m.lastgroup
+
                     if tok.type in _remapping:
                         tok.type = _remapping[tok.type].get(tok.value, tok.type)
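For reference, the TOKEN[value] = NEWTOKEN machinery implemented by TokenStr and
the _remapping table above is what lets a lexer specification remap individual
matched values to other token types.  A sketch of the intended usage (the token
names are illustrative, not taken from this commit):

    from sly import Lexer

    class KeywordLexer(Lexer):
        tokens = { NAME, IF, ELSE }
        ignore = ' \t'

        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
        # Remap specific matched values of NAME to other token types
        NAME['if'] = IF
        NAME['else'] = ELSE

A subclass appears to be able to undo an inherited remapping with
del NAME['if'], which is what the __delitem__ path above records.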