From 1251da034a0aeb5b2d0ecbf1441806a459bc82b7 Mon Sep 17 00:00:00 2001
From: David Beazley
Date: Sun, 1 Apr 2018 20:06:27 -0500
Subject: [PATCH] Improvements to lexer inheritance

---
 CHANGES    |  78 +++++++++++++++++++++++++++
 sly/lex.py | 155 +++++++++++++++++++++++++++++++++++------------------
 2 files changed, 182 insertions(+), 51 deletions(-)

diff --git a/CHANGES b/CHANGES
index 0b4e38c..4a1e7e5 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,83 @@
 Version 0.3
 -----------
+4/1/2018  Support for Lexer inheritance added.  For example:
+
+              from sly import Lexer
+
+              class BaseLexer(Lexer):
+                  tokens = { NAME, NUMBER }
+                  ignore = ' \t'
+
+                  NAME = r'[a-zA-Z]+'
+                  NUMBER = r'\d+'
+
+
+              class ChildLexer(BaseLexer):
+                  tokens = { PLUS, MINUS }
+                  PLUS = r'\+'
+                  MINUS = r'-'
+
+          In this example, the ChildLexer class gets all of the tokens
+          from the parent class (BaseLexer) in addition to the new
+          definitions it adds of its own.
+
+          One quirk of Lexer inheritance is that definition order affects
+          the order in which the low-level regular expressions are tried.
+          By default, new definitions are always processed AFTER any
+          previous definitions.  You can change this using the before()
+          function like this:
+
+              class GrandChildLexer(ChildLexer):
+                  tokens = { PLUSPLUS, MINUSMINUS }
+                  PLUSPLUS = before(PLUS, r'\+\+')
+                  MINUSMINUS = before(MINUS, r'--')
+
+          In this example, the PLUSPLUS token is checked before the
+          PLUS token in the base class.  Thus, an input text of '++'
+          will be parsed as a single PLUSPLUS token, not two PLUS tokens.
+
+4/1/2018  Better support for lexing states.  Each lexing state can be
+          defined as a separate class.  Use the begin(cls) method to
+          switch to a different state.  For example:
+
+              from sly import Lexer
+
+              class LexerA(Lexer):
+                  tokens = { NAME, NUMBER, LBRACE }
+
+                  ignore = ' \t'
+
+                  NAME = r'[a-zA-Z]+'
+                  NUMBER = r'\d+'
+                  LBRACE = r'\{'
+
+                  def LBRACE(self, t):
+                      self.begin(LexerB)
+                      return t
+
+              class LexerB(Lexer):
+                  tokens = { PLUS, MINUS, RBRACE }
+
+                  ignore = ' \t'
+
+                  PLUS = r'\+'
+                  MINUS = r'-'
+                  RBRACE = r'\}'
+
+                  def RBRACE(self, t):
+                      self.begin(LexerA)
+                      return t
+
+          In this example, LexerA switches to a new state LexerB when
+          a left brace ({) is encountered.  The begin() method causes
+          the state transition.  LexerB switches back to state LexerA
+          when a right brace (}) is encountered.
+
+          As an alternative to the begin() method, you can also use the
+          push_state(cls) and pop_state() methods.  These manage the
+          lexing states as a stack.  The pop_state() method returns to
+          the previous lexing state.
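+
+          For instance, here is a minimal sketch (a hypothetical variation
+          on the LexerA/LexerB example above, not code taken from this
+          patch) that uses the state stack instead of begin():
+
+              from sly import Lexer
+
+              class OuterLexer(Lexer):
+                  tokens = { NAME, LBRACE }
+                  ignore = ' \t'
+
+                  NAME = r'[a-zA-Z]+'
+                  LBRACE = r'\{'
+
+                  def LBRACE(self, t):
+                      # Remember the current state and switch to InnerLexer
+                      self.push_state(InnerLexer)
+                      return t
+
+              class InnerLexer(Lexer):
+                  tokens = { NUMBER, RBRACE }
+                  ignore = ' \t'
+
+                  NUMBER = r'\d+'
+                  RBRACE = r'\}'
+
+                  def RBRACE(self, t):
+                      # Return to whichever state pushed InnerLexer
+                      self.pop_state()
+                      return t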
+
 1/27/2018 Tokens no longer have to be specified as strings.
           For example, you can now write:
 
diff --git a/sly/lex.py b/sly/lex.py
index 7c4368a..8cd19e6 100644
--- a/sly/lex.py
+++ b/sly/lex.py
@@ -80,48 +80,64 @@ class Token(object):
 class TokenStr(str):
     @staticmethod
-    def __new__(cls, value):
+    def __new__(cls, value, key=None, remap=None):
         self = super().__new__(cls, value)
-        if isinstance(value, TokenStr):
-            self.remap = dict(value.remap)
-            self.before = value.before
-        else:
-            self.remap = { }
-            self.before = None
+        self.key = key
+        self.remap = remap
         return self
 
     # Implementation of TOKEN[value] = NEWTOKEN
     def __setitem__(self, key, value):
-        self.remap[key] = value
+        if self.remap is not None:
+            self.remap[self.key, key] = value
 
     # Implementation of del TOKEN[value]
     def __delitem__(self, key):
-        del self.remap[key]
+        if self.remap is not None:
+            self.remap[self.key, key] = self.key
+
+class _Before:
+    def __init__(self, tok, pattern):
+        self.tok = tok
+        self.pattern = pattern
 
 class LexerMetaDict(dict):
     '''
     Special dictionary that prohibits duplicate definitions in lexer specifications.
    '''
+    def __init__(self):
+        self.before = { }
+        self.delete = [ ]
+        self.remap = { }
+
     def __setitem__(self, key, value):
         if isinstance(value, str):
-            value = TokenStr(value)
+            value = TokenStr(value, key, self.remap)
+
+        if isinstance(value, _Before):
+            self.before[key] = value.tok
+            value = TokenStr(value.pattern, key, self.remap)
 
         if key in self and not isinstance(value, property):
             prior = self[key]
             if isinstance(prior, str):
                 if callable(value):
                     value.pattern = prior
-                    value.remap = getattr(prior, 'remap', None)
                 else:
-                    pass
-                    # raise AttributeError(f'Name {key} redefined')
+                    raise AttributeError(f'Name {key} redefined')
 
         super().__setitem__(key, value)
 
+    def __delitem__(self, key):
+        self.delete.append(key)
+        if key not in self and key.isupper():
+            pass
+        else:
+            return super().__delitem__(key)
+
     def __getitem__(self, key):
         if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
-            return key
+            return TokenStr(key, key, self.remap)
         else:
             return super().__getitem__(key)
 
@@ -144,22 +160,24 @@ class LexerMeta(type):
                 return func
             return decorate
 
-        def before(tok, pattern):
-            value = TokenStr(pattern)
-            value.before = tok
-            return value
-
         d['_'] = _
-        d['before'] = before
+        d['before'] = _Before
         return d
 
     def __new__(meta, clsname, bases, attributes):
         del attributes['_']
-        clsattributes = { key: str(val) if isinstance(val, TokenStr) else val
-                          for key, val in attributes.items() }
-        cls = super().__new__(meta, clsname, bases, clsattributes)
-        # Record the original definition environment
-        cls._attributes = attributes
+        del attributes['before']
+
+        # Create attributes for use in the actual class body
+        cls_attributes = { str(key): str(val) if isinstance(val, TokenStr) else val
+                           for key, val in attributes.items() }
+        cls = super().__new__(meta, clsname, bases, cls_attributes)
+
+        # Attach various metadata to the class
+        cls._attributes = dict(attributes)
+        cls._remap = attributes.remap
+        cls._before = attributes.before
+        cls._delete = attributes.delete
         cls._build()
         return cls
 
@@ -169,10 +187,13 @@ class Lexer(metaclass=LexerMeta):
     literals = set()
     ignore = ''
     reflags = 0
-
+
+    _token_names = set()
     _token_funcs = {}
     _ignored_tokens = set()
     _remapping = {}
+    _delete = {}
+    _remap = {}
 
     # Internal attributes
     __state_stack = None
@@ -180,36 +201,63 @@ class Lexer(metaclass=LexerMeta):
     @classmethod
     def _collect_rules(cls):
-        '''
-        Collect all of the rules from class definitions that look like tokens
-        '''
-        definitions = list(cls._attributes.items())
+        # Collect all of the rules from class definitions that look like token
+        # information.  There are a few things that govern this:
+        #
+        # 1.  Any definition of the form NAME = str is a token if NAME is
+        #     defined in the tokens set.
+        #
+        # 2.  Any definition of the form ignore_NAME = str is a rule for an ignored
+        #     token.
+        #
+        # 3.  Any function defined with a 'pattern' attribute is treated as a rule.
+        #     Such functions can be created with the @_ decorator or by defining
+        #     a function with the same name as a previously defined string.
+        #
+        # This function is responsible for keeping rules in order.
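+        #
+        # For example (an illustrative sketch only with a hypothetical
+        # CalcLexer, not part of this module), all three kinds of rules
+        # might appear in a single lexer like this:
+        #
+        #     class CalcLexer(Lexer):
+        #         tokens = { NAME, NUMBER }
+        #
+        #         ignore_comment = r'\#.*'            # Rule 2: ignored token
+        #         NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'    # Rule 1: token given as a string
+        #
+        #         @_(r'\d+')                          # Rule 3: function with a pattern
+        #         def NUMBER(self, t):
+        #             t.value = int(t.value)
+        #             return t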
+
+        # Collect all previous rules from base classes
         rules = []
-        # Collect all of the previous rules from base classes
         for base in cls.__bases__:
             if isinstance(base, LexerMeta):
-                rules.extend(base._collect_rules())
-
+                rules.extend(base._rules)
+
+        # Dictionary of previous rules
         existing = dict(rules)
 
-        for key, value in definitions:
-            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'pattern'):
+        for key, value in cls._attributes.items():
+            if (key in cls._token_names) or key.startswith('ignore_') or hasattr(value, 'pattern'):
+                if callable(value) and not hasattr(value, 'pattern'):
+                    raise LexerBuildError(f"function {value} doesn't have a regex pattern")
+
                 if key in existing:
+                    # The definition matches something that already existed in the
+                    # base class.  We replace it, but keep the original ordering.
                     n = rules.index((key, existing[key]))
                     rules[n] = (key, value)
                     existing[key] = value
-                elif isinstance(value, TokenStr) and value.before in existing:
-                    n = rules.index((value.before, existing[value.before]))
-                    rules.insert(n, (key, value))
+
+                elif isinstance(value, TokenStr) and key in cls._before:
+                    before = cls._before[key]
+                    if before in existing:
+                        # Position the token before another specified token
+                        n = rules.index((before, existing[before]))
+                        rules.insert(n, (key, value))
+                    else:
+                        # Put at the end of the rule list
+                        rules.append((key, value))
                     existing[key] = value
                 else:
                     rules.append((key, value))
                     existing[key] = value
-            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore'}:
+
+            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore', 'literals'}:
                 raise LexerBuildError(f'{key} does not match a name in tokens')
 
-        return rules
+        # Apply deletion rules
+        rules = [ (key, value) for key, value in rules if key not in cls._delete ]
+        cls._rules = rules
 
     @classmethod
     def _build(cls):
@@ -220,24 +268,30 @@
         if 'tokens' not in vars(cls):
             raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')
 
+        # Pull definitions created for any parent classes
+        cls._token_names = cls._token_names | set(cls.tokens)
         cls._ignored_tokens = set(cls._ignored_tokens)
         cls._token_funcs = dict(cls._token_funcs)
         cls._remapping = dict(cls._remapping)
-        cls._remapping.update({ key: val.remap for key, val in cls._attributes.items()
-                                if getattr(val, 'remap', None) })
 
-        # Build a set of all remapped tokens
-        remapped_tokens = set()
-        for toks in cls._remapping.values():
-            remapped_tokens.update(toks.values())
+        for (key, val), newtok in cls._remap.items():
+            if key not in cls._remapping:
+                cls._remapping[key] = {}
+            cls._remapping[key][val] = newtok
 
-        undefined = remapped_tokens - set(cls.tokens)
+        remapped_toks = set()
+        for d in cls._remapping.values():
+            remapped_toks.update(d.values())
+
+        undefined = remapped_toks - set(cls._token_names)
         if undefined:
             missing = ', '.join(undefined)
             raise LexerBuildError(f'{missing} not included in token(s)')
 
+        cls._collect_rules()
+
         parts = []
-        for tokname, value in cls._collect_rules():
+        for tokname, value in cls._rules:
             if tokname.startswith('ignore_'):
                 tokname = tokname[7:]
                 cls._ignored_tokens.add(tokname)
@@ -247,9 +301,7 @@
             if isinstance(value, str):
                 pattern = value
 
             elif callable(value):
                 cls._token_funcs[tokname] = value
-                pattern = getattr(value, 'pattern', None)
-                if not pattern:
-                    continue
+                pattern = getattr(value, 'pattern')
 
             # Form the regular expression component
             part = f'(?P<{tokname}>{pattern})'
@@ -338,6 +390,7 @@
                     index = m.end()
                     tok.value = m.group()
                     tok.type = m.lastgroup
+
                     if tok.type in _remapping:
                         tok.type = _remapping[tok.type].get(tok.value, tok.type)
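
The _remap / _remapping handling above reworks the low-level support for
SLY's token remapping syntax (TOKEN[value] = NEWTOKEN) so that remapping
also works with the new inheritance scheme.  As a minimal sketch
(hypothetical token names, not code taken from this patch), a lexer might
remap specific token values like this:

    from sly import Lexer

    class KeywordLexer(Lexer):
        tokens = { NAME, IF, ELSE }
        ignore = ' \t'

        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NAME['if'] = IF          # a NAME whose matched text is 'if' becomes IF
        NAME['else'] = ELSE      # likewise for 'else'

A subclass can write del NAME['if'] to undo such a remapping; the new
TokenStr.__delitem__ records this by mapping the value back to NAME.  A
plain del NAME at class level is recorded in _delete instead, so the rule
is dropped entirely when rules are collected.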