Improvements to lexer inheritance
CHANGES | 78
@@ -1,5 +1,83 @@
Version 0.3
-----------
4/1/2018   Support for Lexer inheritance added.  For example:

            from sly import Lexer

            class BaseLexer(Lexer):
                tokens = { NAME, NUMBER }
                ignore = ' \t'

                NAME = r'[a-zA-Z]+'
                NUMBER = r'\d+'

            class ChildLexer(BaseLexer):
                tokens = { PLUS, MINUS }
                PLUS = r'\+'
                MINUS = r'-'

           In this example, the ChildLexer class gets all of the tokens
           from the parent class (BaseLexer) in addition to the new
           definitions it adds of its own.
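           For instance (a usage sketch, not part of this commit; the input
           string is made up), tokenizing with ChildLexer yields both the
           inherited and the newly added token types:

            lexer = ChildLexer()
            for tok in lexer.tokenize('abc 123 + 45'):
                print(tok.type, tok.value)
            # Prints NAME abc, NUMBER 123, PLUS +, NUMBER 45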

           One quirk of Lexer inheritance is that definition order has
           an impact on the low-level regular expression parsing.  By
           default new definitions are always processed AFTER any previous
           definitions.  You can change this using the before() function
           like this:

            class GrandChildLexer(ChildLexer):
                tokens = { PLUSPLUS, MINUSMINUS }
                PLUSPLUS = before(PLUS, r'\+\+')
                MINUSMINUS = before(MINUS, r'--')

           In this example, the PLUSPLUS token is checked before the
           PLUS token in the base class.  Thus, an input text of '++'
           will be parsed as a single token PLUSPLUS, not two PLUS tokens.
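           To make the ordering concrete (again a sketch, not part of the
           commit):

            lexer = GrandChildLexer()
            print([tok.type for tok in lexer.tokenize('++ --')])
            # ['PLUSPLUS', 'MINUSMINUS'], not ['PLUS', 'PLUS', 'MINUS', 'MINUS']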

4/1/2018   Better support for lexing states.  Each lexing state can be defined
           as a separate class.  Use the begin(cls) method to switch to a
           different state.  For example:

            from sly import Lexer

            class LexerA(Lexer):
                tokens = { NAME, NUMBER, LBRACE }

                ignore = ' \t'

                NAME = r'[a-zA-Z]+'
                NUMBER = r'\d+'
                LBRACE = r'\{'

                def LBRACE(self, t):
                    self.begin(LexerB)
                    return t

            class LexerB(Lexer):
                tokens = { PLUS, MINUS, RBRACE }

                ignore = ' \t'

                PLUS = r'\+'
                MINUS = r'-'
                RBRACE = r'\}'

                def RBRACE(self, t):
                    self.begin(LexerA)
                    return t

           In this example, LexerA switches to a new state LexerB when
           a left brace ({) is encountered.  The begin() method causes
           the state transition.  LexerB switches back to state LexerA
           when a right brace (}) is encountered.
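           A usage sketch (not part of the commit; the input string is made
           up) showing the state switch in action:

            lexer = LexerA()
            types = [tok.type for tok in lexer.tokenize('abc 123 { + - } def')]
            # ['NAME', 'NUMBER', 'LBRACE', 'PLUS', 'MINUS', 'RBRACE', 'NAME']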

           As an alternative to the begin() method, you can also use the
           push_state(cls) and pop_state() methods.  These manage the lexing
           states as a stack.  The pop_state() method returns to the previous
           lexing state.
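           A sketch of the stacked form (an illustration, not part of the
           commit; the class and token names are made up):

            from sly import Lexer

            class OuterLexer(Lexer):
                tokens = { NAME, LBRACE }
                ignore = ' \t'
                NAME = r'[a-zA-Z]+'
                LBRACE = r'\{'

                def LBRACE(self, t):
                    # Enter InnerLexer, remembering OuterLexer on the stack
                    self.push_state(InnerLexer)
                    return t

            class InnerLexer(Lexer):
                tokens = { NUMBER, RBRACE }
                ignore = ' \t'
                NUMBER = r'\d+'
                RBRACE = r'\}'

                def RBRACE(self, t):
                    # Return to whichever state pushed InnerLexer
                    self.pop_state()
                    return t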

1/27/2018  Tokens no longer have to be specified as strings.   For example, you
           can now write:
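           (The example itself falls outside the hunk shown here; judging from
           the examples above, it presumably refers to writing bare token names,
           e.g.  tokens = { NAME, NUMBER }  rather than  tokens = { 'NAME', 'NUMBER' }.)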

sly/lex.py | 151
@@ -80,48 +80,64 @@ class Token(object):
 
 class TokenStr(str):
     @staticmethod
-    def __new__(cls, value):
+    def __new__(cls, value, key=None, remap=None):
         self = super().__new__(cls, value)
-        if isinstance(value, TokenStr):
-            self.remap = dict(value.remap)
-            self.before = value.before
-        else:
-            self.remap = { }
-            self.before = None
+        self.key = key
+        self.remap = remap
         return self
 
     # Implementation of TOKEN[value] = NEWTOKEN
     def __setitem__(self, key, value):
-        self.remap[key] = value
+        if self.remap is not None:
+            self.remap[self.key, key] = value
 
     # Implementation of del TOKEN[value]
     def __delitem__(self, key):
-        del self.remap[key]
+        if self.remap is not None:
+            self.remap[self.key, key] = self.key
 
+class _Before:
+    def __init__(self, tok, pattern):
+        self.tok = tok
+        self.pattern = pattern
+
 class LexerMetaDict(dict):
     '''
     Special dictionary that prohibits duplicate definitions in lexer specifications.
     '''
+    def __init__(self):
+        self.before = { }
+        self.delete = [ ]
+        self.remap = { }
 
     def __setitem__(self, key, value):
         if isinstance(value, str):
-            value = TokenStr(value)
+            value = TokenStr(value, key, self.remap)
 
+        if isinstance(value, _Before):
+            self.before[key] = value.tok
+            value = TokenStr(value.pattern, key, self.remap)
+
         if key in self and not isinstance(value, property):
             prior = self[key]
             if isinstance(prior, str):
                 if callable(value):
                     value.pattern = prior
+                    value.remap = getattr(prior, 'remap', None)
                 else:
-                    pass
-                    # raise AttributeError(f'Name {key} redefined')
+                    raise AttributeError(f'Name {key} redefined')
 
         super().__setitem__(key, value)
 
     def __delitem__(self, key):
+        self.delete.append(key)
         if key not in self and key.isupper():
             pass
         else:
             return super().__delitem__(key)
 
     def __getitem__(self, key):
         if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
-            return key
+            return TokenStr(key, key, self.remap)
         else:
             return super().__getitem__(key)
@@ -144,22 +160,24 @@ class LexerMeta(type):
                 return func
             return decorate
 
-        def before(tok, pattern):
-            value = TokenStr(pattern)
-            value.before = tok
-            return value
-
         d['_'] = _
-        d['before'] = before
+        d['before'] = _Before
         return d
 
     def __new__(meta, clsname, bases, attributes):
         del attributes['_']
-        clsattributes = { key: str(val) if isinstance(val, TokenStr) else val
-                       for key, val in attributes.items() }
-        cls = super().__new__(meta, clsname, bases, clsattributes)
-        # Record the original definition environment
-        cls._attributes = attributes
+        del attributes['before']
+
+        # Create attributes for use in the actual class body
+        cls_attributes = { str(key): str(val) if isinstance(val, TokenStr) else val
+                           for key, val in attributes.items() }
+        cls = super().__new__(meta, clsname, bases, cls_attributes)
+
+        # Attach various metadata to the class
+        cls._attributes = dict(attributes)
+        cls._remap = attributes.remap
+        cls._before = attributes.before
+        cls._delete = attributes.delete
         cls._build()
         return cls
@@ -170,9 +188,12 @@ class Lexer(metaclass=LexerMeta):
     ignore = ''
     reflags = 0
 
+    _token_names = set()
     _token_funcs = {}
     _ignored_tokens = set()
     _remapping = {}
+    _delete = {}
+    _remap = {}
 
     # Internal attributes
     __state_stack = None
@@ -180,36 +201,63 @@ class Lexer(metaclass=LexerMeta):
 
     @classmethod
     def _collect_rules(cls):
-        '''
-        Collect all of the rules from class definitions that look like tokens
-        '''
-        definitions = list(cls._attributes.items())
+        # Collect all of the rules from class definitions that look like token
+        # information.   There are a few things that govern this:
+        #
+        # 1.  Any definition of the form NAME = str is a token if NAME is
+        #     is defined in the tokens set.
+        #
+        # 2.  Any definition of the form ignore_NAME = str is a rule for an ignored
+        #     token.
+        #
+        # 3.  Any function defined with a 'pattern' attribute is treated as a rule.
+        #     Such functions can be created with the @_ decorator or by defining
+        #     function with the same name as a previously defined string.
+        #
+        # This function is responsible for keeping rules in order.
+
+        # Collect all previous rules from base classes
         rules = []
 
-        # Collect all of the previous rules from base classes
         for base in cls.__bases__:
             if isinstance(base, LexerMeta):
-                rules.extend(base._collect_rules())
+                rules.extend(base._rules)
 
         # Dictionary of previous rules
         existing = dict(rules)
 
-        for key, value in definitions:
-            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'pattern'):
+        for key, value in cls._attributes.items():
+            if (key in cls._token_names) or key.startswith('ignore_') or hasattr(value, 'pattern'):
                 if callable(value) and not hasattr(value, 'pattern'):
                     raise LexerBuildError(f"function {value} doesn't have a regex pattern")
 
                 if key in existing:
                     # The definition matches something that already existed in the base class.
                     # We replace it, but keep the original ordering
                     n = rules.index((key, existing[key]))
                     rules[n] = (key, value)
                     existing[key] = value
-                elif isinstance(value, TokenStr) and value.before in existing:
-                    n = rules.index((value.before, existing[value.before]))
-                    rules.insert(n, (key, value))
+
+                elif isinstance(value, TokenStr) and key in cls._before:
+                    before = cls._before[key]
+                    if before in existing:
+                        # Position the token before another specified token
+                        n = rules.index((before, existing[before]))
+                        rules.insert(n, (key, value))
+                    else:
+                        # Put at the end of the rule list
+                        rules.append((key, value))
                     existing[key] = value
                 else:
                     rules.append((key, value))
                     existing[key] = value
-            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore'}:
+
+            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore', 'literals'}:
                 raise LexerBuildError(f'{key} does not match a name in tokens')
 
-        return rules
+        # Apply deletion rules
+        rules = [ (key, value) for key, value in rules if key not in cls._delete ]
+        cls._rules = rules
 
     @classmethod
     def _build(cls):
@@ -220,24 +268,30 @@ class Lexer(metaclass=LexerMeta):
         if 'tokens' not in vars(cls):
             raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')
 
         # Pull definitions created for any parent classes
+        cls._token_names = cls._token_names | set(cls.tokens)
         cls._ignored_tokens = set(cls._ignored_tokens)
         cls._token_funcs = dict(cls._token_funcs)
         cls._remapping = dict(cls._remapping)
-        cls._remapping.update({ key: val.remap for key, val in cls._attributes.items()
-                           if getattr(val, 'remap', None) })
 
-        # Build a set of all remapped tokens
-        remapped_tokens = set()
-        for toks in cls._remapping.values():
-            remapped_tokens.update(toks.values())
+        for (key, val), newtok in cls._remap.items():
+            if key not in cls._remapping:
+                cls._remapping[key] = {}
+            cls._remapping[key][val] = newtok
 
-        undefined = remapped_tokens - set(cls.tokens)
+        remapped_toks = set()
+        for d in cls._remapping.values():
+            remapped_toks.update(d.values())
+
+        undefined = remapped_toks - set(cls._token_names)
         if undefined:
             missing = ', '.join(undefined)
             raise LexerBuildError(f'{missing} not included in token(s)')
 
+        cls._collect_rules()
+
         parts = []
-        for tokname, value in cls._collect_rules():
+        for tokname, value in cls._rules:
             if tokname.startswith('ignore_'):
                 tokname = tokname[7:]
                 cls._ignored_tokens.add(tokname)
@@ -247,9 +301,7 @@ class Lexer(metaclass=LexerMeta):
 
             elif callable(value):
                 cls._token_funcs[tokname] = value
-                pattern = getattr(value, 'pattern', None)
-                if not pattern:
-                    continue
+                pattern = getattr(value, 'pattern')
 
             # Form the regular expression component
             part = f'(?P<{tokname}>{pattern})'
@@ -338,6 +390,7 @@ class Lexer(metaclass=LexerMeta):
                     index = m.end()
                     tok.value = m.group()
                     tok.type = m.lastgroup
+
                     if tok.type in _remapping:
                         tok.type = _remapping[tok.type].get(tok.value, tok.type)
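For reference, the TOKEN[value] = NEWTOKEN machinery implemented by TokenStr and
the _remapping table above is what lets a lexer specification remap individual
matched values to other token types.  A sketch of the intended usage (the token
names are illustrative, not taken from this commit):

    from sly import Lexer

    class KeywordLexer(Lexer):
        tokens = { NAME, IF, ELSE }
        ignore = ' \t'

        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
        # Remap specific matched values of NAME to other token types
        NAME['if'] = IF
        NAME['else'] = ELSE

A subclass appears to be able to undo an inherited remapping with
del NAME['if'], which is what the __delitem__ path above records.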