Some work in progress on Lexer inheritance. Everything kind of broken

David Beazley 2018-03-30 14:23:34 -05:00
parent 3a0ee0d9c1
commit c5659a4465
2 changed files with 149 additions and 84 deletions


@@ -35,6 +35,7 @@ __version__ = '0.3'
 __all__ = ['Lexer', 'LexerStateChange']
 
 import re
+import copy
 
 class LexError(Exception):
     '''
@@ -79,15 +80,25 @@ class Token(object):
 class TokenStr(str):
     @staticmethod
-    def __new__(cls, value, before=None):
+    def __new__(cls, value):
         self = super().__new__(cls, value)
-        self.remap = { }
-        self.before = before
+        if isinstance(value, TokenStr):
+            self.remap = dict(value.remap)
+            self.before = value.before
+        else:
+            self.remap = { }
+            self.before = None
         return self
 
+    # Implementation of TOKEN[value] = NEWTOKEN
     def __setitem__(self, key, value):
         self.remap[key] = value
 
+    # Implementation of del TOKEN[value]
+    def __delitem__(self, key):
+        del self.remap[key]
+
 class LexerMetaDict(dict):
     '''
     Special dictionary that prohibits duplicate definitions in lexer specifications.
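
Reviewer note: a minimal standalone sketch of what the TokenStr changes in this hunk do. The class body is taken from the hunk; the NAME token and the 'if'/'IF' remapping are invented for illustration:

    class TokenStr(str):
        @staticmethod
        def __new__(cls, value):
            self = super().__new__(cls, value)
            if isinstance(value, TokenStr):
                # Copy-constructor branch: inherit remap/before from the original
                self.remap = dict(value.remap)
                self.before = value.before
            else:
                self.remap = { }
                self.before = None
            return self
        def __setitem__(self, key, value):      # TOKEN[value] = NEWTOKEN
            self.remap[key] = value
        def __delitem__(self, key):             # del TOKEN[value]
            del self.remap[key]

    NAME = TokenStr(r'[a-z]+')
    NAME['if'] = 'IF'                   # remap a NAME match with value 'if' to type IF
    COPY = TokenStr(NAME)               # new branch: remap/before carried over
    assert COPY.remap == {'if': 'IF'} and COPY.before is None
    del NAME['if']                      # new __delitem__ drops the remapping
    assert NAME.remap == {} and COPY.remap == {'if': 'IF'}

The copy constructor matters for inheritance: when a subclass re-binds an inherited pattern, the remapping metadata travels with the new TokenStr instead of being lost.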
@@ -103,12 +114,13 @@ class LexerMetaDict(dict):
                 value.pattern = prior
                 value.remap = getattr(prior, 'remap', None)
             else:
-                raise AttributeError(f'Name {key} redefined')
+                pass
+                # raise AttributeError(f'Name {key} redefined')
         super().__setitem__(key, value)
 
     def __getitem__(self, key):
-        if key not in self and key.isupper() and key[:1] != '_':
+        if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
             return key
         else:
             return super().__getitem__(key)
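
Reviewer note: the __getitem__ change widens the "undefined names echo back as themselves" rule so that ignore_-prefixed names now qualify too. The predicate in isolation, as a standalone sketch:

    def looks_like_token(key):
        return key.split('ignore_')[-1].isupper() and key[:1] != '_'

    assert looks_like_token('NUMBER')            # accepted before and after
    assert looks_like_token('ignore_COMMENT')    # new: fails .isupper(), but the suffix passes
    assert not looks_like_token('tokens')        # ordinary lowercase names still miss
    assert not looks_like_token('_INTERNAL')     # leading underscore still excluded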
@@ -118,8 +130,9 @@ class LexerMeta(type):
     Metaclass for collecting lexing rules
     '''
     @classmethod
-    def __prepare__(meta, *args, **kwargs):
+    def __prepare__(meta, name, bases):
         d = LexerMetaDict()
+
         def _(pattern, *extra):
             patterns = [pattern, *extra]
             def decorate(func):
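
Reviewer note: __prepare__ returning a LexerMetaDict is what makes bare token names usable inside a lexer class body. A toy reduction of the mechanism, with hypothetical Meta/Demo names:

    class DefaultingDict(dict):
        # Same idea as LexerMetaDict.__getitem__ above: echo undefined
        # upper-case names back as strings instead of raising KeyError.
        def __getitem__(self, key):
            if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
                return key
            return super().__getitem__(key)

    class Meta(type):
        @classmethod
        def __prepare__(meta, name, bases):
            return DefaultingDict()

    class Demo(metaclass=Meta):
        tokens = { NUMBER, PLUS }    # undefined names evaluate to themselves

    assert Demo.tokens == {'NUMBER', 'PLUS'}

Class bodies resolve names through the mapping returned by __prepare__, so the overridden __getitem__ intercepts every bare-name lookup.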
@@ -130,17 +143,22 @@ class LexerMeta(type):
                 func.pattern = pattern
                 return func
             return decorate
 
+        def before(tok, pattern):
+            value = TokenStr(pattern)
+            value.before = tok
+            return value
+
         d['_'] = _
+        d['before'] = before
         return d
 
     def __new__(meta, clsname, bases, attributes):
         del attributes['_']
-        remapping = { key: val.remap for key, val in attributes.items()
-                      if getattr(val, 'remap', None) }
         clsattributes = { key: str(val) if isinstance(val, TokenStr) else val
                           for key, val in attributes.items() }
         cls = super().__new__(meta, clsname, bases, clsattributes)
-        cls._remapping = remapping
+
+        # Record the original definition environment
         cls._attributes = attributes
         cls._build()
         return cls
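
Reviewer note: assuming the TokenStr class from the earlier hunk, the new before helper can be exercised on its own; all it does is tag a pattern with the name of the rule it should precede:

    def before(tok, pattern):
        value = TokenStr(pattern)
        value.before = tok
        return value

    PLUSPLUS = before('PLUS', r'\+\+')   # in a class body, bare PLUS resolves to 'PLUS'
    assert str(PLUSPLUS) == r'\+\+' and PLUSPLUS.before == 'PLUS'

The tag is consumed by _collect_rules below, which splices the new rule in ahead of the named one.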
@@ -152,12 +170,13 @@ class Lexer(metaclass=LexerMeta):
     ignore = ''
     reflags = 0
 
-    # These attributes are constructed automatically by the associated metaclass
-    _master_re = None
-    _token_names = set()
-    _literals = set()
-    _token_funcs = { }
+    _token_funcs = {}
     _ignored_tokens = set()
+    _remapping = {}
+
+    # Internal attributes
+    __state_stack = None
+    __set_state = None
 
     @classmethod
     def _collect_rules(cls):
@@ -181,10 +200,12 @@
                 rules[n] = (key, value)
                 existing[key] = value
             elif isinstance(value, TokenStr) and value.before in existing:
-                n = rules.index((key, existing[key]))
+                n = rules.index((value.before, existing[value.before]))
                 rules.insert(n, (key, value))
+                existing[key] = value
             else:
                 rules.append((key, value))
+                existing[key] = value
         elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore'}:
             raise LexerBuildError(f'{key} does not match a name in tokens')
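
Reviewer note: the index() fix is the substantive change here. The old code looked up (key, existing[key]) for a key that is not yet in existing, which can only raise KeyError; the new code locates the rule named by value.before. The splice, isolated on plain data with hypothetical rules:

    rules = [('NUMBER', r'\d+'), ('PLUS', r'\+')]
    existing = dict(rules)

    # A rule tagged before='PLUS' goes in ahead of PLUS, not at the end
    key, value, value_before = 'PLUSPLUS', r'\+\+', 'PLUS'
    n = rules.index((value_before, existing[value_before]))
    rules.insert(n, (key, value))
    existing[key] = value

    assert rules == [('NUMBER', r'\d+'), ('PLUSPLUS', r'\+\+'), ('PLUS', r'\+')]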
@@ -199,19 +220,18 @@
         if 'tokens' not in vars(cls):
             raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')
 
-        # Inherit token names, literals, ignored tokens, and other details
-        # from parent class (if any)
-        cls._token_names = cls._token_names | set(cls.tokens)
-        cls._literals = cls._literals | set(cls.literals)
         cls._ignored_tokens = set(cls._ignored_tokens)
         cls._token_funcs = dict(cls._token_funcs)
+        cls._remapping = dict(cls._remapping)
+        cls._remapping.update({ key: val.remap for key, val in cls._attributes.items()
+                                if getattr(val, 'remap', None) })
 
         # Build a set of all remapped tokens
         remapped_tokens = set()
         for toks in cls._remapping.values():
             remapped_tokens.update(toks.values())
 
-        undefined = remapped_tokens - cls._token_names
+        undefined = remapped_tokens - set(cls.tokens)
        if undefined:
             missing = ', '.join(undefined)
             raise LexerBuildError(f'{missing} not included in token(s)')
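
Reviewer note: _build now merges the parent class's remappings with those captured in this class's definition environment, then validates the targets. The validation step, sketched on hypothetical data:

    _remapping = {'NAME': {'if': 'IF', 'else': 'ELSE'}}   # NAME matches remapped to keywords
    tokens = {'NAME', 'IF'}                               # ELSE forgotten
    remapped_tokens = set()
    for toks in _remapping.values():
        remapped_tokens.update(toks.values())
    undefined = remapped_tokens - set(tokens)
    assert undefined == {'ELSE'}    # _build would raise LexerBuildError here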
@@ -261,16 +281,46 @@
         if not all(isinstance(lit, str) for lit in cls.literals):
             raise LexerBuildError('literals must be specified as strings')
 
+    def begin(self, cls):
+        '''
+        Begin a new lexer state
+        '''
+        assert isinstance(cls, LexerMeta), "state must be a subclass of Lexer"
+        if self.__set_state:
+            self.__set_state(cls)
+        self.__class__ = cls
+
+    def push_state(self, cls):
+        '''
+        Push a new lexer state onto the stack
+        '''
+        if self.__state_stack is None:
+            self.__state_stack = []
+        self.__state_stack.append(type(self))
+        self.begin(cls)
+
+    def pop_state(self):
+        '''
+        Pop a lexer state from the stack
+        '''
+        self.begin(self.__state_stack.pop())
+
     def tokenize(self, text, lineno=1, index=0):
-        while True:
-            # Local copies of frequently used values
-            _ignored_tokens = self._ignored_tokens
-            _master_re = self._master_re
-            _ignore = self.ignore
-            _token_funcs = self._token_funcs
-            _literals = self._literals
-            _remapping = self._remapping
+        _ignored_tokens = _master_re = _ignore = _token_funcs = _literals = _remapping = None
+
+        def _set_state(cls):
+            nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping
+            _ignored_tokens = cls._ignored_tokens
+            _master_re = cls._master_re
+            _ignore = cls.ignore
+            _token_funcs = cls._token_funcs
+            _literals = cls.literals
+            _remapping = cls._remapping
+
+        self.__set_state = _set_state
+        _set_state(type(self))
 
         self.text = text
         try:
             while True:
                 try:
@@ -325,11 +375,6 @@ class Lexer(metaclass=LexerMeta):
                     index = self.index
                     lineno = self.lineno
 
-        except LexerStateChange as e:
-            self.__class__ = e.newstate
-            if e.tok:
-                yield e.tok
-
         # Set the final state of the lexer before exiting (even if exception)
         finally:
             self.text = text
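
Reviewer note: begin() swaps the instance's __class__, and the rebuilt tokenize() exposes its cached locals through the __set_state closure, so a state change mid-stream refreshes them in place; that replaces the deleted LexerStateChange except-block, which used to restart the outer loop. A sketch of the intended usage with hypothetical CalcLexer/BlockLexer classes; per the commit message, this does not run yet against this revision:

    class BlockLexer(Lexer):
        tokens = { NAME, RBRACE }
        ignore = ' \t'
        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

        @_(r'\}')
        def RBRACE(self, t):
            self.pop_state()               # restore whichever state pushed us
            return t

    class CalcLexer(Lexer):
        tokens = { NUMBER, LBRACE }
        ignore = ' \t'
        NUMBER = r'\d+'

        @_(r'\{')
        def LBRACE(self, t):
            self.push_state(BlockLexer)    # switch token rules mid-tokenize
            return t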


@@ -189,3 +189,23 @@ def test_modern_error_return():
     assert vals == [123, ':+-', '+', '-']
     assert lexer.errors == [ ':+-' ]
 
+# Test Lexer Inheritance.  This class should inherit all of the tokens
+# and features of ModernCalcLexer, but add two new tokens to it.  The
+# PLUSPLUS token matches before the PLUS token.
+
+if False:
+    class SubModernCalcLexer(ModernCalcLexer):
+        tokens |= { DOLLAR, PLUSPLUS }
+        DOLLAR = r'\$'
+        PLUSPLUS = r'\+\+'
+        PLUSPLUS.before = PLUS
+
+    def test_lexer_inherit():
+        lexer = SubModernCalcLexer()
+        toks = list(lexer.tokenize('123 + - $ ++ if'))
+        types = [t.type for t in toks]
+        vals = [t.value for t in toks]
+        assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
+        assert vals == [123, '+', '-', '$', '++', 'if']
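
Reviewer note: the new test stays gated behind if False: until the inheritance machinery works. The PLUSPLUS.before = PLUS line exists because the lexer's master regex is a first-match-wins alternation, so a two-character operator must be tried before its one-character prefix:

    import re

    bad  = re.compile(r'(?P<PLUS>\+)|(?P<PLUSPLUS>\+\+)')
    good = re.compile(r'(?P<PLUSPLUS>\+\+)|(?P<PLUS>\+)')
    assert bad.match('++').lastgroup == 'PLUS'         # '++' wrongly split into two PLUS
    assert good.match('++').lastgroup == 'PLUSPLUS'    # matched as one token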