Some work in progress on Lexer inheritance. Everything kind of broken
parent 3a0ee0d9c1
commit c5659a4465

sly/lex.py (107 lines changed)
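In outline: the metaclass now records the original class dictionary (cls._attributes) instead of computing remappings eagerly, TokenStr gains copy semantics plus a `before` attribute for rule ordering, and _build()/tokenize() are reworked so subclasses inherit tokens, remappings, and rules, with begin()/push_state()/pop_state() switching states by swapping the instance's class. For reference, a minimal sketch of the TOKEN[value] = NEWTOKEN remapping that TokenStr.__setitem__/__delitem__ implement, modeled on SLY's calculator examples (the lexer class and token names here are illustrative, not part of this commit):

from sly import Lexer

class CalcLexer(Lexer):
    tokens = { NAME, IF, ELSE, NUMBER }
    ignore = ' \t'

    # Bare uppercase names resolve to themselves via LexerMetaDict.__getitem__
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    # Token remapping (TOKEN[value] = NEWTOKEN): an identifier whose matched
    # text is 'if' or 'else' is retyped to IF/ELSE instead of NAME
    NAME['if'] = IF
    NAME['else'] = ELSE

    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t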
@@ -35,6 +35,7 @@ __version__ = '0.3'
 __all__ = ['Lexer', 'LexerStateChange']
 
 import re
+import copy
 
 class LexError(Exception):
     '''
@@ -79,15 +80,25 @@ class Token(object):
 
 class TokenStr(str):
     @staticmethod
-    def __new__(cls, value, before=None):
+    def __new__(cls, value):
         self = super().__new__(cls, value)
-        self.remap = { }
-        self.before = before
+        if isinstance(value, TokenStr):
+            self.remap = dict(value.remap)
+            self.before = value.before
+        else:
+            self.remap = { }
+            self.before = None
         return self
 
+    # Implementation of TOKEN[value] = NEWTOKEN
     def __setitem__(self, key, value):
         self.remap[key] = value
 
+    # Implementation of del TOKEN[value]
+    def __delitem__(self, key):
+        del self.remap[key]
+
+
 class LexerMetaDict(dict):
     '''
     Special dictionary that prohibits duplicate definitions in lexer specifications.
@@ -103,12 +114,13 @@ class LexerMetaDict(dict):
                 value.pattern = prior
                 value.remap = getattr(prior, 'remap', None)
             else:
-                raise AttributeError(f'Name {key} redefined')
+                pass
+                # raise AttributeError(f'Name {key} redefined')
 
         super().__setitem__(key, value)
 
     def __getitem__(self, key):
-        if key not in self and key.isupper() and key[:1] != '_':
+        if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
             return key
         else:
             return super().__getitem__(key)
@@ -118,8 +130,9 @@ class LexerMeta(type):
     Metaclass for collecting lexing rules
     '''
    @classmethod
-    def __prepare__(meta, *args, **kwargs):
+    def __prepare__(meta, name, bases):
         d = LexerMetaDict()
 
         def _(pattern, *extra):
             patterns = [pattern, *extra]
             def decorate(func):
@@ -130,17 +143,22 @@ class LexerMeta(type):
                     func.pattern = pattern
                 return func
             return decorate
 
+        def before(tok, pattern):
+            value = TokenStr(pattern)
+            value.before = tok
+            return value
+
         d['_'] = _
+        d['before'] = before
         return d
 
     def __new__(meta, clsname, bases, attributes):
         del attributes['_']
-        remapping = { key: val.remap for key, val in attributes.items()
-                      if getattr(val, 'remap', None) }
         clsattributes = { key: str(val) if isinstance(val, TokenStr) else val
                           for key, val in attributes.items() }
         cls = super().__new__(meta, clsname, bases, clsattributes)
-        cls._remapping = remapping
+        # Record the original definition environment
+        cls._attributes = attributes
         cls._build()
         return cls
@@ -152,12 +170,13 @@ class Lexer(metaclass=LexerMeta):
     ignore = ''
     reflags = 0
 
-    # These attributes are constructed automatically by the associated metaclass
-    _master_re = None
-    _token_names = set()
-    _literals = set()
     _token_funcs = {}
     _ignored_tokens = set()
+    _remapping = {}
+
+    # Internal attributes
+    __state_stack = None
+    __set_state = None
 
    @classmethod
    def _collect_rules(cls):
@@ -181,10 +200,12 @@ class Lexer(metaclass=LexerMeta):
                     rules[n] = (key, value)
                     existing[key] = value
                 elif isinstance(value, TokenStr) and value.before in existing:
-                    n = rules.index((key, existing[key]))
+                    n = rules.index((value.before, existing[value.before]))
                     rules.insert(n, (key, value))
+                    existing[key] = value
                 else:
                     rules.append((key, value))
+                    existing[key] = value
             elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore'}:
                 raise LexerBuildError(f'{key} does not match a name in tokens')
 
@@ -199,19 +220,18 @@ class Lexer(metaclass=LexerMeta):
         if 'tokens' not in vars(cls):
             raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')
 
-        # Inherit token names, literals, ignored tokens, and other details
-        # from parent class (if any)
-        cls._token_names = cls._token_names | set(cls.tokens)
-        cls._literals = cls._literals | set(cls.literals)
         cls._ignored_tokens = set(cls._ignored_tokens)
         cls._token_funcs = dict(cls._token_funcs)
+        cls._remapping = dict(cls._remapping)
+        cls._remapping.update({ key: val.remap for key, val in cls._attributes.items()
+                                if getattr(val, 'remap', None) })
 
         # Build a set of all remapped tokens
         remapped_tokens = set()
         for toks in cls._remapping.values():
             remapped_tokens.update(toks.values())
 
-        undefined = remapped_tokens - cls._token_names
+        undefined = remapped_tokens - set(cls.tokens)
         if undefined:
             missing = ', '.join(undefined)
             raise LexerBuildError(f'{missing} not included in token(s)')
@@ -261,16 +281,46 @@ class Lexer(metaclass=LexerMeta):
         if not all(isinstance(lit, str) for lit in cls.literals):
             raise LexerBuildError('literals must be specified as strings')
 
+    def begin(self, cls):
+        '''
+        Begin a new lexer state
+        '''
+        assert isinstance(cls, LexerMeta), "state must be a subclass of Lexer"
+        if self.__set_state:
+            self.__set_state(cls)
+        self.__class__ = cls
+
+    def push_state(self, cls):
+        '''
+        Push a new lexer state onto the stack
+        '''
+        if self.__state_stack is None:
+            self.__state_stack = []
+        self.__state_stack.append(type(self))
+        self.begin(cls)
+
+    def pop_state(self):
+        '''
+        Pop a lexer state from the stack
+        '''
+        self.begin(self.__state_stack.pop())
+
     def tokenize(self, text, lineno=1, index=0):
-        while True:
-            # Local copies of frequently used values
-            _ignored_tokens = self._ignored_tokens
-            _master_re = self._master_re
-            _ignore = self.ignore
-            _token_funcs = self._token_funcs
-            _literals = self._literals
-            _remapping = self._remapping
+        _ignored_tokens = _master_re = _ignore = _token_funcs = _literals = _remapping = None
+
+        def _set_state(cls):
+            nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping
+            _ignored_tokens = cls._ignored_tokens
+            _master_re = cls._master_re
+            _ignore = cls.ignore
+            _token_funcs = cls._token_funcs
+            _literals = cls.literals
+            _remapping = cls._remapping
+
+        self.__set_state = _set_state
+        _set_state(type(self))
         self.text = text
 
         try:
             while True:
                 try:
@@ -325,11 +375,6 @@ class Lexer(metaclass=LexerMeta):
                     index = self.index
                     lineno = self.lineno
 
-            except LexerStateChange as e:
-                self.__class__ = e.newstate
-                if e.tok:
-                    yield e.tok
-
         # Set the final state of the lexer before exiting (even if exception)
         finally:
             self.text = text
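The begin()/push_state()/pop_state() methods added above model lexer states as Lexer subclasses: begin() swaps self.__class__ and, mid-tokenize, re-primes the local rule tables through the _set_state hook. A rough usage sketch under that assumption (MainLexer/BlockLexer are hypothetical names in the style of SLY's state examples, not part of this commit):

from sly import Lexer

class MainLexer(Lexer):
    tokens = { NAME, LBRACE }
    ignore = ' \t'
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    @_(r'\{')
    def LBRACE(self, t):
        # Enter the block state; the current state is remembered on a stack
        self.push_state(BlockLexer)
        return t

class BlockLexer(Lexer):
    tokens = { NUMBER, RBRACE }
    ignore = ' \t'

    @_(r'\}')
    def RBRACE(self, t):
        # Return to whatever state was active before the '{'
        self.pop_state()
        return t

    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t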
tests/test_lex.py
@@ -189,3 +189,23 @@ def test_modern_error_return():
     assert vals == [123, ':+-', '+', '-']
     assert lexer.errors == [ ':+-' ]
 
+# Test Lexer Inheritance. This class should inherit all of the tokens
+# and features of ModernCalcLexer, but add two new tokens to it. The
+# PLUSPLUS token matches before the PLUS token.
+
+if False:
+    class SubModernCalcLexer(ModernCalcLexer):
+        tokens |= { DOLLAR, PLUSPLUS }
+        DOLLAR = r'\$'
+        PLUSPLUS = r'\+\+'
+        PLUSPLUS.before = PLUS
+
+    def test_lexer_inherit():
+        lexer = SubModernCalcLexer()
+        toks = list(lexer.tokenize('123 + - $ ++ if'))
+        types = [t.type for t in toks]
+        vals = [t.value for t in toks]
+        assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
+        assert vals == [123, '+', '-', '$', '++', 'if']
+
+
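The disabled test spells the ordering constraint by mutating the rule string (PLUSPLUS.before = PLUS). The metaclass also injects a before() helper (d['before'] = before in the diff above), which builds a TokenStr with .before already set, so the same subclass could plausibly be written as:

class SubModernCalcLexer(ModernCalcLexer):
    tokens |= { DOLLAR, PLUSPLUS }
    DOLLAR = r'\$'
    # before(tok, pattern) returns TokenStr(pattern) with .before = tok, so
    # _collect_rules() inserts PLUSPLUS ahead of the inherited PLUS rule
    PLUSPLUS = before(PLUS, r'\+\+')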