Improvements to lexer inheritance

This commit is contained in:
parent c5659a4465
commit 1251da034a

78  CHANGES

@@ -1,5 +1,83 @@
Version 0.3
-----------
4/1/2018 Support for Lexer inheritance added. For example:

             from sly import Lexer

             class BaseLexer(Lexer):
                 tokens = { NAME, NUMBER }
                 ignore = ' \t'

                 NAME = r'[a-zA-Z]+'
                 NUMBER = r'\d+'

             class ChildLexer(BaseLexer):
                 tokens = { PLUS, MINUS }
                 PLUS = r'\+'
                 MINUS = r'-'

         In this example, the ChildLexer class gets all of the tokens
         from the parent class (BaseLexer) in addition to the new
         definitions it adds itself.

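         As a rough usage sketch (assuming the BaseLexer/ChildLexer classes
         above and the standard tokenize() method), the combined token set
         can be exercised like this:

             # Illustrative input only; NAME/NUMBER come from BaseLexer,
             # PLUS/MINUS from ChildLexer itself.
             lexer = ChildLexer()
             for tok in lexer.tokenize('abc 123 + 45 - 6'):
                 print(tok.type, tok.value)
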
         One quirk of Lexer inheritance is that definition order has
         an impact on the low-level regular expression parsing. By
         default new definitions are always processed AFTER any previous
         definitions. You can change this using the before() function
         like this:

             class GrandChildLexer(ChildLexer):
                 tokens = { PLUSPLUS, MINUSMINUS }
                 PLUSPLUS = before(PLUS, r'\+\+')
                 MINUSMINUS = before(MINUS, r'--')

         In this example, the PLUSPLUS token is checked before the
         PLUS token in the base class. Thus, an input text of '++'
         will be parsed as a single token PLUSPLUS, not two PLUS tokens.

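         As a minimal sketch of the expected effect (assuming the
         GrandChildLexer class above):

             toks = list(GrandChildLexer().tokenize('++'))
             print([t.type for t in toks])    # expected: ['PLUSPLUS']
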
4/1/2018 Better support for lexing states. Each lexing state can be defined
         as a separate class. Use the begin(cls) method to switch to a
         different state. For example:

             from sly import Lexer

             class LexerA(Lexer):
                 tokens = { NAME, NUMBER, LBRACE }

                 ignore = ' \t'

                 NAME = r'[a-zA-Z]+'
                 NUMBER = r'\d+'
                 LBRACE = r'\{'

                 def LBRACE(self, t):
                     self.begin(LexerB)
                     return t

             class LexerB(Lexer):
                 tokens = { PLUS, MINUS, RBRACE }

                 ignore = ' \t'

                 PLUS = r'\+'
                 MINUS = r'-'
                 RBRACE = r'\}'

                 def RBRACE(self, t):
                     self.begin(LexerA)
                     return t

         In this example, LexerA switches to a new state LexerB when
         a left brace ({) is encountered. The begin() method causes
         the state transition. LexerB switches back to state LexerA
         when a right brace (}) is encountered.

         As an alternative to the begin() method, you can also use the
         push_state(cls) and pop_state() methods. These manage the lexing
         states as a stack. The pop_state() method will return back to the
         previous lexing state.

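         A minimal sketch of the stack-based form (an adaptation of the
         LexerA/LexerB example above, shown only for illustration):

             from sly import Lexer

             class LexerA(Lexer):
                 tokens = { NAME, LBRACE }
                 ignore = ' \t'
                 NAME = r'[a-zA-Z]+'
                 LBRACE = r'\{'

                 def LBRACE(self, t):
                     # Enter LexerB, remembering that LexerA was active
                     self.push_state(LexerB)
                     return t

             class LexerB(Lexer):
                 tokens = { NUMBER, RBRACE }
                 ignore = ' \t'
                 NUMBER = r'\d+'
                 RBRACE = r'\}'

                 def RBRACE(self, t):
                     # Return to whichever state pushed LexerB (here, LexerA)
                     self.pop_state()
                     return t
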
1/27/2018 Tokens no longer have to be specified as strings. For example, you
          can now write:

155  sly/lex.py

@@ -80,48 +80,64 @@ class Token(object):

class TokenStr(str):
    @staticmethod
    def __new__(cls, value):
    def __new__(cls, value, key=None, remap=None):
        self = super().__new__(cls, value)
        if isinstance(value, TokenStr):
            self.remap = dict(value.remap)
            self.before = value.before
        else:
            self.remap = { }
            self.before = None
        self.key = key
        self.remap = remap
        return self

    # Implementation of TOKEN[value] = NEWTOKEN
    def __setitem__(self, key, value):
        self.remap[key] = value
        if self.remap is not None:
            self.remap[self.key, key] = value

    # Implementation of del TOKEN[value]
    def __delitem__(self, key):
        del self.remap[key]
        if self.remap is not None:
            self.remap[self.key, key] = self.key

class _Before:
    def __init__(self, tok, pattern):
        self.tok = tok
        self.pattern = pattern

class LexerMetaDict(dict):
    '''
    Special dictionary that prohibits duplicate definitions in lexer specifications.
    '''
    def __init__(self):
        self.before = { }
        self.delete = [ ]
        self.remap = { }

    def __setitem__(self, key, value):
        if isinstance(value, str):
            value = TokenStr(value)
            value = TokenStr(value, key, self.remap)

        if isinstance(value, _Before):
            self.before[key] = value.tok
            value = TokenStr(value.pattern, key, self.remap)

        if key in self and not isinstance(value, property):
            prior = self[key]
            if isinstance(prior, str):
                if callable(value):
                    value.pattern = prior
                    value.remap = getattr(prior, 'remap', None)
                else:
                    pass
                    # raise AttributeError(f'Name {key} redefined')
                    raise AttributeError(f'Name {key} redefined')

        super().__setitem__(key, value)

    def __delitem__(self, key):
        self.delete.append(key)
        if key not in self and key.isupper():
            pass
        else:
            return super().__delitem__(key)

    def __getitem__(self, key):
        if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
            return key
            return TokenStr(key, key, self.remap)
        else:
            return super().__getitem__(key)

@@ -144,22 +160,24 @@ class LexerMeta(type):
                return func
            return decorate

        def before(tok, pattern):
            value = TokenStr(pattern)
            value.before = tok
            return value

        d['_'] = _
        d['before'] = before
        d['before'] = _Before
        return d

    def __new__(meta, clsname, bases, attributes):
        del attributes['_']
        clsattributes = { key: str(val) if isinstance(val, TokenStr) else val
                          for key, val in attributes.items() }
        cls = super().__new__(meta, clsname, bases, clsattributes)
        # Record the original definition environment
        cls._attributes = attributes
        del attributes['before']

        # Create attributes for use in the actual class body
        cls_attributes = { str(key): str(val) if isinstance(val, TokenStr) else val
                           for key, val in attributes.items() }
        cls = super().__new__(meta, clsname, bases, cls_attributes)

        # Attach various metadata to the class
        cls._attributes = dict(attributes)
        cls._remap = attributes.remap
        cls._before = attributes.before
        cls._delete = attributes.delete
        cls._build()
        return cls

@@ -169,10 +187,13 @@ class Lexer(metaclass=LexerMeta):
    literals = set()
    ignore = ''
    reflags = 0

    _token_names = set()
    _token_funcs = {}
    _ignored_tokens = set()
    _remapping = {}
    _delete = {}
    _remap = {}

    # Internal attributes
    __state_stack = None

@@ -180,36 +201,63 @@ class Lexer(metaclass=LexerMeta):

    @classmethod
    def _collect_rules(cls):
        '''
        Collect all of the rules from class definitions that look like tokens
        '''
        definitions = list(cls._attributes.items())
        # Collect all of the rules from class definitions that look like token
        # information. There are a few things that govern this:
        #
        # 1. Any definition of the form NAME = str is a token if NAME
        #    is defined in the tokens set.
        #
        # 2. Any definition of the form ignore_NAME = str is a rule for an ignored
        #    token.
        #
        # 3. Any function defined with a 'pattern' attribute is treated as a rule.
        #    Such functions can be created with the @_ decorator or by defining
        #    a function with the same name as a previously defined string.
        #
        # This function is responsible for keeping rules in order.

        # Collect all previous rules from base classes
        rules = []

        # Collect all of the previous rules from base classes
        for base in cls.__bases__:
            if isinstance(base, LexerMeta):
                rules.extend(base._collect_rules())
                rules.extend(base._rules)

        # Dictionary of previous rules
        existing = dict(rules)

        for key, value in definitions:
            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'pattern'):
        for key, value in cls._attributes.items():
            if (key in cls._token_names) or key.startswith('ignore_') or hasattr(value, 'pattern'):
                if callable(value) and not hasattr(value, 'pattern'):
                    raise LexerBuildError(f"function {value} doesn't have a regex pattern")

                if key in existing:
                    # The definition matches something that already existed in the base class.
                    # We replace it, but keep the original ordering
                    n = rules.index((key, existing[key]))
                    rules[n] = (key, value)
                    existing[key] = value
                elif isinstance(value, TokenStr) and value.before in existing:
                    n = rules.index((value.before, existing[value.before]))
                    rules.insert(n, (key, value))

                elif isinstance(value, TokenStr) and key in cls._before:
                    before = cls._before[key]
                    if before in existing:
                        # Position the token before another specified token
                        n = rules.index((before, existing[before]))
                        rules.insert(n, (key, value))
                    else:
                        # Put at the end of the rule list
                        rules.append((key, value))
                    existing[key] = value
                else:
                    rules.append((key, value))
                    existing[key] = value
            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore'}:

            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore', 'literals'}:
                raise LexerBuildError(f'{key} does not match a name in tokens')

        return rules
        # Apply deletion rules
        rules = [ (key, value) for key, value in rules if key not in cls._delete ]
        cls._rules = rules

    @classmethod
    def _build(cls):

@@ -220,24 +268,30 @@ class Lexer(metaclass=LexerMeta):
        if 'tokens' not in vars(cls):
            raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')

        # Pull definitions created for any parent classes
        cls._token_names = cls._token_names | set(cls.tokens)
        cls._ignored_tokens = set(cls._ignored_tokens)
        cls._token_funcs = dict(cls._token_funcs)
        cls._remapping = dict(cls._remapping)
        cls._remapping.update({ key: val.remap for key, val in cls._attributes.items()
                                if getattr(val, 'remap', None) })

        # Build a set of all remapped tokens
        remapped_tokens = set()
        for toks in cls._remapping.values():
            remapped_tokens.update(toks.values())
        for (key, val), newtok in cls._remap.items():
            if key not in cls._remapping:
                cls._remapping[key] = {}
            cls._remapping[key][val] = newtok

        undefined = remapped_tokens - set(cls.tokens)
        remapped_toks = set()
        for d in cls._remapping.values():
            remapped_toks.update(d.values())

        undefined = remapped_toks - set(cls._token_names)
        if undefined:
            missing = ', '.join(undefined)
            raise LexerBuildError(f'{missing} not included in token(s)')

        cls._collect_rules()

        parts = []
        for tokname, value in cls._collect_rules():
        for tokname, value in cls._rules:
            if tokname.startswith('ignore_'):
                tokname = tokname[7:]
                cls._ignored_tokens.add(tokname)

@@ -247,9 +301,7 @@ class Lexer(metaclass=LexerMeta):

            elif callable(value):
                cls._token_funcs[tokname] = value
                pattern = getattr(value, 'pattern', None)
                if not pattern:
                    continue
                pattern = getattr(value, 'pattern')

            # Form the regular expression component
            part = f'(?P<{tokname}>{pattern})'

@@ -338,6 +390,7 @@ class Lexer(metaclass=LexerMeta):
                    index = m.end()
                    tok.value = m.group()
                    tok.type = m.lastgroup

                    if tok.type in _remapping:
                        tok.type = _remapping[tok.type].get(tok.value, tok.type)