Improvements to lexer inheritance

parent c5659a4465
commit 1251da034a

CHANGES (78 lines changed)
@@ -1,5 +1,83 @@
Version 0.3
-----------

4/1/2018 Support for Lexer inheritance added. For example:

    from sly import Lexer

    class BaseLexer(Lexer):
        tokens = { NAME, NUMBER }
        ignore = ' \t'

        NAME = r'[a-zA-Z]+'
        NUMBER = r'\d+'

    class ChildLexer(BaseLexer):
        tokens = { PLUS, MINUS }

        PLUS = r'\+'
        MINUS = r'-'

In this example, the ChildLexer class gets all of the tokens
from the parent class (BaseLexer) in addition to the new
definitions it adds of its own.

One quirk of Lexer inheritance is that definition order has
an impact on the low-level regular expression matching. By
default, new definitions are always processed AFTER any previous
definitions. You can change this using the before() function
like this:

    class GrandChildLexer(ChildLexer):
        tokens = { PLUSPLUS, MINUSMINUS }

        PLUSPLUS = before(PLUS, r'\+\+')
        MINUSMINUS = before(MINUS, r'--')

In this example, the PLUSPLUS token is checked before the
PLUS token in the base class. Thus, an input text of '++'
will be tokenized as a single PLUSPLUS token, not as two PLUS tokens.

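(Illustrative usage, not part of the commit: a minimal sketch of driving the inherited lexer above, assuming the BaseLexer/ChildLexer/GrandChildLexer definitions from this entry and SLY's usual tokenize() API.)

    lexer = GrandChildLexer()
    for tok in lexer.tokenize('1 ++ 2 -- 3'):
        print(tok.type, tok.value)

    # Expected token stream (PLUSPLUS/MINUSMINUS win over PLUS/MINUS because of before()):
    #   NUMBER '1', PLUSPLUS '++', NUMBER '2', MINUSMINUS '--', NUMBER '3'
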
4/1/2018 Better support for lexing states. Each lexing state can be defined
as a separate class. Use the begin(cls) method to switch to a
different state. For example:

    from sly import Lexer

    class LexerA(Lexer):
        tokens = { NAME, NUMBER, LBRACE }

        ignore = ' \t'

        NAME = r'[a-zA-Z]+'
        NUMBER = r'\d+'
        LBRACE = r'\{'

        def LBRACE(self, t):
            self.begin(LexerB)
            return t

    class LexerB(Lexer):
        tokens = { PLUS, MINUS, RBRACE }

        ignore = ' \t'

        PLUS = r'\+'
        MINUS = r'-'
        RBRACE = r'\}'

        def RBRACE(self, t):
            self.begin(LexerA)
            return t

In this example, LexerA switches to a new state LexerB when
a left brace ({) is encountered. The begin() method causes
the state transition. LexerB switches back to state LexerA
when a right brace (}) is encountered.

As an alternative to the begin() method, you can also use the
push_state(cls) and pop_state() methods. These manage the lexing
states as a stack. The pop_state() method returns to the previous
lexing state.

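(Illustrative sketch, not part of the commit: a stack-based variant of the example above using push_state()/pop_state(); the StackLexerA/StackLexerB names are made up for illustration.)

    from sly import Lexer

    class StackLexerA(Lexer):
        tokens = { NAME, LBRACE }
        ignore = ' \t'

        NAME = r'[a-zA-Z]+'
        LBRACE = r'\{'

        def LBRACE(self, t):
            self.push_state(StackLexerB)   # enter StackLexerB, remembering this state
            return t

    class StackLexerB(Lexer):
        tokens = { NUMBER, RBRACE }
        ignore = ' \t'

        NUMBER = r'\d+'
        RBRACE = r'\}'

        def RBRACE(self, t):
            self.pop_state()               # return to whichever state pushed us
            return t
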
1/27/2018 Tokens no longer have to be specified as strings. For example, you
can now write:

sly/lex.py (151 lines changed)
@@ -80,48 +80,64 @@ class Token(object):

 class TokenStr(str):
     @staticmethod
-    def __new__(cls, value):
+    def __new__(cls, value, key=None, remap=None):
         self = super().__new__(cls, value)
-        if isinstance(value, TokenStr):
-            self.remap = dict(value.remap)
-            self.before = value.before
-        else:
-            self.remap = { }
-            self.before = None
+        self.key = key
+        self.remap = remap
         return self

     # Implementation of TOKEN[value] = NEWTOKEN
     def __setitem__(self, key, value):
-        self.remap[key] = value
+        if self.remap is not None:
+            self.remap[self.key, key] = value

     # Implementation of del TOKEN[value]
     def __delitem__(self, key):
-        del self.remap[key]
+        if self.remap is not None:
+            self.remap[self.key, key] = self.key

+class _Before:
+    def __init__(self, tok, pattern):
+        self.tok = tok
+        self.pattern = pattern

 class LexerMetaDict(dict):
     '''
     Special dictionary that prohibits duplicate definitions in lexer specifications.
     '''
+    def __init__(self):
+        self.before = { }
+        self.delete = [ ]
+        self.remap = { }
+
     def __setitem__(self, key, value):
         if isinstance(value, str):
-            value = TokenStr(value)
+            value = TokenStr(value, key, self.remap)
+
+        if isinstance(value, _Before):
+            self.before[key] = value.tok
+            value = TokenStr(value.pattern, key, self.remap)

         if key in self and not isinstance(value, property):
             prior = self[key]
             if isinstance(prior, str):
                 if callable(value):
                     value.pattern = prior
-                    value.remap = getattr(prior, 'remap', None)
                 else:
-                    pass
-                    # raise AttributeError(f'Name {key} redefined')
+                    raise AttributeError(f'Name {key} redefined')

         super().__setitem__(key, value)

+    def __delitem__(self, key):
+        self.delete.append(key)
+        if key not in self and key.isupper():
+            pass
+        else:
+            return super().__delitem__(key)
+
     def __getitem__(self, key):
         if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
-            return key
+            return TokenStr(key, key, self.remap)
         else:
             return super().__getitem__(key)

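(Illustrative sketch, not part of the diff: the new __delitem__/delete bookkeeping above suggests that a subclass can drop an inherited rule with a plain del statement, which _collect_rules() later filters out. The NoNumberLexer name is hypothetical; BaseLexer comes from the CHANGES example.)

    class NoNumberLexer(BaseLexer):
        tokens = { NAME }
        del NUMBER        # recorded in the metadict's delete list; the NUMBER rule is dropped
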
@@ -144,22 +160,24 @@ class LexerMeta(type):
                 return func
             return decorate

-        def before(tok, pattern):
-            value = TokenStr(pattern)
-            value.before = tok
-            return value
-
         d['_'] = _
-        d['before'] = before
+        d['before'] = _Before
         return d

     def __new__(meta, clsname, bases, attributes):
         del attributes['_']
-        clsattributes = { key: str(val) if isinstance(val, TokenStr) else val
-                          for key, val in attributes.items() }
-        cls = super().__new__(meta, clsname, bases, clsattributes)
-        # Record the original definition environment
-        cls._attributes = attributes
+        del attributes['before']
+
+        # Create attributes for use in the actual class body
+        cls_attributes = { str(key): str(val) if isinstance(val, TokenStr) else val
+                           for key, val in attributes.items() }
+        cls = super().__new__(meta, clsname, bases, cls_attributes)
+
+        # Attach various metadata to the class
+        cls._attributes = dict(attributes)
+        cls._remap = attributes.remap
+        cls._before = attributes.before
+        cls._delete = attributes.delete
         cls._build()
         return cls

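(Illustrative note, not part of the diff: after class creation, the metadata attached here can be inspected on the class; internal attributes shown purely for orientation, using the GrandChildLexer example from CHANGES.)

    print(GrandChildLexer._before)   # roughly {'PLUSPLUS': 'PLUS', 'MINUSMINUS': 'MINUS'}
    print(GrandChildLexer._delete)   # []  (no 'del NAME' statements were used)
    print(GrandChildLexer._remap)    # {}  (no TOKEN['value'] = NEWTOKEN remappings)
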
@@ -170,9 +188,12 @@ class Lexer(metaclass=LexerMeta):
     ignore = ''
     reflags = 0

+    _token_names = set()
     _token_funcs = {}
     _ignored_tokens = set()
     _remapping = {}
+    _delete = {}
+    _remap = {}

     # Internal attributes
     __state_stack = None

@@ -180,36 +201,63 @@ class Lexer(metaclass=LexerMeta):

     @classmethod
     def _collect_rules(cls):
-        '''
-        Collect all of the rules from class definitions that look like tokens
-        '''
-        definitions = list(cls._attributes.items())
+        # Collect all of the rules from class definitions that look like token
+        # information.  There are a few things that govern this:
+        #
+        # 1. Any definition of the form NAME = str is a token if NAME is
+        #    defined in the tokens set.
+        #
+        # 2. Any definition of the form ignore_NAME = str is a rule for an ignored
+        #    token.
+        #
+        # 3. Any function defined with a 'pattern' attribute is treated as a rule.
+        #    Such functions can be created with the @_ decorator or by defining
+        #    a function with the same name as a previously defined string.
+        #
+        # This function is responsible for keeping rules in order.
+
+        # Collect all previous rules from base classes
         rules = []

-        # Collect all of the previous rules from base classes
         for base in cls.__bases__:
             if isinstance(base, LexerMeta):
-                rules.extend(base._collect_rules())
+                rules.extend(base._rules)

+        # Dictionary of previous rules
         existing = dict(rules)

-        for key, value in definitions:
-            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'pattern'):
+        for key, value in cls._attributes.items():
+            if (key in cls._token_names) or key.startswith('ignore_') or hasattr(value, 'pattern'):
+                if callable(value) and not hasattr(value, 'pattern'):
+                    raise LexerBuildError(f"function {value} doesn't have a regex pattern")
+
                 if key in existing:
+                    # The definition matches something that already existed in the base class.
+                    # We replace it, but keep the original ordering
                     n = rules.index((key, existing[key]))
                     rules[n] = (key, value)
                     existing[key] = value
-                elif isinstance(value, TokenStr) and value.before in existing:
-                    n = rules.index((value.before, existing[value.before]))
-                    rules.insert(n, (key, value))
+
+                elif isinstance(value, TokenStr) and key in cls._before:
+                    before = cls._before[key]
+                    if before in existing:
+                        # Position the token before another specified token
+                        n = rules.index((before, existing[before]))
+                        rules.insert(n, (key, value))
+                    else:
+                        # Put at the end of the rule list
+                        rules.append((key, value))
                     existing[key] = value
                 else:
                     rules.append((key, value))
                     existing[key] = value
-            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore'}:
+
+            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore', 'literals'}:
                 raise LexerBuildError(f'{key} does not match a name in tokens')

-        return rules
+        # Apply deletion rules
+        rules = [ (key, value) for key, value in rules if key not in cls._delete ]
+        cls._rules = rules

     @classmethod
     def _build(cls):

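(Illustrative sketch, not part of the diff: one way to see the rule ordering this method produces is to peek at the internal _rules list it now stores, again using the GrandChildLexer example from CHANGES.)

    order = [name for name, _ in GrandChildLexer._rules]
    print(order)
    # Roughly: ['NAME', 'NUMBER', 'PLUSPLUS', 'PLUS', 'MINUSMINUS', 'MINUS']
    # PLUSPLUS and MINUSMINUS were inserted ahead of PLUS and MINUS by the before() handling.
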
@@ -220,24 +268,30 @@ class Lexer(metaclass=LexerMeta):
         if 'tokens' not in vars(cls):
             raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')

+        # Pull definitions created for any parent classes
+        cls._token_names = cls._token_names | set(cls.tokens)
         cls._ignored_tokens = set(cls._ignored_tokens)
         cls._token_funcs = dict(cls._token_funcs)
         cls._remapping = dict(cls._remapping)
-        cls._remapping.update({ key: val.remap for key, val in cls._attributes.items()
-                                if getattr(val, 'remap', None) })

-        # Build a set of all remapped tokens
-        remapped_tokens = set()
-        for toks in cls._remapping.values():
-            remapped_tokens.update(toks.values())
+        for (key, val), newtok in cls._remap.items():
+            if key not in cls._remapping:
+                cls._remapping[key] = {}
+            cls._remapping[key][val] = newtok

-        undefined = remapped_tokens - set(cls.tokens)
+        remapped_toks = set()
+        for d in cls._remapping.values():
+            remapped_toks.update(d.values())
+
+        undefined = remapped_toks - set(cls._token_names)
         if undefined:
             missing = ', '.join(undefined)
             raise LexerBuildError(f'{missing} not included in token(s)')

+        cls._collect_rules()
+
         parts = []
-        for tokname, value in cls._collect_rules():
+        for tokname, value in cls._rules:
             if tokname.startswith('ignore_'):
                 tokname = tokname[7:]
                 cls._ignored_tokens.add(tokname)

@@ -247,9 +301,7 @@ class Lexer(metaclass=LexerMeta):

             elif callable(value):
                 cls._token_funcs[tokname] = value
-                pattern = getattr(value, 'pattern', None)
-                if not pattern:
-                    continue
+                pattern = getattr(value, 'pattern')

             # Form the regular expression component
             part = f'(?P<{tokname}>{pattern})'

@@ -338,6 +390,7 @@ class Lexer(metaclass=LexerMeta):
                     index = m.end()
                     tok.value = m.group()
                     tok.type = m.lastgroup
+
                     if tok.type in _remapping:
                         tok.type = _remapping[tok.type].get(tok.value, tok.type)

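(Illustrative sketch, not part of the diff: the remapping applied above is populated by TOKEN['value'] = NEWTOKEN assignments in a lexer class body. A minimal end-to-end example using that documented syntax; the KeywordLexer name is made up.)

    from sly import Lexer

    class KeywordLexer(Lexer):
        tokens = { NAME, IF, ELSE }
        ignore = ' \t'

        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NAME['if'] = IF        # remap the matched value 'if' to token type IF
        NAME['else'] = ELSE

    for tok in KeywordLexer().tokenize('if x else y'):
        print(tok.type, tok.value)
    # Expected: IF 'if', NAME 'x', ELSE 'else', NAME 'y'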