Changes to token specification. More metamagic

David Beazley 2018-01-27 15:27:15 -06:00
parent b74e7223ce
commit b088d9b2ce
10 changed files with 302 additions and 142 deletions

CHANGES
View File

@ -1,5 +1,64 @@
Version 0.3
-----------
1/27/2018 Tokens no longer have to be specified as strings. For example, you
can now write:

    from sly import Lexer

    class TheLexer(Lexer):
        tokens = { ID, NUMBER, PLUS, MINUS }
        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NUMBER = r'\d+'
        PLUS = r'\+'
        MINUS = r'-'

This convention also carries over to the parser for things such
as precedence specifiers:

    from sly import Parser

    class TheParser(Parser):
        tokens = TheLexer.tokens
        precedence = (
            ('left', PLUS, MINUS),
            ('left', TIMES, DIVIDE),
            ('right', UMINUS),
        )
        ...

Never mind the fact that ID, NUMBER, PLUS, and MINUS appear to be
undefined identifiers. It all works.
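
The trick that makes this work is a metaclass whose class-body namespace
resolves undefined all-caps names to themselves. A rough sketch of the idea
(the names NamespaceDict and DemoMeta below are made up for illustration;
SLY's real machinery lives in sly/lex.py and sly/yacc.py):

    class NamespaceDict(dict):
        def __getitem__(self, key):
            # Undefined all-caps names evaluate to themselves
            if key not in self and key.isupper() and not key.startswith('_'):
                return key
            return super().__getitem__(key)

    class DemoMeta(type):
        @classmethod
        def __prepare__(meta, name, bases):
            # This mapping becomes the namespace used by the class body
            return NamespaceDict()

        def __new__(meta, name, bases, namespace):
            return super().__new__(meta, name, bases, dict(namespace))

    class Demo(metaclass=DemoMeta):
        tokens = { ID, NUMBER, PLUS, MINUS }   # No NameError here

    # Demo.tokens is now {'ID', 'NUMBER', 'PLUS', 'MINUS'}
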
1/27/2018 Tokens now allow special-case remapping. For example:

    from sly import Lexer

    class TheLexer(Lexer):
        tokens = { ID, IF, ELSE, WHILE, NUMBER, PLUS, MINUS }
        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
        ID['if'] = IF
        ID['else'] = ELSE
        ID['while'] = WHILE
        NUMBER = r'\d+'
        PLUS = r'\+'
        MINUS = r'-'

In this code, the ID rule matches any identifier. However,
special cases have been made for IF, ELSE, and WHILE tokens.
Previously, this had to be handled in a special action method
such as this:

    def ID(self, t):
        if t.value in { 'if', 'else', 'while' }:
            t.type = t.value.upper()
        return t

Never mind the fact that the syntax appears to suggest that strings
work as a kind of mutable mapping.
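
Under the hood, the rule is stored as a small str subclass that keeps the
special cases in a side dictionary. A simplified sketch of the idea
(MappableStr is a made-up name; the real class is TokenStr in sly/lex.py):

    class MappableStr(str):
        def __new__(cls, value):
            self = super().__new__(cls, value)
            self.remap = { }          # matched value -> remapped token type
            return self

        def __setitem__(self, key, value):
            self.remap[key] = value

    ID = MappableStr(r'[a-zA-Z_][a-zA-Z0-9_]*')
    ID['if'] = 'IF'                   # stored in ID.remap, not in the string
    # ID still behaves like the regex string; ID.remap == {'if': 'IF'}
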
1/16/2018 Usability improvement to the Lexer class. Regular expression rules
specified as strings that don't match any name in tokens are
now reported as errors.
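
For instance, a hypothetical lexer such as the following now produces an
error when the class is defined, because TIMES is never listed in tokens:

    from sly import Lexer

    class BrokenLexer(Lexer):
        tokens = { PLUS, MINUS }
        PLUS = r'\+'
        MINUS = r'-'
        TIMES = r'\*'    # not listed in tokens -> reported as an error
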

View File

@ -1,6 +1,6 @@
SLY (Sly Lex-Yacc) Version 0.2
SLY (Sly Lex-Yacc) Version 0.3
Copyright (C) 2016-2017
Copyright (C) 2016-2018
David M. Beazley (Dabeaz LLC)
All rights reserved.
@ -85,9 +85,7 @@ expressions and store variables:
from sly import Lexer, Parser
class CalcLexer(Lexer):
tokens = {
'NAME', 'NUMBER',
}
tokens = { NAME, NUMBER }
ignore = ' \t'
literals = { '=', '+', '-', '*', '/', '(', ')' }

View File

@ -60,9 +60,7 @@ expressions and store variables::
from sly import Lexer, Parser
class CalcLexer(Lexer):
tokens = {
'NAME', 'NUMBER',
}
tokens = { NAME, NUMBER }
ignore = ' \t'
literals = { '=', '+', '-', '*', '/', '(', ')' }

View File

@ -68,17 +68,8 @@ lexer that tokenizes the above text::
class CalcLexer(Lexer):
# Set of token names. This is always required
tokens = {
'ID',
'NUMBER',
'PLUS',
'MINUS',
'TIMES',
'DIVIDE',
'ASSIGN',
'LPAREN',
'RPAREN',
}
tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
DIVIDE, ASSIGN, LPAREN, RPAREN }
# String containing ignored characters between tokens
ignore = ' \t'
@ -131,19 +122,12 @@ In the example, the following code specified the token names::
class CalcLexer(Lexer):
...
# Set of token names. This is always required
tokens = {
'ID',
'NUMBER',
'PLUS',
'MINUS',
'TIMES',
'DIVIDE',
'ASSIGN',
'LPAREN',
'RPAREN',
}
tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
DIVIDE, ASSIGN, LPAREN, RPAREN }
...
Token names should be specified using all-caps as shown.
Specification of token match patterns
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -167,7 +151,7 @@ short tokens. For example, if you wanted to have separate tokens for
example::
class MyLexer(Lexer):
tokens = {'ASSIGN', 'EQ', ...}
tokens = { ASSIGN, EQ, ...}
...
EQ = r'==' # MUST APPEAR FIRST! (LONGER)
ASSIGN = r'='
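
To see why the ordering matters, consider (hypothetically) reversing the two
rules::

    class MyLexer(Lexer):
        tokens = { ASSIGN, EQ, ...}
        ...
        ASSIGN = r'='     # WRONG ORDER: tried first
        EQ = r'=='

With this ordering, the input ``==`` would come back as two consecutive
``ASSIGN`` tokens instead of a single ``EQ`` token, because patterns are tried
in the order they are listed.
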
@ -251,12 +235,10 @@ Instead of using the ``@_()`` decorator, you can also write a method
that matches the same name as a token previously specified as a
string. For example::
ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
NUMBER = r'\d+'
...
def ID(self, t):
reserved = { 'if', 'else', 'while', 'for' }
if t.value in reserved:
t.type = t.value.upper()
def NUMBER(self, t):
t.value = int(t.value)
return t
This is a potentially useful trick for debugging a lexer. You can temporarily
@ -264,6 +246,36 @@ attach a method to a token and have it execute when the token is encountered.
If you later take the method away, the lexer reverts to its original
behavior.
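
For instance, a throwaway debugging hook might look like this (an
illustrative sketch, not code taken from SLY itself)::

    class MyLexer(Lexer):
        tokens = { ID, NUMBER }
        ignore = ' \t'

        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NUMBER = r'\d+'

        # Temporary: runs whenever a NUMBER is matched.  Delete the method
        # and the plain NUMBER string rule takes over again.
        def NUMBER(self, t):
            print('NUMBER:', t.value)
            return t
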
Token Remapping
^^^^^^^^^^^^^^^
Occasionally, you might need to remap tokens based on special cases.
Consider the case of matching identifiers such as "abc", "python", or "guido".
Certain identifiers such as "if", "else", and "while" might need to be
treated as special keywords. To handle this, include token remapping rules when
writing the lexer like this::
    # calclex.py

    from sly import Lexer

    class CalcLexer(Lexer):
        tokens = { ID, IF, ELSE, WHILE }

        # String containing ignored characters (between tokens)
        ignore = ' \t'

        # Base ID rule
        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'

        # Special cases
        ID['if'] = IF
        ID['else'] = ELSE
        ID['while'] = WHILE

When parsing an identifier, the special cases will remap certain matching
values to a new token type. For example, if the value of an identifier is
"if" above, an ``IF`` token will be generated.
Line numbers and position tracking
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -385,26 +397,11 @@ into practice::
from sly import Lexer
class CalcLexer(Lexer):
# Set of reserved names (language keywords)
reserved_words = { 'WHILE', 'IF', 'ELSE', 'PRINT' }
# Set of token names. This is always required
tokens = {
'NUMBER',
'ID',
'PLUS',
'MINUS',
'TIMES',
'DIVIDE',
'ASSIGN',
'EQ',
'LT',
'LE',
'GT',
'GE',
'NE',
*reserved_words,
}
tokens = { NUMBER, ID, WHILE, IF, ELSE, PRINT,
PLUS, MINUS, TIMES, DIVIDE, ASSIGN,
EQ, LT, LE, GT, GE, NE }
literals = { '(', ')', '{', '}', ';' }
@ -429,12 +426,12 @@ into practice::
t.value = int(t.value)
return t
@_(r'[a-zA-Z_][a-zA-Z0-9_]*')
def ID(self, t):
# Check if name matches a reserved word (change token type if true)
if t.value.upper() in self.reserved_words:
t.type = t.value.upper()
return t
# Identifiers and keywords
ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
ID['if'] = IF
ID['else'] = ELSE
ID['while'] = WHILE
ID['print'] = PRINT
ignore_comment = r'\#.*'
@ -443,8 +440,8 @@ into practice::
def ignore_newline(self, t):
self.lineno += t.value.count('\n')
def error(self, value):
print('Line %d: Bad character %r' % (self.lineno, value[0]))
def error(self, t):
print('Line %d: Bad character %r' % (self.lineno, t.value[0]))
self.index += 1
if __name__ == '__main__':
@ -462,27 +459,27 @@ into practice::
If you run this code, you'll get output that looks like this::
Token(ID, 'x', 3, 12)
Token(ASSIGN, '=', 3, 14)
Token(NUMBER, 0, 3, 16)
Token(;, ';', 3, 17)
Token(WHILE, 'while', 4, 19)
Token((, '(', 4, 25)
Token(ID, 'x', 4, 26)
Token(LT, '<', 4, 28)
Token(NUMBER, 10, 4, 30)
Token(), ')', 4, 32)
Token({, '{', 4, 34)
Token(PRINT, 'print', 5, 40)
Token(ID, 'x', 5, 46)
Token(type='ID', value='x', lineno=3, index=20)
Token(type='ASSIGN', value='=', lineno=3, index=22)
Token(type='NUMBER', value=0, lineno=3, index=24)
Token(type=';', value=';', lineno=3, index=25)
Token(type='WHILE', value='while', lineno=4, index=31)
Token(type='(', value='(', lineno=4, index=37)
Token(type='ID', value='x', lineno=4, index=38)
Token(type='LT', value='<', lineno=4, index=40)
Token(type='NUMBER', value=10, lineno=4, index=42)
Token(type=')', value=')', lineno=4, index=44)
Token(type='{', value='{', lineno=4, index=46)
Token(type='PRINT', value='print', lineno=5, index=56)
Token(type='ID', value='x', lineno=5, index=62)
Line 5: Bad character ':'
Token(ID, 'x', 6, 53)
Token(ASSIGN, '=', 6, 55)
Token(ID, 'x', 6, 57)
Token(PLUS, '+', 6, 59)
Token(NUMBER, 1, 6, 61)
Token(;, ';', 6, 62)
Token(}, '}', 7, 64)
Token(type='ID', value='x', lineno=6, index=73)
Token(type='ASSIGN', value='=', lineno=6, index=75)
Token(type='ID', value='x', lineno=6, index=77)
Token(type='PLUS', value='+', lineno=6, index=79)
Token(type='NUMBER', value=1, lineno=6, index=81)
Token(type=';', value=';', lineno=6, index=82)
Token(type='}', value='}', lineno=7, index=88)
Study this example closely. It might take a bit to digest, but all of the
essential parts of writing a lexer are there. Tokens have to be specified
@ -914,8 +911,8 @@ like this::
class CalcParser(Parser):
...
precedence = (
('left', 'PLUS', 'MINUS'),
('left', 'TIMES', 'DIVIDE'),
('left', PLUS, MINUS),
('left', TIMES, DIVIDE),
)
# Rules where precedence is applied
@ -1004,9 +1001,9 @@ like this::
class CalcParser(Parser):
...
precedence = (
('left', 'PLUS', 'MINUS'),
('left', 'TIMES', 'DIVIDE'),
('right', 'UMINUS'), # Unary minus operator
('left', PLUS, MINUS),
('left', TIMES, DIVIDE),
('right', UMINUS), # Unary minus operator
)
Now, in the grammar file, you write the unary minus rule like this::
@ -1034,10 +1031,10 @@ operators like ``<`` and ``>`` but you didn't want combinations like
class MyParser(Parser):
...
precedence = (
('nonassoc', 'LESSTHAN', 'GREATERTHAN'), # Nonassociative operators
('left', 'PLUS', 'MINUS'),
('left', 'TIMES', 'DIVIDE'),
('right', 'UMINUS'), # Unary minus operator
('nonassoc', LESSTHAN, GREATERTHAN), # Nonassociative operators
('left', PLUS, MINUS),
('left', TIMES, DIVIDE),
('right', UMINUS), # Unary minus operator
)
If you do this, the occurrence of input text such as ``a < b < c``

View File

@ -9,28 +9,16 @@ from sly import Lexer, Parser
class CalcLexer(Lexer):
# Set of token names. This is always required
tokens = {
'ID',
'NUMBER',
'PLUS',
'MINUS',
'TIMES',
'DIVIDE',
'ASSIGN',
'LPAREN',
'RPAREN',
}
tokens = { NUMBER, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN }
# String containing ignored characters between tokens
ignore = ' \t'
# Regular expression rules for tokens
ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
ASSIGN = r'='
LPAREN = r'\('
RPAREN = r'\)'

View File

@ -8,9 +8,7 @@ sys.path.insert(0, "../..")
from sly import Lexer, Parser
class CalcLexer(Lexer):
tokens = {
'NAME', 'NUMBER',
}
tokens = { NAME, NUMBER }
ignore = ' \t'
literals = { '=', '+', '-', '*', '/', '(', ')' }
@ -36,7 +34,7 @@ class CalcParser(Parser):
precedence = (
('left', '+', '-'),
('left', '*', '/'),
('right', 'UMINUS'),
('right', UMINUS),
)
def __init__(self):

View File

@ -1,7 +1,7 @@
# -----------------------------------------------------------------------------
# sly: lex.py
#
# Copyright (C) 2016
# Copyright (C) 2016 - 2018
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
@ -31,11 +31,10 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
__version__ = '0.2'
__version__ = '0.3'
__all__ = ['Lexer', 'LexerStateChange']
import re
from collections import OrderedDict
class LexError(Exception):
'''
@ -78,20 +77,41 @@ class Token(object):
def __repr__(self):
return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
class LexerMetaDict(OrderedDict):
class TokenStr(str):
@staticmethod
def __new__(cls, value):
self = super().__new__(cls, value)
self.remap = { }
return self
def __setitem__(self, key, value):
self.remap[key] = value
class LexerMetaDict(dict):
'''
Special dictionary that prohibits duplicate definitions in lexer specifications.
'''
def __setitem__(self, key, value):
if isinstance(value, str):
value = TokenStr(value)
if key in self and not isinstance(value, property):
if isinstance(self[key], str):
prior = self[key]
if isinstance(prior, str):
if callable(value):
value.pattern = self[key]
value.pattern = prior
value.remap = getattr(prior, 'remap', None)
else:
raise AttributeError(f'Name {key} redefined')
super().__setitem__(key, value)
def __getitem__(self, key):
if key not in self and key.isupper() and key[:1] != '_':
return key
else:
return super().__getitem__(key)
class LexerMeta(type):
'''
Metaclass for collecting lexing rules
@ -114,7 +134,12 @@ class LexerMeta(type):
def __new__(meta, clsname, bases, attributes):
del attributes['_']
remapping = { key: val.remap for key, val in attributes.items()
if getattr(val, 'remap', None) }
attributes = { key: str(val) if isinstance(val, TokenStr) else val
for key, val in attributes.items() }
cls = super().__new__(meta, clsname, bases, attributes)
cls._remapping = remapping
cls._build(list(attributes.items()))
return cls
@ -159,6 +184,16 @@ class Lexer(metaclass=LexerMeta):
cls._ignored_tokens = set(cls._ignored_tokens)
cls._token_funcs = dict(cls._token_funcs)
# Build a set of all remapped tokens
remapped_tokens = set()
for toks in cls._remapping.values():
remapped_tokens.update(toks.values())
undefined = remapped_tokens - cls._token_names
if undefined:
missing = ', '.join(undefined)
raise LexerBuildError(f'{missing} not included in token(s)')
parts = []
for tokname, value in cls._collect_rules(definitions):
if tokname.startswith('ignore_'):
@ -169,8 +204,10 @@ class Lexer(metaclass=LexerMeta):
pattern = value
elif callable(value):
pattern = value.pattern
cls._token_funcs[tokname] = value
pattern = getattr(value, 'pattern', None)
if not pattern:
continue
# Form the regular expression component
part = f'(?P<{tokname}>{pattern})'
@ -209,7 +246,7 @@ class Lexer(metaclass=LexerMeta):
_ignore = self.ignore
_token_funcs = self._token_funcs
_literals = self._literals
_remapping = self._remapping
self.text = text
try:
while True:
@ -228,6 +265,9 @@ class Lexer(metaclass=LexerMeta):
index = m.end()
tok.value = m.group()
tok.type = m.lastgroup
if tok.type in _remapping:
tok.type = _remapping[tok.type].get(tok.value, tok.type)
if tok.type in _token_funcs:
self.index = index
self.lineno = lineno

View File

@ -1,7 +1,7 @@
# -----------------------------------------------------------------------------
# sly: yacc.py
#
# Copyright (C) 2016-2017
# Copyright (C) 2016-2018
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
@ -35,7 +35,7 @@ import sys
import inspect
from collections import OrderedDict, defaultdict
__version__ = '0.2'
__version__ = '0.3'
__all__ = [ 'Parser' ]
class YaccError(Exception):
@ -55,12 +55,12 @@ ERROR_COUNT = 3 # Number of symbols that must be shifted to leave
MAXINT = sys.maxsize
# This object is a stand-in for a logging object created by the
# logging module. PLY will use this by default to create things
# logging module. SLY will use this by default to create things
# such as the parser.out file. If a user wants more detailed
# information, they can create their own logging object and pass
# it into PLY.
# it into SLY.
class PlyLogger(object):
class SlyLogger(object):
def __init__(self, f):
self.f = f
@ -1552,7 +1552,7 @@ def _collect_grammar_rules(func):
return grammar
class ParserMetaDict(OrderedDict):
class ParserMetaDict(dict):
'''
Dictionary that allows decorated grammar rule functions to be overloaded
'''
@ -1560,6 +1560,12 @@ class ParserMetaDict(OrderedDict):
if key in self and callable(value) and hasattr(value, 'rules'):
value.next_func = self[key]
super().__setitem__(key, value)
def __getitem__(self, key):
if key not in self and key.isupper() and key[:1] != '_':
return key.upper()
else:
return super().__getitem__(key)
class ParserMeta(type):
@classmethod
@ -1582,7 +1588,7 @@ class ParserMeta(type):
class Parser(metaclass=ParserMeta):
# Logging object where debugging/diagnostic messages are sent
log = PlyLogger(sys.stderr)
log = SlyLogger(sys.stderr)
# Debugging filename where parsetab.out data can be written
debugfile = None

View File

@ -98,9 +98,94 @@ def test_error_return():
assert vals == [123, ':+-', '+', '-']
assert lexer.errors == [ ':+-' ]
class ModernCalcLexer(Lexer):
# Set of token names. This is always required
tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, LT, LE, IF, ELSE }
literals = { '(', ')' }
# String containing ignored characters between tokens
ignore = ' \t'
# Regular expression rules for tokens
ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
ID['if'] = IF
ID['else'] = ELSE
NUMBER = r'\d+'
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
ASSIGN = r'='
LE = r'<='
LT = r'<'
def NUMBER(self, t):
t.value = int(t.value)
return t
# Ignored text
ignore_comment = r'\#.*'
@_(r'\n+')
def ignore_newline(self, t):
self.lineno += t.value.count('\n')
# Attached rule
def ID(self, t):
t.value = t.value.upper()
return t
def error(self, t):
self.errors.append(t.value)
self.index += 1
if hasattr(self, 'return_error'):
return t
def __init__(self):
self.errors = []
# Test basic recognition of various tokens and literals
def test_modern_tokens():
lexer = ModernCalcLexer()
toks = list(lexer.tokenize('abc if else 123 + - * / = < <= ( )'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
assert types == ['ID','IF','ELSE', 'NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
assert vals == ['ABC','if','else', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
# Test ignored comments and newlines
def test_modern_ignored():
lexer = ModernCalcLexer()
toks = list(lexer.tokenize('\n\n# A comment\n123\nabc\n'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
linenos = [t.lineno for t in toks]
assert types == ['NUMBER', 'ID']
assert vals == [123, 'ABC']
assert linenos == [4,5]
assert lexer.lineno == 6
# Test error handling
def test_modern_error():
lexer = ModernCalcLexer()
toks = list(lexer.tokenize('123 :+-'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
assert types == ['NUMBER', 'PLUS', 'MINUS']
assert vals == [123, '+', '-']
assert lexer.errors == [ ':+-' ]
# Test error token return handling
def test_modern_error_return():
lexer = ModernCalcLexer()
lexer.return_error = True
toks = list(lexer.tokenize('123 :+-'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
assert types == ['NUMBER', 'ERROR', 'PLUS', 'MINUS']
assert vals == [123, ':+-', '+', '-']
assert lexer.errors == [ ':+-' ]

View File

@ -3,16 +3,7 @@ from sly import Lexer, Parser
class CalcLexer(Lexer):
# Set of token names. This is always required
tokens = {
'ID',
'NUMBER',
'PLUS',
'MINUS',
'TIMES',
'DIVIDE',
'ASSIGN',
}
tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN }
literals = { '(', ')' }
# String containing ignored characters between tokens
@ -38,8 +29,8 @@ class CalcLexer(Lexer):
def newline(self, t):
self.lineno += t.value.count('\n')
def error(self, value):
self.errors.append(value)
def error(self, t):
self.errors.append(t.value[0])
self.index += 1
def __init__(self):
@ -49,9 +40,9 @@ class CalcParser(Parser):
tokens = CalcLexer.tokens
precedence = (
('left', 'PLUS', 'MINUS'),
('left', 'TIMES', 'DIVIDE'),
('right', 'UMINUS'),
('left', PLUS, MINUS),
('left', TIMES, DIVIDE),
('right', UMINUS),
)
def __init__(self):