From b088d9b2cef949278dc63083b2375265f21eb3a4 Mon Sep 17 00:00:00 2001 From: David Beazley Date: Sat, 27 Jan 2018 15:27:15 -0600 Subject: [PATCH] Changes to token specification. More metamagic --- CHANGES | 59 ++++++++++++++ README.txt | 8 +- docs/index.rst | 4 +- docs/sly.rst | 165 +++++++++++++++++++------------------- example/calc/calc.py | 14 +--- example/calc_prec/calc.py | 6 +- sly/lex.py | 56 +++++++++++-- sly/yacc.py | 20 +++-- tests/test_lex.py | 91 ++++++++++++++++++++- tests/test_parser.py | 21 ++--- 10 files changed, 302 insertions(+), 142 deletions(-) diff --git a/CHANGES b/CHANGES index 988aa6b..0b4e38c 100644 --- a/CHANGES +++ b/CHANGES @@ -1,5 +1,64 @@ Version 0.3 ----------- +1/27/2018 Tokens no longer have to be specified as strings. For example, you + can now write: + + from sly import Lexer + + class TheLexer(Lexer): + tokens = { ID, NUMBER, PLUS, MINUS } + + ID = r'[a-zA-Z_][a-zA-Z0-9_]*' + NUMBER = r'\d+' + PLUS = r'\+' + MINUS = r'-' + + This convention also carries over to the parser for things such + as precedence specifiers: + + from sly import Parser + class TheParser(Parser): + tokens = TheLexer.tokens + + precedence = ( + ('left', PLUS, MINUS), + ('left', TIMES, DIVIDE), + ('right', UMINUS), + ) + ... + + Nevermind the fact that ID, NUMBER, PLUS, and MINUS appear to be + undefined identifiers. It all works. + +1/27/2018 Tokens now allow special-case remapping. For example: + + from sly import Lexer + + class TheLexer(Lexer): + tokens = { ID, IF, ELSE, WHILE, NUMBER, PLUS, MINUS } + + ID = r'[a-zA-Z_][a-zA-Z0-9_]*' + ID['if'] = IF + ID['else'] = ELSE + ID['while'] = WHILE + + NUMBER = r'\d+' + PLUS = r'\+' + MINUS = r'-' + + In this code, the ID rule matches any identifier. However, + special cases have been made for IF, ELSE, and WHILE tokens. + Previously, this had to be handled in a special action method + such as this: + + def ID(self, t): + if t.value in { 'if', 'else', 'while' }: + t.type = t.value.upper() + return t + + Nevermind the fact that the syntax appears to suggest that strings + work as a kind of mutable mapping. + 1/16/2018 Usability improvement on Lexer class. Regular expression rules specified as strings that don't match any name in tokens are now reported as errors. diff --git a/README.txt b/README.txt index b94c0ff..7007383 100644 --- a/README.txt +++ b/README.txt @@ -1,6 +1,6 @@ -SLY (Sly Lex-Yacc) Version 0.2 +SLY (Sly Lex-Yacc) Version 0.3 -Copyright (C) 2016-2017 +Copyright (C) 2016-2018 David M. Beazley (Dabeaz LLC) All rights reserved. @@ -85,9 +85,7 @@ expressions and store variables: from sly import Lexer, Parser class CalcLexer(Lexer): - tokens = { - 'NAME', 'NUMBER', - } + tokens = { NAME, NUMBER } ignore = ' \t' literals = { '=', '+', '-', '*', '/', '(', ')' } diff --git a/docs/index.rst b/docs/index.rst index 29b43c3..d79a586 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -60,9 +60,7 @@ expressions and store variables:: from sly import Lexer, Parser class CalcLexer(Lexer): - tokens = { - 'NAME', 'NUMBER', - } + tokens = { NAME, NUMBER } ignore = ' \t' literals = { '=', '+', '-', '*', '/', '(', ')' } diff --git a/docs/sly.rst b/docs/sly.rst index dc6dd87..9509abd 100644 --- a/docs/sly.rst +++ b/docs/sly.rst @@ -68,17 +68,8 @@ lexer that tokenizes the above text:: class CalcLexer(Lexer): # Set of token names. 
This is always required - tokens = { - 'ID', - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'ASSIGN', - 'LPAREN', - 'RPAREN', - } + tokens = { ID, NUMBER, PLUS, MINUS, TIMES, + DIVIDE, ASSIGN, LPAREN, RPAREN } # String containing ignored characters between tokens ignore = ' \t' @@ -131,19 +122,12 @@ In the example, the following code specified the token names:: class CalcLexer(Lexer): ... # Set of token names. This is always required - tokens = { - 'ID', - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'ASSIGN', - 'LPAREN', - 'RPAREN', - } + tokens = { ID, NUMBER, PLUS, MINUS, TIMES, + DIVIDE, ASSIGN, LPAREN, RPAREN } ... +Token names should be specified using all-caps as shown. + Specification of token match patterns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -167,7 +151,7 @@ short tokens. For example, if you wanted to have separate tokens for example:: class MyLexer(Lexer): - tokens = {'ASSIGN', 'EQ', ...} + tokens = { ASSIGN, EQ, ...} ... EQ = r'==' # MUST APPEAR FIRST! (LONGER) ASSIGN = r'=' @@ -251,12 +235,10 @@ Instead of using the ``@_()`` decorator, you can also write a method that matches the same name as a token previously specified as a string. For example:: - ID = r'[a-zA-Z_][a-zA-Z0-9_]*' + NUMBER = r'\d+' ... - def ID(self, t): - reserved = { 'if', 'else', 'while', 'for' } - if t.value in reserved: - t.type = t.value.upper() + def NUMBER(self, t): + t.value = int(t.value) return t This is potentially useful trick for debugging a lexer. You can temporarily @@ -264,6 +246,36 @@ attach a method a token and have it execute when the token is encountered. If you later take the method away, the lexer will revert back to its original behavior. +Token Remapping +^^^^^^^^^^^^^^^ + +Occasionally, you might need to remap tokens based on special cases. +Consider the case of matching identifiers such as "abc", "python", or "guido". +Certain identifiers such as "if", "else", and "while" might need to be +treated as special keywords. To handle this, include token remapping rules when +writing the lexer like this:: + + # calclex.py + + from sly import Lexer + + class CalcLexer(Lexer): + tokens = { ID, IF, ELSE, WHILE } + # String containing ignored characters (between tokens) + ignore = ' \t' + + # Base ID rule + ID = r'[a-zA-Z_][a-zA-Z0-9_]*' + + # Special cases + ID['if'] = IF + ID['else'] = ELSE + ID['while'] = WHILE + +When parsing an identifier, the special cases will remap certain matching +values to a new token type. For example, if the value of an identifier is +"if" above, an ``IF`` token will be generated. + Line numbers and position tracking ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -385,26 +397,11 @@ into practice:: from sly import Lexer class CalcLexer(Lexer): - # Set of reserved names (language keywords) - reserved_words = { 'WHILE', 'IF', 'ELSE', 'PRINT' } - # Set of token names. 
This is always required - tokens = { - 'NUMBER', - 'ID', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'ASSIGN', - 'EQ', - 'LT', - 'LE', - 'GT', - 'GE', - 'NE', - *reserved_words, - } + tokens = { NUMBER, ID, WHILE, IF, ELSE, PRINT, + PLUS, MINUS, TIMES, DIVIDE, ASSIGN, + EQ, LT, LE, GT, GE, NE } + literals = { '(', ')', '{', '}', ';' } @@ -429,12 +426,12 @@ into practice:: t.value = int(t.value) return t - @_(r'[a-zA-Z_][a-zA-Z0-9_]*') - def ID(self, t): - # Check if name matches a reserved word (change token type if true) - if t.value.upper() in self.reserved_words: - t.type = t.value.upper() - return t + # Identifiers and keywords + ID = r'[a-zA-Z_][a-zA-Z0-9_]*' + ID['if'] = IF + ID['else'] = ELSE + ID['while'] = WHILE + ID['print'] = PRINT ignore_comment = r'\#.*' @@ -443,8 +440,8 @@ into practice:: def ignore_newline(self, t): self.lineno += t.value.count('\n') - def error(self, value): - print('Line %d: Bad character %r' % (self.lineno, value[0])) + def error(self, t): + print('Line %d: Bad character %r' % (self.lineno, t.value[0])) self.index += 1 if __name__ == '__main__': @@ -462,27 +459,27 @@ into practice:: If you run this code, you'll get output that looks like this:: - Token(ID, 'x', 3, 12) - Token(ASSIGN, '=', 3, 14) - Token(NUMBER, 0, 3, 16) - Token(;, ';', 3, 17) - Token(WHILE, 'while', 4, 19) - Token((, '(', 4, 25) - Token(ID, 'x', 4, 26) - Token(LT, '<', 4, 28) - Token(NUMBER, 10, 4, 30) - Token(), ')', 4, 32) - Token({, '{', 4, 34) - Token(PRINT, 'print', 5, 40) - Token(ID, 'x', 5, 46) + Token(type='ID', value='x', lineno=3, index=20) + Token(type='ASSIGN', value='=', lineno=3, index=22) + Token(type='NUMBER', value=0, lineno=3, index=24) + Token(type=';', value=';', lineno=3, index=25) + Token(type='WHILE', value='while', lineno=4, index=31) + Token(type='(', value='(', lineno=4, index=37) + Token(type='ID', value='x', lineno=4, index=38) + Token(type='LT', value='<', lineno=4, index=40) + Token(type='NUMBER', value=10, lineno=4, index=42) + Token(type=')', value=')', lineno=4, index=44) + Token(type='{', value='{', lineno=4, index=46) + Token(type='PRINT', value='print', lineno=5, index=56) + Token(type='ID', value='x', lineno=5, index=62) Line 5: Bad character ':' - Token(ID, 'x', 6, 53) - Token(ASSIGN, '=', 6, 55) - Token(ID, 'x', 6, 57) - Token(PLUS, '+', 6, 59) - Token(NUMBER, 1, 6, 61) - Token(;, ';', 6, 62) - Token(}, '}', 7, 64) + Token(type='ID', value='x', lineno=6, index=73) + Token(type='ASSIGN', value='=', lineno=6, index=75) + Token(type='ID', value='x', lineno=6, index=77) + Token(type='PLUS', value='+', lineno=6, index=79) + Token(type='NUMBER', value=1, lineno=6, index=81) + Token(type=';', value=';', lineno=6, index=82) + Token(type='}', value='}', lineno=7, index=88) Study this example closely. It might take a bit to digest, but all of the essential parts of writing a lexer are there. Tokens have to be specified @@ -914,8 +911,8 @@ like this:: class CalcParser(Parser): ... precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), + ('left', PLUS, MINUS), + ('left', TIMES, DIVIDE), ) # Rules where precedence is applied @@ -1004,9 +1001,9 @@ like this:: class CalcParser(Parser): ... 
precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('right', 'UMINUS'), # Unary minus operator + ('left', PLUS, MINUS), + ('left', TIMES, DIVIDE), + ('right', UMINUS), # Unary minus operator ) Now, in the grammar file, you write the unary minus rule like this:: @@ -1034,10 +1031,10 @@ operators like ``<`` and ``>`` but you didn't want combinations like class MyParser(Parser): ... precedence = ( - ('nonassoc', 'LESSTHAN', 'GREATERTHAN'), # Nonassociative operators - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('right', 'UMINUS'), # Unary minus operator + ('nonassoc', LESSTHAN, GREATERTHAN), # Nonassociative operators + ('left', PLUS, MINUS), + ('left', TIMES, DIVIDE), + ('right', UMINUS), # Unary minus operator ) If you do this, the occurrence of input text such as ``a < b < c`` diff --git a/example/calc/calc.py b/example/calc/calc.py index e15f80a..f7825ec 100644 --- a/example/calc/calc.py +++ b/example/calc/calc.py @@ -9,28 +9,16 @@ from sly import Lexer, Parser class CalcLexer(Lexer): # Set of token names. This is always required - tokens = { - 'ID', - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'ASSIGN', - 'LPAREN', - 'RPAREN', - } + tokens = { NUMBER, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN } # String containing ignored characters between tokens ignore = ' \t' # Regular expression rules for tokens - ID = r'[a-zA-Z_][a-zA-Z0-9_]*' PLUS = r'\+' MINUS = r'-' TIMES = r'\*' DIVIDE = r'/' - ASSIGN = r'=' LPAREN = r'\(' RPAREN = r'\)' diff --git a/example/calc_prec/calc.py b/example/calc_prec/calc.py index 0a6261e..597aaa0 100644 --- a/example/calc_prec/calc.py +++ b/example/calc_prec/calc.py @@ -8,9 +8,7 @@ sys.path.insert(0, "../..") from sly import Lexer, Parser class CalcLexer(Lexer): - tokens = { - 'NAME', 'NUMBER', - } + tokens = { NAME, NUMBER } ignore = ' \t' literals = { '=', '+', '-', '*', '/', '(', ')' } @@ -36,7 +34,7 @@ class CalcParser(Parser): precedence = ( ('left', '+', '-'), ('left', '*', '/'), - ('right', 'UMINUS'), + ('right', UMINUS), ) def __init__(self): diff --git a/sly/lex.py b/sly/lex.py index 82b5a24..dc69569 100644 --- a/sly/lex.py +++ b/sly/lex.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # sly: lex.py # -# Copyright (C) 2016 +# Copyright (C) 2016 - 2018 # David M. Beazley (Dabeaz LLC) # All rights reserved. # @@ -31,11 +31,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ----------------------------------------------------------------------------- -__version__ = '0.2' +__version__ = '0.3' __all__ = ['Lexer', 'LexerStateChange'] import re -from collections import OrderedDict class LexError(Exception): ''' @@ -78,20 +77,41 @@ class Token(object): def __repr__(self): return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})' -class LexerMetaDict(OrderedDict): +class TokenStr(str): + @staticmethod + def __new__(cls, value): + self = super().__new__(cls, value) + self.remap = { } + return self + + def __setitem__(self, key, value): + self.remap[key] = value + +class LexerMetaDict(dict): ''' Special dictionary that prohits duplicate definitions in lexer specifications. 
''' def __setitem__(self, key, value): + if isinstance(value, str): + value = TokenStr(value) + if key in self and not isinstance(value, property): - if isinstance(self[key], str): + prior = self[key] + if isinstance(prior, str): if callable(value): - value.pattern = self[key] + value.pattern = prior + value.remap = getattr(prior, 'remap', None) else: raise AttributeError(f'Name {key} redefined') super().__setitem__(key, value) + def __getitem__(self, key): + if key not in self and key.isupper() and key[:1] != '_': + return key + else: + return super().__getitem__(key) + class LexerMeta(type): ''' Metaclass for collecting lexing rules @@ -114,7 +134,12 @@ class LexerMeta(type): def __new__(meta, clsname, bases, attributes): del attributes['_'] + remapping = { key: val.remap for key, val in attributes.items() + if getattr(val, 'remap', None) } + attributes = { key: str(val) if isinstance(val, TokenStr) else val + for key, val in attributes.items() } cls = super().__new__(meta, clsname, bases, attributes) + cls._remapping = remapping cls._build(list(attributes.items())) return cls @@ -159,6 +184,16 @@ class Lexer(metaclass=LexerMeta): cls._ignored_tokens = set(cls._ignored_tokens) cls._token_funcs = dict(cls._token_funcs) + # Build a set of all remapped tokens + remapped_tokens = set() + for toks in cls._remapping.values(): + remapped_tokens.update(toks.values()) + + undefined = remapped_tokens - cls._token_names + if undefined: + missing = ', '.join(undefined) + raise LexerBuildError(f'{missing} not included in token(s)') + parts = [] for tokname, value in cls._collect_rules(definitions): if tokname.startswith('ignore_'): @@ -169,8 +204,10 @@ class Lexer(metaclass=LexerMeta): pattern = value elif callable(value): - pattern = value.pattern cls._token_funcs[tokname] = value + pattern = getattr(value, 'pattern', None) + if not pattern: + continue # Form the regular expression component part = f'(?P<{tokname}>{pattern})' @@ -209,7 +246,7 @@ class Lexer(metaclass=LexerMeta): _ignore = self.ignore _token_funcs = self._token_funcs _literals = self._literals - + _remapping = self._remapping self.text = text try: while True: @@ -228,6 +265,9 @@ class Lexer(metaclass=LexerMeta): index = m.end() tok.value = m.group() tok.type = m.lastgroup + if tok.type in _remapping: + tok.type = _remapping[tok.type].get(tok.value, tok.type) + if tok.type in _token_funcs: self.index = index self.lineno = lineno diff --git a/sly/yacc.py b/sly/yacc.py index 6478580..85cd85c 100644 --- a/sly/yacc.py +++ b/sly/yacc.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # sly: yacc.py # -# Copyright (C) 2016-2017 +# Copyright (C) 2016-2018 # David M. Beazley (Dabeaz LLC) # All rights reserved. # @@ -35,7 +35,7 @@ import sys import inspect from collections import OrderedDict, defaultdict -__version__ = '0.2' +__version__ = '0.3' __all__ = [ 'Parser' ] class YaccError(Exception): @@ -55,12 +55,12 @@ ERROR_COUNT = 3 # Number of symbols that must be shifted to leave MAXINT = sys.maxsize # This object is a stand-in for a logging object created by the -# logging module. PLY will use this by default to create things +# logging module. SLY will use this by default to create things # such as the parser.out file. If a user wants more detailed # information, they can create their own logging object and pass -# it into PLY. +# it into SLY. 
-class PlyLogger(object): +class SlyLogger(object): def __init__(self, f): self.f = f @@ -1552,7 +1552,7 @@ def _collect_grammar_rules(func): return grammar -class ParserMetaDict(OrderedDict): +class ParserMetaDict(dict): ''' Dictionary that allows decorated grammar rule functions to be overloaded ''' @@ -1560,6 +1560,12 @@ class ParserMetaDict(OrderedDict): if key in self and callable(value) and hasattr(value, 'rules'): value.next_func = self[key] super().__setitem__(key, value) + + def __getitem__(self, key): + if key not in self and key.isupper() and key[:1] != '_': + return key.upper() + else: + return super().__getitem__(key) class ParserMeta(type): @classmethod @@ -1582,7 +1588,7 @@ class ParserMeta(type): class Parser(metaclass=ParserMeta): # Logging object where debugging/diagnostic messages are sent - log = PlyLogger(sys.stderr) + log = SlyLogger(sys.stderr) # Debugging filename where parsetab.out data can be written debugfile = None diff --git a/tests/test_lex.py b/tests/test_lex.py index 212ceec..a730f71 100644 --- a/tests/test_lex.py +++ b/tests/test_lex.py @@ -98,9 +98,94 @@ def test_error_return(): assert vals == [123, ':+-', '+', '-'] assert lexer.errors == [ ':+-' ] - - - + +class ModernCalcLexer(Lexer): + # Set of token names. This is always required + tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, LT, LE, IF, ELSE } + literals = { '(', ')' } + + # String containing ignored characters between tokens + ignore = ' \t' + + # Regular expression rules for tokens + ID = r'[a-zA-Z_][a-zA-Z0-9_]*' + ID['if'] = IF + ID['else'] = ELSE + + NUMBER = r'\d+' + PLUS = r'\+' + MINUS = r'-' + TIMES = r'\*' + DIVIDE = r'/' + ASSIGN = r'=' + LE = r'<=' + LT = r'<' + + def NUMBER(self, t): + t.value = int(t.value) + return t + + # Ignored text + ignore_comment = r'\#.*' + + @_(r'\n+') + def ignore_newline(self, t): + self.lineno += t.value.count('\n') + + # Attached rule + def ID(self, t): + t.value = t.value.upper() + return t + + def error(self, t): + self.errors.append(t.value) + self.index += 1 + if hasattr(self, 'return_error'): + return t + + def __init__(self): + self.errors = [] +# Test basic recognition of various tokens and literals +def test_modern_tokens(): + lexer = ModernCalcLexer() + toks = list(lexer.tokenize('abc if else 123 + - * / = < <= ( )')) + types = [t.type for t in toks] + vals = [t.value for t in toks] + assert types == ['ID','IF','ELSE', 'NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')'] + assert vals == ['ABC','if','else', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')'] + +# Test ignored comments and newlines +def test_modern_ignored(): + lexer = ModernCalcLexer() + toks = list(lexer.tokenize('\n\n# A comment\n123\nabc\n')) + types = [t.type for t in toks] + vals = [t.value for t in toks] + linenos = [t.lineno for t in toks] + assert types == ['NUMBER', 'ID'] + assert vals == [123, 'ABC'] + assert linenos == [4,5] + assert lexer.lineno == 6 + +# Test error handling +def test_modern_error(): + lexer = ModernCalcLexer() + toks = list(lexer.tokenize('123 :+-')) + types = [t.type for t in toks] + vals = [t.value for t in toks] + assert types == ['NUMBER', 'PLUS', 'MINUS'] + assert vals == [123, '+', '-'] + assert lexer.errors == [ ':+-' ] + +# Test error token return handling +def test_modern_error_return(): + lexer = ModernCalcLexer() + lexer.return_error = True + toks = list(lexer.tokenize('123 :+-')) + types = [t.type for t in toks] + vals = [t.value for t in toks] + assert types == ['NUMBER', 'ERROR', 'PLUS', 'MINUS'] + assert vals 
== [123, ':+-', '+', '-'] + assert lexer.errors == [ ':+-' ] diff --git a/tests/test_parser.py b/tests/test_parser.py index 38db4fe..2661448 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -3,16 +3,7 @@ from sly import Lexer, Parser class CalcLexer(Lexer): # Set of token names. This is always required - tokens = { - 'ID', - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'ASSIGN', - } - + tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN } literals = { '(', ')' } # String containing ignored characters between tokens @@ -38,8 +29,8 @@ class CalcLexer(Lexer): def newline(self, t): self.lineno += t.value.count('\n') - def error(self, value): - self.errors.append(value) + def error(self, t): + self.errors.append(t.value[0]) self.index += 1 def __init__(self): @@ -49,9 +40,9 @@ class CalcParser(Parser): tokens = CalcLexer.tokens precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('right', 'UMINUS'), + ('left', PLUS, MINUS), + ('left', TIMES, DIVIDE), + ('right', UMINUS), ) def __init__(self):
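
To see how the pieces of this commit fit together, here is a minimal sketch of a lexer
written against the new 0.3 interface. It is an illustration only, not code from the
patch; the class name and sample input are made up, but the features it exercises
(bare-identifier token names, ID keyword remapping, and an attached NUMBER method) are
exactly the ones added above, and it mirrors the ModernCalcLexer used in the new tests.
The same bare-identifier convention carries over to a Parser's precedence tuple, as
shown in the CHANGES entry.

    from sly import Lexer

    class KeywordLexer(Lexer):
        # Token names are written as bare identifiers (new in 0.3).
        # They look undefined, but the metaclass dictionary resolves them.
        tokens = { ID, IF, ELSE, WHILE, NUMBER }

        ignore = ' \t'

        # Base identifier rule with special-case remapping for keywords.
        # No separate patterns are needed for IF/ELSE/WHILE.
        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
        ID['if'] = IF
        ID['else'] = ELSE
        ID['while'] = WHILE

        NUMBER = r'\d+'

        # Attached method: runs whenever a NUMBER token is matched.
        def NUMBER(self, t):
            t.value = int(t.value)
            return t

    if __name__ == '__main__':
        for tok in KeywordLexer().tokenize('if x else 42'):
            print(tok)
        # Produces tokens of type IF, ID, ELSE, NUMBER; the value 'if'
        # is remapped from ID to IF by the special-case rule above.
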