Changes to token specification. More metamagic

parent b74e7223ce
commit b088d9b2ce

CHANGES | 59
@@ -1,5 +1,64 @@
 Version 0.3
 -----------
 
+1/27/2018  Tokens no longer have to be specified as strings.  For example, you
+           can now write:
+
+               from sly import Lexer
+
+               class TheLexer(Lexer):
+                   tokens = { ID, NUMBER, PLUS, MINUS }
+
+                   ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+                   NUMBER = r'\d+'
+                   PLUS = r'\+'
+                   MINUS = r'-'
+
+           This convention also carries over to the parser for things such
+           as precedence specifiers:
+
+               from sly import Parser
+               class TheParser(Parser):
+                   tokens = TheLexer.tokens
+
+                   precedence = (
+                       ('left', PLUS, MINUS),
+                       ('left', TIMES, DIVIDE),
+                       ('right', UMINUS),
+                   )
+                   ...
+
+           Nevermind the fact that ID, NUMBER, PLUS, and MINUS appear to be
+           undefined identifiers.  It all works.
+
+1/27/2018  Tokens now allow special-case remapping.  For example:
+
+               from sly import Lexer
+
+               class TheLexer(Lexer):
+                   tokens = { ID, IF, ELSE, WHILE, NUMBER, PLUS, MINUS }
+
+                   ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+                   ID['if'] = IF
+                   ID['else'] = ELSE
+                   ID['while'] = WHILE
+
+                   NUMBER = r'\d+'
+                   PLUS = r'\+'
+                   MINUS = r'-'
+
+           In this code, the ID rule matches any identifier.  However,
+           special cases have been made for IF, ELSE, and WHILE tokens.
+           Previously, this had to be handled in a special action method
+           such as this:
+
+               def ID(self, t):
+                   if t.value in { 'if', 'else', 'while' }:
+                       t.type = t.value.upper()
+                   return t
+
+           Nevermind the fact that the syntax appears to suggest that strings
+           work as a kind of mutable mapping.
+
 1/16/2018  Usability improvement on Lexer class.  Regular expression rules
            specified as strings that don't match any name in tokens are
            now reported as errors.
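The bare-name trick in the two entries above comes from the metaclass changes
to sly/lex.py and sly/yacc.py later in this diff.  Reduced to a minimal
standalone sketch (names here are invented for illustration, not SLY's API),
the idea is a class namespace whose lookup turns any unknown ALL-CAPS
identifier into its own string:

    class MagicDict(dict):
        def __getitem__(self, key):
            # An undefined NAME used in the class body resolves to 'NAME'
            if key not in self and key.isupper() and key[:1] != '_':
                return key
            return super().__getitem__(key)

    class MagicMeta(type):
        @classmethod
        def __prepare__(meta, clsname, bases):
            return MagicDict()

    class Demo(metaclass=MagicMeta):
        tokens = { ID, NUMBER, PLUS, MINUS }    # no NameError raised here

    print(Demo.tokens)    # {'ID', 'NUMBER', 'PLUS', 'MINUS'} (set order may vary)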
@@ -1,6 +1,6 @@
-SLY (Sly Lex-Yacc) Version 0.2
+SLY (Sly Lex-Yacc) Version 0.3
 
-Copyright (C) 2016-2017
+Copyright (C) 2016-2018
 David M. Beazley (Dabeaz LLC)
 All rights reserved.
 
@@ -85,9 +85,7 @@ expressions and store variables:
     from sly import Lexer, Parser
 
     class CalcLexer(Lexer):
-        tokens = {
-            'NAME', 'NUMBER',
-        }
+        tokens = { NAME, NUMBER }
         ignore = ' \t'
         literals = { '=', '+', '-', '*', '/', '(', ')' }
 
@@ -60,9 +60,7 @@ expressions and store variables::
     from sly import Lexer, Parser
 
     class CalcLexer(Lexer):
-        tokens = {
-            'NAME', 'NUMBER',
-        }
+        tokens = { NAME, NUMBER }
         ignore = ' \t'
         literals = { '=', '+', '-', '*', '/', '(', ')' }
 

docs/sly.rst | 165
@@ -68,17 +68,8 @@ lexer that tokenizes the above text::
 
     class CalcLexer(Lexer):
         # Set of token names. This is always required
-        tokens = {
-            'ID',
-            'NUMBER',
-            'PLUS',
-            'MINUS',
-            'TIMES',
-            'DIVIDE',
-            'ASSIGN',
-            'LPAREN',
-            'RPAREN',
-        }
+        tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
+                   DIVIDE, ASSIGN, LPAREN, RPAREN }
 
         # String containing ignored characters between tokens
         ignore = ' \t'
@@ -131,19 +122,12 @@ In the example, the following code specified the token names::
     class CalcLexer(Lexer):
         ...
         # Set of token names. This is always required
-        tokens = {
-            'ID',
-            'NUMBER',
-            'PLUS',
-            'MINUS',
-            'TIMES',
-            'DIVIDE',
-            'ASSIGN',
-            'LPAREN',
-            'RPAREN',
-        }
+        tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
+                   DIVIDE, ASSIGN, LPAREN, RPAREN }
         ...
 
+Token names should be specified using all-caps as shown.
+
 Specification of token match patterns
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -167,7 +151,7 @@ short tokens. For example, if you wanted to have separate tokens for
 example::
 
     class MyLexer(Lexer):
-        tokens = {'ASSIGN', 'EQ', ...}
+        tokens = { ASSIGN, EQ, ...}
         ...
         EQ     = r'=='       # MUST APPEAR FIRST! (LONGER)
         ASSIGN = r'='
@@ -251,12 +235,10 @@ Instead of using the ``@_()`` decorator, you can also write a method
 that matches the same name as a token previously specified as a
 string. For example::
 
-    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+    NUMBER = r'\d+'
     ...
-    def ID(self, t):
-        reserved = { 'if', 'else', 'while', 'for' }
-        if t.value in reserved:
-            t.type = t.value.upper()
+    def NUMBER(self, t):
+        t.value = int(t.value)
         return t
 
 This is potentially useful trick for debugging a lexer. You can temporarily
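Filled out into a complete class for illustration (a sketch; the fragment
above is what the docs actually show), the same-name trick looks like this:

    from sly import Lexer

    class CalcLexer(Lexer):
        tokens = { NUMBER, PLUS }
        ignore = ' \t'

        NUMBER = r'\d+'
        PLUS = r'\+'

        # Because NUMBER was first given as a string, this method picks up
        # that pattern and runs for every NUMBER token.  Removing the method
        # restores the original behavior -- handy as a temporary debug hook.
        def NUMBER(self, t):
            t.value = int(t.value)
            return t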
@@ -264,6 +246,36 @@ attach a method a token and have it execute when the token is encountered.
 If you later take the method away, the lexer will revert back to its original
 behavior.
 
+Token Remapping
+^^^^^^^^^^^^^^^
+
+Occasionally, you might need to remap tokens based on special cases.
+Consider the case of matching identifiers such as "abc", "python", or "guido".
+Certain identifiers such as "if", "else", and "while" might need to be
+treated as special keywords.  To handle this, include token remapping rules when
+writing the lexer like this::
+
+    # calclex.py
+
+    from sly import Lexer
+
+    class CalcLexer(Lexer):
+        tokens = { ID, IF, ELSE, WHILE }
+        # String containing ignored characters (between tokens)
+        ignore = ' \t'
+
+        # Base ID rule
+        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+
+        # Special cases
+        ID['if'] = IF
+        ID['else'] = ELSE
+        ID['while'] = WHILE
+
+When parsing an identifier, the special cases will remap certain matching
+values to a new token type. For example, if the value of an identifier is
+"if" above, an ``IF`` token will be generated.
+
 Line numbers and position tracking
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
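For illustration (a sketch, not text from the commit), running the remapping
lexer defined in the added section above on a small input would produce
IF/ELSE/WHILE tokens in place of plain IDs; the repr format matches the Token
output shown later in this diff:

    lexer = CalcLexer()
    for tok in lexer.tokenize('while x if foo'):
        print(tok)

    # Expected output (sketch):
    #   Token(type='WHILE', value='while', lineno=1, index=0)
    #   Token(type='ID', value='x', lineno=1, index=6)
    #   Token(type='IF', value='if', lineno=1, index=8)
    #   Token(type='ID', value='foo', lineno=1, index=11)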
@@ -385,26 +397,11 @@ into practice::
     from sly import Lexer
 
     class CalcLexer(Lexer):
-        # Set of reserved names (language keywords)
-        reserved_words = { 'WHILE', 'IF', 'ELSE', 'PRINT' }
 
         # Set of token names. This is always required
-        tokens = {
-            'NUMBER',
-            'ID',
-            'PLUS',
-            'MINUS',
-            'TIMES',
-            'DIVIDE',
-            'ASSIGN',
-            'EQ',
-            'LT',
-            'LE',
-            'GT',
-            'GE',
-            'NE',
-            *reserved_words,
-        }
+        tokens = { NUMBER, ID, WHILE, IF, ELSE, PRINT,
+                   PLUS, MINUS, TIMES, DIVIDE, ASSIGN,
+                   EQ, LT, LE, GT, GE, NE }
 
         literals = { '(', ')', '{', '}', ';' }
 
@@ -429,12 +426,12 @@ into practice::
             t.value = int(t.value)
             return t
 
-        @_(r'[a-zA-Z_][a-zA-Z0-9_]*')
-        def ID(self, t):
-            # Check if name matches a reserved word (change token type if true)
-            if t.value.upper() in self.reserved_words:
-                t.type = t.value.upper()
-            return t
+        # Identifiers and keywords
+        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+        ID['if'] = IF
+        ID['else'] = ELSE
+        ID['while'] = WHILE
+        ID['print'] = PRINT
 
         ignore_comment = r'\#.*'
 
@@ -443,8 +440,8 @@ into practice::
         def ignore_newline(self, t):
             self.lineno += t.value.count('\n')
 
-        def error(self, value):
-            print('Line %d: Bad character %r' % (self.lineno, value[0]))
+        def error(self, t):
+            print('Line %d: Bad character %r' % (self.lineno, t.value[0]))
             self.index += 1
 
     if __name__ == '__main__':
@@ -462,27 +459,27 @@ into practice::
 
 If you run this code, you'll get output that looks like this::
 
-    Token(ID, 'x', 3, 12)
-    Token(ASSIGN, '=', 3, 14)
-    Token(NUMBER, 0, 3, 16)
-    Token(;, ';', 3, 17)
-    Token(WHILE, 'while', 4, 19)
-    Token((, '(', 4, 25)
-    Token(ID, 'x', 4, 26)
-    Token(LT, '<', 4, 28)
-    Token(NUMBER, 10, 4, 30)
-    Token(), ')', 4, 32)
-    Token({, '{', 4, 34)
-    Token(PRINT, 'print', 5, 40)
-    Token(ID, 'x', 5, 46)
+    Token(type='ID', value='x', lineno=3, index=20)
+    Token(type='ASSIGN', value='=', lineno=3, index=22)
+    Token(type='NUMBER', value=0, lineno=3, index=24)
+    Token(type=';', value=';', lineno=3, index=25)
+    Token(type='WHILE', value='while', lineno=4, index=31)
+    Token(type='(', value='(', lineno=4, index=37)
+    Token(type='ID', value='x', lineno=4, index=38)
+    Token(type='LT', value='<', lineno=4, index=40)
+    Token(type='NUMBER', value=10, lineno=4, index=42)
+    Token(type=')', value=')', lineno=4, index=44)
+    Token(type='{', value='{', lineno=4, index=46)
+    Token(type='PRINT', value='print', lineno=5, index=56)
+    Token(type='ID', value='x', lineno=5, index=62)
     Line 5: Bad character ':'
-    Token(ID, 'x', 6, 53)
-    Token(ASSIGN, '=', 6, 55)
-    Token(ID, 'x', 6, 57)
-    Token(PLUS, '+', 6, 59)
-    Token(NUMBER, 1, 6, 61)
-    Token(;, ';', 6, 62)
-    Token(}, '}', 7, 64)
+    Token(type='ID', value='x', lineno=6, index=73)
+    Token(type='ASSIGN', value='=', lineno=6, index=75)
+    Token(type='ID', value='x', lineno=6, index=77)
+    Token(type='PLUS', value='+', lineno=6, index=79)
+    Token(type='NUMBER', value=1, lineno=6, index=81)
+    Token(type=';', value=';', lineno=6, index=82)
+    Token(type='}', value='}', lineno=7, index=88)
 
 Study this example closely. It might take a bit to digest, but all of the
 essential parts of writing a lexer are there. Tokens have to be specified
@@ -914,8 +911,8 @@ like this::
     class CalcParser(Parser):
         ...
         precedence = (
-            ('left', 'PLUS', 'MINUS'),
-            ('left', 'TIMES', 'DIVIDE'),
+            ('left', PLUS, MINUS),
+            ('left', TIMES, DIVIDE),
         )
 
         # Rules where precedence is applied
@@ -1004,9 +1001,9 @@ like this::
     class CalcParser(Parser):
         ...
         precedence = (
-            ('left', 'PLUS', 'MINUS'),
-            ('left', 'TIMES', 'DIVIDE'),
-            ('right', 'UMINUS'),            # Unary minus operator
+            ('left', PLUS, MINUS),
+            ('left', TIMES, DIVIDE),
+            ('right', UMINUS),            # Unary minus operator
         )
 
 Now, in the grammar file, you write the unary minus rule like this::
@@ -1034,10 +1031,10 @@ operators like ``<`` and ``>`` but you didn't want combinations like
     class MyParser(Parser):
         ...
         precedence = (
-            ('nonassoc', 'LESSTHAN', 'GREATERTHAN'),  # Nonassociative operators
-            ('left', 'PLUS', 'MINUS'),
-            ('left', 'TIMES', 'DIVIDE'),
-            ('right', 'UMINUS'),            # Unary minus operator
+            ('nonassoc', LESSTHAN, GREATERTHAN),  # Nonassociative operators
+            ('left', PLUS, MINUS),
+            ('left', TIMES, DIVIDE),
+            ('right', UMINUS),            # Unary minus operator
         )
 
 If you do this, the occurrence of input text such as ``a < b < c``
@@ -9,28 +9,16 @@ from sly import Lexer, Parser
 
 class CalcLexer(Lexer):
     # Set of token names. This is always required
-    tokens = {
-        'ID',
-        'NUMBER',
-        'PLUS',
-        'MINUS',
-        'TIMES',
-        'DIVIDE',
-        'ASSIGN',
-        'LPAREN',
-        'RPAREN',
-    }
+    tokens = { NUMBER, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN }
 
     # String containing ignored characters between tokens
     ignore = ' \t'
 
     # Regular expression rules for tokens
-    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
     PLUS = r'\+'
     MINUS = r'-'
     TIMES = r'\*'
     DIVIDE = r'/'
-    ASSIGN = r'='
     LPAREN = r'\('
     RPAREN = r'\)'
 
@@ -8,9 +8,7 @@ sys.path.insert(0, "../..")
 from sly import Lexer, Parser
 
 class CalcLexer(Lexer):
-    tokens = {
-        'NAME', 'NUMBER',
-    }
+    tokens = { NAME, NUMBER }
     ignore = ' \t'
     literals = { '=', '+', '-', '*', '/', '(', ')' }
 
@@ -36,7 +34,7 @@ class CalcParser(Parser):
     precedence = (
         ('left', '+', '-'),
         ('left', '*', '/'),
-        ('right', 'UMINUS'),
+        ('right', UMINUS),
         )
 
     def __init__(self):

sly/lex.py | 56
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # sly: lex.py
 #
-# Copyright (C) 2016
+# Copyright (C) 2016 - 2018
 # David M. Beazley (Dabeaz LLC)
 # All rights reserved.
 #
@@ -31,11 +31,10 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # -----------------------------------------------------------------------------
 
-__version__ = '0.2'
+__version__ = '0.3'
 __all__ = ['Lexer', 'LexerStateChange']
 
 import re
-from collections import OrderedDict
 
 class LexError(Exception):
     '''
@@ -78,20 +77,41 @@ class Token(object):
     def __repr__(self):
         return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
 
-class LexerMetaDict(OrderedDict):
+class TokenStr(str):
+    @staticmethod
+    def __new__(cls, value):
+        self = super().__new__(cls, value)
+        self.remap = { }
+        return self
+
+    def __setitem__(self, key, value):
+        self.remap[key] = value
+
+class LexerMetaDict(dict):
     '''
     Special dictionary that prohits duplicate definitions in lexer specifications.
     '''
     def __setitem__(self, key, value):
+        if isinstance(value, str):
+            value = TokenStr(value)
+
         if key in self and not isinstance(value, property):
-            if isinstance(self[key], str):
+            prior = self[key]
+            if isinstance(prior, str):
                 if callable(value):
-                    value.pattern = self[key]
+                    value.pattern = prior
+                    value.remap = getattr(prior, 'remap', None)
             else:
                 raise AttributeError(f'Name {key} redefined')
 
         super().__setitem__(key, value)
 
+    def __getitem__(self, key):
+        if key not in self and key.isupper() and key[:1] != '_':
+            return key
+        else:
+            return super().__getitem__(key)
+
 class LexerMeta(type):
     '''
     Metaclass for collecting lexing rules
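The TokenStr wrapper added above is what makes the ``ID['if'] = IF`` syntax
from the CHANGES entry work: string values assigned in the class body are
wrapped so that item assignment records a remap entry instead of failing on a
plain string.  A standalone sketch of that behavior (illustrative only, using
the class defined in this hunk):

    pattern = TokenStr(r'[a-zA-Z_][a-zA-Z0-9_]*')
    pattern['if'] = 'IF'          # no error: recorded in pattern.remap
    pattern['while'] = 'WHILE'
    print(str(pattern))           # the regular expression text is unchanged
    print(pattern.remap)          # {'if': 'IF', 'while': 'WHILE'}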
@@ -114,7 +134,12 @@ class LexerMeta(type):
 
     def __new__(meta, clsname, bases, attributes):
         del attributes['_']
+        remapping = { key: val.remap for key, val in attributes.items()
+                      if getattr(val, 'remap', None) }
+        attributes = { key: str(val) if isinstance(val, TokenStr) else val
+                       for key, val in attributes.items() }
         cls = super().__new__(meta, clsname, bases, attributes)
+        cls._remapping = remapping
         cls._build(list(attributes.items()))
         return cls
 
@@ -159,6 +184,16 @@ class Lexer(metaclass=LexerMeta):
         cls._ignored_tokens = set(cls._ignored_tokens)
         cls._token_funcs = dict(cls._token_funcs)
 
+        # Build a set of all remapped tokens
+        remapped_tokens = set()
+        for toks in cls._remapping.values():
+            remapped_tokens.update(toks.values())
+
+        undefined = remapped_tokens - cls._token_names
+        if undefined:
+            missing = ', '.join(undefined)
+            raise LexerBuildError(f'{missing} not included in token(s)')
+
         parts = []
         for tokname, value in cls._collect_rules(definitions):
             if tokname.startswith('ignore_'):
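Assuming the check above behaves as written, remapping to a token name that
was never declared should now fail when the class is defined, roughly like
this (a sketch, not output copied from a real run):

    from sly import Lexer

    class BadLexer(Lexer):
        tokens = { ID }                 # IF is deliberately missing
        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
        ID['if'] = IF                   # expected: LexerBuildError: IF not included in token(s)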
@@ -169,8 +204,10 @@
                 pattern = value
 
             elif callable(value):
-                pattern = value.pattern
                 cls._token_funcs[tokname] = value
+                pattern = getattr(value, 'pattern', None)
+                if not pattern:
+                    continue
 
             # Form the regular expression component
             part = f'(?P<{tokname}>{pattern})'
@@ -209,7 +246,7 @@
         _ignore = self.ignore
         _token_funcs = self._token_funcs
         _literals = self._literals
+        _remapping = self._remapping
         self.text = text
         try:
             while True:
@@ -228,6 +265,9 @@
                 index = m.end()
                 tok.value = m.group()
                 tok.type = m.lastgroup
+                if tok.type in _remapping:
+                    tok.type = _remapping[tok.type].get(tok.value, tok.type)
+
                 if tok.type in _token_funcs:
                     self.index = index
                     self.lineno = lineno

sly/yacc.py | 20
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # sly: yacc.py
 #
-# Copyright (C) 2016-2017
+# Copyright (C) 2016-2018
 # David M. Beazley (Dabeaz LLC)
 # All rights reserved.
 #
@@ -35,7 +35,7 @@ import sys
 import inspect
 from collections import OrderedDict, defaultdict
 
-__version__ = '0.2'
+__version__ = '0.3'
 __all__ = [ 'Parser' ]
 
 class YaccError(Exception):
@@ -55,12 +55,12 @@ ERROR_COUNT = 3 # Number of symbols that must be shifted to leave
 MAXINT = sys.maxsize
 
 # This object is a stand-in for a logging object created by the
-# logging module. PLY will use this by default to create things
+# logging module. SLY will use this by default to create things
 # such as the parser.out file. If a user wants more detailed
 # information, they can create their own logging object and pass
-# it into PLY.
+# it into SLY.
 
-class PlyLogger(object):
+class SlyLogger(object):
     def __init__(self, f):
         self.f = f
 
@@ -1552,7 +1552,7 @@ def _collect_grammar_rules(func):
 
     return grammar
 
-class ParserMetaDict(OrderedDict):
+class ParserMetaDict(dict):
     '''
     Dictionary that allows decorated grammar rule functions to be overloaded
     '''
@@ -1561,6 +1561,12 @@ class ParserMetaDict(OrderedDict):
             value.next_func = self[key]
         super().__setitem__(key, value)
 
+    def __getitem__(self, key):
+        if key not in self and key.isupper() and key[:1] != '_':
+            return key.upper()
+        else:
+            return super().__getitem__(key)
+
 class ParserMeta(type):
     @classmethod
     def __prepare__(meta, *args, **kwargs):
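The parser-side namespace mirrors the lexer trick: an unknown ALL-CAPS name
read inside a Parser class body resolves to its own upper-cased string, which
is what lets precedence tables be written without quotes, for example
(repeating the form used in the docs changes above):

    precedence = (
        ('left', PLUS, MINUS),      # equivalent to ('left', 'PLUS', 'MINUS')
        ('left', TIMES, DIVIDE),
        ('right', UMINUS),
    )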
@@ -1582,7 +1588,7 @@ class ParserMeta(type):
 
 class Parser(metaclass=ParserMeta):
     # Logging object where debugging/diagnostic messages are sent
-    log = PlyLogger(sys.stderr)
+    log = SlyLogger(sys.stderr)
 
     # Debugging filename where parsetab.out data can be written
     debugfile = None
@@ -99,8 +99,93 @@ def test_error_return():
     assert lexer.errors == [ ':+-' ]
 
 
+class ModernCalcLexer(Lexer):
+    # Set of token names. This is always required
+    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, LT, LE, IF, ELSE }
+    literals = { '(', ')' }
+
+    # String containing ignored characters between tokens
+    ignore = ' \t'
+
+    # Regular expression rules for tokens
+    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+    ID['if'] = IF
+    ID['else'] = ELSE
+
+    NUMBER = r'\d+'
+    PLUS = r'\+'
+    MINUS = r'-'
+    TIMES = r'\*'
+    DIVIDE = r'/'
+    ASSIGN = r'='
+    LE = r'<='
+    LT = r'<'
+
+    def NUMBER(self, t):
+        t.value = int(t.value)
+        return t
+
+    # Ignored text
+    ignore_comment = r'\#.*'
+
+    @_(r'\n+')
+    def ignore_newline(self, t):
+        self.lineno += t.value.count('\n')
+
+    # Attached rule
+    def ID(self, t):
+        t.value = t.value.upper()
+        return t
+
+    def error(self, t):
+        self.errors.append(t.value)
+        self.index += 1
+        if hasattr(self, 'return_error'):
+            return t
+
+    def __init__(self):
+        self.errors = []
+
+
+# Test basic recognition of various tokens and literals
+def test_modern_tokens():
+    lexer = ModernCalcLexer()
+    toks = list(lexer.tokenize('abc if else 123 + - * / = < <= ( )'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    assert types == ['ID','IF','ELSE', 'NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
+    assert vals == ['ABC','if','else', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
+
+# Test ignored comments and newlines
+def test_modern_ignored():
+    lexer = ModernCalcLexer()
+    toks = list(lexer.tokenize('\n\n# A comment\n123\nabc\n'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    linenos = [t.lineno for t in toks]
+    assert types == ['NUMBER', 'ID']
+    assert vals == [123, 'ABC']
+    assert linenos == [4,5]
+    assert lexer.lineno == 6
+
+# Test error handling
+def test_modern_error():
+    lexer = ModernCalcLexer()
+    toks = list(lexer.tokenize('123 :+-'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    assert types == ['NUMBER', 'PLUS', 'MINUS']
+    assert vals == [123, '+', '-']
+    assert lexer.errors == [ ':+-' ]
+
+# Test error token return handling
+def test_modern_error_return():
+    lexer = ModernCalcLexer()
+    lexer.return_error = True
+    toks = list(lexer.tokenize('123 :+-'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    assert types == ['NUMBER', 'ERROR', 'PLUS', 'MINUS']
+    assert vals == [123, ':+-', '+', '-']
+    assert lexer.errors == [ ':+-' ]
 
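Assuming these tests live in SLY's pytest-based test suite (the test file's
path is not shown in this diff), they can be selected by name when running
the suite, for example:

    python -m pytest -k modern -v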
@@ -3,16 +3,7 @@ from sly import Lexer, Parser
 
 class CalcLexer(Lexer):
     # Set of token names. This is always required
-    tokens = {
-        'ID',
-        'NUMBER',
-        'PLUS',
-        'MINUS',
-        'TIMES',
-        'DIVIDE',
-        'ASSIGN',
-    }
+    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN }
 
     literals = { '(', ')' }
 
     # String containing ignored characters between tokens
@@ -38,8 +29,8 @@ class CalcLexer(Lexer):
     def newline(self, t):
         self.lineno += t.value.count('\n')
 
-    def error(self, value):
-        self.errors.append(value)
+    def error(self, t):
+        self.errors.append(t.value[0])
         self.index += 1
 
     def __init__(self):
@@ -49,9 +40,9 @@ class CalcParser(Parser):
     tokens = CalcLexer.tokens
 
     precedence = (
-        ('left', 'PLUS', 'MINUS'),
-        ('left', 'TIMES', 'DIVIDE'),
-        ('right', 'UMINUS'),
+        ('left', PLUS, MINUS),
+        ('left', TIMES, DIVIDE),
+        ('right', UMINUS),
         )
 
     def __init__(self):