Changes to token specification. More metamagic

David Beazley
2018-01-27 15:27:15 -06:00
parent b74e7223ce
commit b088d9b2ce
10 changed files with 302 additions and 142 deletions


@@ -60,9 +60,7 @@ expressions and store variables::

     from sly import Lexer, Parser

     class CalcLexer(Lexer):
-        tokens = {
-            'NAME', 'NUMBER',
-        }
+        tokens = { NAME, NUMBER }
         ignore = ' \t'
         literals = { '=', '+', '-', '*', '/', '(', ')' }
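
The bare ``NAME`` and ``NUMBER`` names above are the "metamagic" the commit
message refers to: nothing defines them, yet the class body can use them. A
minimal sketch of how such a thing can work, assuming a metaclass whose
``__prepare__`` hook supplies a dict subclass as the class-body namespace
(this illustrates the general technique, not SLY's actual code; ``TokenDict``
and ``LexerMeta`` are hypothetical names)::

    class TokenDict(dict):
        # Called for names not yet defined in the class body.
        def __missing__(self, key):
            if key.isupper():
                return key          # an undefined ALL-CAPS name becomes its own string
            raise KeyError(key)     # everything else resolves normally

    class LexerMeta(type):
        @classmethod
        def __prepare__(mcs, name, bases):
            return TokenDict()      # used as the namespace while the body executes

        def __new__(mcs, name, bases, namespace):
            return super().__new__(mcs, name, bases, dict(namespace))

    class Lexer(metaclass=LexerMeta):
        pass

    class CalcLexer(Lexer):
        tokens = { NAME, NUMBER }   # resolves to {'NAME', 'NUMBER'}

    print(CalcLexer.tokens)         # {'NAME', 'NUMBER'} (set order may vary)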


@@ -68,17 +68,8 @@ lexer that tokenizes the above text::

     class CalcLexer(Lexer):
         # Set of token names. This is always required
-        tokens = {
-            'ID',
-            'NUMBER',
-            'PLUS',
-            'MINUS',
-            'TIMES',
-            'DIVIDE',
-            'ASSIGN',
-            'LPAREN',
-            'RPAREN',
-        }
+        tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
+                   DIVIDE, ASSIGN, LPAREN, RPAREN }

         # String containing ignored characters between tokens
         ignore = ' \t'
@@ -131,19 +122,12 @@ In the example, the following code specified the token names::

     class CalcLexer(Lexer):
         ...
         # Set of token names. This is always required
-        tokens = {
-            'ID',
-            'NUMBER',
-            'PLUS',
-            'MINUS',
-            'TIMES',
-            'DIVIDE',
-            'ASSIGN',
-            'LPAREN',
-            'RPAREN',
-        }
+        tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
+                   DIVIDE, ASSIGN, LPAREN, RPAREN }
         ...

 Token names should be specified using all-caps as shown.

 Specification of token match patterns
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -167,7 +151,7 @@ short tokens. For example, if you wanted to have separate tokens for
 example::

     class MyLexer(Lexer):
-        tokens = {'ASSIGN', 'EQ', ...}
+        tokens = { ASSIGN, EQ, ...}
         ...
         EQ      = r'=='       # MUST APPEAR FIRST! (LONGER)
         ASSIGN  = r'='
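
Since patterns defined as strings are tried in the order they appear in the
class, the ordering rule is easy to verify with a tiny lexer. A runnable
sketch, assuming SLY is installed (the token names and input are arbitrary)::

    from sly import Lexer

    class MyLexer(Lexer):
        tokens = { NAME, EQ, ASSIGN }
        ignore = ' '

        EQ     = r'=='     # longer pattern listed first
        ASSIGN = r'='
        NAME   = r'[a-zA-Z_][a-zA-Z0-9_]*'

    for tok in MyLexer().tokenize('a == b'):
        print(tok.type, tok.value)

    # Prints NAME/EQ/NAME.  With ASSIGN listed before EQ, the same input
    # would tokenize as NAME/ASSIGN/ASSIGN/NAME instead.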
@@ -251,12 +235,10 @@ Instead of using the ``@_()`` decorator, you can also write a method
 that matches the same name as a token previously specified as a
 string. For example::

-    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+    NUMBER = r'\d+'
     ...
-    def ID(self, t):
-        reserved = { 'if', 'else', 'while', 'for' }
-        if t.value in reserved:
-            t.type = t.value.upper()
+    def NUMBER(self, t):
+        t.value = int(t.value)
         return t

 This is a potentially useful trick for debugging a lexer. You can temporarily
@@ -264,6 +246,36 @@ attach a method to a token and have it execute when the token is encountered.
 If you later take the method away, the lexer will revert to its original
 behavior.
+Token Remapping
+^^^^^^^^^^^^^^^
+
+Occasionally, you might need to remap tokens based on special cases.
+Consider the case of matching identifiers such as "abc", "python", or "guido".
+Certain identifiers such as "if", "else", and "while" might need to be
+treated as special keywords.  To handle this, include token remapping rules
+when writing the lexer like this::
+
+    # calclex.py
+
+    from sly import Lexer
+
+    class CalcLexer(Lexer):
+        tokens = { ID, IF, ELSE, WHILE }
+
+        # String containing ignored characters (between tokens)
+        ignore = ' \t'
+
+        # Base ID rule
+        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+
+        # Special cases
+        ID['if'] = IF
+        ID['else'] = ELSE
+        ID['while'] = WHILE
+
+When parsing an identifier, the special cases will remap certain matching
+values to a new token type.  For example, if the value of an identifier is
+"if" above, an ``IF`` token will be generated.
 Line numbers and position tracking
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -385,26 +397,11 @@ into practice::

     from sly import Lexer

     class CalcLexer(Lexer):
-        # Set of reserved names (language keywords)
-        reserved_words = { 'WHILE', 'IF', 'ELSE', 'PRINT' }
-
         # Set of token names. This is always required
-        tokens = {
-            'NUMBER',
-            'ID',
-            'PLUS',
-            'MINUS',
-            'TIMES',
-            'DIVIDE',
-            'ASSIGN',
-            'EQ',
-            'LT',
-            'LE',
-            'GT',
-            'GE',
-            'NE',
-            *reserved_words,
-        }
+        tokens = { NUMBER, ID, WHILE, IF, ELSE, PRINT,
+                   PLUS, MINUS, TIMES, DIVIDE, ASSIGN,
+                   EQ, LT, LE, GT, GE, NE }

         literals = { '(', ')', '{', '}', ';' }
@@ -429,12 +426,12 @@ into practice::
             t.value = int(t.value)
             return t

-        @_(r'[a-zA-Z_][a-zA-Z0-9_]*')
-        def ID(self, t):
-            # Check if name matches a reserved word (change token type if true)
-            if t.value.upper() in self.reserved_words:
-                t.type = t.value.upper()
-            return t
+        # Identifiers and keywords
+        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+        ID['if'] = IF
+        ID['else'] = ELSE
+        ID['while'] = WHILE
+        ID['print'] = PRINT

         ignore_comment = r'\#.*'
@@ -443,8 +440,8 @@ into practice::
         def ignore_newline(self, t):
             self.lineno += t.value.count('\n')

-        def error(self, value):
-            print('Line %d: Bad character %r' % (self.lineno, value[0]))
+        def error(self, t):
+            print('Line %d: Bad character %r' % (self.lineno, t.value[0]))
             self.index += 1

 if __name__ == '__main__':
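
The signature change above is worth noting: the ``error()`` hook now receives
a token-like object ``t`` whose ``value`` holds the remaining input text, so
``t.value[0]`` is the offending character, and advancing ``self.index`` skips
past it so tokenizing can resume. A small sketch of the behavior, assuming
SLY is installed (the lexer and input are arbitrary)::

    from sly import Lexer

    class DemoLexer(Lexer):
        tokens = { NAME }
        ignore = ' '
        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

        def error(self, t):
            print('Line %d: Bad character %r' % (self.lineno, t.value[0]))
            self.index += 1     # skip the bad character and continue

    for tok in DemoLexer().tokenize('a $ b'):
        print(tok.type, tok.value)

    # NAME a
    # Line 1: Bad character '$'
    # NAME b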
@@ -462,27 +459,27 @@ into practice::

 If you run this code, you'll get output that looks like this::

-    Token(ID, 'x', 3, 12)
-    Token(ASSIGN, '=', 3, 14)
-    Token(NUMBER, 0, 3, 16)
-    Token(;, ';', 3, 17)
-    Token(WHILE, 'while', 4, 19)
-    Token((, '(', 4, 25)
-    Token(ID, 'x', 4, 26)
-    Token(LT, '<', 4, 28)
-    Token(NUMBER, 10, 4, 30)
-    Token(), ')', 4, 32)
-    Token({, '{', 4, 34)
-    Token(PRINT, 'print', 5, 40)
-    Token(ID, 'x', 5, 46)
+    Token(type='ID', value='x', lineno=3, index=20)
+    Token(type='ASSIGN', value='=', lineno=3, index=22)
+    Token(type='NUMBER', value=0, lineno=3, index=24)
+    Token(type=';', value=';', lineno=3, index=25)
+    Token(type='WHILE', value='while', lineno=4, index=31)
+    Token(type='(', value='(', lineno=4, index=37)
+    Token(type='ID', value='x', lineno=4, index=38)
+    Token(type='LT', value='<', lineno=4, index=40)
+    Token(type='NUMBER', value=10, lineno=4, index=42)
+    Token(type=')', value=')', lineno=4, index=44)
+    Token(type='{', value='{', lineno=4, index=46)
+    Token(type='PRINT', value='print', lineno=5, index=56)
+    Token(type='ID', value='x', lineno=5, index=62)
     Line 5: Bad character ':'
-    Token(ID, 'x', 6, 53)
-    Token(ASSIGN, '=', 6, 55)
-    Token(ID, 'x', 6, 57)
-    Token(PLUS, '+', 6, 59)
-    Token(NUMBER, 1, 6, 61)
-    Token(;, ';', 6, 62)
-    Token(}, '}', 7, 64)
+    Token(type='ID', value='x', lineno=6, index=73)
+    Token(type='ASSIGN', value='=', lineno=6, index=75)
+    Token(type='ID', value='x', lineno=6, index=77)
+    Token(type='PLUS', value='+', lineno=6, index=79)
+    Token(type='NUMBER', value=1, lineno=6, index=81)
+    Token(type=';', value=';', lineno=6, index=82)
+    Token(type='}', value='}', lineno=7, index=88)

 Study this example closely. It might take a bit to digest, but all of the
 essential parts of writing a lexer are there. Tokens have to be specified
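
For reference, the listing comes from running the lexer over a small sample
program in the ``__main__`` block shown truncated above. A sketch of that
driver (the exact sample text is an assumption reconstructed from the line
numbers in the output)::

    if __name__ == '__main__':
        data = '''
            # counting
            x = 0;
            while (x < 10) {
                print x:
                x = x + 1;
            }
        '''
        lexer = CalcLexer()
        for tok in lexer.tokenize(data):
            print(tok)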
@@ -914,8 +911,8 @@ like this::

     class CalcParser(Parser):
         ...
         precedence = (
-            ('left', 'PLUS', 'MINUS'),
-            ('left', 'TIMES', 'DIVIDE'),
+            ('left', PLUS, MINUS),
+            ('left', TIMES, DIVIDE),
         )

         # Rules where precedence is applied
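
The rules referred to by that comment are the ordinary binary-operator
productions. A sketch of what they look like in SLY's decorator style (an
illustration; ``p.expr0``/``p.expr1`` follow SLY's convention of numbering
repeated symbols)::

    @_('expr PLUS expr',
       'expr MINUS expr',
       'expr TIMES expr',
       'expr DIVIDE expr')
    def expr(self, p):
        # p.expr0 and p.expr1 are the left and right operands
        ...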
@@ -1004,9 +1001,9 @@ like this::

     class CalcParser(Parser):
         ...
         precedence = (
-            ('left', 'PLUS', 'MINUS'),
-            ('left', 'TIMES', 'DIVIDE'),
-            ('right', 'UMINUS'),            # Unary minus operator
+            ('left', PLUS, MINUS),
+            ('left', TIMES, DIVIDE),
+            ('right', UMINUS),              # Unary minus operator
         )

 Now, in the grammar file, you write the unary minus rule like this::
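
(That rule falls outside the displayed hunk; in SLY it uses a ``%prec``
override to force the ``UMINUS`` precedence onto the production, roughly like
this sketch, not the file's exact text)::

    @_('MINUS expr %prec UMINUS')
    def expr(self, p):
        return -p.expr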
@@ -1034,10 +1031,10 @@ operators like ``<`` and ``>`` but you didn't want combinations like

     class MyParser(Parser):
         ...
         precedence = (
-            ('nonassoc', 'LESSTHAN', 'GREATERTHAN'),  # Nonassociative operators
-            ('left', 'PLUS', 'MINUS'),
-            ('left', 'TIMES', 'DIVIDE'),
-            ('right', 'UMINUS'),            # Unary minus operator
+            ('nonassoc', LESSTHAN, GREATERTHAN),      # Nonassociative operators
+            ('left', PLUS, MINUS),
+            ('left', TIMES, DIVIDE),
+            ('right', UMINUS),              # Unary minus operator
         )

 If you do this, the occurrence of input text such as ``a < b < c``