Various work in progress. Position tracking
This commit is contained in:
@ -1,6 +1,18 @@
In Progress
03/25/2022 Added automatic location tracking to the parser. Use
Parser.line_position(value) to return the line number
and Parser.index_position(value) to return a (start, end)
index pair. value is *any* object returned by one of
the various methods in the parser definition. Typically,
it would be an AST node. The parser tracks the data using
the value of id(value).
03/25/2022 Added .end attribute to tokens that specifies the ending
index of the matching text. This is used to do more
precise location tracking for the purpose of issuing
more useful error messages.
05/09/2020 Experimental support for EBNF choices. For example:
@_('term { PLUS|MINUS term }')
@ -73,9 +73,9 @@ class Token(object):
Representation of a single token.
__slots__ = ('type', 'value', 'lineno', 'index')
__slots__ = ('type', 'value', 'lineno', 'index', 'end')
def __repr__(self):
return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'
class TokenStr(str):
@ -406,7 +406,7 @@ class Lexer(metaclass=LexerMeta):
tok.index = index
m = _master_re.match(text, index)
if m:
index = m.end()
tok.end = index = m.end()
tok.value =
tok.type = m.lastgroup
@ -431,6 +431,7 @@ class Lexer(metaclass=LexerMeta):
# No match, see if the character is in literals
if text[index] in _literals:
tok.value = text[index]
tok.end = index + 1
tok.type = tok.value
index += 1
yield tok
@ -442,6 +443,7 @@ class Lexer(metaclass=LexerMeta):
tok.value = text[index:]
tok = self.error(tok)
if tok is not None:
tok.end = self.index
yield tok
index = self.index
@ -126,8 +126,6 @@ class YaccProduction:
def lineno(self):
for tok in self._slice:
if isinstance(tok, YaccSymbol):
lineno = getattr(tok, 'lineno', None)
if lineno:
return lineno
@ -136,13 +134,20 @@ class YaccProduction:
def index(self):
    """Return the first non-None ``index`` found among the production's symbols.

    Only items of ``self._slice`` that are ``YaccSymbol`` instances are
    examined, in order.

    Raises:
        AttributeError: if no examined symbol carries an ``index`` value.
    """
    symbols = (item for item in self._slice if isinstance(item, YaccSymbol))
    for symbol in symbols:
        position = getattr(symbol, 'index', None)
        if position is None:
            continue
        return position
    raise AttributeError('No index attribute found')
def end(self):
    """Return the ``end`` value of the last item in ``self._slice`` that has one.

    Items without an ``end`` attribute, or whose ``end`` is ``None``, are
    skipped.  Returns ``None`` when no item supplies a value.
    """
    result = None
    for tok in self._slice:
        r = getattr(tok, 'end', None)
        # Compare against None rather than relying on truthiness so that an
        # end offset of 0 is kept, matching the check used by index().
        if r is not None:
            result = r
    return result
def __getattr__(self, name):
if name in self._namemap:
return self._namemap[name](self._slice)
@ -1806,12 +1811,6 @@ class ParserMeta(type):
def __prepare__(meta, *args, **kwargs):
    # Build the namespace object used while the class body executes.
    d = ParserMetaDict()
    # Expose the rule decorator under the name '_' inside the class body.
    # NOTE(review): _decorator is defined outside this view; presumably it
    # attaches the given grammar rules to the decorated function -- confirm.
    d['_'] = _decorator
    return d
@ -1822,6 +1821,9 @@ class ParserMeta(type):
return cls
class Parser(metaclass=ParserMeta):
# Automatic tracking of position information
track_positions = True
# Logging object where debugging/diagnostic messages are sent
log = SlyLogger(sys.stderr)
@ -2076,9 +2078,15 @@ class Parser(metaclass=ParserMeta):
self.tokens = tokens
self.statestack = statestack = [] # Stack of parsing states
self.symstack = symstack = [] # Stack of grammar symbols
pslice._stack = symstack # Associate the stack with the production
pslice._stack = symstack # Associate the stack with the production
# Set up position tracking
track_positions = self.track_positions
if not hasattr(self, '_line_positions'):
self._line_positions = { } # id: -> lineno
self._index_positions = { } # id: -> (start, end)
errtoken = None # Err token
while True:
# Get the next symbol on the input. If a lookahead symbol
@ -2093,7 +2101,7 @@ class Parser(metaclass=ParserMeta):
if not lookahead:
lookahead = YaccSymbol()
lookahead.type = '$end'
# Check the action table
ltype = lookahead.type
t = actions[self.state].get(ltype)
@ -2129,7 +2137,23 @@ class Parser(metaclass=ParserMeta):
value = p.func(self, pslice)
if value is pslice:
value = (pname, *(s.value for s in pslice._slice))
sym.value = value
# Record positions
if track_positions:
if plen:
sym.lineno = symstack[-plen].lineno
sym.index = symstack[-plen].index
sym.end = symstack[-1].end
# A zero-length production (what to put here?)
sym.lineno = None
sym.index = None
sym.end = None
self._line_positions[id(value)] = sym.lineno
self._index_positions[id(value)] = (sym.index, sym.end)
if plen:
del symstack[-plen:]
del statestack[-plen:]
@ -2214,6 +2238,8 @@ class Parser(metaclass=ParserMeta):
t.lineno = lookahead.lineno
if hasattr(lookahead, 'index'):
t.index = lookahead.index
if hasattr(lookahead, 'end'):
t.end = lookahead.end
t.value = lookahead
lookahead = t
@ -2225,3 +2251,11 @@ class Parser(metaclass=ParserMeta):
# Call an error function here
raise RuntimeError('sly: internal parser error!!!\n')
# Return position tracking information
def line_position(self, value):
    """Return the line number recorded for *value*.

    The lookup is keyed on ``id(value)``, so *value* must be the same
    object that was recorded, not merely an equal one.  A ``KeyError``
    propagates if nothing was recorded under that id.
    """
    key = id(value)
    recorded = self._line_positions
    return recorded[key]
def index_position(self, value):
    """Return the ``(start, end)`` index pair recorded for *value*.

    The lookup is keyed on ``id(value)``, so *value* must be the same
    object that was recorded, not merely an equal one.  A ``KeyError``
    propagates if nothing was recorded under that id.
    """
    key = id(value)
    recorded = self._index_positions
    return recorded[key]
@ -1,11 +1,6 @@
import pytest
from sly import Lexer
import regex
except ImportError:
regex = None
class CalcLexer(Lexer):
# Set of token names. This is always required
tokens = {
@ -61,29 +56,6 @@ class CalcLexer(Lexer):
def __init__(self):
self.errors = []
if regex is not None:
class RegexModuleCalcLexer(Lexer):
regex_module = regex
tokens = { 'ID', 'PLUS', 'MINUS' }
literals = { '(', ')' }
ignore = ' \t'
ID = r'\p{Ll}+' # Unicode lowercase letters, regex module feature
PLUS = r'\+'
MINUS = r'-'
ignore_comment = r'\#.*'
def newline(self, t):
self.lineno += t.value.count('\n')
def ID(self, t):
t.value = t.value.upper()
return t
# Test basic recognition of various tokens and literals
def test_tokens():
lexer = CalcLexer()
@ -93,17 +65,21 @@ def test_tokens():
assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
# Test third-party regex module support
@pytest.mark.skipif(regex is None,
                    reason="third-party regex module not installed")
def test_3rd_party_regex_module():
    """Tokenize a short expression with the regex-module-backed lexer."""
    tokens = list(RegexModuleCalcLexer().tokenize('a + b - c'))
    kinds = [token.type for token in tokens]
    texts = [token.value for token in tokens]
    expected_kinds = ['ID','PLUS','ID','MINUS','ID']
    expected_texts = ['A', '+', 'B', '-', 'C']
    assert kinds == expected_kinds
    assert texts == expected_texts
# Test position tracking
def test_positions():
    """Check the lineno, index and end values reported for each token."""
    source = 'abc\n( )'
    tokens = list(CalcLexer().tokenize(source))

    line_numbers = [tok.lineno for tok in tokens]
    starts = [tok.index for tok in tokens]
    stops = [tok.end for tok in tokens]
    # Slicing the input with each (index, end) pair must give back the
    # matched text.
    spans = [source[begin:finish] for begin, finish in zip(starts, stops)]

    assert spans == ['abc', '(', ')']
    assert line_numbers == [1, 2, 2]
    assert starts == [0, 4, 6]
    assert stops == [3, 5, 7]
# Test ignored comments and newlines
def test_ignored():
lexer = CalcLexer()
@ -228,23 +204,5 @@ def test_modern_error_return():
assert vals == [123, ':+-', '+', '-']
assert lexer.errors == [ ':+-' ]
# Test Lexer Inheritance. This class should inherit all of the tokens
# and features of ModernCalcLexer, but add two new tokens to it. The
# PLUSPLUS token matches before the PLUS token.
if False:
class SubModernCalcLexer(ModernCalcLexer):
tokens |= { DOLLAR, PLUSPLUS }
DOLLAR = r'\$'
PLUSPLUS = r'\+\+'
def test_lexer_inherit():
lexer = SubModernCalcLexer()
toks = list(lexer.tokenize('123 + - $ ++ if'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
assert vals == [123, '+', '-', '$', '++', 'if']
Reference in New Issue
Block a user