Various work in progress. Position tracking

This commit is contained in:
David Beazley 2022-09-06 19:38:33 -05:00
parent cd9014eda2
commit 62203d8b75
4 changed files with 77 additions and 71 deletions

12
CHANGES
View File

@ -1,5 +1,17 @@
In Progress
-----------
03/25/2022 Added automatic location tracking to the parser. Use
Parser.line_position(value) to return the line number
and Parser.index_position(value) to return a (start, end)
index pair. value is *any* object returned by one of
the various methods in the parser definition. Typically,
it would be an AST node. The parser tracks the data using
the value of id(value).
03/25/2022 Added a .end attribute to tokens, specifying the ending
index of the matching text. This is used to do more
precise location tracking for the purpose of issuing
more useful error messages.
05/09/2020 Experimental support for EBNF choices. For example:

View File

@ -73,9 +73,9 @@ class Token(object):
'''
Representation of a single token.
'''
__slots__ = ('type', 'value', 'lineno', 'index')
__slots__ = ('type', 'value', 'lineno', 'index', 'end')
def __repr__(self):
return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'
class TokenStr(str):
@staticmethod
@ -406,7 +406,7 @@ class Lexer(metaclass=LexerMeta):
tok.index = index
m = _master_re.match(text, index)
if m:
index = m.end()
tok.end = index = m.end()
tok.value = m.group()
tok.type = m.lastgroup
@ -431,6 +431,7 @@ class Lexer(metaclass=LexerMeta):
# No match, see if the character is in literals
if text[index] in _literals:
tok.value = text[index]
tok.end = index + 1
tok.type = tok.value
index += 1
yield tok
@ -442,6 +443,7 @@ class Lexer(metaclass=LexerMeta):
tok.value = text[index:]
tok = self.error(tok)
if tok is not None:
tok.end = self.index
yield tok
index = self.index

View File

@ -126,8 +126,6 @@ class YaccProduction:
@property
def lineno(self):
for tok in self._slice:
if isinstance(tok, YaccSymbol):
continue
lineno = getattr(tok, 'lineno', None)
if lineno:
return lineno
@ -136,13 +134,20 @@ class YaccProduction:
@property
def index(self):
for tok in self._slice:
if isinstance(tok, YaccSymbol):
continue
index = getattr(tok, 'index', None)
if index is not None:
return index
raise AttributeError('No index attribute found')
@property
def end(self):
# Ending index of this production: scan the entire symbol slice and
# keep the last truthy .end seen, so the result reflects the
# rightmost symbol that carries position information. Returns None
# when no symbol in the slice has a usable .end attribute.
result = None
for tok in self._slice:
r = getattr(tok, 'end', None)
if r:
result = r
return result
def __getattr__(self, name):
if name in self._namemap:
return self._namemap[name](self._slice)
@ -1806,12 +1811,6 @@ class ParserMeta(type):
@classmethod
def __prepare__(meta, *args, **kwargs):
d = ParserMetaDict()
# def _(rule, *extra):
# rules = [rule, *extra]
# def decorate(func):
# func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
# return func
# return decorate
d['_'] = _decorator
return d
@ -1822,6 +1821,9 @@ class ParserMeta(type):
return cls
class Parser(metaclass=ParserMeta):
# Automatic tracking of position information
track_positions = True
# Logging object where debugging/diagnostic messages are sent
log = SlyLogger(sys.stderr)
@ -2079,6 +2081,12 @@ class Parser(metaclass=ParserMeta):
pslice._stack = symstack # Associate the stack with the production
self.restart()
# Set up position tracking
track_positions = self.track_positions
if not hasattr(self, '_line_positions'):
self._line_positions = { } # id: -> lineno
self._index_positions = { } # id: -> (start, end)
errtoken = None # Err token
while True:
# Get the next symbol on the input. If a lookahead symbol
@ -2129,7 +2137,23 @@ class Parser(metaclass=ParserMeta):
value = p.func(self, pslice)
if value is pslice:
value = (pname, *(s.value for s in pslice._slice))
sym.value = value
# Record positions
if track_positions:
if plen:
sym.lineno = symstack[-plen].lineno
sym.index = symstack[-plen].index
sym.end = symstack[-1].end
else:
# A zero-length production (what to put here?)
sym.lineno = None
sym.index = None
sym.end = None
self._line_positions[id(value)] = sym.lineno
self._index_positions[id(value)] = (sym.index, sym.end)
if plen:
del symstack[-plen:]
del statestack[-plen:]
@ -2214,6 +2238,8 @@ class Parser(metaclass=ParserMeta):
t.lineno = lookahead.lineno
if hasattr(lookahead, 'index'):
t.index = lookahead.index
if hasattr(lookahead, 'end'):
t.end = lookahead.end
t.value = lookahead
lookaheadstack.append(lookahead)
lookahead = t
@ -2225,3 +2251,11 @@ class Parser(metaclass=ParserMeta):
# Call an error function here
raise RuntimeError('sly: internal parser error!!!\n')
# Return position tracking information
def line_position(self, value):
# Line number recorded for *value* during parsing, keyed by id(value).
# Raises KeyError if value was not produced by a grammar rule while
# position tracking was enabled.
return self._line_positions[id(value)]
def index_position(self, value):
# (start, end) index pair recorded for *value* during parsing, keyed
# by id(value). Raises KeyError for values the parser did not track.
return self._index_positions[id(value)]

View File

@ -1,11 +1,6 @@
import pytest
from sly import Lexer
try:
import regex
except ImportError:
regex = None
class CalcLexer(Lexer):
# Set of token names. This is always required
tokens = {
@ -61,29 +56,6 @@ class CalcLexer(Lexer):
def __init__(self):
self.errors = []
if regex is not None:
# Lexer exercising third-party regex-module support: assigning
# regex_module makes the lexer compile its token patterns with the
# 'regex' package instead of the stdlib 're', enabling features such
# as \p{...} Unicode property classes.
class RegexModuleCalcLexer(Lexer):
regex_module = regex
tokens = { 'ID', 'PLUS', 'MINUS' }
literals = { '(', ')' }
ignore = ' \t'
ID = r'\p{Ll}+' # Unicode lowercase letters, regex module feature
PLUS = r'\+'
MINUS = r'-'
ignore_comment = r'\#.*'
@_(r'\n+')
def newline(self, t):
# Track line numbers across newline runs.
self.lineno += t.value.count('\n')
def ID(self, t):
# Normalize identifiers to uppercase before returning the token.
t.value = t.value.upper()
return t
# Test basic recognition of various tokens and literals
def test_tokens():
lexer = CalcLexer()
@ -93,16 +65,20 @@ def test_tokens():
assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
# Test third-party regex module support
# RegexModuleCalcLexer sets regex_module = regex, so its ID pattern uses
# \p{Ll} (a 'regex'-package Unicode property class unavailable in stdlib
# 're'); the test is skipped when the package is not installed.
@pytest.mark.skipif(regex is None,
reason="third-party regex module not installed")
def test_3rd_party_regex_module():
lexer = RegexModuleCalcLexer()
toks = list(lexer.tokenize('a + b - c'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
# The lexer's ID handler upper-cases each matched identifier.
assert types == ['ID','PLUS','ID','MINUS','ID']
assert vals == ['A', '+', 'B', '-', 'C']
# Test position tracking
# Verifies that each token records its 1-based line number (lineno), its
# starting index into the input text (index), and its ending index (end),
# such that text[t.index:t.end] reproduces the matched lexeme exactly.
def test_positions():
lexer = CalcLexer()
# 'abc' sits on line 1; '(' and ')' sit on line 2 at indices 4 and 6.
text = 'abc\n( )'
toks = list(lexer.tokenize(text))
lines = [t.lineno for t in toks ]
indices = [t.index for t in toks ]
ends = [t.end for t in toks]
# Slicing with (index, end) must recover each token's matched text.
values = [ text[t.index:t.end] for t in toks ]
assert values == ['abc', '(', ')']
assert lines == [1, 2, 2]
assert indices == [0, 4, 6]
assert ends == [3, 5, 7]
# Test ignored comments and newlines
def test_ignored():
@ -228,23 +204,5 @@ def test_modern_error_return():
assert vals == [123, ':+-', '+', '-']
assert lexer.errors == [ ':+-' ]
# Test Lexer Inheritance. This class should inherit all of the tokens
# and features of ModernCalcLexer, but add two new tokens to it. The
# PLUSPLUS token matches before the PLUS token.
# NOTE(review): guarded by 'if False:', so this subclass is never
# defined at import time — the inheritance feature appears disabled here.
if False:
class SubModernCalcLexer(ModernCalcLexer):
# Extend the inherited token set with two new token names.
tokens |= { DOLLAR, PLUSPLUS }
DOLLAR = r'\$'
PLUSPLUS = r'\+\+'
# Force PLUSPLUS to be tried before PLUS so '++' is not split.
PLUSPLUS.before = PLUS
def test_lexer_inherit():
# NOTE(review): depends on SubModernCalcLexer, which is defined under
# 'if False:' above, so this test cannot run as written — confirm
# whether it is intentionally disabled.
lexer = SubModernCalcLexer()
toks = list(lexer.tokenize('123 + - $ ++ if'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
# The subclass should recognize the inherited tokens plus DOLLAR and
# PLUSPLUS, with '++' matched as one token rather than two PLUSes.
assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
assert vals == [123, '+', '-', '$', '++', 'if']