Various work in progress. Position tracking

David Beazley 2022-09-06 19:38:33 -05:00
parent cd9014eda2
commit 62203d8b75
4 changed files with 77 additions and 71 deletions

CHANGES

@@ -1,5 +1,17 @@
 In Progress
 -----------
+03/25/2022 Added automatic location tracking to the parser. Use
+           Parser.line_position(value) to return the line number
+           and Parser.index_position(value) to return a (start, end)
+           index pair. value is *any* object returned by one of
+           the various methods in the parser definition. Typically,
+           it would be an AST node. The parser tracks the data using
+           the value of id(value).
+
+03/25/2022 Added a .end attribute to tokens that specifies the ending
+           index of the matching text. This is used to do more
+           precise location tracking for the purpose of issuing
+           more useful error messages.
+
 05/09/2020 Experimental support for EBNF choices. For example:
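
A minimal end-to-end sketch of the new API described in this entry; DemoLexer, DemoParser, and the toy grammar are made up for illustration, and the code assumes the patched sly from this commit:

    from sly import Lexer, Parser

    class DemoLexer(Lexer):
        tokens = { 'NUMBER', 'PLUS' }
        ignore = ' '
        NUMBER = r'\d+'
        PLUS = r'\+'

    class DemoParser(Parser):
        tokens = DemoLexer.tokens

        @_('expr PLUS term')
        def expr(self, p):
            return ('add', p.expr, p.term)   # fresh AST node per reduction

        @_('term')
        def expr(self, p):
            return p.term

        @_('NUMBER')
        def term(self, p):
            return ('num', int(p.NUMBER))

    text = '1 + 2 + 3'
    parser = DemoParser()
    tree = parser.parse(DemoLexer().tokenize(text))

    # Positions are looked up by the identity of the returned object
    print(parser.line_position(tree))        # -> 1
    start, end = parser.index_position(tree)
    print(text[start:end])                   # -> '1 + 2 + 3'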

sly/lex.py

@@ -73,9 +73,9 @@ class Token(object):
     '''
     Representation of a single token.
     '''
-    __slots__ = ('type', 'value', 'lineno', 'index')
+    __slots__ = ('type', 'value', 'lineno', 'index', 'end')

     def __repr__(self):
-        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
+        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'

 class TokenStr(str):
     @staticmethod
@@ -406,7 +406,7 @@ class Lexer(metaclass=LexerMeta):
                 tok.index = index
                 m = _master_re.match(text, index)
                 if m:
-                    index = m.end()
+                    tok.end = index = m.end()
                     tok.value = m.group()
                     tok.type = m.lastgroup
@@ -431,6 +431,7 @@ class Lexer(metaclass=LexerMeta):
                     # No match, see if the character is in literals
                     if text[index] in _literals:
                         tok.value = text[index]
+                        tok.end = index + 1
                         tok.type = tok.value
                         index += 1
                         yield tok
@@ -442,6 +443,7 @@ class Lexer(metaclass=LexerMeta):
                     tok.value = text[index:]
                     tok = self.error(tok)
                     if tok is not None:
+                        tok.end = self.index
                         yield tok
                     index = self.index
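
Net effect of the lex.py changes: every token now leaves the lexer with an end offset alongside index, whether it came from the master regex, a literal character, or the error handler. A small sketch against this commit (NameLexer is hypothetical):

    from sly import Lexer

    class NameLexer(Lexer):
        tokens = { 'NAME' }
        ignore = ' '
        NAME = r'[a-z]+'

    text = 'abc def'
    for tok in NameLexer().tokenize(text):
        # tok.end is one past the last matched character, so the
        # (index, end) pair slices the original text back out.
        assert text[tok.index:tok.end] == tok.value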

sly/yacc.py

@@ -126,8 +126,6 @@ class YaccProduction:
     @property
     def lineno(self):
         for tok in self._slice:
-            if isinstance(tok, YaccSymbol):
-                continue
             lineno = getattr(tok, 'lineno', None)
             if lineno:
                 return lineno
@@ -136,13 +134,20 @@ class YaccProduction:
     @property
     def index(self):
         for tok in self._slice:
-            if isinstance(tok, YaccSymbol):
-                continue
             index = getattr(tok, 'index', None)
             if index is not None:
                 return index
         raise AttributeError('No index attribute found')

+    @property
+    def end(self):
+        result = None
+        for tok in self._slice:
+            r = getattr(tok, 'end', None)
+            if r:
+                result = r
+        return result
+
     def __getattr__(self, name):
         if name in self._namemap:
             return self._namemap[name](self._slice)
@@ -1806,12 +1811,6 @@ class ParserMeta(type):
     @classmethod
     def __prepare__(meta, *args, **kwargs):
         d = ParserMetaDict()
-        # def _(rule, *extra):
-        #     rules = [rule, *extra]
-        #     def decorate(func):
-        #         func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
-        #         return func
-        #     return decorate
         d['_'] = _decorator
         return d
@@ -1822,6 +1821,9 @@ class ParserMeta(type):
         return cls

 class Parser(metaclass=ParserMeta):
+    # Automatic tracking of position information
+    track_positions = True
+
     # Logging object where debugging/diagnostic messages are sent
     log = SlyLogger(sys.stderr)
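
Since track_positions is an ordinary class attribute consulted by parse(), it appears a subclass can opt out of the bookkeeping entirely; a speculative sketch (MyParser is hypothetical):

    class MyParser(Parser):
        # Skip position recording; line_position()/index_position()
        # will then have no entries for values built by this parser.
        track_positions = False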
@@ -2079,6 +2081,12 @@ class Parser(metaclass=ParserMeta):
         pslice._stack = symstack       # Associate the stack with the production
         self.restart()

+        # Set up position tracking
+        track_positions = self.track_positions
+        if not hasattr(self, '_line_positions'):
+            self._line_positions = { }    # id: -> lineno
+            self._index_positions = { }   # id: -> (start, end)
+
         errtoken = None                # Err token
         while True:
             # Get the next symbol on the input. If a lookahead symbol
@@ -2129,7 +2137,23 @@ class Parser(metaclass=ParserMeta):
                     value = p.func(self, pslice)
                     if value is pslice:
                         value = (pname, *(s.value for s in pslice._slice))

                     sym.value = value
+
+                    # Record positions
+                    if track_positions:
+                        if plen:
+                            sym.lineno = symstack[-plen].lineno
+                            sym.index = symstack[-plen].index
+                            sym.end = symstack[-1].end
+                        else:
+                            # A zero-length production (what to put here?)
+                            sym.lineno = None
+                            sym.index = None
+                            sym.end = None
+                        self._line_positions[id(value)] = sym.lineno
+                        self._index_positions[id(value)] = (sym.index, sym.end)
+
                     if plen:
                         del symstack[-plen:]
                         del statestack[-plen:]
@@ -2214,6 +2238,8 @@ class Parser(metaclass=ParserMeta):
                         t.lineno = lookahead.lineno
                     if hasattr(lookahead, 'index'):
                         t.index = lookahead.index
+                    if hasattr(lookahead, 'end'):
+                        t.end = lookahead.end
                     t.value = lookahead
                     lookaheadstack.append(lookahead)
                     lookahead = t
@@ -2225,3 +2251,11 @@ class Parser(metaclass=ParserMeta):

         # Call an error function here
         raise RuntimeError('sly: internal parser error!!!\n')
+
+    # Return position tracking information
+    def line_position(self, value):
+        return self._line_positions[id(value)]
+
+    def index_position(self, value):
+        return self._index_positions[id(value)]
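
One caveat that follows from keying the tables on id(value) (my reading; the commit does not spell it out): lookups depend on object identity, so each rule should return a distinct, live object. Interned values can alias one another and overwrite each other's entries, as this CPython-specific illustration shows:

    # Two grammar rules that both return the small int 3 would share
    # one table key, because CPython interns small integers:
    a = int('3')
    b = int('3')
    assert a is b            # same object ...
    assert id(a) == id(b)    # ... so the same key in _index_positions;
    # the later reduction would overwrite the earlier (start, end)
    # entry. Wrapping results in a fresh node, e.g. ('num', 3),
    # avoids the collision.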

tests/test_lexer.py

@@ -1,11 +1,6 @@
 import pytest

 from sly import Lexer

-try:
-    import regex
-except ImportError:
-    regex = None
-
 class CalcLexer(Lexer):
     # Set of token names. This is always required
     tokens = {
@@ -61,29 +56,6 @@ class CalcLexer(Lexer):
     def __init__(self):
         self.errors = []

-if regex is not None:
-    class RegexModuleCalcLexer(Lexer):
-        regex_module = regex
-        tokens = { 'ID', 'PLUS', 'MINUS' }
-        literals = { '(', ')' }
-
-        ignore = ' \t'
-
-        ID = r'\p{Ll}+'    # Unicode lowercase letters, regex module feature
-        PLUS = r'\+'
-        MINUS = r'-'
-
-        ignore_comment = r'\#.*'
-
-        @_(r'\n+')
-        def newline(self, t):
-            self.lineno += t.value.count('\n')
-
-        def ID(self, t):
-            t.value = t.value.upper()
-            return t
-
 # Test basic recognition of various tokens and literals
 def test_tokens():
     lexer = CalcLexer()
@@ -93,16 +65,20 @@ def test_tokens():
     assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
     assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']

-# Test third-party regex module support
-@pytest.mark.skipif(regex is None,
-                    reason="third-party regex module not installed")
-def test_3rd_party_regex_module():
-    lexer = RegexModuleCalcLexer()
-    toks = list(lexer.tokenize('a + b - c'))
-    types = [t.type for t in toks]
-    vals = [t.value for t in toks]
-    assert types == ['ID','PLUS','ID','MINUS','ID']
-    assert vals == ['A', '+', 'B', '-', 'C']
+# Test position tracking
+def test_positions():
+    lexer = CalcLexer()
+    text = 'abc\n( )'
+    toks = list(lexer.tokenize(text))
+    lines = [t.lineno for t in toks ]
+    indices = [t.index for t in toks ]
+    ends = [t.end for t in toks]
+    values = [ text[t.index:t.end] for t in toks ]
+    assert values == ['abc', '(', ')']
+    assert lines == [1, 2, 2]
+    assert indices == [0, 4, 6]
+    assert ends == [3, 5, 7]

 # Test ignored comments and newlines
 def test_ignored():
@@ -228,23 +204,5 @@ def test_modern_error_return():
     assert vals == [123, ':+-', '+', '-']
     assert lexer.errors == [ ':+-' ]

-# Test Lexer Inheritance. This class should inherit all of the tokens
-# and features of ModernCalcLexer, but add two new tokens to it. The
-# PLUSPLUS token matches before the PLUS token.
-if False:
-    class SubModernCalcLexer(ModernCalcLexer):
-        tokens |= { DOLLAR, PLUSPLUS }
-        DOLLAR = r'\$'
-        PLUSPLUS = r'\+\+'
-        PLUSPLUS.before = PLUS
-
-    def test_lexer_inherit():
-        lexer = SubModernCalcLexer()
-        toks = list(lexer.tokenize('123 + - $ ++ if'))
-        types = [t.type for t in toks]
-        vals = [t.value for t in toks]
-        assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
-        assert vals == [123, '+', '-', '$', '++', 'if']