Various work in progress. Position tracking
This commit is contained in:
parent
cd9014eda2
commit
62203d8b75
CHANGES (12 lines changed)
@@ -1,5 +1,17 @@
 In Progress
 -----------
+03/25/2022 Added automatic location tracking to the parser. Use
+           Parser.line_position(value) to return the line number
+           and Parser.index_position(value) to return a (start, end)
+           index pair. value is *any* object returned by one of
+           the various methods in the parser definition. Typically,
+           it would be an AST node. The parser tracks the data using
+           the value of id(value).
+
+03/25/2022 Added .end attribute to tokens that specifies the ending
+           index of the matching text. This is used to do more
+           precise location tracking for the purpose of issuing
+           more useful error messages.
+
 05/09/2020 Experimental support for EBNF choices. For example:
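A quick usage sketch of the new API described above (the CalcLexer/CalcParser names and the input string are illustrative, not part of this commit):

    lexer = CalcLexer()
    parser = CalcParser()                  # track_positions is True by default
    tree = parser.parse(lexer.tokenize('a = 3 + 4'))

    # Any object returned by a grammar rule can be looked up by identity
    print(parser.line_position(tree))      # line number of the node
    print(parser.index_position(tree))     # (start, end) index pair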
sly/lex.py
@@ -73,9 +73,9 @@ class Token(object):
     '''
     Representation of a single token.
     '''
-    __slots__ = ('type', 'value', 'lineno', 'index')
+    __slots__ = ('type', 'value', 'lineno', 'index', 'end')

     def __repr__(self):
-        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
+        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'

 class TokenStr(str):
     @staticmethod
@@ -406,7 +406,7 @@ class Lexer(metaclass=LexerMeta):
             tok.index = index
             m = _master_re.match(text, index)
             if m:
-                index = m.end()
+                tok.end = index = m.end()
                 tok.value = m.group()
                 tok.type = m.lastgroup

@@ -431,6 +431,7 @@ class Lexer(metaclass=LexerMeta):
                 # No match, see if the character is in literals
                 if text[index] in _literals:
                     tok.value = text[index]
+                    tok.end = index + 1
                     tok.type = tok.value
                     index += 1
                     yield tok

@@ -442,6 +443,7 @@ class Lexer(metaclass=LexerMeta):
                     tok.value = text[index:]
                     tok = self.error(tok)
                     if tok is not None:
+                        tok.end = self.index
                         yield tok

                     index = self.index
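With both endpoints recorded on every token, the matched text can always be recovered by slicing, which is exactly what the new test further down relies on. A small illustrative helper (show_token is hypothetical, not part of SLY):

    def show_token(text, tok):
        # tok.index .. tok.end is the half-open span of the matched text
        lexeme = text[tok.index:tok.end]
        print(f'line {tok.lineno}: {tok.type} = {lexeme!r}')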
sly/yacc.py (56 lines changed)
@@ -126,8 +126,6 @@ class YaccProduction:
     @property
     def lineno(self):
         for tok in self._slice:
             if isinstance(tok, YaccSymbol):
                 continue
             lineno = getattr(tok, 'lineno', None)
             if lineno:
                 return lineno
@@ -136,13 +134,20 @@
     @property
     def index(self):
         for tok in self._slice:
             if isinstance(tok, YaccSymbol):
                 continue
             index = getattr(tok, 'index', None)
             if index is not None:
                 return index
         raise AttributeError('No index attribute found')

+    @property
+    def end(self):
+        result = None
+        for tok in self._slice:
+            r = getattr(tok, 'end', None)
+            if r:
+                result = r
+        return result

     def __getattr__(self, name):
         if name in self._namemap:
             return self._namemap[name](self._slice)
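Inside a grammar rule, the new end property complements the existing lineno and index properties, so a rule body can see the full source span it just matched. A sketch (the rule and the tuple-based AST are hypothetical):

    @_('expr PLUS term')
    def expr(self, p):
        # p.index .. p.end covers all the text matched by this production
        print('expr spans', p.index, 'to', p.end)
        return ('add', p.expr, p.term)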
@@ -1806,12 +1811,6 @@ class ParserMeta(type):
     @classmethod
     def __prepare__(meta, *args, **kwargs):
         d = ParserMetaDict()
-        # def _(rule, *extra):
-        #     rules = [rule, *extra]
-        #     def decorate(func):
-        #         func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
-        #         return func
-        #     return decorate
         d['_'] = _decorator
         return d
@@ -1822,6 +1821,9 @@ class ParserMeta(type):
         return cls

 class Parser(metaclass=ParserMeta):
+    # Automatic tracking of position information
+    track_positions = True
+
     # Logging object where debugging/diagnostic messages are sent
     log = SlyLogger(sys.stderr)
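Since track_positions is an ordinary class attribute, a parser definition can opt out of the bookkeeping (a minimal sketch, assuming tracking is not wanted):

    class CalcParser(Parser):
        track_positions = False    # disable automatic position tracking
        ...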
@@ -2076,9 +2078,15 @@ class Parser(metaclass=ParserMeta):
         self.tokens = tokens
         self.statestack = statestack = []    # Stack of parsing states
         self.symstack = symstack = []        # Stack of grammar symbols
         pslice._stack = symstack             # Associate the stack with the production
         self.restart()

+        # Set up position tracking
+        track_positions = self.track_positions
+        if not hasattr(self, '_line_positions'):
+            self._line_positions = { }       # id: -> lineno
+            self._index_positions = { }      # id: -> (start, end)

         errtoken = None                      # Err token
         while True:
             # Get the next symbol on the input. If a lookahead symbol
@@ -2129,7 +2137,23 @@ class Parser(metaclass=ParserMeta):
                     value = p.func(self, pslice)
                     if value is pslice:
                         value = (pname, *(s.value for s in pslice._slice))

                     sym.value = value

+                    # Record positions
+                    if track_positions:
+                        if plen:
+                            sym.lineno = symstack[-plen].lineno
+                            sym.index = symstack[-plen].index
+                            sym.end = symstack[-1].end
+                        else:
+                            # A zero-length production (what to put here?)
+                            sym.lineno = None
+                            sym.index = None
+                            sym.end = None
+                        self._line_positions[id(value)] = sym.lineno
+                        self._index_positions[id(value)] = (sym.index, sym.end)

                     if plen:
                         del symstack[-plen:]
                         del statestack[-plen:]
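The net effect: a reduced value starts where its first symbol starts (symstack[-plen]) and ends where its last symbol ends (symstack[-1]). A zero-length production has nothing to anchor to, so its positions are recorded as None, as the comment in the code acknowledges.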
@@ -2214,6 +2238,8 @@ class Parser(metaclass=ParserMeta):
                     t.lineno = lookahead.lineno
                     if hasattr(lookahead, 'index'):
                         t.index = lookahead.index
+                    if hasattr(lookahead, 'end'):
+                        t.end = lookahead.end
                     t.value = lookahead
                     lookaheadstack.append(lookahead)
                     lookahead = t
@@ -2225,3 +2251,11 @@ class Parser(metaclass=ParserMeta):

         # Call an error function here
         raise RuntimeError('sly: internal parser error!!!\n')
+
+    # Return position tracking information
+    def line_position(self, value):
+        return self._line_positions[id(value)]
+
+    def index_position(self, value):
+        return self._index_positions[id(value)]
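One caveat worth noting about this design (an observation, not stated in the commit): because the tables are keyed by id(value), a lookup is only meaningful while the returned object is still alive, and values that Python interns or reuses (small integers, short strings) can collide. Unique objects such as AST nodes are the intended use case.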
tests/test_lex.py
@@ -1,11 +1,6 @@
 import pytest
 from sly import Lexer

-try:
-    import regex
-except ImportError:
-    regex = None
-
 class CalcLexer(Lexer):
     # Set of token names. This is always required
     tokens = {
@@ -61,29 +56,6 @@ class CalcLexer(Lexer):
     def __init__(self):
         self.errors = []

-if regex is not None:
-    class RegexModuleCalcLexer(Lexer):
-        regex_module = regex
-
-        tokens = { 'ID', 'PLUS', 'MINUS' }
-
-        literals = { '(', ')' }
-        ignore = ' \t'
-
-        ID = r'\p{Ll}+'             # Unicode lowercase letters, regex module feature
-        PLUS = r'\+'
-        MINUS = r'-'
-
-        ignore_comment = r'\#.*'
-
-        @_(r'\n+')
-        def newline(self, t):
-            self.lineno += t.value.count('\n')
-
-        def ID(self, t):
-            t.value = t.value.upper()
-            return t
-
 # Test basic recognition of various tokens and literals
 def test_tokens():
     lexer = CalcLexer()
@@ -93,16 +65,20 @@ def test_tokens():
     assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
     assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']

-# Test third-party regex module support
-@pytest.mark.skipif(regex is None,
-                    reason="third-party regex module not installed")
-def test_3rd_party_regex_module():
-    lexer = RegexModuleCalcLexer()
-    toks = list(lexer.tokenize('a + b - c'))
-    types = [t.type for t in toks]
-    vals = [t.value for t in toks]
-    assert types == ['ID','PLUS','ID','MINUS','ID']
-    assert vals == ['A', '+', 'B', '-', 'C']
+# Test position tracking
+def test_positions():
+    lexer = CalcLexer()
+    text = 'abc\n( )'
+    toks = list(lexer.tokenize(text))
+    lines = [t.lineno for t in toks]
+    indices = [t.index for t in toks]
+    ends = [t.end for t in toks]
+    values = [text[t.index:t.end] for t in toks]
+    assert values == ['abc', '(', ')']
+    assert lines == [1, 2, 2]
+    assert indices == [0, 4, 6]
+    assert ends == [3, 5, 7]

 # Test ignored comments and newlines
 def test_ignored():
@@ -228,23 +204,5 @@ def test_modern_error_return():
     assert vals == [123, ':+-', '+', '-']
     assert lexer.errors == [ ':+-' ]

-# Test Lexer Inheritance. This class should inherit all of the tokens
-# and features of ModernCalcLexer, but add two new tokens to it. The
-# PLUSPLUS token matches before the PLUS token.
-
-if False:
-    class SubModernCalcLexer(ModernCalcLexer):
-        tokens |= { DOLLAR, PLUSPLUS }
-        DOLLAR = r'\$'
-        PLUSPLUS = r'\+\+'
-        PLUSPLUS.before = PLUS
-
-    def test_lexer_inherit():
-        lexer = SubModernCalcLexer()
-        toks = list(lexer.tokenize('123 + - $ ++ if'))
-        types = [t.type for t in toks]
-        vals = [t.value for t in toks]
-        assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
-        assert vals == [123, '+', '-', '$', '++', 'if']