Various work in progress. Position tracking
This commit is contained in:
parent
cd9014eda2
commit
62203d8b75
12
CHANGES
12
CHANGES
@ -1,6 +1,18 @@
|
|||||||
In Progress
|
In Progress
|
||||||
-----------
|
-----------
|
||||||
|
03/25/2022 Added automatic location tracking to the parser. Use
|
||||||
|
Parser.line_position(value) to return the line number
|
||||||
|
and Parser.index_position(value) to return a (start, end)
|
||||||
|
index pair. value is *any* object returned by one of
|
||||||
|
the various methods in the parser definition. Typically,
|
||||||
|
it would be an AST node. The parser tracks the data using
|
||||||
|
the value of id(value).
|
||||||
|
|
||||||
|
03/25/2022 Added an .end attribute to tokens that specifies the ending
|
||||||
|
index of the matching text. This is used to do more
|
||||||
|
precise location tracking for the purpose of issuing
|
||||||
|
more useful error messages.
|
||||||
|
|
||||||
05/09/2020 Experimental support for EBNF choices. For example:
|
05/09/2020 Experimental support for EBNF choices. For example:
|
||||||
|
|
||||||
@('term { PLUS|MINUS term }')
|
@('term { PLUS|MINUS term }')
|
||||||
|
@ -73,9 +73,9 @@ class Token(object):
|
|||||||
'''
|
'''
|
||||||
Representation of a single token.
|
Representation of a single token.
|
||||||
'''
|
'''
|
||||||
__slots__ = ('type', 'value', 'lineno', 'index')
|
__slots__ = ('type', 'value', 'lineno', 'index', 'end')
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
|
return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'
|
||||||
|
|
||||||
class TokenStr(str):
|
class TokenStr(str):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -406,7 +406,7 @@ class Lexer(metaclass=LexerMeta):
|
|||||||
tok.index = index
|
tok.index = index
|
||||||
m = _master_re.match(text, index)
|
m = _master_re.match(text, index)
|
||||||
if m:
|
if m:
|
||||||
index = m.end()
|
tok.end = index = m.end()
|
||||||
tok.value = m.group()
|
tok.value = m.group()
|
||||||
tok.type = m.lastgroup
|
tok.type = m.lastgroup
|
||||||
|
|
||||||
@ -431,6 +431,7 @@ class Lexer(metaclass=LexerMeta):
|
|||||||
# No match, see if the character is in literals
|
# No match, see if the character is in literals
|
||||||
if text[index] in _literals:
|
if text[index] in _literals:
|
||||||
tok.value = text[index]
|
tok.value = text[index]
|
||||||
|
tok.end = index + 1
|
||||||
tok.type = tok.value
|
tok.type = tok.value
|
||||||
index += 1
|
index += 1
|
||||||
yield tok
|
yield tok
|
||||||
@ -442,6 +443,7 @@ class Lexer(metaclass=LexerMeta):
|
|||||||
tok.value = text[index:]
|
tok.value = text[index:]
|
||||||
tok = self.error(tok)
|
tok = self.error(tok)
|
||||||
if tok is not None:
|
if tok is not None:
|
||||||
|
tok.end = self.index
|
||||||
yield tok
|
yield tok
|
||||||
|
|
||||||
index = self.index
|
index = self.index
|
||||||
|
58
sly/yacc.py
58
sly/yacc.py
@ -126,8 +126,6 @@ class YaccProduction:
|
|||||||
@property
|
@property
|
||||||
def lineno(self):
|
def lineno(self):
|
||||||
for tok in self._slice:
|
for tok in self._slice:
|
||||||
if isinstance(tok, YaccSymbol):
|
|
||||||
continue
|
|
||||||
lineno = getattr(tok, 'lineno', None)
|
lineno = getattr(tok, 'lineno', None)
|
||||||
if lineno:
|
if lineno:
|
||||||
return lineno
|
return lineno
|
||||||
@ -136,13 +134,20 @@ class YaccProduction:
|
|||||||
@property
|
@property
|
||||||
def index(self):
|
def index(self):
|
||||||
for tok in self._slice:
|
for tok in self._slice:
|
||||||
if isinstance(tok, YaccSymbol):
|
|
||||||
continue
|
|
||||||
index = getattr(tok, 'index', None)
|
index = getattr(tok, 'index', None)
|
||||||
if index is not None:
|
if index is not None:
|
||||||
return index
|
return index
|
||||||
raise AttributeError('No index attribute found')
|
raise AttributeError('No index attribute found')
|
||||||
|
|
||||||
|
@property
|
||||||
|
def end(self):
|
||||||
|
result = None
|
||||||
|
for tok in self._slice:
|
||||||
|
r = getattr(tok, 'end', None)
|
||||||
|
if r:
|
||||||
|
result = r
|
||||||
|
return result
|
||||||
|
|
||||||
def __getattr__(self, name):
|
def __getattr__(self, name):
|
||||||
if name in self._namemap:
|
if name in self._namemap:
|
||||||
return self._namemap[name](self._slice)
|
return self._namemap[name](self._slice)
|
||||||
@ -1806,12 +1811,6 @@ class ParserMeta(type):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def __prepare__(meta, *args, **kwargs):
|
def __prepare__(meta, *args, **kwargs):
|
||||||
d = ParserMetaDict()
|
d = ParserMetaDict()
|
||||||
# def _(rule, *extra):
|
|
||||||
# rules = [rule, *extra]
|
|
||||||
# def decorate(func):
|
|
||||||
# func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
|
|
||||||
# return func
|
|
||||||
# return decorate
|
|
||||||
d['_'] = _decorator
|
d['_'] = _decorator
|
||||||
return d
|
return d
|
||||||
|
|
||||||
@ -1822,6 +1821,9 @@ class ParserMeta(type):
|
|||||||
return cls
|
return cls
|
||||||
|
|
||||||
class Parser(metaclass=ParserMeta):
|
class Parser(metaclass=ParserMeta):
|
||||||
|
# Automatic tracking of position information
|
||||||
|
track_positions = True
|
||||||
|
|
||||||
# Logging object where debugging/diagnostic messages are sent
|
# Logging object where debugging/diagnostic messages are sent
|
||||||
log = SlyLogger(sys.stderr)
|
log = SlyLogger(sys.stderr)
|
||||||
|
|
||||||
@ -2076,9 +2078,15 @@ class Parser(metaclass=ParserMeta):
|
|||||||
self.tokens = tokens
|
self.tokens = tokens
|
||||||
self.statestack = statestack = [] # Stack of parsing states
|
self.statestack = statestack = [] # Stack of parsing states
|
||||||
self.symstack = symstack = [] # Stack of grammar symbols
|
self.symstack = symstack = [] # Stack of grammar symbols
|
||||||
pslice._stack = symstack # Associate the stack with the production
|
pslice._stack = symstack # Associate the stack with the production
|
||||||
self.restart()
|
self.restart()
|
||||||
|
|
||||||
|
# Set up position tracking
|
||||||
|
track_positions = self.track_positions
|
||||||
|
if not hasattr(self, '_line_positions'):
|
||||||
|
self._line_positions = { } # id: -> lineno
|
||||||
|
self._index_positions = { } # id: -> (start, end)
|
||||||
|
|
||||||
errtoken = None # Err token
|
errtoken = None # Err token
|
||||||
while True:
|
while True:
|
||||||
# Get the next symbol on the input. If a lookahead symbol
|
# Get the next symbol on the input. If a lookahead symbol
|
||||||
@ -2093,7 +2101,7 @@ class Parser(metaclass=ParserMeta):
|
|||||||
if not lookahead:
|
if not lookahead:
|
||||||
lookahead = YaccSymbol()
|
lookahead = YaccSymbol()
|
||||||
lookahead.type = '$end'
|
lookahead.type = '$end'
|
||||||
|
|
||||||
# Check the action table
|
# Check the action table
|
||||||
ltype = lookahead.type
|
ltype = lookahead.type
|
||||||
t = actions[self.state].get(ltype)
|
t = actions[self.state].get(ltype)
|
||||||
@ -2129,7 +2137,23 @@ class Parser(metaclass=ParserMeta):
|
|||||||
value = p.func(self, pslice)
|
value = p.func(self, pslice)
|
||||||
if value is pslice:
|
if value is pslice:
|
||||||
value = (pname, *(s.value for s in pslice._slice))
|
value = (pname, *(s.value for s in pslice._slice))
|
||||||
|
|
||||||
sym.value = value
|
sym.value = value
|
||||||
|
|
||||||
|
# Record positions
|
||||||
|
if track_positions:
|
||||||
|
if plen:
|
||||||
|
sym.lineno = symstack[-plen].lineno
|
||||||
|
sym.index = symstack[-plen].index
|
||||||
|
sym.end = symstack[-1].end
|
||||||
|
else:
|
||||||
|
# A zero-length production (what to put here?)
|
||||||
|
sym.lineno = None
|
||||||
|
sym.index = None
|
||||||
|
sym.end = None
|
||||||
|
self._line_positions[id(value)] = sym.lineno
|
||||||
|
self._index_positions[id(value)] = (sym.index, sym.end)
|
||||||
|
|
||||||
if plen:
|
if plen:
|
||||||
del symstack[-plen:]
|
del symstack[-plen:]
|
||||||
del statestack[-plen:]
|
del statestack[-plen:]
|
||||||
@ -2214,6 +2238,8 @@ class Parser(metaclass=ParserMeta):
|
|||||||
t.lineno = lookahead.lineno
|
t.lineno = lookahead.lineno
|
||||||
if hasattr(lookahead, 'index'):
|
if hasattr(lookahead, 'index'):
|
||||||
t.index = lookahead.index
|
t.index = lookahead.index
|
||||||
|
if hasattr(lookahead, 'end'):
|
||||||
|
t.end = lookahead.end
|
||||||
t.value = lookahead
|
t.value = lookahead
|
||||||
lookaheadstack.append(lookahead)
|
lookaheadstack.append(lookahead)
|
||||||
lookahead = t
|
lookahead = t
|
||||||
@ -2225,3 +2251,11 @@ class Parser(metaclass=ParserMeta):
|
|||||||
|
|
||||||
# Call an error function here
|
# Call an error function here
|
||||||
raise RuntimeError('sly: internal parser error!!!\n')
|
raise RuntimeError('sly: internal parser error!!!\n')
|
||||||
|
|
||||||
|
# Return position tracking information
|
||||||
|
def line_position(self, value):
|
||||||
|
return self._line_positions[id(value)]
|
||||||
|
|
||||||
|
def index_position(self, value):
|
||||||
|
return self._index_positions[id(value)]
|
||||||
|
|
||||||
|
@ -1,11 +1,6 @@
|
|||||||
import pytest
|
import pytest
|
||||||
from sly import Lexer
|
from sly import Lexer
|
||||||
|
|
||||||
try:
|
|
||||||
import regex
|
|
||||||
except ImportError:
|
|
||||||
regex = None
|
|
||||||
|
|
||||||
class CalcLexer(Lexer):
|
class CalcLexer(Lexer):
|
||||||
# Set of token names. This is always required
|
# Set of token names. This is always required
|
||||||
tokens = {
|
tokens = {
|
||||||
@ -61,29 +56,6 @@ class CalcLexer(Lexer):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.errors = []
|
self.errors = []
|
||||||
|
|
||||||
if regex is not None:
|
|
||||||
class RegexModuleCalcLexer(Lexer):
|
|
||||||
regex_module = regex
|
|
||||||
|
|
||||||
tokens = { 'ID', 'PLUS', 'MINUS' }
|
|
||||||
|
|
||||||
literals = { '(', ')' }
|
|
||||||
ignore = ' \t'
|
|
||||||
|
|
||||||
ID = r'\p{Ll}+' # Unicode lowercase letters, regex module feature
|
|
||||||
PLUS = r'\+'
|
|
||||||
MINUS = r'-'
|
|
||||||
|
|
||||||
ignore_comment = r'\#.*'
|
|
||||||
|
|
||||||
@_(r'\n+')
|
|
||||||
def newline(self, t):
|
|
||||||
self.lineno += t.value.count('\n')
|
|
||||||
|
|
||||||
def ID(self, t):
|
|
||||||
t.value = t.value.upper()
|
|
||||||
return t
|
|
||||||
|
|
||||||
# Test basic recognition of various tokens and literals
|
# Test basic recognition of various tokens and literals
|
||||||
def test_tokens():
|
def test_tokens():
|
||||||
lexer = CalcLexer()
|
lexer = CalcLexer()
|
||||||
@ -93,17 +65,21 @@ def test_tokens():
|
|||||||
assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
|
assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
|
||||||
assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
|
assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
|
||||||
|
|
||||||
# Test third-party regex module support
|
# Test position tracking
|
||||||
@pytest.mark.skipif(regex is None,
|
def test_positions():
|
||||||
reason="third-party regex module not installed")
|
lexer = CalcLexer()
|
||||||
def test_3rd_party_regex_module():
|
text = 'abc\n( )'
|
||||||
lexer = RegexModuleCalcLexer()
|
toks = list(lexer.tokenize(text))
|
||||||
toks = list(lexer.tokenize('a + b - c'))
|
lines = [t.lineno for t in toks ]
|
||||||
types = [t.type for t in toks]
|
indices = [t.index for t in toks ]
|
||||||
vals = [t.value for t in toks]
|
ends = [t.end for t in toks]
|
||||||
assert types == ['ID','PLUS','ID','MINUS','ID']
|
values = [ text[t.index:t.end] for t in toks ]
|
||||||
assert vals == ['A', '+', 'B', '-', 'C']
|
assert values == ['abc', '(', ')']
|
||||||
|
assert lines == [1, 2, 2]
|
||||||
|
assert indices == [0, 4, 6]
|
||||||
|
assert ends == [3, 5, 7]
|
||||||
|
|
||||||
|
|
||||||
# Test ignored comments and newlines
|
# Test ignored comments and newlines
|
||||||
def test_ignored():
|
def test_ignored():
|
||||||
lexer = CalcLexer()
|
lexer = CalcLexer()
|
||||||
@ -228,23 +204,5 @@ def test_modern_error_return():
|
|||||||
assert vals == [123, ':+-', '+', '-']
|
assert vals == [123, ':+-', '+', '-']
|
||||||
assert lexer.errors == [ ':+-' ]
|
assert lexer.errors == [ ':+-' ]
|
||||||
|
|
||||||
# Test Lexer Inheritance. This class should inherit all of the tokens
|
|
||||||
# and features of ModernCalcLexer, but add two new tokens to it. The
|
|
||||||
# PLUSPLUS token matches before the PLUS token.
|
|
||||||
|
|
||||||
if False:
|
|
||||||
class SubModernCalcLexer(ModernCalcLexer):
|
|
||||||
tokens |= { DOLLAR, PLUSPLUS }
|
|
||||||
DOLLAR = r'\$'
|
|
||||||
PLUSPLUS = r'\+\+'
|
|
||||||
PLUSPLUS.before = PLUS
|
|
||||||
|
|
||||||
def test_lexer_inherit():
|
|
||||||
lexer = SubModernCalcLexer()
|
|
||||||
toks = list(lexer.tokenize('123 + - $ ++ if'))
|
|
||||||
types = [t.type for t in toks]
|
|
||||||
vals = [t.value for t in toks]
|
|
||||||
assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
|
|
||||||
assert vals == [123, '+', '-', '$', '++', 'if']
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user