From 62203d8b75e9e7c143c53c452489f226b06cecde Mon Sep 17 00:00:00 2001
From: David Beazley
Date: Tue, 6 Sep 2022 19:38:33 -0500
Subject: [PATCH] Various work in progress. Position tracking

---
 CHANGES           | 12 ++++++++
 sly/lex.py        |  8 ++++--
 sly/yacc.py       | 58 +++++++++++++++++++++++++++++++--------
 tests/test_lex.py | 70 ++++++++++-------------------------------------
 4 files changed, 77 insertions(+), 71 deletions(-)

diff --git a/CHANGES b/CHANGES
index f1846ea..66522c8 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,18 @@
 In Progress
 -----------
 
+03/25/2022 Added automatic location tracking to the parser. Use
+           Parser.line_position(value) to return the line number
+           and Parser.index_position(value) to return a (start, end)
+           index pair. value is *any* object returned by one of
+           the various methods in the parser definition. Typically,
+           it would be an AST node. The parser tracks the data using
+           the value of id(value).
+03/25/2022 Added a .end attribute to tokens that specifies the ending
+           index of the matched text. This is used to do more
+           precise location tracking for the purpose of issuing
+           more useful error messages.
+
 05/09/2020 Experimental support for EBNF choices. For example:
 
             @('term { PLUS|MINUS term }')
diff --git a/sly/lex.py b/sly/lex.py
index 2f3a345..b2e4725 100644
--- a/sly/lex.py
+++ b/sly/lex.py
@@ -73,9 +73,9 @@ class Token(object):
     '''
     Representation of a single token.
     '''
-    __slots__ = ('type', 'value', 'lineno', 'index')
+    __slots__ = ('type', 'value', 'lineno', 'index', 'end')
     def __repr__(self):
-        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
+        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'
 
 class TokenStr(str):
     @staticmethod
@@ -406,7 +406,7 @@ class Lexer(metaclass=LexerMeta):
                 tok.index = index
                 m = _master_re.match(text, index)
                 if m:
-                    index = m.end()
+                    tok.end = index = m.end()
                     tok.value = m.group()
                     tok.type = m.lastgroup
 
@@ -431,6 +431,7 @@ class Lexer(metaclass=LexerMeta):
                 # No match, see if the character is in literals
                 if text[index] in _literals:
                     tok.value = text[index]
+                    tok.end = index + 1
                     tok.type = tok.value
                     index += 1
                     yield tok
@@ -442,6 +443,7 @@ class Lexer(metaclass=LexerMeta):
                 tok.value = text[index:]
                 tok = self.error(tok)
                 if tok is not None:
+                    tok.end = self.index
                     yield tok
 
                 index = self.index
diff --git a/sly/yacc.py b/sly/yacc.py
index 085ed6b..95e90cb 100644
--- a/sly/yacc.py
+++ b/sly/yacc.py
@@ -126,8 +126,6 @@ class YaccProduction:
     @property
     def lineno(self):
         for tok in self._slice:
-            if isinstance(tok, YaccSymbol):
-                continue
             lineno = getattr(tok, 'lineno', None)
             if lineno:
                 return lineno
@@ -136,13 +134,20 @@ class YaccProduction:
     @property
     def index(self):
         for tok in self._slice:
-            if isinstance(tok, YaccSymbol):
-                continue
            index = getattr(tok, 'index', None)
             if index is not None:
                 return index
         raise AttributeError('No index attribute found')
 
+    @property
+    def end(self):
+        result = None
+        for tok in self._slice:
+            r = getattr(tok, 'end', None)
+            if r:
+                result = r
+        return result
+
     def __getattr__(self, name):
         if name in self._namemap:
             return self._namemap[name](self._slice)
@@ -1806,12 +1811,6 @@ class ParserMeta(type):
     @classmethod
     def __prepare__(meta, *args, **kwargs):
         d = ParserMetaDict()
-#        def _(rule, *extra):
-#            rules = [rule, *extra]
-#            def decorate(func):
-#                func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
-#                return func
-#            return decorate
         d['_'] = _decorator
         return d
 
@@ -1822,6 +1821,9 @@ class ParserMeta(type):
         return cls
 
 class Parser(metaclass=ParserMeta):
+    # Automatic tracking of position information
+    track_positions = True
+
     # Logging object where debugging/diagnostic messages are sent
     log = SlyLogger(sys.stderr)
 
@@ -2076,9 +2078,15 @@ class Parser(metaclass=ParserMeta):
         self.tokens = tokens
         self.statestack = statestack = []  # Stack of parsing states
         self.symstack = symstack = []      # Stack of grammar symbols
-        pslice._stack = symstack           # Associate the stack with the production
+        pslice._stack = symstack            # Associate the stack with the production
         self.restart()
 
+        # Set up position tracking
+        track_positions = self.track_positions
+        if not hasattr(self, '_line_positions'):
+            self._line_positions = { }   # id: -> lineno
+            self._index_positions = { }  # id: -> (start, end)
+
         errtoken = None                    # Err token
         while True:
             # Get the next symbol on the input.  If a lookahead symbol
@@ -2093,7 +2101,7 @@ class Parser(metaclass=ParserMeta):
                 if not lookahead:
                     lookahead = YaccSymbol()
                     lookahead.type = '$end'
-
+
             # Check the action table
             ltype = lookahead.type
             t = actions[self.state].get(ltype)
@@ -2129,7 +2137,23 @@ class Parser(metaclass=ParserMeta):
                     value = p.func(self, pslice)
                     if value is pslice:
                         value = (pname, *(s.value for s in pslice._slice))
+
                     sym.value = value
+
+                    # Record positions
+                    if track_positions:
+                        if plen:
+                            sym.lineno = symstack[-plen].lineno
+                            sym.index = symstack[-plen].index
+                            sym.end = symstack[-1].end
+                        else:
+                            # A zero-length production  (what to put here?)
+                            sym.lineno = None
+                            sym.index = None
+                            sym.end = None
+                        self._line_positions[id(value)] = sym.lineno
+                        self._index_positions[id(value)] = (sym.index, sym.end)
+
                     if plen:
                         del symstack[-plen:]
                         del statestack[-plen:]
@@ -2214,6 +2238,8 @@ class Parser(metaclass=ParserMeta):
                         t.lineno = lookahead.lineno
                     if hasattr(lookahead, 'index'):
                         t.index = lookahead.index
+                    if hasattr(lookahead, 'end'):
+                        t.end = lookahead.end
                     t.value = lookahead
                     lookaheadstack.append(lookahead)
                     lookahead = t
@@ -2225,3 +2251,11 @@ class Parser(metaclass=ParserMeta):
                 continue
             # Call an error function here
             raise RuntimeError('sly: internal parser error!!!\n')
+
+    # Return position tracking information
+    def line_position(self, value):
+        return self._line_positions[id(value)]
+
+    def index_position(self, value):
+        return self._index_positions[id(value)]
+
diff --git a/tests/test_lex.py b/tests/test_lex.py
index c7bf3e9..e936130 100644
--- a/tests/test_lex.py
+++ b/tests/test_lex.py
@@ -1,11 +1,6 @@
 import pytest
 from sly import Lexer
 
-try:
-    import regex
-except ImportError:
-    regex = None
-
 class CalcLexer(Lexer):
     # Set of token names.   This is always required
     tokens = {
@@ -61,29 +56,6 @@ class CalcLexer(Lexer):
     def __init__(self):
         self.errors = []
 
-if regex is not None:
-    class RegexModuleCalcLexer(Lexer):
-        regex_module = regex
-
-        tokens = { 'ID', 'PLUS', 'MINUS' }
-
-        literals = { '(', ')' }
-        ignore = ' \t'
-
-        ID = r'\p{Ll}+'     # Unicode lowercase letters, regex module feature
-        PLUS = r'\+'
-        MINUS = r'-'
-
-        ignore_comment = r'\#.*'
-
-        @_(r'\n+')
-        def newline(self, t):
-            self.lineno += t.value.count('\n')
-
-        def ID(self, t):
-            t.value = t.value.upper()
-            return t
-
 # Test basic recognition of various tokens and literals
 def test_tokens():
     lexer = CalcLexer()
@@ -93,17 +65,21 @@ def test_tokens():
     toks = list(lexer.tokenize('abc 123 + - * / = < <= ( )'))
     types = [t.type for t in toks]
     vals = [t.value for t in toks]
     assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
     assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
 
-# Test third-party regex module support
-@pytest.mark.skipif(regex is None,
-                    reason="third-party regex module not installed")
-def test_3rd_party_regex_module():
-    lexer = RegexModuleCalcLexer()
-    toks = list(lexer.tokenize('a + b - c'))
-    types = [t.type for t in toks]
-    vals = [t.value for t in toks]
-    assert types == ['ID','PLUS','ID','MINUS','ID']
-    assert vals == ['A', '+', 'B', '-', 'C']
+# Test position tracking
+def test_positions():
+    lexer = CalcLexer()
+    text = 'abc\n( )'
+    toks = list(lexer.tokenize(text))
+    lines = [t.lineno for t in toks]
+    indices = [t.index for t in toks]
+    ends = [t.end for t in toks]
+    values = [text[t.index:t.end] for t in toks]
+    assert values == ['abc', '(', ')']
+    assert lines == [1, 2, 2]
+    assert indices == [0, 4, 6]
+    assert ends == [3, 5, 7]
+
 # Test ignored comments and newlines
 def test_ignored():
     lexer = CalcLexer()
@@ -228,23 +204,5 @@ def test_modern_error_return():
     assert vals == [123, ':+-', '+', '-']
     assert lexer.errors == [ ':+-' ]
 
-# Test Lexer Inheritance.  This class should inherit all of the tokens
-# and features of ModernCalcLexer, but add two new tokens to it.  The
-# PLUSPLUS token matches before the PLUS token.
-
-if False:
-    class SubModernCalcLexer(ModernCalcLexer):
-        tokens |= { DOLLAR, PLUSPLUS }
-        DOLLAR = r'\$'
-        PLUSPLUS = r'\+\+'
-        PLUSPLUS.before = PLUS
-
-    def test_lexer_inherit():
-        lexer = SubModernCalcLexer()
-        toks = list(lexer.tokenize('123 + - $ ++ if'))
-        types = [t.type for t in toks]
-        vals = [t.value for t in toks]
-        assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
-        assert vals == [123, '+', '-', '$', '++', 'if']
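
Usage note, a minimal sketch that is not part of the patch: the new token
.end attribute holds the index one past the matched text, so the pair
(tok.index, tok.end) slices the original lexeme straight back out of the
input string. The WordLexer below is hypothetical; only the .end attribute
comes from this patch:

    from sly import Lexer

    class WordLexer(Lexer):
        tokens = { 'WORD' }
        ignore = ' '
        WORD = r'\w+'

    text = 'hello world'
    for tok in WordLexer().tokenize(text):
        # With this patch, tok.end is one past the last matched character,
        # so slicing with (index, end) recovers the exact matched text.
        assert text[tok.index:tok.end] == tok.value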
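On the parser side, a sketch of the new API described in the CHANGES entry:
Parser.line_position(value) and Parser.index_position(value), tracked
internally under id(value) whenever track_positions is enabled (the
default). The toy grammar here is hypothetical; the two lookup calls are
the ones added by this patch:

    from sly import Lexer, Parser

    class CalcLexer(Lexer):
        tokens = { 'NUMBER', 'PLUS' }
        ignore = ' \t'
        NUMBER = r'\d+'
        PLUS = r'\+'

    class CalcParser(Parser):
        tokens = CalcLexer.tokens

        @_('expr PLUS term')
        def expr(self, p):
            return ('+', p.expr, p.term)

        @_('term')
        def expr(self, p):
            return p.term

        @_('NUMBER')
        def term(self, p):
            return ('num', int(p.NUMBER))

    parser = CalcParser()
    tree = parser.parse(CalcLexer().tokenize('1 + 23'))

    # Positions can be looked up for any value returned by a rule;
    # here, the tuple built for the top-level expression.
    print(parser.line_position(tree))     # line number where the node begins
    print(parser.index_position(tree))    # (start, end) index pair spanning it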
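The second CHANGES entry motivates .end with more precise error messages.
One way user code might exploit that, again a sketch rather than anything
in the patch, is to underline the offending lexeme in a diagnostic:

    def report_error(text, tok, msg):
        # Locate the source line containing the token.
        start = text.rfind('\n', 0, tok.index) + 1
        stop = text.find('\n', tok.index)
        if stop < 0:
            stop = len(text)
        # Print the message, the line, and a caret marker that spans
        # exactly the token's (index, end) range.
        print(f'{msg} (line {tok.lineno})')
        print(text[start:stop])
        print(' ' * (tok.index - start) + '^' * (tok.end - tok.index))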