Various work in progress. Position tracking

2022-09-06 19:38:33 -05:00
parent cd9014eda2
commit 62203d8b75
4 changed files with 77 additions and 71 deletions
--- a/12
+++ b/12
@@ -1,6 +1,18 @@
 In Progress
 -----------
 03/25/2022 Added automatic location tracking to the parser.  Use
 	   Parser.line_position(value) to return the line number
           and Parser.index_position(value) to return a (start, end)
 	   index pair.  value is *any* object returned by one of
 	   the various methods in the parser definition. Typically,
 	   it would be an AST node.  The parser tracks the data using
 	   the value of id(value).
 03/25/2022 Added .end attribute to tokens that specifies the ending
           index of the matching text.   This is used to do more
 	   precise location tracking for the purpose of issuing
 	   more useful error messages.
 05/09/2020 Experimental support for EBNF choices.  For example:
 	      @('term { PLUS|MINUS term }')
--- a/sly/lex.py
+++ b/sly/lex.py
@@ -73,9 +73,9 @@ class Token(object):
    '''
    Representation of a single token.
    '''
-    __slots__ = ('type', 'value', 'lineno', 'index')
+    __slots__ = ('type', 'value', 'lineno', 'index', 'end')
    def __repr__(self):
-        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
+        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'
 class TokenStr(str):
    @staticmethod
@@ -406,7 +406,7 @@ class Lexer(metaclass=LexerMeta):
                tok.index = index
                m = _master_re.match(text, index)
                if m:
-                    index = m.end()
+                    tok.end = index = m.end()
                    tok.value = m.group()
                    tok.type = m.lastgroup
@@ -431,6 +431,7 @@ class Lexer(metaclass=LexerMeta):
                    # No match, see if the character is in literals
                    if text[index] in _literals:
                        tok.value = text[index]
                        tok.end = index + 1
                        tok.type = tok.value
                        index += 1
                        yield tok
@@ -442,6 +443,7 @@ class Lexer(metaclass=LexerMeta):
                        tok.value = text[index:]
                        tok = self.error(tok)
                        if tok is not None:
                            tok.end = self.index
                            yield tok
                        index = self.index
--- a/sly/yacc.py
+++ b/sly/yacc.py
@@ -126,8 +126,6 @@ class YaccProduction:
    @property
    def lineno(self):
        for tok in self._slice:
            if isinstance(tok, YaccSymbol):
                continue
            lineno = getattr(tok, 'lineno', None)
            if lineno:
                return lineno
@@ -136,13 +134,20 @@ class YaccProduction:
    @property
    def index(self):
        """Return the index of the first usable entry in the production slice.

        Entries that are YaccSymbol instances are skipped, as are entries
        whose 'index' attribute is missing or None.  The first remaining
        value is returned.

        Raises:
            AttributeError: if no entry in the slice supplies an index.
        """
        # Lazily pull the 'index' attribute from each non-YaccSymbol entry so
        # that attribute access happens in slice order and stops at the first
        # hit.
        candidates = (
            getattr(entry, 'index', None)
            for entry in self._slice
            if not isinstance(entry, YaccSymbol)
        )
        for position in candidates:
            if position is not None:
                return position
        raise AttributeError('No index attribute found')
    @property
    def end(self):
        """Return the ending index of the last slice entry that carries one.

        Every entry of the production slice is examined in order; the 'end'
        attribute of the last entry whose value is not None is returned.

        Returns:
            The last non-None 'end' value found, or None when no entry in
            the slice provides one.
        """
        result = None
        for tok in self._slice:
            r = getattr(tok, 'end', None)
            # Compare against None rather than relying on truthiness so that
            # a legitimate end offset of 0 is not silently discarded (this
            # mirrors the 'is not None' check used by the index property).
            if r is not None:
                result = r
        return result
    def __getattr__(self, name):
        if name in self._namemap:
            return self._namemap[name](self._slice)
@@ -1806,12 +1811,6 @@ class ParserMeta(type):
    @classmethod
    def __prepare__(meta, *args, **kwargs):
        """Build the namespace object used while the class body executes.

        A fresh ParserMetaDict is created and pre-populated with a single
        entry, '_', bound to _decorator, so the name '_' is available inside
        the class body.  The positional and keyword arguments are accepted
        but not used here.

        Returns:
            The ParserMetaDict instance with '_' already defined.
        """
        d = ParserMetaDict()
 #        def _(rule, *extra):
 #            rules = [rule, *extra]
 #            def decorate(func):
 #                func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
 #                return func
 #            return decorate
        # NOTE(review): _decorator presumably replaces the commented-out
        # inline '_' helper above -- confirm against its definition.
        d['_'] = _decorator
        return d
@@ -1822,6 +1821,9 @@ class ParserMeta(type):
        return cls
 class Parser(metaclass=ParserMeta):
    # Automatic tracking of position information
    track_positions = True
    # Logging object where debugging/diagnostic messages are sent
    log = SlyLogger(sys.stderr)     
@@ -2076,9 +2078,15 @@ class Parser(metaclass=ParserMeta):
        self.tokens = tokens
        self.statestack = statestack = []                 # Stack of parsing states
        self.symstack = symstack = []                     # Stack of grammar symbols
-        pslice._stack = symstack                           # Associate the stack with the production
+        pslice._stack = symstack                          # Associate the stack with the production
        self.restart()
        # Set up position tracking
        track_positions = self.track_positions
        if not hasattr(self, '_line_positions'):
            self._line_positions = { }           # id: -> lineno
            self._index_positions = { }          # id: -> (start, end)
        errtoken   = None                                 # Err token
        while True:
            # Get the next symbol on the input.  If a lookahead symbol
@@ -2093,7 +2101,7 @@ class Parser(metaclass=ParserMeta):
                    if not lookahead:
                        lookahead = YaccSymbol()
                        lookahead.type = '$end'
-
+                    
                # Check the action table
                ltype = lookahead.type
                t = actions[self.state].get(ltype)
@@ -2129,7 +2137,23 @@ class Parser(metaclass=ParserMeta):
                    value = p.func(self, pslice)
                    if value is pslice:
                        value = (pname, *(s.value for s in pslice._slice))
                    sym.value = value
                    # Record positions
                    if track_positions:
                        if plen:
                            sym.lineno = symstack[-plen].lineno
                            sym.index = symstack[-plen].index
                            sym.end = symstack[-1].end
                        else:
                            # A zero-length production  (what to put here?)
                            sym.lineno = None
                            sym.index = None
                            sym.end = None
                        self._line_positions[id(value)] = sym.lineno
                        self._index_positions[id(value)] = (sym.index, sym.end)
                    if plen:
                        del symstack[-plen:]
                        del statestack[-plen:]
@@ -2214,6 +2238,8 @@ class Parser(metaclass=ParserMeta):
                        t.lineno = lookahead.lineno
                    if hasattr(lookahead, 'index'):
                        t.index = lookahead.index
                    if hasattr(lookahead, 'end'):
                        t.end = lookahead.end
                    t.value = lookahead
                    lookaheadstack.append(lookahead)
                    lookahead = t
@@ -2225,3 +2251,11 @@ class Parser(metaclass=ParserMeta):
            # Call an error function here
            raise RuntimeError('sly: internal parser error!!!\n')
    # Return position tracking information
    def line_position(self, value):
        """Return the line number stored for *value*.

        Positions are keyed by id(value), so *value* must be the very same
        object that was recorded, not merely an equal one.

        Raises:
            KeyError: if no line position is stored under id(value).
        """
        key = id(value)
        return self._line_positions[key]
    def index_position(self, value):
        """Return the index position entry stored for *value*.

        Positions are keyed by id(value), so *value* must be the very same
        object that was recorded, not merely an equal one.

        Raises:
            KeyError: if no index position is stored under id(value).
        """
        key = id(value)
        return self._index_positions[key]
--- a/tests/test_lex.py
+++ b/tests/test_lex.py
@@ -1,11 +1,6 @@
 import pytest
 from sly import Lexer
 try:
    import regex
 except ImportError:
    regex = None
 class CalcLexer(Lexer):
    # Set of token names.   This is always required
    tokens = {
@@ -61,29 +56,6 @@ class CalcLexer(Lexer):
    def __init__(self):
        self.errors = []
 if regex is not None:
    class RegexModuleCalcLexer(Lexer):
        regex_module = regex
        tokens = { 'ID', 'PLUS', 'MINUS' }
        literals = { '(', ')' }
        ignore = ' \t'
        ID      = r'\p{Ll}+'  # Unicode lowercase letters, regex module feature
        PLUS    = r'\+'
        MINUS   = r'-'
        ignore_comment = r'\#.*'
        @_(r'\n+')
        def newline(self, t):
            self.lineno += t.value.count('\n')
        def ID(self, t):
            t.value = t.value.upper()
            return t
 # Test basic recognition of various tokens and literals
 def test_tokens():
    lexer = CalcLexer()
@@ -93,17 +65,21 @@ def test_tokens():
    assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
    assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
-# Test third-party regex module support
+# Test position tracking
-@pytest.mark.skipif(regex is None,
+def test_positions():
-                    reason="third-party regex module not installed")
+    lexer = CalcLexer()
-def test_3rd_party_regex_module():
+    text = 'abc\n( )'
-    lexer = RegexModuleCalcLexer()
+    toks = list(lexer.tokenize(text))
-    toks = list(lexer.tokenize('a + b - c'))
+    lines = [t.lineno for t in toks ]
-    types = [t.type for t in toks]
+    indices = [t.index for t in toks ]
-    vals = [t.value for t in toks]
+    ends = [t.end for t in toks]
-    assert types == ['ID','PLUS','ID','MINUS','ID']
+    values = [ text[t.index:t.end] for t in toks ]
-    assert vals == ['A', '+', 'B', '-', 'C']
+    assert values == ['abc', '(', ')']
    assert lines == [1, 2, 2]
    assert indices == [0, 4, 6]
    assert ends == [3, 5, 7]
 # Test ignored comments and newlines
 def test_ignored():
    lexer = CalcLexer()
@@ -228,23 +204,5 @@ def test_modern_error_return():
    assert vals == [123, ':+-', '+', '-']
    assert lexer.errors == [ ':+-' ]
 # Test Lexer Inheritance.  This class should inherit all of the tokens
 # and features of ModernCalcLexer, but add two new tokens to it.  The
 # PLUSPLUS token matches before the PLUS token.
 if False:
    class SubModernCalcLexer(ModernCalcLexer):
        tokens |= { DOLLAR, PLUSPLUS }
        DOLLAR = r'\$'
        PLUSPLUS = r'\+\+'
        PLUSPLUS.before = PLUS
    def test_lexer_inherit():
        lexer = SubModernCalcLexer()
        toks = list(lexer.tokenize('123 + - $ ++ if'))
        types = [t.type for t in toks]
        vals = [t.value for t in toks]
        assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
        assert vals == [123, '+', '-', '$', '++', 'if']