Various work in progress. Position tracking
This commit is contained in:
parent
cd9014eda2
commit
62203d8b75
CHANGES (12 lines changed)
@@ -1,5 +1,17 @@
 In Progress
 -----------
+03/25/2022 Added automatic location tracking to the parser. Use
+           Parser.line_position(value) to return the line number
+           and Parser.index_position(value) to return a (start, end)
+           index pair. value is *any* object returned by one of
+           the various methods in the parser definition. Typically,
+           it would be an AST node. The parser tracks the data using
+           the value of id(value).
+
+03/25/2022 Added .end attribute to tokens that specifies the ending
+           index of the matching text. This is used to do more
+           precise location tracking for the purpose of issuing
+           more useful error messages.
+
 05/09/2020 Experimental support for EBNF choices. For example:
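A quick usage sketch of the new API described above (the CalcLexer/CalcParser names and the input string are illustrative, not part of this commit):

    lexer = CalcLexer()
    parser = CalcParser()                  # track_positions is True by default
    tree = parser.parse(lexer.tokenize('a = 3 + 4'))

    # Any object returned by a grammar rule can be looked up by identity
    print(parser.line_position(tree))      # line number of the node
    print(parser.index_position(tree))     # (start, end) index pair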
sly/lex.py
@@ -73,9 +73,9 @@ class Token(object):
     '''
     Representation of a single token.
     '''
-    __slots__ = ('type', 'value', 'lineno', 'index')
+    __slots__ = ('type', 'value', 'lineno', 'index', 'end')

     def __repr__(self):
-        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
+        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'

 class TokenStr(str):
     @staticmethod
@@ -406,7 +406,7 @@ class Lexer(metaclass=LexerMeta):
             tok.index = index
             m = _master_re.match(text, index)
             if m:
-                index = m.end()
+                tok.end = index = m.end()
                 tok.value = m.group()
                 tok.type = m.lastgroup

@@ -431,6 +431,7 @@ class Lexer(metaclass=LexerMeta):
                 # No match, see if the character is in literals
                 if text[index] in _literals:
                     tok.value = text[index]
+                    tok.end = index + 1
                     tok.type = tok.value
                     index += 1
                     yield tok

@@ -442,6 +443,7 @@ class Lexer(metaclass=LexerMeta):
                     tok.value = text[index:]
                     tok = self.error(tok)
                     if tok is not None:
+                        tok.end = self.index
                         yield tok

                     index = self.index
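With both endpoints recorded on every token, the matched text can always be recovered by slicing, which is exactly what the new test further down relies on. A small illustrative helper (show_token is hypothetical, not part of SLY):

    def show_token(text, tok):
        # tok.index .. tok.end is the half-open span of the matched text
        lexeme = text[tok.index:tok.end]
        print(f'line {tok.lineno}: {tok.type} = {lexeme!r}')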
sly/yacc.py (56 lines changed)
@@ -126,8 +126,6 @@ class YaccProduction:
     @property
     def lineno(self):
         for tok in self._slice:
             if isinstance(tok, YaccSymbol):
                 continue
             lineno = getattr(tok, 'lineno', None)
             if lineno:
                 return lineno
@@ -136,13 +134,20 @@
     @property
     def index(self):
         for tok in self._slice:
             if isinstance(tok, YaccSymbol):
                 continue
             index = getattr(tok, 'index', None)
             if index is not None:
                 return index
         raise AttributeError('No index attribute found')

+    @property
+    def end(self):
+        result = None
+        for tok in self._slice:
+            r = getattr(tok, 'end', None)
+            if r:
+                result = r
+        return result

     def __getattr__(self, name):
         if name in self._namemap:
             return self._namemap[name](self._slice)
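Inside a grammar rule, the new end property complements the existing lineno and index properties, so a rule body can see the full source span it just matched. A sketch (the rule and the tuple-based AST are hypothetical):

    @_('expr PLUS term')
    def expr(self, p):
        # p.index .. p.end covers all the text matched by this production
        print('expr spans', p.index, 'to', p.end)
        return ('add', p.expr, p.term)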
@@ -1806,12 +1811,6 @@ class ParserMeta(type):
     @classmethod
     def __prepare__(meta, *args, **kwargs):
         d = ParserMetaDict()
-        # def _(rule, *extra):
-        #     rules = [rule, *extra]
-        #     def decorate(func):
-        #         func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
-        #         return func
-        #     return decorate
         d['_'] = _decorator
         return d
@@ -1822,6 +1821,9 @@ class ParserMeta(type):
         return cls

 class Parser(metaclass=ParserMeta):
+    # Automatic tracking of position information
+    track_positions = True
+
     # Logging object where debugging/diagnostic messages are sent
     log = SlyLogger(sys.stderr)
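Since track_positions is an ordinary class attribute, a parser definition can opt out of the bookkeeping (a minimal sketch, assuming tracking is not wanted):

    class CalcParser(Parser):
        track_positions = False    # disable automatic position tracking
        ...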
@@ -2076,9 +2078,15 @@ class Parser(metaclass=ParserMeta):
         self.tokens = tokens
         self.statestack = statestack = []    # Stack of parsing states
         self.symstack = symstack = []        # Stack of grammar symbols
         pslice._stack = symstack             # Associate the stack with the production
         self.restart()

+        # Set up position tracking
+        track_positions = self.track_positions
+        if not hasattr(self, '_line_positions'):
+            self._line_positions = { }       # id: -> lineno
+            self._index_positions = { }      # id: -> (start, end)

         errtoken = None                      # Err token
         while True:
             # Get the next symbol on the input. If a lookahead symbol
@@ -2129,7 +2137,23 @@ class Parser(metaclass=ParserMeta):
                     value = p.func(self, pslice)
                     if value is pslice:
                         value = (pname, *(s.value for s in pslice._slice))

                     sym.value = value

+                    # Record positions
+                    if track_positions:
+                        if plen:
+                            sym.lineno = symstack[-plen].lineno
+                            sym.index = symstack[-plen].index
+                            sym.end = symstack[-1].end
+                        else:
+                            # A zero-length production (what to put here?)
+                            sym.lineno = None
+                            sym.index = None
+                            sym.end = None
+                        self._line_positions[id(value)] = sym.lineno
+                        self._index_positions[id(value)] = (sym.index, sym.end)

                     if plen:
                         del symstack[-plen:]
                         del statestack[-plen:]
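The net effect: a reduced value starts where its first symbol starts (symstack[-plen]) and ends where its last symbol ends (symstack[-1]). A zero-length production has nothing to anchor to, so its positions are recorded as None, as the comment in the code acknowledges.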
@@ -2214,6 +2238,8 @@ class Parser(metaclass=ParserMeta):
                     t.lineno = lookahead.lineno
                     if hasattr(lookahead, 'index'):
                         t.index = lookahead.index
+                    if hasattr(lookahead, 'end'):
+                        t.end = lookahead.end
                     t.value = lookahead
                     lookaheadstack.append(lookahead)
                     lookahead = t
@@ -2225,3 +2251,11 @@ class Parser(metaclass=ParserMeta):

         # Call an error function here
         raise RuntimeError('sly: internal parser error!!!\n')
+
+    # Return position tracking information
+    def line_position(self, value):
+        return self._line_positions[id(value)]
+
+    def index_position(self, value):
+        return self._index_positions[id(value)]
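One caveat worth noting about this design (an observation, not stated in the commit): because the tables are keyed by id(value), a lookup is only meaningful while the returned object is still alive, and values that Python interns or reuses (small integers, short strings) can collide. Unique objects such as AST nodes are the intended use case.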
tests/test_lex.py
@@ -1,11 +1,6 @@
 import pytest
 from sly import Lexer

-try:
-    import regex
-except ImportError:
-    regex = None
-
 class CalcLexer(Lexer):
     # Set of token names. This is always required
     tokens = {
@@ -61,29 +56,6 @@ class CalcLexer(Lexer):
     def __init__(self):
         self.errors = []

-if regex is not None:
-    class RegexModuleCalcLexer(Lexer):
-        regex_module = regex
-
-        tokens = { 'ID', 'PLUS', 'MINUS' }
-
-        literals = { '(', ')' }
-        ignore = ' \t'
-
-        ID = r'\p{Ll}+'             # Unicode lowercase letters, regex module feature
-        PLUS = r'\+'
-        MINUS = r'-'
-
-        ignore_comment = r'\#.*'
-
-        @_(r'\n+')
-        def newline(self, t):
-            self.lineno += t.value.count('\n')
-
-        def ID(self, t):
-            t.value = t.value.upper()
-            return t
-
 # Test basic recognition of various tokens and literals
 def test_tokens():
     lexer = CalcLexer()
@@ -93,16 +65,20 @@ def test_tokens():
     assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
     assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']

-# Test third-party regex module support
-@pytest.mark.skipif(regex is None,
-                    reason="third-party regex module not installed")
-def test_3rd_party_regex_module():
-    lexer = RegexModuleCalcLexer()
-    toks = list(lexer.tokenize('a + b - c'))
-    types = [t.type for t in toks]
-    vals = [t.value for t in toks]
-    assert types == ['ID','PLUS','ID','MINUS','ID']
-    assert vals == ['A', '+', 'B', '-', 'C']
+# Test position tracking
+def test_positions():
+    lexer = CalcLexer()
+    text = 'abc\n( )'
+    toks = list(lexer.tokenize(text))
+    lines = [t.lineno for t in toks]
+    indices = [t.index for t in toks]
+    ends = [t.end for t in toks]
+    values = [text[t.index:t.end] for t in toks]
+    assert values == ['abc', '(', ')']
+    assert lines == [1, 2, 2]
+    assert indices == [0, 4, 6]
+    assert ends == [3, 5, 7]

 # Test ignored comments and newlines
 def test_ignored():
@@ -228,23 +204,5 @@ def test_modern_error_return():
     assert vals == [123, ':+-', '+', '-']
     assert lexer.errors == [ ':+-' ]

-# Test Lexer Inheritance. This class should inherit all of the tokens
-# and features of ModernCalcLexer, but add two new tokens to it. The
-# PLUSPLUS token matches before the PLUS token.
-
-if False:
-    class SubModernCalcLexer(ModernCalcLexer):
-        tokens |= { DOLLAR, PLUSPLUS }
-        DOLLAR = r'\$'
-        PLUSPLUS = r'\+\+'
-        PLUSPLUS.before = PLUS
-
-    def test_lexer_inherit():
-        lexer = SubModernCalcLexer()
-        toks = list(lexer.tokenize('123 + - $ ++ if'))
-        types = [t.type for t in toks]
-        vals = [t.value for t in toks]
-        assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
-        assert vals == [123, '+', '-', '$', '++', 'if']