Changes to token specification. More metamagic

parent b74e7223ce
commit b088d9b2ce

CHANGES | 59
@@ -1,5 +1,64 @@
 Version 0.3
 -----------
 
+1/27/2018  Tokens no longer have to be specified as strings.  For example, you
+           can now write:
+
+               from sly import Lexer
+
+               class TheLexer(Lexer):
+                   tokens = { ID, NUMBER, PLUS, MINUS }
+
+                   ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+                   NUMBER = r'\d+'
+                   PLUS = r'\+'
+                   MINUS = r'-'
+
+           This convention also carries over to the parser for things such
+           as precedence specifiers:
+
+               from sly import Parser
+               class TheParser(Parser):
+                   tokens = TheLexer.tokens
+
+                   precedence = (
+                       ('left', PLUS, MINUS),
+                       ('left', TIMES, DIVIDE),
+                       ('right', UMINUS),
+                   )
+                   ...
+
+           Nevermind the fact that ID, NUMBER, PLUS, and MINUS appear to be
+           undefined identifiers.  It all works.
+
+1/27/2018  Tokens now allow special-case remapping.  For example:
+
+               from sly import Lexer
+
+               class TheLexer(Lexer):
+                   tokens = { ID, IF, ELSE, WHILE, NUMBER, PLUS, MINUS }
+
+                   ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+                   ID['if'] = IF
+                   ID['else'] = ELSE
+                   ID['while'] = WHILE
+
+                   NUMBER = r'\d+'
+                   PLUS = r'\+'
+                   MINUS = r'-'
+
+           In this code, the ID rule matches any identifier.  However,
+           special cases have been made for IF, ELSE, and WHILE tokens.
+           Previously, this had to be handled in a special action method
+           such as this:
+
+               def ID(self, t):
+                   if t.value in { 'if', 'else', 'while' }:
+                       t.type = t.value.upper()
+                   return t
+
+           Nevermind the fact that the syntax appears to suggest that strings
+           work as a kind of mutable mapping.
+
 1/16/2018  Usability improvement on Lexer class.  Regular expression rules
            specified as strings that don't match any name in tokens are
            now reported as errors.
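The bare-name trick in the two entries above comes from the metaclass changes
to sly/lex.py and sly/yacc.py later in this diff.  Reduced to a minimal
standalone sketch (names here are invented for illustration, not SLY's API),
the idea is a class namespace whose lookup turns any unknown ALL-CAPS
identifier into its own string:

    class MagicDict(dict):
        def __getitem__(self, key):
            # An undefined NAME used in the class body resolves to 'NAME'
            if key not in self and key.isupper() and key[:1] != '_':
                return key
            return super().__getitem__(key)

    class MagicMeta(type):
        @classmethod
        def __prepare__(meta, clsname, bases):
            return MagicDict()

    class Demo(metaclass=MagicMeta):
        tokens = { ID, NUMBER, PLUS, MINUS }    # no NameError raised here

    print(Demo.tokens)    # {'ID', 'NUMBER', 'PLUS', 'MINUS'} (set order may vary)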
@@ -1,6 +1,6 @@
-SLY (Sly Lex-Yacc) Version 0.2
+SLY (Sly Lex-Yacc) Version 0.3
 
-Copyright (C) 2016-2017
+Copyright (C) 2016-2018
 David M. Beazley (Dabeaz LLC)
 All rights reserved.
 
@@ -85,9 +85,7 @@ expressions and store variables:
     from sly import Lexer, Parser
 
     class CalcLexer(Lexer):
-        tokens = {
-            'NAME', 'NUMBER',
-        }
+        tokens = { NAME, NUMBER }
         ignore = ' \t'
         literals = { '=', '+', '-', '*', '/', '(', ')' }
 
@@ -60,9 +60,7 @@ expressions and store variables::
     from sly import Lexer, Parser
 
     class CalcLexer(Lexer):
-        tokens = {
-            'NAME', 'NUMBER',
-        }
+        tokens = { NAME, NUMBER }
         ignore = ' \t'
         literals = { '=', '+', '-', '*', '/', '(', ')' }
 

docs/sly.rst | 165
@@ -68,17 +68,8 @@ lexer that tokenizes the above text::
 
     class CalcLexer(Lexer):
         # Set of token names. This is always required
-        tokens = {
-            'ID',
-            'NUMBER',
-            'PLUS',
-            'MINUS',
-            'TIMES',
-            'DIVIDE',
-            'ASSIGN',
-            'LPAREN',
-            'RPAREN',
-        }
+        tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
+                   DIVIDE, ASSIGN, LPAREN, RPAREN }
 
         # String containing ignored characters between tokens
         ignore = ' \t'
@@ -131,19 +122,12 @@ In the example, the following code specified the token names::
     class CalcLexer(Lexer):
         ...
         # Set of token names. This is always required
-        tokens = {
-            'ID',
-            'NUMBER',
-            'PLUS',
-            'MINUS',
-            'TIMES',
-            'DIVIDE',
-            'ASSIGN',
-            'LPAREN',
-            'RPAREN',
-        }
+        tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
+                   DIVIDE, ASSIGN, LPAREN, RPAREN }
         ...
 
+Token names should be specified using all-caps as shown.
+
 Specification of token match patterns
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -167,7 +151,7 @@ short tokens. For example, if you wanted to have separate tokens for
 example::
 
     class MyLexer(Lexer):
-        tokens = {'ASSIGN', 'EQ', ...}
+        tokens = { ASSIGN, EQ, ...}
         ...
         EQ     = r'=='       # MUST APPEAR FIRST! (LONGER)
         ASSIGN = r'='
@@ -251,12 +235,10 @@ Instead of using the ``@_()`` decorator, you can also write a method
 that matches the same name as a token previously specified as a
 string. For example::
 
-    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+    NUMBER = r'\d+'
     ...
-    def ID(self, t):
-        reserved = { 'if', 'else', 'while', 'for' }
-        if t.value in reserved:
-            t.type = t.value.upper()
+    def NUMBER(self, t):
+        t.value = int(t.value)
         return t
 
 This is potentially useful trick for debugging a lexer. You can temporarily
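Filled out into a complete class for illustration (a sketch; the fragment
above is what the docs actually show), the same-name trick looks like this:

    from sly import Lexer

    class CalcLexer(Lexer):
        tokens = { NUMBER, PLUS }
        ignore = ' \t'

        NUMBER = r'\d+'
        PLUS = r'\+'

        # Because NUMBER was first given as a string, this method picks up
        # that pattern and runs for every NUMBER token.  Removing the method
        # restores the original behavior -- handy as a temporary debug hook.
        def NUMBER(self, t):
            t.value = int(t.value)
            return t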
@@ -264,6 +246,36 @@ attach a method a token and have it execute when the token is encountered.
 If you later take the method away, the lexer will revert back to its original
 behavior.
 
+Token Remapping
+^^^^^^^^^^^^^^^
+
+Occasionally, you might need to remap tokens based on special cases.
+Consider the case of matching identifiers such as "abc", "python", or "guido".
+Certain identifiers such as "if", "else", and "while" might need to be
+treated as special keywords.  To handle this, include token remapping rules when
+writing the lexer like this::
+
+    # calclex.py
+
+    from sly import Lexer
+
+    class CalcLexer(Lexer):
+        tokens = { ID, IF, ELSE, WHILE }
+        # String containing ignored characters (between tokens)
+        ignore = ' \t'
+
+        # Base ID rule
+        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+
+        # Special cases
+        ID['if'] = IF
+        ID['else'] = ELSE
+        ID['while'] = WHILE
+
+When parsing an identifier, the special cases will remap certain matching
+values to a new token type. For example, if the value of an identifier is
+"if" above, an ``IF`` token will be generated.
+
 Line numbers and position tracking
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
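For illustration (a sketch, not text from the commit), running the remapping
lexer defined in the added section above on a small input would produce
IF/ELSE/WHILE tokens in place of plain IDs; the repr format matches the Token
output shown later in this diff:

    lexer = CalcLexer()
    for tok in lexer.tokenize('while x if foo'):
        print(tok)

    # Expected output (sketch):
    #   Token(type='WHILE', value='while', lineno=1, index=0)
    #   Token(type='ID', value='x', lineno=1, index=6)
    #   Token(type='IF', value='if', lineno=1, index=8)
    #   Token(type='ID', value='foo', lineno=1, index=11)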
@@ -385,26 +397,11 @@ into practice::
     from sly import Lexer
 
     class CalcLexer(Lexer):
-        # Set of reserved names (language keywords)
-        reserved_words = { 'WHILE', 'IF', 'ELSE', 'PRINT' }
 
         # Set of token names. This is always required
-        tokens = {
-            'NUMBER',
-            'ID',
-            'PLUS',
-            'MINUS',
-            'TIMES',
-            'DIVIDE',
-            'ASSIGN',
-            'EQ',
-            'LT',
-            'LE',
-            'GT',
-            'GE',
-            'NE',
-            *reserved_words,
-        }
+        tokens = { NUMBER, ID, WHILE, IF, ELSE, PRINT,
+                   PLUS, MINUS, TIMES, DIVIDE, ASSIGN,
+                   EQ, LT, LE, GT, GE, NE }
 
         literals = { '(', ')', '{', '}', ';' }
 
@@ -429,12 +426,12 @@ into practice::
             t.value = int(t.value)
             return t
 
-        @_(r'[a-zA-Z_][a-zA-Z0-9_]*')
-        def ID(self, t):
-            # Check if name matches a reserved word (change token type if true)
-            if t.value.upper() in self.reserved_words:
-                t.type = t.value.upper()
-            return t
+        # Identifiers and keywords
+        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+        ID['if'] = IF
+        ID['else'] = ELSE
+        ID['while'] = WHILE
+        ID['print'] = PRINT
 
         ignore_comment = r'\#.*'
 
@@ -443,8 +440,8 @@ into practice::
         def ignore_newline(self, t):
             self.lineno += t.value.count('\n')
 
-        def error(self, value):
-            print('Line %d: Bad character %r' % (self.lineno, value[0]))
+        def error(self, t):
+            print('Line %d: Bad character %r' % (self.lineno, t.value[0]))
             self.index += 1
 
     if __name__ == '__main__':
@@ -462,27 +459,27 @@ into practice::
 
 If you run this code, you'll get output that looks like this::
 
-    Token(ID, 'x', 3, 12)
-    Token(ASSIGN, '=', 3, 14)
-    Token(NUMBER, 0, 3, 16)
-    Token(;, ';', 3, 17)
-    Token(WHILE, 'while', 4, 19)
-    Token((, '(', 4, 25)
-    Token(ID, 'x', 4, 26)
-    Token(LT, '<', 4, 28)
-    Token(NUMBER, 10, 4, 30)
-    Token(), ')', 4, 32)
-    Token({, '{', 4, 34)
-    Token(PRINT, 'print', 5, 40)
-    Token(ID, 'x', 5, 46)
+    Token(type='ID', value='x', lineno=3, index=20)
+    Token(type='ASSIGN', value='=', lineno=3, index=22)
+    Token(type='NUMBER', value=0, lineno=3, index=24)
+    Token(type=';', value=';', lineno=3, index=25)
+    Token(type='WHILE', value='while', lineno=4, index=31)
+    Token(type='(', value='(', lineno=4, index=37)
+    Token(type='ID', value='x', lineno=4, index=38)
+    Token(type='LT', value='<', lineno=4, index=40)
+    Token(type='NUMBER', value=10, lineno=4, index=42)
+    Token(type=')', value=')', lineno=4, index=44)
+    Token(type='{', value='{', lineno=4, index=46)
+    Token(type='PRINT', value='print', lineno=5, index=56)
+    Token(type='ID', value='x', lineno=5, index=62)
     Line 5: Bad character ':'
-    Token(ID, 'x', 6, 53)
-    Token(ASSIGN, '=', 6, 55)
-    Token(ID, 'x', 6, 57)
-    Token(PLUS, '+', 6, 59)
-    Token(NUMBER, 1, 6, 61)
-    Token(;, ';', 6, 62)
-    Token(}, '}', 7, 64)
+    Token(type='ID', value='x', lineno=6, index=73)
+    Token(type='ASSIGN', value='=', lineno=6, index=75)
+    Token(type='ID', value='x', lineno=6, index=77)
+    Token(type='PLUS', value='+', lineno=6, index=79)
+    Token(type='NUMBER', value=1, lineno=6, index=81)
+    Token(type=';', value=';', lineno=6, index=82)
+    Token(type='}', value='}', lineno=7, index=88)
 
 Study this example closely. It might take a bit to digest, but all of the
 essential parts of writing a lexer are there. Tokens have to be specified
@@ -914,8 +911,8 @@ like this::
     class CalcParser(Parser):
         ...
         precedence = (
-            ('left', 'PLUS', 'MINUS'),
-            ('left', 'TIMES', 'DIVIDE'),
+            ('left', PLUS, MINUS),
+            ('left', TIMES, DIVIDE),
         )
 
         # Rules where precedence is applied
@@ -1004,9 +1001,9 @@ like this::
     class CalcParser(Parser):
         ...
         precedence = (
-            ('left', 'PLUS', 'MINUS'),
-            ('left', 'TIMES', 'DIVIDE'),
-            ('right', 'UMINUS'),            # Unary minus operator
+            ('left', PLUS, MINUS),
+            ('left', TIMES, DIVIDE),
+            ('right', UMINUS),            # Unary minus operator
         )
 
 Now, in the grammar file, you write the unary minus rule like this::
@@ -1034,10 +1031,10 @@ operators like ``<`` and ``>`` but you didn't want combinations like
     class MyParser(Parser):
         ...
         precedence = (
-            ('nonassoc', 'LESSTHAN', 'GREATERTHAN'),  # Nonassociative operators
-            ('left', 'PLUS', 'MINUS'),
-            ('left', 'TIMES', 'DIVIDE'),
-            ('right', 'UMINUS'),            # Unary minus operator
+            ('nonassoc', LESSTHAN, GREATERTHAN),  # Nonassociative operators
+            ('left', PLUS, MINUS),
+            ('left', TIMES, DIVIDE),
+            ('right', UMINUS),            # Unary minus operator
         )
 
 If you do this, the occurrence of input text such as ``a < b < c``
@@ -9,28 +9,16 @@ from sly import Lexer, Parser
 
 class CalcLexer(Lexer):
     # Set of token names. This is always required
-    tokens = {
-        'ID',
-        'NUMBER',
-        'PLUS',
-        'MINUS',
-        'TIMES',
-        'DIVIDE',
-        'ASSIGN',
-        'LPAREN',
-        'RPAREN',
-    }
+    tokens = { NUMBER, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN }
 
     # String containing ignored characters between tokens
     ignore = ' \t'
 
     # Regular expression rules for tokens
-    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
     PLUS = r'\+'
     MINUS = r'-'
     TIMES = r'\*'
     DIVIDE = r'/'
-    ASSIGN = r'='
     LPAREN = r'\('
     RPAREN = r'\)'
 
@@ -8,9 +8,7 @@ sys.path.insert(0, "../..")
 from sly import Lexer, Parser
 
 class CalcLexer(Lexer):
-    tokens = {
-        'NAME', 'NUMBER',
-    }
+    tokens = { NAME, NUMBER }
     ignore = ' \t'
     literals = { '=', '+', '-', '*', '/', '(', ')' }
 
@@ -36,7 +34,7 @@ class CalcParser(Parser):
     precedence = (
         ('left', '+', '-'),
         ('left', '*', '/'),
-        ('right', 'UMINUS'),
+        ('right', UMINUS),
         )
 
     def __init__(self):

sly/lex.py | 56
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # sly: lex.py
 #
-# Copyright (C) 2016
+# Copyright (C) 2016 - 2018
 # David M. Beazley (Dabeaz LLC)
 # All rights reserved.
 #
@@ -31,11 +31,10 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # -----------------------------------------------------------------------------
 
-__version__ = '0.2'
+__version__ = '0.3'
 __all__ = ['Lexer', 'LexerStateChange']
 
 import re
-from collections import OrderedDict
 
 class LexError(Exception):
     '''
@@ -78,20 +77,41 @@ class Token(object):
     def __repr__(self):
         return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
 
-class LexerMetaDict(OrderedDict):
+class TokenStr(str):
+    @staticmethod
+    def __new__(cls, value):
+        self = super().__new__(cls, value)
+        self.remap = { }
+        return self
+
+    def __setitem__(self, key, value):
+        self.remap[key] = value
+
+class LexerMetaDict(dict):
     '''
     Special dictionary that prohits duplicate definitions in lexer specifications.
     '''
     def __setitem__(self, key, value):
+        if isinstance(value, str):
+            value = TokenStr(value)
+
         if key in self and not isinstance(value, property):
-            if isinstance(self[key], str):
+            prior = self[key]
+            if isinstance(prior, str):
                 if callable(value):
-                    value.pattern = self[key]
+                    value.pattern = prior
+                    value.remap = getattr(prior, 'remap', None)
             else:
                 raise AttributeError(f'Name {key} redefined')
 
         super().__setitem__(key, value)
 
+    def __getitem__(self, key):
+        if key not in self and key.isupper() and key[:1] != '_':
+            return key
+        else:
+            return super().__getitem__(key)
+
 class LexerMeta(type):
     '''
     Metaclass for collecting lexing rules
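The TokenStr wrapper added above is what makes the ``ID['if'] = IF`` syntax
from the CHANGES entry work: string values assigned in the class body are
wrapped so that item assignment records a remap entry instead of failing on a
plain string.  A standalone sketch of that behavior (illustrative only, using
the class defined in this hunk):

    pattern = TokenStr(r'[a-zA-Z_][a-zA-Z0-9_]*')
    pattern['if'] = 'IF'          # no error: recorded in pattern.remap
    pattern['while'] = 'WHILE'
    print(str(pattern))           # the regular expression text is unchanged
    print(pattern.remap)          # {'if': 'IF', 'while': 'WHILE'}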
@@ -114,7 +134,12 @@ class LexerMeta(type):
 
     def __new__(meta, clsname, bases, attributes):
         del attributes['_']
+        remapping = { key: val.remap for key, val in attributes.items()
+                      if getattr(val, 'remap', None) }
+        attributes = { key: str(val) if isinstance(val, TokenStr) else val
+                       for key, val in attributes.items() }
         cls = super().__new__(meta, clsname, bases, attributes)
+        cls._remapping = remapping
         cls._build(list(attributes.items()))
         return cls
 
@@ -159,6 +184,16 @@ class Lexer(metaclass=LexerMeta):
         cls._ignored_tokens = set(cls._ignored_tokens)
         cls._token_funcs = dict(cls._token_funcs)
 
+        # Build a set of all remapped tokens
+        remapped_tokens = set()
+        for toks in cls._remapping.values():
+            remapped_tokens.update(toks.values())
+
+        undefined = remapped_tokens - cls._token_names
+        if undefined:
+            missing = ', '.join(undefined)
+            raise LexerBuildError(f'{missing} not included in token(s)')
+
         parts = []
         for tokname, value in cls._collect_rules(definitions):
             if tokname.startswith('ignore_'):
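Assuming the check above behaves as written, remapping to a token name that
was never declared should now fail when the class is defined, roughly like
this (a sketch, not output copied from a real run):

    from sly import Lexer

    class BadLexer(Lexer):
        tokens = { ID }                 # IF is deliberately missing
        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
        ID['if'] = IF                   # expected: LexerBuildError: IF not included in token(s)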
@@ -169,8 +204,10 @@
                 pattern = value
 
             elif callable(value):
-                pattern = value.pattern
                 cls._token_funcs[tokname] = value
+                pattern = getattr(value, 'pattern', None)
+                if not pattern:
+                    continue
 
             # Form the regular expression component
             part = f'(?P<{tokname}>{pattern})'
@@ -209,7 +246,7 @@
         _ignore = self.ignore
         _token_funcs = self._token_funcs
         _literals = self._literals
+        _remapping = self._remapping
         self.text = text
         try:
             while True:
@@ -228,6 +265,9 @@
                 index = m.end()
                 tok.value = m.group()
                 tok.type = m.lastgroup
+                if tok.type in _remapping:
+                    tok.type = _remapping[tok.type].get(tok.value, tok.type)
+
                 if tok.type in _token_funcs:
                     self.index = index
                     self.lineno = lineno

sly/yacc.py | 20
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # sly: yacc.py
 #
-# Copyright (C) 2016-2017
+# Copyright (C) 2016-2018
 # David M. Beazley (Dabeaz LLC)
 # All rights reserved.
 #
@@ -35,7 +35,7 @@ import sys
 import inspect
 from collections import OrderedDict, defaultdict
 
-__version__ = '0.2'
+__version__ = '0.3'
 __all__ = [ 'Parser' ]
 
 class YaccError(Exception):
@@ -55,12 +55,12 @@ ERROR_COUNT = 3 # Number of symbols that must be shifted to leave
 MAXINT = sys.maxsize
 
 # This object is a stand-in for a logging object created by the
-# logging module. PLY will use this by default to create things
+# logging module. SLY will use this by default to create things
 # such as the parser.out file. If a user wants more detailed
 # information, they can create their own logging object and pass
-# it into PLY.
+# it into SLY.
 
-class PlyLogger(object):
+class SlyLogger(object):
     def __init__(self, f):
         self.f = f
 
@@ -1552,7 +1552,7 @@ def _collect_grammar_rules(func):
 
     return grammar
 
-class ParserMetaDict(OrderedDict):
+class ParserMetaDict(dict):
     '''
     Dictionary that allows decorated grammar rule functions to be overloaded
     '''
@@ -1561,6 +1561,12 @@ class ParserMetaDict(OrderedDict):
             value.next_func = self[key]
         super().__setitem__(key, value)
 
+    def __getitem__(self, key):
+        if key not in self and key.isupper() and key[:1] != '_':
+            return key.upper()
+        else:
+            return super().__getitem__(key)
+
 class ParserMeta(type):
     @classmethod
     def __prepare__(meta, *args, **kwargs):
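The parser-side namespace mirrors the lexer trick: an unknown ALL-CAPS name
read inside a Parser class body resolves to its own upper-cased string, which
is what lets precedence tables be written without quotes, for example
(repeating the form used in the docs changes above):

    precedence = (
        ('left', PLUS, MINUS),      # equivalent to ('left', 'PLUS', 'MINUS')
        ('left', TIMES, DIVIDE),
        ('right', UMINUS),
    )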
@@ -1582,7 +1588,7 @@ class ParserMeta(type):
 
 class Parser(metaclass=ParserMeta):
     # Logging object where debugging/diagnostic messages are sent
-    log = PlyLogger(sys.stderr)
+    log = SlyLogger(sys.stderr)
 
     # Debugging filename where parsetab.out data can be written
     debugfile = None
@@ -99,8 +99,93 @@ def test_error_return():
     assert lexer.errors == [ ':+-' ]
 
 
+class ModernCalcLexer(Lexer):
+    # Set of token names. This is always required
+    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, LT, LE, IF, ELSE }
+    literals = { '(', ')' }
+
+    # String containing ignored characters between tokens
+    ignore = ' \t'
+
+    # Regular expression rules for tokens
+    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
+    ID['if'] = IF
+    ID['else'] = ELSE
+
+    NUMBER = r'\d+'
+    PLUS = r'\+'
+    MINUS = r'-'
+    TIMES = r'\*'
+    DIVIDE = r'/'
+    ASSIGN = r'='
+    LE = r'<='
+    LT = r'<'
+
+    def NUMBER(self, t):
+        t.value = int(t.value)
+        return t
+
+    # Ignored text
+    ignore_comment = r'\#.*'
+
+    @_(r'\n+')
+    def ignore_newline(self, t):
+        self.lineno += t.value.count('\n')
+
+    # Attached rule
+    def ID(self, t):
+        t.value = t.value.upper()
+        return t
+
+    def error(self, t):
+        self.errors.append(t.value)
+        self.index += 1
+        if hasattr(self, 'return_error'):
+            return t
+
+    def __init__(self):
+        self.errors = []
+
+
+# Test basic recognition of various tokens and literals
+def test_modern_tokens():
+    lexer = ModernCalcLexer()
+    toks = list(lexer.tokenize('abc if else 123 + - * / = < <= ( )'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    assert types == ['ID','IF','ELSE', 'NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
+    assert vals == ['ABC','if','else', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
+
+# Test ignored comments and newlines
+def test_modern_ignored():
+    lexer = ModernCalcLexer()
+    toks = list(lexer.tokenize('\n\n# A comment\n123\nabc\n'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    linenos = [t.lineno for t in toks]
+    assert types == ['NUMBER', 'ID']
+    assert vals == [123, 'ABC']
+    assert linenos == [4,5]
+    assert lexer.lineno == 6
+
+# Test error handling
+def test_modern_error():
+    lexer = ModernCalcLexer()
+    toks = list(lexer.tokenize('123 :+-'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    assert types == ['NUMBER', 'PLUS', 'MINUS']
+    assert vals == [123, '+', '-']
+    assert lexer.errors == [ ':+-' ]
+
+# Test error token return handling
+def test_modern_error_return():
+    lexer = ModernCalcLexer()
+    lexer.return_error = True
+    toks = list(lexer.tokenize('123 :+-'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    assert types == ['NUMBER', 'ERROR', 'PLUS', 'MINUS']
+    assert vals == [123, ':+-', '+', '-']
+    assert lexer.errors == [ ':+-' ]
 
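Assuming these tests live in SLY's pytest-based test suite (the test file's
path is not shown in this diff), they can be selected by name when running
the suite, for example:

    python -m pytest -k modern -v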
@@ -3,16 +3,7 @@ from sly import Lexer, Parser
 
 class CalcLexer(Lexer):
     # Set of token names. This is always required
-    tokens = {
-        'ID',
-        'NUMBER',
-        'PLUS',
-        'MINUS',
-        'TIMES',
-        'DIVIDE',
-        'ASSIGN',
-    }
+    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN }
 
     literals = { '(', ')' }
 
     # String containing ignored characters between tokens
@@ -38,8 +29,8 @@ class CalcLexer(Lexer):
     def newline(self, t):
         self.lineno += t.value.count('\n')
 
-    def error(self, value):
-        self.errors.append(value)
+    def error(self, t):
+        self.errors.append(t.value[0])
         self.index += 1
 
     def __init__(self):
@@ -49,9 +40,9 @@ class CalcParser(Parser):
     tokens = CalcLexer.tokens
 
     precedence = (
-        ('left', 'PLUS', 'MINUS'),
-        ('left', 'TIMES', 'DIVIDE'),
-        ('right', 'UMINUS'),
+        ('left', PLUS, MINUS),
+        ('left', TIMES, DIVIDE),
+        ('right', UMINUS),
         )
 
     def __init__(self):