import pytest
from sly import Lexer

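# Tests for the SLY Lexer, covering the classic string-name token style
# (CalcLexer) and the modern bare-name style (ModernCalcLexer).
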
class CalcLexer(Lexer):
    # Set of token names. This is always required
    tokens = {
        'ID',
        'NUMBER',
        'PLUS',
        'MINUS',
        'TIMES',
        'DIVIDE',
        'ASSIGN',
        'LT',
        'LE',
    }

    literals = { '(', ')' }

    # String containing ignored characters between tokens
    ignore = ' \t'

    # Regular expression rules for tokens
    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
    PLUS = r'\+'
    MINUS = r'-'
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    LE = r'<='
    LT = r'<'

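    # A token can also be written as a method: the @_ decorator supplies the
    # regular expression and the body post-processes the matched text.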
    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t

    # Ignored text
    ignore_comment = r'\#.*'

    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')

    # Attached rule: post-processes each match of the ID pattern above
    def ID(self, t):
        t.value = t.value.upper()
        return t

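    # SLY calls error() with a token whose value is the remaining input text.
    # Bumping self.index skips past the offending character; returning the
    # token (here, only when return_error is set) emits it with type 'ERROR'.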
    def error(self, t):
        self.errors.append(t.value)
        self.index += 1
        if hasattr(self, 'return_error'):
            return t

    def __init__(self):
        self.errors = []

# Test basic recognition of various tokens and literals
def test_tokens():
    lexer = CalcLexer()
    toks = list(lexer.tokenize('abc 123 + - * / = < <= ( )'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    assert types == ['ID', 'NUMBER', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'ASSIGN', 'LT', 'LE', '(', ')']
    assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']

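# Token objects carry index/end offsets into the input string, so
# text[t.index:t.end] recovers each matched lexeme exactly.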
# Test position tracking
def test_positions():
    lexer = CalcLexer()
    text = 'abc\n( )'
    toks = list(lexer.tokenize(text))
    lines = [t.lineno for t in toks]
    indices = [t.index for t in toks]
    ends = [t.end for t in toks]
    values = [text[t.index:t.end] for t in toks]
    assert values == ['abc', '(', ')']
    assert lines == [1, 2, 2]
    assert indices == [0, 4, 6]
    assert ends == [3, 5, 7]

# Test ignored comments and newlines
def test_ignored():
    lexer = CalcLexer()
    toks = list(lexer.tokenize('\n\n# A comment\n123\nabc\n'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    linenos = [t.lineno for t in toks]
    assert types == ['NUMBER', 'ID']
    assert vals == [123, 'ABC']
    assert linenos == [4, 5]
    assert lexer.lineno == 6

# Test error handling
def test_error():
    lexer = CalcLexer()
    toks = list(lexer.tokenize('123 :+-'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    assert types == ['NUMBER', 'PLUS', 'MINUS']
    assert vals == [123, '+', '-']
    assert lexer.errors == [':+-']

# Test error token return handling
def test_error_return():
    lexer = CalcLexer()
    lexer.return_error = True
    toks = list(lexer.tokenize('123 :+-'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    assert types == ['NUMBER', 'ERROR', 'PLUS', 'MINUS']
    assert vals == [123, ':+-', '+', '-']
    assert lexer.errors == [':+-']

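# The "modern" style below leans on SLY's metaclass: bare names such as ID
# and NUMBER in the tokens set are resolved without quoting or prior
# definition.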
class ModernCalcLexer(Lexer):
    # Set of token names. This is always required
    tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, LT, LE, IF, ELSE }
    literals = { '(', ')' }

    # String containing ignored characters between tokens
    ignore = ' \t'

    # Regular expression rules for tokens
    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
    # Remap exact matches of the ID pattern to dedicated keyword tokens
    ID['if'] = IF
    ID['else'] = ELSE

    NUMBER = r'\d+'
    PLUS = r'\+'
    MINUS = r'-'
    TIMES = r'\*'
    DIVIDE = r'/'
    ASSIGN = r'='
    LE = r'<='
    LT = r'<'

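    # Defining a function with the same name as a pattern string assigned
    # above attaches it to that pattern, so no @_ decorator is needed here.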
    def NUMBER(self, t):
        t.value = int(t.value)
        return t

    # Ignored text
    ignore_comment = r'\#.*'

    @_(r'\n+')
    def ignore_newline(self, t):
        self.lineno += t.value.count('\n')

    # Attached rule: post-processes each match of the ID pattern above
    def ID(self, t):
        t.value = t.value.upper()
        return t

    def error(self, t):
        self.errors.append(t.value)
        self.index += 1
        if hasattr(self, 'return_error'):
            return t

    def __init__(self):
        self.errors = []

# Test basic recognition of various tokens and literals
def test_modern_tokens():
    lexer = ModernCalcLexer()
    toks = list(lexer.tokenize('abc if else 123 + - * / = < <= ( )'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    assert types == ['ID', 'IF', 'ELSE', 'NUMBER', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'ASSIGN', 'LT', 'LE', '(', ')']
    assert vals == ['ABC', 'if', 'else', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']

# Test ignored comments and newlines
def test_modern_ignored():
    lexer = ModernCalcLexer()
    toks = list(lexer.tokenize('\n\n# A comment\n123\nabc\n'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    linenos = [t.lineno for t in toks]
    assert types == ['NUMBER', 'ID']
    assert vals == [123, 'ABC']
    assert linenos == [4, 5]
    assert lexer.lineno == 6

# Test error handling
def test_modern_error():
    lexer = ModernCalcLexer()
    toks = list(lexer.tokenize('123 :+-'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    assert types == ['NUMBER', 'PLUS', 'MINUS']
    assert vals == [123, '+', '-']
    assert lexer.errors == [':+-']

# Test error token return handling
def test_modern_error_return():
    lexer = ModernCalcLexer()
    lexer.return_error = True
    toks = list(lexer.tokenize('123 :+-'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    assert types == ['NUMBER', 'ERROR', 'PLUS', 'MINUS']
    assert vals == [123, ':+-', '+', '-']
    assert lexer.errors == [':+-']
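
# Optional convenience entry point (an addition, not part of the original
# suite): lets this file be run directly as well as through the pytest CLI.
if __name__ == '__main__':
    raise SystemExit(pytest.main([__file__]))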