From 0083477f016a4d24b9c870c04a3349ecf88c9729 Mon Sep 17 00:00:00 2001 From: Akuli Date: Sun, 17 Feb 2019 22:55:49 +0200 Subject: [PATCH] Add support for third-party regex module Fixes #26. --- docs/sly.rst | 19 +++++++++++++++++++ setup.py | 2 +- sly/lex.py | 7 ++++--- tests/test_lex.py | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 4 deletions(-) diff --git a/docs/sly.rst b/docs/sly.rst index c500d5a..8ee47a0 100644 --- a/docs/sly.rst +++ b/docs/sly.rst @@ -385,6 +385,25 @@ might be useful if the parser wants to see error tokens for some reason--perhaps for the purposes of improved error messages or some other kind of error handling. +Third-Party Regex Module +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. versionadded:: 0.4 + +The third-party `regex <https://pypi.org/project/regex/>`_ module can be used +with sly. Like this:: + + from sly import Lexer + import regex + + class MyLexer(Lexer): + regex_module = regex + ... + +Now all regular expressions that ``MyLexer`` uses will be handled with the +``regex`` module. The ``regex_module`` can be set to any module that is +compatible with Python's standard library ``re``. 
+ A More Complete Example ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/setup.py b/setup.py index d3a823c..dc9d05b 100755 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ try: except ImportError: from distutils.core import setup -tests_require = ['pytest'] +tests_require = ['pytest', 'regex'] setup(name = "sly", description="SLY - Sly Lex Yacc", diff --git a/sly/lex.py b/sly/lex.py index e6c8ac3..246dd9e 100644 --- a/sly/lex.py +++ b/sly/lex.py @@ -186,6 +186,7 @@ class Lexer(metaclass=LexerMeta): literals = set() ignore = '' reflags = 0 + regex_module = re _token_names = set() _token_funcs = {} @@ -307,7 +308,7 @@ class Lexer(metaclass=LexerMeta): # Make sure the individual regex compiles properly try: - cpat = re.compile(part, cls.reflags) + cpat = cls.regex_module.compile(part, cls.reflags) except Exception as e: raise PatternError(f'Invalid regex for token {tokname}') from e @@ -322,8 +323,8 @@ class Lexer(metaclass=LexerMeta): # Form the master regular expression #previous = ('|' + cls._master_re.pattern) if cls._master_re else '' - # cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags) - cls._master_re = re.compile('|'.join(parts), cls.reflags) + # cls._master_re = cls.regex_module.compile('|'.join(parts) + previous, cls.reflags) + cls._master_re = cls.regex_module.compile('|'.join(parts), cls.reflags) # Verify that that ignore and literals specifiers match the input type if not isinstance(cls.ignore, str): diff --git a/tests/test_lex.py b/tests/test_lex.py index 7c7421b..c7bf3e9 100644 --- a/tests/test_lex.py +++ b/tests/test_lex.py @@ -1,6 +1,11 @@ import pytest from sly import Lexer +try: + import regex +except ImportError: + regex = None + class CalcLexer(Lexer): # Set of token names. 
This is always required tokens = { @@ -56,6 +61,29 @@ class CalcLexer(Lexer): def __init__(self): self.errors = [] +if regex is not None: + class RegexModuleCalcLexer(Lexer): + regex_module = regex + + tokens = { 'ID', 'PLUS', 'MINUS' } + + literals = { '(', ')' } + ignore = ' \t' + + ID = r'\p{Ll}+' # Unicode lowercase letters, regex module feature + PLUS = r'\+' + MINUS = r'-' + + ignore_comment = r'\#.*' + + @_(r'\n+') + def newline(self, t): + self.lineno += t.value.count('\n') + + def ID(self, t): + t.value = t.value.upper() + return t + # Test basic recognition of various tokens and literals def test_tokens(): lexer = CalcLexer() @@ -65,6 +93,17 @@ def test_tokens(): assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')'] assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')'] +# Test third-party regex module support +@pytest.mark.skipif(regex is None, + reason="third-party regex module not installed") +def test_3rd_party_regex_module(): + lexer = RegexModuleCalcLexer() + toks = list(lexer.tokenize('a + b - c')) + types = [t.type for t in toks] + vals = [t.value for t in toks] + assert types == ['ID','PLUS','ID','MINUS','ID'] + assert vals == ['A', '+', 'B', '-', 'C'] + # Test ignored comments and newlines def test_ignored(): lexer = CalcLexer()