parent 90a5484ea6
commit 0083477f01

--- a/docs/sly.rst
+++ b/docs/sly.rst
@@ -385,6 +385,25 @@ might be useful if the parser wants to see error tokens for some
 reason--perhaps for the purposes of improved error messages or
 some other kind of error handling.
 
+Third-Party Regex Module
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded:: 0.4
+
+The third-party `regex <https://pypi.org/project/regex/>`_ module can be used
+with sly. Like this::
+
+    from sly import Lexer
+    import regex
+
+    class MyLexer(Lexer):
+        regex_module = regex
+        ...
+
+Now all regular expressions that ``MyLexer`` uses will be handled with the
+``regex`` module. The ``regex_module`` can be set to any module that is
+compatible with Python's standard library ``re``.
+
 A More Complete Example
 ^^^^^^^^^^^^^^^^^^^^^^^
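The documentation above is the whole feature from the user's side. A minimal end-to-end sketch of what it enables (illustrative only; ``GreekLexer`` is not part of this commit, and the third-party ``regex`` module is assumed to be installed)::

    from sly import Lexer
    import regex

    class GreekLexer(Lexer):
        # Route pattern compilation through the regex module
        # instead of the standard library re.
        regex_module = regex

        tokens = { 'WORD' }
        ignore = ' \t'

        # \p{Ll} (Unicode "lowercase letter" property) is a
        # regex-module feature that plain re does not support.
        WORD = r'\p{Ll}+'

    for tok in GreekLexer().tokenize('αβγ δεζ'):
        print(tok.type, tok.value)   # WORD αβγ, then WORD δεζ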
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ try:
 except ImportError:
     from distutils.core import setup
 
-tests_require = ['pytest']
+tests_require = ['pytest', 'regex']
 
 setup(name = "sly",
       description="SLY - Sly Lex Yacc",
--- a/sly/lex.py
+++ b/sly/lex.py
@@ -186,6 +186,7 @@ class Lexer(metaclass=LexerMeta):
     literals = set()
     ignore = ''
     reflags = 0
+    regex_module = re
 
     _token_names = set()
     _token_funcs = {}
@@ -307,7 +308,7 @@ class Lexer(metaclass=LexerMeta):
 
                 # Make sure the individual regex compiles properly
                 try:
-                    cpat = re.compile(part, cls.reflags)
+                    cpat = cls.regex_module.compile(part, cls.reflags)
                 except Exception as e:
                     raise PatternError(f'Invalid regex for token {tokname}') from e
 
@@ -322,8 +323,8 @@ class Lexer(metaclass=LexerMeta):
 
         # Form the master regular expression
         #previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
-        # cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
-        cls._master_re = re.compile('|'.join(parts), cls.reflags)
+        # cls._master_re = cls.regex_module.compile('|'.join(parts) + previous, cls.reflags)
+        cls._master_re = cls.regex_module.compile('|'.join(parts), cls.reflags)
 
         # Verify that that ignore and literals specifiers match the input type
         if not isinstance(cls.ignore, str):
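The hunks above are the entire integration surface: the lexer only calls ``regex_module.compile(pattern, flags)`` and afterwards works with the compiled pattern objects, which is why any ``re``-compatible module can be dropped in. A hypothetical tracing shim makes that contract concrete (the shim is an illustration, not part of sly)::

    import re
    from sly import Lexer

    class TracingRe:
        # Any object exposing an re-style compile() satisfies the
        # regex_module hook introduced by this commit.
        @staticmethod
        def compile(pattern, flags=0):
            print('compiling:', pattern)
            return re.compile(pattern, flags)   # delegate to stdlib re

    class TracedLexer(Lexer):
        regex_module = TracingRe
        tokens = { 'NAME' }
        ignore = ' '
        NAME = r'[a-z]+'

Since the compile calls happen in ``_build``, the "compiling:" lines print once, when sly builds the class, not on every ``tokenize()`` call.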
--- a/tests/test_lex.py
+++ b/tests/test_lex.py
@@ -1,6 +1,11 @@
 import pytest
 from sly import Lexer
 
+try:
+    import regex
+except ImportError:
+    regex = None
+
 class CalcLexer(Lexer):
     # Set of token names. This is always required
     tokens = {
@@ -56,6 +61,29 @@ class CalcLexer(Lexer):
     def __init__(self):
         self.errors = []
 
+if regex is not None:
+    class RegexModuleCalcLexer(Lexer):
+        regex_module = regex
+
+        tokens = { 'ID', 'PLUS', 'MINUS' }
+
+        literals = { '(', ')' }
+        ignore = ' \t'
+
+        ID = r'\p{Ll}+'    # Unicode lowercase letters, regex module feature
+        PLUS = r'\+'
+        MINUS = r'-'
+
+        ignore_comment = r'\#.*'
+
+        @_(r'\n+')
+        def newline(self, t):
+            self.lineno += t.value.count('\n')
+
+        def ID(self, t):
+            t.value = t.value.upper()
+            return t
+
 # Test basic recognition of various tokens and literals
 def test_tokens():
     lexer = CalcLexer()
@@ -65,6 +93,17 @@ def test_tokens():
     assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
     assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
 
+# Test third-party regex module support
+@pytest.mark.skipif(regex is None,
+                    reason="third-party regex module not installed")
+def test_3rd_party_regex_module():
+    lexer = RegexModuleCalcLexer()
+    toks = list(lexer.tokenize('a + b - c'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    assert types == ['ID','PLUS','ID','MINUS','ID']
+    assert vals == ['A', '+', 'B', '-', 'C']
+
 # Test ignored comments and newlines
 def test_ignored():
     lexer = CalcLexer()
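The new test hinges on a pattern that only compiles under ``regex``, which is also why the import at the top of the file is guarded and the test is skipped when the module is absent. The difference in a nutshell (illustrative)::

    import re
    import regex

    pat = r'\p{Ll}+'   # Unicode "lowercase letter" property

    print(regex.compile(pat).findall('abc DEF ghi'))   # ['abc', 'ghi']

    try:
        re.compile(pat)            # stdlib re has no \p{...} support
    except re.error as err:
        print('re rejects it:', err)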