parent 90a5484ea6
commit 0083477f01

--- a/docs/sly.rst
+++ b/docs/sly.rst
@@ -385,6 +385,25 @@ might be useful if the parser wants to see error tokens for some
 reason--perhaps for the purposes of improved error messages or
 some other kind of error handling.
 
+Third-Party Regex Module
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded:: 0.4
+
+The third-party `regex <https://pypi.org/project/regex/>`_ module can be used
+with sly. Like this::
+
+    from sly import Lexer
+    import regex
+
+    class MyLexer(Lexer):
+        regex_module = regex
+        ...
+
+Now all regular expressions that ``MyLexer`` uses will be handled with the
+``regex`` module. The ``regex_module`` can be set to any module that is
+compatible with Python's standard library ``re``.
+
 A More Complete Example
 ^^^^^^^^^^^^^^^^^^^^^^^
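The documentation above is the whole feature from the user's side. A minimal end-to-end sketch of what it enables (illustrative only; ``GreekLexer`` is not part of this commit, and the third-party ``regex`` module is assumed to be installed)::

    from sly import Lexer
    import regex

    class GreekLexer(Lexer):
        # Route pattern compilation through the regex module
        # instead of the standard library re.
        regex_module = regex

        tokens = { 'WORD' }
        ignore = ' \t'

        # \p{Ll} (Unicode "lowercase letter" property) is a
        # regex-module feature that plain re does not support.
        WORD = r'\p{Ll}+'

    for tok in GreekLexer().tokenize('αβγ δεζ'):
        print(tok.type, tok.value)   # WORD αβγ, then WORD δεζ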
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ try:
 except ImportError:
     from distutils.core import setup
 
-tests_require = ['pytest']
+tests_require = ['pytest', 'regex']
 
 setup(name = "sly",
       description="SLY - Sly Lex Yacc",
--- a/sly/lex.py
+++ b/sly/lex.py
@@ -186,6 +186,7 @@ class Lexer(metaclass=LexerMeta):
     literals = set()
     ignore = ''
     reflags = 0
+    regex_module = re
 
     _token_names = set()
     _token_funcs = {}
@@ -307,7 +308,7 @@ class Lexer(metaclass=LexerMeta):
 
                 # Make sure the individual regex compiles properly
                 try:
-                    cpat = re.compile(part, cls.reflags)
+                    cpat = cls.regex_module.compile(part, cls.reflags)
                 except Exception as e:
                     raise PatternError(f'Invalid regex for token {tokname}') from e
 
@@ -322,8 +323,8 @@ class Lexer(metaclass=LexerMeta):
 
         # Form the master regular expression
         #previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
-        # cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
-        cls._master_re = re.compile('|'.join(parts), cls.reflags)
+        # cls._master_re = cls.regex_module.compile('|'.join(parts) + previous, cls.reflags)
+        cls._master_re = cls.regex_module.compile('|'.join(parts), cls.reflags)
 
         # Verify that that ignore and literals specifiers match the input type
         if not isinstance(cls.ignore, str):
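The hunks above are the entire integration surface: the lexer only calls ``regex_module.compile(pattern, flags)`` and afterwards works with the compiled pattern objects, which is why any ``re``-compatible module can be dropped in. A hypothetical tracing shim makes that contract concrete (the shim is an illustration, not part of sly)::

    import re
    from sly import Lexer

    class TracingRe:
        # Any object exposing an re-style compile() satisfies the
        # regex_module hook introduced by this commit.
        @staticmethod
        def compile(pattern, flags=0):
            print('compiling:', pattern)
            return re.compile(pattern, flags)   # delegate to stdlib re

    class TracedLexer(Lexer):
        regex_module = TracingRe
        tokens = { 'NAME' }
        ignore = ' '
        NAME = r'[a-z]+'

Since the compile calls happen in ``_build``, the "compiling:" lines print once, when sly builds the class, not on every ``tokenize()`` call.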
--- a/tests/test_lex.py
+++ b/tests/test_lex.py
@@ -1,6 +1,11 @@
 import pytest
 from sly import Lexer
 
+try:
+    import regex
+except ImportError:
+    regex = None
+
 class CalcLexer(Lexer):
     # Set of token names. This is always required
     tokens = {
@@ -56,6 +61,29 @@ class CalcLexer(Lexer):
     def __init__(self):
         self.errors = []
 
+if regex is not None:
+    class RegexModuleCalcLexer(Lexer):
+        regex_module = regex
+
+        tokens = { 'ID', 'PLUS', 'MINUS' }
+
+        literals = { '(', ')' }
+        ignore = ' \t'
+
+        ID = r'\p{Ll}+'    # Unicode lowercase letters, regex module feature
+        PLUS = r'\+'
+        MINUS = r'-'
+
+        ignore_comment = r'\#.*'
+
+        @_(r'\n+')
+        def newline(self, t):
+            self.lineno += t.value.count('\n')
+
+        def ID(self, t):
+            t.value = t.value.upper()
+            return t
+
 # Test basic recognition of various tokens and literals
 def test_tokens():
     lexer = CalcLexer()
@@ -65,6 +93,17 @@ def test_tokens():
     assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
     assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
 
+# Test third-party regex module support
+@pytest.mark.skipif(regex is None,
+                    reason="third-party regex module not installed")
+def test_3rd_party_regex_module():
+    lexer = RegexModuleCalcLexer()
+    toks = list(lexer.tokenize('a + b - c'))
+    types = [t.type for t in toks]
+    vals = [t.value for t in toks]
+    assert types == ['ID','PLUS','ID','MINUS','ID']
+    assert vals == ['A', '+', 'B', '-', 'C']
+
 # Test ignored comments and newlines
 def test_ignored():
     lexer = CalcLexer()
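The new test hinges on a pattern that only compiles under ``regex``, which is also why the import at the top of the file is guarded and the test is skipped when the module is absent. The difference in a nutshell (illustrative)::

    import re
    import regex

    pat = r'\p{Ll}+'   # Unicode "lowercase letter" property

    print(regex.compile(pat).findall('abc DEF ghi'))   # ['abc', 'ghi']

    try:
        re.compile(pat)            # stdlib re has no \p{...} support
    except re.error as err:
        print('re rejects it:', err)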