Compare commits

...

63 Commits
0.1 ... master

Author SHA1 Message Date
David Beazley
539a85a5d5
Merge pull request #109 from shadchin/patch-1
Fix license classifier
2024-06-05 09:15:25 -05:00
Alexander Shadchin
069f6d7766
Fix license classifier 2024-06-05 13:04:43 +03:00
David Beazley
33d4f5afc0 Clarification on retirement 2023-02-14 10:43:21 -06:00
David Beazley
4000988231 Fix to manifest.in 2022-10-25 09:31:43 -05:00
David Beazley
b453ea1854 Updated long description 2022-10-25 09:29:14 -05:00
David Beazley
004df26293 Minor edits. Added Makefile 2022-10-25 09:28:13 -05:00
David Beazley
dbcf6d0f7f Added retirement notice 2022-10-11 08:44:48 -05:00
David Beazley
dd71d70882 Packaging reorganization 2022-09-06 20:15:16 -05:00
David Beazley
62203d8b75 Various work in progress. Position tracking 2022-09-06 19:38:33 -05:00
David Beazley
cd9014eda2 Fixed issue #94 2022-03-25 15:49:11 -05:00
David Beazley
f8fcbb080c Added test file 2021-06-03 19:44:13 -05:00
David Beazley
06b92e3e73 Fixed CHANGES 2020-07-29 04:29:30 -05:00
David Beazley
1f87ddaf39 Added EBNF choice handling 2020-05-09 12:24:33 -05:00
David Beazley
ab75a58b10 Naming enhancement 2020-05-09 09:23:26 -05:00
David Beazley
a128245cd3
Merge pull request #51 from abhaikollara/abhaikollara-patch-1
Minor typo fix
2020-04-29 19:59:02 -05:00
Abhai Kollara Dilip
cf35258d6f
Update sly.rst 2020-04-29 22:06:44 +05:30
David Beazley
78dc087c45 Minor edit 2020-03-12 08:37:41 -05:00
David Beazley
8a67ae946b Fixed doc typo 2020-03-07 13:16:04 -06:00
David Beazley
39ffd0361a Refinement of EBNF extensions 2020-03-07 06:28:19 -06:00
David Beazley
a2cdf52d0f Experimental EBNF features added 2020-03-06 20:58:48 -06:00
David Beazley
9944b6239c
Merge pull request #47 from lordmauve/patch-1
Update date for talk in README
2020-02-17 06:58:40 -06:00
Daniel Pope
aead4b43be
Update date for talk in README
The date for the Reinventing the Parser Generator talk was 2018 according to the YouTube title.
2020-02-17 12:03:36 +00:00
David Beazley
d9c763d2f7 Fixed mysterious error message if start defined as a function 2019-04-09 17:18:29 -05:00
David Beazley
5bf4d9707d Fixed mysterious 'unknown conflict' error 2019-04-09 17:10:00 -05:00
David Beazley
a728a23adf Added Wasm example 2019-02-17 19:48:18 -06:00
David Beazley
5a7f8ab652
Merge pull request #27 from Akuli/regex-module
Add support for third-party regex module
2019-02-17 19:42:41 -06:00
Akuli
0083477f01 Add support for third-party regex module
Fixes #26.
2019-02-17 22:55:49 +02:00
David Beazley
90a5484ea6 Minor edit 2018-11-24 10:23:07 -06:00
David Beazley
b8e5ac3f6b Added license 2018-11-24 10:21:25 -06:00
David Beazley
ac8e0ecba1 Fixed README 2018-11-24 10:21:05 -06:00
David Beazley
6cdb4d8ca7 Added talks. Minor reformatting 2018-11-24 10:19:37 -06:00
David Beazley
a84811539e Added RST README 2018-11-24 10:15:02 -06:00
David Beazley
d87f21a138
Merge pull request #18 from danshorstein/hotfix/readme-update
Fix bug in example on README
2018-11-21 04:18:05 -06:00
Dan Shorstein
dd41efd112 Fix bug in example on README to match example in calc.py; NUMBER was not returned as int 2018-11-20 22:12:31 -05:00
David Beazley
d1b9f64e49 Bumped version number 2018-11-18 07:57:34 -06:00
David Beazley
52c993e00c Merge branch 'master' of https://github.com/dabeaz/sly 2018-11-18 07:21:52 -06:00
David Beazley
2be28d29a5 Added contributing file 2018-11-18 07:20:26 -06:00
David Beazley
66b6bd73f8
Merge pull request #17 from sfingram/master
Minor typo in docs
2018-11-18 06:48:46 -06:00
David Beazley
503fae9e18 Various usability improvements 2018-11-18 06:42:22 -06:00
David Beazley
0ac3c1a0a3 Improved error checking and messages 2018-11-16 14:50:33 -06:00
David Beazley
16d700b310 Some warning message refinements 2018-11-13 19:28:51 -06:00
Stephen Ingram
e54c5dfcea Minor typo 2018-11-10 08:56:54 -05:00
David Beazley
5fdc971f36 Added docparse 2018-09-27 14:22:51 -05:00
David Beazley
6a27431f81 Fixed Issue #14. 2018-09-08 15:26:00 -05:00
David Beazley
a33ff221e5 Added getattr() support 2018-09-06 07:59:39 -05:00
David Beazley
3f7240b9a2 Updated CHANGES 2018-07-07 13:54:42 -05:00
David Beazley
995d0ecff1
Merge pull request #8 from xpvpc/master
cosmetic changes to docs
2018-05-19 06:29:13 -05:00
xpvpc
715222a0fc remove trailing whitespace 2018-05-14 15:44:21 +02:00
xpvpc
fb43a50f8a fix typos in docs 2018-05-14 15:43:42 +02:00
David Beazley
1251da034a Improvements to lexer inheritance 2018-04-01 20:06:27 -05:00
David Beazley
c5659a4465 Some work in progress on Lexer inheritance. Everything kind of broken 2018-03-30 14:23:34 -05:00
David Beazley
3a0ee0d9c1 WIP 2018-03-30 08:29:34 -05:00
David Beazley
08988d2798 Initial work on lexer states (in progress) 2018-03-29 17:51:58 -05:00
David Beazley
d0e34417bc WIP 2018-01-30 07:42:27 -06:00
David Beazley
51b01d8335 Version bump 2018-01-27 15:28:17 -06:00
David Beazley
b088d9b2ce Changes to token specification. More metamagic 2018-01-27 15:27:15 -06:00
David Beazley
b74e7223ce Added extra validation check in Lexer construction 2018-01-16 08:30:09 -06:00
David Beazley
e9346daff0 Fix default Lexer error() 2018-01-15 10:58:55 -06:00
David Beazley
cdd7a082a4 Cleanup 2018-01-10 06:16:15 -06:00
David Beazley
e05748494c Changes for 0.2 2018-01-10 06:09:20 -06:00
David Beazley
d8903d8301 Added Lexer state change 2017-09-01 06:31:51 -05:00
David Beazley
636197b9fd Merge pull request #5 from cdeil/patch-1
Fix typo in Token repr
2017-05-28 12:12:11 -05:00
Christoph Deil
b71fbdafe3 Fix typo in Token repr 2017-05-20 22:24:55 +02:00
29 changed files with 3533 additions and 771 deletions

257
CHANGES Normal file

@ -0,0 +1,257 @@
Version 0.5
-----------
10/25/2022 ***IMPORTANT NOTE*** This is the last release to be made
on PyPI. If you want the latest version, go to
https://github.com/dabeaz/sly.
09/06/2022 Modernization of the packaging infrastructure. Slight
project reorganization.
03/25/2022 Added automatic location tracking to the parser. Use
Parser.line_position(value) to return the line number
and Parser.index_position(value) to return a (start, end)
index pair. value is *any* object returned by one of
the various methods in the parser definition. Typically,
it would be a AST node. The parser tracks the data using
the value of id(value).
03/25/2022 Added .end attribute to tokens that specify the ending
index of the matching text. This is used to do more
precise location tracking for the purpose of issuing
more useful error messages.
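For example (a sketch; BinOp is a hypothetical AST class and the
grammar rule is only illustrative; line_position() and
index_position() are the parser methods described above):

    @_('expr PLUS expr')
    def expr(self, p):
        return BinOp('+', p.expr0, p.expr1)

    # After parsing, location data can be recovered from any value
    # returned by a rule method:
    #
    #     node = parser.parse(lexer.tokenize(text))
    #     lineno = parser.line_position(node)
    #     start, end = parser.index_position(node)
    #
    # Tokens also carry .index and .end giving the start/end index of
    # the matched text.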
05/09/2020 Experimental support for EBNF choices. For example:
    @('term { PLUS|MINUS term }')
    def expr(self, p):
        lterm = p.term0
        for op, rterm in p[1]:
            lterm = BinOp(op, lterm, rterm)
One issue here is just how one refers to the choice
of values. There is no unified name to pick. So,
you basically have to do it using a numeric index like p[1].
In this case, p[1] is a list of all of the repeated items
(represented as tuples).
05/09/2020 Changed the internal names used for EBNF rules to make them
a bit easier to debug in the parser.out file.
Version 0.4
-----------
03/06/2020 Added experimental support for EBNF repetition and optional
syntax. For example, here is a rule for a comma-separated
expression list:
    @('expr { COMMA expr }')
    def exprlist(self, p):
        return [ p.expr0 ] + p.expr1
In this code, the { ... } means zero-or-more repetitions.
It turns all symbols inside into lists. So, instead of
representing a single value, p.expr1 is now a list of
values.
An optional value can be enclosed in brackets like this:
    @('VAR NAME [ EQUAL expr ] SEMI')
    def variable_declaration(self, p):
        print(f"Defining {p.NAME}. Initial value={p.expr}")
In this case, all symbols inside [ ... ] either have a value
if present or are assigned to None if missing.
In both cases, you continue to use the same name indexing
scheme used by the rest of SLY. For example, in the first
example above, you use "expr0" and "expr1" to refer to the
different "expr" symbols since that name appears in more
than one place.
04/09/2019 Fixed a very mysterious error message that resulted if you
defined a grammar rule called "start". start can now
be a string or a function.
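For example (a sketch; MyLexer and the statement rule are
placeholders):

    class MyParser(Parser):
        tokens = MyLexer.tokens
        start = 'statement'        # start given as a string

        # ...or, per this fix, a grammar rule named start also works:
        # @_('statement')
        # def start(self, p):
        #     return p.statement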
04/09/2019 Minor refinement to the reporting of reduce/reduce conflicts.
If a top grammar rule wasn't specified, SLY could fail with
a mysterious "unknown conflict" exception. This should be
fixed.
11/18/2018 Various usability fixes observed from the last compilers course.
- Errors encountered during grammar construction are now
reported as part of the raised GrammarError exception
instead of via logging. This places them in the same
visual position as normal Python errors (at the end
of the traceback)
- Repeated warning messages about unused tokens have
been consolidated in a single warning message to make
the output less verbose.
- Grammar attributes (e.g., p.TOKEN) used during parsing
are now read-only.
- The error about "infinite recursion" is only checked
if there are no undefined grammar symbols. Sometimes
you'd get this message and be confused when the only
mistake was a bad token name or similar.
9/8/2018 Fixed Issue #14. YaccProduction index property causes
AttributeError if index is 0
9/5/2018 Added support for getattr() and related functions on
productions.
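A sketch of what this enables (the rule and the names dictionary are
illustrative):

    @_('NAME ASSIGN expr',
       'expr')
    def statement(self, p):
        # p.NAME exists only for the first alternative; getattr() now
        # offers a safe way to probe for it.
        name = getattr(p, 'NAME', None)
        if name is not None:
            self.names[name] = p.expr
        else:
            print(p.expr)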
Version 0.3
-----------
4/1/2018 Support for Lexer inheritance added. For example:
    from sly import Lexer

    class BaseLexer(Lexer):
        tokens = { NAME, NUMBER }
        ignore = ' \t'
        NAME = r'[a-zA-Z]+'
        NUMBER = r'\d+'

    class ChildLexer(BaseLexer):
        tokens = { PLUS, MINUS }
        PLUS = r'\+'
        MINUS = r'-'
In this example, the ChildLexer class gets all of the tokens
from the parent class (BaseLexer) in addition to the new
definitions it adds of its own.
One quirk of Lexer inheritance is that definition order has
an impact on the low-level regular expression parsing. By
default new definitions are always processed AFTER any previous
definitions. You can change this using the before() function
like this:
    class GrandChildLexer(ChildLexer):
        tokens = { PLUSPLUS, MINUSMINUS }
        PLUSPLUS = before(PLUS, r'\+\+')
        MINUSMINUS = before(MINUS, r'--')
In this example, the PLUSPLUS token is checked before the
PLUS token in the base class. Thus, an input text of '++'
will be parsed as a single token PLUSPLUS, not two PLUS tokens.
4/1/2018 Better support for lexing states. Each lexing state can be defined
as a separate class. Use the begin(cls) method to switch to a
different state. For example:
    from sly import Lexer

    class LexerA(Lexer):
        tokens = { NAME, NUMBER, LBRACE }
        ignore = ' \t'
        NAME = r'[a-zA-Z]+'
        NUMBER = r'\d+'
        LBRACE = r'\{'

        def LBRACE(self, t):
            self.begin(LexerB)
            return t

    class LexerB(Lexer):
        tokens = { PLUS, MINUS, RBRACE }
        ignore = ' \t'
        PLUS = r'\+'
        MINUS = r'-'
        RBRACE = r'\}'

        def RBRACE(self, t):
            self.begin(LexerA)
            return t
In this example, LexerA switches to a new state LexerB when
a left brace ({) is encountered. The begin() method causes
the state transition. LexerB switches back to state LexerA
when a right brace (}) is encountered.
As an alternative to the begin() method, you can also use the
push_state(cls) and pop_state() methods. These manage the lexing
states as a stack. The pop_state() method returns to the previous
lexing state.
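A minimal sketch, reusing the LexerA/LexerB classes above (assuming
pop_state() takes no argument, as described):

    class LexerA(Lexer):
        ...
        def LBRACE(self, t):
            self.push_state(LexerB)   # like begin(), but remembers LexerA
            return t

    class LexerB(Lexer):
        ...
        def RBRACE(self, t):
            self.pop_state()          # return to the state that pushed us
            return t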
1/27/2018 Tokens no longer have to be specified as strings. For example, you
can now write:
    from sly import Lexer

    class TheLexer(Lexer):
        tokens = { ID, NUMBER, PLUS, MINUS }
        ID     = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NUMBER = r'\d+'
        PLUS   = r'\+'
        MINUS  = r'-'
This convention also carries over to the parser for things such
as precedence specifiers:
    from sly import Parser

    class TheParser(Parser):
        tokens = TheLexer.tokens
        precedence = (
            ('left', PLUS, MINUS),
            ('left', TIMES, DIVIDE),
            ('right', UMINUS),
        )
        ...
Nevermind the fact that ID, NUMBER, PLUS, and MINUS appear to be
undefined identifiers. It all works.
1/27/2018 Tokens now allow special-case remapping. For example:
    from sly import Lexer

    class TheLexer(Lexer):
        tokens = { ID, IF, ELSE, WHILE, NUMBER, PLUS, MINUS }
        ID     = r'[a-zA-Z_][a-zA-Z0-9_]*'
        ID['if']    = IF
        ID['else']  = ELSE
        ID['while'] = WHILE
        NUMBER = r'\d+'
        PLUS   = r'\+'
        MINUS  = r'-'
In this code, the ID rule matches any identifier. However,
special cases have been made for IF, ELSE, and WHILE tokens.
Previously, this had to be handled in a special action method
such as this:
    def ID(self, t):
        if t.value in { 'if', 'else', 'while' }:
            t.type = t.value.upper()
        return t
Nevermind the fact that the syntax appears to suggest that strings
work as a kind of mutable mapping.
1/16/2018 Usability improvement on Lexer class. Regular expression rules
specified as strings that don't match any name in tokens are
now reported as errors.
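For example, a sketch of the kind of mistake that is now caught (the
names are illustrative):

    class BadLexer(Lexer):
        tokens = { NUMBER }
        NUMBER = r'\d+'
        NUMBR  = r'\d+'    # typo: no such token name -> reported as an error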
Version 0.2
-----------
12/24/2017 The error(self, t) method of lexer objects now receives a
token as input. The value attribute of this token contains
all remaining input text. If the passed token is returned
by error(), then it shows up in the token stream where it
can be processed by the parser.
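For example, a minimal sketch of a handler that skips bad characters
but still forwards them to the parser:

    class MyLexer(Lexer):
        ...
        def error(self, t):
            print("Illegal character '%s'" % t.value[0])
            self.index += 1
            return t    # the returned token appears in the token stream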

45
CONTRIBUTING.md Normal file

@ -0,0 +1,45 @@
Contributing to SLY
===================
SLY, like most projects related to parser generators, is a niche
project. Although it appears to be a somewhat "new" project, it is
actually an outgrowth of the PLY project which has been around since
2001. Contributions of most kinds that make it better are
welcome--this includes code, documentation, examples, and feature
requests.
There aren't too many formal guidelines. If submitting a bug report,
any information that helps to reproduce the problem will be handy. If
submitting a pull request, try to make sure that SLY's test suite
still passes. Even if that's not the case though, that's okay--a
failed test might be something very minor that can be fixed up after a
merge.
Project Scope
-------------
It is not my goal to turn SLY into a gigantic parsing framework with
every possible feature. What you see here is pretty much what it is--a
basic LALR(1) parser generator and tokenizer. If you've built something
useful that uses SLY or builds upon it, it's probably better served by
its own repository. Feel free to submit a pull request to the SLY README
file that includes a link to your project.
The SLY "Community" (or lack thereof)
-------------------------------------
As noted, parser generator tools are a highly niche area. It is
important to emphasize that SLY is very much a side-project for
me. No funding is received for this work. I also run a business and
have a family with kids. These things have higher priority. As such,
there may be periods in which there is little activity on pull
requests, issues, and other development matters. Sometimes you might
only see a flurry of activity around the times when I use SLY in
a compilers course that I teach. Do not mistake "inaction" for
"disinterest." I am definitely interested in improving SLY--it's
just not practical for me to give it my undivided attention.
Important Note
--------------
As a general rule, pull requests related to third-party tooling (IDEs,
type-checkers, linters, code formatters, etc.) will not be accepted.
If you think something should be changed/improved in this regard,
please submit an issue instead.

39
LICENSE Normal file

@ -0,0 +1,39 @@
SLY (Sly Lex-Yacc)
Copyright (C) 2016-2022
David M. Beazley (Dabeaz LLC)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the David Beazley or Dabeaz LLC may be used to
endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

4
MANIFEST.in Normal file

@ -0,0 +1,4 @@
include Makefile CONTRIBUTING.md
recursive-include example *
recursive-include tests *
recursive-include docs *

21
Makefile Normal file

@ -0,0 +1,21 @@
PYTHON=python3
VENV=.venv
# Setup and install all of the required tools for building, testing,
# and deploying
setup::
	rm -rf $(VENV)
	$(PYTHON) -m venv $(VENV)
	./$(VENV)/bin/python -m pip install pytest
	./$(VENV)/bin/python -m pip install pytest-cov
	./$(VENV)/bin/python -m pip install build
	./$(VENV)/bin/python -m pip install twine

# Run unit tests
test::
	./$(VENV)/bin/python -m pip install .
	./$(VENV)/bin/python -m pytest --cov

# Build an artifact suitable for installing with pip
build::
	./$(VENV)/bin/python -m build
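A usage sketch, assuming GNU make and a python3 on the PATH: run make setup once to create the virtual environment and install the tooling, then make test to install sly into it and run pytest with coverage, and make build to produce distributable artifacts.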

207
README.md

@ -1,207 +0,0 @@
SLY (Sly Lex-Yacc) Version 0.1
Copyright (C) 2016-2017
David M. Beazley (Dabeaz LLC)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the David Beazley or Dabeaz LLC may be used to
endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CAUTION
=======
THIS IS A WORK IN PROGRESS. NO OFFICIAL RELEASE HAS BEEN MADE.
USE AT YOUR OWN RISK.
Requirements
============
SLY requires the use of Python 3.6 or greater. Older versions
of Python are not supported.
Introduction
============
SLY is a 100% Python implementation of the lex and yacc tools
commonly used to write parsers and compilers. Parsing is
based on the same LALR(1) algorithm used by many yacc tools.
Here are a few notable features:
- SLY provides *very* extensive error reporting and diagnostic
information to assist in parser construction. The original
implementation was developed for instructional purposes. As
a result, the system tries to identify the most common types
of errors made by novice users.
- SLY provides full support for empty productions, error recovery,
precedence specifiers, and moderately ambiguous grammars.
- SLY uses various Python metaprogramming features to specify
lexers and parsers. There are no generated files or extra
steps involved. You simply write Python code and run it.
- SLY can be used to build parsers for "real" programming languages.
Although it is not ultra-fast due to its Python implementation,
SLY can be used to parse grammars consisting of several hundred
rules (as might be found for a language like C).
SLY originates from the PLY project (http://www.dabeaz.com/ply/index.html).
However, it's been modernized a bit. In fact, don't expect any code
previously written for PLY to work. That said, most of the things
that were possible in PLY are also possible in SLY.
An Example
==========
SLY is probably best illustrated by an example. Here's what it
looks like to write a parser that can evaluate simple arithmetic
expressions and store variables:
# -----------------------------------------------------------------------------
# calc.py
# -----------------------------------------------------------------------------
from sly import Lexer, Parser
class CalcLexer(Lexer):
tokens = {
'NAME', 'NUMBER',
}
ignore = ' \t'
literals = { '=', '+', '-', '*', '/', '(', ')' }
# Tokens
NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
@_(r'\d+')
def NUMBER(self, t):
t.value = int(t.value)
return t
@_(r'\n+')
def newline(self, t):
self.lineno += t.value.count('\n')
def error(self, value):
print("Illegal character '%s'" % value[0])
self.index += 1
class CalcParser(Parser):
tokens = CalcLexer.tokens
precedence = (
('left', '+', '-'),
('left', '*', '/'),
('right', 'UMINUS'),
)
def __init__(self):
self.names = { }
@_('NAME "=" expr')
def statement(self, p):
self.names[p.NAME] = p.expr
@_('expr')
def statement(self, p):
print(p.expr)
@_('expr "+" expr')
def expr(self, p):
return p.expr0 + p.expr1
@_('expr "-" expr')
def expr(self, p):
return p.expr0 - p.expr1
@_('expr "*" expr')
def expr(self, p):
return p.expr0 * p.expr1
@_('expr "/" expr')
def expr(self, p):
return p.expr0 / p.expr1
@_('"-" expr %prec UMINUS')
def expr(self, p):
return -p.expr
@_('"(" expr ")"')
def expr(self, p):
return p.expr
@_('NUMBER')
def expr(self, p):
return p.NUMBER
@_('NAME')
def expr(self, p):
try:
return self.names[p.NAME]
except LookupError:
print("Undefined name '%s'" % p.NAME)
return 0
if __name__ == '__main__':
lexer = CalcLexer()
parser = CalcParser()
while True:
try:
text = input('calc > ')
except EOFError:
break
if text:
parser.parse(lexer.tokenize(text))
Documentation
=============
Further documentation can be found at https://sly.readthedocs.io/en/latest
Resources
=========
For a detailed overview of parsing theory, consult the excellent
book "Compilers : Principles, Techniques, and Tools" by Aho, Sethi, and
Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown
may also be useful.
The GitHub page for SLY can be found at:
https://github.com/dabeaz/sly
Please direct bug reports and pull requests to the GitHub page.
To contact me directly, send email to dave@dabeaz.com or contact
me on Twitter (@dabeaz).
-- Dave

197
README.rst Normal file

@ -0,0 +1,197 @@
SLY (Sly Lex-Yacc)
==================
SLY is a 100% Python implementation of the lex and yacc tools
commonly used to write parsers and compilers. Parsing is
based on the same LALR(1) algorithm used by many yacc tools.
Here are a few notable features:
- SLY provides *very* extensive error reporting and diagnostic
information to assist in parser construction. The original
implementation was developed for instructional purposes. As
a result, the system tries to identify the most common types
of errors made by novice users.
- SLY provides full support for empty productions, error recovery,
precedence specifiers, and moderately ambiguous grammars.
- SLY uses various Python metaprogramming features to specify
lexers and parsers. There are no generated files or extra
steps involved. You simply write Python code and run it.
- SLY can be used to build parsers for "real" programming languages.
Although it is not ultra-fast due to its Python implementation,
SLY can be used to parse grammars consisting of several hundred
rules (as might be found for a language like C).
SLY originates from the `PLY project <http://www.dabeaz.com/ply/index.html>`_.
However, it's been modernized a bit. In fact, don't expect any code
previously written for PLY to work. That said, most of the things
that were possible in PLY are also possible in SLY.
SLY is a modern library for performing lexing and parsing. It
implements the LALR(1) parsing algorithm, commonly used for
parsing and compiling various programming languages.
Important Notice : October 11, 2022
-----------------------------------
The SLY project is no longer making package-installable releases.
It's fully functional, but if you choose to use it, you should
vendor the code into your application. SLY has zero dependencies.
Although I am semi-retiring the project, I will respond to
bug reports and still may decide to make future changes to it
depending on my mood. I'd like to thank everyone who
has contributed to it over the years. --Dave
Requirements
------------
SLY requires the use of Python 3.6 or greater. Older versions
of Python are not supported.
An Example
----------
SLY is probably best illustrated by an example. Here's what it
looks like to write a parser that can evaluate simple arithmetic
expressions and store variables:
.. code:: python
    # -----------------------------------------------------------------------------
    # calc.py
    # -----------------------------------------------------------------------------

    from sly import Lexer, Parser

    class CalcLexer(Lexer):
        tokens = { NAME, NUMBER, PLUS, TIMES, MINUS, DIVIDE, ASSIGN, LPAREN, RPAREN }
        ignore = ' \t'

        # Tokens
        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NUMBER = r'\d+'

        # Special symbols
        PLUS = r'\+'
        MINUS = r'-'
        TIMES = r'\*'
        DIVIDE = r'/'
        ASSIGN = r'='
        LPAREN = r'\('
        RPAREN = r'\)'

        # Ignored pattern
        ignore_newline = r'\n+'

        # Extra action for newlines
        def ignore_newline(self, t):
            self.lineno += t.value.count('\n')

        def error(self, t):
            print("Illegal character '%s'" % t.value[0])
            self.index += 1

    class CalcParser(Parser):
        tokens = CalcLexer.tokens

        precedence = (
            ('left', PLUS, MINUS),
            ('left', TIMES, DIVIDE),
            ('right', UMINUS),
            )

        def __init__(self):
            self.names = { }

        @_('NAME ASSIGN expr')
        def statement(self, p):
            self.names[p.NAME] = p.expr

        @_('expr')
        def statement(self, p):
            print(p.expr)

        @_('expr PLUS expr')
        def expr(self, p):
            return p.expr0 + p.expr1

        @_('expr MINUS expr')
        def expr(self, p):
            return p.expr0 - p.expr1

        @_('expr TIMES expr')
        def expr(self, p):
            return p.expr0 * p.expr1

        @_('expr DIVIDE expr')
        def expr(self, p):
            return p.expr0 / p.expr1

        @_('MINUS expr %prec UMINUS')
        def expr(self, p):
            return -p.expr

        @_('LPAREN expr RPAREN')
        def expr(self, p):
            return p.expr

        @_('NUMBER')
        def expr(self, p):
            return int(p.NUMBER)

        @_('NAME')
        def expr(self, p):
            try:
                return self.names[p.NAME]
            except LookupError:
                print(f'Undefined name {p.NAME!r}')
                return 0

    if __name__ == '__main__':
        lexer = CalcLexer()
        parser = CalcParser()
        while True:
            try:
                text = input('calc > ')
            except EOFError:
                break
            if text:
                parser.parse(lexer.tokenize(text))
Documentation
-------------
Further documentation can be found at `https://sly.readthedocs.io/en/latest <https://sly.readthedocs.io/en/latest>`_.
Talks
-----
* `Reinventing the Parser Generator <https://www.youtube.com/watch?v=zJ9z6Ge-vXs>`_, talk by David Beazley at PyCon 2018, Cleveland.
Resources
---------
For a detailed overview of parsing theory, consult the excellent
book "Compilers : Principles, Techniques, and Tools" by Aho, Sethi, and
Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown
may also be useful.
The GitHub page for SLY can be found at:
``https://github.com/dabeaz/sly``
Please direct bug reports and pull requests to the GitHub page.
To contact me directly, send email to dave@dabeaz.com or contact
me on Twitter (@dabeaz).
-- Dave
P.S.
----
You should come take a `course <https://www.dabeaz.com/courses.html>`_!


@ -60,9 +60,7 @@ expressions and store variables::
from sly import Lexer, Parser from sly import Lexer, Parser
class CalcLexer(Lexer): class CalcLexer(Lexer):
tokens = { tokens = { NAME, NUMBER }
'NAME', 'NUMBER',
}
ignore = ' \t' ignore = ' \t'
literals = { '=', '+', '-', '*', '/', '(', ')' } literals = { '=', '+', '-', '*', '/', '(', ')' }
@ -78,8 +76,8 @@ expressions and store variables::
def newline(self, t): def newline(self, t):
self.lineno += t.value.count('\n') self.lineno += t.value.count('\n')
def error(self, value): def error(self, t):
print("Illegal character '%s'" % value[0]) print("Illegal character '%s'" % t.value[0])
self.index += 1 self.index += 1
class CalcParser(Parser): class CalcParser(Parser):


@ -57,7 +57,7 @@ described by the following list of token tuples::
[ ('ID','x'), ('EQUALS','='), ('NUMBER','3'), [ ('ID','x'), ('EQUALS','='), ('NUMBER','3'),
('PLUS','+'), ('NUMBER','42'), ('TIMES','*'), ('PLUS','+'), ('NUMBER','42'), ('TIMES','*'),
('LPAREN','('), ('ID','s'), ('MINUS','-'), ('LPAREN','('), ('ID','s'), ('MINUS','-'),
('ID','t'), ('RPAREN',')' ] ('ID','t'), ('RPAREN',')') ]
The SLY ``Lexer`` class is used to do this. Here is a sample of a simple The SLY ``Lexer`` class is used to do this. Here is a sample of a simple
lexer that tokenizes the above text:: lexer that tokenizes the above text::
@ -68,17 +68,8 @@ lexer that tokenizes the above text::
class CalcLexer(Lexer): class CalcLexer(Lexer):
# Set of token names. This is always required # Set of token names. This is always required
tokens = { tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
'ID', DIVIDE, ASSIGN, LPAREN, RPAREN }
'NUMBER',
'PLUS',
'MINUS',
'TIMES',
'DIVIDE',
'ASSIGN',
'LPAREN',
'RPAREN',
}
# String containing ignored characters between tokens # String containing ignored characters between tokens
ignore = ' \t' ignore = ' \t'
@ -131,19 +122,12 @@ In the example, the following code specified the token names::
class CalcLexer(Lexer): class CalcLexer(Lexer):
... ...
# Set of token names. This is always required # Set of token names. This is always required
tokens = { tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
'ID', DIVIDE, ASSIGN, LPAREN, RPAREN }
'NUMBER',
'PLUS',
'MINUS',
'TIMES',
'DIVIDE',
'ASSIGN',
'LPAREN',
'RPAREN',
}
... ...
Token names should be specified using all-caps as shown.
Specification of token match patterns Specification of token match patterns
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -154,12 +138,6 @@ names of the tokens provided in the ``tokens`` set. For example::
PLUS = r'\+' PLUS = r'\+'
MINUS = r'-' MINUS = r'-'
Regular expression patterns are compiled using the ``re.VERBOSE`` flag
which can be used to help readability. However,
unescaped whitespace is ignored and comments are allowed in this mode.
If your pattern involves whitespace, make sure you use ``\s``. If you
need to match the ``#`` character, use ``[#]`` or ``\#``.
Tokens are matched in the same order that patterns are listed in the Tokens are matched in the same order that patterns are listed in the
``Lexer`` class. Longer tokens always need to be specified before ``Lexer`` class. Longer tokens always need to be specified before
short tokens. For example, if you wanted to have separate tokens for short tokens. For example, if you wanted to have separate tokens for
@ -167,7 +145,7 @@ short tokens. For example, if you wanted to have separate tokens for
example:: example::
class MyLexer(Lexer): class MyLexer(Lexer):
tokens = {'ASSIGN', 'EQ', ...} tokens = { ASSIGN, EQ, ...}
... ...
EQ = r'==' # MUST APPEAR FIRST! (LONGER) EQ = r'==' # MUST APPEAR FIRST! (LONGER)
ASSIGN = r'=' ASSIGN = r'='
@ -251,12 +229,10 @@ Instead of using the ``@_()`` decorator, you can also write a method
that matches the same name as a token previously specified as a that matches the same name as a token previously specified as a
string. For example:: string. For example::
ID = r'[a-zA-Z_][a-zA-Z0-9_]*' NUMBER = r'\d+'
... ...
def ID(self, t): def NUMBER(self, t):
reserved = { 'if', 'else', 'while', 'for' } t.value = int(t.value)
if t.value in reserved:
t.type = t.value.upper()
return t return t
This is potentially useful trick for debugging a lexer. You can temporarily This is potentially useful trick for debugging a lexer. You can temporarily
@ -264,6 +240,36 @@ attach a method a token and have it execute when the token is encountered.
If you later take the method away, the lexer will revert back to its original If you later take the method away, the lexer will revert back to its original
behavior. behavior.
Token Remapping
^^^^^^^^^^^^^^^
Occasionally, you might need to remap tokens based on special cases.
Consider the case of matching identifiers such as "abc", "python", or "guido".
Certain identifiers such as "if", "else", and "while" might need to be
treated as special keywords. To handle this, include token remapping rules when
writing the lexer like this::
# calclex.py
from sly import Lexer
class CalcLexer(Lexer):
tokens = { ID, IF, ELSE, WHILE }
# String containing ignored characters (between tokens)
ignore = ' \t'
# Base ID rule
ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
# Special cases
ID['if'] = IF
ID['else'] = ELSE
ID['while'] = WHILE
When parsing an identifier, the special cases will remap certain matching
values to a new token type. For example, if the value of an identifier is
"if" above, an ``IF`` token will be generated.
Line numbers and position tracking Line numbers and position tracking
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -350,15 +356,15 @@ Error handling
If a bad character is encountered while lexing, tokenizing will stop. If a bad character is encountered while lexing, tokenizing will stop.
However, you can add an ``error()`` method to handle lexing errors However, you can add an ``error()`` method to handle lexing errors
that occur when illegal characters are detected. The error method that occur when illegal characters are detected. The error method
receives a string containing all remaining untokenized text. A receives a ``Token`` where the ``value`` attribute contains all
typical handler might look at this text and skip ahead in some manner. remaining untokenized text. A typical handler might look at this text
For example:: and skip ahead in some manner. For example::
class MyLexer(Lexer): class MyLexer(Lexer):
... ...
# Error handling rule # Error handling rule
def error(self, value): def error(self, t):
print("Illegal character '%s'" % value[0]) print("Illegal character '%s'" % t.value[0])
self.index += 1 self.index += 1
In this case, we print the offending character and skip ahead In this case, we print the offending character and skip ahead
@ -367,6 +373,32 @@ parser is often a hard problem. An error handler might scan ahead
to a logical synchronization point such as a semicolon, a blank line, to a logical synchronization point such as a semicolon, a blank line,
or similar landmark. or similar landmark.
If the ``error()`` method also returns the passed token, it will
show up as an ``ERROR`` token in the resulting token stream. This
might be useful if the parser wants to see error tokens for some
reason--perhaps for the purposes of improved error messages or
some other kind of error handling.
Third-Party Regex Module
^^^^^^^^^^^^^^^^^^^^^^^^
.. versionadded:: 0.4
The third-party `regex <https://pypi.org/project/regex/>`_ module can be used
with sly. Like this::
from sly import Lexer
import regex
class MyLexer(Lexer):
regex_module = regex
...
Now all regular expressions that ``MyLexer`` uses will be handled with the
``regex`` module. The ``regex_module`` can be set to any module that is
compatible with Python's standard library ``re``.
A More Complete Example A More Complete Example
^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^
@ -378,26 +410,11 @@ into practice::
from sly import Lexer from sly import Lexer
class CalcLexer(Lexer): class CalcLexer(Lexer):
# Set of reserved names (language keywords)
reserved_words = { 'WHILE', 'IF', 'ELSE', 'PRINT' }
# Set of token names. This is always required # Set of token names. This is always required
tokens = { tokens = { NUMBER, ID, WHILE, IF, ELSE, PRINT,
'NUMBER', PLUS, MINUS, TIMES, DIVIDE, ASSIGN,
'ID', EQ, LT, LE, GT, GE, NE }
'PLUS',
'MINUS',
'TIMES',
'DIVIDE',
'ASSIGN',
'EQ',
'LT',
'LE',
'GT',
'GE',
'NE',
*reserved_words,
}
literals = { '(', ')', '{', '}', ';' } literals = { '(', ')', '{', '}', ';' }
@ -422,12 +439,12 @@ into practice::
t.value = int(t.value) t.value = int(t.value)
return t return t
@_(r'[a-zA-Z_][a-zA-Z0-9_]*') # Identifiers and keywords
def ID(self, t): ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
# Check if name matches a reserved word (change token type if true) ID['if'] = IF
if t.value.upper() in self.reserved_words: ID['else'] = ELSE
t.type = t.value.upper() ID['while'] = WHILE
return t ID['print'] = PRINT
ignore_comment = r'\#.*' ignore_comment = r'\#.*'
@ -436,8 +453,8 @@ into practice::
def ignore_newline(self, t): def ignore_newline(self, t):
self.lineno += t.value.count('\n') self.lineno += t.value.count('\n')
def error(self, value): def error(self, t):
print('Line %d: Bad character %r' % (self.lineno, value[0])) print('Line %d: Bad character %r' % (self.lineno, t.value[0]))
self.index += 1 self.index += 1
if __name__ == '__main__': if __name__ == '__main__':
@ -455,27 +472,27 @@ into practice::
If you run this code, you'll get output that looks like this:: If you run this code, you'll get output that looks like this::
Token(ID, 'x', 3, 12) Token(type='ID', value='x', lineno=3, index=20)
Token(ASSIGN, '=', 3, 14) Token(type='ASSIGN', value='=', lineno=3, index=22)
Token(NUMBER, 0, 3, 16) Token(type='NUMBER', value=0, lineno=3, index=24)
Token(;, ';', 3, 17) Token(type=';', value=';', lineno=3, index=25)
Token(WHILE, 'while', 4, 19) Token(type='WHILE', value='while', lineno=4, index=31)
Token((, '(', 4, 25) Token(type='(', value='(', lineno=4, index=37)
Token(ID, 'x', 4, 26) Token(type='ID', value='x', lineno=4, index=38)
Token(LT, '<', 4, 28) Token(type='LT', value='<', lineno=4, index=40)
Token(NUMBER, 10, 4, 30) Token(type='NUMBER', value=10, lineno=4, index=42)
Token(), ')', 4, 32) Token(type=')', value=')', lineno=4, index=44)
Token({, '{', 4, 34) Token(type='{', value='{', lineno=4, index=46)
Token(PRINT, 'print', 5, 40) Token(type='PRINT', value='print', lineno=5, index=56)
Token(ID, 'x', 5, 46) Token(type='ID', value='x', lineno=5, index=62)
Line 5: Bad character ':' Line 5: Bad character ':'
Token(ID, 'x', 6, 53) Token(type='ID', value='x', lineno=6, index=73)
Token(ASSIGN, '=', 6, 55) Token(type='ASSIGN', value='=', lineno=6, index=75)
Token(ID, 'x', 6, 57) Token(type='ID', value='x', lineno=6, index=77)
Token(PLUS, '+', 6, 59) Token(type='PLUS', value='+', lineno=6, index=79)
Token(NUMBER, 1, 6, 61) Token(type='NUMBER', value=1, lineno=6, index=81)
Token(;, ';', 6, 62) Token(type=';', value=';', lineno=6, index=82)
Token(}, '}', 7, 64) Token(type='}', value='}', lineno=7, index=88)
Study this example closely. It might take a bit to digest, but all of the Study this example closely. It might take a bit to digest, but all of the
essential parts of writing a lexer are there. Tokens have to be specified essential parts of writing a lexer are there. Tokens have to be specified
@ -848,6 +865,39 @@ string. However,writing an "empty" rule and using "empty" to denote an
empty production may be easier to read and more clearly state your empty production may be easier to read and more clearly state your
intention. intention.
EBNF Features (Optionals and Repeats)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Certain grammar features occur with some frequency. For example, suppose you want to
have an optional item as shown in the previous section. An alternate way to specify
it is to enclose one or more symbols in [ ] like this::
@_('[ item ] grok')
def spam(self, p):
if p.item is not None:
print("item was given and has value", p.item)
else:
print("item was not given")
@_('whatever')
def item(self, p):
...
In this case, the value of ``p.item`` is set to ``None`` if the value wasn't supplied.
Otherwise, it will have the value returned by the ``item`` rule below.
You can also encode repetitions. For example, a common construction is a
list of comma separated expressions. To parse that, you could write::
@_('expr { COMMA expr }')
def exprlist(self, p):
return [p.expr0] + p.expr1
In this example, the ``{ COMMA expr }`` represents zero or more repetitions
of a rule. The value of all symbols inside is now a list. So, ``p.expr1``
is a list of all expressions matched. Note, when duplicate symbol names
appear in a rule, they are distinguished by appending a numeric index as shown.
Dealing With Ambiguous Grammars Dealing With Ambiguous Grammars
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -907,8 +957,8 @@ like this::
class CalcParser(Parser): class CalcParser(Parser):
... ...
precedence = ( precedence = (
('left', 'PLUS', 'MINUS'), ('left', PLUS, MINUS),
('left', 'TIMES', 'DIVIDE'), ('left', TIMES, DIVIDE),
) )
# Rules where precedence is applied # Rules where precedence is applied
@ -997,9 +1047,9 @@ like this::
class CalcParser(Parser): class CalcParser(Parser):
... ...
precedence = ( precedence = (
('left', 'PLUS', 'MINUS'), ('left', PLUS, MINUS),
('left', 'TIMES', 'DIVIDE'), ('left', TIMES, DIVIDE),
('right', 'UMINUS'), # Unary minus operator ('right', UMINUS), # Unary minus operator
) )
Now, in the grammar file, you write the unary minus rule like this:: Now, in the grammar file, you write the unary minus rule like this::
@ -1027,10 +1077,10 @@ operators like ``<`` and ``>`` but you didn't want combinations like
class MyParser(Parser): class MyParser(Parser):
... ...
precedence = ( precedence = (
('nonassoc', 'LESSTHAN', 'GREATERTHAN'), # Nonassociative operators ('nonassoc', LESSTHAN, GREATERTHAN), # Nonassociative operators
('left', 'PLUS', 'MINUS'), ('left', PLUS, MINUS),
('left', 'TIMES', 'DIVIDE'), ('left', TIMES, DIVIDE),
('right', 'UMINUS'), # Unary minus operator ('right', UMINUS), # Unary minus operator
) )
If you do this, the occurrence of input text such as ``a < b < c`` If you do this, the occurrence of input text such as ``a < b < c``
@ -1208,7 +1258,7 @@ appear as the last token on the right in an error rule. For example::
This is because the first bad token encountered will cause the rule to This is because the first bad token encountered will cause the rule to
be reduced--which may make it difficult to recover if more bad tokens be reduced--which may make it difficult to recover if more bad tokens
immediately follow. It's better to have some kind of landmark such as immediately follow. It's better to have some kind of landmark such as
a semicolon, closing parenthesese, or other token that can be used as a semicolon, closing parentheses, or other token that can be used as
a synchronization point. a synchronization point.
Panic mode recovery Panic mode recovery
@ -1343,13 +1393,13 @@ like this::
@_('LPAREN expr RPAREN') @_('LPAREN expr RPAREN')
def expr(self, p): def expr(self, p):
return ('group-expression',p.expr]) return ('group-expression', p.expr)
@_('NUMBER') @_('NUMBER')
def expr(self, p): def expr(self, p):
return ('number-expression', p.NUMBER) return ('number-expression', p.NUMBER)
Another approach is to create a set of data structure for different Another approach is to create a set of data structures for different
kinds of abstract syntax tree nodes and create different node types kinds of abstract syntax tree nodes and create different node types
in each rule:: in each rule::


@ -3,29 +3,19 @@
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
import sys import sys
sys.path.insert(0, "../..") sys.path.insert(0, '../..')
from sly import Lexer, Parser from sly import Lexer, Parser
class CalcLexer(Lexer): class CalcLexer(Lexer):
# Set of token names. This is always required tokens = { NAME, NUMBER, PLUS, TIMES, MINUS, DIVIDE, ASSIGN, LPAREN, RPAREN }
tokens = {
'ID',
'NUMBER',
'PLUS',
'MINUS',
'TIMES',
'DIVIDE',
'ASSIGN',
'LPAREN',
'RPAREN',
}
# String containing ignored characters between tokens
ignore = ' \t' ignore = ' \t'
# Regular expression rules for tokens # Tokens
ID = r'[a-zA-Z_][a-zA-Z0-9_]*' NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
NUMBER = r'\d+'
# Special symbols
PLUS = r'\+' PLUS = r'\+'
MINUS = r'-' MINUS = r'-'
TIMES = r'\*' TIMES = r'\*'
@ -34,64 +24,80 @@ class CalcLexer(Lexer):
LPAREN = r'\(' LPAREN = r'\('
RPAREN = r'\)' RPAREN = r'\)'
@_(r'\d+') # Ignored pattern
def NUMBER(self, t): ignore_newline = r'\n+'
t.value = int(t.value)
return t
@_(r'\n+') # Extra action for newlines
def newline(self, t): def ignore_newline(self, t):
self.lineno += t.value.count('\n') self.lineno += t.value.count('\n')
def error(self, value): def error(self, t):
print("Illegal character '%s'" % value[0]) print("Illegal character '%s'" % t.value[0])
self.index += 1 self.index += 1
class CalcParser(Parser): class CalcParser(Parser):
# Get the token list from the lexer (required)
tokens = CalcLexer.tokens tokens = CalcLexer.tokens
# Grammar rules and actions precedence = (
@_('expr PLUS term') ('left', PLUS, MINUS),
('left', TIMES, DIVIDE),
('right', UMINUS)
)
def __init__(self):
self.names = { }
@_('NAME ASSIGN expr')
def statement(self, p):
self.names[p.NAME] = p.expr
@_('expr')
def statement(self, p):
print(p.expr)
@_('expr PLUS expr')
def expr(self, p): def expr(self, p):
return p.expr + p.term return p.expr0 + p.expr1
@_('expr MINUS term') @_('expr MINUS expr')
def expr(self, p): def expr(self, p):
return p.expr - p.term return p.expr0 - p.expr1
@_('term') @_('expr TIMES expr')
def expr(self, p): def expr(self, p):
return p.term return p.expr0 * p.expr1
@_('term TIMES factor') @_('expr DIVIDE expr')
def term(self, p): def expr(self, p):
return p.term * p.factor return p.expr0 / p.expr1
@_('term DIVIDE factor') @_('MINUS expr %prec UMINUS')
def term(self, p): def expr(self, p):
return p.term / p.factor return -p.expr
@_('factor')
def term(self, p):
return p.factor
@_('NUMBER')
def factor(self, p):
return p.NUMBER
@_('LPAREN expr RPAREN') @_('LPAREN expr RPAREN')
def factor(self, p): def expr(self, p):
return p.expr return p.expr
@_('NUMBER')
def expr(self, p):
return int(p.NUMBER)
@_('NAME')
def expr(self, p):
try:
return self.names[p.NAME]
except LookupError:
print(f'Undefined name {p.NAME!r}')
return 0
if __name__ == '__main__': if __name__ == '__main__':
lexer = CalcLexer() lexer = CalcLexer()
parser = CalcParser() parser = CalcParser()
while True: while True:
try: try:
text = input('calc > ') text = input('calc > ')
result = parser.parse(lexer.tokenize(text))
print(result)
except EOFError: except EOFError:
break break
if text:
parser.parse(lexer.tokenize(text))

101
example/calc_ebnf/calc.py Normal file

@ -0,0 +1,101 @@
# -----------------------------------------------------------------------------
# calc.py
# -----------------------------------------------------------------------------
import sys
sys.path.insert(0, '../..')
from sly import Lexer, Parser
class CalcLexer(Lexer):
tokens = { NAME, NUMBER, PLUS, TIMES, MINUS, DIVIDE, ASSIGN, LPAREN, RPAREN }
ignore = ' \t'
# Tokens
NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
NUMBER = r'\d+'
# Special symbols
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
ASSIGN = r'='
LPAREN = r'\('
RPAREN = r'\)'
# Ignored pattern
ignore_newline = r'\n+'
# Extra action for newlines
def ignore_newline(self, t):
self.lineno += t.value.count('\n')
def error(self, t):
print("Illegal character '%s'" % t.value[0])
self.index += 1
class CalcParser(Parser):
tokens = CalcLexer.tokens
def __init__(self):
self.names = { }
@_('NAME ASSIGN expr')
def statement(self, p):
self.names[p.NAME] = p.expr
@_('expr')
def statement(self, p):
print(p.expr)
@_('term { PLUS|MINUS term }')
def expr(self, p):
lval = p.term0
for op, rval in p[1]:
if op == '+':
lval = lval + rval
elif op == '-':
lval = lval - rval
return lval
@_('factor { TIMES|DIVIDE factor }')
def term(self, p):
lval = p.factor0
for op, rval in p[1]:
if op == '*':
lval = lval * rval
elif op == '/':
lval = lval / rval
return lval
@_('MINUS factor')
def factor(self, p):
return -p.factor
@_('LPAREN expr RPAREN')
def factor(self, p):
return p.expr
@_('NUMBER')
def factor(self, p):
return int(p.NUMBER)
@_('NAME')
def factor(self, p):
try:
return self.names[p.NAME]
except LookupError:
print(f'Undefined name {p.NAME!r}')
return 0
if __name__ == '__main__':
lexer = CalcLexer()
parser = CalcParser()
while True:
try:
text = input('calc > ')
except EOFError:
break
if text:
parser.parse(lexer.tokenize(text))


@ -8,9 +8,7 @@ sys.path.insert(0, "../..")
from sly import Lexer, Parser from sly import Lexer, Parser
class CalcLexer(Lexer): class CalcLexer(Lexer):
tokens = { tokens = { NAME, NUMBER }
'NAME', 'NUMBER',
}
ignore = ' \t' ignore = ' \t'
literals = { '=', '+', '-', '*', '/', '(', ')' } literals = { '=', '+', '-', '*', '/', '(', ')' }
@ -26,8 +24,8 @@ class CalcLexer(Lexer):
def newline(self, t): def newline(self, t):
self.lineno += t.value.count('\n') self.lineno += t.value.count('\n')
def error(self, value): def error(self, t):
print("Illegal character '%s'" % value[0]) print("Illegal character '%s'" % t.value[0])
self.index += 1 self.index += 1
class CalcParser(Parser): class CalcParser(Parser):
@ -36,7 +34,7 @@ class CalcParser(Parser):
precedence = ( precedence = (
('left', '+', '-'), ('left', '+', '-'),
('left', '*', '/'), ('left', '*', '/'),
('right', 'UMINUS'), ('right', UMINUS),
) )
def __init__(self): def __init__(self):

179
example/schcls/schcls.py Normal file

@ -0,0 +1,179 @@
# schcls.py
#
# Proof of concept--not complete
from sly.docparse import DocParseMeta
from sly import Lexer, Parser
class SchLexer(Lexer):
tokens = { NUMBER, NAME, DEFINE, SET }
ignore = ' \t'
literals = ['=','+','-','*','/','(',')','.']
NAME = '[a-zA-Z_!][a-zA-Z0-9_!]*'
NAME['define'] = DEFINE
NAME['set!'] = SET
@_(r'\d+')
def NUMBER(self, t):
t.value = int(t.value)
return t
@_(r'\n+')
def newline(self, t):
self.lineno = t.lineno + t.value.count('\n')
def error(self, t):
print(f"{self.cls_module}.{self.cls_name}:{self.lineno}: * Illegal character", repr(self.text[self.index]))
self.index += 1
class SchParser(Parser):
tokens = SchLexer.tokens
precedence = (
('left', '+','-'),
('left', '*','/')
)
def __init__(self):
self.env = { }
@_('declarations',
'')
def program(self, p):
return self.env
@_('declarations declaration')
def declarations(self, p):
pass
@_('declaration')
def declarations(self, p):
pass
@_("'(' DEFINE NAME expression ')'")
def declaration(self, p):
self.env[p.NAME] = p.expression
@_("'(' DEFINE '(' NAME arglist ')' exprlist ')'")
def declaration(self, p):
args = ','.join(p.arglist)
self.env[p.NAME] = eval(f"lambda {args}: ({','.join(p.exprlist)},)[-1]")
@_("'(' SET NAME '.' NAME expression ')'")
def expression(self, p):
return f'setattr({p.NAME0}, {p.NAME1!r}, {p.expression})'
@_("")
def arglist(self, p):
return []
@_("arglist_nonempty")
def arglist(self, p):
return p.arglist_nonempty
@_("arglist_nonempty NAME")
def arglist_nonempty(self, p):
p.arglist_nonempty.append(p.NAME)
return p.arglist_nonempty
@_("NAME")
def arglist_nonempty(self, p):
return [ p.NAME ]
@_("NUMBER")
def expression(self, p):
return str(p.NUMBER)
@_("name")
def expression(self, p):
return p.name
@_("'(' operator exprlist ')'")
def expression(self, p):
return '(' + p.operator.join(p.exprlist) + ')'
@_("'+'", "'-'", "'*'", "'/'")
def operator(self, p):
return p[0]
@_("'(' name exprlist ')'")
def expression(self, p):
return p.name + '(' + ','.join(p.exprlist) + ')'
@_("'(' name ')'")
def expression(self, p):
return p.name + '()'
@_('exprlist expression')
def exprlist(self, p):
p.exprlist.append(p.expression)
return p.exprlist
@_('expression')
def exprlist(self, p):
return [ p.expression ]
@_("NAME '.' NAME")
def name(self, p):
return f'{p.NAME0}.{p.NAME1}'
@_("NAME")
def name(self, p):
return p.NAME
def error(self, p):
print(f'{self.cls_module}.{self.cls_name}:{getattr(p,"lineno","")}: '
f'Syntax error at {getattr(p,"value","EOC")}')
class SchMeta(DocParseMeta):
lexer = SchLexer
parser = SchParser
class Sch(metaclass=SchMeta):
pass
class Rat(Sch):
'''
(define (__init__ self numer denom)
(set! self.numer numer)
(set! self.denom denom)
)
(define (__add__ self other)
(Rat (+ (* self.numer other.denom)
(* self.denom other.numer))
(* self.denom other.denom)
)
)
(define (__sub__ self other)
(Rat (- (* self.numer other.denom)
(* self.denom other.numer))
(* self.denom other.denom)
)
)
(define (__mul__ self other)
(Rat (* self.numer other.numer)
(* self.denom other.denom)
)
)
(define (__truediv__ self other)
(Rat (* self.numer other.denom)
(* self.denom other.numer)
)
)
'''
def __repr__(self):
return f'Rat({self.numer}, {self.denom})'
if __name__ == '__main__':
a = Rat(2, 3)
b = Rat(1, 4)
print(a + b)
print(a - b)
print(a * b)
print(a / b)

245
example/wasm/expr.py Normal file

@ -0,0 +1,245 @@
# -----------------------------------------------------------------------------
# expr.py
#
# Proof-of-concept encoding of functions/expressions into Wasm.
#
# This file implements a mini-language for writing Wasm functions as expressions.
# It only supports integers.
#
# Here's a few examples:
#
# # Some basic function definitions
# add(x, y) = x + y;
# mul(x, y) = x * y;
# dsquare(x, y) = mul(x, x) + mul(y, y);
#
# # A recursive function
# fact(n) = if n < 1 then 1 else n*fact(n-1);
#
# The full grammar:
#
# functions : functions function
# | function
#
# function : NAME ( parms ) = expr ;
#
# expr : expr + expr
# | expr - expr
# | expr * expr
# | expr / expr
# | expr < expr
# | expr <= expr
# | expr > expr
# | expr >= expr
# | expr == expr
# | expr != expr
# | ( expr )
# | NAME (exprs)
# | if expr then expr else expr
# | NUMBER
#
# Note: This is implemented as one-pass compiler with no intermediate AST.
# Some of the grammar rules have to be written in a funny way to make this
# work. If doing this for real, I'd probably build an AST and construct
# Wasm code through AST walking.
# -----------------------------------------------------------------------------
import sys
sys.path.append('../..')
from sly import Lexer, Parser
import wasm
class ExprLexer(Lexer):
tokens = { NAME, NUMBER, PLUS, TIMES, MINUS, DIVIDE, LPAREN, RPAREN, COMMA,
LT, LE, GT, GE, EQ, NE, IF, THEN, ELSE, ASSIGN, SEMI }
ignore = ' \t'
# Tokens
NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
NAME['if'] = IF
NAME['then'] = THEN
NAME['else'] = ELSE
NUMBER = r'\d+'
# Special symbols
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
LPAREN = r'\('
RPAREN = r'\)'
COMMA = r','
LE = r'<='
LT = r'<'
GE = r'>='
GT = r'>'
EQ = r'=='
NE = r'!='
ASSIGN = r'='
SEMI = ';'
# Ignored pattern
ignore_newline = r'\n+'
ignore_comment = r'#.*\n'
# Extra action for newlines
def ignore_newline(self, t):
self.lineno += t.value.count('\n')
def error(self, t):
print("Illegal character '%s'" % t.value[0])
self.index += 1
class ExprParser(Parser):
tokens = ExprLexer.tokens
precedence = (
('left', IF, ELSE),
('left', EQ, NE, LT, LE, GT, GE),
('left', PLUS, MINUS),
('left', TIMES, DIVIDE),
('right', UMINUS)
)
def __init__(self):
self.functions = { }
self.module = wasm.Module()
@_('functions function')
def functions(self, p):
pass
@_('function')
def functions(self, p):
pass
@_('function_decl ASSIGN expr SEMI')
def function(self, p):
self.function.block_end()
self.function = None
@_('NAME LPAREN parms RPAREN')
def function_decl(self, p):
self.locals = { name:n for n, name in enumerate(p.parms) }
self.function = self.module.add_function(p.NAME, [wasm.i32]*len(p.parms), [wasm.i32])
self.functions[p.NAME] = self.function
@_('NAME LPAREN RPAREN')
def function_decl(self, p):
self.locals = { }
self.function = self.module.add_function(p.NAME, [], [wasm.i32])
self.functions[p.NAME] = self.function
@_('parms COMMA parm')
def parms(self, p):
return p.parms + [p.parm]
@_('parm')
def parms(self, p):
return [ p.parm ]
@_('NAME')
def parm(self, p):
return p.NAME
@_('expr PLUS expr')
def expr(self, p):
self.function.i32.add()
@_('expr MINUS expr')
def expr(self, p):
self.function.i32.sub()
@_('expr TIMES expr')
def expr(self, p):
self.function.i32.mul()
@_('expr DIVIDE expr')
def expr(self, p):
self.function.i32.div_s()
@_('expr LT expr')
def expr(self, p):
self.function.i32.lt_s()
@_('expr LE expr')
def expr(self, p):
self.function.i32.le_s()
@_('expr GT expr')
def expr(self, p):
self.function.i32.gt_s()
@_('expr GE expr')
def expr(self, p):
self.function.i32.ge_s()
@_('expr EQ expr')
def expr(self, p):
self.function.i32.eq()
@_('expr NE expr')
def expr(self, p):
self.function.i32.ne()
@_('MINUS expr %prec UMINUS')
def expr(self, p):
pass
@_('LPAREN expr RPAREN')
def expr(self, p):
pass
@_('NUMBER')
def expr(self, p):
self.function.i32.const(int(p.NUMBER))
@_('NAME')
def expr(self, p):
self.function.local.get(self.locals[p.NAME])
@_('NAME LPAREN exprlist RPAREN')
def expr(self, p):
self.function.call(self.functions[p.NAME])
@_('NAME LPAREN RPAREN')
def expr(self, p):
self.function.call(self.functions[p.NAME])
@_('IF expr thenexpr ELSE expr')
def expr(self, p):
self.function.block_end()
@_('exprlist COMMA expr')
def exprlist(self, p):
pass
@_('expr')
def exprlist(self, p):
pass
@_('startthen expr')
def thenexpr(self, p):
self.function.else_start()
@_('THEN')
def startthen(self, p):
self.function.if_start(wasm.i32)
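    # Note (added commentary): 'thenexpr' and 'startthen' exist purely so that
    # semantic actions fire at the right moment in this one-pass scheme.
    # Reducing 'startthen' (right after THEN, with the condition already
    # compiled) emits if_start; reducing 'thenexpr' emits else_start; the
    # outer IF rule emits block_end once the else-branch has been compiled.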
if __name__ == '__main__':
import sys
if len(sys.argv) != 2:
raise SystemExit(f'Usage: {sys.argv[0]} module')
lexer = ExprLexer()
parser = ExprParser()
parser.parse(lexer.tokenize(open(sys.argv[1]).read()))
name = sys.argv[1].split('.')[0]
parser.module.write_wasm(name)
parser.module.write_html(name)
print(f'Wrote: {name}.wasm')
print(f'Wrote: {name}.html')
print('Use python3 -m http.server to test')

example/wasm/test.e Normal file

@ -0,0 +1,25 @@
# Experimental Wasm function examples.
# To run:
#
# 1. First run python3 expr.py test.e
# 2. Use python3 -m http.server
#
# Go to a browser and visit http://localhost:8000/test.html.
# From the browser, open the Javascript console. Try executing
# the functions from there.
#
# Some basic functions
add(x,y) = x+y;
sub(x,y) = x-y;
mul(x,y) = x*y;
div(x,y) = x/y;
# A function calling other functions
dsquare(x,y) = mul(x,x) + mul(y,y);
# A conditional
minval(a, b) = if a < b then a else b;
# Some recursive functions
fact(n) = if n <= 1 then 1 else n*fact(n-1);
fib(n) = if n < 2 then 1 else fib(n-1) + fib(n-2);

example/wasm/test.html Normal file

@ -0,0 +1,32 @@
<html>
<body>
<script>
var imports = {};
fetch("test.wasm").then(response =>
response.arrayBuffer()
).then(bytes =>
WebAssembly.instantiate(bytes, imports)
).then(results => {
window.dsquared = results.instance.exports.dsquared;
window.distance = results.instance.exports.distance;
window.getval = results.instance.exports.getval;
window.setval = results.instance.exports.setval;
});
</script>
<h3>module test</h3>
<p>
The following exports are made. Access from the JS console.
</p>
<p><tt>dsquared(f64, f64) -> f64</tt></p>
<p><tt>distance(f64, f64) -> f64</tt></p>
<p><tt>getval(i32) -> i32</tt></p>
<p><tt>setval(i32, i32) -> i32</tt></p>
</body>
</html>

example/wasm/wasm.py Normal file

@ -0,0 +1,942 @@
# wasm.py
#
# Experimental builder for Wasm binary encoding. Use at your own peril.
#
# Author: David Beazley (@dabeaz)
# Copyright (C) 2019
# http://www.dabeaz.com
import struct
import enum
from collections import defaultdict
import json
def encode_unsigned(value):
'''
Produce an LEB128 encoded unsigned integer.
'''
parts = []
while value:
parts.append((value & 0x7f) | 0x80)
value >>= 7
if not parts:
parts.append(0)
parts[-1] &= 0x7f
return bytes(parts)
def encode_signed(value):
'''
Produce a LEB128 encoded signed integer.
'''
parts = [ ]
if value < 0:
# Sign extend the value up to a multiple of 7 bits
value = (1 << (value.bit_length() + (7 - value.bit_length() % 7))) + value
negative = True
else:
negative = False
while value:
parts.append((value & 0x7f) | 0x80)
value >>= 7
if not parts or (not negative and parts[-1] & 0x40):
parts.append(0)
parts[-1] &= 0x7f
return bytes(parts)
assert encode_unsigned(624485) == bytes([0xe5, 0x8e, 0x26])
assert encode_unsigned(127) == bytes([0x7f])
assert encode_signed(-624485) == bytes([0x9b, 0xf1, 0x59])
assert encode_signed(127) == bytes([0xff, 0x00])
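# Worked example (added for clarity): 624485 is 0b10011000011101100101.
# Split into 7-bit groups from the least-significant end:
#
#     0100110  0001110  1100101
#
# Emit the groups low-order first, setting the continuation bit (0x80) on
# every byte except the last: 0xe5, 0x8e, 0x26 -- matching the assertion above.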
def encode_f64(value):
'''
Encode a 64-bit floating point as little endian
'''
return struct.pack('<d', value)
def encode_f32(value):
'''
Encode a 32-bit floating point as little endian.
'''
return struct.pack('<f', value)
def encode_name(value):
'''
Encode a name as UTF-8
'''
data = value.encode('utf-8')
return encode_vector(data)
def encode_vector(items):
'''
    Items is a list of encoded values or bytes
'''
if isinstance(items, bytes):
return encode_unsigned(len(items)) + items
else:
return encode_unsigned(len(items)) + b''.join(items)
# ------------------------------------------------------------
# Instruction encoding enums.
#
# Wasm defines 4 core data types [i32, i64, f32, f64]. These type
# names are used in various places (specifying functions, globals,
# etc.). However, the type names are also used as a namespace for
# type-specific instructions such as i32.add. We're going to use
# Python enums to set up this arrangement in a clever way that
# makes it possible to do both of these tasks.
# Metaclass for instruction encoding categories. The class itself
# can be used as an integer when encoding instructions.
class HexEnumMeta(enum.EnumMeta):
def __int__(cls):
return int(cls._encoding)
__index__ = __int__
def __repr__(cls):
return cls.__name__
@classmethod
def __prepare__(meta, name, bases, encoding=0):
return super().__prepare__(name, bases)
@staticmethod
def __new__(meta, clsname, bases, methods, encoding=0):
cls = super().__new__(meta, clsname, bases, methods)
cls._encoding = encoding
return cls
class HexEnum(enum.IntEnum):
def __repr__(self):
return f'<{self!s}: 0x{self:x}>'
HexEnum.__class__ = HexEnumMeta
class i32(HexEnum, encoding=0x7f):
eqz = 0x45
eq = 0x46
ne = 0x47
lt_s = 0x48
lt_u = 0x49
gt_s = 0x4a
gt_u = 0x4b
le_s = 0x4c
le_u = 0x4d
ge_s = 0x4e
ge_u = 0x4f
clz = 0x67
ctz = 0x68
popcnt = 0x69
add = 0x6a
sub = 0x6b
mul = 0x6c
div_s = 0x6d
div_u = 0x6e
rem_s = 0x6f
rem_u = 0x70
and_ = 0x71
or_ = 0x72
xor = 0x73
shl = 0x74
shr_s = 0x75
shr_u = 0x76
rotl = 0x77
rotr = 0x78
wrap_i64 = 0xa7
trunc_f32_s = 0xa8
trunc_f32_u = 0xa9
trunc_f64_s = 0xaa
trunc_f64_u = 0xab
reinterpret_f32 = 0xbc
load = 0x28
load8_s = 0x2c
load8_u = 0x2d
load16_s = 0x2e
load16_u = 0x2f
store = 0x36
store8 = 0x3a
store16 = 0x3b
const = 0x41
class i64(HexEnum, encoding=0x7e):
eqz = 0x50
eq = 0x51
ne = 0x52
lt_s = 0x53
lt_u = 0x54
gt_s = 0x55
gt_u = 0x56
le_s = 0x57
le_u = 0x58
ge_s = 0x59
ge_u = 0x5a
clz = 0x79
ctz = 0x7a
popcnt = 0x7b
add = 0x7c
sub = 0x7d
mul = 0x7e
div_s = 0x7f
div_u = 0x80
rem_s = 0x81
rem_u = 0x82
and_ = 0x83
or_ = 0x84
xor = 0x85
shl = 0x86
shr_s = 0x87
shr_u = 0x88
rotl = 0x89
rotr = 0x8a
extend_i32_s = 0xac
extend_i32_u = 0xad
trunc_f32_s = 0xae
trunc_f32_u = 0xaf
trunc_f64_s = 0xb0
trunc_f64_u = 0xb1
reinterpret_f64 = 0xbd
load = 0x29
load8_s = 0x30
load8_u = 0x31
load16_s = 0x32
load16_u = 0x33
load32_s = 0x34
load32_u = 0x35
store = 0x37
store8 = 0x3c
store16 = 0x3d
store32 = 0x3e
const = 0x42
class f32(HexEnum, encoding=0x7d):
eq = 0x5b
ne = 0x5c
lt = 0x5d
gt = 0x5e
le = 0x5f
ge = 0x60
abs = 0x8b
neg = 0x8c
ceil = 0x8d
floor = 0x8e
trunc = 0x8f
nearest = 0x90
sqrt = 0x91
add = 0x92
sub = 0x93
mul = 0x94
div = 0x95
min = 0x96
max = 0x97
copysign = 0x98
convert_i32_s = 0xb2
convert_i32_u = 0xb3
convert_i64_s = 0xb4
convert_i64_u = 0xb5
demote_f64 = 0xb6
reinterpret_i32 = 0xbe
load = 0x2a
store = 0x38
const = 0x43
class f64(HexEnum, encoding=0x7c):
eq = 0x61
ne = 0x62
lt = 0x63
gt = 0x64
le = 0x65
ge = 0x66
abs = 0x99
neg = 0x9a
ceil = 0x9b
floor = 0x9c
trunc = 0x9d
nearest = 0x9e
sqrt = 0x9f
add = 0xa0
sub = 0xa1
mul = 0xa2
div = 0xa3
min = 0xa4
max = 0xa5
copysign = 0xa6
convert_i32_s = 0xb7
convert_i32_u = 0xb8
convert_i64_s = 0xb9
convert_i64_u = 0xba
promote_f32 = 0xbb
reinterpret_i64 = 0xbf
load = 0x2b
store = 0x39
const = 0x44
class local(HexEnum):
get = 0x20
set = 0x21
tee = 0x22
class global_(HexEnum):
get = 0x23
set = 0x24
global_.__name__ = 'global'
# Special void type for block returns
void = 0x40
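# Example of the dual use described above (an added sketch): the class itself
# encodes the value type, while its members encode opcodes.
#
#     bytes([i32])        # b'\x7f'  -- value type, e.g. in a function signature
#     bytes([i32.add])    # b'\x6a'  -- the i32.add opcode
#     bytes([f64.const])  # b'\x44'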
# ------------------------------------------------------------
def encode_function_type(parameters, results):
'''
parameters is a vector of value types
results is a vector value types
'''
enc_parms = bytes(parameters)
enc_results = bytes(results)
return b'\x60' + encode_vector(enc_parms) + encode_vector(enc_results)
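# For example (added for clarity), a signature taking two i32 parameters and
# returning one i32 encodes as:
#
#     encode_function_type([i32, i32], [i32])
#     # -> b'\x60\x02\x7f\x7f\x01\x7f'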
def encode_limits(min, max=None):
if max is None:
return b'\x00' + encode_unsigned(min)
else:
return b'\x01' + encode_unsigned(min) + encode_unsigned(max)
def encode_table_type(elemtype, min, max=None):
return b'\x70' + encode_limits(min, max)
def encode_global_type(value_type, mut=True):
return bytes([value_type, mut])
# ----------------------------------------------------------------------
# Instruction builders
#
# Wasm instructions are grouped into different namespaces. For example:
#
# i32.add()
# local.get()
# memory.size()
# ...
#
# The classes that follow implement the namespace for different instruction
# categories.
# Builder for the local.* namespace
class SubBuilder:
def __init__(self, builder):
self._builder = builder
def _append(self, instr):
self._builder._code.append(instr)
class LocalBuilder(SubBuilder):
def get(self, localidx):
self._append([local.get, *encode_unsigned(localidx)])
def set(self, localidx):
self._append([local.set, *encode_unsigned(localidx)])
def tee(self, localidx):
self._append([local.tee, *encode_unsigned(localidx)])
class GlobalBuilder(SubBuilder):
def get(self, glob):
if isinstance(glob, int):
globidx = glob
else:
globidx = glob.idx
self._append([global_.get, *encode_unsigned(globidx)])
def set(self, glob):
if isinstance(glob, int):
globidx = glob
else:
globidx = glob.idx
self._append([global_.set, *encode_unsigned(globidx)])
class MemoryBuilder(SubBuilder):
def size(self):
self._append([0x3f, 0x00])
def grow(self):
self._append([0x40, 0x00])
class OpBuilder(SubBuilder):
_optable = None # To be supplied by subclasses
# Memory ops
def load(self, align, offset):
self._append([self._optable.load, *encode_unsigned(align), *encode_unsigned(offset)])
def load8_s(self, align, offset):
self._append([self._optable.load8_s, *encode_unsigned(align), *encode_unsigned(offset)])
def load8_u(self, align, offset):
self._append([self._optable.load8_u, *encode_unsigned(align), *encode_unsigned(offset)])
def load16_s(self, align, offset):
self._append([self._optable.load16_s, *encode_unsigned(align), *encode_unsigned(offset)])
def load16_u(self, align, offset):
self._append([self._optable.load16_u, *encode_unsigned(align), *encode_unsigned(offset)])
def load32_s(self, align, offset):
self._append([self._optable.load32_s, *encode_unsigned(align), *encode_unsigned(offset)])
def load32_u(self, align, offset):
self._append([self._optable.load32_u, *encode_unsigned(align), *encode_unsigned(offset)])
def store(self, align, offset):
self._append([self._optable.store, *encode_unsigned(align), *encode_unsigned(offset)])
def store8(self, align, offset):
self._append([self._optable.store8, *encode_unsigned(align), *encode_unsigned(offset)])
def store16(self, align, offset):
self._append([self._optable.store16, *encode_unsigned(align), *encode_unsigned(offset)])
def store32(self, align, offset):
self._append([self._optable.store32, *encode_unsigned(align), *encode_unsigned(offset)])
def __getattr__(self, key):
def call():
self._append([getattr(self._optable, key)])
return call
class I32OpBuilder(OpBuilder):
_optable = i32
def const(self, value):
self._append([self._optable.const, *encode_signed(value)])
class I64OpBuilder(OpBuilder):
_optable = i64
def const(self, value):
self._append([self._optable.const, *encode_signed(value)])
class F32OpBuilder(OpBuilder):
_optable = f32
def const(self, value):
self._append([self._optable.const, *encode_f32(value)])
class F64OpBuilder(OpBuilder):
_optable = f64
def const(self, value):
self._append([self._optable.const, *encode_f64(value)])
def _flatten(instr):
for x in instr:
if isinstance(x, list):
yield from _flatten(x)
else:
yield x
# High-level class that allows instructions to be easily encoded.
class InstructionBuilder:
def __init__(self):
self._code = [ ]
self.local = LocalBuilder(self)
self.global_ = GlobalBuilder(self)
self.i32 = I32OpBuilder(self)
self.i64 = I64OpBuilder(self)
self.f32 = F32OpBuilder(self)
self.f64 = F64OpBuilder(self)
# Control-flow stack.
self._control = [ None ]
def __iter__(self):
return iter(self._code)
# Resolve a human-readable label into control-stack index
def _resolve_label(self, label):
if isinstance(label, int):
return label
index = self._control.index(label)
        return len(self._control) - 1 - index
# Control flow instructions
    def unreachable(self):
        self._code.append([0x00])
def nop(self):
self._code.append([0x01])
def block_start(self, result_type, label=None):
self._code.append([0x02, result_type])
self._control.append(label)
return len(self._control)
def block_end(self):
self._code.append([0x0b])
self._control.pop()
def loop_start(self, result_type, label=None):
self._code.append([0x03, result_type])
self._control.append(label)
return len(self._control)
def if_start(self, result_type, label=None):
self._code.append([0x04, result_type])
self._control.append(label)
def else_start(self):
self._code.append([0x05])
def br(self, label):
labelidx = self._resolve_label(label)
self._code.append([0x0c, *encode_unsigned(labelidx)])
def br_if(self, label):
labelidx = self._resolve_label(label)
self._code.append([0x0d, *encode_unsigned(labelidx)])
def br_table(self, labels, label):
enc_labels = [encode_unsigned(self._resolve_label(idx)) for idx in labels]
self._code.append([0x0e, *encode_vector(enc_labels), *encode_unsigned(self._resolve_label(label))])
def return_(self):
self._code.append([0x0f])
def call(self, func):
if isinstance(func, (ImportFunction,Function)):
self._code.append([0x10, *encode_unsigned(func._idx)])
else:
self._code.append([0x10, *encode_unsigned(func)])
def call_indirect(self, typesig):
if isinstance(typesig, Type):
typeidx = typesig.idx
else:
typeidx = typesig
self._code.append([0x11, *encode_unsigned(typeidx), 0x00])
def drop(self):
self._code.append([0x1a])
def select(self):
self._code.append([0x1b])
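# Example usage (an added sketch, not in the original file): hand-assembling
# the body of an i32 "add" function and flattening it to bytes.
#
#     b = InstructionBuilder()
#     b.local.get(0)          # push parameter 0
#     b.local.get(1)          # push parameter 1
#     b.i32.add()             # pop both, push the sum
#     b.block_end()           # 0x0b terminates the function body
#     bytes(_flatten(b))      # -> b'\x20\x00\x20\x01\x6a\x0b'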
class Type:
def __init__(self, parms, results, idx):
self.parms = parms
self.results = results
self.idx = idx
def __repr__(self):
return f'{self.parms!r} -> {self.results!r}'
class ImportFunction:
def __init__(self, name, typesig, idx):
self._name = name
self._typesig = typesig
self._idx = idx
def __repr__(self):
return f'ImportFunction({self._name}, {self._typesig}, {self._idx})'
class Function(InstructionBuilder):
def __init__(self, name, typesig, idx, export=True):
super().__init__()
self._name = name
self._typesig = typesig
self._locals = list(typesig.parms)
self._export = export
self._idx = idx
def __repr__(self):
return f'Function({self._name}, {self._typesig}, {self._idx})'
# Allocate a new local variable of a given type
def alloc(self, valuetype):
self._locals.append(valuetype)
        return len(self._locals) - 1
class ImportGlobal:
def __init__(self, name, valtype, idx):
self.name = name
self.valtype = valtype
self.idx = idx
def __repr__(self):
return f'ImportGlobal({self.name}, {self.valtype}, {self.idx})'
class Global:
def __init__(self, name, valtype, initializer, idx):
self.name = name
self.valtype = valtype
self.initializer = initializer
self.idx = idx
def __repr__(self):
return f'Global({self.name}, {self.valtype}, {self.initializer}, {self.idx})'
class Module:
def __init__(self):
# Vector of function type signatures. Signatures are reused
# if more than one function has the same signature.
self.type_section = []
# Vector of imported entities. These can be functions, globals,
# tables, and memories
self.import_section = []
# There are 4 basic entities within a Wasm file. Functions,
# globals, memories, and tables. Each kind of entity is
# stored in a separate list and is indexed by an integer
# index starting at 0. Imported entities must always
# go before entities defined in the Wasm module itself.
self.funcidx = 0
self.globalidx = 0
self.memoryidx = 0
self.tableidx = 0
self.function_section = [] # Vector of typeidx
self.global_section = [] # Vector of globals
self.table_section = [] # Vector of tables
self.memory_section = [] # Vector of memories
# Exported entities. A module may export functions, globals,
# tables, and memories
self.export_section = [] # Vector of exports
# Optional start function. A function that executes upon loading
self.start_section = None # Optional start function
# Initialization of table elements
self.element_section = []
# Code section for function bodies.
self.code_section = []
# Data section contains data segments
self.data_section = []
# List of function objects (to help with encoding)
self.functions = []
# Output for JS/Html
        self.js_exports = ""
        self.html_exports = ""
self.js_imports = defaultdict(dict)
def add_type(self, parms, results):
enc = encode_function_type(parms, results)
if enc in self.type_section:
return Type(parms, results, self.type_section.index(enc))
else:
self.type_section.append(enc)
return Type(parms, results, len(self.type_section) - 1)
def import_function(self, module, name, parms, results):
if len(self.function_section) > 0:
raise RuntimeError('function imports must go before first function definition')
typesig = self.add_type(parms, results)
code = encode_name(module) + encode_name(name) + b'\x00' + encode_unsigned(typesig.idx)
self.import_section.append(code)
self.js_imports[module][name] = f"function: {typesig}"
self.funcidx += 1
return ImportFunction(f'{module}.{name}', typesig, self.funcidx - 1)
def import_table(self, module, name, elemtype, min, max=None):
code = encode_name(module) + encode_name(name) + b'\x01' + encode_table_type(elemtype, min, max)
self.import_section.append(code)
self.js_imports[module][name] = "table:"
self.tableidx += 1
return self.tableidx - 1
def import_memtype(self, module, name, min, max=None):
code = encode_name(module) + encode_name(name) + b'\x02' + encode_limits(min, max)
self.import_section.append(code)
self.js_imports[module][name] = "memory:"
self.memoryidx += 1
return self.memoryidx - 1
def import_global(self, module, name, value_type):
if len(self.global_section) > 0:
raise RuntimeError('global imports must go before first global definition')
code = encode_name(module) + encode_name(name) + b'\x03' + encode_global_type(value_type, False)
self.import_section.append(code)
self.js_imports[module][name] = f"global: {value_type}"
self.globalidx += 1
return ImportGlobal(f'{module}.{name}', value_type, self.globalidx - 1)
def add_function(self, name, parms, results, export=True):
typesig = self.add_type(parms, results)
func = Function(name, typesig, self.funcidx, export)
self.funcidx += 1
self.functions.append(func)
self.function_section.append(encode_unsigned(typesig.idx))
self.html_exports += f'<p><tt>{name}({", ".join(str(p) for p in parms)}) -> {results[0]!s}</tt></p>\n'
return func
def add_table(self, elemtype, min, max=None):
self.table_section.append(encode_table_type(elemtype, min, max))
self.tableidx += 1
return self.tableidx - 1
def add_memory(self, min, max=None):
self.memory_section.append(encode_limits(min, max))
self.memoryidx += 1
return self.memoryidx - 1
def add_global(self, name, value_type, initializer, mutable=True, export=True):
        code = encode_global_type(value_type, mutable)
        # Build the constant initializer expression, terminated by the 'end' opcode (0x0b)
        expr = InstructionBuilder()
        getattr(expr, str(value_type)).const(initializer)
        expr._code.append([0x0b])
        code += bytes(_flatten(expr))
self.global_section.append(code)
if export:
self.export_global(name, self.globalidx)
self.globalidx += 1
return Global(name, value_type, initializer, self.globalidx - 1)
def export_function(self, name, funcidx):
code = encode_name(name) + b'\x00' + encode_unsigned(funcidx)
self.export_section.append(code)
self.js_exports += f'window.{name} = results.instance.exports.{name};\n'
def export_table(self, name, tableidx):
code = encode_name(name) + b'\x01' + encode_unsigned(tableidx)
self.export_section.append(code)
def export_memory(self, name, memidx):
code = encode_name(name) + b'\x02' + encode_unsigned(memidx)
self.export_section.append(code)
def export_global(self, name, globalidx):
code = encode_name(name) + b'\x03' + encode_unsigned(globalidx)
self.export_section.append(code)
def start_function(self, funcidx):
        self.start_section = encode_unsigned(funcidx)
def add_element(self, tableidx, expr, funcidxs):
        code = encode_unsigned(tableidx) + bytes(_flatten(expr))
code += encode_vector([encode_unsigned(i) for i in funcidxs])
self.element_section.append(code)
def add_function_code(self, locals, expr):
# Locals is a list of valtypes [i32, i32, etc...]
# expr is an expression representing the actual code (InstructionBuilder)
locs = [ encode_unsigned(1) + bytes([loc]) for loc in locals ]
locs_code = encode_vector(locs)
func_code = locs_code + bytes(_flatten(expr))
code = encode_unsigned(len(func_code)) + func_code
self.code_section.append(code)
def add_data(self, memidx, expr, data):
# data is bytes
        code = encode_unsigned(memidx) + bytes(_flatten(expr)) + encode_vector([data[i:i+1] for i in range(len(data))])
self.data_section.append(code)
def _encode_section_vector(self, sectionid, contents):
if not contents:
return b''
contents_code = encode_vector(contents)
code = bytes([sectionid]) + encode_unsigned(len(contents_code)) + contents_code
return code
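    # Example (added sketch): a section is one id byte, a LEB128 byte count,
    # then the contents.  A type section holding the single signature i32 -> i32:
    #
    #     self._encode_section_vector(1, [b'\x60\x01\x7f\x01\x7f'])
    #     # -> b'\x01\x06\x01\x60\x01\x7f\x01\x7f'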
def encode(self):
for func in self.functions:
self.add_function_code(func._locals, func._code)
if func._export:
self.export_function(func._name, func._idx)
# Encode the whole module
code = b'\x00\x61\x73\x6d\x01\x00\x00\x00'
code += self._encode_section_vector(1, self.type_section)
code += self._encode_section_vector(2, self.import_section)
code += self._encode_section_vector(3, self.function_section)
code += self._encode_section_vector(4, self.table_section)
code += self._encode_section_vector(5, self.memory_section)
code += self._encode_section_vector(6, self.global_section)
code += self._encode_section_vector(7, self.export_section)
if self.start_section:
            code += bytes([8]) + encode_unsigned(len(self.start_section)) + self.start_section
code += self._encode_section_vector(9, self.element_section)
code += self._encode_section_vector(10, self.code_section)
code += self._encode_section_vector(11, self.data_section)
return code
def write_wasm(self, modname):
with open(f'{modname}.wasm', 'wb') as f:
f.write(self.encode())
def write_html(self, modname):
with open(f'{modname}.html', 'wt') as f:
f.write(js_template.format(
module=modname,
imports=json.dumps(self.js_imports, indent=4),
exports=self.js_exports,
exports_html=self.html_exports,
)
)
js_template = '''
<html>
<body>
<script>
var imports = {imports};
fetch("{module}.wasm").then(response =>
response.arrayBuffer()
).then(bytes =>
WebAssembly.instantiate(bytes, imports)
).then(results => {{
{exports}
}});
</script>
<h3>module {module}</h3>
<p>
The following exports are made. Access from the JS console.
</p>
{exports_html}
</body>
</html>
'''
def test1():
mod = Module()
# An external function import. Note: All imports MUST go first.
# Indexing affects function indexing for functions defined in the module.
# Import some functions from JS
# math_sin = mod.import_function('util', 'sin', [f64], [f64])
# math_cos = mod.import_function('util', 'cos', [f64], [f64])
# Import a function from another module entirely
# fact = mod.import_function('recurse', 'fact', [i32], [i32])
# Import a global variable (from JS?)
# FOO = mod.import_global('util', 'FOO', f64)
# A more complicated function
dsquared_func = mod.add_function('dsquared', [f64, f64], [f64])
dsquared_func.local.get(0)
dsquared_func.local.get(0)
dsquared_func.f64.mul()
dsquared_func.local.get(1)
dsquared_func.local.get(1)
dsquared_func.f64.mul()
dsquared_func.f64.add()
dsquared_func.block_end()
# A function calling another function
distance = mod.add_function('distance', [f64, f64], [f64])
distance.local.get(0)
distance.local.get(1)
distance.call(dsquared_func)
distance.f64.sqrt()
distance.block_end()
# A function calling out to JS
# ext = mod.add_function('ext', [f64, f64], [f64])
# ext.local.get(0)
# ext.call(math_sin)
# ext.local.get(1)
# ext.call(math_cos)
# ext.f64.add()
# ext.block_end()
# A function calling across modules
# tenf = mod.add_function('tenfact', [i32], [i32])
# tenf.local.get(0)
# tenf.call(fact)
# tenf.i32.const(10)
# tenf.i32.mul()
# tenf.block_end()
# A function accessing an imported global variable
# gf = mod.add_function('gf', [f64], [f64])
# gf.global_.get(FOO)
# gf.local.get(0)
# gf.f64.mul()
# gf.block_end()
# Memory
mod.add_memory(1)
mod.export_memory('memory', 0)
# Function that returns a byte value
getval = mod.add_function('getval', [i32], [i32])
getval.local.get(0)
getval.i32.load8_u(0, 0)
getval.block_end()
# Function that sets a byte value
setval = mod.add_function('setval', [i32,i32], [i32])
setval.local.get(0) # Memory address
setval.local.get(1) # value
setval.i32.store8(0,0)
setval.i32.const(1)
setval.block_end()
return mod
def test2():
mod = Module()
fact = mod.add_function('fact', [i32], [i32])
fact.local.get(0)
fact.i32.const(1)
fact.i32.lt_s()
fact.if_start(i32)
fact.i32.const(1)
fact.else_start()
fact.local.get(0)
fact.local.get(0)
fact.i32.const(1)
fact.i32.sub()
fact.call(fact)
fact.i32.mul()
fact.block_end()
fact.block_end()
return mod
if __name__ == '__main__':
mod = test1()
mod.write_wasm('test')
mod.write_html('test')

pyproject.toml Normal file

@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

setup.cfg Normal file

@ -0,0 +1,18 @@
[metadata]
name = sly
version = 0.5
url = https://github.com/dabeaz/sly
author = David Beazley
author_email = "David Beazley" <dave@dabeaz.com>
description = "SLY - Sly Lex Yacc"
long_description = "SLY is an implementation of lex and yacc. No longer maintained on PyPI. Latest version on GitHub."
license = BSD-3-Clause
license_files = LICENSE
classifiers =
License :: OSI Approved :: BSD License
[options]
package_dir =
=src
packages = sly


@ -1,28 +0,0 @@
try:
from setuptools import setup
except ImportError:
from distutils.core import setup
tests_require = ['pytest']
setup(name = "sly",
description="SLY - Sly Lex Yacc",
long_description = """
SLY is an implementation of lex and yacc for Python 3.
""",
license="""BSD""",
version = "0.1",
author = "David Beazley",
author_email = "dave@dabeaz.com",
maintainer = "David Beazley",
maintainer_email = "dave@dabeaz.com",
url = "https://github.com/dabeaz/sly",
packages = ['sly'],
tests_require = tests_require,
extras_require = {
'test': tests_require,
},
classifiers = [
'Programming Language :: Python :: 3',
]
)


@ -1,257 +0,0 @@
# -----------------------------------------------------------------------------
# sly: lex.py
#
# Copyright (C) 2016
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
# endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
__version__ = '0.1'
__all__ = ['Lexer']
import re
from collections import OrderedDict
class LexError(Exception):
'''
Exception raised if an invalid character is encountered and no default
error handler function is defined. The .text attribute of the exception
contains all remaining untokenized text. The .error_index is the index
location of the error.
'''
def __init__(self, message, text, error_index):
self.args = (message,)
self.text = text
self.error_index = error_index
class PatternError(Exception):
'''
Exception raised if there's some kind of problem with the specified
regex patterns in the lexer.
'''
pass
class LexerBuildError(Exception):
'''
Exception raised if there's some sort of problem building the lexer.
'''
pass
class Token(object):
'''
Representation of a single token.
'''
__slots__ = ('type', 'value', 'lineno', 'index')
def __repr__(self):
        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
class LexerMetaDict(OrderedDict):
'''
    Special dictionary that prohibits duplicate definitions in lexer specifications.
'''
def __setitem__(self, key, value):
if key in self and not isinstance(value, property):
if isinstance(self[key], str):
if callable(value):
value.pattern = self[key]
else:
raise AttributeError(f'Name {key} redefined')
super().__setitem__(key, value)
class LexerMeta(type):
'''
Metaclass for collecting lexing rules
'''
@classmethod
def __prepare__(meta, *args, **kwargs):
d = LexerMetaDict()
def _(pattern, *extra):
patterns = [pattern, *extra]
def decorate(func):
pattern = '|'.join(f'({pat})' for pat in patterns )
if hasattr(func, 'pattern'):
func.pattern = pattern + '|' + func.pattern
else:
func.pattern = pattern
return func
return decorate
d['_'] = _
return d
def __new__(meta, clsname, bases, attributes):
del attributes['_']
cls = super().__new__(meta, clsname, bases, attributes)
cls._build(list(attributes.items()))
return cls
class Lexer(metaclass=LexerMeta):
# These attributes may be defined in subclasses
tokens = set()
literals = set()
ignore = ''
reflags = 0
# These attributes are constructed automatically by the associated metaclass
_master_re = None
_token_names = set()
_literals = set()
_token_funcs = { }
_ignored_tokens = set()
@classmethod
def _collect_rules(cls, definitions):
'''
Collect all of the rules from class definitions that look like tokens
'''
rules = []
for key, value in definitions:
if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'pattern'):
rules.append((key, value))
return rules
@classmethod
def _build(cls, definitions):
'''
Build the lexer object from the collected tokens and regular expressions.
Validate the rules to make sure they look sane.
'''
if 'tokens' not in vars(cls):
raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')
cls._token_names = cls._token_names | set(cls.tokens)
cls._literals = cls._literals | set(cls.literals)
cls._ignored_tokens = set(cls._ignored_tokens)
cls._token_funcs = dict(cls._token_funcs)
parts = []
for tokname, value in cls._collect_rules(definitions):
if tokname.startswith('ignore_'):
tokname = tokname[7:]
cls._ignored_tokens.add(tokname)
if isinstance(value, str):
pattern = value
elif callable(value):
pattern = value.pattern
cls._token_funcs[tokname] = value
# Form the regular expression component
part = f'(?P<{tokname}>{pattern})'
# Make sure the individual regex compiles properly
try:
cpat = re.compile(part, cls.reflags)
except Exception as e:
raise PatternError(f'Invalid regex for token {tokname}') from e
# Verify that the pattern doesn't match the empty string
if cpat.match(''):
raise PatternError(f'Regex for token {tokname} matches empty input')
parts.append(part)
if not parts:
return
# Form the master regular expression
previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
        # Verify that the ignore and literals specifiers match the input type
if not isinstance(cls.ignore, str):
raise LexerBuildError('ignore specifier must be a string')
if not all(isinstance(lit, str) for lit in cls.literals):
raise LexerBuildError('literals must be specified as strings')
def tokenize(self, text, lineno=1, index=0):
# Local copies of frequently used values
_ignored_tokens = self._ignored_tokens
_master_re = self._master_re
_ignore = self.ignore
_token_funcs = self._token_funcs
_literals = self._literals
self.text = text
try:
while True:
try:
if text[index] in _ignore:
index += 1
continue
except IndexError:
break
tok = Token()
tok.lineno = lineno
tok.index = index
m = _master_re.match(text, index)
if m:
index = m.end()
tok.value = m.group()
tok.type = m.lastgroup
if tok.type in _token_funcs:
self.index = index
self.lineno = lineno
tok = _token_funcs[tok.type](self, tok)
index = self.index
lineno = self.lineno
if not tok:
continue
if tok.type in _ignored_tokens:
continue
yield tok
else:
# No match, see if the character is in literals
if text[index] in _literals:
tok.value = text[index]
tok.type = tok.value
index += 1
yield tok
else:
# A lexing error
self.index = index
self.lineno = lineno
self.error(text[index:])
index = self.index
lineno = self.lineno
# Set the final state of the lexer before exiting (even if exception)
finally:
self.text = text
self.index = index
self.lineno = lineno
# Default implementations of the error handler. May be changed in subclasses
def error(self, value):
raise LexError(f'Illegal character {value[0]!r} at index {self.index}', value, self.index)


@ -2,4 +2,5 @@
 from .lex import *
 from .yacc import *
+__version__ = "0.5"
 __all__ = [ *lex.__all__, *yacc.__all__ ]

src/sly/ast.py Normal file

@ -0,0 +1,25 @@
# sly/ast.py
import sys
class AST(object):
@classmethod
def __init_subclass__(cls, **kwargs):
mod = sys.modules[cls.__module__]
if not hasattr(cls, '__annotations__'):
return
hints = list(cls.__annotations__.items())
def __init__(self, *args, **kwargs):
if len(hints) != len(args):
raise TypeError(f'Expected {len(hints)} arguments')
for arg, (name, val) in zip(args, hints):
if isinstance(val, str):
val = getattr(mod, val)
if not isinstance(arg, val):
raise TypeError(f'{name} argument must be {val}')
setattr(self, name, arg)
cls.__init__ = __init__
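# Example usage (an added sketch; these node classes are hypothetical and not
# part of sly itself):
#
#     from sly.ast import AST
#
#     class Expr(AST):
#         pass
#
#     class Number(Expr):
#         value: int
#
#     class BinOp(Expr):
#         op: str
#         left: Expr
#         right: Expr
#
#     node = BinOp('+', Number(42), Number(37))
#     node.op, node.left.value      # -> ('+', 42)
#
# Each positional argument is checked against the corresponding annotation,
# and a TypeError is raised if the count or the types don't match.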

src/sly/docparse.py Normal file

@ -0,0 +1,60 @@
# docparse.py
#
# Support doc-string parsing classes
__all__ = [ 'DocParseMeta' ]
class DocParseMeta(type):
'''
Metaclass that processes the class docstring through a parser and
incorporates the result into the resulting class definition. This
allows Python classes to be defined with alternative syntax.
To use this class, you first need to define a lexer and parser:
from sly import Lexer, Parser
class MyLexer(Lexer):
...
class MyParser(Parser):
...
You then need to define a metaclass that inherits from DocParseMeta.
This class must specify the associated lexer and parser classes.
For example:
class MyDocParseMeta(DocParseMeta):
lexer = MyLexer
parser = MyParser
This metaclass is then used as a base for processing user-defined
classes:
class Base(metaclass=MyDocParseMeta):
pass
class Spam(Base):
"""
doc string is parsed
...
"""
It is expected that the MyParser() class would return a dictionary.
This dictionary is used to create the final class Spam in this example.
'''
@staticmethod
def __new__(meta, clsname, bases, clsdict):
if '__doc__' in clsdict:
lexer = meta.lexer()
parser = meta.parser()
lexer.cls_name = parser.cls_name = clsname
lexer.cls_qualname = parser.cls_qualname = clsdict['__qualname__']
lexer.cls_module = parser.cls_module = clsdict['__module__']
parsedict = parser.parse(lexer.tokenize(clsdict['__doc__']))
assert isinstance(parsedict, dict), 'Parser must return a dictionary'
clsdict.update(parsedict)
return super().__new__(meta, clsname, bases, clsdict)
@classmethod
def __init_subclass__(cls):
assert hasattr(cls, 'parser') and hasattr(cls, 'lexer')
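# Example (an added sketch using a hypothetical lexer/parser pair; not part of
# the original file).  Suppose KVParser parses lines of "name = value" pairs
# and returns them as a dict:
#
#     class KVDocParse(DocParseMeta):
#         lexer = KVLexer
#         parser = KVParser
#
#     class Config(metaclass=KVDocParse):
#         """
#         host = 'localhost'
#         port = 8080
#         """
#
# The parsed dictionary is merged into the class namespace, so Config.host
# and Config.port become ordinary class attributes.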

src/sly/lex.py Normal file

@ -0,0 +1,460 @@
# -----------------------------------------------------------------------------
# sly: lex.py
#
# Copyright (C) 2016 - 2018
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
# endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
__all__ = ['Lexer', 'LexerStateChange']
import re
import copy
class LexError(Exception):
'''
Exception raised if an invalid character is encountered and no default
error handler function is defined. The .text attribute of the exception
contains all remaining untokenized text. The .error_index is the index
location of the error.
'''
def __init__(self, message, text, error_index):
self.args = (message,)
self.text = text
self.error_index = error_index
class PatternError(Exception):
'''
Exception raised if there's some kind of problem with the specified
regex patterns in the lexer.
'''
pass
class LexerBuildError(Exception):
'''
Exception raised if there's some sort of problem building the lexer.
'''
pass
class LexerStateChange(Exception):
'''
Exception raised to force a lexing state change
'''
def __init__(self, newstate, tok=None):
self.newstate = newstate
self.tok = tok
class Token(object):
'''
Representation of a single token.
'''
__slots__ = ('type', 'value', 'lineno', 'index', 'end')
def __repr__(self):
return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'
class TokenStr(str):
@staticmethod
def __new__(cls, value, key=None, remap=None):
self = super().__new__(cls, value)
self.key = key
self.remap = remap
return self
# Implementation of TOKEN[value] = NEWTOKEN
def __setitem__(self, key, value):
if self.remap is not None:
self.remap[self.key, key] = value
# Implementation of del TOKEN[value]
def __delitem__(self, key):
if self.remap is not None:
self.remap[self.key, key] = self.key
class _Before:
def __init__(self, tok, pattern):
self.tok = tok
self.pattern = pattern
class LexerMetaDict(dict):
'''
Special dictionary that prohibits duplicate definitions in lexer specifications.
'''
def __init__(self):
self.before = { }
self.delete = [ ]
self.remap = { }
def __setitem__(self, key, value):
if isinstance(value, str):
value = TokenStr(value, key, self.remap)
if isinstance(value, _Before):
self.before[key] = value.tok
value = TokenStr(value.pattern, key, self.remap)
if key in self and not isinstance(value, property):
prior = self[key]
if isinstance(prior, str):
if callable(value):
value.pattern = prior
else:
raise AttributeError(f'Name {key} redefined')
super().__setitem__(key, value)
def __delitem__(self, key):
self.delete.append(key)
if key not in self and key.isupper():
pass
else:
return super().__delitem__(key)
def __getitem__(self, key):
if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
return TokenStr(key, key, self.remap)
else:
return super().__getitem__(key)
class LexerMeta(type):
'''
Metaclass for collecting lexing rules
'''
@classmethod
def __prepare__(meta, name, bases):
d = LexerMetaDict()
def _(pattern, *extra):
patterns = [pattern, *extra]
def decorate(func):
pattern = '|'.join(f'({pat})' for pat in patterns )
if hasattr(func, 'pattern'):
func.pattern = pattern + '|' + func.pattern
else:
func.pattern = pattern
return func
return decorate
d['_'] = _
d['before'] = _Before
return d
def __new__(meta, clsname, bases, attributes):
del attributes['_']
del attributes['before']
# Create attributes for use in the actual class body
cls_attributes = { str(key): str(val) if isinstance(val, TokenStr) else val
for key, val in attributes.items() }
cls = super().__new__(meta, clsname, bases, cls_attributes)
# Attach various metadata to the class
cls._attributes = dict(attributes)
cls._remap = attributes.remap
cls._before = attributes.before
cls._delete = attributes.delete
cls._build()
return cls
class Lexer(metaclass=LexerMeta):
# These attributes may be defined in subclasses
tokens = set()
literals = set()
ignore = ''
reflags = 0
regex_module = re
_token_names = set()
_token_funcs = {}
_ignored_tokens = set()
_remapping = {}
_delete = {}
_remap = {}
# Internal attributes
__state_stack = None
__set_state = None
@classmethod
def _collect_rules(cls):
# Collect all of the rules from class definitions that look like token
# information. There are a few things that govern this:
#
# 1. Any definition of the form NAME = str is a token if NAME is
        #     defined in the tokens set.
#
# 2. Any definition of the form ignore_NAME = str is a rule for an ignored
# token.
#
# 3. Any function defined with a 'pattern' attribute is treated as a rule.
# Such functions can be created with the @_ decorator or by defining
# function with the same name as a previously defined string.
#
# This function is responsible for keeping rules in order.
# Collect all previous rules from base classes
rules = []
for base in cls.__bases__:
if isinstance(base, LexerMeta):
rules.extend(base._rules)
# Dictionary of previous rules
existing = dict(rules)
for key, value in cls._attributes.items():
if (key in cls._token_names) or key.startswith('ignore_') or hasattr(value, 'pattern'):
if callable(value) and not hasattr(value, 'pattern'):
raise LexerBuildError(f"function {value} doesn't have a regex pattern")
if key in existing:
# The definition matches something that already existed in the base class.
# We replace it, but keep the original ordering
n = rules.index((key, existing[key]))
rules[n] = (key, value)
existing[key] = value
elif isinstance(value, TokenStr) and key in cls._before:
before = cls._before[key]
if before in existing:
# Position the token before another specified token
n = rules.index((before, existing[before]))
rules.insert(n, (key, value))
else:
# Put at the end of the rule list
rules.append((key, value))
existing[key] = value
else:
rules.append((key, value))
existing[key] = value
elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore', 'literals'}:
raise LexerBuildError(f'{key} does not match a name in tokens')
# Apply deletion rules
rules = [ (key, value) for key, value in rules if key not in cls._delete ]
cls._rules = rules
@classmethod
def _build(cls):
'''
Build the lexer object from the collected tokens and regular expressions.
Validate the rules to make sure they look sane.
'''
if 'tokens' not in vars(cls):
raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')
# Pull definitions created for any parent classes
cls._token_names = cls._token_names | set(cls.tokens)
cls._ignored_tokens = set(cls._ignored_tokens)
cls._token_funcs = dict(cls._token_funcs)
cls._remapping = dict(cls._remapping)
for (key, val), newtok in cls._remap.items():
if key not in cls._remapping:
cls._remapping[key] = {}
cls._remapping[key][val] = newtok
remapped_toks = set()
for d in cls._remapping.values():
remapped_toks.update(d.values())
undefined = remapped_toks - set(cls._token_names)
if undefined:
missing = ', '.join(undefined)
raise LexerBuildError(f'{missing} not included in token(s)')
cls._collect_rules()
parts = []
for tokname, value in cls._rules:
if tokname.startswith('ignore_'):
tokname = tokname[7:]
cls._ignored_tokens.add(tokname)
if isinstance(value, str):
pattern = value
elif callable(value):
cls._token_funcs[tokname] = value
pattern = getattr(value, 'pattern')
# Form the regular expression component
part = f'(?P<{tokname}>{pattern})'
# Make sure the individual regex compiles properly
try:
cpat = cls.regex_module.compile(part, cls.reflags)
except Exception as e:
raise PatternError(f'Invalid regex for token {tokname}') from e
# Verify that the pattern doesn't match the empty string
if cpat.match(''):
raise PatternError(f'Regex for token {tokname} matches empty input')
parts.append(part)
if not parts:
return
# Form the master regular expression
#previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
# cls._master_re = cls.regex_module.compile('|'.join(parts) + previous, cls.reflags)
cls._master_re = cls.regex_module.compile('|'.join(parts), cls.reflags)
        # Verify that the ignore and literals specifiers match the input type
if not isinstance(cls.ignore, str):
raise LexerBuildError('ignore specifier must be a string')
if not all(isinstance(lit, str) for lit in cls.literals):
raise LexerBuildError('literals must be specified as strings')
def begin(self, cls):
'''
Begin a new lexer state
'''
assert isinstance(cls, LexerMeta), "state must be a subclass of Lexer"
if self.__set_state:
self.__set_state(cls)
self.__class__ = cls
def push_state(self, cls):
'''
Push a new lexer state onto the stack
'''
if self.__state_stack is None:
self.__state_stack = []
self.__state_stack.append(type(self))
self.begin(cls)
def pop_state(self):
'''
Pop a lexer state from the stack
'''
self.begin(self.__state_stack.pop())
def tokenize(self, text, lineno=1, index=0):
_ignored_tokens = _master_re = _ignore = _token_funcs = _literals = _remapping = None
# --- Support for state changes
def _set_state(cls):
nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping
_ignored_tokens = cls._ignored_tokens
_master_re = cls._master_re
_ignore = cls.ignore
_token_funcs = cls._token_funcs
_literals = cls.literals
_remapping = cls._remapping
self.__set_state = _set_state
_set_state(type(self))
# --- Support for backtracking
_mark_stack = []
def _mark():
_mark_stack.append((type(self), index, lineno))
self.mark = _mark
def _accept():
_mark_stack.pop()
self.accept = _accept
def _reject():
nonlocal index, lineno
cls, index, lineno = _mark_stack[-1]
_set_state(cls)
self.reject = _reject
# --- Main tokenization function
self.text = text
try:
while True:
try:
if text[index] in _ignore:
index += 1
continue
except IndexError:
return
tok = Token()
tok.lineno = lineno
tok.index = index
m = _master_re.match(text, index)
if m:
tok.end = index = m.end()
tok.value = m.group()
tok.type = m.lastgroup
if tok.type in _remapping:
tok.type = _remapping[tok.type].get(tok.value, tok.type)
if tok.type in _token_funcs:
self.index = index
self.lineno = lineno
tok = _token_funcs[tok.type](self, tok)
index = self.index
lineno = self.lineno
if not tok:
continue
if tok.type in _ignored_tokens:
continue
yield tok
else:
# No match, see if the character is in literals
if text[index] in _literals:
tok.value = text[index]
tok.end = index + 1
tok.type = tok.value
index += 1
yield tok
else:
# A lexing error
self.index = index
self.lineno = lineno
tok.type = 'ERROR'
tok.value = text[index:]
tok = self.error(tok)
if tok is not None:
tok.end = self.index
yield tok
index = self.index
lineno = self.lineno
# Set the final state of the lexer before exiting (even if exception)
finally:
self.text = text
self.index = index
self.lineno = lineno
# Default implementations of the error handler. May be changed in subclasses
def error(self, t):
raise LexError(f'Illegal character {t.value[0]!r} at index {self.index}', t.value, self.index)
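# Example (an added sketch, not part of the original file) of the remapping
# and ordering hooks defined above:
#
#     class CalcLexer(Lexer):
#         tokens = { NAME, IF, NUMBER, PLUS, PLUSPLUS }
#         ignore = ' \t'
#
#         NAME     = r'[a-zA-Z_][a-zA-Z0-9_]*'
#         NAME['if'] = IF                      # remap: a NAME whose value is 'if' becomes IF
#         NUMBER   = r'\d+'
#         PLUS     = r'\+'
#         PLUSPLUS = before(PLUS, r'\+\+')     # make sure '++' is tried before '+'
#
# The NAME['if'] assignment feeds the _remap table used by _build(), and
# before() feeds _before so that _collect_rules() positions the rule ahead of
# an existing one.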


@ -1,7 +1,7 @@
# -----------------------------------------------------------------------------
# sly: yacc.py
#
-# Copyright (C) 2016-2017
+# Copyright (C) 2016-2018
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
@ -33,9 +33,8 @@
import sys
import inspect
-from collections import OrderedDict, defaultdict
+from collections import OrderedDict, defaultdict, Counter
-__version__ = '0.1'
__all__ = [ 'Parser' ]
class YaccError(Exception):
@ -55,12 +54,12 @@ ERROR_COUNT = 3 # Number of symbols that must be shifted to leave
MAXINT = sys.maxsize
# This object is a stand-in for a logging object created by the
-# logging module. PLY will use this by default to create things
+# logging module. SLY will use this by default to create things
# such as the parser.out file. If a user wants more detailed
# information, they can create their own logging object and pass
-# it into PLY.
+# it into SLY.
-class PlyLogger(object):
+class SlyLogger(object):
    def __init__(self, f):
        self.f = f
@ -103,6 +102,7 @@ class YaccSymbol:
# ----------------------------------------------------------------------
class YaccProduction:
+    __slots__ = ('_slice', '_namemap', '_stack')
    def __init__(self, s, stack=None):
        self._slice = s
        self._namemap = { }
@ -126,8 +126,6 @@ class YaccProduction:
    @property
    def lineno(self):
        for tok in self._slice:
-            if isinstance(tok, YaccSymbol):
-                continue
            lineno = getattr(tok, 'lineno', None)
            if lineno:
                return lineno
@ -136,21 +134,32 @@ class YaccProduction:
    @property
    def index(self):
        for tok in self._slice:
-            if isinstance(tok, YaccSymbol):
-                continue
            index = getattr(tok, 'index', None)
-            if index:
+            if index is not None:
                return index
        raise AttributeError('No index attribute found')
+    @property
+    def end(self):
+        result = None
+        for tok in self._slice:
+            r = getattr(tok, 'end', None)
+            if r:
+                result = r
+        return result
    def __getattr__(self, name):
-        return self._slice[self._namemap[name]].value
+        if name in self._namemap:
+            return self._namemap[name](self._slice)
+        else:
+            nameset = '{' + ', '.join(self._namemap) + '}'
+            raise AttributeError(f'No symbol {name}. Must be one of {nameset}.')
    def __setattr__(self, name, value):
-        if name[0:1] == '_' or name not in self._namemap:
+        if name[:1] == '_':
            super().__setattr__(name, value)
        else:
-            self._slice[self._namemap[name]].value = value
+            raise AttributeError(f"Can't reassign the value of attribute {name!r}")
# -----------------------------------------------------------------------------
# === Grammar Representation ===
@ -205,16 +214,36 @@ class Production(object):
            if s not in self.usyms:
                self.usyms.append(s)
-        # Create a dict mapping symbol names to indices
-        m = {}
-        for key, indices in symmap.items():
-            if len(indices) == 1:
-                m[key] = indices[0]
-            else:
-                for n, index in enumerate(indices):
-                    m[key+str(n)] = index
-        self.namemap = m
+        # Create a name mapping
+        # First determine (in advance) if there are duplicate names
+        namecount = defaultdict(int)
+        for key in self.prod:
+            namecount[key] += 1
+            if key in _name_aliases:
+                for key in _name_aliases[key]:
+                    namecount[key] += 1
+        # Now, walk through the names and generate accessor functions
+        nameuse = defaultdict(int)
+        namemap = { }
+        for index, key in enumerate(self.prod):
+            if namecount[key] > 1:
+                k = f'{key}{nameuse[key]}'
+                nameuse[key] += 1
+            else:
+                k = key
+            namemap[k] = lambda s,i=index: s[i].value
+            if key in _name_aliases:
+                for n, alias in enumerate(_name_aliases[key]):
+                    if namecount[alias] > 1:
+                        k = f'{alias}{nameuse[alias]}'
+                        nameuse[alias] += 1
+                    else:
+                        k = alias
+                    # The value is either a list (for repetition) or a tuple for optional
+                    namemap[k] = lambda s,i=index,n=n: ([x[n] for x in s[i].value]) if isinstance(s[i].value, list) else s[i].value[n]
+        self.namemap = namemap
        # List of all LR items for the production
        self.lr_items = []
@ -386,7 +415,7 @@ class Grammar(object):
        if term in self.Precedence:
            raise GrammarError(f'Precedence already specified for terminal {term!r}')
        if assoc not in ['left', 'right', 'nonassoc']:
-            raise GrammarError("Associativity must be one of 'left','right', or 'nonassoc'")
+            raise GrammarError(f"Associativity of {term!r} must be one of 'left','right', or 'nonassoc'")
        self.Precedence[term] = (assoc, level)
    # -----------------------------------------------------------------------------
@ -482,6 +511,9 @@ class Grammar(object):
    # -----------------------------------------------------------------------------
    def set_start(self, start=None):
+        if callable(start):
+            start = start.__name__
        if not start:
            start = self.Productions[1].name
@ -1410,7 +1442,7 @@ class LRTable(object):
                            if not rlevel:
                                descrip.append(f'  ! shift/reduce conflict for {a} resolved as shift')
                                self.sr_conflicts.append((st, a, 'shift'))
-                elif r < 0:
+                elif r <= 0:
                    # Reduce/reduce conflict.   In this case, we favor the rule
                    # that was defined first in the grammar file
                    oldp = Productions[-r]
@ -1447,7 +1479,7 @@ class LRTable(object):
                        if r > 0:
                            if r != j:
                                raise LALRError(f'Shift/shift conflict in state {st}')
-                        elif r < 0:
+                        elif r <= 0:
                            # Do a precedence check.
                            #   - if precedence of reduce rule is higher, we reduce.
                            #   - if precedence of reduce is same and left assoc, we reduce.
@ -1544,34 +1576,242 @@ def _collect_grammar_rules(func):
        lineno = unwrapped.__code__.co_firstlineno
        for rule, lineno in zip(func.rules, range(lineno+len(func.rules)-1, 0, -1)):
            syms = rule.split()
+            ebnf_prod = []
+            while ('{' in syms) or ('[' in syms):
+                for s in syms:
+                    if s == '[':
+                        syms, prod = _replace_ebnf_optional(syms)
+                        ebnf_prod.extend(prod)
+                        break
+                    elif s == '{':
+                        syms, prod = _replace_ebnf_repeat(syms)
+                        ebnf_prod.extend(prod)
+                        break
+                    elif '|' in s:
+                        syms, prod = _replace_ebnf_choice(syms)
+                        ebnf_prod.extend(prod)
+                        break
            if syms[1:2] == [':'] or syms[1:2] == ['::=']:
                grammar.append((func, filename, lineno, syms[0], syms[2:]))
            else:
                grammar.append((func, filename, lineno, prodname, syms))
+            grammar.extend(ebnf_prod)
        func = getattr(func, 'next_func', None)
    return grammar
-class ParserMetaDict(OrderedDict):
+# Replace EBNF repetition
def _replace_ebnf_repeat(syms):
syms = list(syms)
first = syms.index('{')
end = syms.index('}', first)
# Look for choices inside
repeated_syms = syms[first+1:end]
if any('|' in sym for sym in repeated_syms):
repeated_syms, prods = _replace_ebnf_choice(repeated_syms)
else:
prods = []
symname, moreprods = _generate_repeat_rules(repeated_syms)
syms[first:end+1] = [symname]
return syms, prods + moreprods
def _replace_ebnf_optional(syms):
syms = list(syms)
first = syms.index('[')
end = syms.index(']', first)
symname, prods = _generate_optional_rules(syms[first+1:end])
syms[first:end+1] = [symname]
return syms, prods
def _replace_ebnf_choice(syms):
syms = list(syms)
newprods = [ ]
n = 0
while n < len(syms):
if '|' in syms[n]:
symname, prods = _generate_choice_rules(syms[n].split('|'))
syms[n] = symname
newprods.extend(prods)
n += 1
return syms, newprods
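As an aside (a sketch, not part of the diff), the effect of the scan in _collect_grammar_rules and the helpers above can be seen by calling one of them directly; this assumes the module is importable as sly.yacc and that the generated name depends on an internal counter:

from sly.yacc import _replace_ebnf_optional

syms = 'statement : ID "(" [ arglist ] ")"'.split()
new_syms, extra = _replace_ebnf_optional(syms)
print(new_syms)    # ['statement', ':', 'ID', '"("', '_1_arglist_optional', '")"'] -- counter may differ
print(len(extra))  # 2: the generated "optional : arglist" and "optional : <empty>" productions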
# Generate grammar rules for repeated items
_gencount = 0
# Dictionary mapping generated EBNF rule names to the symbols they stand for.
_name_aliases = { }
def _sanitize_symbols(symbols):
for sym in symbols:
if sym.startswith("'"):
yield str(hex(ord(sym[1])))
elif sym.isidentifier():
yield sym
else:
yield sym.encode('utf-8').hex()
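For illustration (not part of the diff): quoted literals become the hex of their character code and anything that is not a valid identifier is hex-encoded, so the generated rule names stay legal Python identifiers:

from sly.yacc import _sanitize_symbols

print(list(_sanitize_symbols(['expr', "','", 'COMMA|PLUS'])))
# ['expr', '0x2c', '434f4d4d417c504c5553']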
def _generate_repeat_rules(symbols):
'''
Symbols is a list of grammar symbols [ symbols ]. This
generates code corresponding to the following grammar constructions:
@('repeat : many')
def repeat(self, p):
return p.many
@('repeat :')
def repeat(self, p):
return []
@('many : many symbols')
def many(self, p):
p.many.append(symbols)
return p.many
@('many : symbols')
def many(self, p):
return [ p.symbols ]
'''
global _gencount
_gencount += 1
basename = f'_{_gencount}_' + '_'.join(_sanitize_symbols(symbols))
name = f'{basename}_repeat'
oname = f'{basename}_items'
iname = f'{basename}_item'
symtext = ' '.join(symbols)
_name_aliases[name] = symbols
productions = [ ]
_ = _decorator
@_(f'{name} : {oname}')
def repeat(self, p):
return getattr(p, oname)
@_(f'{name} : ')
def repeat2(self, p):
return []
productions.extend(_collect_grammar_rules(repeat))
productions.extend(_collect_grammar_rules(repeat2))
@_(f'{oname} : {oname} {iname}')
def many(self, p):
items = getattr(p, oname)
items.append(getattr(p, iname))
return items
@_(f'{oname} : {iname}')
def many2(self, p):
return [ getattr(p, iname) ]
productions.extend(_collect_grammar_rules(many))
productions.extend(_collect_grammar_rules(many2))
@_(f'{iname} : {symtext}')
def item(self, p):
return tuple(p)
productions.extend(_collect_grammar_rules(item))
return name, productions
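A brief sketch (not part of the diff) of what this generator produces at parse time: each matched group becomes a tuple of symbol values and the repeat symbol collects them into a list:

from sly.yacc import _generate_repeat_rules

name, prods = _generate_repeat_rules(['COMMA', 'expr'])
# name is a generated nonterminal such as '_2_COMMA_expr_repeat' (the counter varies);
# prods holds the repeat/many/item productions shown in the docstring above.  At parse
# time the symbol's value is a list of (COMMA, expr) value tuples, or [] when empty.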
def _generate_optional_rules(symbols):
'''
Symbols is a list of grammar symbols [ symbols ]. This
generates code corresponding to the following grammar constructions:
@('optional : symbols')
def optional(self, p):
return p.symbols
@('optional :')
def optional(self, p):
return None
'''
global _gencount
_gencount += 1
basename = f'_{_gencount}_' + '_'.join(_sanitize_symbols(symbols))
name = f'{basename}_optional'
symtext = ' '.join(symbols)
_name_aliases[name] = symbols
productions = [ ]
_ = _decorator
no_values = (None,) * len(symbols)
@_(f'{name} : {symtext}')
def optional(self, p):
return tuple(p)
@_(f'{name} : ')
def optional2(self, p):
return no_values
productions.extend(_collect_grammar_rules(optional))
productions.extend(_collect_grammar_rules(optional2))
return name, productions
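Similarly for optional groups (sketch only): the generated rule matches the bracketed symbols or nothing, and the empty alternative yields a tuple of None values of the same length:

from sly.yacc import _generate_optional_rules

name, prods = _generate_optional_rules(['COMMA', 'expr'])
# When '[ COMMA expr ]' is absent from the input the empty alternative returns
# (None, None); otherwise the rule's value is the tuple of matched values.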
def _generate_choice_rules(symbols):
'''
Symbols is a list of grammar symbols such as [ 'PLUS', 'MINUS' ].
This generates code corresponding to the following construction:
@('PLUS', 'MINUS')
def choice(self, p):
return p[0]
'''
global _gencount
_gencount += 1
basename = f'_{_gencount}_' + '_'.join(_sanitize_symbols(symbols))
name = f'{basename}_choice'
_ = _decorator
productions = [ ]
def choice(self, p):
return p[0]
choice.__name__ = name
choice = _(*symbols)(choice)
productions.extend(_collect_grammar_rules(choice))
return name, productions
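And for choices (sketch only): every alternative becomes a production of one generated nonterminal whose action returns whichever value matched:

from sly.yacc import _generate_choice_rules

name, prods = _generate_choice_rules(['PLUS', 'MINUS'])
# name matches either PLUS or MINUS and its action returns p[0], the matched value,
# which is why a rule like 'term { PLUS|MINUS term }' sees (operator, operand) pairs.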
- class ParserMetaDict(OrderedDict):
+ class ParserMetaDict(dict):
'''
Dictionary that allows decorated grammar rule functions to be overloaded
'''
def __setitem__(self, key, value):
if key in self and callable(value) and hasattr(value, 'rules'):
value.next_func = self[key]
if not hasattr(value.next_func, 'rules'):
raise GrammarError(f'Redefinition of {key}. Perhaps an earlier {key} is missing @_')
super().__setitem__(key, value)
def __getitem__(self, key):
if key not in self and key.isupper() and key[:1] != '_':
return key.upper()
else:
return super().__getitem__(key)
- def _(rule, *extra):
+ def _decorator(rule, *extra):
rules = [rule, *extra]
def decorate(func):
func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
return func
return decorate
class ParserMeta(type):
@classmethod
def __prepare__(meta, *args, **kwargs):
d = ParserMetaDict()
- d['_'] = _
+ d['_'] = _decorator
return d
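A sketch (not from the diff) of what the new __getitem__ hook buys: inside a Parser class body, any undefined all-uppercase name simply evaluates to its own string, which is what lets tokens and precedence use bare token names:

from sly.yacc import ParserMetaDict

d = ParserMetaDict()
print(d['PLUS'])       # -> 'PLUS': undefined all-uppercase names resolve to their own string

# which is what allows a Parser class body to say, for example:
#     precedence = (('left', PLUS, MINUS), ('right', UMINUS))
# without quoting the token names.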
def __new__(meta, clsname, bases, attributes):
@@ -1581,8 +1821,11 @@ class ParserMeta(type):
return cls
class Parser(metaclass=ParserMeta):
# Automatic tracking of position information
track_positions = True
# Logging object where debugging/diagnostic messages are sent
- log = PlyLogger(sys.stderr)
+ log = SlyLogger(sys.stderr)
# Debugging filename where parsetab.out data can be written
debugfile = None
@@ -1650,11 +1893,10 @@ class Parser(metaclass=ParserMeta):
Build the grammar from the grammar rules
'''
grammar_rules = []
- fail = False
+ errors = ''
# Check for non-empty symbols
if not rules:
- cls.log.error('no grammar rules are defined')
- return False
+ raise YaccError('No grammar rules are defined')
grammar = Grammar(cls.tokens)
@@ -1663,8 +1905,7 @@ class Parser(metaclass=ParserMeta):
try:
grammar.set_precedence(term, assoc, level)
except GrammarError as e:
- cls.log.error(str(e))
- fail = True
+ errors += f'{e}\n'
for name, func in rules:
try:
@@ -1673,25 +1914,22 @@ class Parser(metaclass=ParserMeta):
try:
grammar.add_production(prodname, syms, pfunc, rulefile, ruleline)
except GrammarError as e:
- cls.log.error(str(e))
- fail = True
+ errors += f'{e}\n'
except SyntaxError as e:
- cls.log.error(str(e))
- fail = True
+ errors += f'{e}\n'
try:
grammar.set_start(getattr(cls, 'start', None))
except GrammarError as e:
- cls.log.error(str(e))
- fail = True
+ errors += f'{e}\n'
undefined_symbols = grammar.undefined_symbols()
for sym, prod in undefined_symbols:
- cls.log.error(f'%s:%d: Symbol %r used, but not defined as a token or a rule', prod.file, prod.line, sym)
- fail = True
+ errors += '%s:%d: Symbol %r used, but not defined as a token or a rule\n' % (prod.file, prod.line, sym)
unused_terminals = grammar.unused_terminals()
- for term in unused_terminals:
- cls.log.warning('Token %r defined, but not used', term)
+ if unused_terminals:
+ unused_str = '{' + ','.join(unused_terminals) + '}'
+ cls.log.warning(f'Token{"(s)" if len(unused_terminals) >1 else ""} {unused_str} defined, but not used')
unused_rules = grammar.unused_rules()
for prod in unused_rules:
@@ -1711,18 +1949,18 @@ class Parser(metaclass=ParserMeta):
for u in unreachable:
cls.log.warning('Symbol %r is unreachable', u)
if len(undefined_symbols) == 0:
infinite = grammar.infinite_cycles()
for inf in infinite:
- cls.log.error('Infinite recursion detected for symbol %r', inf)
- fail = True
+ errors += 'Infinite recursion detected for symbol %r\n' % inf
unused_prec = grammar.unused_precedence()
for term, assoc in unused_prec:
- cls.log.error('Precedence rule %r defined for unknown symbol %r', assoc, term)
- fail = True
+ errors += 'Precedence rule %r defined for unknown symbol %r\n' % (assoc, term)
cls._grammar = grammar
- return not fail
+ if errors:
+ raise YaccError('Unable to build grammar.\n'+errors)
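A sketch of the effect of this change (assumes YaccError is importable from sly.yacc; the BrokenParser name and grammar are hypothetical): grammar problems now surface as an exception at class-definition time instead of logged errors and a False return value.

from sly import Parser
from sly.yacc import YaccError

try:
    class BrokenParser(Parser):        # deliberately broken grammar
        tokens = { NUMBER }

        @_('expr PLUS expr')           # PLUS is never defined as a token or a rule
        def expr(self, p):
            return p.expr0
except YaccError as e:
    print(e)                           # 'Unable to build grammar.' plus the collected error messages
# (a warning about the unused NUMBER token is also logged to stderr)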
@classmethod
def __build_lrtables(cls):
@@ -1776,8 +2014,7 @@ class Parser(metaclass=ParserMeta):
raise YaccError('Invalid parser specification')
# Build the underlying grammar object
- if not cls.__build_grammar(rules):
- raise YaccError('Invalid grammar')
+ cls.__build_grammar(rules)
# Build the LR tables
if not cls.__build_lrtables():
@@ -1800,11 +2037,11 @@ class Parser(metaclass=ParserMeta):
if token:
lineno = getattr(token, 'lineno', 0)
if lineno:
- sys.stderr.write(f'yacc: Syntax error at line {lineno}, token={token.type}\n')
+ sys.stderr.write(f'sly: Syntax error at line {lineno}, token={token.type}\n')
else:
- sys.stderr.write(f'yacc: Syntax error, token={token.type}')
+ sys.stderr.write(f'sly: Syntax error, token={token.type}')
else:
- sys.stderr.write('yacc: Parse error in input. EOF\n')
+ sys.stderr.write('sly: Parse error in input. EOF\n')
def errok(self):
'''
@@ -1844,6 +2081,12 @@ class Parser(metaclass=ParserMeta):
pslice._stack = symstack # Associate the stack with the production
self.restart()
# Set up position tracking
track_positions = self.track_positions
if not hasattr(self, '_line_positions'):
self._line_positions = { } # id: -> lineno
self._index_positions = { } # id: -> (start, end)
errtoken = None # Err token
while True:
# Get the next symbol on the input. If a lookahead symbol
@@ -1894,7 +2137,23 @@ class Parser(metaclass=ParserMeta):
value = p.func(self, pslice)
if value is pslice:
value = (pname, *(s.value for s in pslice._slice))
sym.value = value
# Record positions
if track_positions:
if plen:
sym.lineno = symstack[-plen].lineno
sym.index = symstack[-plen].index
sym.end = symstack[-1].end
else:
# A zero-length production (what to put here?)
sym.lineno = None
sym.index = None
sym.end = None
self._line_positions[id(value)] = sym.lineno
self._index_positions[id(value)] = (sym.index, sym.end)
if plen:
del symstack[-plen:]
del statestack[-plen:]
@@ -1979,6 +2238,8 @@ class Parser(metaclass=ParserMeta):
t.lineno = lookahead.lineno
if hasattr(lookahead, 'index'):
t.index = lookahead.index
if hasattr(lookahead, 'end'):
t.end = lookahead.end
t.value = lookahead
lookaheadstack.append(lookahead)
lookahead = t
@@ -1989,4 +2250,12 @@ class Parser(metaclass=ParserMeta):
continue
# Call an error function here
- raise RuntimeError('yacc: internal parser error!!!\n')
+ raise RuntimeError('sly: internal parser error!!!\n')
# Return position tracking information
def line_position(self, value):
return self._line_positions[id(value)]
def index_position(self, value):
return self._index_positions[id(value)]
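Below is a minimal, self-contained sketch of the position-tracking API added above (line_position() and index_position()); the NumLexer/SumParser names and the grammar are illustrative, not taken from the repository:

from sly import Lexer, Parser

class NumLexer(Lexer):
    tokens = { NUMBER, PLUS }
    ignore = ' \t'
    PLUS = r'\+'

    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t

class SumParser(Parser):
    tokens = NumLexer.tokens
    track_positions = True             # True is already the default added in this change

    @_('expr PLUS NUMBER')
    def expr(self, p):
        return [*p.expr, p.NUMBER]     # build a new list so the value can be looked up by id()

    @_('NUMBER')
    def expr(self, p):
        return [p.NUMBER]

lexer = NumLexer()
parser = SumParser()
result = parser.parse(lexer.tokenize('1 + 2 + 3'))
print(result)                          # [1, 2, 3]
print(parser.line_position(result))    # 1 for this input (no newline handling in this tiny lexer)
print(parser.index_position(result))   # (0, 9): start/end indices of the text covered by the final reduction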

tests/test_ebnf.py (new file, 152 lines)
@@ -0,0 +1,152 @@
import pytest
from sly import Lexer, Parser
class CalcLexer(Lexer):
# Set of token names. This is always required
tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }
literals = { '(', ')' }
# String containing ignored characters between tokens
ignore = ' \t'
# Regular expression rules for tokens
ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
ASSIGN = r'='
COMMA = r','
@_(r'\d+')
def NUMBER(self, t):
t.value = int(t.value)
return t
# Ignored text
ignore_comment = r'\#.*'
@_(r'\n+')
def newline(self, t):
self.lineno += t.value.count('\n')
def error(self, t):
self.errors.append(t.value[0])
self.index += 1
def __init__(self):
self.errors = []
class CalcParser(Parser):
tokens = CalcLexer.tokens
def __init__(self):
self.names = { }
self.errors = [ ]
@_('ID ASSIGN expr')
def statement(self, p):
self.names[p.ID] = p.expr
@_('ID "(" [ arglist ] ")"')
def statement(self, p):
return (p.ID, p.arglist)
@_('expr { COMMA expr }')
def arglist(self, p):
return [p.expr0, *p.expr1]
@_('expr')
def statement(self, p):
return p.expr
@_('term { PLUS|MINUS term }')
def expr(self, p):
lval = p.term0
for op, rval in p[1]:
if op == '+':
lval = lval + rval
elif op == '-':
lval = lval - rval
return lval
@_('factor { TIMES|DIVIDE factor }')
def term(self, p):
lval = p.factor0
for op, rval in p[1]:
if op == '*':
lval = lval * rval
elif op == '/':
lval = lval / rval
return lval
@_('MINUS factor')
def factor(self, p):
return -p.factor
@_("'(' expr ')'")
def factor(self, p):
return p.expr
@_('NUMBER')
def factor(self, p):
return int(p.NUMBER)
@_('ID')
def factor(self, p):
try:
return self.names[p.ID]
except LookupError:
print(f'Undefined name {p.ID!r}')
return 0
def error(self, tok):
self.errors.append(tok)
# Test basic recognition of various tokens and literals
def test_simple():
lexer = CalcLexer()
parser = CalcParser()
result = parser.parse(lexer.tokenize('a = 3 + 4 * (5 + 6)'))
assert result == None
assert parser.names['a'] == 47
result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
assert result == 47
def test_ebnf():
lexer = CalcLexer()
parser = CalcParser()
result = parser.parse(lexer.tokenize('a()'))
assert result == ('a', None)
result = parser.parse(lexer.tokenize('a(2+3)'))
assert result == ('a', [5])
result = parser.parse(lexer.tokenize('a(2+3, 4+5)'))
assert result == ('a', [5, 9])
def test_parse_error():
lexer = CalcLexer()
parser = CalcParser()
result = parser.parse(lexer.tokenize('a 123 4 + 5'))
assert result == 9
assert len(parser.errors) == 1
assert parser.errors[0].type == 'NUMBER'
assert parser.errors[0].value == 123
# TO DO: Add tests
# - error productions
# - embedded actions
# - lineno tracking
# - various error cases caught during parser construction

@@ -47,9 +47,11 @@ class CalcLexer(Lexer):
t.value = t.value.upper()
return t
- def error(self, value):
- self.errors.append(value)
+ def error(self, t):
+ self.errors.append(t.value)
self.index += 1
if hasattr(self, 'return_error'):
return t
def __init__(self):
self.errors = []
@@ -63,6 +65,21 @@ def test_tokens():
assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
# Test position tracking
def test_positions():
lexer = CalcLexer()
text = 'abc\n( )'
toks = list(lexer.tokenize(text))
lines = [t.lineno for t in toks ]
indices = [t.index for t in toks ]
ends = [t.end for t in toks]
values = [ text[t.index:t.end] for t in toks ]
assert values == ['abc', '(', ')']
assert lines == [1, 2, 2]
assert indices == [0, 4, 6]
assert ends == [3, 5, 7]
# Test ignored comments and newlines
def test_ignored():
lexer = CalcLexer()
@@ -85,9 +102,107 @@ def test_error():
assert vals == [123, '+', '-']
assert lexer.errors == [ ':+-' ]
# Test error token return handling
def test_error_return():
lexer = CalcLexer()
lexer.return_error = True
toks = list(lexer.tokenize('123 :+-'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
assert types == ['NUMBER', 'ERROR', 'PLUS', 'MINUS']
assert vals == [123, ':+-', '+', '-']
assert lexer.errors == [ ':+-' ]
class ModernCalcLexer(Lexer):
# Set of token names. This is always required
tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, LT, LE, IF, ELSE }
literals = { '(', ')' }
# String containing ignored characters between tokens
ignore = ' \t'
# Regular expression rules for tokens
ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
ID['if'] = IF
ID['else'] = ELSE
NUMBER = r'\d+'
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
ASSIGN = r'='
LE = r'<='
LT = r'<'
def NUMBER(self, t):
t.value = int(t.value)
return t
# Ignored text
ignore_comment = r'\#.*'
@_(r'\n+')
def ignore_newline(self, t):
self.lineno += t.value.count('\n')
# Attached rule
def ID(self, t):
t.value = t.value.upper()
return t
def error(self, t):
self.errors.append(t.value)
self.index += 1
if hasattr(self, 'return_error'):
return t
def __init__(self):
self.errors = []
# Test basic recognition of various tokens and literals
def test_modern_tokens():
lexer = ModernCalcLexer()
toks = list(lexer.tokenize('abc if else 123 + - * / = < <= ( )'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
assert types == ['ID','IF','ELSE', 'NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
assert vals == ['ABC','if','else', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']
# Test ignored comments and newlines
def test_modern_ignored():
lexer = ModernCalcLexer()
toks = list(lexer.tokenize('\n\n# A comment\n123\nabc\n'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
linenos = [t.lineno for t in toks]
assert types == ['NUMBER', 'ID']
assert vals == [123, 'ABC']
assert linenos == [4,5]
assert lexer.lineno == 6
# Test error handling
def test_modern_error():
lexer = ModernCalcLexer()
toks = list(lexer.tokenize('123 :+-'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
assert types == ['NUMBER', 'PLUS', 'MINUS']
assert vals == [123, '+', '-']
assert lexer.errors == [ ':+-' ]
# Test error token return handling
def test_modern_error_return():
lexer = ModernCalcLexer()
lexer.return_error = True
toks = list(lexer.tokenize('123 :+-'))
types = [t.type for t in toks]
vals = [t.value for t in toks]
assert types == ['NUMBER', 'ERROR', 'PLUS', 'MINUS']
assert vals == [123, ':+-', '+', '-']
assert lexer.errors == [ ':+-' ]

@@ -3,16 +3,7 @@ from sly import Lexer, Parser
class CalcLexer(Lexer):
# Set of token names. This is always required
- tokens = {
- 'ID',
- 'NUMBER',
- 'PLUS',
- 'MINUS',
- 'TIMES',
- 'DIVIDE',
- 'ASSIGN',
- }
+ tokens = { ID, NUMBER, PLUS, MINUS, TIMES, DIVIDE, ASSIGN, COMMA }
literals = { '(', ')' }
# String containing ignored characters between tokens
@@ -25,6 +16,7 @@ class CalcLexer(Lexer):
TIMES = r'\*'
DIVIDE = r'/'
ASSIGN = r'='
COMMA = r','
@_(r'\d+')
def NUMBER(self, t):
@@ -38,8 +30,8 @@ class CalcLexer(Lexer):
def newline(self, t):
self.lineno += t.value.count('\n')
- def error(self, value):
- self.errors.append(value)
+ def error(self, t):
+ self.errors.append(t.value[0])
self.index += 1
def __init__(self):
@@ -49,9 +41,9 @@ class CalcParser(Parser):
tokens = CalcLexer.tokens
precedence = (
- ('left', 'PLUS', 'MINUS'),
- ('left', 'TIMES', 'DIVIDE'),
- ('right', 'UMINUS'),
+ ('left', PLUS, MINUS),
+ ('left', TIMES, DIVIDE),
+ ('right', UMINUS),
)
def __init__(self):
@@ -62,6 +54,14 @@ class CalcParser(Parser):
def statement(self, p):
self.names[p.ID] = p.expr
@_('ID "(" [ arglist ] ")"')
def statement(self, p):
return (p.ID, p.arglist)
@_('expr { COMMA expr }')
def arglist(self, p):
return [p.expr0, *p.expr1]
@_('expr')
def statement(self, p):
return p.expr
@@ -118,6 +118,18 @@ def test_simple():
result = parser.parse(lexer.tokenize('3 + 4 * (5 + 6)'))
assert result == 47
def test_ebnf():
lexer = CalcLexer()
parser = CalcParser()
result = parser.parse(lexer.tokenize('a()'))
assert result == ('a', None)
result = parser.parse(lexer.tokenize('a(2+3)'))
assert result == ('a', [5])
result = parser.parse(lexer.tokenize('a(2+3, 4+5)'))
assert result == ('a', [5, 9])
def test_parse_error():
lexer = CalcLexer()
parser = CalcParser()