Initial commit

2016-09-02 14:07:32 -05:00 · 2016-09-02 14:07:32 -05:00 · 36cf652eae
commit 36cf652eae
parent ae7dffaddd
6 changed files with 2372 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -1,2 +1,4 @@
-# sly
-Sly Lex Yacc
+# SLY (Sly Lex Yacc)
+
+The name says it all.
+
--- a/example/calc/calc.py
+++ b/example/calc/calc.py
@ -0,0 +1,96 @@
+# -----------------------------------------------------------------------------
+# calc.py
+# -----------------------------------------------------------------------------
+
+import sys
+sys.path.insert(0, "../..")
+
+from sly import Lexer, Parser
+
+class CalcLexer(Lexer):
+    tokens = (
+        'NAME', 'NUMBER',
+        )
+    ignore = ' \t'
+    literals = ['=', '+', '-', '*', '/', '(', ')']
+
+    # Tokens
+    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
+
+    @_(r'\d+')
+    def NUMBER(self, t):
+        t.value = int(t.value)
+        return t
+
+    @_(r'\n+')
+    def newline(self, t):
+        self.lineno += t.value.count('\n')
+
+    def error(self, value):
+        print("Illegal character '%s'" % value[0])
+        self.index += 1
+
+class CalcParser(Parser):
+    tokens = CalcLexer.tokens
+    
+    precedence = (
+        ('left', '+', '-'),
+        ('left', '*', '/'),
+        ('right', 'UMINUS'),
+    )
+
+    def __init__(self):
+        self.names = { }
+
+    @_('statement : NAME "=" expression')
+    def statement_assign(self, p):
+        self.names[p[1]] = p[3]
+
+    @_('statement : expression')
+    def statement_expr(self, p):
+        print(p[1])
+
+    @_('''expression : expression '+' expression
+                     | expression '-' expression
+                     | expression '*' expression
+                     | expression '/' expression''')
+    def expression_binop(self, p):
+        if p[2] == '+':
+            p[0] = p[1] + p[3]
+        elif p[2] == '-':
+            p[0] = p[1] - p[3]
+        elif p[2] == '*':
+            p[0] = p[1] * p[3]
+        elif p[2] == '/':
+            p[0] = p[1] / p[3]
+
+    @_('expression : "-" expression %prec UMINUS')
+    def expression_uminus(self, p):
+        p[0] = -p[2]
+
+    @_('expression : "(" expression ")"')
+    def expression_group(self, p):
+        p[0] = p[2]
+
+    @_('expression : NUMBER')
+    def expression_number(self, p):
+        p[0] = p[1]
+
+    @_('expression : NAME')
+    def expression_name(self, p):
+        try:
+            p[0] = self.names[p[1]]
+        except LookupError:
+            print("Undefined name '%s'" % p[1])
+            p[0] = 0
+
+
+if __name__ == '__main__':
+    parser = CalcParser()
+    while True:
+        try:
+            s = input('calc > ')
+        except EOFError:
+            break
+        if s:
+            parser.parse(CalcLexer(s))
--- a/sly/init.py
+++ b/sly/init.py
@ -0,0 +1,5 @@
+
+from .lex import *
+from .yacc import *
+
+__all__ = [ *lex.__all__, *yacc.__all__ ]
--- a/sly/_meta.py
+++ b/sly/_meta.py
@ -0,0 +1,25 @@
+from collections import OrderedDict
+
+class NoDupeDict(OrderedDict):
+    def __setitem__(self, key, value):
+        if key in self and not isinstance(value, property):
+            raise AttributeError('Name %s redefined' % (key))
+        super().__setitem__(key, value)
+
+class RuleMeta(type):
+    @staticmethod
+    def __prepare__(meta, *args, **kwargs):
+        d = NoDupeDict()
+        def _(rule):
+            def decorate(func):
+                func.rule = rule
+                return func
+            return decorate
+        d['_'] = _
+        return d
+
+    def __new__(meta, clsname, bases, attributes):
+        del attributes['_']
+        cls = super().__new__(meta, clsname, bases, attributes)
+        cls._build(list(attributes.items()))
+        return cls
--- a/sly/lex.py
+++ b/sly/lex.py
@ -0,0 +1,235 @@
+# -----------------------------------------------------------------------------
+# sly: lex.py
+#
+# Copyright (C) 2016
+# David M. Beazley (Dabeaz LLC)
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of the David Beazley or Dabeaz LLC may be used to
+#   endorse or promote products derived from this software without
+#  specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# -----------------------------------------------------------------------------
+
+__version__    = '0.0'
+__all__ = ['Lexer']
+
+import re
+from collections import OrderedDict
+
+from ._meta import RuleMeta
+
+class LexError(Exception):
+    '''
+    Exception raised if an invalid character is encountered and no default
+    error handler function is defined.  The .text attribute of the exception
+    contains all remaining untokenized text.
+    '''
+    def __init__(self, message, text):
+        self.args = (message,)
+        self.text = text
+
+class PatternError(Exception):
+    '''
+    Exception raised if there's some kind of problem with the specified
+    regex patterns in the lexer.
+    '''
+    pass
+
+class LexerBuildError(Exception):
+    '''
+    Exception raised if there's some sort of problem building the lexer.
+    '''
+    pass
+
+class Token(object):
+    '''
+    Representation of a single token.
+    '''
+    __slots__ = ('type', 'value', 'lineno', 'index')
+    def __repr__(self):
+        return 'Token(%s, %r, %d, %d)' % (self.type, self.value, self.lineno, self.index)
+
+class Lexer(metaclass=RuleMeta):
+    '''
+    Representation of a single lexing state.  This class is automatically constructed 
+    by the RuleDict during class definition.
+    '''
+
+    # These attributes may be defined in subclasses
+    tokens = set()
+    literals = set()
+    ignore = None
+    reflags = 0
+
+    # These attributes are constructed automatically by the associated metaclass
+    _master_re = None
+    _token_names = set()
+    _literals = set()
+    _token_funcs = { }
+    _ignored_tokens = set()
+    _input_type = str
+
+    def __init__(self, text, lineno=1, index=0):
+        self.text = text
+        self.lineno = lineno
+        self.index = index
+
+    @classmethod
+    def _collect_rules(cls, definitions):
+        '''
+        Collect all of the rules from class definitions that look like tokens
+        '''
+        rules = []
+        for key, value in definitions:
+            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'rule'):
+                rules.append((key, value))
+        return rules
+
+    @classmethod
+    def _build(cls, definitions):
+        '''
+        Build the lexer object from the collected tokens and regular expressions.
+        Validate the rules to make sure they look sane.
+        '''
+        if 'tokens' not in vars(cls):
+            raise LexerBuildError('%s class does not define a tokens attribute' % cls.__qualname__)
+
+        cls._token_names = cls._token_names | set(cls.tokens)
+        cls._literals = cls._literals | set(cls.literals)
+        cls._ignored_tokens = set(cls._ignored_tokens)
+        cls._token_funcs = dict(cls._token_funcs)
+
+        parts = []
+        for tokname, value in cls._collect_rules(definitions):
+            if tokname.startswith('ignore_'):
+                tokname = tokname[7:]
+                cls._ignored_tokens.add(tokname)
+
+            if isinstance(value, (str, bytes)):
+                pattern = value
+
+            elif callable(value):
+                pattern = value.rule
+                cls._token_funcs[tokname] = value
+
+            # Form the regular expression component 
+            if isinstance(pattern, str):
+                part = '(?P<%s>%s)' % (tokname, pattern)
+            else:
+                part = b'(?P<%s>%s)' % (tokname.encode('ascii'), pattern)
+
+            # Make sure the individual regex compiles properly
+            try:
+                cpat = re.compile(part, cls.reflags)
+            except Exception as e:
+                raise PatternError('Invalid regex for token %s' % tokname) from e
+
+            # Verify that the pattern doesn't match the empty string
+            if cpat.match(type(pattern)()):
+                raise PatternError('Regex for token %s matches empty input' % tokname)
+
+            parts.append(part)
+
+        # If no parts collected, then no rules to process
+        if not parts:
+            return
+
+        # Verify that all of the patterns are of the same type
+        if not all(type(part) == type(parts[0]) for part in parts):
+            raise LexerBuildError('Tokens are specified using both bytes and strings.')
+
+        # Form the master regular expression
+        if parts and isinstance(parts[0], bytes):
+            previous = (b'|' + cls._master_re.pattern) if cls._master_re else b''
+            cls._master_re = re.compile(b'|'.join(parts) + previous, cls.reflags)
+            cls._input_type = bytes
+        else:
+            previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
+            cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
+            cls._input_type = str
+
+        # Verify that that ignore and literals specifiers match the input type
+        if cls.ignore is not None and not isinstance(cls.ignore, cls._input_type):
+            raise LexerBuildError("ignore specifier type doesn't match token types (%s)" %
+                                  cls._input_type.__name__)
+        
+        if not all(isinstance(lit, cls._input_type) for lit in cls.literals):
+            raise LexerBuildError("literals specifier not using same type as tokens (%s)" %
+                                  cls._input_type.__name__)
+
+    def __iter__(self):
+        text = self.text
+        index = self.index
+        while True:
+            try:
+                if text[index] in self.ignore:
+                    index += 1
+                    continue
+            except IndexError:
+                if self.eof:
+                    text = self.eof()
+                    if text is not None:
+                        self.text = text
+                        self.index = 0
+                        index = 0
+                        continue
+                break
+
+            tok = Token()
+            tok.lineno = self.lineno
+            tok.index = index
+            m = self._master_re.match(text, index)
+            if m:
+                index = m.end()
+                tok.value = m.group()
+                tok.type = m.lastgroup
+                if tok.type in self._token_funcs:
+                    self.index = index
+                    tok = self._token_funcs[tok.type](self, tok)
+                    index = self.index
+                    if not tok:
+                        continue
+
+                if tok.type in self._ignored_tokens:
+                    continue
+
+                yield tok
+            else:
+                # No match, see if the character is in literals
+                if text[index] in self._literals:
+                    tok.value = text[index]
+                    tok.type = tok.value
+                    index += 1
+                    yield tok
+                else:
+                    # A lexing error
+                    self.index = index
+                    self.error(self.text[self.index:])
+                    index = self.index
+
+    def error(self, value):
+        raise LexError("Illegal character %r at index %d" % (value[0], self.index), value)
+
+    def eof(self):
+        pass
--- a/sly/yacc.py
+++ b/sly/yacc.py