Initial commit
parent ae7dffaddd
commit 36cf652eae
example/calc/calc.py (new file, 96 lines)
@@ -0,0 +1,96 @@
# -----------------------------------------------------------------------------
# calc.py
# -----------------------------------------------------------------------------

import sys
sys.path.insert(0, "../..")

from sly import Lexer, Parser

class CalcLexer(Lexer):
    tokens = (
        'NAME', 'NUMBER',
        )
    ignore = ' \t'
    literals = ['=', '+', '-', '*', '/', '(', ')']

    # Tokens
    NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    @_(r'\d+')
    def NUMBER(self, t):
        t.value = int(t.value)
        return t

    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, value):
        print("Illegal character '%s'" % value[0])
        self.index += 1

class CalcParser(Parser):
    tokens = CalcLexer.tokens

    precedence = (
        ('left', '+', '-'),
        ('left', '*', '/'),
        ('right', 'UMINUS'),
        )

    def __init__(self):
        self.names = { }

    @_('statement : NAME "=" expression')
    def statement_assign(self, p):
        self.names[p[1]] = p[3]

    @_('statement : expression')
    def statement_expr(self, p):
        print(p[1])

    @_('''expression : expression '+' expression
                     | expression '-' expression
                     | expression '*' expression
                     | expression '/' expression''')
    def expression_binop(self, p):
        if p[2] == '+':
            p[0] = p[1] + p[3]
        elif p[2] == '-':
            p[0] = p[1] - p[3]
        elif p[2] == '*':
            p[0] = p[1] * p[3]
        elif p[2] == '/':
            p[0] = p[1] / p[3]

    @_('expression : "-" expression %prec UMINUS')
    def expression_uminus(self, p):
        p[0] = -p[2]

    @_('expression : "(" expression ")"')
    def expression_group(self, p):
        p[0] = p[2]

    @_('expression : NUMBER')
    def expression_number(self, p):
        p[0] = p[1]

    @_('expression : NAME')
    def expression_name(self, p):
        try:
            p[0] = self.names[p[1]]
        except LookupError:
            print("Undefined name '%s'" % p[1])
            p[0] = 0


if __name__ == '__main__':
    parser = CalcParser()
    while True:
        try:
            s = input('calc > ')
        except EOFError:
            break
        if s:
            parser.parse(CalcLexer(s))
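For reference, the example above can also be driven without the interactive
loop. A minimal sketch (not part of the commit; it assumes calc.py's classes
are importable):

    parser = CalcParser()
    parser.parse(CalcLexer("x = 3 * 4 + 5"))   # statement_assign stores 17 in parser.names['x']
    parser.parse(CalcLexer("x + 1"))           # statement_expr prints 18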
sly/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@

from .lex import *
from .yacc import *

__all__ = [ *lex.__all__, *yacc.__all__ ]
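Note that the names lex and yacc remain visible here: importing a submodule,
even via a star import, also binds it as an attribute of the package, so the
__all__ expression can read each submodule's export list. This is what lets
user code do the import that calc.py relies on:

    from sly import Lexer, Parser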
sly/_meta.py (new file, 25 lines)
@@ -0,0 +1,25 @@
from collections import OrderedDict

class NoDupeDict(OrderedDict):
    def __setitem__(self, key, value):
        if key in self and not isinstance(value, property):
            raise AttributeError('Name %s redefined' % (key))
        super().__setitem__(key, value)

class RuleMeta(type):
    @staticmethod
    def __prepare__(meta, *args, **kwargs):
        d = NoDupeDict()
        def _(rule):
            def decorate(func):
                func.rule = rule
                return func
            return decorate
        d['_'] = _
        return d

    def __new__(meta, clsname, bases, attributes):
        del attributes['_']
        cls = super().__new__(meta, clsname, bases, attributes)
        cls._build(list(attributes.items()))
        return cls
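To make the mechanism concrete: __prepare__ pre-seeds the class namespace
with the '_' decorator, so rule functions can be tagged before the class even
exists, and __new__ then hands every definition to _build. A standalone
sketch (not part of the commit; Demo and Rules are made-up names):

    from sly._meta import RuleMeta

    class Demo(metaclass=RuleMeta):
        @classmethod
        def _build(cls, definitions):     # called by RuleMeta.__new__
            for name, value in definitions:
                if hasattr(value, 'rule'):
                    print(name, '->', value.rule)

    class Rules(Demo):
        @_(r'\d+')                        # '_' injected by __prepare__
        def NUMBER(self, t):
            return t

    # Defining Rules prints: NUMBER -> \d+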
sly/lex.py (new file, 235 lines)
@@ -0,0 +1,235 @@
# -----------------------------------------------------------------------------
# sly: lex.py
#
# Copyright (C) 2016
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__version__ = '0.0'
__all__ = ['Lexer']

import re
from collections import OrderedDict

from ._meta import RuleMeta

class LexError(Exception):
    '''
    Exception raised if an invalid character is encountered and no default
    error handler function is defined. The .text attribute of the exception
    contains all remaining untokenized text.
    '''
    def __init__(self, message, text):
        self.args = (message,)
        self.text = text

class PatternError(Exception):
    '''
    Exception raised if there's some kind of problem with the specified
    regex patterns in the lexer.
    '''
    pass

class LexerBuildError(Exception):
    '''
    Exception raised if there's some sort of problem building the lexer.
    '''
    pass

class Token(object):
    '''
    Representation of a single token.
    '''
    __slots__ = ('type', 'value', 'lineno', 'index')
    def __repr__(self):
        return 'Token(%s, %r, %d, %d)' % (self.type, self.value, self.lineno, self.index)

class Lexer(metaclass=RuleMeta):
    '''
    Representation of a single lexing state. This class is automatically constructed
    by the RuleDict during class definition.
    '''

    # These attributes may be defined in subclasses
    tokens = set()
    literals = set()
    ignore = None
    reflags = 0

    # These attributes are constructed automatically by the associated metaclass
    _master_re = None
    _token_names = set()
    _literals = set()
    _token_funcs = { }
    _ignored_tokens = set()
    _input_type = str

    def __init__(self, text, lineno=1, index=0):
        self.text = text
        self.lineno = lineno
        self.index = index

    @classmethod
    def _collect_rules(cls, definitions):
        '''
        Collect all of the rules from class definitions that look like tokens
        '''
        rules = []
        for key, value in definitions:
            if (key in cls.tokens) or key.startswith('ignore_') or hasattr(value, 'rule'):
                rules.append((key, value))
        return rules

    @classmethod
    def _build(cls, definitions):
        '''
        Build the lexer object from the collected tokens and regular expressions.
        Validate the rules to make sure they look sane.
        '''
        if 'tokens' not in vars(cls):
            raise LexerBuildError('%s class does not define a tokens attribute' % cls.__qualname__)

        cls._token_names = cls._token_names | set(cls.tokens)
        cls._literals = cls._literals | set(cls.literals)
        cls._ignored_tokens = set(cls._ignored_tokens)
        cls._token_funcs = dict(cls._token_funcs)

        parts = []
        for tokname, value in cls._collect_rules(definitions):
            if tokname.startswith('ignore_'):
                tokname = tokname[7:]
                cls._ignored_tokens.add(tokname)

            if isinstance(value, (str, bytes)):
                pattern = value

            elif callable(value):
                pattern = value.rule
                cls._token_funcs[tokname] = value

            # Form the regular expression component
            if isinstance(pattern, str):
                part = '(?P<%s>%s)' % (tokname, pattern)
            else:
                part = b'(?P<%s>%s)' % (tokname.encode('ascii'), pattern)

            # Make sure the individual regex compiles properly
            try:
                cpat = re.compile(part, cls.reflags)
            except Exception as e:
                raise PatternError('Invalid regex for token %s' % tokname) from e

            # Verify that the pattern doesn't match the empty string
            if cpat.match(type(pattern)()):
                raise PatternError('Regex for token %s matches empty input' % tokname)

            parts.append(part)

        # If no parts collected, then no rules to process
        if not parts:
            return

        # Verify that all of the patterns are of the same type
        if not all(type(part) == type(parts[0]) for part in parts):
            raise LexerBuildError('Tokens are specified using both bytes and strings.')

        # Form the master regular expression
        if parts and isinstance(parts[0], bytes):
            previous = (b'|' + cls._master_re.pattern) if cls._master_re else b''
            cls._master_re = re.compile(b'|'.join(parts) + previous, cls.reflags)
            cls._input_type = bytes
        else:
            previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
            cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
            cls._input_type = str

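        # For the calc example above, the master pattern formed here looks
        # roughly like:
        #     (?P<NAME>[a-zA-Z_][a-zA-Z0-9_]*)|(?P<NUMBER>\d+)|(?P<newline>\n+)
        # so a single match() call tries every token rule at once, and
        # m.lastgroup names the rule that matched.
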
        # Verify that the ignore and literals specifiers match the input type
        if cls.ignore is not None and not isinstance(cls.ignore, cls._input_type):
            raise LexerBuildError("ignore specifier type doesn't match token types (%s)" %
                                  cls._input_type.__name__)

        if not all(isinstance(lit, cls._input_type) for lit in cls.literals):
            raise LexerBuildError("literals specifier not using same type as tokens (%s)" %
                                  cls._input_type.__name__)

    def __iter__(self):
        text = self.text
        index = self.index
        while True:
            try:
                if text[index] in self.ignore:
                    index += 1
                    continue
            except IndexError:
                if self.eof:
                    text = self.eof()
                    if text is not None:
                        self.text = text
                        self.index = 0
                        index = 0
                        continue
                break

            tok = Token()
            tok.lineno = self.lineno
            tok.index = index
            m = self._master_re.match(text, index)
            if m:
                index = m.end()
                tok.value = m.group()
                tok.type = m.lastgroup
                if tok.type in self._token_funcs:
                    self.index = index
                    tok = self._token_funcs[tok.type](self, tok)
                    index = self.index
                    if not tok:
                        continue

                if tok.type in self._ignored_tokens:
                    continue

                yield tok
            else:
                # No match, see if the character is in literals
                if text[index] in self._literals:
                    tok.value = text[index]
                    tok.type = tok.value
                    index += 1
                    yield tok
                else:
                    # A lexing error
                    self.index = index
                    self.error(self.text[self.index:])
                    index = self.index

    def error(self, value):
        raise LexError("Illegal character %r at index %d" % (value[0], self.index), value)

    def eof(self):
        pass
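As a usage sketch (not part of the commit), a lexer built on the class above
declares its token rules as class attributes and is consumed by iteration:

    from sly.lex import Lexer

    class SimpleLexer(Lexer):
        tokens = ('ID', 'NUMBER')
        ignore = ' \t'
        ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
        NUMBER = r'\d+'

    for tok in SimpleLexer("abc 123"):
        print(tok)
    # Token(ID, 'abc', 1, 0)
    # Token(NUMBER, '123', 1, 4)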
sly/yacc.py (new file, 2007 lines)
(Diff suppressed because the file is too large.)