Work in progress. Changes to parser production API

David Beazley 2016-09-08 15:05:03 -05:00
parent 9a1899fa69
commit 05a709aaea
4 changed files with 1869 additions and 182 deletions

File diff suppressed because it is too large


@@ -44,46 +44,45 @@ class CalcParser(Parser):
@_('NAME "=" expression')
def statement(self, p):
self.names[p[1]] = p[3]
self.names[p.NAME] = p.expression
@_('expression')
def statement(self, p):
print(p[1])
print(p.expression)
@_('expression "+" expression',
'expression "-" expression',
'expression "*" expression',
'expression "/" expression')
def expression(self, p):
if p[2] == '+':
p[0] = p[1] + p[3]
elif p[2] == '-':
p[0] = p[1] - p[3]
elif p[2] == '*':
p[0] = p[1] * p[3]
elif p[2] == '/':
p[0] = p[1] / p[3]
if p[1] == '+':
return p.expression0 + p.expression1
elif p[1] == '-':
return p.expression0 - p.expression1
elif p[1] == '*':
return p.expression0 * p.expression1
elif p[1] == '/':
return p.expression0 / p.expression1
@_('"-" expression %prec UMINUS')
def expression(self, p):
p[0] = -p[2]
return -p.expression
@_('"(" expression ")"')
def expression(self, p):
p[0] = p[2]
return p.expression
@_('NUMBER')
def expression(self, p):
p[0] = p[1]
return p.NUMBER
@_('NAME')
def expression(self, p):
try:
p[0] = self.names[p[1]]
return self.names[p.NAME]
except LookupError:
print("Undefined name '%s'" % p[1])
p[0] = 0
print("Undefined name '%s'" % p.NAME)
return 0
if __name__ == '__main__':
lexer = CalcLexer()
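
Two things change for rule bodies in the example above: the production slice is now zero-based over the right-hand side only (as the reduction changes further down in this commit show), and a rule's result is its return value rather than an assignment to p[0]. A sketch of how the positional and named forms line up for the binary-operator rule (illustrative comments only):

# For  expression : expression "+" expression  under the revised API:
#   p[0]  is the left operand, also reachable as p.expression0
#   p[1]  is the operator literal, e.g. '+'
#   p[2]  is the right operand, also reachable as p.expression1
# Symbols that appear only once in a rule (NAME, NUMBER in the other rules
# above) are reachable by their plain name, without a numeric suffix.
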


@@ -68,9 +68,9 @@ class Token(object):
def __repr__(self):
return 'Token(%s, %r, %d, %d)' % (self.type, self.value, self.lineno, self.index)
class NoDupeDict(OrderedDict):
class LexerMetaDict(OrderedDict):
'''
Special dictionary that prohibits duplicate definitions.
Special dictionary that prohibits duplicate definitions in lexer specifications.
'''
def __setitem__(self, key, value):
if key in self and not isinstance(value, property):
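
A rough illustration of what the duplicate check above guards against. The class below is hypothetical and not part of this commit; the second NAME assignment is what LexerMetaDict.__setitem__ is meant to reject at class-definition time:

from sly import Lexer   # assuming the usual package entry point

class BadLexer(Lexer):
    tokens = { 'NAME' }
    NAME = r'[a-z]+'
    NAME = r'[A-Z]+'    # duplicate key in the class body, rejected by the check above
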
@@ -83,17 +83,15 @@ class LexerMeta(type):
'''
@classmethod
def __prepare__(meta, *args, **kwargs):
d = NoDupeDict()
def _(*patterns):
d = LexerMetaDict()
def _(pattern, *extra):
patterns = [pattern, *extra]
def decorate(func):
for pattern in patterns:
if hasattr(func, 'pattern'):
if isinstance(pattern, str):
func.pattern = ''.join(['(', pattern, ')|(', func.pattern, ')'])
else:
func.pattern = b''.join([b'(', pattern, b')|(', func.pattern, b')'])
else:
func.pattern = pattern
pattern = '|'.join('(%s)' % pat for pat in patterns )
if hasattr(func, 'pattern'):
func.pattern = pattern + '|' + func.pattern
else:
func.pattern = pattern
return func
return decorate
d['_'] = _
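
A rough sketch of what the rewritten decorator above does with multiple patterns: each alternative is wrapped in its own group and the pieces are joined with '|', with any pattern already attached to the function appended last. The helper below is illustrative only and lives outside the metaclass machinery:

def combine_patterns(patterns, existing=None):
    # Mirrors the pattern-combining logic of _() above.
    pattern = '|'.join('(%s)' % pat for pat in patterns)
    return pattern if existing is None else pattern + '|' + existing

combine_patterns([r'\+', r'-'])            # -> r'(\+)|(-)'
combine_patterns([r'<='], r'(\+)|(-)')     # -> r'(<=)|(\+)|(-)'
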
@@ -109,7 +107,7 @@ class Lexer(metaclass=LexerMeta):
# These attributes may be defined in subclasses
tokens = set()
literals = set()
ignore = None
ignore = ''
reflags = 0
# These attributes are constructed automatically by the associated metaclass
@@ -118,7 +116,6 @@ class Lexer(metaclass=LexerMeta):
_literals = set()
_token_funcs = { }
_ignored_tokens = set()
_input_type = str
@classmethod
def _collect_rules(cls, definitions):
@@ -151,7 +148,7 @@ class Lexer(metaclass=LexerMeta):
tokname = tokname[7:]
cls._ignored_tokens.add(tokname)
if isinstance(value, (str, bytes)):
if isinstance(value, str):
pattern = value
elif callable(value):
@@ -159,10 +156,7 @@ class Lexer(metaclass=LexerMeta):
cls._token_funcs[tokname] = value
# Form the regular expression component
if isinstance(pattern, str):
part = '(?P<%s>%s)' % (tokname, pattern)
else:
part = b'(?P<%s>%s)' % (tokname.encode('ascii'), pattern)
part = '(?P<%s>%s)' % (tokname, pattern)
# Make sure the individual regex compiles properly
try:
@@ -171,38 +165,24 @@ class Lexer(metaclass=LexerMeta):
raise PatternError('Invalid regex for token %s' % tokname) from e
# Verify that the pattern doesn't match the empty string
if cpat.match(type(pattern)()):
if cpat.match(''):
raise PatternError('Regex for token %s matches empty input' % tokname)
parts.append(part)
# If no parts collected, then no rules to process
if not parts:
return
# Verify that all of the patterns are of the same type
if not all(type(part) == type(parts[0]) for part in parts):
raise LexerBuildError('Tokens are specified using both bytes and strings.')
# Form the master regular expression
if parts and isinstance(parts[0], bytes):
previous = (b'|' + cls._master_re.pattern) if cls._master_re else b''
cls._master_re = re.compile(b'|'.join(parts) + previous, cls.reflags)
cls._input_type = bytes
else:
previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
cls._input_type = str
previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
cls._master_re = re.compile('|'.join(parts) + previous, cls.reflags)
# Verify that the ignore and literals specifiers match the input type
if cls.ignore is not None and not isinstance(cls.ignore, cls._input_type):
raise LexerBuildError("ignore specifier type doesn't match token types (%s)" %
cls._input_type.__name__)
if not all(isinstance(lit, cls._input_type) for lit in cls.literals):
raise LexerBuildError("literals specifier not using same type as tokens (%s)" %
cls._input_type.__name__)
if not isinstance(cls.ignore, str):
raise LexerBuildError('ignore specifier must be a string')
if not all(isinstance(lit, str) for lit in cls.literals):
raise LexerBuildError("literals must be specified as strings")
def tokenize(self, text, lineno=1, index=0):
# Local copies of frequently used values
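
For illustration, the shape of the master regular expression this now always produces: each token rule becomes a named group, and the groups are joined with '|' so the winning rule can be read back from the match object's lastgroup. The token names and patterns below are hypothetical:

import re

# Hypothetical token rules; real ones come from a Lexer subclass.
parts = [
    r'(?P<NUMBER>\d+)',
    r'(?P<NAME>[a-zA-Z_][a-zA-Z0-9_]*)',
    r'(?P<PLUS>\+)',
]
master_re = re.compile('|'.join(parts))
m = master_re.match('42 + x')
print(m.lastgroup, m.group())   # NUMBER 42
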
@@ -220,11 +200,6 @@ class Lexer(metaclass=LexerMeta):
index += 1
continue
except IndexError:
if self.eof:
text = self.eof()
if text:
index = 0
continue
break
tok = Token()
@@ -270,9 +245,6 @@ class Lexer(metaclass=LexerMeta):
self.index = index
self.lineno = lineno
# Default implementations of methods that may be subclassed by users
# Default implementations of the error handler. May be changed in subclasses
def error(self, value):
raise LexError("Illegal character %r at index %d" % (value[0], self.index), value)
def eof(self):
pass


@@ -33,7 +33,7 @@
import sys
import inspect
from collections import OrderedDict
from collections import OrderedDict, defaultdict
__version__ = '0.0'
__all__ = [ 'Parser' ]
@@ -104,31 +104,39 @@ class YaccSymbol:
class YaccProduction:
def __init__(self, s, stack=None):
self.slice = s
self.stack = stack
self._slice = s
self._stack = stack
self._namemap = { }
def __getitem__(self, n):
if isinstance(n, slice):
return [s.value for s in self.slice[n]]
elif n >= 0:
return self.slice[n].value
if n >= 0:
return self._slice[n].value
else:
return self.stack[n].value
return self._stack[n].value
def __setitem__(self, n, v):
self.slice[n].value = v
self._slice[n].value = v
def __len__(self):
return len(self.slice)
return len(self._slice)
def lineno(self, n):
return getattr(self.slice[n], 'lineno', 0)
return getattr(self._slice[n], 'lineno', 0)
def set_lineno(self, n, lineno):
self.slice[n].lineno = lineno
self._slice[n].lineno = lineno
def index(self, n):
return getattr(self.slice[n], 'index', 0)
return getattr(self._slice[n], 'index', 0)
def __getattr__(self, name):
return self._slice[self._namemap[name]].value
def __setattr__(self, name, value):
if name[0:1] == '_' or name not in self._namemap:
super().__setattr__(name, value)
else:
self._slice[self._namemap[name]].value = value
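
A self-contained sketch of how the new attribute access resolves. Prod and Sym below are stand-ins that mirror the __getattr__/__setattr__ logic above, not the real YaccProduction/YaccSymbol classes; _namemap is the symbol-name-to-index mapping built by Production further down:

class Sym:
    def __init__(self, value):
        self.value = value

class Prod:
    def __init__(self, slice_, namemap):
        self.__dict__['_slice'] = slice_
        self.__dict__['_namemap'] = namemap
    def __getattr__(self, name):
        # Named access goes through the name map to a positional slot.
        return self._slice[self._namemap[name]].value
    def __setattr__(self, name, value):
        self._slice[self._namemap[name]].value = value

p = Prod([Sym('x'), Sym('='), Sym(42)], {'NAME': 0, 'expression': 2})
print(p.NAME)        # 'x'
p.expression = 99    # writes through to the underlying symbol
print(p.expression)  # 99
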
# -----------------------------------------------------------------------------
# === Grammar Representation ===
@@ -173,15 +181,27 @@ class Production(object):
self.prec = precedence
# Internal settings used during table construction
self.len = len(self.prod) # Length of the production
# Create a list of unique production symbols used in the production
self.usyms = []
for s in self.prod:
symmap = defaultdict(list)
for n, s in enumerate(self.prod):
symmap[s].append(n)
if s not in self.usyms:
self.usyms.append(s)
# Create a dict mapping symbol names to indices
m = {}
for key, indices in symmap.items():
if len(indices) == 1:
m[key] = indices[0]
else:
for n, index in enumerate(indices):
m[key+str(n)] = index
self.namemap = m
# List of all LR items for the production
self.lr_items = []
self.lr_next = None
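
The effect of the mapping built above, as a standalone sketch: a symbol that appears once in the right-hand side maps to its index directly, while repeated symbols get numeric suffixes (this is what makes p.expression0 and p.expression1 in the calculator example work). The helper and the literal spelling of '+' are illustrative:

from collections import defaultdict

def build_namemap(prod):
    # Mirrors the namemap construction in Production.__init__ above.
    symmap = defaultdict(list)
    for n, s in enumerate(prod):
        symmap[s].append(n)
    m = {}
    for key, indices in symmap.items():
        if len(indices) == 1:
            m[key] = indices[0]
        else:
            for n, index in enumerate(indices):
                m[key + str(n)] = index
    return m

print(build_namemap(['expression', '+', 'expression']))
# {'expression0': 0, 'expression1': 2, '+': 1}
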
@@ -1512,9 +1532,10 @@ def _collect_grammar_rules(func):
else:
grammar.append((func, filename, lineno, prodname, syms))
func = getattr(func, 'next_func', None)
return grammar
class OverloadDict(OrderedDict):
class ParserMetaDict(OrderedDict):
'''
Dictionary that allows decorated grammar rule functions to be overloaded
'''
@@ -1526,13 +1547,11 @@ class OverloadDict(OrderedDict):
class ParserMeta(type):
@classmethod
def __prepare__(meta, *args, **kwargs):
d = OverloadDict()
def _(*rules):
d = ParserMetaDict()
def _(rule, *extra):
rules = [rule, *extra]
def decorate(func):
if hasattr(func, 'rules'):
func.rules.extend(rules[::-1])
else:
func.rules = list(rules[::-1])
func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
return func
return decorate
d['_'] = _
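
A standalone sketch of what the revised rule decorator accumulates: each application appends its rule strings, in reverse order, after whatever is already attached to the function. The example rule names are illustrative:

def _(rule, *extra):
    # Illustrative copy of the decorator logic above, outside the metaclass.
    rules = [rule, *extra]
    def decorate(func):
        func.rules = [*getattr(func, 'rules', []), *rules[::-1]]
        return func
    return decorate

@_('expression "+" expression',
   'expression "-" expression')
def expression(p):
    pass

print(expression.rules)
# ['expression "-" expression', 'expression "+" expression']
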
@@ -1788,9 +1807,9 @@ class Parser(metaclass=ParserMeta):
self.statestack.append(0)
self.state = 0
def parse(self, lexer):
def parse(self, tokens):
'''
Parse the given input text. lexer is a Lexer object that produces tokens
Parse the given input tokens.
'''
lookahead = None # Current lookahead symbol
lookaheadstack = [] # Stack of lookahead symbols
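
In practice the call site changes accordingly: instead of handing the parser a lexer object, the caller passes a token stream, for example the generator returned by Lexer.tokenize(). A short sketch using the calculator classes from the example above:

lexer = CalcLexer()
parser = CalcParser()
parser.parse(lexer.tokenize('x = 3 + 4 * 5'))   # parse() now consumes tokens directly
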
@@ -1801,10 +1820,6 @@ class Parser(metaclass=ParserMeta):
pslice = YaccProduction(None) # Production object passed to grammar rules
errorcount = 0 # Used during error recovery
# Save a local reference of the lexer being used
self.lexer = lexer
tokens = iter(self.lexer)
# Set up the state and symbol stacks
self.statestack = statestack = [] # Stack of parsing states
self.symstack = symstack = [] # Stack of grammar symbols
@@ -1816,7 +1831,6 @@ class Parser(metaclass=ParserMeta):
# Get the next symbol on the input. If a lookahead symbol
# is already set, we just use that. Otherwise, we'll pull
# the next token off of the lookaheadstack or from the lexer
if self.state not in defaulted_states:
if not lookahead:
if not lookaheadstack:
@@ -1852,74 +1866,22 @@ class Parser(metaclass=ParserMeta):
self.production = p = prod[-t]
pname = p.name
plen = p.len
pslice._namemap = p.namemap
# Call the production function
sym = YaccSymbol()
sym.type = pname # Production name
sym.value = None
pslice._slice = symstack[-plen:] if plen else []
if plen:
targ = symstack[-plen-1:]
targ[0] = sym
del symstack[-plen:]
del statestack[-plen:]
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# The code enclosed in this section is duplicated
# below as a performance optimization. Make sure
# changes get made in both locations.
sym = YaccSymbol()
sym.type = pname
sym.value = p.func(self, pslice)
symstack.append(sym)
pslice.slice = targ
try:
# Call the grammar rule with our special slice object
del symstack[-plen:]
p.func(self, pslice)
del statestack[-plen:]
symstack.append(sym)
self.state = goto[statestack[-1]][pname]
statestack.append(self.state)
except SyntaxError:
# If an error was set. Enter error recovery state
lookaheadstack.append(lookahead)
symstack.extend(targ[1:-1])
statestack.pop()
self.state = statestack[-1]
sym.type = 'error'
sym.value = 'error'
lookahead = sym
errorcount = ERROR_COUNT
self.errorok = False
continue
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
else:
targ = [sym]
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# The code enclosed in this section is duplicated
# above as a performance optimization. Make sure
# changes get made in both locations.
pslice.slice = targ
try:
# Call the grammar rule with our special slice object
p.func(self, pslice)
symstack.append(sym)
self.state = goto[statestack[-1]][pname]
statestack.append(self.state)
except SyntaxError:
# If an error was set. Enter error recovery state
lookaheadstack.append(lookahead)
statestack.pop()
self.state = statestack[-1]
sym.type = 'error'
sym.value = 'error'
lookahead = sym
errorcount = ERROR_COUNT
self.errorok = False
continue
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
self.state = goto[statestack[-1]][pname]
statestack.append(self.state)
continue
if t == 0:
n = symstack[-1]