diff --git a/LICENSE b/LICENSE index b9c95b8..46e89f5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2016 SimplePEG +Copyright (c) 2016 Oleksii Okhrymenko (aka aiboy) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md deleted file mode 100644 index d344bb8..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# python -Python version of SimplePEG diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..fcbb25a --- /dev/null +++ b/README.rst @@ -0,0 +1,19 @@ +Python version of SimplePEG +--------------------------- + +To use, simply do:: + + >>> from simplepeg import SPEG + >>> parser = SPEG() + >>> # will throw Exception if grammar is invalid + >>> parser.parse_grammar('GRAMMAR test b -> "a";') + >>> # will throw Exception if text does not match the grammar + >>> ast = parser.parse_text('a') + >>> print(ast.to_json()) + +or:: + + >>> from simplepeg import SPEG + >>> parser = SPEG() + >>> ast = parser.parse('GRAMMAR test b -> "a";', 'a') + >>> print(ast.to_json()) \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..17b867b --- /dev/null +++ b/setup.py @@ -0,0 +1,24 @@ +from setuptools import setup + +def readme(): + with open('README.rst') as f: + return f.read() + +setup(name='simplepeg', + version='1.0.0', + description='Python version of SimplePEG', + long_description=readme(), + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2.7', + 'Topic :: Text Processing :: Linguistic', + ], + url='https://github.com/SimplePEG/Python', + author='Oleksii Okhrymenko', + author_email='ai_boy@live.ru', + keywords='peg parser grammar', + license='MIT', + packages=['simplepeg'], + include_package_data=True, + zip_safe=False) \ No newline at end of file diff --git 
"""Recursive descent parser combinators.

Every combinator in this module returns a function that takes a ``State``
and returns a ``Node`` on success or ``False`` on failure.  A failing
parser leaves ``state.position`` where it found it, so callers can
backtrack safely.
"""
# pylint: disable=too-few-public-methods

import json
import re


class State(object):
    """Current parser state.

    Keyword arguments given to the constructor become instance attributes.
    The combinators below read and write ``text``, ``position``, ``rules``
    and ``lastExpectations``.
    """
    # Class-level fallbacks used when an attribute was not passed to
    # __init__.  The two lists are shared between instances; this module
    # only ever rebinds these attributes and never mutates the lists in
    # place, so do not append to them directly.
    text = ""
    position = 0
    rules = []
    lastExpectations = []

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def to_json(self):
        """Return the instance attributes serialized as a JSON string."""
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=False, indent=2)


class Node(object):
    """Node of the AST; keyword arguments become instance attributes."""
    # Class-level fallbacks for nodes created without these attributes.
    match = ""
    children = None
    action = None

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def to_json(self):
        """Return the instance attributes serialized as a JSON string."""
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=False, indent=2)


class Expectation(object):
    """Record of what a failed parser expected; kwargs become attributes."""

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def to_json(self):
        """Return the instance attributes serialized as a JSON string."""
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=False, indent=2)


def _expectation_label(expectation):
    """Return the text used to describe *expectation* in an error message."""
    rule = getattr(expectation, 'rule', None)
    if rule is not None:
        return rule
    # Expectations created by not_predicate carry no ``rule`` attribute;
    # fall back to their type so that error formatting does not crash.
    return getattr(expectation, 'type', 'unknown')


def _join_matches(asts):
    """Concatenate the ``match`` of every node, treating ``None`` as ''."""
    return ''.join(ast.match if ast.match is not None else '' for ast in asts)


def getLastError(state):
    """Format the last recorded expectations as a human readable message.

    Returns ``False`` when ``state.lastExpectations`` is empty.  Otherwise
    returns a string naming the unexpected character (or ``EOF``), the
    expected rules, the offending line with its 1-based number and a
    ``---^`` pointer under the error column.
    """
    if not state.lastExpectations:
        return False
    last_exp_position = max(exp.position for exp in state.lastExpectations)
    line_of_error = ''
    error_line_number = None
    position_of_error = 0
    line_start = 0
    for line_number, line in enumerate(state.text.split('\n'), 1):
        # +1 accounts for the newline character removed by split().
        line_length = len(line) + 1
        if line_start <= last_exp_position < line_start + line_length:
            line_of_error = line
            position_of_error = last_exp_position - line_start
            error_line_number = line_number
            break
        line_start += line_length
    str_error_ln = str(error_line_number)
    if last_exp_position < len(state.text):
        unexpected_char = state.text[last_exp_position]
    else:
        unexpected_char = 'EOF'
    unexpected = 'Unexpected "' + unexpected_char + '"'
    expected_rules = [_expectation_label(exp) for exp in state.lastExpectations]
    expected = ' expected (' + ' or '.join(expected_rules) + ')'
    # The pointer is shifted by the "<line number>: " prefix (number + 2).
    pointer = ('-' * (position_of_error + 2 + len(str_error_ln))) + '^'
    extra = line_of_error + '\n' + pointer
    return unexpected + expected + '\n' + str_error_ln + ': ' + extra


def string(rule):
    """Build a parser matching the literal text *rule*."""
    def _(state):
        state.lastExpectations = []
        start_position = state.position
        if state.text.startswith(rule, start_position):
            state.position = start_position + len(rule)
            return Node(
                type='string',
                match=rule,
                start_position=start_position,
                end_position=state.position
            )
        state.lastExpectations = [Expectation(
            type='string',
            rule=rule,
            position=start_position
        )]
        return False
    return _


def regex_char(rule):
    """Build a parser matching the regular expression *rule* at the cursor."""
    def _(state):
        state.lastExpectations = []
        # The remaining text is sliced so that the pattern is anchored at
        # the current position (re.match only matches at the start).
        match = re.match(rule, state.text[state.position:])
        if match:
            start_position = state.position
            state.position += match.end()
            return Node(
                type='regex_char',
                match=match.group(0),
                start_position=start_position,
                end_position=state.position
            )
        state.lastExpectations = [Expectation(
            type='regex_char',
            rule=rule,
            position=state.position
        )]
        return False
    return _


def sequence(parsers):
    """Build a parser that requires every parser in *parsers*, in order."""
    def _(state):
        asts = []
        start_position = state.position
        for parser in parsers:
            ast = parser(state)
            if not ast:
                # Undo the input consumed by the earlier elements so the
                # caller sees an untouched position on failure.
                state.position = start_position
                return False
            asts.append(ast)
        return Node(
            type='sequence',
            match=_join_matches(asts),
            children=asts,
            start_position=start_position,
            end_position=state.position
        )
    return _


def ordered_choice(parsers):
    """Build a parser returning the result of the first parser that matches.

    When every alternative fails, ``state.lastExpectations`` holds the
    expectations of all alternatives.
    """
    def _(state):
        expectations = []
        initial_text = state.text
        initial_position = state.position
        for parser in parsers:
            ast = parser(state)
            if ast:
                return Node(
                    type='ordered_choice',
                    match=ast.match,
                    children=[ast],
                    start_position=initial_position,
                    end_position=state.position,
                )
            state.text = initial_text
            state.position = initial_position
            expectations.extend(state.lastExpectations)
        state.lastExpectations = expectations
        return False
    return _


def _repeat(parser, state):
    """Apply *parser* until it fails; return the list of matched nodes.

    Stops after a match that consumed no input, because repeating a
    zero-width match would never terminate.
    """
    asts = []
    while True:
        position_before = state.position
        ast = parser(state)
        if not ast:
            state.position = position_before
            break
        asts.append(ast)
        if state.position == position_before:
            break
    return asts


def zero_or_more(parser):
    """Build a parser matching *parser* any number of times (always succeeds)."""
    def _(state):
        start_position = state.position
        asts = _repeat(parser, state)
        state.lastExpectations = []
        return Node(
            type='zero_or_more',
            match=_join_matches(asts),
            children=asts,
            start_position=start_position,
            end_position=state.position
        )
    return _


def one_or_more(parser):
    """Build a parser matching *parser* at least once."""
    def _(state):
        start_position = state.position
        asts = _repeat(parser, state)
        if not asts:
            # Expectations of the failed first attempt are kept for the
            # error message.
            return False
        state.lastExpectations = []
        return Node(
            type='one_or_more',
            match=_join_matches(asts),
            children=asts,
            start_position=start_position,
            end_position=state.position
        )
    return _


def optional(parser):
    """Build a parser that always succeeds, matching *parser* if possible.

    When *parser* does not match, the node has ``match`` and ``children``
    set to ``None`` and no input is consumed.
    """
    def _(state):
        start_position = state.position
        match = None
        children = None
        ast = parser(state)
        if ast:
            match = ast.match
            children = [ast]
        else:
            # The inner parser may have advanced before failing.
            state.position = start_position
        return Node(
            type='optional',
            match=match,
            children=children,
            start_position=start_position,
            end_position=state.position
        )
    return _


def and_predicate(parser):
    """Build a look-ahead parser: succeeds if *parser* matches, consumes nothing."""
    def _(state):
        current_text = state.text
        current_position = state.position
        ast = parser(state)
        # A predicate never consumes input, whether it succeeds or fails.
        state.text = current_text
        state.position = current_position
        if not ast:
            return False
        return Node(
            type='and_predicate',
            match=None,
            children=[ast],
            start_position=state.position,
            end_position=state.position
        )
    return _


def not_predicate(parser):
    """Build a negative look-ahead: succeeds if *parser* fails, consumes nothing."""
    def _(state):
        current_text = state.text
        current_position = state.position
        ast = parser(state)
        # A predicate never consumes input, whether it succeeds or fails.
        state.text = current_text
        state.position = current_position
        if ast:
            state.lastExpectations = [Expectation(
                type='not_predicate',
                children=[ast],
                position=state.position
            )]
            return False
        state.lastExpectations = []
        return Node(
            type='not_predicate',
            match=None,
            children=[],
            start_position=state.position,
            end_position=state.position
        )
    return _


def end_of_file():
    """Build a parser that succeeds only when all of the text is consumed."""
    def _(state):
        if len(state.text) == state.position:
            return Node(
                type='end_of_file',
                match=None,
                children=[],
                start_position=state.position,
                end_position=state.position
            )
        state.lastExpectations = [Expectation(
            type='end_of_file',
            rule='EOF',
            position=state.position
        )]
        return False
    return _


def rec(func):
    """Defer building a parser until it is used, allowing recursive grammars.

    *func* is a zero-argument factory; it is called on every invocation of
    the returned parser and its result is applied to the arguments.
    """
    def _(*args, **kwargs):
        return func()(*args, **kwargs)
    return _


def action(name, func):
    """Wrap parser *func* so that successful nodes get ``action = name``."""
    def _(*args, **kwargs):
        ast = func(*args, **kwargs)
        if ast:
            ast.action = name
        return ast
    return _


def call_rule_by_name(name):
    """Build a parser that delegates to the rule called *name* in ``state.rules``.

    Raises:
        LookupError: if no entry of ``state.rules`` has that name.
    """
    def _(state):
        # getattr: tolerate entries of state.rules that carry no name.
        rule = next((x for x in state.rules if getattr(x, 'name', None) == name), None)
        if rule is None:
            raise LookupError('Rule "' + name + '" is not defined in grammar')
        return rule.parser(state)
    return _
# NOTE: the classes below rely on the module-level imports of speg.py:
#   from . import speg_visitor as sv
#   from . import speg_parser as sp
#   from . import rd_parser as rd


def _error_details(message):
    """Return *message*, or a placeholder when it is empty or ``False``.

    The ``getLastError`` helpers return ``False`` when no expectation was
    recorded; concatenating that to a string would raise ``TypeError`` and
    hide the real failure.
    """
    return message if message else '(no further details available)'


class SPEG_actions(object):
    """Actions applied to the grammar AST, keyed by the node's action name.

    Each method receives a node whose children have already been visited
    and returns either a node or an ``rd`` parser built from the children.
    """

    def noop(self, node):
        """Return *node* unchanged."""
        return node

    def peg(self, node):
        """Return the third child of the top-level sequence (the body)."""
        return node.children[2]

    def parsing_body(self, node):
        """Replace every child by its own first child and return *node*."""
        node.children = [child.children[0] for child in node.children]
        return node

    def parsing_rule(self, node):
        """Return a node carrying the rule's ``name`` and its ``parser``."""
        rule = node.children[4]
        return rd.Node(
            name=node.children[0].match,
            parser=rule
        )

    def parsing_expression(self, node):
        """Return the single matched alternative."""
        return node.children[0]

    def parsing_sequence(self, node):
        """Build an ``rd.sequence`` from the head and the repeated tail items."""
        head = [node.children[0].children[0]]
        tail = [child.children[1].children[0] for child in node.children[1].children]
        return rd.sequence(head + tail)

    def parsing_ordered_choice(self, node):
        """Build an ``rd.ordered_choice`` from the head and the ``/`` alternatives."""
        head = [node.children[0]]
        tail = [child.children[3] for child in node.children[1].children]
        return rd.ordered_choice(head + tail)

    def parsing_sub_expression(self, node):
        """Return the single matched alternative."""
        return node.children[0]

    def parsing_group(self, node):
        """Return the expression found between the parentheses."""
        return node.children[2]

    def parsing_atomic_expression(self, node):
        """Return the single matched alternative."""
        return node.children[0]

    def parsing_not_predicate(self, node):
        """Wrap the operand in ``rd.not_predicate``."""
        return rd.not_predicate(node.children[1].children[0])

    def parsing_and_predicate(self, node):
        """Wrap the operand in ``rd.and_predicate``."""
        return rd.and_predicate(node.children[1].children[0])

    def parsing_zero_or_more(self, node):
        """Wrap the operand in ``rd.zero_or_more``."""
        return rd.zero_or_more(node.children[0].children[0])

    def parsing_one_or_more(self, node):
        """Wrap the operand in ``rd.one_or_more``."""
        return rd.one_or_more(node.children[0].children[0])

    def parsing_optional(self, node):
        """Wrap the operand in ``rd.optional``."""
        return rd.optional(node.children[0].children[0])

    def parsing_string(self, node):
        """Build ``rd.string`` from the text between the quotes."""
        return rd.string(node.children[1].match)

    def parsing_regex_char(self, node):
        """Build ``rd.regex_char`` from the matched character class or dot."""
        return rd.regex_char(node.children[0].match)

    def parsing_rule_call(self, node):
        """Build a parser that calls the rule named by the matched text."""
        return rd.call_rule_by_name(node.match)

    def parsing_end_of_file(self, node):
        """Build an ``rd.end_of_file`` parser."""
        return rd.end_of_file()


class SPEG(object):
    """Facade that compiles a grammar and parses text with it."""

    def __init__(self):
        self.parser = sp.SPEG_parser()
        self.visitor = sv.SPEG_actions_visitor(SPEG_actions())
        # Compiled grammar; set by parse_grammar(), None until then.
        self.speg_parser = None

    def _compile_grammar(self, grammar):
        """Parse *grammar* and return the generated parser node.

        Raises:
            Exception: if the grammar text cannot be parsed.
        """
        speg_ast = self.parser.parse(grammar)
        if not speg_ast:
            raise Exception('Failed to parse grammar: \n\n'
                            + _error_details(self.parser.getLastError()))
        return self.visitor.visit(speg_ast)

    @staticmethod
    def _run(generated_parser, text):
        """Parse *text* starting from the first rule of *generated_parser*.

        Raises:
            Exception: if the text does not match.
        """
        rules = generated_parser.children
        state = rd.State(text=text, rules=rules)
        ast = rules[0].parser(state)
        if not ast:
            raise Exception('Failed to parse text: \n\n'
                            + _error_details(rd.getLastError(state)))
        return ast

    def parse_grammar(self, grammar):
        """Compile *grammar* and keep it for later ``parse_text`` calls.

        Raises:
            Exception: if the grammar is invalid; the stored grammar is
                cleared in that case.
        """
        # Reset first so that a failed compile never leaves a stale grammar.
        self.speg_parser = None
        self.speg_parser = self._compile_grammar(grammar)

    def parse_text(self, text):
        """Parse *text* with the stored grammar and return the AST.

        Raises:
            Exception: if no grammar has been compiled yet, or if the text
                does not match.
        """
        if not self.speg_parser:
            raise Exception('You need grammar to parse text. Call parse_grammar first')
        return self._run(self.speg_parser, text)

    def parse(self, grammar, text):
        """Compile *grammar*, parse *text* with it and return the AST.

        The stored grammar used by ``parse_text`` is left untouched.

        Raises:
            Exception: if the grammar is invalid or the text does not match.
        """
        return self._run(self._compile_grammar(grammar), text)
# NOTE: every function below relies on the module-level import
#   from . import rd_parser as rd


def _spaces0():
    """Zero or more whitespace characters."""
    return rd.zero_or_more(_())


def _spaces1():
    """One or more whitespace characters."""
    return rd.one_or_more(_())


def _group_or_atom():
    """Operand of a prefix/suffix operator: a group or an atomic expression."""
    operands = [parsing_group(), parsing_atomic_expression()]
    return rd.ordered_choice(operands)


def _choice_or_sub():
    """Element of a sequence: an ordered choice or a sub expression."""
    elements = [parsing_ordered_choice(), parsing_sub_expression()]
    return rd.ordered_choice(elements)


def _prefixed(action_name, operator):
    """``operator`` followed by a group or an atomic expression."""
    parts = [rd.string(operator), _group_or_atom()]
    return rd.action(action_name, rd.sequence(parts))


def _suffixed(action_name, operator):
    """A group or an atomic expression followed by ``operator``."""
    parts = [_group_or_atom(), rd.string(operator)]
    return rd.action(action_name, rd.sequence(parts))


def peg():
    """Whole grammar: header, whitespace, body, end of input."""
    parts = [
        parsing_header(),
        _spaces1(),
        parsing_body(),
        rd.end_of_file(),
    ]
    return rd.action('peg', rd.sequence(parts))


def parsing_header():
    """``GRAMMAR`` keyword, whitespace, then one or more rule names."""
    parts = [
        rd.string('GRAMMAR'),
        _spaces1(),
        rd.one_or_more(parsing_rule_name()),
    ]
    return rd.action('noop', rd.sequence(parts))


def parsing_body():
    """One or more items, each a rule or a run of whitespace."""
    item = rd.ordered_choice([parsing_rule(), _spaces1()])
    return rd.action('parsing_body', rd.one_or_more(item))


def parsing_rule():
    """``name -> expression ;`` with optional whitespace around the tokens."""
    parts = [
        parsing_rule_name(),
        _spaces0(),
        rd.string('->'),
        _spaces0(),
        parsing_expression(),
        _spaces0(),
        rd.string(';'),
        _spaces0(),
    ]
    return rd.action('parsing_rule', rd.sequence(parts))


def parsing_rule_name():
    """A letter followed by any number of letters or underscores."""
    first_char = rd.regex_char('[a-zA-Z]')
    other_chars = rd.zero_or_more(rd.regex_char('[a-zA-Z_]'))
    return rd.action('noop', rd.sequence([first_char, other_chars]))


def parsing_expression():
    """A sequence, else an ordered choice, else a sub expression."""
    alternatives = [
        parsing_sequence(),
        parsing_ordered_choice(),
        parsing_sub_expression(),
    ]
    return rd.action('parsing_expression', rd.ordered_choice(alternatives))


def parsing_sequence():
    """Two or more elements separated by whitespace."""
    following = rd.sequence([_spaces1(), _choice_or_sub()])
    parts = [_choice_or_sub(), rd.one_or_more(following)]
    return rd.action('parsing_sequence', rd.sequence(parts))


def parsing_ordered_choice():
    """Two or more sub expressions separated by ``/``."""
    alternative = rd.sequence([
        _spaces0(),
        rd.string('/'),
        _spaces0(),
        parsing_sub_expression(),
    ])
    parts = [parsing_sub_expression(), rd.one_or_more(alternative)]
    return rd.action('parsing_ordered_choice', rd.sequence(parts))


def parsing_sub_expression():
    """A predicate, a repetition, a group or an atomic expression."""
    alternatives = [
        parsing_not_predicate(),
        parsing_and_predicate(),
        parsing_optional(),
        parsing_one_or_more(),
        parsing_zero_or_more(),
        parsing_group(),
        parsing_atomic_expression(),
    ]
    return rd.action('parsing_sub_expression', rd.ordered_choice(alternatives))


def parsing_group():
    """A parenthesised expression; the inner expression is built lazily."""
    parts = [
        rd.string('('),
        _spaces0(),
        # rd.rec defers the call, which breaks the construction cycle
        # expression -> group -> expression.
        rd.rec(parsing_expression),
        _spaces0(),
        rd.string(')'),
    ]
    return rd.action('parsing_group', rd.sequence(parts))


def parsing_atomic_expression():
    """A string, a character class, ``EOF`` or a rule call."""
    alternatives = [
        parsing_string(),
        parsing_regex_char(),
        parsing_eof(),
        parsing_rule_call(),
    ]
    return rd.action('parsing_atomic_expression', rd.ordered_choice(alternatives))


def parsing_not_predicate():
    """``!`` followed by its operand."""
    return _prefixed('parsing_not_predicate', '!')


def parsing_and_predicate():
    """``&`` followed by its operand."""
    return _prefixed('parsing_and_predicate', '&')


def parsing_zero_or_more():
    """An operand followed by ``*``."""
    return _suffixed('parsing_zero_or_more', '*')


def parsing_one_or_more():
    """An operand followed by ``+``."""
    return _suffixed('parsing_one_or_more', '+')


def parsing_optional():
    """An operand followed by ``?``."""
    return _suffixed('parsing_optional', '?')


def parsing_rule_call():
    """A rule name used inside an expression."""
    return rd.action('parsing_rule_call', parsing_rule_name())


def parsing_string():
    """A double-quoted string; ``\\"`` is accepted inside it."""
    quote = '"'
    content_char = rd.ordered_choice([
        rd.string('\\"'),
        rd.regex_char('[^"]'),
    ])
    parts = [rd.string(quote), rd.one_or_more(content_char), rd.string(quote)]
    return rd.action('parsing_string', rd.sequence(parts))


def parsing_regex_char():
    """A ``[...]`` character class (optionally negated) or a single ``.``."""
    class_char = rd.ordered_choice([
        rd.string('\\]'),
        rd.string('\\['),
        rd.regex_char('[^\\]]'),
    ])
    char_class = rd.sequence([
        rd.string('['),
        rd.optional(rd.string('^')),
        rd.one_or_more(class_char),
        rd.string(']'),
    ])
    any_char = rd.string('.')
    return rd.action('parsing_regex_char', rd.ordered_choice([char_class, any_char]))


def parsing_eof():
    """The ``EOF`` keyword."""
    return rd.action('parsing_end_of_file', rd.string("EOF"))


def _():
    """A single whitespace character."""
    return rd.action('noop', rd.regex_char('[\\s]'))
# --- tail of simplepeg/speg_parser.py ---
# NOTE: SPEG_parser relies on ``rd`` (rd_parser) and on ``peg()`` defined
# earlier in speg_parser.py.

class SPEG_parser(object):
    """Class that allows you to parse PEG grammars (EBNF-ish style)."""
    # Class-level fallbacks; ``state`` stays None until parse() is called.
    parser = None
    state = None

    def __init__(self):
        self.parser = peg()

    def parse(self, text):
        """Parse *text* from position 0 and return the parser's result.

        The state used for the run is kept in ``self.state`` so that the
        error helpers below can inspect it afterwards.
        """
        self.state = rd.State(
            text=text,
            position=0
        )
        ast = self.parser(self.state)
        return ast

    def getLastExpectations(self):
        """Return ``lastExpectations`` of the most recent parse() state."""
        return self.state.lastExpectations

    def getLastError(self):
        """Return ``rd.getLastError`` for the most recent parse() state."""
        return rd.getLastError(self.state)


# --- simplepeg/speg_visitor.py ---

class PEGJS_visitor(object):
    """Visitor that dispatches on ``node.type`` to the method of that name."""

    def visit(self, node):
        """Call the method named after ``node.type`` with *node*."""
        return getattr(self, node.type)(node)

    def _visit_children(self, node):
        """Visit every child; a missing (``None``) child list yields ``[]``."""
        # An optional node that matched nothing has children set to None.
        return [self.visit(child) for child in (node.children or [])]

    def string(self, node):
        """Return the matched text."""
        return node.match

    def regex_char(self, node):
        """Return the matched text."""
        return node.match

    def sequence(self, node):
        """Return the visited children as a list."""
        return self._visit_children(node)

    def ordered_choice(self, node):
        """Return the visited children as a list."""
        return self._visit_children(node)

    def zero_or_more(self, node):
        """Return the visited children as a list."""
        return self._visit_children(node)

    def one_or_more(self, node):
        """Return the visited children as a list."""
        return self._visit_children(node)

    def optional(self, node):
        """Return the visited children as a list (empty when unmatched)."""
        return self._visit_children(node)

    def and_predicate(self, node):
        """Predicates contribute no value."""
        return None

    def not_predicate(self, node):
        """Predicates contribute no value."""
        return None

    def end_of_file(self, node):
        """End of file contributes no value."""
        return None


class SPEG_actions_visitor(object):
    """Bottom-up visitor that applies named actions to AST nodes."""

    def __init__(self, actions):
        # Object whose methods are looked up by ``node.action``.
        self.actions = actions

    def visit(self, node):
        """Visit children first, then apply the node's action if any.

        The node's ``children`` list is replaced by the visited results.
        A falsy *node* is returned unchanged.  Returns the action's result
        when both ``self.actions`` and ``node.action`` are set, otherwise
        the node itself.
        """
        if not node:
            # Nothing to visit; avoid dereferencing a missing node.
            return node
        if node.children:
            node.children = [self.visit(child) for child in node.children]
        if self.actions and node.action:
            return getattr(self.actions, node.action)(node)
        return node