-
Notifications
You must be signed in to change notification settings - Fork 1
/
parsing.py
134 lines (123 loc) · 4.8 KB
/
parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import cfg
def scan(code):
"""Take a string and yield a series of tokens."""
# Add a newline to the end to allow peeking at the next character
# without requiring a check for end-of-string
code += "\n"
i = 0
while i < len(code):
char = code[i]
if char in cfg.WHITESPACE:
# Whitespace--ignore
pass
elif char == cfg.LINE_COMMENT_CHAR:
# Start of a comment--scan till newline
while code[i+1] != "\n":
i += 1
elif char in cfg.PARENS:
# Parentheses are their own tokens
yield char
elif char == cfg.STRING_DELIMITER:
# Start of a string literal--scan till the end of it
# A string literal looks like:
# "chars \"and\" \\escapes"
# which represents this string:
# chars "and" \escapes
start = i
i += 1
while code[i] != cfg.STRING_DELIMITER:
if code[i] == "\n":
# String literals that are unterminated by the end
# of a line get autoclosed
yield code[start:i] + cfg.STRING_DELIMITER
# Back the index up a notch so the newline gets
# scanned in its own right on the next iteration
i -= 1
break
elif code[i] == cfg.STRING_ESCAPE_CHAR:
# Include the whole escape sequence together
i += 2
# But make sure we didn't just walk off the end of
# the code
if i >= len(code):
cfg.warn("unterminated escape sequence in string")
yield code[start:i] + cfg.STRING_DELIMITER
i -= 1
break
else:
# Include a single character
i += 1
else:
yield code[start:i+1]
else:
# Start of a regular, non-extended token (symbol or
# numeric literal)--scan till the end of it
start = i
while code[i+1] not in cfg.SPECIAL_CHARS:
i += 1
yield code[start:i+1]
i += 1
def parse(code):
"""Take a series of expressions, yield a series of parse trees.
The code can be a string or an iterator that yields tokens.
Each resulting parse tree is a nested list.
"""
if isinstance(code, str):
# If we're given a raw codestring, scan it before parsing
code = scan(code)
try:
while True:
token = next(code)
if token == "(":
# After an opening parenthesis, parse expressions until
# the matching closing parenthesis and yield the
# resulting nested list
yield parse_expressions(code)
elif token == ")":
# Ignore unmatched closing parentheses with a warning
cfg.warn("unmatched closing parenthesis")
else:
# If the code doesn't start with a parenthesis, it must
# be a symbol, a string literal, or a numeric literal
yield parse_symbol_or_literal(token)
except StopIteration:
# Everything has been parsed
pass
def parse_expressions(code):
"""Take a token iterator and parse expressions from it until ).
This function assumes we're parsing an s-expression and that the
opening parenthesis has already been processed. So we parse the items
or (sub)expressions in the s-expr one after the other, turning them
into a list. When we go to parse another item and we find a closing
parenthesis, we've hit the end of the s-expr, so we return nil.
"""
parsed = []
while True:
try:
token = next(code)
except StopIteration:
# If the s-expression is unfinished and we've run out of tokens,
# supply the missing close-paren
token = ")"
if token == ")":
break
elif token == "(":
# The subexpression is itself a list
expr = parse_expressions(code)
else:
expr = parse_symbol_or_literal(token)
parsed.append(expr)
return parsed
def parse_symbol_or_literal(token):
"Take a string and parse it as a literal or a symbol."""
if token.startswith(cfg.STRING_DELIMITER):
# String literal--should be the same syntax as a Python string
# literal, so try just eval'ing it
# TODO: error handling
return eval(token)
if token.isdigit() or token.startswith("-") and token[1:].isdigit():
# Integer literal
return int(token)
else:
# If it's not any kind of recognized literal, it's a symbol
return cfg.Symbol(token)