-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenize.c
190 lines (170 loc) · 5.26 KB
/
tokenize.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#include "shucc.h"
/**
 * Try to consume the current token as the reserved symbol op.
 * The global token cursor advances only when the current token is a
 * TK_RESERVED token whose text is exactly op.
 * @param op symbol to match against the current token
 * @return bool true if the token matched and was skipped, false otherwise
 */
bool consume(char *op) {
    // Kind first, then length, then bytes: the cheap checks short-circuit the memcmp.
    bool matched = token->kind == TK_RESERVED &&
                   strlen(op) == token->len &&
                   memcmp(token->str, op, token->len) == 0;
    if (matched) {
        token = token->next;
    }
    return matched;
}
/**
 * Consume the current token if it is an identifier.
 * @return the identifier token that was skipped, or NULL if the current
 *         token is not a TK_IDENT (the token cursor is then left untouched)
 */
Token *consume_ident() {
    if (token->kind != TK_IDENT) {
        // The return type is a pointer, so "no match" is reported as NULL
        // rather than the boolean constant false.
        return NULL;
    }
    Token *ident = token;
    token = token->next;
    return ident;
}
/**
 * Require the current token to be the reserved symbol op and skip it.
 * If the current token is not a TK_RESERVED token spelling exactly op,
 * error() is called with a message naming the expected symbol.
 * @param op symbol that must appear at the current position
 */
void expect(char *op) {
    bool matched = token->kind == TK_RESERVED &&
                   strlen(op) == token->len &&
                   memcmp(token->str, op, token->len) == 0;
    if (!matched) {
        error("This is not '%s'", op);
    }
    token = token->next;
}
/**
* Expect number and set the value of the token.
* @return val the value of the token
*/
int expect_number() {
if (token->kind != TK_NUM) {
error("This is not a number");
}
int val = token->val;
token = token->next;
return val;
}
/**
 * Test whether the current token is the reserved symbol op without
 * advancing the token cursor.
 * @param op symbol or operator to compare against the current token
 * @return bool true if the current token is a TK_RESERVED spelling exactly op
 */
bool peek(char *op) {
    return token->kind == TK_RESERVED &&
           strlen(op) == token->len &&
           memcmp(token->str, op, token->len) == 0;
}
/**
 * Create a new token, link it as the successor of cur, and return it.
 * The token keeps its own heap-allocated, NUL-terminated copy of the first
 * len characters of str; all other fields start out zeroed by calloc.
 * @param kind kind of the new token
 * @param cur current token; its next pointer is set to the new token
 * @param str start of the token text inside the input string
 * @param len number of characters of str that belong to the token
 * @return tok the newly created token
 */
Token *new_token(TokenKind kind, Token *cur, char *str, int len) {
    Token *tok = calloc(1, sizeof *tok);
    // Per the original author's note, map_insert compares strings with strcmp,
    // so the token text must be NUL-terminated. The buffer is one byte longer
    // than the text and zero-filled by calloc, which provides the terminator.
    char *text = calloc((size_t)len + 1, sizeof *text);
    if (tok == NULL || text == NULL) {
        // NOTE(review): error() is presumed not to return, as the other
        // callers in this file also assume - confirm against its definition.
        error("Failed to allocate memory for a token");
    }
    strncpy(text, str, len); // copies at most len bytes; byte text[len] stays '\0'
    tok->kind = kind;
    tok->str = text;
    tok->len = len;
    cur->next = tok;
    return tok;
}
/**
 * Check if the string p starts with the string q.
 * Uses strncmp rather than memcmp so the comparison stops at the NUL
 * terminator of p and never reads past it when p is shorter than q.
 * @param p NUL-terminated string to be checked
 * @param q NUL-terminated prefix to look for
 * @return bool true if the first strlen(q) characters of p equal q
 */
bool startswith(char *p, char *q) { return strncmp(p, q, strlen(q)) == 0; }
/**
 * Check if c is a letter, a digit, or an underscore.
 * @param c character to be checked
 * @return true if c is a letter, a digit, or _
 */
bool is_alnum(char c) {
    // <ctype.h> functions require a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behavior, hence the cast.
    return isalnum((unsigned char)c) || c == '_';
}
/**
 * Recognize a reserved keyword at the start of p.
 * A keyword only counts when it is not immediately followed by an
 * identifier character, so a name such as return1 is not a keyword.
 * @param p position in the input to be examined
 * @return the matching keyword string, or NULL if p does not start with one
 */
char *read_reserved(char *p) {
    char *keywords[] = {"return", "if", "else", "while", "for", "int", "sizeof"};
    char **end = keywords + sizeof(keywords) / sizeof(keywords[0]);
    for (char **kw = keywords; kw != end; kw++) {
        if (!startswith(p, *kw)) {
            continue;
        }
        // Reject when the keyword is merely the prefix of a longer identifier.
        if (is_alnum(p[strlen(*kw)])) {
            continue;
        }
        return *kw;
    }
    return NULL;
}
/**
 * Tokenize the string p into a linked list of tokens.
 * Whitespace is skipped. Each position is then tried, in order, as a
 * keyword, an identifier, a two-character operator, a single punctuation
 * character, and a decimal number. error() is called for any character
 * that fits none of these. The list always ends with a TK_EOF token.
 * @param p NUL-terminated string to be tokenized
 * @return the first token of the tokenized string
 */
Token *tokenize(char *p) {
    Token head;         // dummy #0 token, used only inside this function
    head.next = NULL;   // nothing follows the dummy yet
    Token *cur = &head; // cur always points at the last token created
    while (*p) {
        // <ctype.h> classifiers are undefined for negative arguments, so the
        // current byte is converted to unsigned char once and reused below.
        unsigned char c = (unsigned char)*p;

        // skip spaces
        if (isspace(c)) {
            p++;
            continue;
        }

        // keywords (return, if, else, ...)
        char *keyword = read_reserved(p);
        if (keyword) {
            int len = strlen(keyword);
            cur = new_token(TK_RESERVED, cur, p, len);
            p += len;
            continue;
        }

        // identifiers: the first character must be a letter or _
        if (isalpha(c) || c == '_') {
            int len = 1;
            while (is_alnum(p[len])) { // later characters may also be digits
                len++;
            }
            cur = new_token(TK_IDENT, cur, p, len);
            p += len;
            continue;
        }

        // two-character operators; tested before the single-character case
        // so that "<=" is not split into "<" and "="
        if (startswith(p, "<=") || startswith(p, ">=") || startswith(p, "!=") || startswith(p, "==")) {
            cur = new_token(TK_RESERVED, cur, p, 2);
            p += 2;
            continue;
        }

        // any remaining punctuation character becomes a one-character
        // TK_RESERVED token ('_' never reaches here: the identifier branch takes it)
        if (ispunct(c)) {
            cur = new_token(TK_RESERVED, cur, p, 1);
            p++;
            continue;
        }

        // numbers: strtol parses the leading base-10 digits and stores the
        // address of the first unparsed character back into p. The token is
        // created with len 0, so its str is empty and only val is filled in.
        if (isdigit(c)) {
            cur = new_token(TK_NUM, cur, p, 0);
            cur->val = strtol(p, &p, 10);
            continue;
        }

        error("Could not tokenize the string");
    }
    new_token(TK_EOF, cur, p, 0); // terminate the list with the EOF token
    return head.next;             // the first real token
}