Skip to content

Commit d1fd74e

Browse files
committed
wasm: Make room for iter node arity in CST representation
1 parent 678d76a commit d1fd74e

File tree

5 files changed

+145
-80
lines changed

5 files changed

+145
-80
lines changed

packages/miniohm-js/index.js

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,13 @@
22

33
const WASM_PAGE_SIZE = 64 * 1024;
44
const INPUT_BUFFER_OFFSET = WASM_PAGE_SIZE;
5+
const CST_NODE_TYPE_MASK = 0b11;
6+
7+
const CstNodeType = {
8+
NONTERMINAL: 0,
9+
TERMINAL: 1,
10+
ITER: 2,
11+
};
512

613
// Bit flags for Unicode categories, based on the order that they appear in
714
// https://www.unicode.org/Public/16.0.0/ucd/extracted/DerivedGeneralCategory.txt
@@ -206,15 +213,15 @@ export class CstNode {
206213
}
207214

208215
isNonterminal() {
209-
return this._type >= 0;
216+
return (this._type & CST_NODE_TYPE_MASK) === CstNodeType.NONTERMINAL;
210217
}
211218

212219
isTerminal() {
213-
return this._type === -1;
220+
return (this._type & CST_NODE_TYPE_MASK) === CstNodeType.TERMINAL;
214221
}
215222

216223
isIter() {
217-
return this._type === -2;
224+
return (this._type & CST_NODE_TYPE_MASK) === CstNodeType.ITER;
218225
}
219226

220227
isOptional() {
@@ -226,8 +233,8 @@ export class CstNode {
226233
}
227234

228235
get ruleName() {
229-
const id = this._view.getInt32(this._base + 8, true);
230-
return this._ruleNames[id];
236+
const ruleId = this._view.getInt32(this._base + 8, true) >>> 2;
237+
return this._ruleNames[ruleId].split('<')[0];
231238
}
232239

233240
get count() {
@@ -239,8 +246,7 @@ export class CstNode {
239246
}
240247

241248
get _type() {
242-
const t = this._view.getInt32(this._base + 8, true);
243-
return t < 0 ? t : 0;
249+
return this._view.getInt32(this._base + 8, true);
244250
}
245251

246252
get children() {

packages/wasm/runtime/ohmRuntime.ts

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@ declare function matchUnicodeChar(categoryBitmap: i32): bool;
1616

1717
// CST nodes
1818
@inline const CST_NODE_OVERHEAD: usize = 16;
19-
@inline const NODE_TYPE_TERMINAL: i32 = -1;
20-
@inline const NODE_TYPE_ITERATION: i32 = -2;
19+
20+
// Node type is given by the two least significant bits.
21+
@inline const NODE_TYPE_NONTERMINAL: i32 = 0;
22+
@inline const NODE_TYPE_TERMINAL: i32 = 1;
23+
@inline const NODE_TYPE_ITERATION: i32 = 2;
2124

2225
// Memo table entries
2326
type MemoEntry = i32;
@@ -87,12 +90,8 @@ export function dummy(i: i32): void {
8790
store<i32>(ptr, len, 4);
8891
}
8992

90-
@inline function cstGetType(ptr: usize): i32 {
91-
return load<i32>(ptr, 8);
92-
}
93-
94-
@inline function cstSetType(ptr: usize, t: i32): void {
95-
store<i32>(ptr, t, 8);
93+
@inline function cstSetTypeAndDetails(ptr: usize, val: i32): void {
94+
store<i32>(ptr, val, 8);
9695
}
9796

9897
@inline function cstGetFailurePos(ptr: usize): i32 {
@@ -251,20 +250,20 @@ export function newTerminalNode(startIdx: i32, endIdx: i32): usize {
251250
const ptr = heap.alloc(CST_NODE_OVERHEAD);
252251
cstSetCount(ptr, 0);
253252
cstSetMatchLength(ptr, endIdx - startIdx);
254-
cstSetType(ptr, NODE_TYPE_TERMINAL);
253+
cstSetTypeAndDetails(ptr, NODE_TYPE_TERMINAL);
255254
cstSetFailurePos(ptr, 0);
256255
bindings.push(ptr);
257256
return ptr;
258257
}
259258

260259
// Create an internal (non-leaf) node (IterationNode or NonterminalNode).
261-
@inline function newNonLeafNode(startIdx: i32, endIdx: i32, type: i32, origNumBindings: i32, failurePos: i32): usize {
260+
@inline function newNonLeafNode(startIdx: i32, endIdx: i32, typeAndDetails: i32, origNumBindings: i32, failurePos: i32): usize {
262261
const bindingsLen = bindings.length;
263262
const numChildren = bindingsLen - origNumBindings;
264263
const ptr = heap.alloc(CST_NODE_OVERHEAD + numChildren * 4);
265264
cstSetCount(ptr, numChildren);
266265
cstSetMatchLength(ptr, endIdx - startIdx);
267-
cstSetType(ptr, type);
266+
cstSetTypeAndDetails(ptr, typeAndDetails);
268267
cstSetFailurePos(ptr, failurePos);
269268
for (let i = 0; i < numChildren; i++) {
270269
store<i32>(ptr + CST_NODE_OVERHEAD + i * 4, bindings[bindingsLen - numChildren + i]);
@@ -275,11 +274,13 @@ export function newTerminalNode(startIdx: i32, endIdx: i32): usize {
275274
}
276275

277276
export function newNonterminalNode(startIdx: i32, endIdx: i32, ruleId: i32, origNumBindings: i32, failurePos: i32): usize {
278-
return newNonLeafNode(startIdx, endIdx, ruleId, origNumBindings, failurePos);
277+
const typeAndDetails = (ruleId << 2) | NODE_TYPE_NONTERMINAL;
278+
return newNonLeafNode(startIdx, endIdx, typeAndDetails, origNumBindings, failurePos);
279279
}
280280

281-
export function newIterationNode(startIdx: i32, endIdx: i32, origNumBindings: i32): usize {
282-
return newNonLeafNode(startIdx, endIdx, NODE_TYPE_ITERATION, origNumBindings, -1);
281+
export function newIterationNode(startIdx: i32, endIdx: i32, origNumBindings: i32, arity: i32): usize {
282+
const typeAndDetails = (arity << 2) | NODE_TYPE_ITERATION;
283+
return newNonLeafNode(startIdx, endIdx, typeAndDetails, origNumBindings, -1);
283284
}
284285

285286
export function getBindingsLength(): i32 {

packages/wasm/src/index.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ class Assembler {
603603
this.getSavedPos();
604604
this.globalGet('pos');
605605
this.getSavedNumBindings();
606+
this.i32Const(0); // TODO: arity
606607
this.callPrebuiltFunc('newIterationNode');
607608
}
608609

packages/wasm/test/test-toAST.js

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,3 +180,55 @@ test('toAST basic', async t => {
180180
};
181181
t.deepEqual(ast, expected, 'proper AST with explicity reintroduced node');
182182
});
183+
184+
// eslint-disable-next-line ava/no-skip-test
185+
test.skip('listOf and friends - #394', async t => {
186+
// By default, toAST assumes that lexical rules represent indivisible tokens,
187+
// but that doesn't make sense for listOf, nonemptyListOf, and emptyListOf.
188+
const g = ohm.grammar(`
189+
G {
190+
Exp = listOf<digit, "+">
191+
Exp2 = ListOf<digit, "+">
192+
}
193+
`);
194+
const m = await wasmMatcherForGrammar(g);
195+
196+
const ast = (input, mapping, ruleName = 'Exp') => {
197+
const toAst = toAstWithMapping(mapping);
198+
m.setInput(input);
199+
return toAst(m.match(ruleName));
200+
};
201+
// const astSyntactic = (input, mapping) => ast(input, mapping, 'Exp2');
202+
203+
// By default, the `listOf` action should pass through, and both `nonemptyListOf`
204+
// and `emptyListOf` should return an array.
205+
t.deepEqual(ast('3+5'), ['3', '5']);
206+
t.deepEqual(ast(''), []);
207+
208+
// // The AST should be the same whether we use `listOf` or `ListOf`.
209+
// t.deepEqual(ast('3+5'), astSyntactic('3 + 5'));
210+
// t.deepEqual(ast(''), astSyntactic(''));
211+
212+
// // Ensure that it's still possible to override the default mappings.
213+
214+
// t.is(
215+
// ast('0+1', {
216+
// nonemptyListOf: (first, sep, rest) => 'XX',
217+
// }),
218+
// 'XX',
219+
// );
220+
221+
// t.is(
222+
// ast('1+2', {
223+
// nonemptyListOf: 0,
224+
// }),
225+
// '1',
226+
// );
227+
228+
// t.is(
229+
// ast('', {
230+
// emptyListOf: () => 'nix',
231+
// }),
232+
// 'nix',
233+
// );
234+
});

packages/wasm/test/test-wasm.js

Lines changed: 64 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,11 @@ test('cst returns', async t => {
4343
t.is(root.ruleName, 'start');
4444

4545
// "a"
46-
let {matchLength, _type, children} = root.children[0];
47-
t.is(children.length, 0);
48-
t.is(matchLength, 1);
49-
t.is(_type, -1);
46+
let term = root.children[0];
47+
t.is(term.children.length, 0);
48+
t.is(term.matchLength, 1);
49+
t.is(term._type, 1);
50+
t.true(term.isTerminal());
5051

5152
matcher = await wasmMatcherForGrammar(ohm.grammar('G { start = "a" b\nb = "b" }'));
5253

@@ -59,22 +60,21 @@ test('cst returns', async t => {
5960

6061
// "a"
6162
const [childA, childB] = root.children;
62-
({matchLength, _type, children} = childA);
63-
t.is(children.length, 0);
64-
t.is(matchLength, 1);
65-
t.is(_type, -1);
63+
t.is(childA.children.length, 0);
64+
t.is(childA.matchLength, 1);
65+
t.true(childA.isTerminal());
6666

6767
// NonterminalNode for b
6868
t.is(childB.children.length, 1);
6969
t.is(childB.matchLength, 1);
7070
t.is(childB.ruleName, 'b');
7171

7272
// TerminalNode for "b"
73-
// eslint-disable-next-line no-unused-vars
74-
({matchLength, _type, children} = childB.children[0]);
75-
t.is(children.length, 0);
76-
t.is(matchLength, 1);
77-
t.is(_type, -1);
73+
term = childB.children[0];
74+
t.is(term.children.length, 0);
75+
t.is(term.matchLength, 1);
76+
t.true(term.isTerminal());
77+
t.is(term.ctorName, '_terminal');
7878
});
7979

8080
test('cst with lookahead', async t => {
@@ -138,7 +138,7 @@ test('cst for opt', async t => {
138138
// iter
139139
let iter = root.children[0];
140140
t.is(iter.matchLength, 1);
141-
t.is(iter._type, -2);
141+
t.true(iter.isIter());
142142
t.is(iter.children.length, 1);
143143
t.is(iter.children[0].isTerminal(), true);
144144
t.is(iter.children[0].matchLength, 1);
@@ -157,7 +157,7 @@ test('cst for opt', async t => {
157157
// eslint-disable-next-line no-unused-vars
158158
iter = root.children[0];
159159
t.is(iter.matchLength, 0);
160-
t.is(iter._type, -2);
160+
t.true(iter.isIter());
161161
t.is(iter.children.length, 0);
162162
});
163163

@@ -176,7 +176,7 @@ test('cst for plus', async t => {
176176
// eslint-disable-next-line no-unused-vars
177177
const iter = root.children[0];
178178
t.is(iter.matchLength, 1);
179-
t.is(iter._type, -2);
179+
t.true(iter.isIter());
180180
t.is(iter.children.length, 1);
181181

182182
t.is(iter.children[0].isTerminal(), true);
@@ -205,7 +205,7 @@ test('cst with (small) repetition', async t => {
205205
const iter = root.children[0];
206206
t.is(iter.matchLength, 3);
207207
t.is(iter.children.length, 3);
208-
t.is(iter._type, -2);
208+
t.true(iter.isIter());
209209

210210
// Terminal children
211211
const [childA, childB, childC] = iter.children;
@@ -229,39 +229,36 @@ test('cst with repetition and lookahead', async t => {
229229
t.is(matchWithInput(matcher, input), 1);
230230

231231
// x
232-
let {matchLength, _type, children} = matcher.getCstRoot();
233-
t.is(matchLength, 3);
234-
t.is(children.length, 1);
235-
t.is(_type, 0);
232+
const root = matcher.getCstRoot();
233+
t.is(root.matchLength, 3);
234+
t.is(root.children.length, 1);
235+
t.true(root.isNonterminal());
236236

237237
// iter
238-
({matchLength, _type, children} = children[0]);
239-
t.is(matchLength, 3);
240-
t.is(children.length, 3);
241-
t.is(_type, -2);
238+
const iter = root.children[0];
239+
t.is(iter.matchLength, 3);
240+
t.is(iter.children.length, 3);
241+
t.true(iter.isIter());
242242

243-
const [childA, childB, childC] = children;
244-
({matchLength, _type, children} = childA);
245-
t.is(matchLength, 1);
246-
t.is(children.length, 1);
247-
t.is(_type, 0);
248-
t.is(children[0].isTerminal(), true);
249-
t.is(children[0].matchLength, 1);
243+
const [childA, childB, childC] = iter.children;
244+
t.is(childA.matchLength, 1);
245+
t.is(childA.children.length, 1);
246+
t.true(childA.isNonterminal());
247+
t.true(childA.children[0].isTerminal());
248+
t.is(childA.children[0].matchLength, 1);
250249

251-
({matchLength, _type, children} = childB);
252-
t.is(matchLength, 1);
253-
t.is(children.length, 1);
254-
t.is(_type, 0);
255-
t.is(children[0].isTerminal(), true);
256-
t.is(children[0].matchLength, 1);
250+
t.is(childB.matchLength, 1);
251+
t.is(childB.children.length, 1);
252+
t.true(childB.isNonterminal());
253+
t.true(childB.children[0].isTerminal());
254+
t.is(childB.children[0].matchLength, 1);
257255

258256
// eslint-disable-next-line no-unused-vars
259-
({matchLength, _type, children} = childC);
260-
t.is(matchLength, 1);
261-
t.is(children.length, 1);
262-
t.is(_type, 0);
263-
t.is(children[0].isTerminal(), true);
264-
t.is(children[0].matchLength, 1);
257+
t.is(childC.matchLength, 1);
258+
t.is(childC.children.length, 1);
259+
t.true(childC.isNonterminal());
260+
t.true(childC.children[0].isTerminal());
261+
t.is(childC.children[0].matchLength, 1);
265262

266263
matcher = await wasmMatcherForGrammar(ohm.grammar('G {x = (~space any)+ spaces any+}'));
267264
input = '/ab xy';
@@ -564,34 +561,31 @@ test('basic memoization', async t => {
564561
return view.getUint32(colOffset + SIZEOF_UINT32 * ruleId, true);
565562
};
566563

567-
const cstRoot = matcher.getCstRoot();
564+
const root = matcher.getCstRoot();
568565

569566
// start
570-
let {matchLength, _type, children} = cstRoot;
571-
t.is(matchLength, 2);
572-
t.is(children.length, 2);
573-
t.is(_type, 0);
567+
t.is(root.matchLength, 2);
568+
t.is(root.children.length, 2);
569+
t.is(root.ctorName, 'start');
574570

575-
const [childA, childB] = children;
571+
const [childA, childB] = root.children;
576572

577573
// "a"
578-
t.is(childA.isTerminal(), true);
574+
t.true(childA.isTerminal());
579575
t.is(childA.matchLength, 1);
580576

581577
// b
582-
// eslint-disable-next-line no-unused-vars
583-
({matchLength, _type, children} = childB);
584-
t.is(matchLength, 1);
585-
t.is(children.length, 1);
586-
t.is(_type, 0);
578+
t.is(childB.matchLength, 1);
579+
t.is(childB.children.length, 1);
580+
t.is(childB.ctorName, 'b');
587581

588582
// "b"
589-
t.is(children[0].isTerminal(), true);
590-
t.is(children[0].matchLength, 1);
583+
t.true(childB.children[0].isTerminal());
584+
t.is(childB.children[0].matchLength, 1);
591585

592586
// Expect memo for `b` at position 1, and `start` at position 0.
593587
t.is(getMemo(1, 'b'), childB._base);
594-
t.is(getMemo(0, 'start'), cstRoot._base);
588+
t.is(getMemo(0, 'start'), root._base);
595589
});
596590

597591
test('more memoization', async t => {
@@ -979,3 +973,14 @@ test.failing('unicode', async t => {
979973
t.is(matchWithInput(m, source), 1);
980974
t.is(unparse(m), source);
981975
});
976+
977+
// eslint-disable-next-line ava/no-skip-test
978+
test.skip('iter node map', async t => {
979+
const m = await wasmMatcherForGrammar(ohm.grammar('G { Start = (letter digit)* }'));
980+
t.is(matchWithInput(m, 'a1 b2 c 3'), 1);
981+
const iter = m.getCstRoot().children[0];
982+
t.deepEqual(
983+
iter.map((letter, digit) => `${digit}${letter}`),
984+
['1a', '2b', '3c'],
985+
);
986+
});

0 commit comments

Comments
 (0)