Skip to content

Commit e04f8d3

Browse files
committed
wasm: Add support for Unicode categories
1 parent 5e850b3 commit e04f8d3

File tree

5 files changed

+122
-36
lines changed

5 files changed

+122
-36
lines changed

packages/miniohm-js/index.js

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,32 @@
33
const WASM_PAGE_SIZE = 64 * 1024;
44
const INPUT_BUFFER_OFFSET = WASM_PAGE_SIZE;
55

6+
// Bit flags for Unicode categories, based on the order that they appear in
// https://www.unicode.org/Public/16.0.0/ucd/extracted/DerivedGeneralCategory.txt
//
// The index of a name in this array is the bit position that selects it in a
// category bitmap (e.g. bit 1 selects 'Lu', bit 2 selects 'Ll').
const UnicodeCategoryNames = [
  'Cn', // Unassigned
  'Lu', // Uppercase_Letter
  'Ll', // Lowercase_Letter
  'Lt', // Titlecase_Letter
  'Lm', // Modifier_Letter
  'Lo', // Other_Letter
];

// Shared decoder. It is constructed without the `fatal` option, so malformed
// byte sequences decode to U+FFFD rather than throwing.
const utf8 = new TextDecoder('utf-8');

// Build a regular expression that matches a single code point belonging to
// any of the Unicode general categories selected by `bitmap`.
//
// `bitmap` is a 32-bit integer in which bit i selects UnicodeCategoryNames[i].
// The returned RegExp has the `u` (unicode) and `y` (sticky) flags.
//
// - If no bit is set, the returned regex never matches. (An empty pattern
//   would match every input, which is the opposite of "no category".)
// - If a bit is set that has no entry in UnicodeCategoryNames, a RangeError
//   is thrown, instead of letting `\p{undefined}` surface as an opaque
//   SyntaxError from the RegExp constructor.
function regexFromCategoryBitmap(bitmap) {
  const cats = [];
  for (let i = 0; i < 32; i++) {
    if ((bitmap & (1 << i)) === 0) continue;
    const name = UnicodeCategoryNames[i];
    if (name === undefined) {
      throw new RangeError(`unknown Unicode category bit ${i} in bitmap 0x${(bitmap >>> 0).toString(16)}`);
    }
    cats.push(name);
  }
  // `(?!)` is an empty negative lookahead: it fails at every position.
  const pattern = cats.length === 0 ? '(?!)' : cats.map(cat => `\\p{${cat}}`).join('|');
  return new RegExp(
    pattern,
    'uy', // u: unicode, y: sticky
  );
}
31+
632
function assert(cond, msg) {
733
if (!cond) {
834
throw new Error(msg ?? 'assertion failed');
@@ -32,12 +58,38 @@ export class WasmMatcher {
3258
return name[0] === name[0].toUpperCase();
3359
},
3460
fillInputBuffer: this._fillInputBuffer.bind(this),
61+
matchUnicodeChar: (catBitmap, pos) => {
62+
const re = regexFromCategoryBitmap(catBitmap);
63+
return re.test(this._nextCodePoint());
64+
},
3565
},
3666
};
3767
this._ruleIds = new Map();
3868
this._ruleNames = [];
3969
}
4070

71+
// Return a JavaScript string containing the next code point from the input
72+
// buffer, and advance pos past it.
73+
_nextCodePoint() {
74+
const {pos, memory} = this._instance.exports;
75+
const offset = pos.value;
76+
const byteArr = new Uint8Array(memory.buffer, INPUT_BUFFER_OFFSET + offset);
77+
const firstByte = byteArr[0];
78+
let len;
79+
if ((firstByte & 0b10000000) === 0) {
80+
len = 1;
81+
} else if ((firstByte & 0b11100000) === 0b11000000) {
82+
len = 2;
83+
} else if ((firstByte & 0b11110000) === 0b11100000) {
84+
len = 3;
85+
} else {
86+
len = 4;
87+
}
88+
const str = utf8.decode(byteArr.subarray(0, len));
89+
pos.value += len;
90+
return str;
91+
}
92+
4193
_extractRuleIds(module) {
4294
const sections = WebAssembly.Module.customSections(module, 'ruleNames');
4395
if (sections.length === 0) {

packages/wasm/runtime/ohmRuntime.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ type ApplyResult = bool;
33
declare function fillInputBuffer(offset: i32, maxLen: i32): i32;
44
declare function printI32(val: i32): void;
55
declare function isRuleSyntactic(ruleId: i32): bool;
6+
declare function matchUnicodeChar(categoryBitmap: i32): bool;
67

78
@inline const IMPLICIT_SPACE_SKIPPING = true;
89

@@ -50,6 +51,10 @@ let rightmostFailurePos: i32 = 0;
5051
let sp: usize = 0;
5152
let bindings: Array<i32> = new Array<i32>();
5253

54+
export function dummy(i: i32): void {
55+
printI32(i);
56+
}
57+
5358
@inline function max<T>(a: T, b: T): T {
5459
return a > b ? a : b;
5560
}
@@ -289,3 +294,8 @@ export function getCstRoot(): usize {
289294
// TODO: Figure out how to handle this w.r.t. leading and trailing space.
290295
return bindings[0];
291296
}
297+
298+
// Exported wrapper around the `matchUnicodeChar` import declared at the top
// of this file: passes `categoryBitmap` through unchanged and returns the
// import's result.
// TODO: Find a way to call this directly from generated code.
export function doMatchUnicodeChar(categoryBitmap: i32): bool {
  return matchUnicodeChar(categoryBitmap);
}

packages/wasm/src/index.js

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,7 @@ export class Compiler {
792792
asm.addBlocktype([w.valtype.i32], [w.valtype.i32]);
793793
asm.addBlocktype([], [w.valtype.i32]); // Rule eval
794794
// (global $runtime/ohmRuntime/pos (mut i32) (i32.const 0))
795+
// (global $runtime/ohmRuntime/rightmostFailurePos (mut i32) (i32.const 0))
795796
// (global $runtime/ohmRuntime/sp (mut i32) (i32.const 0))
796797
// (global $~lib/shared/runtime/Runtime.Stub i32 (i32.const 0))
797798
// (global $~lib/shared/runtime/Runtime.Minimal i32 (i32.const 1))
@@ -1630,35 +1631,50 @@ export class Compiler {
16301631
const isLowercase = 'a' <= c && c <= 'z';
16311632
const isUppercase = 'A' <= c && c <= 'Z';
16321633
if ((exp.category === 'Lu' && isUppercase) || (exp.category === 'Ll' && isLowercase)) {
1633-
return w.labelidx(asm.depthOf('innerSuccess'));
1634+
return w.labelidx(asm.depthOf('fastSuccess'));
16341635
}
16351636
return w.labelidx(asm.depthOf('failure'));
16361637
});
1638+
16371639
this.wrapTerminalLike(() => {
16381640
asm.block(
16391641
w.blocktype.empty,
16401642
() => {
16411643
asm.block(
16421644
w.blocktype.empty,
16431645
() => {
1644-
asm.currCharCode();
1645-
asm.brTable(makeLabels(), w.labelidx(asm.depthOf('default')));
1646+
// Fast path: a jump table for ASCII characters.
1647+
asm.block(
1648+
w.blocktype.empty,
1649+
() => {
1650+
asm.currCharCode();
1651+
asm.brTable(makeLabels(), w.labelidx(asm.depthOf('default')));
1652+
},
1653+
'default',
1654+
);
1655+
// Fall through: not an ASCII character.
1656+
1657+
// Push the arg: a bitmap indicating the categories.
1658+
// prettier-ignore
1659+
switch (exp.category) {
1660+
case 'Lu': asm.i32Const(1 << 1); break;
1661+
case 'Ll': asm.i32Const(1 << 2); break;
1662+
case 'Ltmo': asm.i32Const((1 << 3) | (1 << 4) | (1 << 5)); break;
1663+
default: assert(false, 'not handled');
1664+
}
1665+
asm.callPrebuiltFunc('doMatchUnicodeChar');
1666+
asm.ifElse(
1667+
w.blocktype.empty,
1668+
() => asm.break(asm.depthOf('slowSuccess')),
1669+
() => asm.break(asm.depthOf('failure')),
1670+
);
16461671
},
1647-
'default',
1672+
'fastSuccess',
16481673
);
1649-
// Check for 0xff (end)
1650-
asm.currCharCode();
1651-
asm.i32Const(0xff);
1652-
asm.i32Eq();
1653-
asm.condBreak(asm.depthOf('failure'));
1654-
1655-
// Otherwise, trap.
1656-
// TODO: Replace this with a proper, out-of-line implementation.
1657-
asm.emit(instr.unreachable);
1674+
asm.incPos();
16581675
},
1659-
'innerSuccess',
1676+
'slowSuccess',
16601677
);
1661-
asm.incPos();
16621678
});
16631679
}
16641680
}

packages/wasm/test/test-failurePos.js

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,19 +76,25 @@ function arbitraryEdit(input) {
7676
// - for some randomly-corrupted input, which fails to parse
7777
// - the rightmostFailurePosition reported by a JS matcher and a Wasm matcher
7878
// is the same.
79-
const sameFailurePos = (t, wasmMatcher) =>
80-
fc.property(arbitraryEdit(validInput), input => {
79+
function sameFailurePos(wasmMatcher) {
80+
return fc.property(arbitraryEdit(validInput), input => {
8181
wasmMatcher.setInput(input);
8282
fc.pre(wasmMatcher.match() === 0);
8383
assert.equal(
8484
ns.LiquidHTML.match(input).getRightmostFailurePosition(),
8585
wasmMatcher.getRightmostFailurePosition(),
8686
);
8787
});
88+
}
8889

8990
test('failure pos (fast-check)', async t => {
9091
const m = await wasmMatcherForGrammar(ns.LiquidHTML);
91-
t.notThrows(() => fc.assert(sameFailurePos(t, m), {verbose, includeErrorInReport: true}));
92+
const details = fc.check(sameFailurePos(m), {
93+
includeErrorInReport: true,
94+
interruptAfterTimeLimit: 1000,
95+
});
96+
t.log(`numRuns: ${details.numRuns}`);
97+
t.is(details.failed, false, `${fc.defaultReportMessage(details)}`);
9298
});
9399

94100
test('failure pos: basic 1', async t => {

packages/wasm/test/test-wasm.js

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import test from 'ava';
2+
import assert from 'node:assert/strict';
3+
import * as fc from 'fast-check';
24
import * as ohm from 'ohm-js';
35
import {performance} from 'perf_hooks';
46

@@ -34,14 +36,6 @@ function unparse(m, root) {
3436
return ans;
3537
}
3638

37-
// const dumpMemoTable = pos => {
38-
// const arr = [];
39-
// for (let i = 0; i < 6; i++) {
40-
// arr.push(view.getUint32(pos * Constants.MEMO_COL_SIZE_BYTES + i * 4, true));
41-
// }
42-
// console.log(arr.map(v => v.toString(16).padStart(8, '0')).join(' '));
43-
// };
44-
4539
test('input in memory', async t => {
4640
const g = ohm.grammar('G { start = "x" }');
4741
const matcher = await wasmMatcherForGrammar(g);
@@ -960,15 +954,23 @@ test('space skipping & lex', async t => {
960954
}
961955
});
962956

963-
test('unicode built-ins', async t => {
964-
const g = ohm.grammar(`
965-
G {
966-
Start = lower upper
967-
}`);
957+
// fast-check's stringMatching combiner doesn't support unicode regexes.
958+
const arbitraryStringMatching = regex =>
959+
fc.string({maxLength: 2, unit: 'binary'}).filter(str => regex.test(str));
960+
961+
test('unicode built-ins: non-ASCII (fast-check)', async t => {
962+
const g = ohm.grammar('G { Start = letter letter }');
968963
const m = await wasmMatcherForGrammar(g);
969-
t.is(matchWithInput(m, 'aA'), 1);
970-
t.is(matchWithInput(m, ' aZ'), 1);
971-
t.is(matchWithInput(m, ' zA'), 1);
972-
t.is(matchWithInput(m, 'a@'), 0);
973-
t.is(matchWithInput(m, 'a['), 0);
964+
const hasExpectedResult = wasmMatcher => {
965+
return fc.property(arbitraryStringMatching(/^\p{L}\p{L}$/u), str => {
966+
wasmMatcher.setInput(str);
967+
assert.equal(wasmMatcher.match(), 1);
968+
});
969+
};
970+
const details = fc.check(hasExpectedResult(m), {
971+
includeErrorInReport: true,
972+
interruptAfterTimeLimit: 200,
973+
});
974+
t.log(`numRuns: ${details.numRuns}`);
975+
t.is(details.failed, false, `${fc.defaultReportMessage(details)}`);
974976
});

0 commit comments

Comments
 (0)