Skip to content

Commit

Permalink
More UnicodeSet and CharMap method (#76)
Browse files Browse the repository at this point in the history
  • Loading branch information
RunDevelopment authored Sep 13, 2023
1 parent ceb928d commit 8d684b7
Show file tree
Hide file tree
Showing 8 changed files with 1,756 additions and 97 deletions.
83 changes: 82 additions & 1 deletion src/char-map.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,27 @@ import { CharRange, CharSet } from "./char-set";
import { filterMut } from "./util";

export interface ReadonlyCharMap<T> extends Iterable<[CharRange, T]> {
/**
* Returns whether this map is empty.
*
* This is equivalent to `this.size === 0` and `this.entryCount === 0`.
*/
readonly isEmpty: boolean;
/**
* The number of characters in this map. This is different from {@link entryCount}.
*
* This is equivalent to `[...this.keys()].reduce((count, range) => count + range.max - range.min + 1, 0)`.
*/
readonly size: number;
/**
* The number of entries in this map.
*
* This is different from {@link size}. In general, you should use {@link size}, because it has the same semantics
* as `Set#size` and `Map#size`.
*
* This is equivalent to `[...this.entries()].length`.
*/
readonly entryCount: number;

/**
* Returns whether the given character is a key in the map.
Expand Down Expand Up @@ -44,14 +64,37 @@ export interface ReadonlyCharMap<T> extends Iterable<[CharRange, T]> {
* @param callback
*/
forEach(callback: (value: T, chars: CharRange, map: ReadonlyCharMap<T>) => void): void;
/**
* Returns all ranges of characters that are keys in the map.
*
* Keys will be returned in the same order as `this.entries()`.
*/
keys(): Iterable<CharRange>;
/**
* Returns all values in the map. Values might not be unique if more than one range maps to the same value.
*
* Values will be returned in the same order as `this.entries()`.
*/
values(): Iterable<T>;
entries(range?: CharRange): Iterable<[CharRange, T]>;
/**
* Returns all key-value pairs in the map.
*
* Entries will be returned in the order of ascending ranges.
*/
entries(): Iterable<[CharRange, T]>;

/**
* Returns a mapping from the values of this map to its keys.
*/
invert(maxCharacter: Char): Map<T, CharSet>;

/**
* Returns a new map with all values mapped by the given function.
*
* If no function is given, the identity function is used.
*/
copy(): CharMap<T>;
copy<U>(mapFn: (value: T) => U): CharMap<U>;
}

interface Item<T> {
Expand All @@ -76,6 +119,16 @@ export class CharMap<T> implements ReadonlyCharMap<T> {
/**
 * Whether this map contains no entries at all.
 */
get isEmpty(): boolean {
    const { length } = this._array;
    return length === 0;
}
/**
 * The total number of characters covered by the ranges of all entries.
 *
 * Ranges are inclusive on both ends, so each entry contributes
 * `max - min + 1` characters.
 */
get size(): number {
    return this._array.reduce((total, { range }) => total + (range.max - range.min + 1), 0);
}
/**
 * The number of entries (range-value pairs) stored in this map.
 *
 * This counts ranges, not the characters inside them.
 */
get entryCount(): number {
    const { length } = this._array;
    return length;
}

private _indexOf(char: Char): number | undefined {
let l = 0;
Expand Down Expand Up @@ -402,6 +455,34 @@ export class CharMap<T> implements ReadonlyCharMap<T> {
this._array = [];
}

/**
 * Creates a new map from the entries of this map.
 *
 * Without `mapFn`, every entry is copied as is. With `mapFn`, every value is
 * replaced by `mapFn(value)`, and directly adjacent ranges whose mapped values
 * are strictly equal (`===`) are merged into a single entry.
 *
 * The item objects of the returned map are new, so changing the entries of
 * the copy does not change the items of this map.
 *
 * @param mapFn Optional function applied to each value, in entry order.
 * @returns The newly created map.
 */
copy(): CharMap<T>;
copy<U>(mapFn: (value: T) => U): CharMap<U>;
copy<U>(mapFn?: (value: T) => U): CharMap<U> {
    if (mapFn) {
        const mapped = new CharMap<U>();
        mapped._array = this._array.map(({ range, value }) => ({ range, value: mapFn(value) }));

        // Mapping may give neighbouring ranges the same value, so fuse them.
        // NOTE(review): relies on `filterMut` passing the previously kept item
        // as the second argument - confirm against ./util.
        filterMut(mapped._array, (current, previous) => {
            if (!previous) {
                return true;
            }
            if (previous.range.max + 1 !== current.range.min || previous.value !== current.value) {
                return true;
            }

            // Extend the previous entry over the current range and drop the current item.
            previous.range = { min: previous.range.min, max: current.range.max };
            return false;
        });

        return mapped;
    }

    // Plain copy: fresh item objects that keep the same range and value.
    const clone = new CharMap<T>();
    clone._array = this._array.map(({ range, value }) => ({ range, value }));

    // Only the parameterless overload reaches this point, and it returns
    // `CharMap<T>`; the implementation signature cannot express that.
    return clone as unknown as CharMap<U>;
}

map(mapFn: (value: T, chars: CharRange, map: ReadonlyCharMap<T>) => T): void {
for (const item of this._array) {
item.value = mapFn(item.value, item.range, this);
Expand Down
16 changes: 16 additions & 0 deletions src/js/char-case-folding.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,23 @@ import { Maximum } from "./maximum";
import { UnicodeCaseFolding } from "./unicode";
import { UTF16CaseFolding } from "./utf16-case-folding";

/**
* A set of functions that can be used to perform case-insensitive matching.
*
* It must fulfill the following conditions:
*
* 1. `canonicalize` must be idempotent, i.e. `canonicalize(canonicalize(char)) === canonicalize(char)`.
* 2. `toCharSet(canonicalize(a))` is the set of all characters `c` such that `canonicalize(a) === canonicalize(c)`.
*/
export interface CharCaseFolding {
/**
* The canonicalization function. This typically maps characters to their lowercase form.
*
* If no function is given, then the identity function is used. This also implies that `toCharSet` must return a
* set containing only the given character.
*
* @default char => char
*/
readonly canonicalize?: (char: Char) => Char;
/**
* Maps a character to a set of characters.
*
* For a canonicalized character `canonicalize(a)`, the returned set must be the set of all characters `c` such
* that `canonicalize(a) === canonicalize(c)`.
*
* If `canonicalize` is not given, the returned set must contain only the given character.
*/
readonly toCharSet: (char: Char) => CharSet;
}
Expand Down
21 changes: 1 addition & 20 deletions src/js/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -730,27 +730,8 @@ export class Parser {
return context.nc.newCharClass(element, chars.chars);
}

// ECMAScript spec says that alternatives are sorted by descending length.
// This isn't enough for uniqueness though, so we also sort by code point.
const words = [...chars.accept.wordSets];
if (!chars.chars.isEmpty) {
words.push([chars.chars]);
}
words.sort((a, b) => {
if (a.length !== b.length) {
return b.length - a.length;
}
for (let i = 0; i < a.length; i++) {
const diff = a[i].compare(b[i]);
if (diff !== 0) {
return diff;
}
}
return 0;
});

const alternation = context.nc.newAlt(element);
for (const word of words) {
for (const word of chars.wordSets) {
const alternative = context.nc.newConcat(element);
alternation.alternatives.push(alternative);

Expand Down
Loading

0 comments on commit 8d684b7

Please sign in to comment.