Skip to content

Commit

Permalink
Improved `simplify`, `apply-assertions`, and `remove-unnecessary-assertions` (#80)
Browse files: browse the repository at this point in the history

* Improved `simplify`, `apply-assertions`, and `remove-unnecessary-assertions`

* Updated DFA min snaps
  • Loading branch information
RunDevelopment authored Oct 22, 2023
1 parent 7578796 commit 8d20d94
Show file tree
Hide file tree
Showing 7 changed files with 197 additions and 51 deletions.
180 changes: 137 additions & 43 deletions src/transformers/apply-assertions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import {
toMatchingDirection,
} from "../ast-analysis";
import { CharSet } from "../char-set";
import { cachedFunc, debugAssert, filterMut } from "../util";
import { assertNever, cachedFunc, debugAssert, filterMut } from "../util";
import { CreationOptions } from "./creation-options";
import {
SingleCharacterParent,
Expand Down Expand Up @@ -804,6 +804,42 @@ function moveAssertionOutsideLoop(
return undefined;
};

/**
 * Recursively checks whether the given node ends with a single-character
 * assertion whose kind equals the `kind` captured from the enclosing scope.
 *
 * Rules per node type:
 * - Assertion: accepted only if its kind equals `kind` and
 *   `isSingleCharacterParent` accepts it.
 * - CharacterClass / Unknown: never accepted.
 * - Alternation: accepted only if every alternative is accepted.
 * - Quantifier: accepted only if `max === 1` and every alternative is accepted.
 * - Concatenation: an empty one is never accepted; otherwise the decision is
 *   delegated to the element at `lastIndexFor(direction)` (presumably the
 *   last element with respect to the matching direction — confirm against
 *   the helper's definition).
 *
 * @param e The element or concatenation to inspect.
 * @returns Whether `e` ends with a fitting single-character assertion.
 */
const endsWithSingleCharAssertion = (e: NoParent<Element | Concatenation>): boolean => {
    if (e.type === "Assertion") {
        return e.kind === kind && isSingleCharacterParent(e);
    }
    if (e.type === "CharacterClass" || e.type === "Unknown") {
        return false;
    }
    if (e.type === "Alternation") {
        return e.alternatives.every(endsWithSingleCharAssertion);
    }
    if (e.type === "Quantifier") {
        // a quantifier that may repeat is rejected outright
        if (e.max !== 1) {
            return false;
        }
        return e.alternatives.every(endsWithSingleCharAssertion);
    }
    if (e.type === "Concatenation") {
        const { elements: items } = e;
        // only a non-empty concatenation has an element to delegate to
        return items.length > 0 && endsWithSingleCharAssertion(atInRange(items, lastIndexFor(direction)));
    }
    // every node type is handled above, so `e` is `never` here
    return assertNever(e);
};

/**
 * Returns whether the given concatenation can be split into two parts A
 * and B, such that B ends with a single-character assertion.
 *
 * A concatenation with fewer than two elements can never be split;
 * otherwise the answer is whatever `endsWithSingleCharAssertion` reports
 * for the concatenation.
 *
 * @param alt The concatenation to test.
 * @returns `true` if such a split exists, `false` otherwise.
 */
const canSplit = (alt: NoParent<Concatenation>): boolean =>
    alt.elements.length >= 2 && endsWithSingleCharAssertion(alt);

for (let i = 0; i < elements.length; i++) {
const quant = elements[i];
if (
Expand All @@ -818,67 +854,125 @@ function moveAssertionOutsideLoop(
// find a fitting assertion
const alt = quant.alternatives[0];
const assertion = getAssertion(alt);
if (!assertion) {
// we couldn't find a fitting assertion
continue;
}
if (assertion) {
// trivially accepting?
let assertionChar = assertion.alternatives[0].elements[0].characters;
assertionChar = assertion.negate ? assertionChar.negate() : assertionChar;
const firstChar = getFirstCharConsumedBy(alt, direction, context.maxCharacter);
const triviallyAccepting = !firstChar.empty && firstChar.char.isSubsetOf(assertionChar);

// store the original quantifier min for later
const originalMin = quant.min;
if (quant.min === 0) {
quant.min = 1;
}

// remove the assertion
context.signalMutation();
alt.elements.splice(alt.elements.indexOf(assertion), 1);

let replacement: NoParent<Element>[];
if (triviallyAccepting) {
// `(a(?!b))+` => `a+(?!b)`
// the assertion has already been removed
replacement = withDirection(direction, [quant, assertion]);
} else {
// `(a(?!b))+` => `a((?!b)a)*(?!b)`
const prefix = copyNode(alt).elements;
const innerAssertion = copyNode(assertion);
pushFront(direction, alt.elements, innerAssertion);
quant.min--;
quant.max--;
if (direction === "ltr") {
replacement = [...prefix, quant, assertion];
} else {
replacement = [assertion, quant, ...prefix];
}
}

// trivially accepting?
let assertionChar = assertion.alternatives[0].elements[0].characters;
assertionChar = assertion.negate ? assertionChar.negate() : assertionChar;
const firstChar = getFirstCharConsumedBy(alt, direction, context.maxCharacter);
const triviallyAccepting = !firstChar.empty && firstChar.char.isSubsetOf(assertionChar);
if (originalMin === 0) {
// we need to wrap it in an optional group
replacement = [
{
type: "Quantifier",
min: 0,
max: 1,
lazy: quant.lazy,
alternatives: [
{
type: "Concatenation",
elements: replacement,
source: copySource(quant.source),
},
],
source: copySource(quant.source),
},
];
}

// store the original quantifier min for later
const originalMin = quant.min;
if (quant.min === 0) {
quant.min = 1;
elements.splice(i, 1, ...replacement);

continue;
}

// remove the assertion
context.signalMutation();
alt.elements.splice(alt.elements.indexOf(assertion), 1);
if (canSplit(alt)) {
if (getMaxDepth(quant) > 20 || countNodes(quant) > 100) {
continue;
}

let replacement: NoParent<Element>[];
if (triviallyAccepting) {
// `(a(?!b))+` => `a+(?!b)`
// the assertion has already been removed
replacement = withDirection(direction, [quant, assertion]);
} else {
// `(a(?!b))+` => `a((?!b)a)*(?!b)`
const prefix = copyNode(alt).elements;
const innerAssertion = copyNode(assertion);
pushFront(direction, alt.elements, innerAssertion);
quant.min--;
quant.max--;
if (direction === "ltr") {
replacement = [...prefix, quant, assertion];
} else {
replacement = [assertion, quant, ...prefix];
// we can split the concatenation into two parts A and B, such
// that B ends with a single-character assertion
let b = [atInRange(alt.elements, lastIndexFor(direction))];
let a = alt.elements.filter(e => e !== b[0]);
if (direction === "rtl") {
[a, b] = [b, a];
}
}

if (originalMin === 0) {
// we need to wrap it in an optional group
replacement = [
// The goal is to transform `(AB)+` => `A(BA)*B`. This should give
// us a better chance at removing assertions
let replacement: NoParent<Element>[] = [
...a.map(copyNode),
{
type: "Quantifier",
min: 0,
max: 1,
min: Math.max(0, quant.min - 1),
max: quant.max - 1,
lazy: quant.lazy,
alternatives: [
{
type: "Concatenation",
elements: replacement,
source: copySource(quant.source),
elements: [...b, ...a],
source: copySource(alt.source),
},
],
source: copySource(quant.source),
},
...b.map(copyNode),
];
}

elements.splice(i, 1, ...replacement);
if (quant.min === 0) {
// we need to wrap it in an optional group
replacement = [
{
type: "Quantifier",
min: 0,
max: 1,
lazy: quant.lazy,
alternatives: [
{
type: "Concatenation",
elements: replacement,
source: copySource(quant.source),
},
],
source: copySource(quant.source),
},
];
}

context.signalMutation();
elements.splice(i, 1, ...replacement);
continue;
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/transformers/remove-unnecessary-assertions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ function analyzeAssertion(
}
const trivialNeighbor = getTrivialResultBecauseOfNeighbor(concatStack, assertion, context);
if (trivialNeighbor !== Result.DEPENDS) {
return trivial;
return trivialNeighbor;
}

// Now that the easy trivial cases are over, we have to be a little more clever. The basic idea here is that we
Expand Down
1 change: 1 addition & 0 deletions src/transformers/simplify.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ export function simplify(options?: Readonly<CreationOptions>): CombinedTransform
sortAssertions(options),
factorOut(options),
makeGreedy(options),
removeUnnecessaryAssertions(options),
mergeWithQuantifier(options),
moveUpEmpty(options),
nestedQuantifiers(options),
Expand Down
2 changes: 1 addition & 1 deletion tests/__snapshots__/dfa-min.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1897,7 +1897,7 @@ RKDXb81h7ByTF8I61P9i/U+XZggHy0KNEilS24U+NqU=
`;

module.exports[n`DFA minimization >> 497: /\b[a-z_]\w*(?=\s*\()\b/i `] = lit`
Bw4gYAa7MtaF7YCPSkJu1quiBjjkEtcDBYGSD0j6sH0=
Pzj56ooZK2dcyafycKwYDm3OyTvuhVK39Cp0cgWIiQI=
`;

module.exports[n`DFA minimization >> 498: /[()>}]|\$[<{]/ `] = lit`
Expand Down
4 changes: 2 additions & 2 deletions tests/transformers/__snapshots__/apply-assertions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@ module.exports[n`Transformers >> apply-assertions >> Prism regex snapshot `] = l
/(?:(?<!\w)0(?:X[\dA-F_]*[\dA-F]|B[01_]*[01])|(?:(?<!\w)\.\d+(?:_+\d+)*|(?<!\w)\d+(?:_+\d+)*(?:\.\d+(?:_+\d+)*)?)(?:E[-+]?\d+(?:_+\d+)*)?)(?:UL|LU|[DFLMU])?\b/i
/>>=?|<<=?|[-=]>|&&|\+\+|--|\|\||~|\?\?=?|[-!%&*+/<=>|^]=?/
/\?\.?|::|[(),.:;[\]{}]/
/\b(?:namespace|using)\s+(?:@|)[A-Z_a-z]\w*\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*\b)*(?=\s*[;{])/
/\b(?:namespace|using)\s+(?:@|)[A-Z_a-z]\w*\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*(?:\b(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*\b)*\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*)?\b)?(?=\s*[;{])/
/\b(?:default|typeof|sizeof)\s*\(\s*(?:[^\s()]|\s(?!\s*\))|\((?:[^()]|\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\))*\))*(?=\s*\))/
/(?:\((?:[^-%&()*+,/;<=>[\]\^|]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>|\((?:[^()]|\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\))*\)|\[\s*(?:,\s*)*\])+(?:,(?:[^-%&()*+,/;<=>[\]\^|]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>|\((?:[^()]|\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\))*\)|\[\s*(?:,\s*)*\])+)+\)|(?:@|(?!\b(?:class|enum|interface|struct|add|alias|and|ascending|async|await|by|descending|from|get|global|group|into|join|let|nameof|not|notnull|on|or|orderby|partial|remove|select|set|unmanaged|value|when|where|where|abstract|as|base|break|case|catch|checked|const|continue|default|delegate|do|else|event|explicit|extern|finally|fixed|for|foreach|goto|if|implicit|in|internal|is|lock|namespace|new|null|operator|out|override|params|private|protected|public|readonly|ref|return|sealed|sizeof|stackalloc|static|switch|this|throw|try|typeof|unchecked|unsafe|using|virtual|volatile|while|yield)(?:(?<!\w)\w|(?<=\w)(?!\w)))(?<!\w))[A-Z_a-z]\w*\b(?:\s*<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>)?(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*\b(?:\s*<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>)?)*)(?:\s*(?:\?\s*)?\[\s*(?:,\s*)*\])*(?:\s*\?)?(?=\s+(?:(?:@|(?!\b(?:class|enum|interface|struct|add|alias|and|ascending|async|await|by|descending|from|get|global|group|into|join|let|nameof|not|notnull|on|or|orderby|partial|remove|select|set|unmanaged|value|when|where|where|abstract|as|base|break|case|catch|checked|const|continue|default|delegate|do|else|event|explicit|extern|finally|fixed|for|foreach|goto|if|implicit|in|internal|is|lock|namespace|new|null|operator|out|override|params|private|protected|public|readonly|ref|return|sealed|sizeof|stackalloc|static|switch|this|throw|try|typeof|unchecked|unsafe|using|virtual|volatile|while|yield)(?:(?<!\w)\w|(?<=\w)(?!\w)))(?<!\w))[A-Z_a-z]\w*\b(?:\s*<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=
>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>)?(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*\b(?:\s*<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>)?)*\s*(?:=>|[({]|\.\s*this\s*\[)|this\s*\[))/
/(?<!\w)new\s+(?:\((?:[^-%&()*+,/;<=>[\]\^|]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>|\((?:[^()]|\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\))*\)|\[\s*(?:,\s*)*\])+(?:,(?:[^-%&()*+,/;<=>[\]\^|]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>|\((?:[^()]|\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\))*\)|\[\s*(?:,\s*)*\])+)+\)|(?:@|(?!\b(?:class|enum|interface|struct|add|alias|and|ascending|async|await|by|descending|from|get|global|group|into|join|let|nameof|not|notnull|on|or|orderby|partial|remove|select|set|unmanaged|value|when|where|where|abstract|as|base|break|case|catch|checked|const|continue|default|delegate|do|else|event|explicit|extern|finally|fixed|for|foreach|goto|if|implicit|in|internal|is|lock|namespace|new|null|operator|out|override|params|private|protected|public|readonly|ref|return|sealed|sizeof|stackalloc|static|switch|this|throw|try|typeof|unchecked|unsafe|using|virtual|volatile|while|yield)(?:(?<!\w)\w|(?<=\w)(?!\w)))(?<!\w))[A-Z_a-z]\w*\b(?:\s*<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>)?(?:\s*\.\s*(?:@|(?<!\w))[A-Z_a-z]\w*\b(?:\s*<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<(?:[^-%&*+/;<=>|^]|<[^-%&*+/;<=>|^]*>)*>)*>)*>)?)*)(?:\s*(?:\?\s*)?\[\s*(?:,\s*)*\])*(?:\s*\?)?(?=\s*[([{])/
Expand Down Expand Up @@ -1222,7 +1222,7 @@ module.exports[n`Transformers >> apply-assertions >> Prism regex snapshot `] = l
/&&&|<<<|>>>|\^\^\^|~~~|&&|\*\*|\.\.|::|<<|>>|<-|->|[!:=]=|<?\|{1,3}>?|\??(?:<=|>=|<>|[-%*+/<=>])\??|[!&?^]|~[-+~]|:>|:\?>?/
/\[<.+?>\]/
/(?:"""[^]*?"""|@"(?:""|[^"])*"|"(?:\\[^]|[^"\\])*")B?|'(?:[^'\\]|\\(?:.|\d{3}|x[\dA-Fa-f]{2}|u[\dA-Fa-f]{4}|U[\dA-Fa-f]{8}))'B?/
/(?:\b(?:exception|inherit|interface|new|of|type)\s+|\w\s*:\s*|\s:\??>\s*)[\w.]*(?:\.(?=\w)|\w(?!\w))(?:\s*(?:->|\*)\s*[\w.]*(?:\.(?=\w)|\w(?!\w)))*(?!\s*[.:])/
/(?:\b(?:exception|inherit|interface|new|of|type)\s+|\w\s*:\s*|\s:\??>\s*)[\w.]*(?:\.(?=\w)|\w(?!\w))(?:\s*(?:->|\*)\s*[\w.]*(?:(?:\.(?=\w)|\w(?!\w))\s*(?:->|\*)\s*[\w.]*)*(?:\.(?=\w)|\w(?!\w)))?(?!\s*[.:])/
/^[\t\x0b\f \xa0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]*#.*/m
/(?<!\w)0x[\dA-Fa-f]+(?:un|lf|LF)?\b/
/(?<!\w)0b[01]+(?:y|uy)?\b/
Expand Down
Loading

0 comments on commit 8d20d94

Please sign in to comment.