Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions benchmarks/src/regex_match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,23 @@ void bm_match_sequence_of_as(benchmark::State& state, const char* pattern, synta
}
}

// Benchmarks regex_match of `pattern` (compiled with `syntax`) against an input made of the
// ten-character unit "aaaaaaaaab" (nine 'a's followed by one 'b') repeated state.range() times.
void bm_match_sequence_of_9a1b(benchmark::State& state, const char* pattern, syntax_option_type syntax = ECMAScript) {
    // Build the subject string once, outside of the measured loop.
    string input;
    for (auto remaining = state.range(); remaining > 0; --remaining) {
        input.append("aaaaaaaaab");
    }

    // The regex is compiled once as well; only the matching below is inside the benchmark loop.
    regex re{pattern, syntax};

    for (auto _ : state) {
        benchmark::DoNotOptimize(input);
        const char* const first = input.data();
        const char* const last  = first + input.size();
        cmatch results;
        regex_match(first, last, results, re);
    }
}

// Adds the arguments 100, 200 and 400 (in that order) to the benchmark `bm`.
// Each Arg() call is made on the result of the previous one, exactly like a chained call.
void common_args(auto bm) {
    auto* const with_100 = bm->Arg(100);
    auto* const with_200 = with_100->Arg(200);
    with_200->Arg(400);
}
Expand All @@ -36,5 +53,6 @@ BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(b|a)*", "(b|a)*")->Apply(common_arg
// Each registration passes the pattern text twice: once as the second macro argument and once as
// the `pattern` argument forwarded to the benchmark function. Presumably the former acts as the
// case label - confirm against the Google Benchmark documentation for BENCHMARK_CAPTURE.
// common_args is applied to every registration.
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(?:b|a)*", "(a)(?:b|a)*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(b|a)*", "(a)(b|a)*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(?:b|a)*c", "(a)(?:b|a)*c")->Apply(common_args);
// Nested-loop pattern run by bm_match_sequence_of_9a1b against repeated "aaaaaaaaab" input.
BENCHMARK_CAPTURE(bm_match_sequence_of_9a1b, "(?:a*b)*", "(?:a*b)*")->Apply(common_args);

BENCHMARK_MAIN();
34 changes: 22 additions & 12 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -1279,7 +1279,8 @@ enum _Node_flags : int { // flags for nfa nodes with special properties
_Fl_class_cl_all_bits = 0x800, // TRANSITION, ABI: GH-5242
_Fl_begin_needs_w = 0x100,
_Fl_begin_needs_s = 0x200,
_Fl_begin_needs_d = 0x400
_Fl_begin_needs_d = 0x400,
_Fl_rep_branchless = 0x800,
};

_BITMASK_OPS(_EMPTY_ARGUMENT, _Node_flags)
Expand Down Expand Up @@ -1900,7 +1901,7 @@ private:
void _Quantifier();
bool _Alternative();
void _Disjunction();
void _Calculate_loop_simplicity(_Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep);
void _Calculate_loop_simplicity(_Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep, bool _Nonreentrant);

_FwdIt _Pat;
_FwdIt _End;
Expand Down Expand Up @@ -4129,7 +4130,7 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
_Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep;
auto& _Sav = _Loop_vals[_Nr->_Loop_number];
bool _Greedy = (_Nr->_Flags & _Fl_greedy) != 0;
if (_Nr->_Simple_loop != 0) {
if (_Nr->_Simple_loop != 0 || (_Nr->_Flags & _Fl_rep_branchless) != 0) {
if (_Sav._Loop_idx == 1) {
auto& _Base_frame = _Frames[_Sav._Loop_frame_idx];
_Sav._Loop_length = _STD distance(_Base_frame._Pos, _Tgt_state._Cur);
Expand All @@ -4146,6 +4147,7 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N

// allocate stack frame holding loop-specific unwinding opcode for second rep and beyond
auto _New_frame_code = _Base_frame._Code == _Rx_unwind_ops::_Loop_simple_greedy_firstrep
|| _Base_frame._Code == _Rx_unwind_ops::_Loop_greedy
? _Rx_unwind_ops::_Loop_simple_greedy_lastrep
: _Rx_unwind_ops::_Do_nothing;
auto _New_frame_idx = _Push_frame(_New_frame_code, _Nr);
Expand Down Expand Up @@ -5424,33 +5426,35 @@ void _Parser2<_FwdIt, _Elem, _RxTraits>::_Disjunction() { // check for valid dis

template <class _FwdIt, class _Elem, class _RxTraits>
void _Parser2<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
_Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep) {
_Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep, const bool _Nonreentrant) {
// walks regex NFA, calculates values of _Node_rep::_Simple_loop
for (; _Nx != _Ne && _Nx; _Nx = _Nx->_Next) {
switch (_Nx->_Kind) {
case _N_if:
// _Node_if inside a _Node_rep makes the rep not simple
if (_Outer_rep) {
_Outer_rep->_Simple_loop = 0;
_Outer_rep->_Flags &= ~_Fl_rep_branchless;
}

// visit each branch of the if
for (_Node_if* _Branch = static_cast<_Node_if*>(_Nx)->_Child; _Branch; _Branch = _Branch->_Child) {
_Calculate_loop_simplicity(_Branch->_Next, _Branch->_Endif, _Outer_rep);
_Calculate_loop_simplicity(_Branch->_Next, _Branch->_Endif, _Outer_rep, _Nonreentrant);
}
break;

case _N_assert:
// A positive lookahead assertion inside a _Node_rep makes the rep not simple
if (_Outer_rep) {
_Outer_rep->_Simple_loop = 0;
_Outer_rep->_Flags &= ~_Fl_rep_branchless;
}
_FALLTHROUGH;

case _N_neg_assert:
// visit the assertion body
// note _Outer_rep being reset: the assertion regex is completely independent
_Calculate_loop_simplicity(static_cast<_Node_assert*>(_Nx)->_Child, nullptr, nullptr);
_Calculate_loop_simplicity(static_cast<_Node_assert*>(_Nx)->_Child, nullptr, nullptr, true);
break;

case _N_rep:
Expand All @@ -5459,15 +5463,20 @@ void _Parser2<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
// If _Outer_rep can repeat at most once, we have to analyze the structure of the inner loop.
if (_Outer_rep) {
_Outer_rep->_Simple_loop = 0;
auto _Inner_rep = static_cast<_Node_rep*>(_Nx);
if (_Outer_rep->_Max >= 0 && _Outer_rep->_Max <= 1) {
_Calculate_loop_simplicity(_Inner_rep->_Next, _Inner_rep->_End_rep->_Next, _Inner_rep);
_Nx = _Inner_rep->_End_rep;
} else {
_Outer_rep->_Flags &= ~_Fl_rep_branchless;
auto _Inner_rep = static_cast<_Node_rep*>(_Nx);
_Inner_rep->_Flags |= _Fl_rep_branchless;
const bool _Inner_nonreentrant = _Outer_rep->_Max >= 0 && _Outer_rep->_Max <= 1 && _Nonreentrant;
if (!_Inner_nonreentrant) {
_Inner_rep->_Simple_loop = 0;
}

_Calculate_loop_simplicity(
_Inner_rep->_Next, _Inner_rep->_End_rep->_Next, _Inner_rep, _Inner_nonreentrant);
_Nx = _Inner_rep->_End_rep;
} else {
_Outer_rep = static_cast<_Node_rep*>(_Nx);
_Outer_rep->_Flags |= _Fl_rep_branchless;
}
break;

Expand All @@ -5492,6 +5501,7 @@ void _Parser2<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
if (_Coll_diff_size || _Node->_Equiv
|| ((_Flags & regex_constants::collate) && (_Node->_Ranges || (_Node->_Flags & _Fl_negate)))) {
_Outer_rep->_Simple_loop = 0;
_Outer_rep->_Flags &= ~_Fl_rep_branchless;
}
}
break;
Expand Down Expand Up @@ -5530,7 +5540,7 @@ _Root_node* _Parser2<_FwdIt, _Elem, _RxTraits>::_Compile() { // compile regular
_Res = _Nfa._End_pattern();
_Res->_Fl = _Flags;
_Res->_Marks = _Mark_count();
_Calculate_loop_simplicity(_Res, nullptr, nullptr);
_Calculate_loop_simplicity(_Res, nullptr, nullptr, true);
_Guard._Target = nullptr;
return _Res;
}
Expand Down
32 changes: 32 additions & 0 deletions tests/std/tests/VSO_0000000_regex_use/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2408,6 +2408,37 @@ void test_gh_5944() {
}
}

void test_gh_6022() {
    // GH-6022: Optimize matching of branchless loops
    //
    // Every case pairs a subject with a pattern in which capture groups sit inside nested loops and
    // are referenced by backreferences after the loops. The cases are checked in the listed order.
    struct loop_case {
        const char* subject;
        const char* pattern;
        bool expect_match;
    };

    const loop_case cases[] = {
        // inner loop with a single capture group, unbounded repetition
        {"acddabb", R"((?:([ac])*d)*ab\1b)", true},
        {"acdaadabab", R"((?:([ac])*d)*ab\1b)", true},
        {"acddabcb", R"((?:([ac])*d)*dab\1b)", true},

        // inner loop with explicit repetition bounds
        {"addabb", R"((?:(a){0,1}d)*ab\1b)", true},
        {"adadabab", R"((?:(a){0,1}d)*ab\1b)", true},
        {"adadabb", R"((?:(a){0,1}d)*adadab\1b)", true},
        {"adaddabab", R"((?:(a){0,1}d)*ab\1b)", false},
        {"dabb", R"((?:(a){1,1}d)+ab\1b)", false},
        {"addabb", R"((?:(a){1,2}d)+ab\1b)", false},
        {"adaadabb", R"((?:(a){1,2}d)+ab\1b)", false},

        // inner loop with two capture groups and progressively longer literal tails
        {"bacabcdacdabbaddabbcdcdbc", R"((?:(?:([abc])([abc]))*d)+cd\1\2)", true},
        {"bacabcdacdabbaddabbcdcdab", R"((?:(?:([abc])([abc]))*d)+dcd\1\2)", false},
        {"bacabcdacdabbaddabbcdcdab", R"((?:(?:([abc])([abc]))*d)+bcdcd\1\2)", false},
        {"bacabcdacdabbaddabbcdcd", R"((?:(?:([abc])([abc]))*d)*abbcdcd\1\2)", true},
        {"bacabcdacdabbaddabbcdcdba", R"((?:(?:([abc])([abc]))*d)*dabbcdcd\1\2)", true},
        {"bacabcdacdabbaddabbcdcdba", R"((?:(?:([abc])([abc]))*d)+ddabbcdcd\1\2)", false},
        {"bacabcdacdabbaddabbcdcdab", R"((?:(?:([abc])([abc]))*d)+baddabbcdcd\1\2)", false},
        {"bacabcdacdabbaddabbcdcdac", R"((?:(?:([abc])([abc]))*d)*abbaddabbcdcd\1\2)", true},
        {"bacabcdacdabbaddabbcdcdac", R"((?:(?:([abc])([abc]))*d)*dabbaddabcdcd\1\2)", false},
        {"bacabcdacdabbaddabbcdcdbc", R"((?:(?:([abc])([abc]))*d)*acdabbaddabbcdcd\1\2)", true},
        {"bacabcdacdabbaddabbcdcdbc", R"((?:(?:([abc])([abc]))*d)*dacdabbaddabbcdcd\1\2)", false},
        {"bacabcdacdabbaddabbcdcdca", R"((?:(?:([abc])([abc]))*d)*bcdacdabbaddabbcdcd\1\2)", false},
        {"bacabcdacdabbaddabbcdcdba", R"((?:(?:([abc])([abc]))*d)*cabcdacdabbaddabbcdcd\1\2)", false},
        {"bacabcdacdabbaddabbcdcd", R"((?:(?:([abc])([abc]))*d)*bacabcdacdabbaddabbcdcd\1\2)", true},
    };

    for (const loop_case& current : cases) {
        if (current.expect_match) {
            g_regexTester.should_match(current.subject, current.pattern);
        } else {
            g_regexTester.should_not_match(current.subject, current.pattern);
        }
    }
}

int main() {
test_dev10_449367_case_insensitivity_should_work();
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
Expand Down Expand Up @@ -2469,6 +2500,7 @@ int main() {
test_gh_5918();
test_gh_5939();
test_gh_5944();
test_gh_6022();

return g_regexTester.result();
}