From ab45d80fa502cbb5a85cc5dcb4397fde8a1de678 Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Sat, 1 Jun 2024 23:39:59 +0200 Subject: [PATCH] regex: implement `\xAF` and `\X1234` BSLS escape codes (fix #21607) (#21632) --- vlib/regex/README.md | 3 + vlib/regex/regex.v | 153 +++++++++++++++++++++++++++++++--------- vlib/regex/regex_test.v | 12 ++++ 3 files changed, 135 insertions(+), 33 deletions(-) diff --git a/vlib/regex/README.md b/vlib/regex/README.md index bb7f9ed3ca86c8..a9ce7b94b35648 100644 --- a/vlib/regex/README.md +++ b/vlib/regex/README.md @@ -103,6 +103,9 @@ A meta-char can match different types of characters. - `\a` matches only a lowercase char `[a-z]` - `\A` matches only an uppercase char `[A-Z]` +- `\x41` match a byte of value 0x41, `A` in ascii code +- `\X414C` match two consecutive bytes of value 0x414c, `AL` in ascii code + ### Quantifier Each token can have a quantifier, that specifies how many times the character diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index 9625480cbebbd8..a85801523090c7 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -17,9 +17,7 @@ module regex import strings pub const v_regex_version = '1.0 alpha' // regex module version - pub const max_code_len = 256 // default small base code len for the regex programs - pub const max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30 // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode @@ -32,29 +30,17 @@ pub const no_match_found = -1 // Errors pub const compile_ok = 0 // the regex string compiled, all ok - pub const err_char_unknown = -2 // the char used is unknow to the system - pub const err_undefined = -3 // the compiler symbol is undefined - pub const err_internal_error = -4 // Bug in the regex system!! - pub const err_cc_alloc_overflow = -5 // memory for char class full!! - pub const err_syntax_error = -6 // syntax error in regex compiling - pub const err_groups_overflow = -7 // max number of groups reached - pub const err_groups_max_nested = -8 // max number of nested group reached - pub const err_group_not_balanced = -9 // group not balanced - pub const err_group_qm_notation = -10 // group invalid notation - pub const err_invalid_or_with_cc = -11 // invalid or on two consecutive char class - pub const err_neg_group_quantifier = -12 // negation groups can not have quantifier - pub const err_consecutive_dots = -13 //************************************* @@ -66,9 +52,7 @@ const ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit avail // AA = 00 regular class // AA = 01 Negated class ^ char const ist_char_class = u32(0xD1000000) // MASK - const ist_char_class_pos = u32(0xD0000000) // char class normal [abc] - const ist_char_class_neg = u32(0xD1000000) // char class negate [^abc] // dot char 10 0110 xx xxxxxxxx @@ -82,7 +66,6 @@ const ist_or_branch = u32(0x91000000) // OR case // groups 10 010Y xx xxxxxxxx const ist_group_start = u32(0x92000000) // group start ( - const ist_group_end = u32(0x94000000) // group end ) // control instructions @@ -257,6 +240,7 @@ mut: // char ch rune // char of the token if any ch_len u8 // char len + flag u8 // flag for general usage // Quantifiers / branch rep_min int // used also for jump next in the OR branch [no match] pc jump rep_max int // used also for jump next in the OR branch [ match] pc jump @@ -294,13 +278,9 @@ fn (mut tok Token) reset() { * ******************************************************************************/ pub const f_nl = 0x00000001 // end the match when find a new line symbol - pub const f_ms = 0x00000002 // match true only if the match is at the start of the string - pub const f_me = 0x00000004 // match true only if the match is at the end of the string - pub const f_efm = 0x00000100 // exit on first token matched, used by search - pub const f_bin = 0x00000200 // work only on bytes, ignore utf-8 // behaviour modifier flags @@ -419,17 +399,22 @@ enum BSLS_parse_state { bsls_found bsls_char normal_char + hex_char } // parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present -fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) { +fn (re RE) parse_bsls(in_txt string, in_i int) (int, int, u32) { mut status := BSLS_parse_state.start mut i := in_i + mut hex_max_len := 2 + mut hex_res := u32(0) + mut hex_count := 0 for i < in_txt.len { // get our char char_tmp, char_len := re.get_char(in_txt, i) ch := u8(char_tmp) + // println("ch [${ch:c}]") if status == .start && ch == `\\` { status = .bsls_found @@ -441,26 +426,86 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) { if status == .bsls_found { for c, x in regex.bsls_validator_array { if x.ch == ch { - return c, i - in_i + 1 + return c, i - in_i + 1, hex_res } } + + // check for \x00 hex 8bit + if ch == `x` { + status = .hex_char + hex_max_len = 2 + i += char_len + continue + } + + // check for \x00 hex 16bit + if ch == `X` { + status = .hex_char + hex_max_len = 4 + i += char_len + continue + } + status = .normal_char continue } + // manage hex byte + if status == .hex_char { + if ch >= `0` && ch <= `9` { + hex_count++ + hex_res <<= 4 + hex_res += u32(ch - `0`) + i += char_len + } else if ch >= `A` && ch <= `F` { + hex_count++ + hex_res <<= 4 + hex_res += u32(ch - `A` + 10) + i += char_len + } else if ch >= `a` && ch <= `f` { + hex_count++ + hex_res <<= 4 + hex_res += u32(ch - `a` + 10) + i += char_len + } else { + return regex.err_syntax_error, i, hex_res + } + + // println("hex_res: ${hex_res:08x} hex_count: ${hex_count}") + + // look for more hex digits + if hex_count < hex_max_len { + continue + } + + // if over 8 nibble is more than 32 bit, error + if hex_count > hex_max_len { + return regex.err_syntax_error, i - in_i, hex_res + } + + if hex_count == hex_max_len { + // we have a good result + // println("RESULT hex_res: ${hex_res:08x} hex_count: ${hex_count}") + return -2, i - in_i, hex_res + } + + // MUST NOT BE HERE! + return regex.err_syntax_error, i, hex_res + } + // no BSLS validator, manage as normal escape char char if status == .normal_char { if ch in regex.bsls_escape_list { - return regex.no_match_found, i - in_i + 1 + return regex.no_match_found, i - in_i + 1, hex_res } - return regex.err_syntax_error, i - in_i + 1 + return regex.err_syntax_error, i - in_i + 1, hex_res } // at the present time we manage only one char after the \ break } // not our bsls return KO - return regex.err_syntax_error, i + return regex.err_syntax_error, i, hex_res } /****************************************************************************** @@ -1200,8 +1245,10 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) { // ist_bsls_char if char_len == 1 && pc >= 0 { if u8(char_tmp) == `\\` { - bsls_index, tmp := re.parse_bsls(in_txt, i) - // println("index: $bsls_index str:${in_txt[i..i+tmp]}") + // if the index is negative: + // -1 ERROR + // -2 hex byte code BLSL + bsls_index, tmp, hex_res := re.parse_bsls(in_txt, i) if bsls_index >= 0 { i = i + tmp re.prog[pc].ist = u32(0) | regex.ist_bsls_char @@ -1212,6 +1259,34 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) { pc = pc + 1 continue } + // hex char + // this code can mange up to \x for 32 bit + // at the present time only 8/16 bit are used + else if bsls_index == -2 { + mut value := hex_res + mut value_list := []u32{cap: 4} + mut count := 0 + for value > 0 { + value_list << value & 0xFF + value = value >> 8 + count++ + } + + count-- + for count >= 0 { + re.prog[pc].ist = regex.ist_simple_char + re.prog[pc].ch = value_list[count] + re.prog[pc].ch_len = u8(char_len) + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + re.prog[pc].flag = 1 // state a byte char + // println("char: ${char_tmp:c}") + pc = pc + 1 + count-- + } + i = i + tmp + continue + } // this is an escape char, skip the bsls and continue as a normal char else if bsls_index == regex.no_match_found { i += char_len @@ -1514,7 +1589,11 @@ pub fn (re RE) get_code() string { } else if ist == regex.ist_group_end { res.write_string(') GROUP_END #:${tk.group_id}') } else if ist == regex.ist_simple_char { - res.write_string('[${tk.ch:1c}] query_ch') + if tk.flag == 0 { + res.write_string('[${tk.ch:1c}] query_ch') + } else { + res.write_string('[0x${tk.ch:02X}]HEXquery_ch') + } } if tk.rep_max == regex.max_quantifier { @@ -1615,10 +1694,14 @@ pub fn (re RE) get_query() string { // char alone if ch == regex.ist_simple_char { - if u8(ch) in regex.bsls_escape_list { - res.write_string('\\') + if tk.flag == 0 { + if u8(ch) in regex.bsls_escape_list { + res.write_string('\\') + } + res.write_string('${tk.ch:c}') + } else { + res.write_string('\\x${tk.ch:02x}') } - res.write_string('${tk.ch:c}') } // quantifier @@ -1799,7 +1882,11 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) { buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ") if ist == regex.ist_simple_char { - buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]') + if re.prog[state.pc].flag == 0 { + buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]') + } else { + buf2.write_string('query_ch: [0x${re.prog[state.pc].ch:02X}]') + } } else { if ist == regex.ist_bsls_char { buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]') diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v index 9d97af0bd68523..e812f076b353db 100644 --- a/vlib/regex/regex_test.v +++ b/vlib/regex/regex_test.v @@ -193,6 +193,18 @@ match_test_suite = [ // test has `\0` chars TestItem{"abcxyz", "^abc\0xyz$", -1,3}, TestItem{"abc\0xyz", "^abc\0xyz$", 0,7}, + + // test hex byte chars + TestItem{"abc_xyz", r"abc\x5Fxyz", 0,7}, + TestItem{"abc_xyz", r"^abc\x5fxyz$", 0,7}, + TestItem{"abcAxyz", r"^abc\x41xyz$", 0,7}, + TestItem{"abcAAxyz", r"^abc\x41+xyz$", 0,8}, + TestItem{"abcALxyz", r"^abc\x41\x4Cxyz$", 0,8}, + TestItem{"abcAAxyz", r"^abc\X4141xyz$", 0,8}, + TestItem{"abcALxyz", r"^abc\X414cxyz$", 0,8}, + TestItem{"abcALxyz", r"^abc\X414Cxyz$", 0,8}, + TestItem{"abcBxyz", r"^abc\x41+xyz$", -1,3}, + ] )