Skip to content

Commit

Permalink
regex: implement \xAF and \X1234 BSLS escape codes (fix vlang#21607
Browse files Browse the repository at this point in the history
  • Loading branch information
penguindark authored and raw-bin committed Jul 2, 2024
1 parent 7dc209d commit 8ce60f7
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 33 deletions.
3 changes: 3 additions & 0 deletions vlib/regex/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ A meta-char can match different types of characters.
- `\a` matches only a lowercase char `[a-z]`
- `\A` matches only an uppercase char `[A-Z]`

- `\x41` match a byte of value 0x41, `A` in ascii code
- `\X414C` match two consecutive bytes of value 0x414c, `AL` in ascii code

### Quantifier

Each token can have a quantifier, that specifies how many times the character
Expand Down
153 changes: 120 additions & 33 deletions vlib/regex/regex.v
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@ module regex
import strings

pub const v_regex_version = '1.0 alpha' // regex module version

pub const max_code_len = 256 // default small base code len for the regex programs

pub const max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30

// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
Expand All @@ -32,29 +30,17 @@ pub const no_match_found = -1

// Errors
pub const compile_ok = 0 // the regex string compiled, all ok

pub const err_char_unknown = -2 // the char used is unknow to the system

pub const err_undefined = -3 // the compiler symbol is undefined

pub const err_internal_error = -4 // Bug in the regex system!!

pub const err_cc_alloc_overflow = -5 // memory for char class full!!

pub const err_syntax_error = -6 // syntax error in regex compiling

pub const err_groups_overflow = -7 // max number of groups reached

pub const err_groups_max_nested = -8 // max number of nested group reached

pub const err_group_not_balanced = -9 // group not balanced

pub const err_group_qm_notation = -10 // group invalid notation

pub const err_invalid_or_with_cc = -11 // invalid or on two consecutive char class

pub const err_neg_group_quantifier = -12 // negation groups can not have quantifier

pub const err_consecutive_dots = -13

//*************************************
Expand All @@ -66,9 +52,7 @@ const ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit avail
// AA = 00 regular class
// AA = 01 Negated class ^ char
const ist_char_class = u32(0xD1000000) // MASK

const ist_char_class_pos = u32(0xD0000000) // char class normal [abc]

const ist_char_class_neg = u32(0xD1000000) // char class negate [^abc]

// dot char 10 0110 xx xxxxxxxx
Expand All @@ -82,7 +66,6 @@ const ist_or_branch = u32(0x91000000) // OR case

// groups 10 010Y xx xxxxxxxx
const ist_group_start = u32(0x92000000) // group start (

const ist_group_end = u32(0x94000000) // group end )

// control instructions
Expand Down Expand Up @@ -257,6 +240,7 @@ mut:
// char
ch rune // char of the token if any
ch_len u8 // char len
flag u8 // flag for general usage
// Quantifiers / branch
rep_min int // used also for jump next in the OR branch [no match] pc jump
rep_max int // used also for jump next in the OR branch [ match] pc jump
Expand Down Expand Up @@ -294,13 +278,9 @@ fn (mut tok Token) reset() {
*
******************************************************************************/
pub const f_nl = 0x00000001 // end the match when find a new line symbol

pub const f_ms = 0x00000002 // match true only if the match is at the start of the string

pub const f_me = 0x00000004 // match true only if the match is at the end of the string

pub const f_efm = 0x00000100 // exit on first token matched, used by search

pub const f_bin = 0x00000200 // work only on bytes, ignore utf-8

// behaviour modifier flags
Expand Down Expand Up @@ -419,17 +399,22 @@ enum BSLS_parse_state {
bsls_found
bsls_char
normal_char
hex_char
}

// parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present
fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) {
fn (re RE) parse_bsls(in_txt string, in_i int) (int, int, u32) {
mut status := BSLS_parse_state.start
mut i := in_i
mut hex_max_len := 2
mut hex_res := u32(0)
mut hex_count := 0

for i < in_txt.len {
// get our char
char_tmp, char_len := re.get_char(in_txt, i)
ch := u8(char_tmp)
// println("ch [${ch:c}]")

if status == .start && ch == `\\` {
status = .bsls_found
Expand All @@ -441,26 +426,86 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) {
if status == .bsls_found {
for c, x in regex.bsls_validator_array {
if x.ch == ch {
return c, i - in_i + 1
return c, i - in_i + 1, hex_res
}
}

// check for \x00 hex 8bit
if ch == `x` {
status = .hex_char
hex_max_len = 2
i += char_len
continue
}

// check for \x00 hex 16bit
if ch == `X` {
status = .hex_char
hex_max_len = 4
i += char_len
continue
}

status = .normal_char
continue
}

// manage hex byte
if status == .hex_char {
if ch >= `0` && ch <= `9` {
hex_count++
hex_res <<= 4
hex_res += u32(ch - `0`)
i += char_len
} else if ch >= `A` && ch <= `F` {
hex_count++
hex_res <<= 4
hex_res += u32(ch - `A` + 10)
i += char_len
} else if ch >= `a` && ch <= `f` {
hex_count++
hex_res <<= 4
hex_res += u32(ch - `a` + 10)
i += char_len
} else {
return regex.err_syntax_error, i, hex_res
}

// println("hex_res: ${hex_res:08x} hex_count: ${hex_count}")

// look for more hex digits
if hex_count < hex_max_len {
continue
}

// if over 8 nibble is more than 32 bit, error
if hex_count > hex_max_len {
return regex.err_syntax_error, i - in_i, hex_res
}

if hex_count == hex_max_len {
// we have a good result
// println("RESULT hex_res: ${hex_res:08x} hex_count: ${hex_count}")
return -2, i - in_i, hex_res
}

// MUST NOT BE HERE!
return regex.err_syntax_error, i, hex_res
}

// no BSLS validator, manage as normal escape char char
if status == .normal_char {
if ch in regex.bsls_escape_list {
return regex.no_match_found, i - in_i + 1
return regex.no_match_found, i - in_i + 1, hex_res
}
return regex.err_syntax_error, i - in_i + 1
return regex.err_syntax_error, i - in_i + 1, hex_res
}

// at the present time we manage only one char after the \
break
}
// not our bsls return KO
return regex.err_syntax_error, i
return regex.err_syntax_error, i, hex_res
}

/******************************************************************************
Expand Down Expand Up @@ -1200,8 +1245,10 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
// ist_bsls_char
if char_len == 1 && pc >= 0 {
if u8(char_tmp) == `\\` {
bsls_index, tmp := re.parse_bsls(in_txt, i)
// println("index: $bsls_index str:${in_txt[i..i+tmp]}")
// if the index is negative:
// -1 ERROR
// -2 hex byte code BLSL
bsls_index, tmp, hex_res := re.parse_bsls(in_txt, i)
if bsls_index >= 0 {
i = i + tmp
re.prog[pc].ist = u32(0) | regex.ist_bsls_char
Expand All @@ -1212,6 +1259,34 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
pc = pc + 1
continue
}
// hex char
// this code can mange up to \x for 32 bit
// at the present time only 8/16 bit are used
else if bsls_index == -2 {
mut value := hex_res
mut value_list := []u32{cap: 4}
mut count := 0
for value > 0 {
value_list << value & 0xFF
value = value >> 8
count++
}

count--
for count >= 0 {
re.prog[pc].ist = regex.ist_simple_char
re.prog[pc].ch = value_list[count]
re.prog[pc].ch_len = u8(char_len)
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
re.prog[pc].flag = 1 // state a byte char
// println("char: ${char_tmp:c}")
pc = pc + 1
count--
}
i = i + tmp
continue
}
// this is an escape char, skip the bsls and continue as a normal char
else if bsls_index == regex.no_match_found {
i += char_len
Expand Down Expand Up @@ -1514,7 +1589,11 @@ pub fn (re RE) get_code() string {
} else if ist == regex.ist_group_end {
res.write_string(') GROUP_END #:${tk.group_id}')
} else if ist == regex.ist_simple_char {
res.write_string('[${tk.ch:1c}] query_ch')
if tk.flag == 0 {
res.write_string('[${tk.ch:1c}] query_ch')
} else {
res.write_string('[0x${tk.ch:02X}]HEXquery_ch')
}
}

if tk.rep_max == regex.max_quantifier {
Expand Down Expand Up @@ -1615,10 +1694,14 @@ pub fn (re RE) get_query() string {

// char alone
if ch == regex.ist_simple_char {
if u8(ch) in regex.bsls_escape_list {
res.write_string('\\')
if tk.flag == 0 {
if u8(ch) in regex.bsls_escape_list {
res.write_string('\\')
}
res.write_string('${tk.ch:c}')
} else {
res.write_string('\\x${tk.ch:02x}')
}
res.write_string('${tk.ch:c}')
}

// quantifier
Expand Down Expand Up @@ -1799,7 +1882,11 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ")

if ist == regex.ist_simple_char {
buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
if re.prog[state.pc].flag == 0 {
buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
} else {
buf2.write_string('query_ch: [0x${re.prog[state.pc].ch:02X}]')
}
} else {
if ist == regex.ist_bsls_char {
buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]')
Expand Down
12 changes: 12 additions & 0 deletions vlib/regex/regex_test.v
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,18 @@ match_test_suite = [
// test has `\0` chars
TestItem{"abcxyz", "^abc\0xyz$", -1,3},
TestItem{"abc\0xyz", "^abc\0xyz$", 0,7},

// test hex byte chars
TestItem{"abc_xyz", r"abc\x5Fxyz", 0,7},
TestItem{"abc_xyz", r"^abc\x5fxyz$", 0,7},
TestItem{"abcAxyz", r"^abc\x41xyz$", 0,7},
TestItem{"abcAAxyz", r"^abc\x41+xyz$", 0,8},
TestItem{"abcALxyz", r"^abc\x41\x4Cxyz$", 0,8},
TestItem{"abcAAxyz", r"^abc\X4141xyz$", 0,8},
TestItem{"abcALxyz", r"^abc\X414cxyz$", 0,8},
TestItem{"abcALxyz", r"^abc\X414Cxyz$", 0,8},
TestItem{"abcBxyz", r"^abc\x41+xyz$", -1,3},

]
)

Expand Down

0 comments on commit 8ce60f7

Please sign in to comment.