From 803666337bb91bf2e149922dcdb5a50f31a0e89e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dag-Erling=20Sm=C3=B8rgrav?=
Date: Fri, 4 Apr 2025 11:59:08 +0200
Subject: [PATCH] Support octal escapes as well as hexadecimal.

* Factor out the hexadecimal escape parser and rewrite it to support
  sequences of arbitrary length as long as they don't overflow.
* Replicate the same logic for octal escapes using a `\o` prefix.
* Update the unit tests and documentation accordingly.
---
 doc/tre-syntax.html |  20 ++++---
 lib/tre-parse.c     | 140 ++++++++++++++++++++++++++++----------------
 tests/retest.c      |  35 ++++++++++-
 3 files changed, 136 insertions(+), 59 deletions(-)

diff --git a/doc/tre-syntax.html b/doc/tre-syntax.html
index 640bb17..d8334ac 100644
--- a/doc/tre-syntax.html
+++ b/doc/tre-syntax.html
@@ -337,7 +337,9 @@

Literals

 literal ::= ordinary-character
+        |   "\o" ["0"-"7" ]{0,3}
         |   "\x" ["1"-"9" "a"-"f" "A"-"F"]{0,2}
+        |   "\o{" ["0"-"7" ]* "}"
         |   "\x{" ["1"-"9" "a"-"f" "A"-"F"]* "}"
         |   "\" character
 
@@ -345,15 +347,15 @@

Literals

A literal is either an ordinary character (a character that has no
-other significance in the context), an 8 bit hexadecimal encoded
-character (e.g. \x1B), a wide hexadecimal encoded character
-(e.g. \x{263a}), or an escaped character. An escaped
-character is a \ followed by any character, and matches that
-character. Escaping can be used to match characters which have a
-special meaning in regexp syntax. A \ cannot be the last
-character of an ERE. Escaping also allows you to include a few
-non-printable characters in the regular expression. These special
-escape sequences include:
+other significance in the context), an 8 bit octal or hexadecimal
+encoded character (e.g. \x1B, \o33), a wide octal
+or hexadecimal encoded character (e.g. \x{263a}, \o{23072}), or an
+escaped character. An escaped character is a \ followed by
+any character, and matches that character. Escaping can be used to
+match characters which have a special meaning in regexp syntax.
+A \ cannot be the last character of an ERE. Escaping also
+allows you to include a few non-printable characters in the regular
+expression. These special escape sequences include:

    diff --git a/lib/tre-parse.c b/lib/tre-parse.c
index 9c3106c..afb0aed 100644
--- a/lib/tre-parse.c
+++ b/lib/tre-parse.c
@@ -252,6 +252,86 @@ tre_ctype_t tre_ctype(const char *name)
 
 #define REST(re) (int)(ctx->re_end - (re)), (re)
 
+static reg_errcode_t
+tre_parse_octal(tre_parse_ctx_t *ctx, unsigned long *valp)
+{ /* Parse what follows "\o": bare digits (8 bit) or "{digits}" (wide). */
+  unsigned long val = 0;
+  unsigned long max = 0xff;     /* largest value of the unbraced form */
+  int wide = 0;
+
+  if (ctx->re < ctx->re_end && ctx->re[0] == CHAR_LBRACE)
+    {
+      wide = 1;
+      max = 0x7fffffff;         /* largest value of the braced form */
+      ctx->re++;
+    }
+
+  DPRINT(("parsing %s oct: '%.*" STRF "'\n", wide ? "wide" : "8bit",
+          REST(ctx->re - 1)));
+  while (ctx->re < ctx->re_end)
+    {
+      unsigned int digit;
+      if (ctx->re[0] >= L'0' && ctx->re[0] <= L'7')
+        digit = ctx->re[0] - L'0';
+      else
+        break;                  /* first non-digit ends the number */
+      ctx->re++;
+      if (val > max >> 3)       /* appending a digit would exceed max */
+        return REG_EBRACE;
+      val = val << 3 | digit;
+    }
+  if (wide)
+    {
+      if (!(ctx->re < ctx->re_end && ctx->re[0] == CHAR_RBRACE))
+        return REG_EBRACE;      /* braced form needs its closing brace */
+      ctx->re++;
+    }
+  *valp = val;                  /* no digits at all yields 0 */
+  return REG_OK;
+}
+
+static reg_errcode_t
+tre_parse_hexadecimal(tre_parse_ctx_t *ctx, unsigned long *valp)
+{ /* Parse what follows "\x": bare digits (8 bit) or "{digits}" (wide). */
+  unsigned long val = 0;
+  unsigned long max = 0xff;     /* largest value of the unbraced form */
+  int wide = 0;
+
+  if (ctx->re < ctx->re_end && ctx->re[0] == CHAR_LBRACE)
+    {
+      wide = 1;
+      max = 0x7fffffff;         /* largest value of the braced form */
+      ctx->re++;
+    }
+
+  DPRINT(("parsing %s hex: '%.*" STRF "'\n", wide ? "wide" : "8bit",
+          REST(ctx->re - 1)));
+  while (ctx->re < ctx->re_end)
+    {
+      unsigned int digit;
+      if (ctx->re[0] >= L'0' && ctx->re[0] <= L'9')
+        digit = ctx->re[0] - L'0';
+      else if (ctx->re[0] >= L'A' && ctx->re[0] <= L'F')
+        digit = ctx->re[0] - L'A' + 10;  /* 'A'..'F' stand for 10..15 */
+      else if (ctx->re[0] >= L'a' && ctx->re[0] <= L'f')
+        digit = ctx->re[0] - L'a' + 10;  /* 'a'..'f' stand for 10..15 */
+      else
+        break;                  /* first non-digit ends the number */
+      ctx->re++;
+      if (val > max >> 4)       /* appending a digit would exceed max */
+        return REG_EBRACE;
+      val = val << 4 | digit;
+    }
+  if (wide)
+    {
+      if (!(ctx->re < ctx->re_end && ctx->re[0] == CHAR_RBRACE))
+        return REG_EBRACE;      /* braced form needs its closing brace */
+      ctx->re++;
+    }
+  *valp = val;                  /* no digits at all yields 0 */
+  return REG_OK;
+}
+
 static reg_errcode_t
 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
                         tre_ctype_t neg_classes[], int *num_neg_classes,
@@ -954,6 +1034,7 @@ tre_parse(tre_parse_ctx_t *ctx)
   reg_errcode_t status = REG_OK;
   tre_stack_t *stack = ctx->stack;
   size_t bottom = tre_stack_num_items(stack);
+  unsigned long val;            /* value of a \o or \x escape */
   int depth = 0;
   int temporary_cflags = 0;
 
@@ -1463,57 +1544,18 @@ tre_parse(tre_parse_ctx_t *ctx)
                                                ASSERT_AT_EOW);
                   ctx->re++;
                   break;
+                case L'o':
+                  ctx->re++;    /* skip the 'o'; the helper reads the rest */
+                  if ((status = tre_parse_octal(ctx, &val)) != REG_OK)
+                    return status;
+                  result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
+                  break;
                 case L'x':
                   ctx->re++;
-                  if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
-                    {
-                      /* 8 bit hex char. */
-                      char tmp[3] = {0, 0, 0};
-                      long val;
-                      DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
-                              REST(ctx->re - 2)));
-
-                      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
-                        {
-                          tmp[0] = (char)ctx->re[0];
-                          ctx->re++;
-                        }
-                      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
-                        {
-                          tmp[1] = (char)ctx->re[0];
-                          ctx->re++;
-                        }
-                      val = strtol(tmp, NULL, 16);
-                      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
-                      break;
-                    }
-                  else if (ctx->re < ctx->re_end)
-                    {
-                      /* Wide char. */
-                      char tmp[9]; /* max 8 hex digits + terminator */
-                      long val;
-                      size_t i = 0;
-                      ctx->re++;
-                      while (ctx->re_end - ctx->re >= 0)
-                        {
-                          if (ctx->re[0] == CHAR_RBRACE)
-                            break;
-                          if (tre_isxdigit(ctx->re[0]) && i < sizeof(tmp) - 1)
-                            {
-                              tmp[i] = (char)ctx->re[0];
-                              i++;
-                              ctx->re++;
-                              continue;
-                            }
-                          return REG_EBRACE;
-                        }
-                      ctx->re++;
-                      tmp[i] = 0;
-                      val = strtol(tmp, NULL, 16);
-                      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
-                      break;
-                    }
-                  /*FALLTHROUGH*/
+                  if ((status = tre_parse_hexadecimal(ctx, &val)) != REG_OK)
+                    return status;
+                  result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
+                  break;
 
                 default:
                   if (tre_isdigit(*ctx->re))
diff --git a/tests/retest.c b/tests/retest.c
index f6af42e..7393975 100644
--- a/tests/retest.c
+++ b/tests/retest.c
@@ -1367,6 +1367,35 @@ main(int argc, char **argv)
   test_comp("\\t", REG_EXTENDED, 0);
   test_comp("\\e", REG_EXTENDED, 0);
 
+  /* Test the \o33 and \o{23072} extensions for specifying 8 bit and wide
+     characters in octal. */
+  test_comp("\\o101", REG_EXTENDED, 0);
+  test_exec("ABC", 0, REG_OK, 0, 1, END);
+  test_comp("\\o5", REG_EXTENDED, 0);
+  test_exec("\005", 0, REG_OK, 0, 1, END);
+  test_comp("\\o5r", REG_EXTENDED, 0);
+  test_exec("\005r", 0, REG_OK, 0, 2, END);
+  test_comp("\\o", REG_EXTENDED, 0);
+  test_nexec("\000", 1, 0, REG_OK, 0, 1, END);
+  test_comp("\\or", REG_EXTENDED, 0);
+  test_nexec("\000r", 2, 0, REG_OK, 0, 2, END);
+  test_comp("\\o{101}", REG_EXTENDED, 0);
+  test_exec("ABC", 0, REG_OK, 0, 1, END);
+  test_comp("\\o{5}", REG_EXTENDED, 0);
+  test_exec("\005", 0, REG_OK, 0, 1, END);
+  test_comp("\\o{5}r", REG_EXTENDED, 0);
+  test_exec("\005r", 0, REG_OK, 0, 2, END);
+  test_comp("\\o{}", REG_EXTENDED, 0);
+  test_nexec("\000", 1, 0, REG_OK, 0, 1, END);
+  test_comp("\\o{}r", REG_EXTENDED, 0);
+  test_nexec("\000r", 2, 0, REG_OK, 0, 2, END);
+  test_comp("\\o{00000000000}", REG_EXTENDED, 0);
+  test_comp("\\o{17777777777}", REG_EXTENDED, 0);
+  test_comp("\\o{20000000000}", REG_EXTENDED, REG_EBRACE);
+  test_comp("\\o{000000000000}", REG_EXTENDED, 0);
+  test_comp("\\o{017777777777}", REG_EXTENDED, 0);
+  test_comp("\\o{020000000000}", REG_EXTENDED, REG_EBRACE);
+
   /* Test the \x1B and \x{263a} extensions for specifying 8 bit and wide
      characters in hexadecimal. */
   test_comp("\\x41", REG_EXTENDED, 0);
@@ -1390,7 +1419,11 @@ main(int argc, char **argv)
   test_comp("\\x{}r", REG_EXTENDED, 0);
   test_nexec("\000r", 2, 0, REG_OK, 0, 2, END);
   test_comp("\\x{00000000}", REG_EXTENDED, 0);
-  test_comp("\\x{000000000}", REG_EXTENDED, REG_EBRACE);
+  test_comp("\\x{7fffffff}", REG_EXTENDED, 0);
+  test_comp("\\x{80000000}", REG_EXTENDED, REG_EBRACE);
+  test_comp("\\x{000000000}", REG_EXTENDED, 0);
+  test_comp("\\x{07fffffff}", REG_EXTENDED, 0);
+  test_comp("\\x{080000000}", REG_EXTENDED, REG_EBRACE);
 
   /* Tests for (?inrU-inrU) and (?inrU-inrU:) */
   test_comp("foo(?i)bar", REG_EXTENDED, 0);