Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions doc/tre-syntax.html
Original file line number Diff line number Diff line change
Expand Up @@ -337,23 +337,25 @@ <h3>Literals</h3>
<tr><td>
<pre>
<i>literal</i> ::= <i>ordinary-character</i>
| <b>"\o"</b> [<b>"0"</b>-<b>"7"</b> ]{0,3}
| <b>"\x"</b> [<b>"0"</b>-<b>"9"</b> <b>"a"</b>-<b>"f"</b> <b>"A"</b>-<b>"F"</b>]{0,2}
| <b>"\o{"</b> [<b>"0"</b>-<b>"7"</b> ]* <b>"}"</b>
| <b>"\x{"</b> [<b>"0"</b>-<b>"9"</b> <b>"a"</b>-<b>"f"</b> <b>"A"</b>-<b>"F"</b>]* <b>"}"</b>
| <b>"\"</b> <i>character</i>
</pre>
</td></tr>
</table>
<p>
A literal is either an ordinary character (a character that has no
other significance in the context), an 8 bit hexadecimal encoded
character (e.g. <tt>\x1B</tt>), a wide hexadecimal encoded character
(e.g. <tt>\x{263a}</tt>), or an escaped character. An escaped
character is a <tt>\</tt> followed by any character, and matches that
character. Escaping can be used to match characters which have a
special meaning in regexp syntax. A <tt>\</tt> cannot be the last
character of an ERE. Escaping also allows you to include a few
non-printable characters in the regular expression. These special
escape sequences include:
other significance in the context), an 8 bit octal or hexadecimal
encoded character (e.g. <tt>\x1B</tt>, <tt>\o33</tt>), a wide octal
or hexadecimal encoded character (e.g. <tt>\x{263a}</tt>, <tt>\o{23072}</tt>), or an
escaped character. An escaped character is a <tt>\</tt> followed by
any character, and matches that character. Escaping can be used to
match characters which have a special meaning in regexp syntax.
A <tt>\</tt> cannot be the last character of an ERE. Escaping also
allows you to include a few non-printable characters in the regular
expression. These special escape sequences include:
</p>

<ul>
Expand Down
140 changes: 91 additions & 49 deletions lib/tre-parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,86 @@ tre_ctype_t tre_ctype(const char *name)

#define REST(re) (int)(ctx->re_end - (re)), (re)

/*
  Parses the digits of an octal character escape.  On entry `ctx->re'
  points just past the "\o" introducer.  Two forms are accepted:

    \oNNN      8 bit form: at most three octal digits, value <= 0xff
    \o{N...}   wide form: any number of octal digits, value <= 0x7fffffff

  An escape with no digits at all yields the value zero.  On success the
  value is stored in `*valp', `ctx->re' is left pointing just past the
  escape, and REG_OK is returned.  REG_EBRACE is returned if the value is
  out of range or if the closing brace of the wide form is missing; in
  that case `*valp' is not modified.
*/
static reg_errcode_t
tre_parse_octal(tre_parse_ctx_t *ctx, unsigned long *valp)
{
  unsigned long val = 0;
  unsigned long max = 0xff;
  /* The 8 bit form is documented as "\o" [0-7]{0,3}. */
  const int max_digits = 3;
  int num_digits = 0;
  int wide = 0;

  /* Check the bound first: the escape may be the last thing in the
     regexp, in which case `ctx->re' already equals `ctx->re_end' and
     must not be dereferenced. */
  if (ctx->re < ctx->re_end && ctx->re[0] == CHAR_LBRACE)
    {
      wide = 1;
      max = 0x7fffffff;
      ctx->re++;
    }

  DPRINT(("parsing %s oct: '%.*" STRF "'\n", wide ? "wide" : "8bit",
	  REST(ctx->re - 1)));
  /* Only the braced form takes an unbounded run of digits.  In the 8 bit
     form any digit after the third is left in place so that it is parsed
     as an ordinary literal following the escape. */
  while (ctx->re < ctx->re_end && (wide || num_digits < max_digits))
    {
      unsigned int digit;
      if (ctx->re[0] >= L'0' && ctx->re[0] <= L'7')
	digit = ctx->re[0] - L'0';
      else
	break;
      ctx->re++;
      num_digits++;
      /* Reject before shifting if one more digit would exceed `max'. */
      if (val > max >> 3)
	return REG_EBRACE;
      val = val << 3 | digit;
    }
  if (wide)
    {
      /* The wide form must be terminated by a closing brace. */
      if (!(ctx->re < ctx->re_end && ctx->re[0] == CHAR_RBRACE))
	return REG_EBRACE;
      ctx->re++;
    }
  *valp = val;
  return REG_OK;
}

/*
  Parses the digits of a hexadecimal character escape.  On entry
  `ctx->re' points just past the "\x" introducer.  Two forms are accepted:

    \xNN       8 bit form: at most two hex digits, value <= 0xff
    \x{N...}   wide form: any number of hex digits, value <= 0x7fffffff

  Hex digits may be upper or lower case.  An escape with no digits at all
  yields the value zero.  On success the value is stored in `*valp',
  `ctx->re' is left pointing just past the escape, and REG_OK is returned.
  REG_EBRACE is returned if the value is out of range or if the closing
  brace of the wide form is missing; in that case `*valp' is not modified.
*/
static reg_errcode_t
tre_parse_hexadecimal(tre_parse_ctx_t *ctx, unsigned long *valp)
{
  unsigned long val = 0;
  unsigned long max = 0xff;
  /* The 8 bit form is documented as "\x" [0-9a-fA-F]{0,2}. */
  const int max_digits = 2;
  int num_digits = 0;
  int wide = 0;

  /* Check the bound first: the escape may be the last thing in the
     regexp, in which case `ctx->re' already equals `ctx->re_end' and
     must not be dereferenced. */
  if (ctx->re < ctx->re_end && ctx->re[0] == CHAR_LBRACE)
    {
      wide = 1;
      max = 0x7fffffff;
      ctx->re++;
    }

  DPRINT(("parsing %s hex: '%.*" STRF "'\n", wide ? "wide" : "8bit",
	  REST(ctx->re - 1)));
  /* Only the braced form takes an unbounded run of digits.  In the 8 bit
     form any digit after the second is left in place so that it is parsed
     as an ordinary literal following the escape (e.g. "\x414" is 'A'
     followed by '4'). */
  while (ctx->re < ctx->re_end && (wide || num_digits < max_digits))
    {
      unsigned int digit;
      if (ctx->re[0] >= L'0' && ctx->re[0] <= L'9')
	digit = ctx->re[0] - L'0';
      else if (ctx->re[0] >= L'A' && ctx->re[0] <= L'F')
	digit = ctx->re[0] - L'A' + 10;	/* 'A'..'F' stand for 10..15. */
      else if (ctx->re[0] >= L'a' && ctx->re[0] <= L'f')
	digit = ctx->re[0] - L'a' + 10;	/* 'a'..'f' stand for 10..15. */
      else
	break;
      ctx->re++;
      num_digits++;
      /* Reject before shifting if one more digit would exceed `max'. */
      if (val > max >> 4)
	return REG_EBRACE;
      val = val << 4 | digit;
    }
  if (wide)
    {
      /* The wide form must be terminated by a closing brace. */
      if (!(ctx->re < ctx->re_end && ctx->re[0] == CHAR_RBRACE))
	return REG_EBRACE;
      ctx->re++;
    }
  *valp = val;
  return REG_OK;
}

static reg_errcode_t
tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
tre_ctype_t neg_classes[], int *num_neg_classes,
Expand Down Expand Up @@ -954,6 +1034,7 @@ tre_parse(tre_parse_ctx_t *ctx)
reg_errcode_t status = REG_OK;
tre_stack_t *stack = ctx->stack;
size_t bottom = tre_stack_num_items(stack);
unsigned long val;
int depth = 0;
int temporary_cflags = 0;

Expand Down Expand Up @@ -1463,57 +1544,18 @@ tre_parse(tre_parse_ctx_t *ctx)
ASSERT_AT_EOW);
ctx->re++;
break;
case L'o':
ctx->re++;
if ((status = tre_parse_octal(ctx, &val)) != REG_OK)
return status;
result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
break;
case L'x':
ctx->re++;
if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
{
/* 8 bit hex char. */
char tmp[3] = {0, 0, 0};
long val;
DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
REST(ctx->re - 2)));

if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
{
tmp[0] = (char)ctx->re[0];
ctx->re++;
}
if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
{
tmp[1] = (char)ctx->re[0];
ctx->re++;
}
val = strtol(tmp, NULL, 16);
result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
break;
}
else if (ctx->re < ctx->re_end)
{
/* Wide char. */
char tmp[9]; /* max 8 hex digits + terminator */
long val;
size_t i = 0;
ctx->re++;
while (ctx->re_end - ctx->re >= 0)
{
if (ctx->re[0] == CHAR_RBRACE)
break;
if (tre_isxdigit(ctx->re[0]) && i < sizeof(tmp) - 1)
{
tmp[i] = (char)ctx->re[0];
i++;
ctx->re++;
continue;
}
return REG_EBRACE;
}
ctx->re++;
tmp[i] = 0;
val = strtol(tmp, NULL, 16);
result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
break;
}
/*FALLTHROUGH*/
if ((status = tre_parse_hexadecimal(ctx, &val)) != REG_OK)
return status;
result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
break;

default:
if (tre_isdigit(*ctx->re))
Expand Down
35 changes: 34 additions & 1 deletion tests/retest.c
Original file line number Diff line number Diff line change
Expand Up @@ -1367,6 +1367,35 @@ main(int argc, char **argv)
test_comp("\\t", REG_EXTENDED, 0);
test_comp("\\e", REG_EXTENDED, 0);

/* Test the \o33 and \o{23072} extensions for specifying 8 bit and wide
characters in octal. */
test_comp("\\o101", REG_EXTENDED, 0);
test_exec("ABC", 0, REG_OK, 0, 1, END);
test_comp("\\o5", REG_EXTENDED, 0);
test_exec("\005", 0, REG_OK, 0, 1, END);
test_comp("\\o5r", REG_EXTENDED, 0);
test_exec("\005r", 0, REG_OK, 0, 2, END);
test_comp("\\o", REG_EXTENDED, 0);
test_nexec("\000", 1, 0, REG_OK, 0, 1, END);
test_comp("\\or", REG_EXTENDED, 0);
test_nexec("\000r", 2, 0, REG_OK, 0, 2, END);
test_comp("\\o{101}", REG_EXTENDED, 0);
test_exec("ABC", 0, REG_OK, 0, 1, END);
test_comp("\\o{5}", REG_EXTENDED, 0);
test_exec("\005", 0, REG_OK, 0, 1, END);
test_comp("\\o{5}r", REG_EXTENDED, 0);
test_exec("\005r", 0, REG_OK, 0, 2, END);
test_comp("\\o{}", REG_EXTENDED, 0);
test_nexec("\000", 1, 0, REG_OK, 0, 1, END);
test_comp("\\o{}r", REG_EXTENDED, 0);
test_nexec("\000r", 2, 0, REG_OK, 0, 2, END);
test_comp("\\o{00000000000}", REG_EXTENDED, 0);
test_comp("\\o{17777777777}", REG_EXTENDED, 0);
test_comp("\\o{20000000000}", REG_EXTENDED, REG_EBRACE);
test_comp("\\o{000000000000}", REG_EXTENDED, 0);
test_comp("\\o{017777777777}", REG_EXTENDED, 0);
test_comp("\\o{020000000000}", REG_EXTENDED, REG_EBRACE);

/* Test the \x1B and \x{263a} extensions for specifying 8 bit and wide
characters in hexadecimal. */
test_comp("\\x41", REG_EXTENDED, 0);
Expand All @@ -1390,7 +1419,11 @@ main(int argc, char **argv)
test_comp("\\x{}r", REG_EXTENDED, 0);
test_nexec("\000r", 2, 0, REG_OK, 0, 2, END);
test_comp("\\x{00000000}", REG_EXTENDED, 0);
test_comp("\\x{000000000}", REG_EXTENDED, REG_EBRACE);
test_comp("\\x{7fffffff}", REG_EXTENDED, 0);
test_comp("\\x{800000000}", REG_EXTENDED, REG_EBRACE);
test_comp("\\x{000000000}", REG_EXTENDED, 0);
test_comp("\\x{07fffffff}", REG_EXTENDED, 0);
test_comp("\\x{080000000}", REG_EXTENDED, REG_EBRACE);

/* Tests for (?inrU-inrU) and (?inrU-inrU:) */
test_comp("foo(?i)bar", REG_EXTENDED, 0);
Expand Down
Loading