From 803666337bb91bf2e149922dcdb5a50f31a0e89e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dag-Erling=20Sm=C3=B8rgrav?= <des@des.dev>
Date: Fri, 4 Apr 2025 11:59:08 +0200
Subject: [PATCH] Support octal escapes as well as hexadecimal.

* Factor out the hexadecimal escape parser and rewrite it to support
  sequences of arbitrary length as long as they don't overflow.
* Replicate the same logic for octal escapes using a `\o` prefix.
* Update the unit tests and documentation accordingly.
---
 doc/tre-syntax.html |  20 ++++---
 lib/tre-parse.c     | 140 ++++++++++++++++++++++++++++----------------
 tests/retest.c      |  35 ++++++++++-
 3 files changed, 136 insertions(+), 59 deletions(-)
diff --git a/doc/tre-syntax.html b/doc/tre-syntax.html
index 640bb17..d8334ac 100644
--- a/doc/tre-syntax.html
+++ b/doc/tre-syntax.html
@@ -337,7 +337,9 @@ <h3>Literals</h3>
 <tr><td>
 <pre>
 <i>literal</i> ::= <i>ordinary-character</i>
+        |   <b>"\o"</b> [<b>"0"</b>-<b>"7"</b> ]{0,3}
         |   <b>"\x"</b> [<b>"1"</b>-<b>"9"</b> <b>"a"-<b>"f"</b> <b>"A"</b>-<b>"F"</b>]{0,2}
+        |   <b>"\o{"</b> [<b>"0"</b>-<b>"7"</b> ]* <b>"}"</b>
         |   <b>"\x{"</b> [<b>"1"</b>-<b>"9"</b> <b>"a"-<b>"f"</b> <b>"A"</b>-<b>"F"</b>]* <b>"}"</b>
         |   <b>"\"</b> <i>character</i>
 </pre>
@@ -345,15 +347,15 @@ <h3>Literals</h3>
 </table>
 <p>
 A literal is either an ordinary character (a character that has no
-other significance in the context), an 8 bit hexadecimal encoded
-character (e.g. <tt>\x1B</tt>), a wide hexadecimal encoded character
-(e.g. <tt>\x{263a}</tt>), or an escaped character.  An escaped
-character is a <tt>\</tt> followed by any character, and matches that
-character.  Escaping can be used to match characters which have a
-special meaning in regexp syntax.  A <tt>\</tt> cannot be the last
-character of an ERE.  Escaping also allows you to include a few
-non-printable characters in the regular expression.  These special
-escape sequences include:
+other significance in the context), an 8 bit octal or hexadecimal
+encoded character (e.g. <tt>\x1B</tt>, <tt>\o33</tt>), a wide octal
+or hexadecimal encoded character (e.g. <tt>\x{263a}</tt>, <tt>\o{23072}</tt>), or an
+escaped character.  An escaped character is a <tt>\</tt> followed by
+any character, and matches that character.  Escaping can be used to
+match characters which have a special meaning in regexp syntax.
+A <tt>\</tt> cannot be the last character of an ERE.  Escaping also
+allows you to include a few non-printable characters in the regular
+expression.  These special escape sequences include:
 </p>
 
 <ul>
diff --git a/lib/tre-parse.c b/lib/tre-parse.c
index 9c3106c..afb0aed 100644
--- a/lib/tre-parse.c
+++ b/lib/tre-parse.c
@@ -252,6 +252,86 @@ tre_ctype_t tre_ctype(const char *name)
 
 #define REST(re) (int)(ctx->re_end - (re)), (re)
 
+/* Parse the digits after `\o`: 8 bit (at most 0xff), or wide `{...}` (at
+   most 0x7fffffff).  Stores the value, 0 if there are no digits, in *valp;
+   returns REG_EBRACE on overflow or when a wide escape lacks its `}`.  */
+static reg_errcode_t
+tre_parse_octal(tre_parse_ctx_t *ctx, unsigned long *valp)
+{
+  /* Check the bound first: `\o` may be the last thing in the regex.  */
+  int wide = ctx->re < ctx->re_end && ctx->re[0] == CHAR_LBRACE;
+  unsigned long max = wide ? 0x7fffffff : 0xff;
+  unsigned long val = 0;
+
+  if (wide)
+    ctx->re++;
+  DPRINT(("parsing %s oct: '%.*" STRF "'\n", wide ? "wide" : "8bit",
+	  REST(ctx->re - 1)));
+  while (ctx->re < ctx->re_end)
+    {
+      unsigned int digit;
+      if (ctx->re[0] >= L'0' && ctx->re[0] <= L'7')
+	digit = ctx->re[0] - L'0';
+      else
+	break;
+      ctx->re++;
+      /* Shifting in one more digit would push the value past max.  */
+      if (val > max >> 3)
+	return REG_EBRACE;
+      val = val << 3 | digit;
+    }
+  if (wide)
+    {
+      if (!(ctx->re < ctx->re_end && ctx->re[0] == CHAR_RBRACE))
+	return REG_EBRACE;
+      ctx->re++;
+    }
+  *valp = val;
+  return REG_OK;
+}
+
+/* Parse the digits after `\x`: 8 bit (at most 0xff), or wide `{...}` (at
+   most 0x7fffffff).  Stores the value, 0 if there are no digits, in *valp;
+   returns REG_EBRACE on overflow or when a wide escape lacks its `}`.  */
+static reg_errcode_t
+tre_parse_hexadecimal(tre_parse_ctx_t *ctx, unsigned long *valp)
+{
+  /* Check the bound first: `\x` may be the last thing in the regex.  */
+  int wide = ctx->re < ctx->re_end && ctx->re[0] == CHAR_LBRACE;
+  unsigned long max = wide ? 0x7fffffff : 0xff;
+  unsigned long val = 0;
+
+  if (wide)
+    ctx->re++;
+  DPRINT(("parsing %s hex: '%.*" STRF "'\n", wide ? "wide" : "8bit",
+	  REST(ctx->re - 1)));
+  while (ctx->re < ctx->re_end)
+    {
+      unsigned int digit;
+      if (ctx->re[0] >= L'0' && ctx->re[0] <= L'9')
+	digit = ctx->re[0] - L'0';
+      else if (ctx->re[0] >= L'A' && ctx->re[0] <= L'F')
+	digit = ctx->re[0] - L'A' + 10;
+      else if (ctx->re[0] >= L'a' && ctx->re[0] <= L'f')
+	digit = ctx->re[0] - L'a' + 10;
+      else
+	break;
+      ctx->re++;
+      /* Shifting in one more digit would push the value past max.  */
+      if (val > max >> 4)
+	return REG_EBRACE;
+      val = val << 4 | digit;
+    }
+  if (wide)
+    {
+      if (!(ctx->re < ctx->re_end && ctx->re[0] == CHAR_RBRACE))
+	return REG_EBRACE;
+      ctx->re++;
+    }
+  *valp = val;
+  return REG_OK;
+}
+
 static reg_errcode_t
 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
 			tre_ctype_t neg_classes[], int *num_neg_classes,
@@ -954,6 +1034,7 @@ tre_parse(tre_parse_ctx_t *ctx)
   reg_errcode_t status = REG_OK;
   tre_stack_t *stack = ctx->stack;
   size_t bottom = tre_stack_num_items(stack);
+  unsigned long val;
   int depth = 0;
   int temporary_cflags = 0;
 
@@ -1463,57 +1544,18 @@ tre_parse(tre_parse_ctx_t *ctx)
 					       ASSERT_AT_EOW);
 		  ctx->re++;
 		  break;
+		case L'o':
+		  ctx->re++;
+		  if ((status = tre_parse_octal(ctx, &val)) != REG_OK)
+		    return status;
+		  result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
+		  break;
 		case L'x':
 		  ctx->re++;
-		  if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
-		    {
-		      /* 8 bit hex char. */
-		      char tmp[3] = {0, 0, 0};
-		      long val;
-		      DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
-			      REST(ctx->re - 2)));
-
-		      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
-			{
-			  tmp[0] = (char)ctx->re[0];
-			  ctx->re++;
-			}
-		      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
-			{
-			  tmp[1] = (char)ctx->re[0];
-			  ctx->re++;
-			}
-		      val = strtol(tmp, NULL, 16);
-		      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
-		      break;
-		    }
-		  else if (ctx->re < ctx->re_end)
-		    {
-		      /* Wide char. */
-		      char tmp[9]; /* max 8 hex digits + terminator */
-		      long val;
-		      size_t i = 0;
-		      ctx->re++;
-		      while (ctx->re_end - ctx->re >= 0)
-			{
-			  if (ctx->re[0] == CHAR_RBRACE)
-			    break;
-			  if (tre_isxdigit(ctx->re[0]) && i < sizeof(tmp) - 1)
-			    {
-			      tmp[i] = (char)ctx->re[0];
-			      i++;
-			      ctx->re++;
-			      continue;
-			    }
-			  return REG_EBRACE;
-			}
-		      ctx->re++;
-		      tmp[i] = 0;
-		      val = strtol(tmp, NULL, 16);
-		      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
-		      break;
-		    }
-		  /*FALLTHROUGH*/
+		  if ((status = tre_parse_hexadecimal(ctx, &val)) != REG_OK)
+		    return status;
+		  result = tre_ast_new_literal(ctx->mem, (int)val, (int)val);
+		  break;
 
 		default:
 		  if (tre_isdigit(*ctx->re))
diff --git a/tests/retest.c b/tests/retest.c
index f6af42e..7393975 100644
--- a/tests/retest.c
+++ b/tests/retest.c
@@ -1367,6 +1367,35 @@ main(int argc, char **argv)
   test_comp("\\t", REG_EXTENDED, 0);
   test_comp("\\e", REG_EXTENDED, 0);
 
+  /* Test the \o33 and \o{23072} extensions for specifying 8 bit and wide
+     characters in octal. */
+  test_comp("\\o101", REG_EXTENDED, 0);
+  test_exec("ABC", 0, REG_OK, 0, 1, END);
+  test_comp("\\o5", REG_EXTENDED, 0);
+  test_exec("\005", 0, REG_OK, 0, 1, END);
+  test_comp("\\o5r", REG_EXTENDED, 0);
+  test_exec("\005r", 0, REG_OK, 0, 2, END);
+  test_comp("\\o", REG_EXTENDED, 0);
+  test_nexec("\000", 1, 0, REG_OK, 0, 1, END);
+  test_comp("\\or", REG_EXTENDED, 0);
+  test_nexec("\000r", 2, 0, REG_OK, 0, 2, END);
+  test_comp("\\o{101}", REG_EXTENDED, 0);
+  test_exec("ABC", 0, REG_OK, 0, 1, END);
+  test_comp("\\o{5}", REG_EXTENDED, 0);
+  test_exec("\005", 0, REG_OK, 0, 1, END);
+  test_comp("\\o{5}r", REG_EXTENDED, 0);
+  test_exec("\005r", 0, REG_OK, 0, 2, END);
+  test_comp("\\o{}", REG_EXTENDED, 0);
+  test_nexec("\000", 1, 0, REG_OK, 0, 1, END);
+  test_comp("\\o{}r", REG_EXTENDED, 0);
+  test_nexec("\000r", 2, 0, REG_OK, 0, 2, END);
+  test_comp("\\o{00000000000}", REG_EXTENDED, 0);
+  test_comp("\\o{17777777777}", REG_EXTENDED, 0);
+  test_comp("\\o{20000000000}", REG_EXTENDED, REG_EBRACE);
+  test_comp("\\o{000000000000}", REG_EXTENDED, 0);
+  test_comp("\\o{017777777777}", REG_EXTENDED, 0);
+  test_comp("\\o{020000000000}", REG_EXTENDED, REG_EBRACE);
+
   /* Test the \x1B and \x{263a} extensions for specifying 8 bit and wide
      characters in hexadecimal. */
   test_comp("\\x41", REG_EXTENDED, 0);
@@ -1390,7 +1419,11 @@ main(int argc, char **argv)
   test_comp("\\x{}r", REG_EXTENDED, 0);
   test_nexec("\000r", 2, 0, REG_OK, 0, 2, END);
   test_comp("\\x{00000000}", REG_EXTENDED, 0);
-  test_comp("\\x{000000000}", REG_EXTENDED, REG_EBRACE);
+  test_comp("\\x{7fffffff}", REG_EXTENDED, 0);
+  test_comp("\\x{80000000}", REG_EXTENDED, REG_EBRACE); /* max + 1 */
+  test_comp("\\x{000000000}", REG_EXTENDED, 0);
+  test_comp("\\x{07fffffff}", REG_EXTENDED, 0);
+  test_comp("\\x{080000000}", REG_EXTENDED, REG_EBRACE);
 
   /* Tests for (?inrU-inrU) and (?inrU-inrU:) */
   test_comp("foo(?i)bar", REG_EXTENDED, 0);