Skip to content

Commit 2fef163

Browse files
committed
Update [[:xdigit:]] in UCP mode to match Perl
1 parent 1b26eae commit 2fef163

File tree

11 files changed

+876
-830
lines changed

11 files changed

+876
-830
lines changed

ChangeLog

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,14 @@ extension, so PCRE2 follows ECMAScript rather than Perl.
123123
pcre2_match() was not fully resetting all captures that had been set within a
124124
(possibly recursive) subroutine call such as (?3).
125125

126+
32. Changed the meaning of \w (and its synonyms) in UCP mode to match Perl. It
127+
now matches characters whose general categories are L or N or whose particular
128+
categories are Mn (non-spacing mark) or Pc (connector punctuation). The latter
129+
includes underscore.
130+
131+
33. Changed the meaning of [:xdigit:] in UCP mode to match Perl. It now also
132+
matches the "fullwidth" versions of the hex digits.
133+
126134

127135
Version 10.42 11-December-2022
128136
------------------------------
@@ -146,11 +154,6 @@ maximum of 65535 is now silently applied.
146154

147155
5. Merged @carenas patch #175 which fixes #86 - segfault on aarch64 (ARM),
148156

149-
6. Changed the meaning of \w (and its synonyms) in UCP mode to match Perl. It
150-
now matches characters whose general categories are L or N or whose particular
151-
categories are Mn (non-spacing mark) or Pc (combining puntuation). The latter
152-
includes underscore.
153-
154157

155158
Version 10.41 06-December-2022
156159
------------------------------

doc/html/pcre2pattern.html

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1552,7 +1552,7 @@ <h1>pcre2pattern man page</h1>
15521552
[:upper:] becomes \p{Lu}
15531553
[:word:] becomes \p{Xwd}
15541554
</pre>
1555-
Negated versions, such as [:^alpha:] use \P instead of \p. Three other POSIX
1555+
Negated versions, such as [:^alpha:] use \P instead of \p. Four other POSIX
15561556
classes are handled specially in UCP mode:
15571557
</P>
15581558
<P>
@@ -1579,6 +1579,12 @@ <h1>pcre2pattern man page</h1>
15791579
property.
15801580
</P>
15811581
<P>
1582+
[:xdigit:]
1583+
In addition to the ASCII hexadecimal digits, this also matches the "fullwidth"
1584+
versions of those characters, whose Unicode code points start at U+FF10. This
1585+
is a change that was made in PCRE2 release 10.43 for Perl compatibility.
1586+
</P>
1587+
<P>
15821588
The other POSIX classes are unchanged by PCRE2_UCP, and match only characters
15831589
with code points less than 256. The effect of PCRE2_UCP on POSIX classes can be
15841590
negated by setting the PCRE2_EXTRA_ASCII_POSIX option, either when calling

doc/pcre2.txt

Lines changed: 820 additions & 814 deletions
Large diffs are not rendered by default.

doc/pcre2pattern.3

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1546,7 +1546,7 @@ follows:
15461546
[:upper:] becomes \ep{Lu}
15471547
[:word:] becomes \ep{Xwd}
15481548
.sp
1549-
Negated versions, such as [:^alpha:] use \eP instead of \ep. Three other POSIX
1549+
Negated versions, such as [:^alpha:] use \eP instead of \ep. Four other POSIX
15501550
classes are handled specially in UCP mode:
15511551
.TP 10
15521552
[:graph:]
@@ -1567,6 +1567,11 @@ not controls, that is, characters with the Zs property.
15671567
This matches all characters that have the Unicode P (punctuation) property,
15681568
plus those characters with code points less than 256 that have the S (Symbol)
15691569
property.
1570+
.TP 10
1571+
[:xdigit:]
1572+
In addition to the ASCII hexadecimal digits, this also matches the "fullwidth"
1573+
versions of those characters, whose Unicode code points start at U+FF10. This
1574+
is a change that was made in PCRE2 release 10.43 for Perl compatibility.
15701575
.P
15711576
The other POSIX classes are unchanged by PCRE2_UCP, and match only characters
15721577
with code points less than 256. The effect of PCRE2_UCP on POSIX classes can be

src/pcre2_compile.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -706,9 +706,10 @@ static const char posix_names[] =
706706
static const uint8_t posix_name_lengths[] = {
707707
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
708708

709-
#define PC_GRAPH 8
710-
#define PC_PRINT 9
711-
#define PC_PUNCT 10
709+
#define PC_GRAPH 8
710+
#define PC_PRINT 9
711+
#define PC_PUNCT 10
712+
#define PC_XDIGIT 13
712713

713714
/* Table of class bit maps for each POSIX class. Each class is formed from a
714715
base map, with an optional addition or removal of another map. Then, for some
@@ -756,7 +757,7 @@ static int posix_substitutes[] = {
756757
PT_PXPUNCT, 0, /* punct */
757758
PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */
758759
PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */
759-
-1, 0 /* xdigit, treat as non-UCP */
760+
PT_PXXDIGIT, 0 /* xdigit */ /* Perl has additional hex digits */
760761
};
761762
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
762763
#endif /* SUPPORT_UNICODE */
@@ -6027,7 +6028,8 @@ for (;; pptr++)
60276028
*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
60286029
*class_uchardata++ = (PCRE2_UCHAR)
60296030
((posix_class == PC_GRAPH)? PT_PXGRAPH :
6030-
(posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
6031+
(posix_class == PC_PRINT)? PT_PXPRINT :
6032+
(posix_class == PC_XDIGIT)? PT_PXXDIGIT : PT_PXPUNCT);
60316033
*class_uchardata++ = 0;
60326034
xclass_has_prop = TRUE;
60336035
goto CONTINUE_CLASS;

src/pcre2_internal.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1299,7 +1299,7 @@ match. */
12991299
#define PT_ALNUM 6 /* Alphanumeric - the union of L and N */
13001300
#define PT_SPACE 7 /* Perl space - general category Z plus 9,10,12,13 */
13011301
#define PT_PXSPACE 8 /* POSIX space - Z plus 9,10,11,12,13 */
1302-
#define PT_WORD 9 /* Word - L plus N plus underscore */
1302+
#define PT_WORD 9 /* Word - L, N, Mn, or Pc */
13031303
#define PT_CLIST 10 /* Pseudo-property: match character list */
13041304
#define PT_UCNC 11 /* Universal Character nameable character */
13051305
#define PT_BIDICL 12 /* Specified bidi class */
@@ -1315,6 +1315,7 @@ table. */
13151315
#define PT_PXGRAPH 14 /* [:graph:] - characters that mark the paper */
13161316
#define PT_PXPRINT 15 /* [:print:] - [:graph:] plus non-control spaces */
13171317
#define PT_PXPUNCT 16 /* [:punct:] - punctuation characters */
1318+
#define PT_PXXDIGIT 17 /* [:xdigit:] - hex digits */
13181319

13191320
/* This value is used when parsing \p and \P escapes to indicate that neither
13201321
\p{script:...} nor \p{scx:...} has been encountered. */

src/pcre2_printint.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,10 @@ for(;;)
778778
fprintf(f, "[:%spunct:]", notch);
779779
break;
780780

781+
case PT_PXXDIGIT:
782+
fprintf(f, "[:%sxdigit:]", notch);
783+
break;
784+
781785
default:
782786
s = get_ucpname(ptype, pvalue);
783787
fprintf(f, "\\%c{%c%s}", ((notch[0] == '^')? 'P':'p'),

src/pcre2_xclass.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,18 @@ while ((t = *data++) != XCL_END)
276276
return !negated;
277277
break;
278278

279+
/* Perl has two sets of hex digits */
280+
281+
case PT_PXXDIGIT:
282+
if (((c >= CHAR_0 && c <= CHAR_9) ||
283+
(c >= CHAR_A && c <= CHAR_F) ||
284+
(c >= CHAR_a && c <= CHAR_f) ||
285+
(c >= 0xff10 && c <= 0xff19) || /* Fullwidth digits */
286+
(c >= 0xff21 && c <= 0xff26) || /* Fullwidth letters */
287+
(c >= 0xff41 && c <= 0xff46)) == isprop)
288+
return !negated;
289+
break;
290+
279291
/* This should never occur, but compilers may mutter if there is no
280292
default. */
281293

testdata/testinput4

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1497,7 +1497,7 @@
14971497
Az_\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}1\x{660}\x{bef}\x{16ee}
14981498

14991499
/^[[:xdigit:]]*/utf,ucp
1500-
1a\x{660}\x{bef}\x{16ee}
1500+
1a\x{660}\x{bef}\x{16ee}\=no_jit
15011501

15021502
/^\d+/utf,ucp
15031503
1\x{660}\x{bef}\x{16ee}
@@ -2853,4 +2853,7 @@
28532853
/[[:word:]]+/utf,ucp
28542854
--cafe\x{300}_au\x{203f}lait!\=no_jit
28552855

2856+
/[[:xdigit:]]+/utf,ucp
2857+
--123ef\x{ff10}\x{ff19}\x{ff21}\x{ff26}\x{ff1a}\=no_jit
2858+
28562859
# End of testinput4

testdata/testoutput4

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2450,7 +2450,7 @@ No match
24502450
0: Az_\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}1\x{660}\x{bef}\x{16ee}
24512451

24522452
/^[[:xdigit:]]*/utf,ucp
2453-
1a\x{660}\x{bef}\x{16ee}
2453+
1a\x{660}\x{bef}\x{16ee}\=no_jit
24542454
0: 1a
24552455

24562456
/^\d+/utf,ucp
@@ -4563,4 +4563,8 @@ No match
45634563
--cafe\x{300}_au\x{203f}lait!\=no_jit
45644564
0: cafe\x{300}_au\x{203f}lait
45654565

4566+
/[[:xdigit:]]+/utf,ucp
4567+
--123ef\x{ff10}\x{ff19}\x{ff21}\x{ff26}\x{ff1a}\=no_jit
4568+
0: 123ef\x{ff10}\x{ff19}\x{ff21}\x{ff26}
4569+
45664570
# End of testinput4

0 commit comments

Comments
 (0)