Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

APItest/t/utf8_warn_base: Add tests #22646

Merged
merged 5 commits into from
Oct 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -5945,8 +5945,7 @@ RS |UV |check_locale_boundary_crossing \
|NN STRLEN *lenp
RTi |int |does_utf8_overflow \
|NN const U8 * const s \
|NN const U8 *e \
|const bool consider_overlongs
|NN const U8 *e
RTi |int |isFF_overlong |NN const U8 * const s \
|const STRLEN len
Ri |bool |is_utf8_common |NN const U8 * const p \
Expand Down
28 changes: 18 additions & 10 deletions ext/XS-APItest/t/utf8_warn_base.pl
Original file line number Diff line number Diff line change
Expand Up @@ -1190,7 +1190,12 @@ ($)
# We try various combinations of malformations that can occur
foreach my $short (0, 1) {
next if $skip_most_tests && $short;
foreach my $unexpected_noncont (0, 1) {
# Insert an unexpected non-continuation in every possible position
my $unexpected_noncont;
for ($unexpected_noncont = $length - $short - 1;
$unexpected_noncont > 0;
$unexpected_noncont--)
{
next if $skip_most_tests && $unexpected_noncont;
foreach my $overlong (0, 1) {
next if $overlong && $skip_most_tests;
Expand Down Expand Up @@ -1318,11 +1323,14 @@ ($)

if ($unexpected_noncont) {

# To force this malformation, change the final continuation
# byte into a start byte.
my $pos = ($short) ? -2 : -1;
substr($this_bytes, $pos, 1) = $known_start_byte;
$this_expected_len--;
# The overlong tweaking above changes the first bytes to
# specified values; we had better not overwrite those.
next if $overlong;

# To force this malformation, change a continuation byte into a
# start byte.
substr($this_bytes, $unexpected_noncont, 1) = $known_start_byte;
$this_expected_len = $unexpected_noncont;
}

# The whole point of a test that is malformed from the beginning
Expand Down Expand Up @@ -1551,9 +1559,9 @@ ($)
# Test partial character handling, for each byte not a
# full character
my $did_test_partial = 0;
for (my $j = 1; $j < $this_length - 1; $j++) {
for (my $byte_count = 1; $byte_count < $this_expected_len - 1; $byte_count++) {
$did_test_partial = 1;
my $partial = substr($this_bytes, 0, $j);
my $partial = substr($this_bytes, 0, $byte_count);
my $ret_should_be;
my $comment;
if ($disallow_type || $malformations_name) {
Expand Down Expand Up @@ -1582,7 +1590,7 @@ ($)
$needed_to_tell = $dl if $dl < $needed_to_tell;
}

if ($j < $needed_to_tell) {
if ($byte_count < $needed_to_tell) {
$ret_should_be = 1;
$comment .= ", but need $needed_to_tell"
. " bytes to discern:";
Expand All @@ -1596,7 +1604,7 @@ ($)
undef @warnings_gotten;

$ret = test_is_utf8_valid_partial_char_flags($partial,
$j, $disallow_flags);
$byte_count, $disallow_flags);
is($ret, $ret_should_be,
" And is_utf8_valid_partial_char_flags("
. display_bytes($partial)
Expand Down
2 changes: 1 addition & 1 deletion proto.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions t/op/utf8decode.t
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,9 @@ __DATA__
3.4 Concatenation of incomplete sequences
3.4.1 N15 - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
3.5 Impossible bytes (but not with Perl's extended UTF-8)
3.5.1 n - 1 fe - 1 byte available, need 7
3.5.2 n - 1 ff - 1 byte available, need 13
3.5.3 N7 - 4 fe:fe:ff:ff - byte 0xfe
3.5.1 N2,1 - 1 fe - 1 byte available, need 7
3.5.2 N2,1 - 1 ff - 1 byte available, need 13
3.5.3 N11,7 - 4 fe:fe:ff:ff - byte 0xfe
4 Overlong sequences
4.1 Examples of an overlong ASCII character
4.1.1 n - 2 c0:af - overlong
Expand Down
113 changes: 50 additions & 63 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -597,35 +597,34 @@ S_isFF_overlong(const U8 * const s, const STRLEN len)
#endif

PERL_STATIC_INLINE int
S_does_utf8_overflow(const U8 * const s,
const U8 * e,
const bool consider_overlongs)
S_does_utf8_overflow(const U8 * const s, const U8 * e)
{
PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;

/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
* 'e' - 1 would overflow an IV on this platform; that is if it represents
* a code point larger than the highest representable code point. It
* returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
* enough information to tell. This last return value can happen if the
* sequence is incomplete, missing some trailing bytes that would form a
* complete character. If there are enough bytes to make a definitive
* decision, this function does so.
*
* If 'consider_overlongs' is TRUE, the function checks for the possibility
* that the sequence is an overlong that doesn't overflow. Otherwise, it
* assumes the sequence is not an overlong. This can give different
* results only on ASCII 32-bit platforms.
*
* (For ASCII platforms, we could use memcmp() because we don't have to
* convert each byte to I8, but it's very rare input indeed that would
* approach overflow, so the loop below will likely only get executed once.)
*
*/
* a code point larger than the highest representable code point. The
* possible returns are: */
#define NO_OVERFLOW 0 /* Definitely doesn't overflow */

/* There aren't enough examinable bytes available to be sure. This can happen
* if the sequence is incomplete, missing some trailing bytes that would form a
* complete character. */
#define COULD_OVERFLOW 1

/* This overflows unless it is also overlong.  As with COULD_OVERFLOW, there
* aren't enough available bytes to be sure; but since overlongs are very
* rarely encountered, for most purposes consider it to overflow. */
#define ALMOST_CERTAINLY_OVERFLOWS 2

#define OVERFLOWS 3 /* Definitely overflows */

/* Note that the values are ordered so that you can use '>=' in checking
* the return value. */

const STRLEN len = e - s;
const U8 *x;
const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
int is_overlong = 0;

PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;

for (x = s; x < e; x++, y++) {

Expand All @@ -635,13 +634,13 @@ S_does_utf8_overflow(const U8 * const s,
* bytes larger than those omitted bytes, and therefore 'x' can't
* overflow */
if (*y == '\0') {
return 0;
return NO_OVERFLOW;
}

/* If this byte is less than the corresponding highest non-overflowing
* UTF-8, the sequence doesn't overflow */
if (NATIVE_UTF8_TO_I8(*x) < *y) {
return 0;
return NO_OVERFLOW;
}

if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
Expand All @@ -652,33 +651,25 @@ S_does_utf8_overflow(const U8 * const s,
/* Got to the end, and all bytes are the same. If the input is a whole
* character, it doesn't overflow. And if it is a partial character,
* there's not enough information to tell */
return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;

overflows_if_not_overlong:

/* Here, a well-formed sequence overflows. If we are assuming
* well-formedness, return that it overflows. */
if (! consider_overlongs) {
return 1;
}
return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? NO_OVERFLOW
: COULD_OVERFLOW;

/* Here, it could be the overlong malformation, and might not actually
* overflow if you were to calculate it out.
*
* See if it actually is overlong */
is_overlong = is_utf8_overlong(s, len);
overflows_if_not_overlong: ;

/* If it isn't overlong, is well-formed, so overflows */
if (is_overlong == 0) {
return 1;
/* Here, the sequence overflows if not overlong. Check for that */
int is_overlong = is_utf8_overlong(s, len);
if (LIKELY(is_overlong == 0)) {
return OVERFLOWS;
}

/* Not long enough to determine */
if (is_overlong < 0) {
return -1;
return ALMOST_CERTAINLY_OVERFLOWS;
}

/* Here, it appears to overflow, but it is also overlong */
/* Here, it appears to overflow, but it is also overlong. That overlong
* may evaluate to something that doesn't overflow; or it may evaluate to
* something that does. Figure it out */

#if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS

Expand All @@ -699,11 +690,12 @@ S_does_utf8_overflow(const U8 * const s,
*
* FE consists of 7 bytes total; the FE start byte contributes 0 bits of
* information (the high 7 bits, all ones, say that the sequence is 7 bytes
* long, and the bottom, zero, bit is s placeholder. That leaves the 6
* continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
If that number of bits doesn't exceed the word size, it can't overflow. */
* long, and the bottom, zero, bit is 0, so doesn't add anything).  That
* leaves the 6 continuation bytes to contribute
* UTF_CONTINUATION_BYTE_INFO_BITS each. If that number of bits doesn't
* exceed the word size, it can't overflow. */

return 0;
return NO_OVERFLOW;

#else

Expand All @@ -715,21 +707,23 @@ S_does_utf8_overflow(const U8 * const s,
*
* That means only the FF start byte can have an overflowing overlong. */
if (*s < 0xFF) {
return 0;
return NO_OVERFLOW;
}

/* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
* evaluates to 2**31, so overflows an IV. For a UV it's
* \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
# define OVERFLOWS "\xff\x80\x80\x80\x80\x80\x80\x82"
# define OVERFLOWS_MIN_STRING "\xff\x80\x80\x80\x80\x80\x80\x82"

if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) { /* Not enough info */
return -1;
if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS_MIN_STRING)) {
return ALMOST_CERTAINLY_OVERFLOWS; /* Not enough info to be sure */
}

# define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)

return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
return (strnGE((const char *) s, OVERFLOWS_MIN_STRING, STRLENs(OVERFLOWS_MIN_STRING)))
? OVERFLOWS
: NO_OVERFLOW;

#endif

Expand Down Expand Up @@ -895,9 +889,7 @@ Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
s++;
}

if (0 < does_utf8_overflow(s0, e,
FALSE /* Don't consider_overlongs */
)) {
if (does_utf8_overflow(s0, e) == OVERFLOWS) {
return 0;
}

Expand Down Expand Up @@ -1567,10 +1559,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,

/* Check for overflow. The algorithm requires us to not look past the end
* of the current character, even if partial, so the upper limit is 's' */
if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1 /* Do consider overlongs */
)))
{
if (UNLIKELY(does_utf8_overflow(s0, s) >= ALMOST_CERTAINLY_OVERFLOWS)) {
possible_problems |= UTF8_GOT_OVERFLOW;
uv = UNICODE_REPLACEMENT;
}
Expand Down Expand Up @@ -4124,9 +4113,7 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
if ( ckWARN_d(WARN_NON_UNICODE)
|| UNLIKELY(0 < does_utf8_overflow(s, s + len,
0 /* Don't consider overlongs */
)))
|| UNLIKELY(does_utf8_overflow(s, s + len) >= ALMOST_CERTAINLY_OVERFLOWS))
{
/* A side effect of this function will be to warn */
(void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
Expand Down
Loading