diff --git a/autodoc.pl b/autodoc.pl index e19d12a522e9..bac73619a167 100644 --- a/autodoc.pl +++ b/autodoc.pl @@ -1899,7 +1899,7 @@ ($fh, $section_name, $element_name, $docref) # Here, has a long name and we didn't create one just # above. Check that there really is a long name entry. my $real_proto = delete $protos{"Perl_$name"}; - if ($real_proto) { + if ($real_proto || $flags =~ /m/) { # Set up to redo the loop at the end. This iteration # adds the short form; the redo causes its long form diff --git a/embed.fnc b/embed.fnc index f490397f7de6..ae17ffcda520 100644 --- a/embed.fnc +++ b/embed.fnc @@ -14,7 +14,8 @@ : real (full) name, with any appropriate thread context paramaters, thus hiding : that detail from the typical code. : -: Most macros (as opposed to functions) listed here are the complete full name. +: Many macros (as opposed to functions) listed here are the complete full name, +: though we may want to start converting those to have full names. : : All non-static functions defined by perl need to be listed in this file. : embed.pl uses the entries here to construct: @@ -157,8 +158,9 @@ : and you know at a glance that the macro actually has documentation. It : doesn't by itself create any documentation; instead the other apidoc lines : pull in information specified by these lines. Many of the lines in this -: file for macros could be pulled out of here and replaced by these lines -: throughout the source. It is a goal to do that as convenience dictates. +: file for macros that don't also have the 'p' flag (described below) could be +: pulled out of here and replaced by these lines throughout the source. It is +: a goal to do that as convenience dictates. : : The other apidoc lines either have the usage data as part of the line, or : pull in the data from this file or apidoc_defn lines. @@ -210,10 +212,10 @@ : line that begins with an '='. In particular, an '=cut' line ends that : documentation without introducing something new. : -: Various macros and other elements aren't listed here in embed.fnc. They are -: documented in the same manner, but since they don't have this file to get -: information from, the defining lines have the syntax and meaning they do in -: this file, so it can be specified: +: Various macros and other elements aren't listed here in embed.fnc (though +: they could be). They are documented in the same manner, but since they don't +: have this file to get information from, the defining lines have the syntax +: and meaning they do in this file, so it can be specified: : : =for apidoc flags|return_type|name|arg1|arg2|...|argN : =for apidoc_item flags|return_type|name|arg1|arg2|...|argN @@ -301,22 +303,21 @@ : functions flagged with this, the installation can run Configure with : the -Accflags='-DNO_MATHOMS' parameter to not even compile them. : -: Sometimes the function has been subsumed by a more general one (say, -: by adding a flags parameter), and a macro exists with the original -: short name API, and it calls the new function, bypassing this one, and -: the original 'Perl_' form is being deprecated. In this case also -: specify the 'M' flag. +: If the function can be implemented as a macro (that evaluates its +: arguments exactly once), use the 'm' and 'p' flags together to implement +: this. (See the discussion under 'm'.) Another option for this is to +: use the 'M' flag. : -: Without the M flag, these functions should be deprecated, and it is an -: error to not also specify the 'D' flag. +: Without the m or M flags, these functions should be deprecated, and it +: is an error to not also specify the 'D' flag. : : The 'b' functions are normally moved to mathoms.c, but if : circumstances dictate otherwise, they can be anywhere, provided the : whole function is wrapped with : -: #ifndef NO_MATHOMS -: ... -: #endif +: #ifndef NO_MATHOMS +: ... +: #endif : : Note that this flag no longer automatically adds a 'Perl_' prefix to : the name. Additionally specify 'p' to do that. @@ -370,10 +371,10 @@ : then it is assumed to take a strftime-style format string as the 1st : arg; otherwise it's assumed to take a printf style format string, not : necessarily the 1st arg. All the arguments following the second form -: (including possibly '...') are assumed to be for the format. +: (including possibly '...') are assumed to be for the format. : : embed.h: any entry in here for the second form is suppressed because -: of varargs +: of varargs : proto.h: add __attribute__format__ (or ...null_ok__) : : 'F' Function has a '...' parameter, but don't assume it is a format. This @@ -396,7 +397,7 @@ : one NN argument. : : proto.h: PERL_ARGS_ASSERT macro is not defined unless the function -: has NN arguments +: has NN arguments : : 'h' Hide any documentation that would normally go into perlapi or : perlintern. This is typically used when the documentation is actually @@ -427,7 +428,7 @@ : particular C file(s) or in the perl core.) Therefore, all non-guarded : functions should also have the 'p' flag specified to avoid polluting : the XS code name space. Otherwise, this flag also turns on the 'S' -: flag. +: flag. : : proto.h: function is declared as PERL_STATIC_INLINE : @@ -439,23 +440,46 @@ : __attribute__always_inline__ is added : : 'm' Implemented as a macro; there is no function associated with this -: name, and hence no long Perl_ or S_ name. However, if the macro name -: itself begins with 'Perl_', autodoc.pl will show a thread context -: parameter unless the 'T' flag is specified. +: name. There is no long S_ name. +: +: However, you may #define the macro with a long name like 'Perl_foo', +: and specify the 'p' flag. This will cause an embed.h entry to be +: created that #defines 'foo' as 'Perl_foo'. This can be used to make +: any macro have a long name, perhaps to avoid name collisions. It is +: particularly useful tp preserve backward compatibility when a function +: is converted to be a macro. Most of mathoms.c could be converted to +: use this facility. When there is no thread context involved, you just +: do something like +: +: #define Perl_foo(a, b, c) Perl_bar(a, b, 0, c) +: +: Otherwise consider this general case where there is a series of macros +: that build on the previous ones by calling something with a different +: name or with an extra parameter beyond what the previous one did: +: +: #define Perl_foo(mTHX, a) Perl_bar1(aTHX, a) +: #define Perl_bar1(mTHX, a) Perl_bar2(aTHX, a, 0) +: #define Perl_bar2(mTHX, a, b) Perl_bar3(aTHX, a, b, 0) +: #define Perl_bar3(mTHX, a, b, c) Perl_func(aTHX_ a, b, c, 0) +: +: Use the formal parameter name 'mTHX,' (which stands for "macro thread +: context") as the first in each macro definition, and call the next +: macro in the sequence with 'aTHX,' (Note the commas). Eventually, the +: sequence will end with a function call (or else there would be no need +: for thread context). For that instead call it with 'aTHX_' (with an +: underscore instead of a comma). : : suppress proto.h entry (actually, not suppressed, but commented out) -: suppress entry in the list of exported symbols available on all platforms -: suppress embed.h entry, as the implementation should furnish the macro +: suppress entry in the list of exported symbols available on all +: platforms +: suppress embed.h entry (when no 'p' flag), as the implementation +: should furnish the macro : : 'M' The implementation is furnishing its own macro instead of relying on : the automatically generated short name macro (which simply expands to : call the real name function). One reason to do this is if the -: parameters need to be cast from what the caller has, or if there is a -: macro that bypasses this function (whose long name is being retained -: for backward compatibility for those who call it with that name). An -: example is when a new function is created with an extra parameter and -: a wrapper macro is added that has the old API, but calls the new one -: with the exta parameter set to a default. +: parameters need to be cast from what the caller has. There is less +: need to do this now that 'm' and 'p' together is supported. : : This flag requires the 'p' flag to be specified, as there would be no : need to do this if the function weren't publicly accessible before. @@ -489,10 +513,10 @@ : : 'o' Has no Perl_foo or S_foo compatibility macro: : -: This is used for whatever reason to force the function to be called -: with the long name. Perhaps there is a varargs issue. Use the 'M' -: flag instead for wrapper macros, and legacy-only functions should -: also use 'b'. +: This is used for whatever reason to force the function to be called +: with the long name. Perhaps there is a varargs issue. Use the 'M' +: or 'm' flags instead for wrapper macros, and legacy-only functions +: should also use 'b'. : : embed.h: suppress "#define foo Perl_foo" : @@ -517,9 +541,10 @@ : : proto.h: add __attribute__pure__ : -: 'p' Function in source code has a Perl_ prefix: +: 'p' Function or macro in source code has a Perl_ prefix: : -: proto.h: function is declared as Perl_foo rather than foo +: proto.h: function or macro is declared as Perl_foo rather than foo +: (though the entries for macros will be commented out) : embed.h: "#define foo Perl_foo" entries added : : 'R' Return value must not be ignored (also implied by 'a' and 'P' flags): @@ -543,8 +568,8 @@ : : 's' Static function, but function in source code has a Perl_ prefix: : -: This is used for functions that have always had a Perl_ prefix, but -: have been moved to a header file and declared static. +: This is used for functions that have always had a Perl_ prefix, but +: have been moved to a header file and declared static. : : proto.h: function is declared as Perl_foo rather than foo : STATIC is added to declaration; @@ -579,11 +604,11 @@ : compatibility issues. : : 'W' Add a comma_pDEPTH argument to function prototypes, and a comma_aDEPTH -: argument to the function calls. This means that under DEBUGGING a -: depth argument is added to the functions, which is used for example by -: the regex engine for debugging and trace output. A non DEBUGGING build -: will not pass the unused argument. Currently restricted to functions -: with at least one argument. +: argument to the function calls. This means that under DEBUGGING a +: depth argument is added to the functions, which is used for example by +: the regex engine for debugging and trace output. A non DEBUGGING build +: will not pass the unused argument. Currently restricted to functions +: with at least one argument. : : 'X' Explicitly exported: : @@ -762,14 +787,9 @@ Adp |int |bytes_cmp_utf8 |NN const U8 *b \ |STRLEN blen \ |NN const U8 *u \ |STRLEN ulen -AMdp |U8 * |bytes_from_utf8|NN const U8 *s \ +Adp |U8 * |bytes_from_utf8|NN const U8 *s \ |NN STRLEN *lenp \ |NN bool *is_utf8p -CTdp |U8 * |bytes_from_utf8_loc \ - |NN const U8 *s \ - |NN STRLEN *lenp \ - |NN bool *is_utf8p \ - |NULLOK const U8 **first_unconverted Adp |U8 * |bytes_to_utf8 |NN const U8 *s \ |NN STRLEN *lenp AOdp |SSize_t|call_argv |NN const char *sub_name \ @@ -3657,6 +3677,19 @@ CDbdp |UV |utf8n_to_uvuni |NN const U8 *s \ |U32 flags Adpx |U8 * |utf8_to_bytes |NN U8 *s \ |NN STRLEN *lenp +Cp |PL_utf8_to_bytes_ret|utf8_to_bytes_ \ + |NN U8 **s_ptr \ + |NN STRLEN *lenp \ + |Perl_utf8_to_bytes_arg result_as +Admp |PL_utf8_to_bytes_ret|utf8_to_bytes_new_pv \ + |NN U8 const **s_ptr \ + |NN STRLEN *lenp +Admp |PL_utf8_to_bytes_ret|utf8_to_bytes_overwrite \ + |NN U8 **s_ptr \ + |NN STRLEN *lenp +Admp |PL_utf8_to_bytes_ret|utf8_to_bytes_temp_pv \ + |NN U8 const **s_ptr \ + |NN STRLEN *lenp EMXp |U8 * |utf16_to_utf8 |NN U8 *p \ |NN U8 *d \ |Size_t bytelen \ diff --git a/embed.h b/embed.h index 74aaea51bcd5..0999ce468a30 100644 --- a/embed.h +++ b/embed.h @@ -156,7 +156,7 @@ # define block_gimme() Perl_block_gimme(aTHX) # define block_start(a) Perl_block_start(aTHX_ a) # define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d) -# define bytes_from_utf8_loc Perl_bytes_from_utf8_loc +# define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c) # define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) # define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c) # define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b) @@ -791,6 +791,10 @@ # define utf8_hop_safe Perl_utf8_hop_safe # define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b) # define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) +# define utf8_to_bytes_(a,b,c) Perl_utf8_to_bytes_(aTHX_ a,b,c) +# define utf8_to_bytes_new_pv(a,b) Perl_utf8_to_bytes_new_pv(aTHX,a,b) +# define utf8_to_bytes_overwrite(a,b) Perl_utf8_to_bytes_overwrite(aTHX,a,b) +# define utf8_to_bytes_temp_pv(a,b) Perl_utf8_to_bytes_temp_pv(aTHX,a,b) # define utf8_to_uvchr_buf_helper(a,b,c) Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c) # define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs # define uvoffuni_to_utf8_flags_msgs(a,b,c,d) Perl_uvoffuni_to_utf8_flags_msgs(aTHX_ a,b,c,d) diff --git a/mathoms.c b/mathoms.c index 27fa2969d1b1..b7c4760964f1 100644 --- a/mathoms.c +++ b/mathoms.c @@ -22,15 +22,15 @@ /* * This file contains mathoms, various binary artifacts from previous * versions of Perl which we cannot completely remove from the core - * code. There are two reasons functions should be here: + * code. There is only one reason these days for functions should be here: * * 1) A function has been replaced by a macro within a minor release, * so XS modules compiled against an older release will expect to * still be able to link against the function - * 2) A function Perl_foo(...) with #define foo Perl_foo(aTHX_ ...) - * has been replaced by a macro, e.g. #define foo(...) foo_flags(...,0) - * but XS code may still explicitly use the long form, i.e. - * Perl_foo(aTHX_ ...) + * + * It used to be that this was the way to handle the case were a function + * Perl_foo(...) had been replaced by a macro. But see the 'm' flag discussion + * in embed.fnc for a better way to handle this. * * This file can't just be cleaned out periodically, because that would break * builds with -DPERL_NO_SHORT_NAMES diff --git a/proto.h b/proto.h index cb2fa83c0dfd..ee045eab1fe5 100644 --- a/proto.h +++ b/proto.h @@ -408,11 +408,6 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p); #define PERL_ARGS_ASSERT_BYTES_FROM_UTF8 \ assert(s); assert(lenp); assert(is_utf8p) -PERL_CALLCONV U8 * -Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8 **first_unconverted); -#define PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC \ - assert(s); assert(lenp); assert(is_utf8p) - PERL_CALLCONV U8 * Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp); #define PERL_ARGS_ASSERT_BYTES_TO_UTF8 \ @@ -5137,6 +5132,20 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp); #define PERL_ARGS_ASSERT_UTF8_TO_BYTES \ assert(s); assert(lenp) +PERL_CALLCONV PL_utf8_to_bytes_ret +Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, Perl_utf8_to_bytes_arg result_as); +#define PERL_ARGS_ASSERT_UTF8_TO_BYTES_ \ + assert(s_ptr); assert(lenp) + +/* PERL_CALLCONV PL_utf8_to_bytes_ret +Perl_utf8_to_bytes_new_pv(pTHX_ U8 const **s_ptr, STRLEN *lenp); */ + +/* PERL_CALLCONV PL_utf8_to_bytes_ret +Perl_utf8_to_bytes_overwrite(pTHX_ U8 **s_ptr, STRLEN *lenp); */ + +/* PERL_CALLCONV PL_utf8_to_bytes_ret +Perl_utf8_to_bytes_temp_pv(pTHX_ U8 const **s_ptr, STRLEN *lenp); */ + PERL_CALLCONV U8 * Perl_utf8_to_utf16_base(pTHX_ U8 *s, U8 *d, Size_t bytelen, Size_t *newlen, const bool high, const bool low); #define PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE \ diff --git a/regen/embed.pl b/regen/embed.pl index 5fce877c6c16..6c4e5ee4e8c1 100755 --- a/regen/embed.pl +++ b/regen/embed.pl @@ -137,11 +137,14 @@ sub generate_proto_h { die_at_end "$plain_func: S and p flags are mutually exclusive" if $flags =~ /S/ && $flags =~ /p/; - die_at_end "$plain_func: m and $1 flags are mutually exclusive" - if $has_mflag && $flags =~ /([pS])/; - - die_at_end "$plain_func: u flag only usable with m" if $flags =~ /u/ - && ! $has_mflag; + if ($has_mflag) { + if ($flags =~ /S/) { + die_at_end "$plain_func: m and S flags are mutually exclusive"; + } + } + else { + die_at_end "$plain_func: u flag only usable with m" if $flags =~ /u/; + } my ($static_flag, @extra_static_flags)= $flags =~/([SsIi])/g; @@ -500,9 +503,9 @@ sub embed_h { my $ind= $level ? " " : ""; $ind .= " " x ($level-1) if $level>1; my $inner_ind= $ind ? " " : " "; - unless ($flags =~ /[omM]/) { + if ($flags !~ /[omM]/ or ($flags =~ /m/ && $flags =~ /p/)) { my $argc = scalar @$args; - if ($flags =~ /T/) { + if ($flags =~ /[T]/) { my $full_name = full_name($func, $flags); next if $full_name eq $func; # Don't output a no-op. $ret = indent_define($func, $full_name, $ind); @@ -525,8 +528,11 @@ sub embed_h { $use_va_list ? ("__VA_ARGS__") : ()); $ret = "#${ind}define $func($paramlist) "; add_indent($ret,full_name($func, $flags) . "(aTHX"); - $ret .= "_ " if $replacelist; - $ret .= $replacelist; + if ($replacelist) { + $ret .= ($flags =~ /m/) ? "," : "_ "; + $ret .= $replacelist; + } + if ($flags =~ /W/) { if ($replacelist) { $ret .= " comma_aDEPTH"; diff --git a/utf8.c b/utf8.c index 88f8299792fe..3d115b5730c8 100644 --- a/utf8.c +++ b/utf8.c @@ -2352,52 +2352,188 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) } /* -=for apidoc utf8_to_bytes +=for apidoc utf8_to_bytes_overwrite +=for apidoc_item utf8_to_bytes_new_pv +=for apidoc_item utf8_to_bytes_temp_pv +=for apidoc_item utf8_to_bytes +=for apidoc_item bytes_from_utf8 -Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding. -Unlike L, this over-writes the original string, and -updates C<*lenp> to contain the new length. -Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1. +These each convert a string encoded as UTF-8 into the equivalent native byte +representation, if possible. The first three forms are preferred; their API is +more convenient to use, and each return non-zero if the result is in bytes; +zero if the conversion failed. -Upon successful return, the number of variants in the string can be computed by -having saved the value of C<*lenp> before the call, and subtracting the -after-call value of C<*lenp> from it. +=over 4 + +=item C + +=item C + +=item C + +These differ only in the form of the returned string and the allowed constness +of the input string. In each, if the input string was already in native bytes +or was not convertible, the input isn't changed. + +C overwrites the input string with the bytes +conversion. Hence, the input string should not be C. (Converting the +multi-byte UTF-8 encoding to single bytes never expands the result, so +overwriting is always feasible.) + +Both C and C allocate new memory +to hold the converted string, never changing the input. Hence the input string +may be C. They differ in that C arranges for the +new memory to automatically be freed. With C, the caller +is responsible for freeing the memory. + +In each of these three functions, the input C is a pointer to the string +to be converted (so that the first byte will be at C<*sptr[0]>), and C<*lenp> +is its length. + +The return of each can be one of three values: + +=over 4 + +=item C (or its equivalent, 0 or C) + +This happens when the input is not well-formed UTF-8 or contains at least one +UTF-8 sequence that represents a code point that can't be expressed as a byte. +The contents of C<*s_ptr> and C<*lenp> are not changed. + +=item C + +The input turned out to already be in bytes form. The contents of C<*s_ptr> +and C<*lenp> are not changed. + +=item C + +The input was successfully translated to native bytes. The result will be +NUL-terminated even if the original wasn't. + +=over 4 + +=item For C, + +The input string C<*s_ptr> was overwritten with the native bytes, including a +NUL terminator. C<*lenp> has been updated with the new length. + +=item For C and C + +The input string was not changed. Instead, new memory has been allocated +containing the translation of the input into native bytes with a NUL terminator +byte. C<*s_ptr> now points to that new memory, and C<*lenp> contains its +length. + +For C, the new memory has been arranged to be +automatically freed, via a call to C>. + +For C, it is the caller's responsibility to free the new +memory when done using it. + +=back + +=back + +Except when calling C, most likely you can treat the +return as a boolean. With that function you will need to know whether or not +there is memory that has to be freed. + +Note that in all cases, C<*s_ptr> and C<*lenp> will have correct and consistent +values, updated as was necessary. + +Also note that when the return isn't C, the number of variants +in the string can be computed by having saved the value of C<*lenp> before the +call, and subtracting the after-call value of C<*lenp> from it. This is also +true for the other two functions described below. + +=item C + +Plain C also converts a UTF-8 encoded string to bytes, but there +are more glitches that the caller has to be prepared to handle. + +The input string is passed with one less indirection level, C. + +=over + +=item If the conversion was successful or a noop + +The function returns C (unchanged), but its contents were changed if this +was not a noop; C<*lenp> will be updated as necessary to be the correct length. + +=item If the conversion failed -If you need a copy of the string, see L. +The function returns NULL and sets C<*lenp> to -1, cast to C. +This means that you will have to use a temporary containing the string length +to pass to the function if you will need the value afterwards. + +The contents of C were not changed. + +=back + +=item C + +C also converts a potentially UTF-8 encoded string C to +bytes. It preserves C, allocating new memory for the converted string. + +In contrast to the other functions, the input string to this one need not +be UTF-8. If not, the caller has set C<*is_utf8p> to be C, and the +function does nothing, returning the original C. + +Also do nothing if there are code points in the string not expressible in +native byte encoding, returning the original C. + +Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a +newly created string containing the native byte equivalent of C, and whose +length is returned in C<*lenp>, updated. The new string is C-terminated. +The caller is responsible for arranging for the memory used by this string to +get freed. + +The major problem with this function is that memory is allocated and filled +even when the input string was already in bytes form. + +=back + +New code should use the first three functions listed above. =cut */ -U8 * -Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) +PL_utf8_to_bytes_ret +Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, + Perl_utf8_to_bytes_arg result_as) { - U8 * first_variant; + PERL_ARGS_ASSERT_UTF8_TO_BYTES_; - PERL_ARGS_ASSERT_UTF8_TO_BYTES; - PERL_UNUSED_CONTEXT; + U8 * first_variant; /* This is a no-op if no variants at all in the input */ - if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) { - return s; + if (is_utf8_invariant_string_loc(*s_ptr, *lenp, + (const U8 **) &first_variant)) + { + return PL_was_noop; } /* Nothing before 'first_variant' needs to be changed, so start the real * work there */ - U8 * const save = s; - U8 * const send = s + *lenp; - s = first_variant; + U8 * const s0 = *s_ptr; + const U8 * const send = s0 + *lenp; + U8 * s = first_variant; + Size_t invariant_length = first_variant - s0; + Size_t variant_count = 0; #ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */ - /* There is some start-up/tear-down overhead with this, so no real gain + /* Do a first pass through the string to see if it actually is translatable + * into bytes, and if so, how big the result is. On long strings this is + * done a word at a time, so is relatively quick. (There is some + * start-up/tear-down overhead with the per-word algorithm, so no real gain * unless the remaining portion of the string is long enough. The current - * value is just a guess. */ + * value is just a guess.) On EBCDIC, it's always per-byte. */ if ((send - s) > (ptrdiff_t) (5 * PERL_WORDSIZE)) { - /* First, go through the string a word at-a-time to verify that it is - * downgradable. If it contains any start byte besides C2 and C3, then - * it isn't. */ + /* If the string contains any start byte besides C2 and C3, then it + * isn't translatable into bytes */ const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0; const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2; @@ -2414,11 +2550,13 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) while (s < partial_word_end) { if (! UTF8_IS_INVARIANT(*s)) { if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { - *lenp = ((STRLEN) -1); - return NULL; + return PL_cant_convert; } + s++; + variant_count++; } + s++; } @@ -2465,10 +2603,15 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) * If they're not equal, there are start bytes that aren't C2 * nor C3, hence this is not downgradable */ if (start_bytes != C2_C3_start_bytes) { - *lenp = ((STRLEN) -1); - return NULL; + return PL_cant_convert; } + /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an + explanation of how this works */ + variant_count += + (Size_t) (((((start_bytes)) >> 7) * PERL_COUNT_MULTIPLIER) + >> ((PERL_WORDSIZE - 1) * CHARBITS)); + s += PERL_WORDSIZE; } while (s + PERL_WORDSIZE <= send); @@ -2477,262 +2620,174 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) * first byte of the character */ if (s > first_variant && UTF8_IS_START(*(s-1))) { s--; + variant_count--; } } #endif - - /* Do the straggler bytes beyond the final word boundary (or all bytes - * in the case of EBCDIC) */ + /* Do the straggler bytes beyond what the loop above did */ while (s < send) { if (! UTF8_IS_INVARIANT(*s)) { if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { - *lenp = ((STRLEN) -1); - return NULL; + return PL_cant_convert; } s++; + variant_count++; } s++; } - /* Here, we passed the tests above. For the EBCDIC case, everything - * was well-formed and can be downgraded to non-UTF8. For non-EBCDIC, - * it means only that all start bytes were C2 or C3, hence any - * well-formed sequences are downgradable. But we didn't test, for - * example, that there weren't two C2's in a row. That means that in - * the loop below, we have to be sure things are well-formed. Because - * this is very very likely, and we don't care about having speedy - * handling of malformed input, the loop proceeds as if well formed, - * and should a malformed one come along, it undoes what it already has - * done */ - - U8 * d = s = first_variant; + /* Here, we passed the tests above and know how many UTF-8 variant + * characters there are, which allows us to calculate the size to malloc + * for the non-destructive case */ + U8 *d0; + if (result_as != PL_utf8_to_bytes_new_memory) { + d0 = s0; + } + else { + Newx(d0, (*lenp) + 1 - variant_count, U8); + Copy(s0, d0, invariant_length, U8); + } + U8 * d = d0 + invariant_length; + + /* For the cases where the per-word algorithm wasn't used, everything is + * well-formed and can definitely be translated. When the per word + * algorithm was used, it found that all start bytes in the string were C2 + * or C3, hence any well-formed sequences are convertible to bytes. But we + * didn't test, for example, that there weren't two C2's in a row. That + * means that in the loop below, we have to be sure things are well-formed. + * Because it is very very unlikely that we got this far for something + * malformed, and because we prioritize speed in the normal case over the + * malformed one, we go ahead and do the translation, and undo it if found + * to be necessary. */ + s = first_variant; while (s < send) { - U8 * s1; - - if (UVCHR_IS_INVARIANT(*s)) { - *d++ = *s++; - continue; - } - - /* Here it is two-byte encoded. */ - if ( LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s)) - && LIKELY(UTF8_IS_CONTINUATION((s[1])))) - { - U8 first_byte = *s++; - *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s); - s++; - continue; - } - - /* Here, it is malformed. This shouldn't happen on EBCDIC, and on - * ASCII platforms, we know that the only start bytes in the text - * are C2 and C3, and the code above has made sure that it doesn't - * end with a start byte. That means the only malformations that - * are possible are a start byte without a continuation (either - * followed by another start byte or an invariant) or an unexpected - * continuation. - * - * We have to undo all we've done before, back down to the first - * UTF-8 variant. Note that each 2-byte variant we've done so far - * (converted to single byte) slides things to the left one byte, - * and so we have bytes that haven't been written over. - * - * Here, 'd' points to the next position to overwrite, and 's' - * points to the first invalid byte. That means 'd's contents - * haven't been changed yet, nor has anything else beyond it in the - * string. In restoring to the original contents, we don't need to - * do anything past (d-1). - * - * In particular, the bytes from 'd' to 's' have not been changed. - * This loop uses a new variable 's1' (to avoid confusing 'source' - * and 'destination') set to 'd', and moves 's' and 's1' in lock - * step back so that afterwards, 's1' points to the first changed - * byte that will be the source for the first byte (or bytes) at - * 's' that need to be changed back. Note that s1 can expand to - * two bytes */ - s1 = d; - while (s >= d) { - s--; - if (! UVCHR_IS_INVARIANT(*s1)) { - s--; + U8 c = *s++; + if (! UVCHR_IS_INVARIANT(c)) { + + /* Then it is a multi-byte character. The first pass above + * determined that the string contains only invariants, the C2 and + * C3 start bytes, and continuation bytes. The condition above + * excluded this from being an invariant. To be well formed, it + * needs to be a start byte followed by a continuation byte. */ + if ( UNLIKELY( UTF8_IS_CONTINUATION(c)) + || UNLIKELY( s >= send) + || UNLIKELY(! UTF8_IS_CONTINUATION(*s))) + { + goto cant_convert; } - s1--; - } - /* Do the changing back */ - while (s1 >= first_variant) { - if (UVCHR_IS_INVARIANT(*s1)) { - *s-- = *s1--; - } - else { - *s-- = UTF8_EIGHT_BIT_LO(*s1); - *s-- = UTF8_EIGHT_BIT_HI(*s1); - s1--; - } + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); + s++; } - *lenp = ((STRLEN) -1); - return NULL; + *d++ = c; } /* Success! */ *d = '\0'; - *lenp = d - save; - - return save; -} - -/* -=for apidoc bytes_from_utf8 - -Converts a potentially UTF-8 encoded string C of length C<*lenp> into native -byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C is -actually encoded in UTF-8. + *lenp = d - d0; -Unlike L but like L, this is non-destructive of -the input string. + if (result_as != PL_utf8_to_bytes_overwrite) { + *s_ptr = d0; + if (result_as == PL_utf8_to_bytes_use_temporary) { + SAVEFREEPV(*s_ptr); + } + } -Do nothing if C<*is_utf8p> is 0, or if there are code points in the string -not expressible in native byte encoding. In these cases, C<*is_utf8p> and -C<*lenp> are unchanged, and the return value is the original C. + return PL_converted; -Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a -newly created string containing a downgraded copy of C, and whose length is -returned in C<*lenp>, updated. The new string is C-terminated. The -caller is responsible for arranging for the memory used by this string to get -freed. - -Upon successful return, the number of variants in the string can be computed by -having saved the value of C<*lenp> before the call, and subtracting the -after-call value of C<*lenp> from it. + cant_convert: ; -=cut + /* Here, we found a malformation in the input. This won't happen except + * when the per-word algorithm was used in the first pass, because that may + * miss some malformations. It determined that the only start bytes in the + * text are C2 and C3, but didn't examine it to make sure each of those was + * followed by precisely one continuation, for example. + * + * If the result is in newly allocated memory, just free it */ + if (result_as != PL_utf8_to_bytes_overwrite) { + Safefree(d0); + return PL_cant_convert; + } -There is a macro that avoids this function call, but this is retained for -anyone who calls it with the Perl_ prefix */ + /* Otherwise, we have to undo all we've done before, back down to the first + * UTF-8 variant. Note that each 2-byte variant we've done so far + * (converted to single byte) slides things to the left one byte, and so we + * have bytes that haven't been written over. + * + * Here, 'd' points to the next position to overwrite, and 's' points to + * the first invalid byte. That means 'd's contents haven't been changed + * yet, nor has anything else beyond it in the string. In restoring to the + * original contents, we don't need to do anything past (d-1). + * + * In particular, the bytes from 'd' to 's' have not been changed. This + * loop uses a new variable 's1' (to avoid confusing 'source' and + * 'destination') set to 'd', and moves 's' and 's1' in lock step back so + * that afterwards, 's1' points to the first changed byte that will be the + * source for the first byte (or bytes) at 's' that need to be changed + * back. Note that s1 can expand to two bytes */ + U8 * s1 = d; + while (s >= d) { + s--; + if (! UVCHR_IS_INVARIANT(*s1)) { + s--; + } + s1--; + } -U8 * -Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) -{ - PERL_ARGS_ASSERT_BYTES_FROM_UTF8; - PERL_UNUSED_CONTEXT; + /* Do the changing back */ + while (s1 >= first_variant) { + if (UVCHR_IS_INVARIANT(*s1)) { + *s-- = *s1--; + } + else { + *s-- = UTF8_EIGHT_BIT_LO(*s1); + *s-- = UTF8_EIGHT_BIT_HI(*s1); + s1--; + } + } - return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL); + return PL_cant_convert; } -/* -=for apidoc bytes_from_utf8_loc - -Like C()>, but takes an extra parameter, a pointer -to where to store the location of the first character in C<"s"> that cannot be -converted to non-UTF8. - -If that parameter is C, this function behaves identically to -C. - -Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to -C, except it also sets C<*first_non_downgradable> to C. - -Otherwise, the function returns a newly created C-terminated string -containing the non-UTF8 equivalent of the convertible first portion of -C<"s">. C<*lenp> is set to its length, not including the terminating C. -If the entire input string was converted, C<*is_utf8p> is set to a FALSE value, -and C<*first_non_downgradable> is set to C. - -Otherwise, C<*first_non_downgradable> is set to point to the first byte of the -first character in the original string that wasn't converted. C<*is_utf8p> is -unchanged. Note that the new string may have length 0. - -Another way to look at it is, if C<*first_non_downgradable> is non-C and -C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and -converts as many characters in it as possible stopping at the first one it -finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is -set to point to that. The function returns the portion that could be converted -in a newly created C-terminated string, and C<*lenp> is set to its length, -not including the terminating C. If the very first character in the -original could not be converted, C<*lenp> will be 0, and the new string will -contain just a single C. If the entire input string was converted, -C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C. - -Upon successful return, the number of variants in the converted portion of the -string can be computed by having saved the value of C<*lenp> before the call, -and subtracting the after-call value of C<*lenp> from it. - -=cut - - -*/ - U8 * -Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted) +Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) { - PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC; - - if (! *is_utf8p) { - if (first_unconverted) { - *first_unconverted = NULL; - } - - return (U8 *) s; - } - - const U8 * const s0 = s; - const U8 * const send = s + *lenp; - const U8 * first_variant; + PERL_ARGS_ASSERT_UTF8_TO_BYTES; - /* The initial portion of 's' that consists of invariants can be Copied - * as-is. If it is entirely invariant, the whole thing can be Copied. */ - if (is_utf8_invariant_string_loc(s, *lenp, &first_variant)) { - first_variant = send; + if (utf8_to_bytes_overwrite(&s, lenp)) { + return s; } - U8 *d; - Newx(d, (*lenp) + 1, U8); - Copy(s, d, first_variant - s, U8); + *lenp = (STRLEN) -1; + return NULL; +} - U8 *converted_start = d; - d += first_variant - s; - s = first_variant; +U8 * +Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) +{ + PERL_ARGS_ASSERT_BYTES_FROM_UTF8; - while (s < send) { - U8 c = *s++; - if (! UTF8_IS_INVARIANT(c)) { - - /* Then it is multi-byte encoded. If the code point is above 0xFF, - * have to stop now */ - if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) { - if (first_unconverted) { - *first_unconverted = s - 1; - goto finish_and_return; - } - else { - Safefree(converted_start); - return (U8 *) s0; - } + if (*is_utf8p) { + PL_utf8_to_bytes_ret ret = utf8_to_bytes_new_pv(&s, lenp); + if (ret) { + *is_utf8p = false; + + /* Our callers are always expecting new memory upon success. Give + * it to them, adding a trailing NUL if not already there */ + if (ret == PL_was_noop) { + U8 * new_s; + Newx(new_s, *lenp + 1, U8); + Copy(s, new_s, *lenp, U8); + new_s[*lenp] = '\0'; + s = new_s; } - - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); - s++; } - *d++ = c; } - /* Here, converted the whole of the input */ - *is_utf8p = FALSE; - if (first_unconverted) { - *first_unconverted = NULL; - } - - finish_and_return: - *d = '\0'; - *lenp = d - converted_start; - - /* Trim unused space */ - Renew(converted_start, *lenp + 1, U8); - - return converted_start; + return (U8 *) s; } /* diff --git a/utf8.h b/utf8.h index e4f98270b2bb..365dbb021b27 100644 --- a/utf8.h +++ b/utf8.h @@ -1296,8 +1296,25 @@ point's representation. #define SHARP_S_SKIP 2 #define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end) -#define bytes_from_utf8(s, lenp, is_utf8p) \ - bytes_from_utf8_loc(s, lenp, is_utf8p, 0) + +typedef enum { + PL_cant_convert = 0, + PL_was_noop, + PL_converted, +} PL_utf8_to_bytes_ret; + +typedef enum { + PL_utf8_to_bytes_overwrite = 0, + PL_utf8_to_bytes_new_memory, + PL_utf8_to_bytes_use_temporary, +} Perl_utf8_to_bytes_arg; + +#define Perl_utf8_to_bytes_overwrite(mTHX, s, l) \ + Perl_utf8_to_bytes_(aTHX_ s, l, PL_utf8_to_bytes_overwrite) +#define Perl_utf8_to_bytes_new_pv(mTHX, s, l) \ + Perl_utf8_to_bytes_(aTHX_ (U8 **) s, l, PL_utf8_to_bytes_new_memory) +#define Perl_utf8_to_bytes_temp_pv(mTHX, s, l) \ + Perl_utf8_to_bytes_(aTHX_ (U8 **) s, l, PL_utf8_to_bytes_use_temporary) /* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is * retained solely for backwards compatibility */