diff --git a/embed.fnc b/embed.fnc index cc2b9ac7aaaf..fbb3ed84f0ca 100644 --- a/embed.fnc +++ b/embed.fnc @@ -800,6 +800,9 @@ Adp |U8 * |bytes_to_utf8_free_me \ |NN const U8 *s \ |NN STRLEN *lenp \ |NULLOK void **free_me +Adip |U8 * |bytes_to_utf8_temp_pv \ + |NN const U8 *s \ + |NN STRLEN *lenp AOdp |SSize_t|call_argv |NN const char *sub_name \ |I32 flags \ |NN char **argv diff --git a/embed.h b/embed.h index 883b2fb72bdb..28d55b529cb5 100644 --- a/embed.h +++ b/embed.h @@ -157,6 +157,7 @@ # define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c) # define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) # define bytes_to_utf8_free_me(a,b,c) Perl_bytes_to_utf8_free_me(aTHX_ a,b,c) +# define bytes_to_utf8_temp_pv(a,b) Perl_bytes_to_utf8_temp_pv(aTHX_ a,b) # define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv # define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c) # define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b) diff --git a/inline.h b/inline.h index bad3967673f4..0bac2329a888 100644 --- a/inline.h +++ b/inline.h @@ -1236,6 +1236,19 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) return bytes_to_utf8_free_me(s, lenp, NULL); } +PERL_STATIC_INLINE U8 * +Perl_bytes_to_utf8_temp_pv(pTHX_ const U8 *s, STRLEN *lenp) +{ + void * free_me = NULL; + U8 * converted = bytes_to_utf8_free_me(s, lenp, &free_me); + + if (free_me) { + SAVEFREEPV(free_me); + } + + return converted; +} + PERL_STATIC_INLINE bool Perl_utf8_to_bytes_new_pv(pTHX_ U8 const **s_ptr, STRLEN *lenp, void ** free_me) { diff --git a/pod/perldelta.pod b/pod/perldelta.pod index e0c3a7a72c16..35b6362898e3 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -360,7 +360,16 @@ well. =item * -XXX +Two new API functions are introduced to convert strings encoded in +native bytes format to UTF-8. These return the string unchanged if its +UTF-8 representation is the same as the original. Otherwise, new memory +is allocated to contain the converted string. 
This is in contrast to +the existing L<perlapi/C<bytes_to_utf8>> which always allocates new +memory. The new functions are L<perlapi/C<bytes_to_utf8_free_me>> and +L<perlapi/C<bytes_to_utf8_temp_pv>>. +L<perlapi/C<bytes_to_utf8_temp_pv>> arranges for the new memory to +automatically be freed. With C<bytes_to_utf8_free_me>, you are +responsible for freeing any newly allocated memory. =back diff --git a/proto.h b/proto.h index 513965b4ef88..3b125616efa0 100644 --- a/proto.h +++ b/proto.h @@ -9644,6 +9644,11 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp); # define PERL_ARGS_ASSERT_BYTES_TO_UTF8 \ assert(s); assert(lenp) +PERL_STATIC_INLINE U8 * +Perl_bytes_to_utf8_temp_pv(pTHX_ const U8 *s, STRLEN *lenp); +# define PERL_ARGS_ASSERT_BYTES_TO_UTF8_TEMP_PV \ + assert(s); assert(lenp) + PERL_STATIC_INLINE void Perl_clear_defarray_simple(pTHX_ AV *av); # define PERL_ARGS_ASSERT_CLEAR_DEFARRAY_SIMPLE \ diff --git a/utf8.c b/utf8.c index 928b6242a179..f2e8b80c7b4a 100644 --- a/utf8.c +++ b/utf8.c @@ -3256,6 +3256,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) /* =for apidoc bytes_to_utf8 =for apidoc_item bytes_to_utf8_free_me +=for apidoc_item bytes_to_utf8_temp_pv These each convert a string C<s> of length C<*lenp> bytes from the native encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to @@ -3275,6 +3276,13 @@ already there. In both cases, the caller is responsible for arranging for any new memory to get freed. +C<bytes_to_utf8_temp_pv> simply returns a pointer to the input string if the +string's UTF-8 representation is the same as its native representation, thus +behaving like C<bytes_to_utf8_free_me> in this situation. Otherwise, it +behaves like C<bytes_to_utf8>, returning a pointer to new memory containing the +conversion of the input. The difference is that it also arranges for the new +memory to automatically be freed by calling C<L</SAVEFREEPV>> on it. + C<bytes_to_utf8_free_me> takes an extra parameter, C<free_me> to communicate. to the caller that memory was allocated or not. If that parameter is NULL, C<bytes_to_utf8_free_me> acts identically to C<bytes_to_utf8>, always