From 5d31895bad7914d64b4bbbebc170a28c4daa2d45 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 4 Dec 2024 17:12:11 -0700 Subject: [PATCH] Add new function bytes_to_utf8_free_me This is like bytes_to_utf8, but if the representation of the input string is the same in UTF-8 as it is in native format, the allocation of new memory is skipped. This presents optimization possibilities. --- embed.fnc | 6 ++++- embed.h | 3 ++- proto.h | 7 +++-- utf8.c | 80 ++++++++++++++++++++++++++++++++++++++++++------------- utf8.h | 1 + 5 files changed, 75 insertions(+), 22 deletions(-) diff --git a/embed.fnc b/embed.fnc index 7792a28e7a3c..ac63eff2d390 100644 --- a/embed.fnc +++ b/embed.fnc @@ -794,8 +794,12 @@ Adp |int |bytes_cmp_utf8 |NN const U8 *b \ Adp |U8 * |bytes_from_utf8|NN const U8 *s \ |NN STRLEN *lenp \ |NN bool *is_utf8p -Adp |U8 * |bytes_to_utf8 |NN const U8 *s \ +Admp |U8 * |bytes_to_utf8 |NN const U8 *s \ |NN STRLEN *lenp +Adp |U8 * |bytes_to_utf8_free_me \ + |NN const U8 *s \ + |NN STRLEN *lenp \ + |NULLOK const U8 **free_me AOdp |SSize_t|call_argv |NN const char *sub_name \ |I32 flags \ |NN char **argv diff --git a/embed.h b/embed.h index dfcc4f4881e6..b3ad1973b659 100644 --- a/embed.h +++ b/embed.h @@ -155,7 +155,8 @@ # define block_start(a) Perl_block_start(aTHX_ a) # define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d) # define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c) -# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) +# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX,a,b) +# define bytes_to_utf8_free_me(a,b,c) Perl_bytes_to_utf8_free_me(aTHX_ a,b,c) # define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv # define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c) # define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b) diff --git a/proto.h b/proto.h index 32e8d48f4fa7..521c33684483 100644 --- a/proto.h +++ b/proto.h @@ -398,9 +398,12 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p); #define PERL_ARGS_ASSERT_BYTES_FROM_UTF8 \ assert(s); assert(lenp); assert(is_utf8p) +/* PERL_CALLCONV U8 * +Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp); */ + PERL_CALLCONV U8 * -Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp); -#define PERL_ARGS_ASSERT_BYTES_TO_UTF8 \ +Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, STRLEN *lenp, const U8 **free_me); +#define PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME \ assert(s); assert(lenp) /* PERL_CALLCONV bool diff --git a/utf8.c b/utf8.c index 1da50e1bc39d..4d16d5308b31 100644 --- a/utf8.c +++ b/utf8.c @@ -3182,20 +3182,45 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) } /* -=for apidoc bytes_to_utf8 - -Converts a string C of length C<*lenp> bytes from the native encoding into -UTF-8. -Returns a pointer to the newly-created string, and sets C<*lenp> to -reflect the new length in bytes. The caller is responsible for arranging for -the memory used by this string to get freed. +=for apidoc bytes_to_utf8 +=for apidoc_item bytes_to_utf8_free_me + +These each convert a string C of length C<*lenp> bytes from the native +encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to +the UTF-8 string, and setting C<*lenp> to its length in bytes, while making +sure that the string is terminated by a C character. + +They differ in that C takes an extra parameter +C. If that parameter is NULL, this function behaves identically to +C. + +But if not NULL, the function skips allocating new memory if the input string +already is C-terminated, and its UTF-8 representation is the same as its +native representation. In other words it returns the input string if +converting the string would be a no-op. It sets C<*free_me> to NULL in that +case. Otherwise C<*free_me> is set to the address of the newly allocated +memory. Note that in both cases, you can pass that result to C> +and it will do the right thing. + +Note that when new memory is allocated, the caller is responsible for arranging +for that memory to get freed. (This is transparent to the caller if +C is called with C.) + +The two forms have subtle differences in trailing C handling. +C does not look for any trailing C. Instead it +overallocates space for the copy by 1 byte and adds a C to the end of it. +If the input C<*lenp> included a C, there will be two trailing C +characters; otherwise just 1. There is code that depends on this behavior. +C has to look for a trailing C in order to be able +to guarantee that the result has one if no copy is made. If the input C<*lenp> +doesn't include any C character at the end, this form has to assume that +one doesn't exist, and will create a copy. Only if C<*lenp> does include the +C, does this form check to see if a copy can be avoided. Upon return, the number of variants in the string can be computed by having saved the value of C<*lenp> before the call, and subtracting it from the after-call value of C<*lenp>. -A C character will be written after the end of the string. - If you want to convert to UTF-8 from encodings other than the native (Latin1 or EBCDIC), see L(). @@ -3204,26 +3229,45 @@ see L(). */ U8* -Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) +Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp, + const U8 ** free_me_ptr) { - const U8 * const send = s + (*lenp); + PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME; + PERL_UNUSED_CONTEXT; + + const Size_t len = *lenp; + const U8 * const send = s + len; + const Size_t variant_count = variant_under_utf8_count(s, send); + const bool has_trailing_NUL = free_me_ptr && len > 0 && *(send - 1) == '\0'; + + /* Return the input unchanged if the flag indicates to do so, and there + * are no characters that differ when represented in UTF-8, and the + * original is NUL-terminated */ + if (free_me_ptr != NULL && variant_count == 0 && has_trailing_NUL) { + *free_me_ptr = NULL; + return (U8 *) s; + } + U8 *d; U8 *dst; - PERL_ARGS_ASSERT_BYTES_TO_UTF8; - PERL_UNUSED_CONTEXT; - - /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ - Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8); + /* 1 for each byte except any trailing NUL + * + 1 for each byte that expands to two + * + 1 for the trailing NUL */ + Newx(d, len - has_trailing_NUL + variant_count + 1, U8); dst = d; - while (s < send) { + while (s < send - has_trailing_NUL) { append_utf8_from_native_byte(*s, &d); s++; } *d = '\0'; - *lenp = d-dst; + *lenp = d - dst; + + if (free_me_ptr != NULL) { + *free_me_ptr = dst; + } return dst; } diff --git a/utf8.h b/utf8.h index d62a06f742af..929ea68b1b27 100644 --- a/utf8.h +++ b/utf8.h @@ -1328,6 +1328,7 @@ point's representation. #define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end) +#define Perl_bytes_to_utf8(mTHX, s, lenp) Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL) typedef enum { PL_utf8_to_bytes_overwrite = 0, PL_utf8_to_bytes_new_memory,