mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-05-20 11:40:18 +02:00
Fix scm_to_utf8_stringn to return the length in bytes, et al
* libguile/strings.c (u32_u8_length_in_bytes): Internal static function
  renamed from u32_u8_strlen, whose name was potentially confusing.  For
  added safety, handle everything that can be encoded in the more
  general UTF-8 encoding: up to six bytes for each code point, with code
  points up to 2^31-1.
  (scm_to_utf8_stringn): NUL-terminate only if (lenp == NULL).  If
  (lenp != NULL), return the length in bytes in *lenp.
This commit is contained in:
parent
2c1b79513b
commit
e26da7a24e
1 changed files with 36 additions and 16 deletions
|
@ -1874,19 +1874,28 @@ latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
|
|||
return u8_result;
|
||||
}
|
||||
|
||||
/* From RFC 3629:
|
||||
/* UTF-8 code table
|
||||
|
||||
(Note that this includes code points that are not allowed by Unicode,
|
||||
but since this function has no way to report an error, and its
|
||||
purpose is to determine the size of destination buffers for
|
||||
libunicode conversion functions, we err on the safe side and handle
|
||||
everything that libunicode might conceivably handle, now or in the
|
||||
future.)
|
||||
|
||||
Char. number range | UTF-8 octet sequence
|
||||
(hexadecimal) | (binary)
|
||||
--------------------+---------------------------------------------
|
||||
--------------------+------------------------------------------------------
|
||||
0000 0000-0000 007F | 0xxxxxxx
|
||||
0000 0080-0000 07FF | 110xxxxx 10xxxxxx
|
||||
0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
|
||||
0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*/
|
||||
|
||||
static size_t
|
||||
u32_u8_strlen (const scm_t_uint32 *str, size_t len)
|
||||
u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
|
||||
{
|
||||
size_t ret, i;
|
||||
|
||||
|
@ -1900,8 +1909,12 @@ u32_u8_strlen (const scm_t_uint32 *str, size_t len)
|
|||
ret += 2;
|
||||
else if (c <= 0xffff)
|
||||
ret += 3;
|
||||
else
|
||||
else if (c <= 0x1fffff)
|
||||
ret += 4;
|
||||
else if (c <= 0x3ffffff)
|
||||
ret += 5;
|
||||
else
|
||||
ret += 6;
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@ -1917,21 +1930,28 @@ scm_to_utf8_stringn (SCM str, size_t *lenp)
|
|||
else
|
||||
{
|
||||
scm_t_uint8 *buf, *ret;
|
||||
size_t len, allocated;
|
||||
size_t predicted_len, actual_len; /* length in bytes */
|
||||
|
||||
len = u32_u8_strlen ((scm_t_uint32*)scm_i_string_wide_chars (str),
|
||||
scm_i_string_length (str));
|
||||
allocated = len + 1;
|
||||
buf = scm_malloc (allocated);
|
||||
predicted_len = u32_u8_length_in_bytes
|
||||
((scm_t_uint32 *) scm_i_string_wide_chars (str),
|
||||
scm_i_string_length (str));
|
||||
|
||||
ret = u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
|
||||
scm_i_string_length (str), buf, &len);
|
||||
|
||||
if (ret == buf && len + 1 == allocated)
|
||||
if (lenp)
|
||||
{
|
||||
ret[len] = 0;
|
||||
return (char *) ret;
|
||||
*lenp = predicted_len;
|
||||
buf = scm_malloc (predicted_len);
|
||||
}
|
||||
else
|
||||
{
|
||||
buf = scm_malloc (predicted_len + 1);
|
||||
ret[predicted_len] = 0;
|
||||
}
|
||||
|
||||
ret = u32_to_u8 ((scm_t_uint32 *) scm_i_string_wide_chars (str),
|
||||
scm_i_string_length (str), buf, &actual_len);
|
||||
|
||||
if (SCM_LIKELY (ret == buf && actual_len == predicted_len))
|
||||
return (char *) ret;
|
||||
|
||||
/* An error: a bad codepoint. */
|
||||
{
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue