1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-05-20 11:40:18 +02:00

Fix scm_to_utf8_stringn to return the length in bytes, et al

* libguile/strings.c (u32_u8_length_in_bytes): Internal static function
  renamed from u32_u8_strlen, whose name was potentially confusing.  For
  added safety, handle everything that can be encoded in the more
  general UTF-8 encoding: up to six bytes for each code point, with code
  points up to 2^31-1.

  (scm_to_utf8_stringn): NUL-terminate only if (lenp == NULL).
  If (lenp != NULL) return the length in bytes in *lenp.
This commit is contained in:
Mark H Weaver 2012-04-02 18:55:45 -04:00
parent 2c1b79513b
commit e26da7a24e

View file

@ -1874,19 +1874,28 @@ latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
return u8_result;
}
/* From RFC 3629:
/* UTF-8 code table
(Note that this includes code points that are not allowed by Unicode,
but since this function has no way to report an error, and its
purpose is to determine the size of destination buffers for
libunistring conversion functions, we err on the safe side and handle
everything that libunistring might conceivably handle, now or in the
future.)
Char. number range | UTF-8 octet sequence
(hexadecimal) | (binary)
--------------------+---------------------------------------------
--------------------+------------------------------------------------------
0000 0000-0000 007F | 0xxxxxxx
0000 0080-0000 07FF | 110xxxxx 10xxxxxx
0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
static size_t
u32_u8_strlen (const scm_t_uint32 *str, size_t len)
u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
{
size_t ret, i;
@ -1900,8 +1909,12 @@ u32_u8_strlen (const scm_t_uint32 *str, size_t len)
ret += 2;
else if (c <= 0xffff)
ret += 3;
else
else if (c <= 0x1fffff)
ret += 4;
else if (c <= 0x3ffffff)
ret += 5;
else
ret += 6;
}
return ret;
@ -1917,21 +1930,28 @@ scm_to_utf8_stringn (SCM str, size_t *lenp)
else
{
scm_t_uint8 *buf, *ret;
size_t len, allocated;
size_t predicted_len, actual_len; /* length in bytes */
len = u32_u8_strlen ((scm_t_uint32*)scm_i_string_wide_chars (str),
scm_i_string_length (str));
allocated = len + 1;
buf = scm_malloc (allocated);
predicted_len = u32_u8_length_in_bytes
((scm_t_uint32 *) scm_i_string_wide_chars (str),
scm_i_string_length (str));
ret = u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
scm_i_string_length (str), buf, &len);
if (ret == buf && len + 1 == allocated)
if (lenp)
{
ret[len] = 0;
return (char *) ret;
*lenp = predicted_len;
buf = scm_malloc (predicted_len);
}
else
{
buf = scm_malloc (predicted_len + 1);
ret[predicted_len] = 0;
}
ret = u32_to_u8 ((scm_t_uint32 *) scm_i_string_wide_chars (str),
scm_i_string_length (str), buf, &actual_len);
if (SCM_LIKELY (ret == buf && actual_len == predicted_len))
return (char *) ret;
/* An error: a bad codepoint. */
{