Fix scm_to_utf8_stringn to return the length in bytes, et al

* libguile/strings.c (u32_u8_length_in_bytes): Internal static function renamed from u32_u8_strlen, whose name was potentially confusing. For added safety, handle everything that can be encoded in the more general UTF-8 encoding: up to six bytes for each code point, with code points up to 2^31-1. (scm_to_utf8_stringn): NUL-terminate only if (lenp == NULL). If (lenp != NULL) return the length in bytes in *lenp.
2025-07-09 02:50:20 +02:00 · 2012-04-02 18:55:45 -04:00 · 2012-04-02 18:55:45 -04:00 · e26da7a24e
commit e26da7a24e
parent 2c1b79513b
1 changed files with 36 additions and 16 deletions
--- a/libguile/strings.c
+++ b/libguile/strings.c
@ -1874,19 +1874,28 @@ latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
  return u8_result;
 }

-/* From RFC 3629:
+/* UTF-8 code table
+
+   (Note that this includes code points that are not allowed by Unicode,
+    but since this function has no way to report an error, and its
+    purpose is to determine the size of destination buffers for
+    libunistring conversion functions, we err on the safe side and handle
+    everything that libunistring might conceivably handle, now or in the
+    future.)

   Char. number range  |        UTF-8 octet sequence
      (hexadecimal)    |              (binary)
-   --------------------+---------------------------------------------
+   --------------------+------------------------------------------------------
   0000 0000-0000 007F | 0xxxxxxx
   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
-   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 */

 static size_t
-u32_u8_strlen (const scm_t_uint32 *str, size_t len)
+u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
 {
  size_t ret, i;

@ -1900,8 +1909,12 @@ u32_u8_strlen (const scm_t_uint32 *str, size_t len)
        ret += 2;
      else if (c <= 0xffff)
        ret += 3;
-      else
+      else if (c <= 0x1fffff)
        ret += 4;
+      else if (c <= 0x3ffffff)
+        ret += 5;
+      else
+        ret += 6;
    }

  return ret;
@ -1917,21 +1930,28 @@ scm_to_utf8_stringn (SCM str, size_t *lenp)
  else
    {
      scm_t_uint8 *buf, *ret;
-      size_t len, allocated;
+      size_t predicted_len, actual_len;  /* length in bytes */

-      len = u32_u8_strlen ((scm_t_uint32*)scm_i_string_wide_chars (str),
-                           scm_i_string_length (str));
-      allocated = len + 1;
-      buf = scm_malloc (allocated);
+      predicted_len = u32_u8_length_in_bytes
+        ((scm_t_uint32 *) scm_i_string_wide_chars (str),
+         scm_i_string_length (str));

-      ret = u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
-                       scm_i_string_length (str), buf, &len);
-
-      if (ret == buf && len + 1 == allocated)
+      if (lenp)
        {
-          ret[len] = 0;
-          return (char *) ret;
+          *lenp = predicted_len;
+          buf = scm_malloc (predicted_len);
        }
+      else
+        {
+          buf = scm_malloc (predicted_len + 1);
+          ret[predicted_len] = 0;
+        }
+
+      ret = u32_to_u8 ((scm_t_uint32 *) scm_i_string_wide_chars (str),
+                       scm_i_string_length (str), buf, &actual_len);
+
+      if (SCM_LIKELY (ret == buf && actual_len == predicted_len))
+        return (char *) ret;

      /* An error: a bad codepoint.  */
      {