1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-29 14:30:34 +02:00

Modify bytevectors/string conversions to allow wide strings

The bytevector to string conversion functions were accomplished
by converting via locale strings.  This did not allow conversions
of wide strings in an 8-bit locale.  This is avoided by using knowledge
of the storage format of the string.

* libguile/bytevectors.c (STRING_TO_UTF, scm_string_to_utf8): modify
  string to bytevector conversion to use internal string information
  (UTF_TO_STRING, scm_utf8_to_string): modify bytevector to string
  conversion
This commit is contained in:
Michael Gran 2009-10-28 06:24:23 -07:00
parent b02b05332f
commit 3a5bc4fada

View file

@ -40,6 +40,7 @@
#include <byteswap.h> #include <byteswap.h>
#include <striconveh.h> #include <striconveh.h>
#include <uniconv.h> #include <uniconv.h>
#include <unistr.h>
#ifdef HAVE_LIMITS_H #ifdef HAVE_LIMITS_H
# include <limits.h> # include <limits.h>
@ -1871,58 +1872,50 @@ utf_encoding_name (char *name, size_t utf_width, SCM endianness)
#define MAX_UTF_ENCODING_NAME_LEN 16 #define MAX_UTF_ENCODING_NAME_LEN 16
/* Produce the body of a `string->utf' function. */ /* Produce the body of a `string->utf' function. */
#define STRING_TO_UTF(_utf_width) \ #define STRING_TO_UTF(_utf_width) \
SCM utf; \ SCM utf; \
int err; \ int err; \
char *c_str; \ char c_utf_name[MAX_UTF_ENCODING_NAME_LEN]; \
char c_utf_name[MAX_UTF_ENCODING_NAME_LEN]; \ char *c_utf = NULL; \
char *c_utf = NULL, *c_locale; \ size_t c_strlen, c_utf_len = 0; \
size_t c_strlen, c_raw_strlen, c_utf_len = 0; \ \
\ SCM_VALIDATE_STRING (1, str); \
SCM_VALIDATE_STRING (1, str); \ if (endianness == SCM_UNDEFINED) \
if (endianness == SCM_UNDEFINED) \ endianness = scm_sym_big; \
endianness = scm_sym_big; \ else \
else \ SCM_VALIDATE_SYMBOL (2, endianness); \
SCM_VALIDATE_SYMBOL (2, endianness); \ \
\ utf_encoding_name (c_utf_name, (_utf_width), endianness); \
c_strlen = scm_c_string_length (str); \ \
c_raw_strlen = c_strlen * ((_utf_width) / 8); \ c_strlen = scm_i_string_length (str); \
do \ if (scm_i_is_narrow_string (str)) \
{ \ { \
c_str = (char *) alloca (c_raw_strlen + 1); \ err = mem_iconveh (scm_i_string_chars (str), c_strlen, \
c_raw_strlen = scm_to_locale_stringbuf (str, c_str, c_strlen); \ "ISO-8859-1", c_utf_name, \
} \ iconveh_question_mark, NULL, \
while (c_raw_strlen > c_strlen); \ &c_utf, &c_utf_len); \
c_str[c_raw_strlen] = '\0'; \ if (SCM_UNLIKELY (err)) \
\ scm_syserror_msg (FUNC_NAME, "failed to convert string: ~A", \
utf_encoding_name (c_utf_name, (_utf_width), endianness); \ scm_list_1 (str), err); \
\ } \
c_locale = (char *) alloca (strlen (locale_charset ()) + 1); \ else \
strcpy (c_locale, locale_charset ()); \ { \
\ const scm_t_wchar *wbuf = scm_i_string_wide_chars (str); \
err = mem_iconveh (c_str, c_raw_strlen, \ c_utf = u32_conv_to_encoding (c_utf_name, \
c_locale, c_utf_name, \ iconveh_question_mark, \
iconveh_question_mark, NULL, \ (scm_t_uint32 *) wbuf, \
&c_utf, &c_utf_len); \ c_strlen, NULL, NULL, &c_utf_len); \
if (SCM_UNLIKELY (err)) \ if (SCM_UNLIKELY (c_utf == NULL)) \
scm_syserror_msg (FUNC_NAME, "failed to convert string: ~A", \ scm_syserror_msg (FUNC_NAME, "failed to convert string: ~A", \
scm_list_1 (str), err); \ scm_list_1 (str), errno); \
else \ } \
{ \ scm_dynwind_begin (0); \
/* C_UTF is null-terminated. It is malloc(3)-allocated, so we cannot \ scm_dynwind_free (c_utf); \
use `scm_c_take_bytevector ()'. */ \ utf = make_bytevector (c_utf_len, SCM_ARRAY_ELEMENT_TYPE_VU8); \
scm_dynwind_begin (0); \ memcpy (SCM_BYTEVECTOR_CONTENTS (utf), c_utf, c_utf_len); \
scm_dynwind_free (c_utf); \ scm_dynwind_end (); \
\ \
utf = make_bytevector (c_utf_len, \ return (utf);
SCM_ARRAY_ELEMENT_TYPE_VU8); \
memcpy (SCM_BYTEVECTOR_CONTENTS (utf), c_utf, \
c_utf_len); \
\
scm_dynwind_end (); \
} \
\
return (utf);
@ -1934,36 +1927,30 @@ SCM_DEFINE (scm_string_to_utf8, "string->utf8",
#define FUNC_NAME s_scm_string_to_utf8 #define FUNC_NAME s_scm_string_to_utf8
{ {
SCM utf; SCM utf;
char *c_str;
uint8_t *c_utf; uint8_t *c_utf;
size_t c_strlen, c_raw_strlen; size_t c_strlen, c_utf_len = 0;
SCM_VALIDATE_STRING (1, str); SCM_VALIDATE_STRING (1, str);
c_strlen = scm_c_string_length (str); c_strlen = scm_i_string_length (str);
c_raw_strlen = c_strlen; if (scm_i_is_narrow_string (str))
do c_utf = u8_conv_from_encoding ("ISO-8859-1", iconveh_question_mark,
scm_i_string_chars (str), c_strlen,
NULL, NULL, &c_utf_len);
else
{ {
c_str = (char *) alloca (c_raw_strlen + 1); const scm_t_wchar *wbuf = scm_i_string_wide_chars (str);
c_raw_strlen = scm_to_locale_stringbuf (str, c_str, c_strlen); c_utf = u32_to_u8 ((const uint32_t *) wbuf, c_strlen, NULL, &c_utf_len);
} }
while (c_raw_strlen > c_strlen);
c_str[c_raw_strlen] = '\0';
c_utf = u8_strconv_from_locale (c_str);
if (SCM_UNLIKELY (c_utf == NULL)) if (SCM_UNLIKELY (c_utf == NULL))
scm_syserror (FUNC_NAME); scm_syserror (FUNC_NAME);
else else
{ {
/* C_UTF is null-terminated. It is malloc(3)-allocated, so we cannot
use `scm_c_take_bytevector ()'. */
scm_dynwind_begin (0); scm_dynwind_begin (0);
scm_dynwind_free (c_utf); scm_dynwind_free (c_utf);
utf = make_bytevector (UTF_STRLEN (8, c_utf), utf = make_bytevector (c_utf_len, SCM_ARRAY_ELEMENT_TYPE_VU8);
SCM_ARRAY_ELEMENT_TYPE_VU8); memcpy (SCM_BYTEVECTOR_CONTENTS (utf), c_utf, c_utf_len);
memcpy (SCM_BYTEVECTOR_CONTENTS (utf), c_utf,
UTF_STRLEN (8, c_utf));
scm_dynwind_end (); scm_dynwind_end ();
} }
@ -2000,10 +1987,10 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32",
#define UTF_TO_STRING(_utf_width) \ #define UTF_TO_STRING(_utf_width) \
SCM str = SCM_BOOL_F; \ SCM str = SCM_BOOL_F; \
int err; \ int err; \
char *c_str = NULL, *c_locale; \ char *c_str = NULL; \
char c_utf_name[MAX_UTF_ENCODING_NAME_LEN]; \ char c_utf_name[MAX_UTF_ENCODING_NAME_LEN]; \
const char *c_utf; \ char *c_utf; \
size_t c_strlen = 0, c_utf_len; \ size_t c_strlen = 0, c_utf_len = 0; \
\ \
SCM_VALIDATE_BYTEVECTOR (1, utf); \ SCM_VALIDATE_BYTEVECTOR (1, utf); \
if (endianness == SCM_UNDEFINED) \ if (endianness == SCM_UNDEFINED) \
@ -2015,20 +2002,19 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32",
c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf); \ c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf); \
utf_encoding_name (c_utf_name, (_utf_width), endianness); \ utf_encoding_name (c_utf_name, (_utf_width), endianness); \
\ \
c_locale = (char *) alloca (strlen (locale_charset ()) + 1); \
strcpy (c_locale, locale_charset ()); \
\
err = mem_iconveh (c_utf, c_utf_len, \ err = mem_iconveh (c_utf, c_utf_len, \
c_utf_name, c_locale, \ c_utf_name, "UTF-8", \
iconveh_question_mark, NULL, \ iconveh_question_mark, NULL, \
&c_str, &c_strlen); \ &c_str, &c_strlen); \
if (SCM_UNLIKELY (err)) \ if (SCM_UNLIKELY (err)) \
scm_syserror_msg (FUNC_NAME, "failed to convert to string: ~A", \ scm_syserror_msg (FUNC_NAME, "failed to convert to string: ~A", \
scm_list_1 (utf), err); \ scm_list_1 (utf), err); \
else \ else \
/* C_STR is null-terminated. */ \ { \
str = scm_take_locale_stringn (c_str, c_strlen); \ str = scm_from_stringn (c_str, c_strlen, "UTF-8", \
\ SCM_FAILED_CONVERSION_ERROR); \
free (c_str); \
} \
return (str); return (str);
@ -2040,29 +2026,15 @@ SCM_DEFINE (scm_utf8_to_string, "utf8->string",
#define FUNC_NAME s_scm_utf8_to_string #define FUNC_NAME s_scm_utf8_to_string
{ {
SCM str; SCM str;
int err;
char *c_str = NULL, *c_locale;
const char *c_utf; const char *c_utf;
size_t c_utf_len, c_strlen = 0; size_t c_utf_len = 0;
SCM_VALIDATE_BYTEVECTOR (1, utf); SCM_VALIDATE_BYTEVECTOR (1, utf);
c_utf_len = SCM_BYTEVECTOR_LENGTH (utf); c_utf_len = SCM_BYTEVECTOR_LENGTH (utf);
c_locale = (char *) alloca (strlen (locale_charset ()) + 1);
strcpy (c_locale, locale_charset ());
c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf); c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf);
err = mem_iconveh (c_utf, c_utf_len, str = scm_from_stringn (c_utf, c_utf_len, "UTF-8",
"UTF-8", c_locale, SCM_FAILED_CONVERSION_ERROR);
iconveh_question_mark, NULL,
&c_str, &c_strlen);
if (SCM_UNLIKELY (err))
scm_syserror_msg (FUNC_NAME, "failed to convert to string: ~A",
scm_list_1 (utf), err);
else
/* C_STR is null-terminated. */
str = scm_take_locale_stringn (c_str, c_strlen);
return (str); return (str);
} }
@ -2090,7 +2062,6 @@ SCM_DEFINE (scm_utf32_to_string, "utf32->string",
} }
#undef FUNC_NAME #undef FUNC_NAME
/* Bytevectors as generalized vectors & arrays. */ /* Bytevectors as generalized vectors & arrays. */