1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-10 14:00:21 +02:00

Modify socket and time functions for wide strings

* libguile/socket.c (scm_recv): receive the message without holding the
  stringbuf writing lock
  (scm_send): try to narrow a string before using it

* libguile/stime.c (strftime): convert string to UTF-8 so that it can
  be safely passed to strftime
  (strptime): convert input string to UTF-8 so that it can be safely
  passed through strptime

* libguile/strings.c (narrow_stringbuf): new function
  (scm_i_try_narrow_string): new function

* libguile/strings.h: new declaration for scm_i_try_narrow_string
This commit is contained in:
Michael Gran 2009-08-23 06:50:45 -07:00
parent 27646f414e
commit 587a33556f
5 changed files with 200 additions and 33 deletions

View file

@ -33,6 +33,7 @@
#include "libguile/strings.h"
#include "libguile/vectors.h"
#include "libguile/dynwind.h"
#include "libguile/srfi-13.h"
#include "libguile/validate.h"
#include "libguile/socket.h"
@ -1414,6 +1415,8 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
"protocols, if a packet larger than this limit is encountered\n"
"then some data\n"
"will be irrevocably lost.\n\n"
"The data is assumed to be binary, and there is no decoding of\n"
"of locale-encoded strings.\n\n"
"The optional @var{flags} argument is a value or\n"
"bitwise OR of MSG_OOB, MSG_PEEK, MSG_DONTROUTE etc.\n\n"
"The value returned is the number of bytes read from the\n"
@ -1428,6 +1431,7 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
int flg;
char *dest;
size_t len;
SCM msg;
SCM_VALIDATE_OPFPORT (1, sock);
SCM_VALIDATE_STRING (2, buf);
@ -1437,16 +1441,16 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
flg = scm_to_int (flags);
fd = SCM_FPORT_FDES (sock);
len = scm_i_string_length (buf);
buf = scm_i_string_start_writing (buf);
dest = scm_i_string_writable_chars (buf);
len = scm_i_string_length (buf);
msg = scm_i_make_string (len, &dest);
SCM_SYSCALL (rv = recv (fd, dest, len, flg));
scm_i_string_stop_writing ();
scm_string_copy_x (buf, scm_from_int (0),
msg, scm_from_int (0), scm_from_size_t (len));
if (rv == -1)
SCM_SYSERROR;
scm_remember_upto_here_1 (buf);
scm_remember_upto_here_2 (buf, msg);
return scm_from_int (rv);
}
#undef FUNC_NAME
@ -1464,18 +1468,28 @@ SCM_DEFINE (scm_send, "send", 2, 1, 0,
"bitwise OR of MSG_OOB, MSG_PEEK, MSG_DONTROUTE etc.\n\n"
"Note that the data is written directly to the socket\n"
"file descriptor:\n"
"any unflushed buffered port data is ignored.")
"any unflushed buffered port data is ignored.\n\n"
"This operation is defined only for strings containing codepoints\n"
"zero to 255.")
#define FUNC_NAME s_scm_send
{
int rv;
int fd;
int flg;
const char *src;
char *src;
size_t len;
sock = SCM_COERCE_OUTPORT (sock);
SCM_VALIDATE_OPFPORT (1, sock);
SCM_VALIDATE_STRING (2, message);
/* If the string is wide, see if it can be coerced into a narrow
   string.  Signal an error only when the string is wide AND the
   attempt to narrow it failed, i.e. it holds a character above 0xFF.
   scm_i_try_narrow_string returns true on success, so its result
   must be negated here.  */
if (!scm_i_is_narrow_string (message)
    && !scm_i_try_narrow_string (message))
  SCM_MISC_ERROR ("the message string is not 8-bit: ~s",
                  scm_list_1 (message));
if (SCM_UNBNDP (flags))
flg = 0;
else
@ -1592,7 +1606,9 @@ SCM_DEFINE (scm_sendto, "sendto", 3, 1, 1,
"set to be non-blocking.\n"
"Note that the data is written directly to the socket\n"
"file descriptor:\n"
"any unflushed buffered port data is ignored.")
"any unflushed buffered port data is ignored.\n"
"This operation is defined only for strings containing codepoints\n"
"zero to 255.")
#define FUNC_NAME s_scm_sendto
{
int rv;

View file

@ -46,6 +46,7 @@
#include <stdio.h>
#include <errno.h>
#include <strftime.h>
#include <unistr.h>
#include "libguile/_scm.h"
#include "libguile/async.h"
@ -53,6 +54,7 @@
#include "libguile/strings.h"
#include "libguile/vectors.h"
#include "libguile/dynwind.h"
#include "libguile/strings.h"
#include "libguile/validate.h"
#include "libguile/stime.h"
@ -624,18 +626,20 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
{
struct tm t;
char *tbuf;
scm_t_uint8 *tbuf;
int size = 50;
const char *fmt;
char *myfmt;
scm_t_uint8 *fmt;
scm_t_uint8 *myfmt;
int len;
SCM result;
SCM_VALIDATE_STRING (1, format);
bdtime2c (stime, &t, SCM_ARG2, FUNC_NAME);
fmt = scm_i_string_chars (format);
len = scm_i_string_length (format);
/* Convert string to UTF-8 so that non-ASCII characters in the
format are passed through unchanged. */
fmt = scm_i_to_utf8_string (format);
len = strlen ((const char *) fmt);
/* Ugly hack: strftime can return 0 if its buffer is too small,
but some valid time strings (e.g. "%p") can sometimes produce
@ -643,9 +647,11 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
character to the format string, so that valid returns are always
nonzero. */
myfmt = scm_malloc (len+2);
*myfmt = 'x';
strncpy(myfmt+1, fmt, len);
myfmt[len+1] = 0;
*myfmt = (scm_t_uint8) 'x';
strncpy ((char *) myfmt + 1, (const char *) fmt, len);
myfmt[len + 1] = 0;
scm_remember_upto_here_1 (format);
free (fmt);
tbuf = scm_malloc (size);
{
@ -680,7 +686,8 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
/* Use `nstrftime ()' from Gnulib, which supports all GNU extensions
supported by glibc. */
while ((len = nstrftime (tbuf, size, myfmt, &t, 0, 0)) == 0)
while ((len = nstrftime ((char *) tbuf, size,
(const char *) myfmt, &t, 0, 0)) == 0)
{
free (tbuf);
size *= 2;
@ -696,7 +703,7 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
#endif
}
result = scm_from_locale_stringn (tbuf + 1, len - 1);
result = scm_i_from_utf8_string ((const scm_t_uint8 *) tbuf + 1);
free (tbuf);
free (myfmt);
#if HAVE_STRUCT_TM_TM_ZONE
@ -722,14 +729,17 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
#define FUNC_NAME s_scm_strptime
{
struct tm t;
const char *fmt, *str, *rest;
scm_t_uint8 *fmt, *str, *rest;
size_t used_len;
long zoff;
SCM_VALIDATE_STRING (1, format);
SCM_VALIDATE_STRING (2, string);
fmt = scm_i_string_chars (format);
str = scm_i_string_chars (string);
/* Convert strings to UTF-8 so that non-ASCII characters are passed
through unchanged. */
fmt = scm_i_to_utf8_string (format);
str = scm_i_to_utf8_string (string);
/* initialize the struct tm */
#define tm_init(field) t.field = 0
@ -751,7 +761,8 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
fields, hence the use of SCM_CRITICAL_SECTION_START. */
t.tm_isdst = -1;
SCM_CRITICAL_SECTION_START;
rest = strptime (str, fmt, &t);
rest = (scm_t_uint8 *) strptime ((const char *) str,
(const char *) fmt, &t);
SCM_CRITICAL_SECTION_END;
if (rest == NULL)
{
@ -759,6 +770,9 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
instance it doesn't. Force a sensible value for our error
message. */
errno = EINVAL;
scm_remember_upto_here_2 (format, string);
free (str);
free (fmt);
SCM_SYSERROR;
}
@ -770,8 +784,14 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
zoff = 0;
#endif
/* Compute the number of characters consumed.  REST - STR is a byte
   offset into the UTF-8 copy of the input; u8_mbsnlen turns it into
   a count of Unicode characters, whereas u8_strnlen would merely
   count UTF-8 units (bytes).  */
used_len = u8_mbsnlen (str, rest - str);
scm_remember_upto_here_2 (format, string);
free (str);
free (fmt);
return scm_cons (filltime (&t, zoff, NULL),
scm_from_signed_integer (rest - str));
scm_from_signed_integer (used_len));
}
#undef FUNC_NAME
#endif /* HAVE_STRPTIME */

View file

@ -239,6 +239,36 @@ widen_stringbuf (SCM buf)
}
}
/* Convert a stringbuf of 32-bit UCS-4-encoded characters to one
   containing 8-bit Latin-1-encoded characters, if possible.

   BUF is modified in place.  Nothing happens when BUF is already
   narrow, or when any of its characters is above 0xFF.  */
static void
narrow_stringbuf (SCM buf)
{
  size_t i, len;
  scm_t_wchar *wmem;
  char *mem;

  if (!STRINGBUF_WIDE (buf))
    return;

  len = STRINGBUF_OUTLINE_LENGTH (buf);
  wmem = STRINGBUF_WIDE_CHARS (buf);

  /* Bail out before allocating anything if some character does not
     fit in a single byte.  */
  for (i = 0; i < len; i++)
    if (wmem[i] > 0xFF)
      return;

  mem = scm_gc_malloc (sizeof (char) * (len + 1), "string");
  for (i = 0; i < len; i++)
    mem[i] = (unsigned char) wmem[i];
  /* The extra byte allocated above is for the terminator; write it
     explicitly instead of relying on the allocator's fill.  */
  mem[len] = '\0';

  scm_gc_free (wmem, sizeof (scm_t_wchar) * (len + 1), "string");

  /* The wide flag is known to be set at this point, so XOR clears
     it.  */
  SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_WIDE);
  SCM_SET_CELL_WORD_1 (buf, mem);
  SCM_SET_CELL_WORD_2 (buf, len);
}
scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
/* Copy-on-write strings.
@ -459,6 +489,18 @@ scm_i_is_narrow_string (SCM str)
return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
}
/* Attempt to make STR a narrow string.  A string that is already
   narrow is left alone; a wide string whose characters are all at
   most 0xFF is shrunk in place.  Return true if STR is narrow on
   exit, false if it had to stay wide.  */
int
scm_i_try_narrow_string (SCM str)
{
  SCM buf = STRING_STRINGBUF (str);

  narrow_stringbuf (buf);
  return !STRINGBUF_WIDE (buf);
}
/* Returns a pointer to the 8-bit Latin-1 encoded character array of
STR. */
const char *
@ -623,7 +665,7 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
if (scm_i_is_narrow_string (str))
{
char *dst = scm_i_string_writable_chars (str);
dst[p] = (char) (unsigned char) chr;
dst[p] = chr;
}
else
{
@ -633,7 +675,7 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
}
/* Symbols.
Basic symbol creation and accessing is done here, the rest is in
symbols.[hc]. This has been done to keep stringbufs and the
internals of strings and string-like objects confined to this file.
@ -866,7 +908,7 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
else
e5 = scm_cons (scm_from_locale_symbol ("read-only"),
SCM_BOOL_F);
/* Stringbuf info */
if (!STRINGBUF_WIDE (buf))
{
@ -1426,6 +1468,80 @@ scm_from_locale_string (const char *str)
return scm_from_locale_stringn (str, -1);
}
/* Create a Scheme string from the LEN bytes at STR, which are encoded
   in ENCODING.  HANDLER is handed to the converter to select what
   happens on invalid input sequences.  The result is a narrow string
   when every decoded character is at most 0xFF and a wide string
   otherwise.  Signals a memory error or a misc error when the
   conversion fails.  */
static SCM
scm_from_stringn (const char *str, size_t len, const char *encoding,
                  scm_t_string_failed_conversion_handler handler)
{
  size_t u32len, i;
  scm_t_wchar *u32;
  int wide = 0;
  SCM res;

  /* Nothing to decode.  Answer directly instead of depending on the
     converter handing back a non-NULL buffer for empty input, which
     would otherwise be indistinguishable from a failure below.  */
  if (len == 0)
    {
      char *dst;

      res = scm_i_make_string (0, &dst);
      dst[0] = '\0';
      return res;
    }

  u32len = 0;
  u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
                                                (enum iconv_ilseq_handler)
                                                handler,
                                                str, len,
                                                NULL,
                                                NULL, &u32len);

  if (u32 == NULL)
    {
      if (errno == ENOMEM)
        scm_memory_error ("locale string conversion");
      else
        {
          /* There are invalid sequences in the input string.  Since
             it is partially nonsense, what is the best strategy for
             printing it in the error message?  */
          SCM errstr;
          char *dst;

          /* We'll just print it unconverted and hope for the best.  */
          errstr = scm_i_make_string (len, &dst);
          memcpy (dst, str, len);
          scm_misc_error (NULL, "input locale conversion error from ~s: ~s",
                          scm_list_2 (scm_from_locale_string (encoding),
                                      errstr));
          scm_remember_upto_here_1 (errstr);
        }
    }

  /* A single character above 0xFF forces a wide result.  */
  for (i = 0; i < u32len; i++)
    if (u32[i] > 0xFF)
      {
        wide = 1;
        break;
      }

  if (!wide)
    {
      char *dst;

      res = scm_i_make_string (u32len, &dst);
      for (i = 0; i < u32len; i++)
        dst[i] = (unsigned char) u32[i];
      dst[u32len] = '\0';
    }
  else
    {
      scm_t_wchar *wdst;

      res = scm_i_make_wide_string (u32len, &wdst);
      u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
      wdst[u32len] = 0;
    }

  /* The converter's buffer is released here with free ().  */
  free (u32);
  return res;
}
/* Create a Scheme string from the NUL-terminated UTF-8 byte sequence
   STR.  The conversion uses the SCM_FAILED_CONVERSION_ERROR
   handler.  */
SCM
scm_i_from_utf8_string (const scm_t_uint8 *str)
{
  size_t nbytes = strlen ((char *) str);

  return scm_from_stringn ((const char *) str, nbytes, "UTF-8",
                           SCM_FAILED_CONVERSION_ERROR);
}
/* Create a new scheme string from the C string STR. The memory of
STR may be used directly as storage for the new string. */
SCM
@ -1519,16 +1635,15 @@ scm_to_locale_stringn (SCM str, size_t * lenp)
/* In the future, enc will hold the port's encoding. */
enc = NULL;
return scm_to_stringn (str, lenp, enc,
return scm_to_stringn (str, lenp, enc,
SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
}
/* Low-level scheme to C string conversion function. */
char *
scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
scm_t_string_failed_conversion_handler handler)
{
static const char iso[11] = "ISO-8859-1";
char *buf;
size_t ilen, len, i;
@ -1544,7 +1659,7 @@ scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
*lenp = 0;
return buf;
}
if (lenp == NULL)
for (i = 0; i < ilen; i++)
if (scm_i_string_ref (str, i) == '\0')
@ -1570,16 +1685,16 @@ scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
}
}
buf = NULL;
len = 0;
buf = u32_conv_to_encoding (iso,
buf = u32_conv_to_encoding (encoding ? encoding : "ISO-8859-1",
(enum iconv_ilseq_handler) handler,
(scm_t_uint32 *) scm_i_string_wide_chars (str),
ilen, NULL, NULL, &len);
if (buf == NULL)
scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
scm_list_2 (scm_from_locale_string (iso), str));
scm_list_2 (scm_from_locale_string (encoding), str));
if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
unistring_escapes_to_guile_escapes (&buf, &len);
@ -1602,6 +1717,14 @@ scm_to_locale_string (SCM str)
return scm_to_locale_stringn (str, NULL);
}
/* Return the contents of STR converted to UTF-8, as produced by
   scm_to_stringn with the SCM_FAILED_CONVERSION_ERROR handler and no
   length output.  */
scm_t_uint8 *
scm_i_to_utf8_string (SCM str)
{
  return (scm_t_uint8 *) scm_to_stringn (str, NULL, "UTF-8",
                                         SCM_FAILED_CONVERSION_ERROR);
}
size_t
scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
{

View file

@ -124,6 +124,7 @@ SCM_API SCM scm_c_substring_copy (SCM str, size_t start, size_t end);
SCM_API int scm_is_string (SCM x);
SCM_API SCM scm_from_locale_string (const char *str);
SCM_API SCM scm_from_locale_stringn (const char *str, size_t len);
SCM_INTERNAL SCM scm_i_from_utf8_string (const scm_t_uint8 *str);
SCM_API SCM scm_take_locale_string (char *str);
SCM_API SCM scm_take_locale_stringn (char *str, size_t len);
SCM_API char *scm_to_locale_string (SCM str);
@ -132,6 +133,7 @@ SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp,
const char *encoding,
scm_t_string_failed_conversion_handler
handler);
SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str);
SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len);
SCM_API SCM scm_makfromstrs (int argc, char **argv);
@ -168,6 +170,7 @@ SCM_INTERNAL const char *scm_i_symbol_chars (SCM sym);
SCM_INTERNAL const scm_t_wchar *scm_i_symbol_wide_chars (SCM sym);
SCM_INTERNAL size_t scm_i_symbol_length (SCM sym);
SCM_INTERNAL int scm_i_is_narrow_symbol (SCM str);
SCM_INTERNAL int scm_i_try_narrow_string (SCM str);
SCM_INTERNAL SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end);
SCM_INTERNAL scm_t_wchar scm_i_symbol_ref (SCM sym, size_t x);

View file

@ -202,6 +202,11 @@
(string=? (strftime "%Z" t)
"ZOW")))
;; A character above U+00FF placed in the format must come back
;; unchanged as the first character of the result.
(pass-if "strftime passes wide characters"
  (let* ((t (localtime (current-time)))
         (formatted (strftime "\u0100%Z" t)))
    (string=? (substring formatted 0 1)
              "\u0100")))
(with-test-prefix "C99 %z format"
;; %z here is quite possibly affected by the same tm:gmtoff vs current