1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-30 11:50:28 +02:00

Thread-safety fixes for iconv and ports

* libguile/ports-internal.h (scm_t_port): Rework to store iconv
  descriptors inline, so that the port finalizer doesn't race with the
  iconv descriptor finalizer.  Access is serialized through a lock.
  Fixes a bug whereby if the port finalizer and the descriptor finalizer
  run on different threads, the close-port run by the port finalizer
  could try to free the iconv descriptors at the same time as the
  descriptor finalizer.
* libguile/ports.c (iconv_lock): New static variable.
  (scm_c_make_port_with_encoding): Initialize iconv-related fields.
  (scm_close_port): Lock while frobbing iconv descriptors.
  (prepare_iconv_descriptors): Adapt.
  (scm_specialize_port_encoding_x, scm_i_set_port_encoding_x): Lock
  while preparing iconv descriptors.
  (scm_port_acquire_iconv_descriptors)
  (scm_port_release_iconv_descriptors): New functions, which replace
  scm_i_port_iconv_descriptors.
  (scm_port_decode_char): Lock around iconv operations.
  (port_clear_stream_start_for_bom_write): Acquire iconv descriptors
  before checking precise_encoding, to make sure precise_encoding is
  initialized.
* libguile/print.c (display_string_using_iconv): Adapt to use the new
  interface to get iconv descriptors from a port.
This commit is contained in:
Andy Wingo 2016-05-23 16:37:23 +02:00
parent c95a19376b
commit 6bf7ec0c9c
3 changed files with 128 additions and 130 deletions

View file

@ -23,6 +23,7 @@
#define SCM_PORTS_INTERNAL #define SCM_PORTS_INTERNAL
#include <assert.h> #include <assert.h>
#include <iconv.h>
#include "libguile/_scm.h" #include "libguile/_scm.h"
#include "libguile/ports.h" #include "libguile/ports.h"
@ -302,24 +303,6 @@ scm_port_buffer_putback (SCM buf, const scm_t_uint8 *src, size_t count)
src, count); src, count);
} }
/* This is a separate object so that only those ports that use iconv
cause finalizers to be registered. */
struct scm_iconv_descriptors
{
/* This is the same as pt->encoding, except if pt->encoding is UTF-16
or UTF-32, in which case this is UTF-16LE or a similar
byte-order-specialed version of UTF-16 or UTF-32. We don't re-set
pt->encoding because being just plain UTF-16 or UTF-32 has an
additional meaning, being that we should consume and produce byte
order marker codepoints as appropriate. */
SCM precise_encoding;
/* input/output iconv conversion descriptors */
void *input_cd;
void *output_cd;
};
typedef struct scm_iconv_descriptors scm_t_iconv_descriptors;
struct scm_t_port struct scm_t_port
{ {
/* Source location information. */ /* Source location information. */
@ -342,15 +325,26 @@ struct scm_t_port
/* True if the port is random access. Implies that the buffers must /* True if the port is random access. Implies that the buffers must
be flushed before switching between reading and writing, seeking, be flushed before switching between reading and writing, seeking,
and so on. */ and so on. */
int rw_random; unsigned rw_random : 1;
unsigned at_stream_start_for_bom_read : 1;
unsigned at_stream_start_for_bom_write : 1;
/* Character encoding support. */ /* Character encoding support. */
SCM encoding; /* A symbol of upper-case ASCII. */ SCM encoding; /* A symbol of upper-case ASCII. */
SCM conversion_strategy; /* A symbol; either substitute, error, or escape. */ SCM conversion_strategy; /* A symbol; either substitute, error, or escape. */
unsigned at_stream_start_for_bom_read : 1; /* This is the same as pt->encoding, except if `encoding' is UTF-16 or
unsigned at_stream_start_for_bom_write : 1; UTF-32, in which case this is UTF-16LE or a similar
scm_t_iconv_descriptors *iconv_descriptors; byte-order-specialed version of UTF-16 or UTF-32. This is a
separate field from `encoding' because being just plain UTF-16 or
UTF-32 has an additional meaning, being that we should consume and
produce byte order marker codepoints as appropriate. Set to #f
before the iconv descriptors have been opened. */
SCM precise_encoding; /* with iconv_lock */
iconv_t input_cd; /* with iconv_lock */
iconv_t output_cd; /* with iconv_lock */
/* Port properties. */
SCM alist; SCM alist;
}; };
@ -359,6 +353,9 @@ struct scm_t_port
#define SCM_FILENAME(x) (SCM_PORT (x)->file_name) #define SCM_FILENAME(x) (SCM_PORT (x)->file_name)
#define SCM_SET_FILENAME(x, n) (SCM_PORT (x)->file_name = (n)) #define SCM_SET_FILENAME(x, n) (SCM_PORT (x)->file_name = (n))
SCM_INTERNAL scm_t_iconv_descriptors * scm_i_port_iconv_descriptors (SCM port); SCM_INTERNAL void scm_port_acquire_iconv_descriptors (SCM port,
iconv_t *input_cd,
iconv_t *output_cd);
SCM_INTERNAL void scm_port_release_iconv_descriptors (SCM port);
#endif #endif

View file

@ -110,6 +110,11 @@ static SCM sym_substitute;
static SCM sym_escape; static SCM sym_escape;
/* We have to serialize operations on any given iconv descriptor. */
static scm_i_pthread_mutex_t iconv_lock = SCM_I_PTHREAD_MUTEX_INITIALIZER;
/* See Unicode 8.0 section 5.22, "Best Practice for U+FFFD /* See Unicode 8.0 section 5.22, "Best Practice for U+FFFD
@ -675,12 +680,15 @@ scm_c_make_port_with_encoding (scm_t_port_type *ptob, unsigned long mode_bits,
pt->encoding = encoding; pt->encoding = encoding;
pt->conversion_strategy = conversion_strategy; pt->conversion_strategy = conversion_strategy;
pt->file_name = SCM_BOOL_F; pt->file_name = SCM_BOOL_F;
pt->iconv_descriptors = NULL;
pt->position = scm_cons (SCM_INUM0, SCM_INUM0); pt->position = scm_cons (SCM_INUM0, SCM_INUM0);
pt->at_stream_start_for_bom_read = 1; pt->at_stream_start_for_bom_read = 1;
pt->at_stream_start_for_bom_write = 1; pt->at_stream_start_for_bom_write = 1;
pt->precise_encoding = SCM_BOOL_F;
pt->input_cd = (iconv_t) -1;
pt->output_cd = (iconv_t) -1;
pt->alist = SCM_EOL; pt->alist = SCM_EOL;
if (SCM_PORT_TYPE (ret)->flags & SCM_PORT_TYPE_NEEDS_CLOSE_ON_GC) if (SCM_PORT_TYPE (ret)->flags & SCM_PORT_TYPE_NEEDS_CLOSE_ON_GC)
@ -770,12 +778,6 @@ SCM_DEFINE (scm_eof_object_p, "eof-object?", 1, 0, 0,
/* Closing ports. */ /* Closing ports. */
static void close_iconv_descriptors (scm_t_iconv_descriptors *id);
/* scm_close_port
* Call the close operation on a port object.
* see also scm_close.
*/
SCM_DEFINE (scm_close_port, "close-port", 1, 0, 0, SCM_DEFINE (scm_close_port, "close-port", 1, 0, 0,
(SCM port), (SCM port),
"Close the specified port object. Return @code{#t} if it\n" "Close the specified port object. Return @code{#t} if it\n"
@ -809,13 +811,17 @@ SCM_DEFINE (scm_close_port, "close-port", 1, 0, 0,
should be resilient to non-local exits. */ should be resilient to non-local exits. */
SCM_PORT_TYPE (port)->close (port); SCM_PORT_TYPE (port)->close (port);
if (pt->iconv_descriptors) scm_i_pthread_mutex_lock (&iconv_lock);
if (scm_is_true (pt->precise_encoding))
{ {
/* If we don't get here, the iconv_descriptors finalizer will if (pt->input_cd != (iconv_t) -1)
clean up. */ iconv_close (pt->input_cd);
close_iconv_descriptors (pt->iconv_descriptors); if (pt->output_cd != (iconv_t) -1)
pt->iconv_descriptors = NULL; iconv_close (pt->output_cd);
pt->precise_encoding = SCM_BOOL_F;
pt->input_cd = pt->output_cd = (iconv_t) -1;
} }
scm_i_pthread_mutex_unlock (&iconv_lock);
return SCM_BOOL_T; return SCM_BOOL_T;
} }
@ -979,51 +985,53 @@ static const unsigned char scm_utf16le_bom[2] = {0xFF, 0xFE};
static const unsigned char scm_utf32be_bom[4] = {0x00, 0x00, 0xFE, 0xFF}; static const unsigned char scm_utf32be_bom[4] = {0x00, 0x00, 0xFE, 0xFF};
static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00}; static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00};
/* Called with the iconv lock. Will release the lock before throwing
any error. */
static void static void
finalize_iconv_descriptors (void *ptr, void *data) prepare_iconv_descriptors (SCM port, SCM precise_encoding)
{ {
close_iconv_descriptors (ptr); scm_t_port *pt = SCM_PORT (port);
}
static scm_t_iconv_descriptors *
open_iconv_descriptors (SCM precise_encoding, int reading, int writing)
{
const char *encoding;
scm_t_iconv_descriptors *id;
iconv_t input_cd, output_cd; iconv_t input_cd, output_cd;
const char *encoding;
size_t i; size_t i;
input_cd = (iconv_t) -1; /* If the specified encoding is UTF-16 or UTF-32, then default to
output_cd = (iconv_t) -1; big-endian byte order. This fallback isn't necessary if you read
on the port before writing to it, as the read will sniff the BOM if
any and specialize the encoding; see the manual. */
if (scm_is_eq (precise_encoding, sym_UTF_16))
precise_encoding = sym_UTF_16BE;
else if (scm_is_eq (precise_encoding, sym_UTF_32))
precise_encoding = sym_UTF_32BE;
if (scm_is_eq (pt->precise_encoding, precise_encoding))
return;
input_cd = output_cd = (iconv_t) -1;
if (!scm_is_symbol (precise_encoding))
goto invalid_encoding;
encoding = scm_i_symbol_chars (precise_encoding); encoding = scm_i_symbol_chars (precise_encoding);
for (i = 0; encoding[i]; i++) for (i = 0; encoding[i]; i++)
if (encoding[i] > 127) if (encoding[i] > 127)
goto invalid_encoding; goto invalid_encoding;
if (reading) /* Open a iconv conversion descriptors between ENCODING and UTF-8. We
choose UTF-8, not UTF-32, because iconv implementations can
typically convert from anything to UTF-8, but not to UTF-32 (see
http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html,
for more details). */
if (SCM_INPUT_PORT_P (port))
{ {
/* Open an input iconv conversion descriptor, from ENCODING
to UTF-8. We choose UTF-8, not UTF-32, because iconv
implementations can typically convert from anything to
UTF-8, but not to UTF-32 (see
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
/* Assume opening an iconv descriptor causes about 16 KB of
allocation. */
scm_gc_register_allocation (16 * 1024);
input_cd = iconv_open ("UTF-8", encoding); input_cd = iconv_open ("UTF-8", encoding);
if (input_cd == (iconv_t) -1) if (input_cd == (iconv_t) -1)
goto invalid_encoding; goto invalid_encoding;
} }
if (writing) if (SCM_OUTPUT_PORT_P (port))
{ {
/* Assume opening an iconv descriptor causes about 16 KB of
allocation. */
scm_gc_register_allocation (16 * 1024);
output_cd = iconv_open (encoding, "UTF-8"); output_cd = iconv_open (encoding, "UTF-8");
if (output_cd == (iconv_t) -1) if (output_cd == (iconv_t) -1)
{ {
@ -1033,55 +1041,27 @@ open_iconv_descriptors (SCM precise_encoding, int reading, int writing)
} }
} }
id = scm_gc_malloc_pointerless (sizeof (*id), "iconv descriptors"); if (pt->input_cd != (iconv_t) -1)
id->precise_encoding = precise_encoding; iconv_close (pt->input_cd);
id->input_cd = input_cd; if (pt->output_cd != (iconv_t) -1)
id->output_cd = output_cd; iconv_close (pt->output_cd);
/* Register a finalizer to close the descriptors. */ pt->precise_encoding = precise_encoding;
scm_i_set_finalizer (id, finalize_iconv_descriptors, NULL); pt->input_cd = input_cd;
pt->output_cd = output_cd;
return id; /* Make sure this port has a finalizer. */
scm_i_set_finalizer (SCM2PTR (port), finalize_port, NULL);
return;
invalid_encoding: invalid_encoding:
scm_i_pthread_mutex_unlock (&iconv_lock);
scm_misc_error ("open_iconv_descriptors", scm_misc_error ("open_iconv_descriptors",
"invalid or unknown character encoding ~s", "invalid or unknown character encoding ~s",
scm_list_1 (precise_encoding)); scm_list_1 (precise_encoding));
} }
static void
close_iconv_descriptors (scm_t_iconv_descriptors *id)
{
if (id->input_cd != (iconv_t) -1)
iconv_close (id->input_cd);
if (id->output_cd != (iconv_t) -1)
iconv_close (id->output_cd);
id->input_cd = (void *) -1;
id->output_cd = (void *) -1;
}
static void
prepare_iconv_descriptors (SCM port, SCM encoding)
{
scm_t_port *pt = SCM_PORT (port);
scm_t_iconv_descriptors *desc = pt->iconv_descriptors;
/* If the specified encoding is UTF-16 or UTF-32, then default to
big-endian byte order. This fallback isn't necessary if you read
on the port before writing to it, as the read will sniff the BOM if
any and specialize the encoding; see the manual. */
if (scm_is_eq (encoding, sym_UTF_16))
encoding = sym_UTF_16BE;
else if (scm_is_eq (encoding, sym_UTF_32))
encoding = sym_UTF_32BE;
if (desc && scm_is_eq (desc->precise_encoding, encoding))
return;
pt->iconv_descriptors = open_iconv_descriptors
(encoding, SCM_INPUT_PORT_P (port), SCM_OUTPUT_PORT_P (port));
}
SCM_INTERNAL SCM scm_specialize_port_encoding_x (SCM port, SCM encoding); SCM_INTERNAL SCM scm_specialize_port_encoding_x (SCM port, SCM encoding);
SCM_DEFINE (scm_specialize_port_encoding_x, SCM_DEFINE (scm_specialize_port_encoding_x,
"specialize-port-encoding!", 2, 0, 0, "specialize-port-encoding!", 2, 0, 0,
@ -1107,33 +1087,41 @@ SCM_DEFINE (scm_specialize_port_encoding_x,
else else
SCM_OUT_OF_RANGE (2, encoding); SCM_OUT_OF_RANGE (2, encoding);
scm_i_pthread_mutex_lock (&iconv_lock);
prepare_iconv_descriptors (port, encoding); prepare_iconv_descriptors (port, encoding);
scm_i_pthread_mutex_unlock (&iconv_lock);
return SCM_UNSPECIFIED; return SCM_UNSPECIFIED;
} }
#undef FUNC_NAME #undef FUNC_NAME
scm_t_iconv_descriptors * /* Acquire the iconv lock and fill in *INPUT_CD and/or *OUTPUT_CD. */
scm_i_port_iconv_descriptors (SCM port) void
scm_port_acquire_iconv_descriptors (SCM port, iconv_t *input_cd,
iconv_t *output_cd)
{ {
scm_t_port *pt = SCM_PORT (port); scm_t_port *pt = SCM_PORT (port);
if (!pt->iconv_descriptors) scm_i_pthread_mutex_lock (&iconv_lock);
if (scm_is_false (pt->precise_encoding))
prepare_iconv_descriptors (port, pt->encoding); prepare_iconv_descriptors (port, pt->encoding);
if (input_cd)
*input_cd = pt->input_cd;
if (output_cd)
*output_cd = pt->output_cd;
}
return pt->iconv_descriptors; void
scm_port_release_iconv_descriptors (SCM port)
{
scm_i_pthread_mutex_unlock (&iconv_lock);
} }
/* The name of the encoding is itself encoded in ASCII. */ /* The name of the encoding is itself encoded in ASCII. */
void void
scm_i_set_port_encoding_x (SCM port, const char *encoding) scm_i_set_port_encoding_x (SCM port, const char *encoding)
{ {
scm_t_port *pt; scm_t_port *pt = SCM_PORT (port);
scm_t_iconv_descriptors *prev;
/* Set the character encoding for this port. */
pt = SCM_PORT (port);
prev = pt->iconv_descriptors;
/* In order to handle cases where the encoding changes mid-stream /* In order to handle cases where the encoding changes mid-stream
(e.g. within an HTTP stream, or within a file that is composed of (e.g. within an HTTP stream, or within a file that is composed of
@ -1144,9 +1132,14 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
pt->at_stream_start_for_bom_write = 1; pt->at_stream_start_for_bom_write = 1;
pt->encoding = canonicalize_encoding (encoding); pt->encoding = canonicalize_encoding (encoding);
pt->iconv_descriptors = NULL; scm_i_pthread_mutex_lock (&iconv_lock);
if (prev) if (pt->input_cd != (iconv_t) -1)
close_iconv_descriptors (prev); iconv_close (pt->input_cd);
if (pt->output_cd != (iconv_t) -1)
iconv_close (pt->output_cd);
pt->precise_encoding = SCM_BOOL_F;
pt->input_cd = pt->output_cd = (iconv_t) -1;
scm_i_pthread_mutex_unlock (&iconv_lock);
} }
SCM_DEFINE (scm_sys_port_encoding, "%port-encoding", 1, 0, 0, SCM_DEFINE (scm_sys_port_encoding, "%port-encoding", 1, 0, 0,
@ -1783,7 +1776,7 @@ SCM_DEFINE (scm_port_decode_char, "port-decode-char", 4, 0, 0,
{ {
char *input, *output; char *input, *output;
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE]; scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
scm_t_iconv_descriptors *id; iconv_t input_cd;
size_t c_start, c_count; size_t c_start, c_count;
size_t input_left, output_left, done; size_t input_left, output_left, done;
@ -1794,14 +1787,15 @@ SCM_DEFINE (scm_port_decode_char, "port-decode-char", 4, 0, 0,
SCM_ASSERT_RANGE (3, start, c_start <= SCM_BYTEVECTOR_LENGTH (bv)); SCM_ASSERT_RANGE (3, start, c_start <= SCM_BYTEVECTOR_LENGTH (bv));
SCM_ASSERT_RANGE (4, count, c_count <= SCM_BYTEVECTOR_LENGTH (bv) - c_start); SCM_ASSERT_RANGE (4, count, c_count <= SCM_BYTEVECTOR_LENGTH (bv) - c_start);
id = scm_i_port_iconv_descriptors (port);
input = (char *) SCM_BYTEVECTOR_CONTENTS (bv) + c_start; input = (char *) SCM_BYTEVECTOR_CONTENTS (bv) + c_start;
input_left = c_count; input_left = c_count;
output = (char *) utf8_buf; output = (char *) utf8_buf;
output_left = sizeof (utf8_buf); output_left = sizeof (utf8_buf);
/* FIXME: locking! */ /* FIXME: locking! */
done = iconv (id->input_cd, &input, &input_left, &output, &output_left); scm_port_acquire_iconv_descriptors (port, &input_cd, NULL);
done = iconv (input_cd, &input, &input_left, &output, &output_left);
scm_port_release_iconv_descriptors (port);
if (done == (size_t) -1) if (done == (size_t) -1)
{ {
@ -1856,7 +1850,8 @@ peek_iconv_codepoint (SCM port, size_t *len)
/* Make sure iconv descriptors have been opened even if /* Make sure iconv descriptors have been opened even if
there were no bytes, to be sure that a decoding error there were no bytes, to be sure that a decoding error
is signalled if the encoding itself was invalid. */ is signalled if the encoding itself was invalid. */
scm_i_port_iconv_descriptors (port); scm_port_acquire_iconv_descriptors (port, NULL, NULL);
scm_port_release_iconv_descriptors (port);
return EOF; return EOF;
} }
@ -2469,16 +2464,22 @@ port_clear_stream_start_for_bom_write (SCM port, enum bom_io_mode io_mode)
/* Write a BOM if appropriate. */ /* Write a BOM if appropriate. */
if (scm_is_eq (pt->encoding, sym_UTF_16)) if (scm_is_eq (pt->encoding, sym_UTF_16))
{ {
scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port); SCM precise_encoding;
if (scm_is_eq (id->precise_encoding, sym_UTF_16LE)) scm_port_acquire_iconv_descriptors (port, NULL, NULL);
precise_encoding = pt->precise_encoding;
scm_port_release_iconv_descriptors (port);
if (scm_is_eq (precise_encoding, sym_UTF_16LE))
scm_c_write (port, scm_utf16le_bom, sizeof (scm_utf16le_bom)); scm_c_write (port, scm_utf16le_bom, sizeof (scm_utf16le_bom));
else else
scm_c_write (port, scm_utf16be_bom, sizeof (scm_utf16be_bom)); scm_c_write (port, scm_utf16be_bom, sizeof (scm_utf16be_bom));
} }
else if (scm_is_eq (pt->encoding, sym_UTF_32)) else if (scm_is_eq (pt->encoding, sym_UTF_32))
{ {
scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port); SCM precise_encoding;
if (scm_is_eq (id->precise_encoding, sym_UTF_32LE)) scm_port_acquire_iconv_descriptors (port, NULL, NULL);
precise_encoding = pt->precise_encoding;
scm_port_release_iconv_descriptors (port);
if (scm_is_eq (precise_encoding, sym_UTF_32LE))
scm_c_write (port, scm_utf32le_bom, sizeof (scm_utf32le_bom)); scm_c_write (port, scm_utf32le_bom, sizeof (scm_utf32le_bom));
else else
scm_c_write (port, scm_utf32be_bom, sizeof (scm_utf32be_bom)); scm_c_write (port, scm_utf32be_bom, sizeof (scm_utf32be_bom));

View file

@ -24,7 +24,6 @@
#endif #endif
#include <errno.h> #include <errno.h>
#include <iconv.h>
#include <stdio.h> #include <stdio.h>
#include <assert.h> #include <assert.h>
@ -1026,9 +1025,7 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
scm_t_string_failed_conversion_handler strategy) scm_t_string_failed_conversion_handler strategy)
{ {
size_t printed; size_t printed;
scm_t_iconv_descriptors *id; iconv_t output_cd;
id = scm_i_port_iconv_descriptors (port);
printed = 0; printed = 0;
@ -1057,8 +1054,9 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
output = encoded_output; output = encoded_output;
output_left = sizeof (encoded_output); output_left = sizeof (encoded_output);
done = iconv (id->output_cd, &input, &input_left, scm_port_acquire_iconv_descriptors (port, NULL, &output_cd);
&output, &output_left); done = iconv (output_cd, &input, &input_left, &output, &output_left);
scm_port_release_iconv_descriptors (port);
output_len = sizeof (encoded_output) - output_left; output_len = sizeof (encoded_output) - output_left;
@ -1067,7 +1065,9 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
int errno_save = errno; int errno_save = errno;
/* Reset the `iconv' state. */ /* Reset the `iconv' state. */
iconv (id->output_cd, NULL, NULL, NULL, NULL); scm_port_acquire_iconv_descriptors (port, NULL, &output_cd);
iconv (output_cd, NULL, NULL, NULL, NULL);
scm_port_release_iconv_descriptors (port);
/* Print the OUTPUT_LEN bytes successfully converted. */ /* Print the OUTPUT_LEN bytes successfully converted. */
scm_lfwrite (encoded_output, output_len, port); scm_lfwrite (encoded_output, output_len, port);