1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-10 14:00:21 +02:00

Handle BOM around fill/flush instead of peek/put

* libguile/print.c (display_string_using_iconv): Remove BOM handling;
  this is now handled by scm_lfwrite.
* libguile/ports.c (open_iconv_descriptors): Refactor to take encoding
  as a symbol.
  (prepare_iconv_descriptors): New helper.
  (scm_i_port_iconv_descriptors): Remove scm_t_port_rw_active argument,
  and don't sniff UTF-16/UTF-32 byte orders here.  Instead BOM handlers
  will call prepare_iconv_descriptors.
  (scm_c_read_bytes): Call new port_clear_stream_start_for_bom_read
  helper.
  (port_maybe_consume_initial_byte_order_mark)
  (scm_port_maybe_consume_initial_byte_order_mark): Remove.  Leaves
  Scheme %peek-char broken but it's unused currently so that's OK.
  (peek_iconv_codepoint): Fetch iconv descriptors after doing fill-input
  because it's fill-input that will sniff the BOM.
  (peek_codepoint): Instead of handling BOM at every character, handle
  in fill-input instead.
  (maybe_consume_bom, port_clear_stream_start_for_bom_read)
  (port_clear_stream_start_for_bom_write): New helpers.
  (scm_fill_input): Slurp a BOM if needed.
  (scm_i_write): Clear the start-of-stream-for-bom-write flag.
  (scm_lfwrite): Write a BOM if needed.
This commit is contained in:
Andy Wingo 2016-05-05 13:00:19 +02:00
parent d7a111b0ec
commit 86267af8b3
3 changed files with 185 additions and 152 deletions

View file

@ -210,6 +210,13 @@ typedef enum scm_port_encoding_mode scm_t_port_encoding_mode;
cause finalizers to be registered. */
struct scm_iconv_descriptors
{
/* This is the same as pt->encoding, except if pt->encoding is UTF-16
or UTF-32, in which case this is UTF-16LE or a similar
byte-order-specialed version of UTF-16 or UTF-32. We don't re-set
pt->encoding because being just plain UTF-16 or UTF-32 has an
additional meaning, being that we should consume and produce byte
order marker codepoints as appropriate. */
SCM precise_encoding;
/* input/output iconv conversion descriptors */
void *input_cd;
void *output_cd;
@ -237,7 +244,6 @@ typedef enum scm_t_port_rw_active {
SCM_PORT_WRITE = 2
} scm_t_port_rw_active;
SCM_INTERNAL scm_t_iconv_descriptors *
scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode);
SCM_INTERNAL scm_t_iconv_descriptors * scm_i_port_iconv_descriptors (SCM port);
#endif

View file

@ -1028,63 +1028,12 @@ scm_i_set_default_port_conversion_strategy (SCM sym)
scm_fluid_set_x (SCM_VARIABLE_REF (default_conversion_strategy_var), sym);
}
/* If the next LEN bytes from PORT are equal to those in BYTES, then
return 1, else return 0. Leave the port position unchanged. */
static int
looking_at_bytes (SCM port, const unsigned char *bytes, int len)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
int i = 0;
while (i < len && scm_peek_byte_or_eof (port) == bytes[i])
{
scm_port_buffer_did_take (pt->read_buf, 1);
i++;
}
scm_unget_bytes (bytes, i, port);
return (i == len);
}
static const unsigned char scm_utf8_bom[3] = {0xEF, 0xBB, 0xBF};
static const unsigned char scm_utf16be_bom[2] = {0xFE, 0xFF};
static const unsigned char scm_utf16le_bom[2] = {0xFF, 0xFE};
static const unsigned char scm_utf32be_bom[4] = {0x00, 0x00, 0xFE, 0xFF};
static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00};
/* Decide what byte order to use for a UTF-16 port. Return "UTF-16BE"
or "UTF-16LE". MODE must be either SCM_PORT_READ or SCM_PORT_WRITE,
and specifies which operation is about to be done. The MODE
determines how we will decide the byte order. We deliberately avoid
reading from the port unless the user is about to do so. If the user
is about to read, then we look for a BOM, and if present, we use it
to determine the byte order. Otherwise we choose big endian, as
recommended by the Unicode Standard. Note that the BOM (if any) is
not consumed here. */
static SCM
decide_utf16_encoding (SCM port, scm_t_port_rw_active mode)
{
if (mode == SCM_PORT_READ
&& SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
&& looking_at_bytes (port, scm_utf16le_bom, sizeof scm_utf16le_bom))
return sym_UTF_16LE;
else
return sym_UTF_16BE;
}
/* Decide what byte order to use for a UTF-32 port. Return "UTF-32BE"
or "UTF-32LE". See the comment above 'decide_utf16_encoding' for
details. */
static SCM
decide_utf32_encoding (SCM port, scm_t_port_rw_active mode)
{
if (mode == SCM_PORT_READ
&& SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
&& looking_at_bytes (port, scm_utf32le_bom, sizeof scm_utf32le_bom))
return sym_UTF_32LE;
else
return sym_UTF_32BE;
}
static void
finalize_iconv_descriptors (void *ptr, void *data)
{
@ -1092,8 +1041,9 @@ finalize_iconv_descriptors (void *ptr, void *data)
}
static scm_t_iconv_descriptors *
open_iconv_descriptors (const char *encoding, int reading, int writing)
open_iconv_descriptors (SCM precise_encoding, int reading, int writing)
{
const char *encoding;
scm_t_iconv_descriptors *id;
iconv_t input_cd, output_cd;
size_t i;
@ -1101,6 +1051,7 @@ open_iconv_descriptors (const char *encoding, int reading, int writing)
input_cd = (iconv_t) -1;
output_cd = (iconv_t) -1;
encoding = scm_i_symbol_chars (precise_encoding);
for (i = 0; encoding[i]; i++)
if (encoding[i] > 127)
goto invalid_encoding;
@ -1138,6 +1089,7 @@ open_iconv_descriptors (const char *encoding, int reading, int writing)
}
id = scm_gc_malloc_pointerless (sizeof (*id), "iconv descriptors");
id->precise_encoding = precise_encoding;
id->input_cd = input_cd;
id->output_cd = output_cd;
@ -1147,13 +1099,9 @@ open_iconv_descriptors (const char *encoding, int reading, int writing)
return id;
invalid_encoding:
{
SCM err;
err = scm_from_latin1_string (encoding);
scm_misc_error ("open_iconv_descriptors",
"invalid or unknown character encoding ~s",
scm_list_1 (err));
}
scm_misc_error ("open_iconv_descriptors",
"invalid or unknown character encoding ~s",
scm_list_1 (precise_encoding));
}
static void
@ -1167,30 +1115,35 @@ close_iconv_descriptors (scm_t_iconv_descriptors *id)
id->output_cd = (void *) -1;
}
scm_t_iconv_descriptors *
scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode)
static void
prepare_iconv_descriptors (SCM port, SCM encoding)
{
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
scm_t_iconv_descriptors *desc = pti->iconv_descriptors;
/* If the specified encoding is UTF-16 or UTF-32, then default to
big-endian byte order. This fallback isn't necessary if you read
on the port before writing to it, as the read will sniff the BOM if
any and specialize the encoding; see the manual. */
if (scm_is_eq (encoding, sym_UTF_16))
encoding = sym_UTF_16BE;
else if (scm_is_eq (encoding, sym_UTF_32))
encoding = sym_UTF_32BE;
if (desc && scm_is_eq (desc->precise_encoding, encoding))
return;
pti->iconv_descriptors = open_iconv_descriptors
(encoding, SCM_INPUT_PORT_P (port), SCM_OUTPUT_PORT_P (port));
}
scm_t_iconv_descriptors *
scm_i_port_iconv_descriptors (SCM port)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
if (!pti->iconv_descriptors)
{
SCM precise_encoding;
/* If the specified encoding is UTF-16 or UTF-32, then make
that more precise by deciding what byte order to use. */
if (scm_is_eq (pt->encoding, sym_UTF_16))
precise_encoding = decide_utf16_encoding (port, mode);
else if (scm_is_eq (pt->encoding, sym_UTF_32))
precise_encoding = decide_utf32_encoding (port, mode);
else
precise_encoding = pt->encoding;
pti->iconv_descriptors =
open_iconv_descriptors (scm_i_symbol_chars (precise_encoding),
SCM_INPUT_PORT_P (port),
SCM_OUTPUT_PORT_P (port));
}
prepare_iconv_descriptors (port, SCM_PTAB_ENTRY (port)->encoding);
return pti->iconv_descriptors;
}
@ -1450,6 +1403,14 @@ scm_i_read_bytes (SCM port, SCM dst, size_t start, size_t count)
return filled;
}
/* In text mode, we will slurp a BOM from the beginning of a UTF-8,
UTF-16, or UTF-32 stream, and write one at the beginning of a UTF-16
or UTF-32 stream. In binary mode, we won't. The mode depends on the
caller. */
enum bom_io_mode { BOM_IO_TEXT, BOM_IO_BINARY };
static size_t port_clear_stream_start_for_bom_read (SCM, enum bom_io_mode);
static void port_clear_stream_start_for_bom_write (SCM, enum bom_io_mode);
/* Used by an application to read arbitrary number of bytes from an SCM
port. Same semantics as libc read, except that scm_c_read_bytes only
returns less than SIZE bytes if at end-of-file.
@ -1472,6 +1433,8 @@ scm_c_read_bytes (SCM port, SCM dst, size_t start, size_t count)
if (pt->rw_random)
scm_flush (port);
port_clear_stream_start_for_bom_read (port, BOM_IO_BINARY);
/* Take bytes first from the port's read buffer. */
{
size_t did_read = scm_port_buffer_take (read_buf, dst_ptr, to_read);
@ -1553,50 +1516,6 @@ scm_c_read (SCM port, void *buffer, size_t size)
}
#undef FUNC_NAME
static int
port_maybe_consume_initial_byte_order_mark (SCM port, scm_t_wchar codepoint,
size_t len)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
if (!pti->at_stream_start_for_bom_read) return 0;
/* Record that we're no longer at stream start. */
pti->at_stream_start_for_bom_read = 0;
if (pt->rw_random)
pti->at_stream_start_for_bom_write = 0;
if (codepoint != SCM_UNICODE_BOM) return 0;
/* If we just read a BOM in an encoding that recognizes them, then
silently consume it. */
if (scm_is_eq (pt->encoding, sym_UTF_8)
|| scm_is_eq (pt->encoding, sym_UTF_16)
|| scm_is_eq (pt->encoding, sym_UTF_32))
{
scm_port_buffer_did_take (pt->read_buf, len);
return 1;
}
return 0;
}
SCM_DEFINE (scm_port_maybe_consume_initial_byte_order_mark,
"port-maybe-consume-initial-byte-order-mark", 3, 0, 0,
(SCM port, SCM codepoint, SCM len),
"")
#define FUNC_NAME s_scm_port_maybe_consume_initial_byte_order_mark
{
SCM_VALIDATE_PORT (1, port);
return scm_from_bool
(port_maybe_consume_initial_byte_order_mark
(port,
SCM_CHARP (codepoint) ? SCM_CHAR (codepoint) : EOF,
scm_to_size_t (len)));
}
#undef FUNC_NAME
/* Update the line and column number of PORT after consumption of C. */
static inline void
update_port_lf (scm_t_wchar c, SCM port)
@ -1790,8 +1709,6 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
size_t input_size = 0;
id = scm_i_port_iconv_descriptors (port, SCM_PORT_READ);
for (;;)
{
SCM read_buf;
@ -1799,6 +1716,8 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
size_t input_left, output_left, done;
read_buf = scm_fill_input (port, input_size + 1);
id = scm_i_port_iconv_descriptors (port);
if (scm_port_buffer_can_take (read_buf) <= input_size)
{
if (input_size == 0)
@ -1867,12 +1786,7 @@ peek_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
else
err = peek_iconv_codepoint (port, codepoint, len);
if (SCM_LIKELY (err == 0))
{
if (port_maybe_consume_initial_byte_order_mark (port, *codepoint, *len))
return peek_codepoint (port, codepoint, len);
}
else if (scm_is_eq (pt->conversion_strategy, sym_substitute))
if (err != 0 && scm_is_eq (pt->conversion_strategy, sym_substitute))
{
*codepoint = '?';
err = 0;
@ -2352,16 +2266,138 @@ scm_flush (SCM port)
scm_i_write (port, buf);
}
/* Return number of bytes consumed, or zero if no BOM was consumed. */
static size_t
maybe_consume_bom (SCM port, const unsigned char *bom, size_t bom_len)
{
SCM read_buf;
const scm_t_uint8 *buf;
if (peek_byte_or_eof (port) != bom[0])
return 0;
/* Make sure there's enough space in the buffer for a BOM. Now that
we matched the first byte, we know we're going to have to read this
many bytes anyway. */
read_buf = scm_fill_input (port, bom_len);
buf = scm_port_buffer_take_pointer (read_buf);
if (scm_port_buffer_can_take (read_buf) < bom_len)
return 0;
if (memcmp (buf, bom, bom_len) != 0)
return 0;
scm_port_buffer_did_take (read_buf, bom_len);
return bom_len;
}
static size_t
port_clear_stream_start_for_bom_read (SCM port, enum bom_io_mode io_mode)
{
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
scm_t_port *pt;
if (!pti->at_stream_start_for_bom_read)
return 0;
/* Maybe slurp off a byte-order marker. */
pt = SCM_PTAB_ENTRY (port);
pti->at_stream_start_for_bom_read = 0;
if (pt->rw_random)
pti->at_stream_start_for_bom_write = 0;
if (io_mode == BOM_IO_BINARY)
return 0;
if (scm_is_eq (pt->encoding, sym_UTF_8))
return maybe_consume_bom (port, scm_utf8_bom, sizeof (scm_utf8_bom));
if (scm_is_eq (pt->encoding, sym_UTF_16))
{
if (maybe_consume_bom (port, scm_utf16le_bom, sizeof (scm_utf16le_bom)))
{
prepare_iconv_descriptors (port, sym_UTF_16LE);
return 2;
}
if (maybe_consume_bom (port, scm_utf16be_bom, sizeof (scm_utf16be_bom)))
{
prepare_iconv_descriptors (port, sym_UTF_16BE);
return 2;
}
/* Big-endian by default. */
prepare_iconv_descriptors (port, sym_UTF_16BE);
return 0;
}
if (scm_is_eq (pt->encoding, sym_UTF_32))
{
if (maybe_consume_bom (port, scm_utf32le_bom, sizeof (scm_utf32le_bom)))
{
/* Big-endian by default. */
prepare_iconv_descriptors (port, sym_UTF_32LE);
return 4;
}
if (maybe_consume_bom (port, scm_utf32be_bom, sizeof (scm_utf32be_bom)))
{
prepare_iconv_descriptors (port, sym_UTF_32BE);
return 4;
}
/* Big-endian by default. */
prepare_iconv_descriptors (port, sym_UTF_32BE);
return 0;
}
return 0;
}
static void
port_clear_stream_start_for_bom_write (SCM port, enum bom_io_mode io_mode)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
if (!pti->at_stream_start_for_bom_write)
return;
/* Record that we're no longer at stream start. */
pti->at_stream_start_for_bom_write = 0;
if (pt->rw_random)
pti->at_stream_start_for_bom_read = 0;
/* Write a BOM if appropriate. */
if (scm_is_eq (pt->encoding, sym_UTF_16))
{
scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
if (scm_is_eq (id->precise_encoding, sym_UTF_16LE))
scm_c_write (port, scm_utf16le_bom, sizeof (scm_utf16le_bom));
else
scm_c_write (port, scm_utf16be_bom, sizeof (scm_utf16be_bom));
}
else if (scm_is_eq (pt->encoding, sym_UTF_32))
{
scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
if (scm_is_eq (id->precise_encoding, sym_UTF_32LE))
scm_c_write (port, scm_utf32le_bom, sizeof (scm_utf32le_bom));
else
scm_c_write (port, scm_utf32be_bom, sizeof (scm_utf32be_bom));
}
}
SCM
scm_fill_input (SCM port, size_t minimum_size)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
SCM read_buf = pt->read_buf;
size_t buffered = scm_port_buffer_can_take (read_buf);
SCM read_buf;
size_t buffered;
if (minimum_size == 0)
minimum_size = 1;
port_clear_stream_start_for_bom_read (port, BOM_IO_TEXT);
read_buf = pt->read_buf;
buffered = scm_port_buffer_can_take (read_buf);
if (buffered >= minimum_size
|| scm_is_true (scm_port_buffer_has_eof_p (read_buf)))
return read_buf;
@ -2525,6 +2561,8 @@ scm_i_write (SCM port, SCM buf)
{
size_t start, count;
port_clear_stream_start_for_bom_write (port, BOM_IO_BINARY);
/* Update cursors before attempting to write, assuming that I/O errors
are sticky. That way if the write throws an error, causing the
computation to abort, and possibly causing the port to be collected
@ -2633,6 +2671,11 @@ scm_lfwrite (const char *ptr, size_t size, SCM port)
{
int saved_line;
if (size == 0)
return;
port_clear_stream_start_for_bom_write (port, BOM_IO_TEXT);
scm_c_write (port, ptr, size);
saved_line = SCM_LINUM (port);

View file

@ -1027,24 +1027,8 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
{
size_t printed;
scm_t_iconv_descriptors *id;
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
id = scm_i_port_iconv_descriptors (port, SCM_PORT_WRITE);
if (SCM_UNLIKELY (pti->at_stream_start_for_bom_write && len > 0))
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
/* Record that we're no longer at stream start. */
pti->at_stream_start_for_bom_write = 0;
if (pt->rw_random)
pti->at_stream_start_for_bom_read = 0;
/* Write a BOM if appropriate. */
if (SCM_UNLIKELY (scm_is_eq (pt->encoding, sym_UTF_16)
|| scm_is_eq (pt->encoding, sym_UTF_32)))
display_character (SCM_UNICODE_BOM, port, iconveh_error);
}
id = scm_i_port_iconv_descriptors (port);
printed = 0;