mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-05-28 07:50:20 +02:00
Special-case UTF-8 ports to bypass `iconv' entirely.
* libguile/ports.c (update_port_lf): Handle EOF.
  (get_utf8_codepoint, get_iconv_codepoint): New functions.
  (get_codepoint): Use them.
  (scm_i_set_port_encoding_x): Don't open conversion descriptors when
  ENCODING is "UTF-8".
* libguile/print.c (display_string_as_utf8, display_string_using_iconv):
  New functions.
  (display_string): Use them.
* test-suite/tests/ports.test ("string ports")[#xc2 #x41 #x42]: Add a
  note that this behavior does not conform to the Unicode 6.0.0
  recommendation.
This commit is contained in:
parent
1f78c6691f
commit
7b292a9d34
3 changed files with 287 additions and 63 deletions
258
libguile/ports.c
258
libguile/ports.c
|
@ -1057,6 +1057,7 @@ update_port_lf (scm_t_wchar c, SCM port)
|
||||||
switch (c)
|
switch (c)
|
||||||
{
|
{
|
||||||
case '\a':
|
case '\a':
|
||||||
|
case EOF:
|
||||||
break;
|
break;
|
||||||
case '\b':
|
case '\b':
|
||||||
SCM_DECCOL (port);
|
SCM_DECCOL (port);
|
||||||
|
@ -1115,23 +1116,162 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
|
||||||
return codepoint;
|
return codepoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF
|
/* Read a UTF-8 sequence from PORT. On success, return 0 and set
|
||||||
with the byte representation of the codepoint in PORT's encoding, and
|
*CODEPOINT to the codepoint that was read, fill BUF with its UTF-8
|
||||||
set *LEN to the length in bytes of that representation. Return 0 on
|
representation, and set *LEN to the length in bytes. Return
|
||||||
success and an errno value on error. */
|
`EILSEQ' on error. */
|
||||||
static int
|
static int
|
||||||
get_codepoint (SCM port, scm_t_wchar *codepoint,
|
get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||||
char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
scm_t_uint8 buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
||||||
{
|
{
|
||||||
|
#define ASSERT_NOT_EOF(b) \
|
||||||
|
if (SCM_UNLIKELY ((b) == EOF)) \
|
||||||
|
goto invalid_seq
|
||||||
|
|
||||||
|
int byte;
|
||||||
|
|
||||||
|
*len = 0;
|
||||||
|
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
if (byte == EOF)
|
||||||
|
{
|
||||||
|
*codepoint = EOF;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
buf[0] = (scm_t_uint8) byte;
|
||||||
|
*len = 1;
|
||||||
|
|
||||||
|
if (buf[0] <= 0x7f)
|
||||||
|
/* 1-byte form. */
|
||||||
|
*codepoint = buf[0];
|
||||||
|
else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
|
||||||
|
{
|
||||||
|
/* 2-byte form. */
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
|
buf[1] = (scm_t_uint8) byte;
|
||||||
|
*len = 2;
|
||||||
|
|
||||||
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
||||||
|
goto invalid_seq;
|
||||||
|
|
||||||
|
*codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
|
||||||
|
| (buf[1] & 0x3f);
|
||||||
|
}
|
||||||
|
else if ((buf[0] & 0xf0) == 0xe0)
|
||||||
|
{
|
||||||
|
/* 3-byte form. */
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
if (SCM_UNLIKELY (byte == EOF))
|
||||||
|
goto invalid_seq;
|
||||||
|
|
||||||
|
buf[1] = (scm_t_uint8) byte;
|
||||||
|
*len = 2;
|
||||||
|
|
||||||
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80
|
||||||
|
|| (buf[0] == 0xe0 && byte < 0xa0)
|
||||||
|
|| (buf[0] == 0xed && byte > 0x9f)))
|
||||||
|
{
|
||||||
|
/* Swallow the 3rd byte. */
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
*len = 3, buf[2] = byte;
|
||||||
|
goto invalid_seq;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
|
buf[2] = (scm_t_uint8) byte;
|
||||||
|
*len = 3;
|
||||||
|
|
||||||
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
||||||
|
goto invalid_seq;
|
||||||
|
|
||||||
|
*codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
|
||||||
|
| ((scm_t_wchar) buf[1] & 0x3f) << 6UL
|
||||||
|
| (buf[2] & 0x3f);
|
||||||
|
}
|
||||||
|
else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
|
||||||
|
{
|
||||||
|
/* 4-byte form. */
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
|
buf[1] = (scm_t_uint8) byte;
|
||||||
|
*len = 2;
|
||||||
|
|
||||||
|
if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
|
||||||
|
|| (buf[0] == 0xf0 && byte < 0x90)
|
||||||
|
|| (buf[0] == 0xf4 && byte > 0x8f)))
|
||||||
|
{
|
||||||
|
/* Swallow the 3rd and 4th bytes. */
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
*len = 3, buf[2] = byte;
|
||||||
|
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
*len = 4, buf[3] = byte;
|
||||||
|
goto invalid_seq;
|
||||||
|
}
|
||||||
|
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
|
buf[2] = (scm_t_uint8) byte;
|
||||||
|
*len = 3;
|
||||||
|
|
||||||
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
||||||
|
{
|
||||||
|
/* Swallow the 4th byte. */
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
*len = 4, buf[3] = byte;
|
||||||
|
goto invalid_seq;
|
||||||
|
}
|
||||||
|
|
||||||
|
byte = scm_get_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
|
buf[3] = (scm_t_uint8) byte;
|
||||||
|
*len = 4;
|
||||||
|
|
||||||
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
||||||
|
goto invalid_seq;
|
||||||
|
|
||||||
|
*codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
|
||||||
|
| ((scm_t_wchar) buf[1] & 0x3f) << 12UL
|
||||||
|
| ((scm_t_wchar) buf[2] & 0x3f) << 6UL
|
||||||
|
| (buf[3] & 0x3f);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
goto invalid_seq;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
invalid_seq:
|
||||||
|
return EILSEQ;
|
||||||
|
|
||||||
|
#undef ASSERT_NOT_EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Likewise, read a byte sequence from PORT, passing it through its
|
||||||
|
input conversion descriptor. */
|
||||||
|
static int
|
||||||
|
get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||||
|
char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
||||||
|
{
|
||||||
|
scm_t_port *pt;
|
||||||
int err, byte_read;
|
int err, byte_read;
|
||||||
size_t bytes_consumed, output_size;
|
size_t bytes_consumed, output_size;
|
||||||
char *output;
|
char *output;
|
||||||
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
|
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
|
||||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
|
||||||
|
|
||||||
if (SCM_UNLIKELY (pt->input_cd == (iconv_t) -1))
|
pt = SCM_PTAB_ENTRY (port);
|
||||||
/* Initialize the conversion descriptors. */
|
|
||||||
scm_i_set_port_encoding_x (port, pt->encoding);
|
|
||||||
|
|
||||||
for (output_size = 0, output = (char *) utf8_buf,
|
for (output_size = 0, output = (char *) utf8_buf,
|
||||||
bytes_consumed = 0, err = 0;
|
bytes_consumed = 0, err = 0;
|
||||||
|
@ -1177,30 +1317,45 @@ get_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||||
if (SCM_UNLIKELY (output_size == 0))
|
if (SCM_UNLIKELY (output_size == 0))
|
||||||
/* An unterminated sequence. */
|
/* An unterminated sequence. */
|
||||||
err = EILSEQ;
|
err = EILSEQ;
|
||||||
|
else if (SCM_LIKELY (err == 0))
|
||||||
if (SCM_UNLIKELY (err != 0))
|
|
||||||
{
|
|
||||||
/* Reset the `iconv' state. */
|
|
||||||
iconv (pt->input_cd, NULL, NULL, NULL, NULL);
|
|
||||||
|
|
||||||
if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
|
|
||||||
{
|
|
||||||
*codepoint = '?';
|
|
||||||
err = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Fail when the strategy is SCM_ICONVEH_ERROR or
|
|
||||||
SCM_ICONVEH_ESCAPE_SEQUENCE (the latter doesn't make sense for
|
|
||||||
input encoding errors.) */
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
/* Convert the UTF8_BUF sequence to a Unicode code point. */
|
/* Convert the UTF8_BUF sequence to a Unicode code point. */
|
||||||
*codepoint = utf8_to_codepoint (utf8_buf, output_size);
|
*codepoint = utf8_to_codepoint (utf8_buf, output_size);
|
||||||
update_port_lf (*codepoint, port);
|
*len = bytes_consumed;
|
||||||
}
|
}
|
||||||
|
|
||||||
*len = bytes_consumed;
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF
|
||||||
|
with the byte representation of the codepoint in PORT's encoding, and
|
||||||
|
set *LEN to the length in bytes of that representation. Return 0 on
|
||||||
|
success and an errno value on error. */
|
||||||
|
static int
|
||||||
|
get_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||||
|
char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
||||||
|
{
|
||||||
|
int err;
|
||||||
|
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||||
|
|
||||||
|
if (pt->input_cd == (iconv_t) -1)
|
||||||
|
/* Initialize the conversion descriptors, if needed. */
|
||||||
|
scm_i_set_port_encoding_x (port, pt->encoding);
|
||||||
|
|
||||||
|
/* FIXME: In 2.1, add a flag to determine whether a port is UTF-8. */
|
||||||
|
if (pt->input_cd == (iconv_t) -1)
|
||||||
|
err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len);
|
||||||
|
else
|
||||||
|
err = get_iconv_codepoint (port, codepoint, buf, len);
|
||||||
|
|
||||||
|
if (SCM_LIKELY (err == 0))
|
||||||
|
update_port_lf (*codepoint, port);
|
||||||
|
else if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
|
||||||
|
{
|
||||||
|
*codepoint = '?';
|
||||||
|
err = 0;
|
||||||
|
update_port_lf (*codepoint, port);
|
||||||
|
}
|
||||||
|
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
@ -2031,28 +2186,35 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
|
||||||
if (encoding == NULL)
|
if (encoding == NULL)
|
||||||
encoding = "ISO-8859-1";
|
encoding = "ISO-8859-1";
|
||||||
|
|
||||||
pt->encoding = scm_gc_strdup (encoding, "port");
|
if (pt->encoding != encoding)
|
||||||
|
pt->encoding = scm_gc_strdup (encoding, "port");
|
||||||
|
|
||||||
if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
|
/* If ENCODING is UTF-8, then no conversion descriptor is opened
|
||||||
|
because we do I/O ourselves. This saves 100+ KiB for each
|
||||||
|
descriptor. */
|
||||||
|
if (strcmp (encoding, "UTF-8"))
|
||||||
{
|
{
|
||||||
/* Open an input iconv conversion descriptor, from ENCODING
|
if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
|
||||||
to UTF-8. We choose UTF-8, not UTF-32, because iconv
|
|
||||||
implementations can typically convert from anything to
|
|
||||||
UTF-8, but not to UTF-32 (see
|
|
||||||
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
|
|
||||||
new_input_cd = iconv_open ("UTF-8", encoding);
|
|
||||||
if (new_input_cd == (iconv_t) -1)
|
|
||||||
goto invalid_encoding;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
|
|
||||||
{
|
|
||||||
new_output_cd = iconv_open (encoding, "UTF-8");
|
|
||||||
if (new_output_cd == (iconv_t) -1)
|
|
||||||
{
|
{
|
||||||
if (new_input_cd != (iconv_t) -1)
|
/* Open an input iconv conversion descriptor, from ENCODING
|
||||||
iconv_close (new_input_cd);
|
to UTF-8. We choose UTF-8, not UTF-32, because iconv
|
||||||
goto invalid_encoding;
|
implementations can typically convert from anything to
|
||||||
|
UTF-8, but not to UTF-32 (see
|
||||||
|
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
|
||||||
|
new_input_cd = iconv_open ("UTF-8", encoding);
|
||||||
|
if (new_input_cd == (iconv_t) -1)
|
||||||
|
goto invalid_encoding;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
|
||||||
|
{
|
||||||
|
new_output_cd = iconv_open (encoding, "UTF-8");
|
||||||
|
if (new_output_cd == (iconv_t) -1)
|
||||||
|
{
|
||||||
|
if (new_input_cd != (iconv_t) -1)
|
||||||
|
iconv_close (new_input_cd);
|
||||||
|
goto invalid_encoding;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -821,31 +821,57 @@ codepoint_to_utf8 (scm_t_wchar ch, scm_t_uint8 utf8[4])
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Display the LEN codepoints in STR to PORT according to STRATEGY;
|
|
||||||
return the number of codepoints successfully displayed. If NARROW_P,
|
|
||||||
then STR is interpreted as a sequence of `char', denoting a Latin-1
|
|
||||||
string; otherwise it's interpreted as a sequence of
|
|
||||||
`scm_t_wchar'. */
|
|
||||||
static size_t
|
|
||||||
display_string (const void *str, int narrow_p,
|
|
||||||
size_t len, SCM port,
|
|
||||||
scm_t_string_failed_conversion_handler strategy)
|
|
||||||
|
|
||||||
{
|
|
||||||
#define STR_REF(s, x) \
|
#define STR_REF(s, x) \
|
||||||
(narrow_p \
|
(narrow_p \
|
||||||
? (scm_t_wchar) ((unsigned char *) (s))[x] \
|
? (scm_t_wchar) ((unsigned char *) (s))[x] \
|
||||||
: ((scm_t_wchar *) (s))[x])
|
: ((scm_t_wchar *) (s))[x])
|
||||||
|
|
||||||
|
/* Write STR to PORT as UTF-8. STR is a LEN-codepoint string; it is
|
||||||
|
narrow if NARROW_P is true, wide otherwise. Return LEN. */
|
||||||
|
static size_t
|
||||||
|
display_string_as_utf8 (const void *str, int narrow_p, size_t len,
|
||||||
|
SCM port)
|
||||||
|
{
|
||||||
|
size_t printed = 0;
|
||||||
|
|
||||||
|
while (len > printed)
|
||||||
|
{
|
||||||
|
size_t utf8_len, i;
|
||||||
|
char *input, utf8_buf[256];
|
||||||
|
|
||||||
|
/* Convert STR to UTF-8. */
|
||||||
|
for (i = printed, utf8_len = 0, input = utf8_buf;
|
||||||
|
i < len && utf8_len + 4 < sizeof (utf8_buf);
|
||||||
|
i++)
|
||||||
|
{
|
||||||
|
utf8_len += codepoint_to_utf8 (STR_REF (str, i),
|
||||||
|
(scm_t_uint8 *) input);
|
||||||
|
input = utf8_buf + utf8_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* INPUT was successfully converted, entirely; print the
|
||||||
|
result. */
|
||||||
|
scm_lfwrite (utf8_buf, utf8_len, port);
|
||||||
|
printed += i - printed;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert (printed == len);
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Convert STR through PORT's output conversion descriptor and write the
|
||||||
|
output to PORT. Return the number of codepoints written. */
|
||||||
|
static size_t
|
||||||
|
display_string_using_iconv (const void *str, int narrow_p, size_t len,
|
||||||
|
SCM port,
|
||||||
|
scm_t_string_failed_conversion_handler strategy)
|
||||||
|
{
|
||||||
size_t printed;
|
size_t printed;
|
||||||
scm_t_port *pt;
|
scm_t_port *pt;
|
||||||
|
|
||||||
pt = SCM_PTAB_ENTRY (port);
|
pt = SCM_PTAB_ENTRY (port);
|
||||||
|
|
||||||
if (SCM_UNLIKELY (pt->output_cd == (iconv_t) -1))
|
|
||||||
/* Initialize the conversion descriptors. */
|
|
||||||
scm_i_set_port_encoding_x (port, pt->encoding);
|
|
||||||
|
|
||||||
printed = 0;
|
printed = 0;
|
||||||
|
|
||||||
while (len > printed)
|
while (len > printed)
|
||||||
|
@ -928,7 +954,35 @@ display_string (const void *str, int narrow_p,
|
||||||
}
|
}
|
||||||
|
|
||||||
return printed;
|
return printed;
|
||||||
|
}
|
||||||
|
|
||||||
#undef STR_REF
|
#undef STR_REF
|
||||||
|
|
||||||
|
/* Display the LEN codepoints in STR to PORT according to STRATEGY;
|
||||||
|
return the number of codepoints successfully displayed. If NARROW_P,
|
||||||
|
then STR is interpreted as a sequence of `char', denoting a Latin-1
|
||||||
|
string; otherwise it's interpreted as a sequence of
|
||||||
|
`scm_t_wchar'. */
|
||||||
|
static size_t
|
||||||
|
display_string (const void *str, int narrow_p,
|
||||||
|
size_t len, SCM port,
|
||||||
|
scm_t_string_failed_conversion_handler strategy)
|
||||||
|
|
||||||
|
{
|
||||||
|
scm_t_port *pt;
|
||||||
|
|
||||||
|
pt = SCM_PTAB_ENTRY (port);
|
||||||
|
|
||||||
|
if (pt->output_cd == (iconv_t) -1)
|
||||||
|
/* Initialize the conversion descriptors, if needed. */
|
||||||
|
scm_i_set_port_encoding_x (port, pt->encoding);
|
||||||
|
|
||||||
|
/* FIXME: In 2.1, add a flag to determine whether a port is UTF-8. */
|
||||||
|
if (pt->output_cd == (iconv_t) -1)
|
||||||
|
return display_string_as_utf8 (str, narrow_p, len, port);
|
||||||
|
else
|
||||||
|
return display_string_using_iconv (str, narrow_p, len,
|
||||||
|
port, strategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Attempt to display CH to PORT according to STRATEGY. Return non-zero
|
/* Attempt to display CH to PORT according to STRATEGY. Return non-zero
|
||||||
|
|
|
@ -572,6 +572,14 @@
|
||||||
eof))
|
eof))
|
||||||
|
|
||||||
(test-decoding-error (#xc2 #x41 #x42) "UTF-8"
|
(test-decoding-error (#xc2 #x41 #x42) "UTF-8"
|
||||||
|
;; FIXME: This is the behavior of glibc/libiconv but it does not
|
||||||
|
;; conform to the Unicode 6.0.0 recommendation: according to it,
|
||||||
|
;; the #\A should not be swallowed (Section 3.9 reads:
|
||||||
|
;; "If the converter encounters an ill-formed UTF-8 code unit
|
||||||
|
;; sequence which starts with a valid first byte, but which does
|
||||||
|
;; not continue with valid successor bytes (see Table 3-7), it
|
||||||
|
;; must not consume the successor bytes".)
|
||||||
|
|
||||||
(error ;; 41: should be in the 80..BF range
|
(error ;; 41: should be in the 80..BF range
|
||||||
#\B
|
#\B
|
||||||
eof))
|
eof))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue