mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-06-09 13:30:26 +02:00
Rewrite get_iconv_codepoint to fix a bug involving byte-order marks.
* libguile/ports.c (get_iconv_codepoint): Rewrite to fix a bug and improve efficiency and clarity. Previously, it incorrectly assumed that iconv would never consume input without producing output, which led to a buffer overrun and subsequent assertion failure. This happens when a byte-order mark is consumed by iconv at the beginning of the stream when using the UTF-16 or UTF-32 encodings.
* test-suite/tests/ports.test (unicode byte-order marks (BOMs)): Add tests.
This commit is contained in:
parent
8a2b596579
commit
1ee237d9a1
2 changed files with 140 additions and 40 deletions
|
@ -1306,65 +1306,73 @@ static int
|
||||||
get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
|
get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||||
char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
||||||
{
|
{
|
||||||
scm_t_iconv_descriptors *id;
|
scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
|
||||||
int err, byte_read;
|
|
||||||
size_t bytes_consumed, output_size;
|
|
||||||
char *output;
|
|
||||||
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
|
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
|
||||||
|
size_t input_size = 0;
|
||||||
|
|
||||||
id = scm_i_port_iconv_descriptors (port);
|
for (;;)
|
||||||
|
|
||||||
for (output_size = 0, output = (char *) utf8_buf,
|
|
||||||
bytes_consumed = 0, err = 0;
|
|
||||||
err == 0 && output_size == 0
|
|
||||||
&& (bytes_consumed == 0 || byte_read != EOF);
|
|
||||||
bytes_consumed++)
|
|
||||||
{
|
{
|
||||||
char *input;
|
int byte_read;
|
||||||
|
char *input, *output;
|
||||||
size_t input_left, output_left, done;
|
size_t input_left, output_left, done;
|
||||||
|
|
||||||
byte_read = scm_get_byte_or_eof (port);
|
byte_read = scm_get_byte_or_eof (port);
|
||||||
if (byte_read == EOF)
|
if (SCM_UNLIKELY (byte_read == EOF))
|
||||||
{
|
{
|
||||||
if (bytes_consumed == 0)
|
if (SCM_LIKELY (input_size == 0))
|
||||||
{
|
{
|
||||||
*codepoint = (scm_t_wchar) EOF;
|
*codepoint = (scm_t_wchar) EOF;
|
||||||
*len = 0;
|
*len = input_size;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
continue;
|
/* EOF found in the middle of a multibyte character. */
|
||||||
|
return EILSEQ;
|
||||||
}
|
}
|
||||||
|
|
||||||
buf[bytes_consumed] = byte_read;
|
buf[input_size++] = byte_read;
|
||||||
|
|
||||||
input = buf;
|
input = buf;
|
||||||
input_left = bytes_consumed + 1;
|
input_left = input_size;
|
||||||
|
output = (char *) utf8_buf;
|
||||||
output_left = sizeof (utf8_buf);
|
output_left = sizeof (utf8_buf);
|
||||||
|
|
||||||
done = iconv (id->input_cd, &input, &input_left, &output, &output_left);
|
done = iconv (id->input_cd, &input, &input_left, &output, &output_left);
|
||||||
|
|
||||||
if (done == (size_t) -1)
|
if (done == (size_t) -1)
|
||||||
{
|
{
|
||||||
err = errno;
|
int err = errno;
|
||||||
if (err == EINVAL)
|
if (SCM_LIKELY (err == EINVAL))
|
||||||
/* Missing input: keep trying. */
|
/* The input byte sequence did not form a complete
|
||||||
err = 0;
|
character. Read another byte and try again. */
|
||||||
|
continue;
|
||||||
|
else
|
||||||
|
return err;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
output_size = sizeof (utf8_buf) - output_left;
|
{
|
||||||
|
size_t output_size = sizeof (utf8_buf) - output_left;
|
||||||
|
if (SCM_LIKELY (output_size > 0))
|
||||||
|
{
|
||||||
|
/* iconv generated output. Convert the UTF8_BUF sequence
|
||||||
|
to a Unicode code point. */
|
||||||
|
*codepoint = utf8_to_codepoint (utf8_buf, output_size);
|
||||||
|
*len = input_size;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* iconv consumed some bytes without producing any output.
|
||||||
|
Most likely this means that a Unicode byte-order mark
|
||||||
|
(BOM) was consumed, which should not be included in the
|
||||||
|
returned buf. Shift any remaining bytes to the beginning
|
||||||
|
of buf, and continue the loop. */
|
||||||
|
memmove (buf, input, input_left);
|
||||||
|
input_size = input_left;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (SCM_UNLIKELY (output_size == 0))
|
|
||||||
/* An unterminated sequence. */
|
|
||||||
err = EILSEQ;
|
|
||||||
else if (SCM_LIKELY (err == 0))
|
|
||||||
{
|
|
||||||
/* Convert the UTF8_BUF sequence to a Unicode code point. */
|
|
||||||
*codepoint = utf8_to_codepoint (utf8_buf, output_size);
|
|
||||||
*len = bytes_consumed;
|
|
||||||
}
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF
|
/* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF
|
||||||
|
|
|
@ -1149,6 +1149,98 @@
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
(define (bv-read-test encoding bv)
|
||||||
|
(let ((port (open-bytevector-input-port bv)))
|
||||||
|
(set-port-encoding! port encoding)
|
||||||
|
(read-string port)))
|
||||||
|
|
||||||
|
(with-test-prefix "unicode byte-order marks (BOMs)"
|
||||||
|
|
||||||
|
(pass-if-equal "BOM not discarded from Latin-1 stream"
|
||||||
|
"\xEF\xBB\xBF\x61"
|
||||||
|
(bv-read-test "ISO-8859-1" #vu8(#xEF #xBB #xBF #x61)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM not discarded from Latin-2 stream"
|
||||||
|
"\u010F\u0165\u017C\x61"
|
||||||
|
(bv-read-test "ISO-8859-2" #vu8(#xEF #xBB #xBF #x61)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM not discarded from UTF-16BE stream"
|
||||||
|
"\uFEFF\x61"
|
||||||
|
(bv-read-test "UTF-16BE" #vu8(#xFE #xFF #x00 #x61)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM not discarded from UTF-16LE stream"
|
||||||
|
"\uFEFF\x61"
|
||||||
|
(bv-read-test "UTF-16LE" #vu8(#xFF #xFE #x61 #x00)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM not discarded from UTF-32BE stream"
|
||||||
|
"\uFEFF\x61"
|
||||||
|
(bv-read-test "UTF-32BE" #vu8(#x00 #x00 #xFE #xFF
|
||||||
|
#x00 #x00 #x00 #x61)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM not discarded from UTF-32LE stream"
|
||||||
|
"\uFEFF\x61"
|
||||||
|
(bv-read-test "UTF-32LE" #vu8(#xFF #xFE #x00 #x00
|
||||||
|
#x61 #x00 #x00 #x00)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM discarded from start of UTF-16 stream (BE)"
|
||||||
|
"a"
|
||||||
|
(bv-read-test "UTF-16" #vu8(#xFE #xFF #x00 #x61)))
|
||||||
|
|
||||||
|
(pass-if-equal "Only one BOM discarded from start of UTF-16 stream (BE)"
|
||||||
|
"\uFEFFa"
|
||||||
|
(bv-read-test "UTF-16" #vu8(#xFE #xFF #xFE #xFF #x00 #x61)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM not discarded unless at start of UTF-16 stream"
|
||||||
|
"a\uFEFFb"
|
||||||
|
(let ((be (bv-read-test "UTF-16" #vu8(#x00 #x61 #xFE #xFF #x00 #x62)))
|
||||||
|
(le (bv-read-test "UTF-16" #vu8(#x61 #x00 #xFF #xFE #x62 #x00))))
|
||||||
|
(if (char=? #\a (string-ref be 0))
|
||||||
|
be
|
||||||
|
le)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM discarded from start of UTF-16 stream (LE)"
|
||||||
|
"a"
|
||||||
|
(bv-read-test "UTF-16" #vu8(#xFF #xFE #x61 #x00)))
|
||||||
|
|
||||||
|
(pass-if-equal "Only one BOM discarded from start of UTF-16 stream (LE)"
|
||||||
|
"\uFEFFa"
|
||||||
|
(bv-read-test "UTF-16" #vu8(#xFF #xFE #xFF #xFE #x61 #x00)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM discarded from start of UTF-32 stream (BE)"
|
||||||
|
"a"
|
||||||
|
(bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF #x00 #x00 #x00 #x61)))
|
||||||
|
|
||||||
|
(pass-if-equal "Only one BOM discarded from start of UTF-32 stream (BE)"
|
||||||
|
"\uFEFFa"
|
||||||
|
(bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF
|
||||||
|
#x00 #x00 #xFE #xFF
|
||||||
|
#x00 #x00 #x00 #x61)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM not discarded unless at start of UTF-32 stream"
|
||||||
|
"a\uFEFFb"
|
||||||
|
(let ((be (bv-read-test "UTF-32" #vu8(#x00 #x00 #x00 #x61
|
||||||
|
#x00 #x00 #xFE #xFF
|
||||||
|
#x00 #x00 #x00 #x62)))
|
||||||
|
(le (bv-read-test "UTF-32" #vu8(#x61 #x00 #x00 #x00
|
||||||
|
#xFF #xFE #x00 #x00
|
||||||
|
#x62 #x00 #x00 #x00))))
|
||||||
|
(if (char=? #\a (string-ref be 0))
|
||||||
|
be
|
||||||
|
le)))
|
||||||
|
|
||||||
|
(pass-if-equal "BOM discarded from start of UTF-32 stream (LE)"
|
||||||
|
"a"
|
||||||
|
(bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00
|
||||||
|
#x61 #x00 #x00 #x00)))
|
||||||
|
|
||||||
|
(pass-if-equal "Only one BOM discarded from start of UTF-32 stream (LE)"
|
||||||
|
"\uFEFFa"
|
||||||
|
(bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00
|
||||||
|
#xFF #xFE #x00 #x00
|
||||||
|
#x61 #x00 #x00 #x00))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
(define-syntax-rule (with-load-path path body ...)
|
(define-syntax-rule (with-load-path path body ...)
|
||||||
(let ((new path)
|
(let ((new path)
|
||||||
(old %load-path))
|
(old %load-path))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue