From 1ee237d9a159e0e9a995ecb9fea24e1d39a7c5e1 Mon Sep 17 00:00:00 2001 From: Mark H Weaver Date: Tue, 2 Apr 2013 17:26:37 -0400 Subject: [PATCH] Rewrite get_iconv_codepoint to fix a bug involving byte-order marks. * libguile/ports.c (get_iconv_codepoint): Rewrite to fix a bug and improve efficiency and clarity. Previously, it incorrectly assumed that iconv would never consume input without producing output, which led to a buffer overrun and subsequent assertion failure. This happens when a byte-order mark is consumed by iconv at the beginning of the stream when using the UTF-16 or UTF-32 encodings. * test-suite/tests/ports.test (unicode byte-order marks (BOMs)): Add tests. --- libguile/ports.c | 88 +++++++++++++++++++---------------- test-suite/tests/ports.test | 92 +++++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 40 deletions(-) diff --git a/libguile/ports.c b/libguile/ports.c index ee14ca55a..2170d967b 100644 --- a/libguile/ports.c +++ b/libguile/ports.c @@ -1306,65 +1306,73 @@ static int get_iconv_codepoint (SCM port, scm_t_wchar *codepoint, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len) { - scm_t_iconv_descriptors *id; - int err, byte_read; - size_t bytes_consumed, output_size; - char *output; + scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port); scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE]; + size_t input_size = 0; - id = scm_i_port_iconv_descriptors (port); - - for (output_size = 0, output = (char *) utf8_buf, - bytes_consumed = 0, err = 0; - err == 0 && output_size == 0 - && (bytes_consumed == 0 || byte_read != EOF); - bytes_consumed++) + for (;;) { - char *input; + int byte_read; + char *input, *output; size_t input_left, output_left, done; byte_read = scm_get_byte_or_eof (port); - if (byte_read == EOF) + if (SCM_UNLIKELY (byte_read == EOF)) { - if (bytes_consumed == 0) - { - *codepoint = (scm_t_wchar) EOF; - *len = 0; - return 0; - } - else - continue; + if (SCM_LIKELY (input_size == 0)) + { + *codepoint = (scm_t_wchar) EOF; + *len = input_size; + return 0; + } + else + /* EOF found in the middle of a multibyte character. */ + return EILSEQ; } - buf[bytes_consumed] = byte_read; + buf[input_size++] = byte_read; input = buf; - input_left = bytes_consumed + 1; + input_left = input_size; + output = (char *) utf8_buf; output_left = sizeof (utf8_buf); done = iconv (id->input_cd, &input, &input_left, &output, &output_left); + if (done == (size_t) -1) { - err = errno; - if (err == EINVAL) - /* Missing input: keep trying. */ - err = 0; + int err = errno; + if (SCM_LIKELY (err == EINVAL)) + /* The input byte sequence did not form a complete + character. Read another byte and try again. */ + continue; + else + return err; } else - output_size = sizeof (utf8_buf) - output_left; + { + size_t output_size = sizeof (utf8_buf) - output_left; + if (SCM_LIKELY (output_size > 0)) + { + /* iconv generated output. Convert the UTF8_BUF sequence + to a Unicode code point. */ + *codepoint = utf8_to_codepoint (utf8_buf, output_size); + *len = input_size; + return 0; + } + else + { + /* iconv consumed some bytes without producing any output. + Most likely this means that a Unicode byte-order mark + (BOM) was consumed, which should not be included in the + returned buf. Shift any remaining bytes to the beginning + of buf, and continue the loop. */ + memmove (buf, input, input_left); + input_size = input_left; + continue; + } + } } - - if (SCM_UNLIKELY (output_size == 0)) - /* An unterminated sequence. */ - err = EILSEQ; - else if (SCM_LIKELY (err == 0)) - { - /* Convert the UTF8_BUF sequence to a Unicode code point. */ - *codepoint = utf8_to_codepoint (utf8_buf, output_size); - *len = bytes_consumed; - } - - return err; } /* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test index 886ab2418..c73e6be10 100644 --- a/test-suite/tests/ports.test +++ b/test-suite/tests/ports.test @@ -1149,6 +1149,98 @@ +(define (bv-read-test encoding bv) + (let ((port (open-bytevector-input-port bv))) + (set-port-encoding! port encoding) + (read-string port))) + +(with-test-prefix "unicode byte-order marks (BOMs)" + + (pass-if-equal "BOM not discarded from Latin-1 stream" + "\xEF\xBB\xBF\x61" + (bv-read-test "ISO-8859-1" #vu8(#xEF #xBB #xBF #x61))) + + (pass-if-equal "BOM not discarded from Latin-2 stream" + "\u010F\u0165\u017C\x61" + (bv-read-test "ISO-8859-2" #vu8(#xEF #xBB #xBF #x61))) + + (pass-if-equal "BOM not discarded from UTF-16BE stream" + "\uFEFF\x61" + (bv-read-test "UTF-16BE" #vu8(#xFE #xFF #x00 #x61))) + + (pass-if-equal "BOM not discarded from UTF-16LE stream" + "\uFEFF\x61" + (bv-read-test "UTF-16LE" #vu8(#xFF #xFE #x61 #x00))) + + (pass-if-equal "BOM not discarded from UTF-32BE stream" + "\uFEFF\x61" + (bv-read-test "UTF-32BE" #vu8(#x00 #x00 #xFE #xFF + #x00 #x00 #x00 #x61))) + + (pass-if-equal "BOM not discarded from UTF-32LE stream" + "\uFEFF\x61" + (bv-read-test "UTF-32LE" #vu8(#xFF #xFE #x00 #x00 + #x61 #x00 #x00 #x00))) + + (pass-if-equal "BOM discarded from start of UTF-16 stream (BE)" + "a" + (bv-read-test "UTF-16" #vu8(#xFE #xFF #x00 #x61))) + + (pass-if-equal "Only one BOM discarded from start of UTF-16 stream (BE)" + "\uFEFFa" + (bv-read-test "UTF-16" #vu8(#xFE #xFF #xFE #xFF #x00 #x61))) + + (pass-if-equal "BOM not discarded unless at start of UTF-16 stream" + "a\uFEFFb" + (let ((be (bv-read-test "UTF-16" #vu8(#x00 #x61 #xFE #xFF #x00 #x62))) + (le (bv-read-test "UTF-16" #vu8(#x61 #x00 #xFF #xFE #x62 #x00)))) + (if (char=? #\a (string-ref be 0)) + be + le))) + + (pass-if-equal "BOM discarded from start of UTF-16 stream (LE)" + "a" + (bv-read-test "UTF-16" #vu8(#xFF #xFE #x61 #x00))) + + (pass-if-equal "Only one BOM discarded from start of UTF-16 stream (LE)" + "\uFEFFa" + (bv-read-test "UTF-16" #vu8(#xFF #xFE #xFF #xFE #x61 #x00))) + + (pass-if-equal "BOM discarded from start of UTF-32 stream (BE)" + "a" + (bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF #x00 #x00 #x00 #x61))) + + (pass-if-equal "Only one BOM discarded from start of UTF-32 stream (BE)" + "\uFEFFa" + (bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF + #x00 #x00 #xFE #xFF + #x00 #x00 #x00 #x61))) + + (pass-if-equal "BOM not discarded unless at start of UTF-32 stream" + "a\uFEFFb" + (let ((be (bv-read-test "UTF-32" #vu8(#x00 #x00 #x00 #x61 + #x00 #x00 #xFE #xFF + #x00 #x00 #x00 #x62))) + (le (bv-read-test "UTF-32" #vu8(#x61 #x00 #x00 #x00 + #xFF #xFE #x00 #x00 + #x62 #x00 #x00 #x00)))) + (if (char=? #\a (string-ref be 0)) + be + le))) + + (pass-if-equal "BOM discarded from start of UTF-32 stream (LE)" + "a" + (bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00 + #x61 #x00 #x00 #x00))) + + (pass-if-equal "Only one BOM discarded from start of UTF-32 stream (LE)" + "\uFEFFa" + (bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00 + #xFF #xFE #x00 #x00 + #x61 #x00 #x00 #x00)))) + + + (define-syntax-rule (with-load-path path body ...) (let ((new path) (old %load-path))