From 1ee237d9a159e0e9a995ecb9fea24e1d39a7c5e1 Mon Sep 17 00:00:00 2001
From: Mark H Weaver <mhw@netris.org>
Date: Tue, 2 Apr 2013 17:26:37 -0400
Subject: [PATCH] Rewrite get_iconv_codepoint to fix a bug involving byte-order
 marks.

* libguile/ports.c (get_iconv_codepoint): Rewrite to fix a bug and
  improve efficiency and clarity.  Previously, it incorrectly assumed
  that iconv would never consume input without producing output, which
  led to a buffer overrun and subsequent assertion failure.  iconv
  consumes input without producing output when it reads a byte-order
  mark at the beginning of a stream in the UTF-16 or UTF-32 encodings.

* test-suite/tests/ports.test (unicode byte-order marks (BOMs)):
  Add tests.
---
 libguile/ports.c            | 88 +++++++++++++++++++----------------
 test-suite/tests/ports.test | 92 +++++++++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 40 deletions(-)

diff --git a/libguile/ports.c b/libguile/ports.c
index ee14ca55a..2170d967b 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1306,65 +1306,73 @@ static int
 get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
 		     char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
 {
-  scm_t_iconv_descriptors *id;
-  int err, byte_read;
-  size_t bytes_consumed, output_size;
-  char *output;
+  scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
   scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
+  size_t input_size = 0;
 
-  id = scm_i_port_iconv_descriptors (port);
-
-  for (output_size = 0, output = (char *) utf8_buf,
-	 bytes_consumed = 0, err = 0;
-       err == 0 && output_size == 0
-	 && (bytes_consumed == 0 || byte_read != EOF);
-       bytes_consumed++)
+  for (;;)
     {
-      char *input;
+      int byte_read;
+      char *input, *output;
       size_t input_left, output_left, done;
 
       byte_read = scm_get_byte_or_eof (port);
-      if (byte_read == EOF)
+      if (SCM_UNLIKELY (byte_read == EOF))
 	{
-	  if (bytes_consumed == 0)
-	    {
-	      *codepoint = (scm_t_wchar) EOF;
-	      *len = 0;
-	      return 0;
-	    }
-	  else
-	    continue;
+          if (SCM_LIKELY (input_size == 0))
+            {
+              *codepoint = (scm_t_wchar) EOF;
+              *len = input_size;
+              return 0;
+            }
+          else
+            /* EOF found in the middle of a multibyte character. */
+            return EILSEQ;
 	}
 
-      buf[bytes_consumed] = byte_read;
+      buf[input_size++] = byte_read;
 
       input = buf;
-      input_left = bytes_consumed + 1;
+      input_left = input_size;
+      output = (char *) utf8_buf;
       output_left = sizeof (utf8_buf);
 
       done = iconv (id->input_cd, &input, &input_left, &output, &output_left);
+
       if (done == (size_t) -1)
 	{
-	  err = errno;
-	  if (err == EINVAL)
-	    /* Missing input: keep trying.  */
-	    err = 0;
+	  int err = errno;
+	  if (SCM_LIKELY (err == EINVAL))
+            /* The input byte sequence did not form a complete
+               character.  Read another byte and try again. */
+            continue;
+          else
+            return err;
 	}
       else
-	output_size = sizeof (utf8_buf) - output_left;
+        {
+          size_t output_size = sizeof (utf8_buf) - output_left;
+          if (SCM_LIKELY (output_size > 0))
+            {
+              /* iconv generated output.  Convert the UTF8_BUF sequence
+                 to a Unicode code point.  */
+              *codepoint = utf8_to_codepoint (utf8_buf, output_size);
+              *len = input_size;
+              return 0;
+            }
+          else
+            {
+              /* iconv consumed some bytes without producing any output.
+                 Most likely this means that a Unicode byte-order mark
+                 (BOM) was consumed, which should not be included in the
+                 returned buf.  Shift any remaining bytes to the beginning
+                 of buf, and continue the loop. */
+              memmove (buf, input, input_left);
+              input_size = input_left;
+              continue;
+            }
+        }
     }
-
-  if (SCM_UNLIKELY (output_size == 0))
-    /* An unterminated sequence.  */
-    err = EILSEQ;
-  else if (SCM_LIKELY (err == 0))
-    {
-      /* Convert the UTF8_BUF sequence to a Unicode code point.  */
-      *codepoint = utf8_to_codepoint (utf8_buf, output_size);
-      *len = bytes_consumed;
-    }
-
-  return err;
 }
 
 /* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index 886ab2418..c73e6be10 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -1149,6 +1149,98 @@
 
 
 
+(define (bv-read-test encoding bv)
+  (let ((port (open-bytevector-input-port bv)))
+    (set-port-encoding! port encoding)
+    (read-string port)))
+
+(with-test-prefix "unicode byte-order marks (BOMs)"
+
+  (pass-if-equal "BOM not discarded from Latin-1 stream"
+      "\xEF\xBB\xBF\x61"
+    (bv-read-test "ISO-8859-1" #vu8(#xEF #xBB #xBF #x61)))
+
+  (pass-if-equal "BOM not discarded from Latin-2 stream"
+      "\u010F\u0165\u017C\x61"
+    (bv-read-test "ISO-8859-2" #vu8(#xEF #xBB #xBF #x61)))
+
+  (pass-if-equal "BOM not discarded from UTF-16BE stream"
+      "\uFEFF\x61"
+    (bv-read-test "UTF-16BE" #vu8(#xFE #xFF #x00 #x61)))
+
+  (pass-if-equal "BOM not discarded from UTF-16LE stream"
+      "\uFEFF\x61"
+    (bv-read-test "UTF-16LE" #vu8(#xFF #xFE #x61 #x00)))
+
+  (pass-if-equal "BOM not discarded from UTF-32BE stream"
+      "\uFEFF\x61"
+    (bv-read-test "UTF-32BE" #vu8(#x00 #x00 #xFE #xFF
+                                  #x00 #x00 #x00 #x61)))
+
+  (pass-if-equal "BOM not discarded from UTF-32LE stream"
+      "\uFEFF\x61"
+    (bv-read-test "UTF-32LE" #vu8(#xFF #xFE #x00 #x00
+                                  #x61 #x00 #x00 #x00)))
+
+  (pass-if-equal "BOM discarded from start of UTF-16 stream (BE)"
+      "a"
+    (bv-read-test "UTF-16" #vu8(#xFE #xFF #x00 #x61)))
+
+  (pass-if-equal "Only one BOM discarded from start of UTF-16 stream (BE)"
+      "\uFEFFa"
+    (bv-read-test "UTF-16" #vu8(#xFE #xFF #xFE #xFF #x00 #x61)))
+
+  (pass-if-equal "BOM not discarded unless at start of UTF-16 stream"
+      "a\uFEFFb"
+    (let ((be (bv-read-test "UTF-16" #vu8(#x00 #x61 #xFE #xFF #x00 #x62)))
+          (le (bv-read-test "UTF-16" #vu8(#x61 #x00 #xFF #xFE #x62 #x00))))
+      (if (char=? #\a (string-ref be 0))
+          be
+          le)))
+
+  (pass-if-equal "BOM discarded from start of UTF-16 stream (LE)"
+      "a"
+    (bv-read-test "UTF-16" #vu8(#xFF #xFE #x61 #x00)))
+
+  (pass-if-equal "Only one BOM discarded from start of UTF-16 stream (LE)"
+      "\uFEFFa"
+    (bv-read-test "UTF-16" #vu8(#xFF #xFE #xFF #xFE #x61 #x00)))
+
+  (pass-if-equal "BOM discarded from start of UTF-32 stream (BE)"
+      "a"
+    (bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF #x00 #x00 #x00 #x61)))
+
+  (pass-if-equal "Only one BOM discarded from start of UTF-32 stream (BE)"
+      "\uFEFFa"
+    (bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF
+                                #x00 #x00 #xFE #xFF
+                                #x00 #x00 #x00 #x61)))
+
+  (pass-if-equal "BOM not discarded unless at start of UTF-32 stream"
+      "a\uFEFFb"
+    (let ((be (bv-read-test "UTF-32" #vu8(#x00 #x00 #x00 #x61
+                                          #x00 #x00 #xFE #xFF
+                                          #x00 #x00 #x00 #x62)))
+          (le (bv-read-test "UTF-32" #vu8(#x61 #x00 #x00 #x00
+                                          #xFF #xFE #x00 #x00
+                                          #x62 #x00 #x00 #x00))))
+      (if (char=? #\a (string-ref be 0))
+          be
+          le)))
+
+  (pass-if-equal "BOM discarded from start of UTF-32 stream (LE)"
+      "a"
+    (bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00
+                                #x61 #x00 #x00 #x00)))
+
+  (pass-if-equal "Only one BOM discarded from start of UTF-32 stream (LE)"
+      "\uFEFFa"
+    (bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00
+                                #xFF #xFE #x00 #x00
+                                #x61 #x00 #x00 #x00))))
+
+
+
 (define-syntax-rule (with-load-path path body ...)
   (let ((new path)
         (old %load-path))