From 1e058add7b9568fb3a37e4fa82360d183d0a26ee Mon Sep 17 00:00:00 2001
From: Andy Wingo <wingo@pobox.com>
Date: Mon, 16 May 2016 10:44:21 +0200
Subject: [PATCH] U+FFFD is the input substitution character

* libguile/ports.c (UNICODE_REPLACEMENT_CHARACTER): New constant.
* libguile/ports.c (peek_utf8_codepoint)
  (scm_port_decode_char, peek_iconv_codepoint):
* module/ice-9/sports.scm (peek-char-and-len/utf8):
  (peek-char-and-len/iconv): Return U+FFFD when we get a decoding error
  when reading, instead of '?', in accordance with Unicode
  recommendations.
* test-suite/tests/iconv.test:
* test-suite/tests/ports.test:
* test-suite/tests/rdelim.test: Update tests.
* doc/ref/api-io.texi: Document conversion strategies for input ports.
* NEWS: Update.
---
 NEWS                         |  7 ++++
 doc/ref/api-io.texi          | 76 ++++++++++++++++++++----------------
 libguile/ports.c             | 12 ++++--
 module/ice-9/sports.scm      |  6 +--
 test-suite/tests/iconv.test  |  2 +-
 test-suite/tests/ports.test  |  4 +-
 test-suite/tests/rdelim.test |  2 +-
 7 files changed, 65 insertions(+), 44 deletions(-)

diff --git a/NEWS b/NEWS
index 7165f8d9e..3e64129e4 100644
--- a/NEWS
+++ b/NEWS
@@ -71,6 +71,13 @@ raise an error on bad input.  Guile now raises an error without
 advancing the read pointer.  To skip over a bad encoding, set the port
 conversion strategy to "substitute" and read a substitute character.
 
+** Decoding errors with `substitute' strategy return U+FFFD
+
+It used to be that decoding errors with the `substitute' conversion
+strategy would replace the bad bytes with a `?' character.  This has
+been changed to use the standard U+FFFD REPLACEMENT CHARACTER, in
+accordance with the Unicode recommendations.
+
 ** API to define new port types from C has changed
 
 See the newly expanded "I/O Extensions" in the manual, for full details.
diff --git a/doc/ref/api-io.texi b/doc/ref/api-io.texi
index 23d3b50cd..5b200977b 100644
--- a/doc/ref/api-io.texi
+++ b/doc/ref/api-io.texi
@@ -78,11 +78,17 @@ string doesn't depend on its context: the same byte sequence will always
 return the same string.  A couple of modal encodings are in common use,
 like ISO-2022-JP and ISO-2022-KR, and they are not yet supported.
 
-Each port also has an associated conversion strategy: what to do when
-a Guile character can't be converted to the port's encoded character
-representation for output. There are three possible strategies: to
-raise an error, to replace the character with a hex escape, or to
-replace the character with a substitute character.
+@cindex port conversion strategy
+@cindex conversion strategy, port
+@cindex decoding error
+@cindex encoding error
+Each port also has an associated conversion strategy, which determines
+what to do when a Guile character can't be converted to the port's
+encoded character representation for output.  There are three possible
+strategies: to raise an error, to replace the character with a hex
+escape, or to replace the character with a substitute character.  Port
+conversion strategies are also used when decoding characters from an
+input port.
 
 Finally, all ports have associated input and output buffers, as
 appropriate.  Buffering is a common strategy to limit the overhead of
@@ -142,14 +148,10 @@ its input and output.  The value @code{#f} is equivalent to @code{"ISO-8859-1"}.
 
 @deffn {Scheme Procedure} set-port-conversion-strategy! port sym
 @deffnx {C Function} scm_set_port_conversion_strategy_x (port, sym)
-Sets the behavior of the interpreter when outputting a character that
-is not representable in the port's current encoding.  @var{sym} can be
-either @code{'error}, @code{'substitute}, or @code{'escape}.  If it is
-@code{'error}, an error will be thrown when an nonconvertible character
-is encountered.  If it is @code{'substitute}, then nonconvertible
-characters will be replaced with approximate characters, or with
-question marks if no approximately correct character is available.  If
-it is @code{'escape}, it will appear as a hex escape when output.
+Sets the behavior of Guile when outputting a character that is not
+representable in the port's current encoding, or when Guile encounters a
+decoding error when trying to read a character.  @var{sym} can be either
+@code{error}, @code{substitute}, or @code{escape}.
 
 If @var{port} is an open port, the conversion error behavior
 is set for that port.  If it is @code{#f}, it is set as the
@@ -157,15 +159,27 @@ default behavior for any future ports that get created in
 this thread.
 @end deffn
 
+For an output port, there are three possible port conversion
+strategies.  The @code{error} strategy will throw an error when a
+nonconvertible character is encountered.  The @code{substitute} strategy
+will replace nonconvertible characters with a question mark (@samp{?}).
+Finally, the @code{escape} strategy will print nonconvertible characters
+as a hex escape, using the escaping that is recognized by Guile's string
+syntax.  Note that if the port's encoding is a Unicode encoding, like
+@code{UTF-8}, then encoding errors are impossible.
+
+For an input port, the @code{error} strategy will cause Guile to throw
+an error if it encounters an invalid encoding, such as might happen if
+you tried to read @code{ISO-8859-1} as @code{UTF-8}.  The error is
+thrown before advancing the read position.  The @code{substitute}
+strategy will replace the bad bytes with a U+FFFD replacement character,
+in accordance with Unicode recommendations.  When reading from an input
+port, the @code{escape} strategy is treated as if it were @code{error}.
+
 @deffn {Scheme Procedure} port-conversion-strategy port
 @deffnx {C Function} scm_port_conversion_strategy (port)
-Returns the behavior of the port when outputting a character that is
-not representable in the port's current encoding.  It returns the
-symbol @code{error} if unrepresentable characters should cause
-exceptions, @code{substitute} if the port should try to replace
-unrepresentable characters with question marks or approximate
-characters, or @code{escape} if unrepresentable characters should be
-converted to string escapes.
+Returns the conversion strategy of @var{port}, as a symbol: how encoding
+errors on output and decoding errors on input are handled (see above).
 
 If @var{port} is @code{#f}, then the current default behavior will be
 returned.  New ports will have this default behavior when they are
@@ -179,9 +193,9 @@ and for other conversion routines such as @code{scm_to_stringn},
 @code{pointer->string}.
 
 Its value must be one of the symbols described above, with the same
-semantics: @code{'error}, @code{'substitute}, or @code{'escape}.
+semantics: @code{error}, @code{substitute}, or @code{escape}.
 
-When Guile starts, its value is @code{'substitute}.
+When Guile starts, its value is @code{substitute}.
 
 Note that @code{(set-port-conversion-strategy! #f @var{sym})} is
 equivalent to @code{(fluid-set! %default-port-conversion-strategy
@@ -226,13 +240,10 @@ interactive port that has no ready characters.
 @rnindex read-char
 @deffn {Scheme Procedure} read-char [port]
 @deffnx {C Function} scm_read_char (port)
-Return the next character available from @var{port}, updating
-@var{port} to point to the following character.  If no more
-characters are available, the end-of-file object is returned.
-
-When @var{port}'s data cannot be decoded according to its character
-encoding, a @code{decoding-error} is raised and @var{port} is not
-advanced past the erroneous byte sequence.
+Return the next character available from @var{port}, updating @var{port}
+to point to the following character.  If no more characters are
+available, the end-of-file object is returned.  A decoding error, if
+any, is handled in accordance with the port's conversion strategy.
 @end deffn
 
 @deftypefn {C Function} size_t scm_c_read (SCM port, void *buffer, size_t size)
@@ -262,8 +273,8 @@ return the value returned by the preceding call to
 an interactive port will hang waiting for input whenever a call
 to @code{read-char} would have hung.
 
-As for @code{read-char}, a @code{decoding-error} may be raised
-if such a situation occurs.
+As for @code{read-char}, decoding errors are handled in accordance with
+the port's conversion strategy.
 @end deffn
 
 @deffn {Scheme Procedure} unread-char cobj [port]
@@ -627,9 +638,6 @@ Push the terminating delimiter (if any) back on to the port.
 Return a pair containing the string read from the port and the
 terminating delimiter or end-of-file object.
 @end table
-
-Like @code{read-char}, this procedure can throw to @code{decoding-error}
-(@pxref{Reading, @code{read-char}}).
 @end deffn
 
 @c begin (scm-doc-string "rdelim.scm" "read-line!")
diff --git a/libguile/ports.c b/libguile/ports.c
index a89c7e48e..c67bdf53b 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -109,6 +109,12 @@ static SCM sym_substitute;
 static SCM sym_escape;
 
 
+
+
+/* See Unicode 8.0 section 5.22, "Best Practice for U+FFFD
+   Substitution".  */
+static const scm_t_wchar UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
+
 
 
 static SCM trampoline_to_c_read_subr;
@@ -1590,7 +1596,7 @@ peek_utf8_codepoint (SCM port, size_t *len)
  decoding_error:
   if (scm_is_eq (SCM_PORT (port)->conversion_strategy, sym_substitute))
     /* *len already set.  */
-    return '?';
+    return UNICODE_REPLACEMENT_CHARACTER;
 
   scm_decoding_error ("peek-char", EILSEQ, "input decoding error", port);
   /* Not reached.  */
@@ -1648,7 +1654,7 @@ SCM_DEFINE (scm_port_decode_char, "port-decode-char", 4, 0, 0,
         return SCM_BOOL_F;
       else if (scm_is_eq (SCM_PORT (port)->conversion_strategy,
                           sym_substitute))
-        return SCM_MAKE_CHAR ('?');
+        return SCM_MAKE_CHAR (UNICODE_REPLACEMENT_CHARACTER);
       else
         scm_decoding_error ("decode-char", err, "input decoding error", port);
     }
@@ -1699,7 +1705,7 @@ peek_iconv_codepoint (SCM port, size_t *len)
           /* EOF found in the middle of a multibyte character. */
           if (scm_is_eq (SCM_PORT (port)->conversion_strategy,
                          sym_substitute))
-            return '?';
+            return UNICODE_REPLACEMENT_CHARACTER;
 
           scm_decoding_error ("peek-char", EILSEQ,
                               "input decoding error", port);
diff --git a/module/ice-9/sports.scm b/module/ice-9/sports.scm
index 55f507866..6fd7ddd31 100644
--- a/module/ice-9/sports.scm
+++ b/module/ice-9/sports.scm
@@ -291,7 +291,7 @@
 (define (peek-char-and-len/utf8 port first-byte)
   (define (bad-utf8 len)
     (if (eq? (port-conversion-strategy port) 'substitute)
-        (values #\? len)
+        (values #\xFFFD len)
         (decoding-error "peek-char" port)))
   (if (< first-byte #x80)
       (values (integer->char first-byte) 1)
@@ -308,7 +308,7 @@
               (let ((len (bad-utf8-len bv cur buffering first-byte)))
                 (when (zero? len) (error "internal error"))
                 (if (eq? (port-conversion-strategy port) 'substitute)
-                    (values #\? len)
+                    (values #\xFFFD len)
                     (decoding-error "peek-char" port))))
             (decode-utf8 bv cur buffering first-byte values bad-utf8))))))
 
@@ -327,7 +327,7 @@
              ((zero? prev-input-size)
               (values the-eof-object 0))
              ((eq? (port-conversion-strategy port) 'substitute)
-              (values #\? prev-input-size))
+              (values #\xFFFD prev-input-size))
              (else
               (decoding-error "peek-char" port))))
            ((port-decode-char port (port-buffer-bytevector buf)
diff --git a/test-suite/tests/iconv.test b/test-suite/tests/iconv.test
index be36336f3..676d94821 100644
--- a/test-suite/tests/iconv.test
+++ b/test-suite/tests/iconv.test
@@ -97,7 +97,7 @@
     (pass-if "misparse latin1 as utf8 with substitutions"
       (equal? (bytevector->string (string->bytevector s "latin1")
                                   "utf-8" 'substitute)
-              "?t?"))
+              "\uFFFDt\uFFFD"))
 
     (pass-if-exception "misparse latin1 as ascii" exception:decoding-error
       (bytevector->string (string->bytevector s "latin1") "ascii"))))
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index 3bb001e4d..029dd2dd9 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -834,7 +834,7 @@
                               ;; If `proc' is `read-char', this will
                               ;; skip over the bad bytes.
                               (let ((c (proc p)))
-                                (unless (eqv? c #\?)
+                                (unless (eqv? c #\xFFFD)
                                   (error "unexpected char" c))
                                 (set-port-conversion-strategy! p strategy)
                                 #t)))
@@ -846,7 +846,7 @@
                      ((_ port (proc -> error))
                       (if (eq? 'substitute
                                (port-conversion-strategy port))
-                          (eqv? (proc port) #\?)
+                          (eqv? (proc port) #\xFFFD)
                           (decoding-error? port proc)))
                      ((_ port (proc -> eof))
                       (eof-object? (proc port)))
diff --git a/test-suite/tests/rdelim.test b/test-suite/tests/rdelim.test
index de384c508..3aaa0b253 100644
--- a/test-suite/tests/rdelim.test
+++ b/test-suite/tests/rdelim.test
@@ -87,7 +87,7 @@
     (let ((p (open-bytevector-input-port #vu8(65 255 66 67 68))))
       (set-port-encoding! p "UTF-8")
       (set-port-conversion-strategy! p 'substitute)
-      (and (string=? (read-line p) "A?BCD")
+      (and (string=? (read-line p) "A\uFFFDBCD")
            (eof-object? (read-line p))))))