From 1e058add7b9568fb3a37e4fa82360d183d0a26ee Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 May 2016 10:44:21 +0200 Subject: [PATCH] U+FFFD is the input substitution character * libguile/ports.c (UNICODE_REPLACEMENT_CHARACTER): * libguile/ports.c (peek_utf8_codepoint) (scm_port_decode_char, peek_iconv_codepoint): * module/ice-9/sports.scm (peek-char-and-len/utf8): (peek-char-and-len/iconv): Return U+FFFD when we get a decoding error when reading, instead of '?', in accordance with Unicode recommendations. * test-suite/tests/iconv.test: * test-suite/tests/ports.test: * test-suite/tests/rdelim.test: Update tests. * NEWS: Update. --- NEWS | 7 ++++ doc/ref/api-io.texi | 76 ++++++++++++++++++++---------------- libguile/ports.c | 12 ++++-- module/ice-9/sports.scm | 6 +-- test-suite/tests/iconv.test | 2 +- test-suite/tests/ports.test | 4 +- test-suite/tests/rdelim.test | 2 +- 7 files changed, 65 insertions(+), 44 deletions(-) diff --git a/NEWS b/NEWS index 7165f8d9e..3e64129e4 100644 --- a/NEWS +++ b/NEWS @@ -71,6 +71,13 @@ raise an error on bad input. Guile now raises an error without advancing the read pointer. To skip over a bad encoding, set the port conversion strategy to "substitute" and read a substitute character. +** Decoding errors with `substitute' strategy return U+FFFD + +It used to be that decoding errors with the `substitute' conversion +strategy would replace the bad bytes with a `?' character. This has +been changed to use the standard U+FFFD REPLACEMENT CHARACTER, in +accordance with the Unicode recommendations. + ** API to define new port types from C has changed See the newly expanded "I/O Extensions" in the manual, for full details. diff --git a/doc/ref/api-io.texi b/doc/ref/api-io.texi index 23d3b50cd..5b200977b 100644 --- a/doc/ref/api-io.texi +++ b/doc/ref/api-io.texi @@ -78,11 +78,17 @@ string doesn't depend on its context: the same byte sequence will always return the same string. 
A couple of modal encodings are in common use, like ISO-2022-JP and ISO-2022-KR, and they are not yet supported. -Each port also has an associated conversion strategy: what to do when -a Guile character can't be converted to the port's encoded character -representation for output. There are three possible strategies: to -raise an error, to replace the character with a hex escape, or to -replace the character with a substitute character. +@cindex port conversion strategy +@cindex conversion strategy, port +@cindex decoding error +@cindex encoding error +Each port also has an associated conversion strategy, which determines +what to do when a Guile character can't be converted to the port's +encoded character representation for output. There are three possible +strategies: to raise an error, to replace the character with a hex +escape, or to replace the character with a substitute character. Port +conversion strategies are also used when decoding characters from an +input port. Finally, all ports have associated input and output buffers, as appropriate. Buffering is a common strategy to limit the overhead of @@ -142,14 +148,10 @@ its input and output. The value @code{#f} is equivalent to @code{"ISO-8859-1"}. @deffn {Scheme Procedure} set-port-conversion-strategy! port sym @deffnx {C Function} scm_set_port_conversion_strategy_x (port, sym) -Sets the behavior of the interpreter when outputting a character that -is not representable in the port's current encoding. @var{sym} can be -either @code{'error}, @code{'substitute}, or @code{'escape}. If it is -@code{'error}, an error will be thrown when an nonconvertible character -is encountered. If it is @code{'substitute}, then nonconvertible -characters will be replaced with approximate characters, or with -question marks if no approximately correct character is available. If -it is @code{'escape}, it will appear as a hex escape when output. 
+Sets the behavior of Guile when outputting a character that is not +representable in the port's current encoding, or when Guile encounters a +decoding error when trying to read a character. @var{sym} can be either +@code{error}, @code{substitute}, or @code{escape}. If @var{port} is an open port, the conversion error behavior is set for that port. If it is @code{#f}, it is set as the @@ -157,15 +159,27 @@ default behavior for any future ports that get created in this thread. @end deffn +For an output port, there are three possible port conversion +strategies. The @code{error} strategy will throw an error when a +nonconvertible character is encountered. The @code{substitute} strategy +will replace nonconvertible characters with a question mark (@samp{?}). +Finally the @code{escape} strategy will print nonconvertible characters +as a hex escape, using the escaping that is recognized by Guile's string +syntax. Note that if the port's encoding is a Unicode encoding, like +@code{UTF-8}, then encoding errors are impossible. + +For an input port, the @code{error} strategy will cause Guile to throw +an error if it encounters an invalid encoding, such as might happen if +you tried to read @code{ISO-8859-1} as @code{UTF-8}. The error is +thrown before advancing the read position. The @code{substitute} +strategy will replace the bad bytes with a U+FFFD replacement character, +in accordance with Unicode recommendations. When reading from an input +port, the @code{escape} strategy is treated as if it were @code{error}. + @deffn {Scheme Procedure} port-conversion-strategy port @deffnx {C Function} scm_port_conversion_strategy (port) -Returns the behavior of the port when outputting a character that is -not representable in the port's current encoding. 
It returns the -symbol @code{error} if unrepresentable characters should cause -exceptions, @code{substitute} if the port should try to replace -unrepresentable characters with question marks or approximate -characters, or @code{escape} if unrepresentable characters should be -converted to string escapes. +Returns the behavior of the port when outputting a character that is not +representable in the port's current encoding. If @var{port} is @code{#f}, then the current default behavior will be returned. New ports will have this default behavior when they are @@ -179,9 +193,9 @@ and for other conversion routines such as @code{scm_to_stringn}, @code{pointer->string}. Its value must be one of the symbols described above, with the same -semantics: @code{'error}, @code{'substitute}, or @code{'escape}. +semantics: @code{error}, @code{substitute}, or @code{escape}. -When Guile starts, its value is @code{'substitute}. +When Guile starts, its value is @code{substitute}. Note that @code{(set-port-conversion-strategy! #f @var{sym})} is equivalent to @code{(fluid-set! %default-port-conversion-strategy @@ -226,13 +240,10 @@ interactive port that has no ready characters. @rnindex read-char @deffn {Scheme Procedure} read-char [port] @deffnx {C Function} scm_read_char (port) -Return the next character available from @var{port}, updating -@var{port} to point to the following character. If no more -characters are available, the end-of-file object is returned. - -When @var{port}'s data cannot be decoded according to its character -encoding, a @code{decoding-error} is raised and @var{port} is not -advanced past the erroneous byte sequence. +Return the next character available from @var{port}, updating @var{port} +to point to the following character. If no more characters are +available, the end-of-file object is returned. A decoding error, if +any, is handled in accordance with the port's conversion strategy. 
@end deffn @deftypefn {C Function} size_t scm_c_read (SCM port, void *buffer, size_t size) @@ -262,8 +273,8 @@ return the value returned by the preceding call to an interactive port will hang waiting for input whenever a call to @code{read-char} would have hung. -As for @code{read-char}, a @code{decoding-error} may be raised -if such a situation occurs. +As for @code{read-char}, decoding errors are handled in accordance with +the port's conversion strategy. @end deffn @deffn {Scheme Procedure} unread-char cobj [port] @@ -627,9 +638,6 @@ Push the terminating delimiter (if any) back on to the port. Return a pair containing the string read from the port and the terminating delimiter or end-of-file object. @end table - -Like @code{read-char}, this procedure can throw to @code{decoding-error} -(@pxref{Reading, @code{read-char}}). @end deffn @c begin (scm-doc-string "rdelim.scm" "read-line!") diff --git a/libguile/ports.c b/libguile/ports.c index a89c7e48e..c67bdf53b 100644 --- a/libguile/ports.c +++ b/libguile/ports.c @@ -109,6 +109,12 @@ static SCM sym_substitute; static SCM sym_escape; + + +/* See Unicode 8.0 section 5.22, "Best Practice for U+FFFD + Substitution". */ +static const scm_t_wchar UNICODE_REPLACEMENT_CHARACTER = 0xFFFD; + static SCM trampoline_to_c_read_subr; @@ -1590,7 +1596,7 @@ peek_utf8_codepoint (SCM port, size_t *len) decoding_error: if (scm_is_eq (SCM_PORT (port)->conversion_strategy, sym_substitute)) /* *len already set. */ - return '?'; + return UNICODE_REPLACEMENT_CHARACTER; scm_decoding_error ("peek-char", EILSEQ, "input decoding error", port); /* Not reached. 
*/ @@ -1648,7 +1654,7 @@ SCM_DEFINE (scm_port_decode_char, "port-decode-char", 4, 0, 0, return SCM_BOOL_F; else if (scm_is_eq (SCM_PORT (port)->conversion_strategy, sym_substitute)) - return SCM_MAKE_CHAR ('?'); + return SCM_MAKE_CHAR (UNICODE_REPLACEMENT_CHARACTER); else scm_decoding_error ("decode-char", err, "input decoding error", port); } @@ -1699,7 +1705,7 @@ peek_iconv_codepoint (SCM port, size_t *len) /* EOF found in the middle of a multibyte character. */ if (scm_is_eq (SCM_PORT (port)->conversion_strategy, sym_substitute)) - return '?'; + return UNICODE_REPLACEMENT_CHARACTER; scm_decoding_error ("peek-char", EILSEQ, "input decoding error", port); diff --git a/module/ice-9/sports.scm b/module/ice-9/sports.scm index 55f507866..6fd7ddd31 100644 --- a/module/ice-9/sports.scm +++ b/module/ice-9/sports.scm @@ -291,7 +291,7 @@ (define (peek-char-and-len/utf8 port first-byte) (define (bad-utf8 len) (if (eq? (port-conversion-strategy port) 'substitute) - (values #\? len) + (values #\xFFFD len) (decoding-error "peek-char" port))) (if (< first-byte #x80) (values (integer->char first-byte) 1) @@ -308,7 +308,7 @@ (let ((len (bad-utf8-len bv cur buffering first-byte))) (when (zero? len) (error "internal error")) (if (eq? (port-conversion-strategy port) 'substitute) - (values #\? len) + (values #\xFFFD len) (decoding-error "peek-char" port)))) (decode-utf8 bv cur buffering first-byte values bad-utf8)))))) @@ -327,7 +327,7 @@ ((zero? prev-input-size) (values the-eof-object 0)) ((eq? (port-conversion-strategy port) 'substitute) - (values #\? prev-input-size)) + (values #\xFFFD prev-input-size)) (else (decoding-error "peek-char" port)))) ((port-decode-char port (port-buffer-bytevector buf) diff --git a/test-suite/tests/iconv.test b/test-suite/tests/iconv.test index be36336f3..676d94821 100644 --- a/test-suite/tests/iconv.test +++ b/test-suite/tests/iconv.test @@ -97,7 +97,7 @@ (pass-if "misparse latin1 as utf8 with substitutions" (equal? 
(bytevector->string (string->bytevector s "latin1") "utf-8" 'substitute) - "?t?")) + "\uFFFDt\uFFFD")) (pass-if-exception "misparse latin1 as ascii" exception:decoding-error (bytevector->string (string->bytevector s "latin1") "ascii")))) diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test index 3bb001e4d..029dd2dd9 100644 --- a/test-suite/tests/ports.test +++ b/test-suite/tests/ports.test @@ -834,7 +834,7 @@ ;; If `proc' is `read-char', this will ;; skip over the bad bytes. (let ((c (proc p))) - (unless (eqv? c #\?) + (unless (eqv? c #\xFFFD) (error "unexpected char" c)) (set-port-conversion-strategy! p strategy) #t))) @@ -846,7 +846,7 @@ ((_ port (proc -> error)) (if (eq? 'substitute (port-conversion-strategy port)) - (eqv? (proc port) #\?) + (eqv? (proc port) #\xFFFD) (decoding-error? port proc))) ((_ port (proc -> eof)) (eof-object? (proc port))) diff --git a/test-suite/tests/rdelim.test b/test-suite/tests/rdelim.test index de384c508..3aaa0b253 100644 --- a/test-suite/tests/rdelim.test +++ b/test-suite/tests/rdelim.test @@ -87,7 +87,7 @@ (let ((p (open-bytevector-input-port #vu8(65 255 66 67 68)))) (set-port-encoding! p "UTF-8") (set-port-conversion-strategy! p 'substitute) - (and (string=? (read-line p) "A?BCD") + (and (string=? (read-line p) "A\uFFFDBCD") (eof-object? (read-line p))))))