1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-16 16:50:21 +02:00

Have read-char' & co. throw to decoding-error'.

* libguile/ports.c (scm_read_char): Mention `decoding-error' in the
  docstring.
  (get_codepoint): Change to return an error code; add `codepoint'
  output parameter.  Don't raise an error from here.
  (scm_getc): Raise an error with `scm_decoding_error' if
  `get_codepoint' returns an error.
  (scm_peek_char): Likewise.  Update docstring.

* libguile/strings.c (scm_decoding_error_key): New variable.
  (scm_decoding_error): New function.
  (scm_from_stringn): Use `scm_decoding_error' instead of
  `scm_encoding_error'.

* libguile/strings.h (scm_decoding_error): New declaration.

* test-suite/tests/ports.test ("string ports")["read-char, wrong
  encoding, error"]: Change to expect `decoding-error'.  Make sure PORT
  points past the error.
  ["read-char, wrong encoding, escape"]: Likewise.
  ["peek-char, wrong encoding, error"]: New test.

* test-suite/tests/r6rs-ports.test ("7.2.11 Binary
  Output")["put-bytevector with wrong-encoding string port"]: Change to
  expect `decoding-error'.
  ("8.2.6  Input and output ports")["transcoded-port [error handling
  mode = raise]"]: Likewise.

* test-suite/tests/rdelim.test ("read-line")["decoding error", "decoding
  error, substitute"]: New tests.

* doc/ref/api-io.texi (Reading): Update documentation of `read-char' and
  `peek-char'.
  (Line/Delimited): Update documentation of `read-line'.
This commit is contained in:
Ludovic Courtès 2011-02-02 15:52:56 +01:00
parent d6cf96974e
commit c62da8f891
7 changed files with 196 additions and 77 deletions

View file

@ -1,7 +1,7 @@
@c -*-texinfo-*- @c -*-texinfo-*-
@c This is part of the GNU Guile Reference Manual. @c This is part of the GNU Guile Reference Manual.
@c Copyright (C) 1996, 1997, 2000, 2001, 2002, 2003, 2004, 2007, 2009, 2010 @c Copyright (C) 1996, 1997, 2000, 2001, 2002, 2003, 2004, 2007, 2009,
@c Free Software Foundation, Inc. @c 2010, 2011 Free Software Foundation, Inc.
@c See the file guile.texi for copying conditions. @c See the file guile.texi for copying conditions.
@node Input and Output @node Input and Output
@ -210,6 +210,10 @@ interactive port that has no ready characters.
Return the next character available from @var{port}, updating Return the next character available from @var{port}, updating
@var{port} to point to the following character. If no more @var{port} to point to the following character. If no more
characters are available, the end-of-file object is returned. characters are available, the end-of-file object is returned.
When @var{port}'s data cannot be decoded according to its
character encoding, a @code{decoding-error} is raised and
@var{port} points past the erroneous byte sequence.
@end deffn @end deffn
@deftypefn {C Function} size_t scm_c_read (SCM port, void *buffer, size_t size) @deftypefn {C Function} size_t scm_c_read (SCM port, void *buffer, size_t size)
@ -238,6 +242,11 @@ return the value returned by the preceding call to
@code{peek-char}. In particular, a call to @code{peek-char} on @code{peek-char}. In particular, a call to @code{peek-char} on
an interactive port will hang waiting for input whenever a call an interactive port will hang waiting for input whenever a call
to @code{read-char} would have hung. to @code{read-char} would have hung.
As for @code{read-char}, a @code{decoding-error} may be raised
if such a situation occurs. However, unlike with @code{read-char},
@var{port} still points at the beginning of the erroneous byte
sequence when the error is raised.
@end deffn @end deffn
@deffn {Scheme Procedure} unread-char cobj [port] @deffn {Scheme Procedure} unread-char cobj [port]
@ -513,6 +522,9 @@ Push the terminating delimiter (if any) back on to the port.
Return a pair containing the string read from the port and the Return a pair containing the string read from the port and the
terminating delimiter or end-of-file object. terminating delimiter or end-of-file object.
@end table @end table
Like @code{read-char}, this procedure can throw to @code{decoding-error}
(@pxref{Reading, @code{read-char}}).
@end deffn @end deffn
@c begin (scm-doc-string "rdelim.scm" "read-line!") @c begin (scm-doc-string "rdelim.scm" "read-line!")

View file

@ -1029,7 +1029,11 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
(SCM port), (SCM port),
"Return the next character available from @var{port}, updating\n" "Return the next character available from @var{port}, updating\n"
"@var{port} to point to the following character. If no more\n" "@var{port} to point to the following character. If no more\n"
"characters are available, the end-of-file object is returned.") "characters are available, the end-of-file object is returned.\n"
"\n"
"When @var{port}'s data cannot be decoded according to its\n"
"character encoding, a @code{decoding-error} is raised and\n"
"@var{port} points past the erroneous byte sequence.\n")
#define FUNC_NAME s_scm_read_char #define FUNC_NAME s_scm_read_char
{ {
scm_t_wchar c; scm_t_wchar c;
@ -1108,17 +1112,16 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
return codepoint; return codepoint;
} }
/* Read a codepoint from PORT and return it. Fill BUF with the byte /* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF
representation of the codepoint in PORT's encoding, and set *LEN to with the byte representation of the codepoint in PORT's encoding, and
the length in bytes of that representation. Raise an error on set *LEN to the length in bytes of that representation. Return 0 on
failure. */ success and an errno value on error. */
static scm_t_wchar static int
get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len) get_codepoint (SCM port, scm_t_wchar *codepoint,
#define FUNC_NAME "scm_getc" char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
{ {
int err, byte_read; int err, byte_read;
size_t bytes_consumed, output_size; size_t bytes_consumed, output_size;
scm_t_wchar codepoint;
char *output; char *output;
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE]; scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
scm_t_port *pt = SCM_PTAB_ENTRY (port); scm_t_port *pt = SCM_PTAB_ENTRY (port);
@ -1140,7 +1143,11 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
if (byte_read == EOF) if (byte_read == EOF)
{ {
if (bytes_consumed == 0) if (bytes_consumed == 0)
return (scm_t_wchar) EOF; {
*codepoint = (scm_t_wchar) EOF;
*len = 0;
return 0;
}
else else
continue; continue;
} }
@ -1164,53 +1171,52 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
output_size = sizeof (utf8_buf) - output_left; output_size = sizeof (utf8_buf) - output_left;
} }
if (err != 0) if (SCM_UNLIKELY (err != 0))
{ {
/* Reset the `iconv' state. */ /* Reset the `iconv' state. */
iconv (pt->input_cd, NULL, NULL, NULL, NULL); iconv (pt->input_cd, NULL, NULL, NULL, NULL);
if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK) if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
codepoint = '?'; {
else *codepoint = '?';
err = 0;
}
/* Fail when the strategy is SCM_ICONVEH_ERROR or /* Fail when the strategy is SCM_ICONVEH_ERROR or
SCM_ICONVEH_ESCAPE_SEQUENCE (the latter doesn't make sense SCM_ICONVEH_ESCAPE_SEQUENCE (the latter doesn't make sense for
for input encoding errors.) */ input encoding errors.) */
goto failure;
} }
else else
/* Convert the UTF8_BUF sequence to a Unicode code point. */ /* Convert the UTF8_BUF sequence to a Unicode code point. */
codepoint = utf8_to_codepoint (utf8_buf, output_size); *codepoint = utf8_to_codepoint (utf8_buf, output_size);
update_port_lf (codepoint, port); if (SCM_LIKELY (err == 0))
update_port_lf (*codepoint, port);
*len = bytes_consumed; *len = bytes_consumed;
return codepoint; return err;
failure:
{
SCM bv;
bv = scm_c_make_bytevector (bytes_consumed);
memcpy (SCM_BYTEVECTOR_CONTENTS (bv), buf, bytes_consumed);
scm_encoding_error (FUNC_NAME, err, "input decoding error",
pt->encoding, "UTF-8", bv);
} }
/* Never gets here. */
return 0;
}
#undef FUNC_NAME
/* Read a codepoint from PORT and return it. */ /* Read a codepoint from PORT and return it. */
scm_t_wchar scm_t_wchar
scm_getc (SCM port) scm_getc (SCM port)
#define FUNC_NAME "scm_getc"
{ {
int err;
size_t len; size_t len;
scm_t_wchar codepoint;
char buf[SCM_MBCHAR_BUF_SIZE]; char buf[SCM_MBCHAR_BUF_SIZE];
return get_codepoint (port, buf, &len); err = get_codepoint (port, &codepoint, buf, &len);
if (SCM_UNLIKELY (err != 0))
/* At this point PORT should point past the invalid encoding, as per
R6RS-lib Section 8.2.4. */
scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
return codepoint;
} }
#undef FUNC_NAME
/* this should only be called when the read buffer is empty. it /* this should only be called when the read buffer is empty. it
tries to refill the read buffer. it returns the first char from tries to refill the read buffer. it returns the first char from
@ -1623,13 +1629,19 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
"return the value returned by the preceding call to\n" "return the value returned by the preceding call to\n"
"@code{peek-char}. In particular, a call to @code{peek-char} on\n" "@code{peek-char}. In particular, a call to @code{peek-char} on\n"
"an interactive port will hang waiting for input whenever a call\n" "an interactive port will hang waiting for input whenever a call\n"
"to @code{read-char} would have hung.") "to @code{read-char} would have hung.\n"
"\n"
"As for @code{read-char}, a @code{decoding-error} may be raised\n"
"if such a situation occurs. However, unlike with @code{read-char},\n"
"@var{port} still points at the beginning of the erroneous byte\n"
"sequence when the error is raised.\n")
#define FUNC_NAME s_scm_peek_char #define FUNC_NAME s_scm_peek_char
{ {
int err;
SCM result; SCM result;
scm_t_wchar c; scm_t_wchar c;
char bytes[SCM_MBCHAR_BUF_SIZE]; char bytes[SCM_MBCHAR_BUF_SIZE];
long column, line; long column, line, i;
size_t len; size_t len;
if (SCM_UNBNDP (port)) if (SCM_UNBNDP (port))
@ -1639,21 +1651,25 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
column = SCM_COL (port); column = SCM_COL (port);
line = SCM_LINUM (port); line = SCM_LINUM (port);
c = get_codepoint (port, bytes, &len); err = get_codepoint (port, &c, bytes, &len);
if (c == EOF)
result = SCM_EOF_VAL;
else
{
long i;
result = SCM_MAKE_CHAR (c);
for (i = len - 1; i >= 0; i--) for (i = len - 1; i >= 0; i--)
scm_unget_byte (bytes[i], port); scm_unget_byte (bytes[i], port);
SCM_COL (port) = column; SCM_COL (port) = column;
SCM_LINUM (port) = line; SCM_LINUM (port) = line;
if (SCM_UNLIKELY (err != 0))
{
scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
/* Shouldn't happen since `catch' always aborts to prompt. */
result = SCM_BOOL_F;
} }
else if (c == EOF)
result = SCM_EOF_VAL;
else
result = SCM_MAKE_CHAR (c);
return result; return result;
} }

View file

@ -1407,9 +1407,11 @@ SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
/* Conversion to/from other encodings. */ /* Charset conversion error handling. */
SCM_SYMBOL (scm_encoding_error_key, "encoding-error"); SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
void void
scm_encoding_error (const char *subr, int err, const char *message, scm_encoding_error (const char *subr, int err, const char *message,
const char *from, const char *to, SCM string_or_bv) const char *from, const char *to, SCM string_or_bv)
@ -1428,6 +1430,22 @@ scm_encoding_error (const char *subr, int err, const char *message,
SCM_UNDEFINED)); SCM_UNDEFINED));
} }
/* Raise an exception informing of an encoding error on PORT. This
means that a character could not be written in PORT's encoding. */
void
scm_decoding_error (const char *subr, int err, const char *message, SCM port)
{
scm_throw (scm_decoding_error_key,
scm_list_n (scm_from_locale_string (subr),
scm_from_locale_string (message),
scm_from_int (err),
port,
SCM_UNDEFINED));
}
/* String conversion to/from C. */
SCM SCM
scm_from_stringn (const char *str, size_t len, const char *encoding, scm_from_stringn (const char *str, size_t len, const char *encoding,
scm_t_string_failed_conversion_handler handler) scm_t_string_failed_conversion_handler handler)
@ -1473,9 +1491,8 @@ scm_from_stringn (const char *str, size_t len, const char *encoding,
memcpy (buf, str, len); memcpy (buf, str, len);
bv = scm_c_take_bytevector (buf, len); bv = scm_c_take_bytevector (buf, len);
scm_encoding_error (__func__, errno, scm_decoding_error (__func__, errno,
"input locale conversion error", "input locale conversion error", bv);
encoding, "UTF-32", bv);
} }
i = 0; i = 0;

View file

@ -214,6 +214,8 @@ SCM_INTERNAL void scm_encoding_error (const char *subr, int err,
const char *message, const char *message,
const char *from, const char *to, const char *from, const char *to,
SCM string_or_bv); SCM string_or_bv);
SCM_INTERNAL void scm_decoding_error (const char *subr, int err,
const char *message, SCM port);
/* internal utility functions. */ /* internal utility functions. */

View file

@ -464,29 +464,80 @@
(= (port-line p) 0) (= (port-line p) 0)
(= (port-column p) 0)))) (= (port-column p) 0))))
(pass-if-exception "read-char, wrong encoding, error" (pass-if "read-char, wrong encoding, error"
exception:encoding-error
(let ((p (with-fluids ((%default-port-encoding "UTF-8")) (let ((p (with-fluids ((%default-port-encoding "UTF-8"))
(open-bytevector-input-port #vu8(255 1 2 3))))) (open-bytevector-input-port #vu8(255 65 66 67)))))
(catch 'decoding-error
(lambda ()
(set-port-conversion-strategy! p 'error) (set-port-conversion-strategy! p 'error)
(read-char p) (read-char p)
#t)) #f)
(lambda (key subr message err port)
(and (eq? port p)
(pass-if-exception "read-char, wrong encoding, escape" ;; PORT should point past the error.
exception:encoding-error (equal? '(#\A #\B #\C)
;; `escape' should behave like `error'. (list (read-char port)
(read-char port)
(read-char port)))
(eof-object? (read-char port)))))))
(pass-if "read-char, wrong encoding, escape"
;; `escape' should behave exactly like `error'.
(let ((p (with-fluids ((%default-port-encoding "UTF-8")) (let ((p (with-fluids ((%default-port-encoding "UTF-8"))
(open-bytevector-input-port #vu8(255 1 2 3))))) (open-bytevector-input-port #vu8(255 65 66 67)))))
(catch 'decoding-error
(lambda ()
(set-port-conversion-strategy! p 'escape) (set-port-conversion-strategy! p 'escape)
(read-char p) (read-char p)
#t)) #f)
(lambda (key subr message err port)
(and (eq? port p)
;; PORT should point past the error.
(equal? '(#\A #\B #\C)
(list (read-char port)
(read-char port)
(read-char port)))
(eof-object? (read-char port)))))))
(pass-if "read-char, wrong encoding, substitute" (pass-if "read-char, wrong encoding, substitute"
(let ((p (with-fluids ((%default-port-encoding "UTF-8")) (let ((p (with-fluids ((%default-port-encoding "UTF-8"))
(open-bytevector-input-port #vu8(255 206 187 206 188))))) (open-bytevector-input-port #vu8(255 206 187 206 188)))))
(set-port-conversion-strategy! p 'substitute) (set-port-conversion-strategy! p 'substitute)
(equal? (list (read-char p) (read-char p) (read-char p)) (equal? (list (read-char p) (read-char p) (read-char p))
'(#\? #\λ #\μ))))) '(#\? #\λ #\μ))))
(pass-if "peek-char, wrong encoding, error"
(let-syntax ((decoding-error?
(syntax-rules ()
((_ port exp)
(catch 'decoding-error
(lambda ()
(pk 'exp exp)
#f)
(lambda (key subr message errno p)
(eq? p port)))))))
(let ((p (with-fluids ((%default-port-encoding "UTF-8"))
(open-bytevector-input-port #vu8(255 65 66 67)))))
(set-port-conversion-strategy! p 'error)
;; `peek-char' should repeatedly raise an error.
(and (decoding-error? p (peek-char p))
(decoding-error? p (peek-char p))
(decoding-error? p (peek-char p))
;; Move past the error.
(decoding-error? p (read-char p))
;; Finish happily.
(equal? '(#\A #\B #\C)
(list (read-char p)
(read-char p)
(read-char p)))
(eof-object? (read-char p)))))))
(with-test-prefix "call-with-output-string" (with-test-prefix "call-with-output-string"

View file

@ -262,16 +262,14 @@
(pass-if "put-bytevector with wrong-encoding string port" (pass-if "put-bytevector with wrong-encoding string port"
(let* ((str "hello, world") (let* ((str "hello, world")
(bv (string->utf16 str))) (bv (string->utf16 str)))
(catch 'encoding-error (catch 'decoding-error
(lambda () (lambda ()
(with-fluids ((%default-port-encoding "UTF-32")) (with-fluids ((%default-port-encoding "UTF-32"))
(call-with-output-string (call-with-output-string
(lambda (port) (lambda (port)
(put-bytevector port bv))))) (put-bytevector port bv)))))
(lambda (key subr message errno from to faulty-bv) (lambda (key subr message errno port)
(and (bytevector=? faulty-bv bv) (string? (strerror errno)))))))
(string=? to "UTF-32")
(string? (strerror errno))))))))
(with-test-prefix "7.2.7 Input Ports" (with-test-prefix "7.2.7 Input Ports"
@ -545,7 +543,7 @@
(b (open-bytevector-input-port #vu8(255 2 1))) (b (open-bytevector-input-port #vu8(255 2 1)))
(tp (transcoded-port b t))) (tp (transcoded-port b t)))
;; FIXME: Should be (guard (c ((i/o-decoding-error? c) #t)) ...). ;; FIXME: Should be (guard (c ((i/o-decoding-error? c) #t)) ...).
(catch 'encoding-error (catch 'decoding-error
(lambda () (lambda ()
(get-line tp) (get-line tp)
#f) #f)

View file

@ -19,6 +19,7 @@
(define-module (test-suite test-rdelim) (define-module (test-suite test-rdelim)
#:use-module (ice-9 rdelim) #:use-module (ice-9 rdelim)
#:use-module ((rnrs io ports) #:select (open-bytevector-input-port))
#:use-module (test-suite lib)) #:use-module (test-suite lib))
(with-fluids ((%default-port-encoding "UTF-8")) (with-fluids ((%default-port-encoding "UTF-8"))
@ -68,6 +69,28 @@
(p (open-input-string s))) (p (open-input-string s)))
(and (equal? (string-tokenize s) (and (equal? (string-tokenize s)
(list (read-line p) (read-line p))) (list (read-line p) (read-line p)))
(eof-object? (read-line p)))))
(pass-if "decoding error"
(let ((p (with-fluids ((%default-port-encoding "UTF-8"))
(open-bytevector-input-port #vu8(65 255 66 67 68)))))
(set-port-conversion-strategy! p 'error)
(catch 'decoding-error
(lambda ()
(read-line p)
#f)
(lambda (key subr message err port)
(and (eq? port p)
;; PORT should now point past the error.
(string=? (read-line p) "BCD")
(eof-object? (read-line p)))))))
(pass-if "decoding error, substitute"
(let ((p (with-fluids ((%default-port-encoding "UTF-8"))
(open-bytevector-input-port #vu8(65 255 66 67 68)))))
(set-port-conversion-strategy! p 'substitute)
(and (string=? (read-line p) "A?BCD")
(eof-object? (read-line p))))))) (eof-object? (read-line p)))))))
;;; Local Variables: ;;; Local Variables: