mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-30 11:50:28 +02:00
Simplify decoding error handling
* libguile/ports.c (peek_utf8_codepoint, peek_latin1_codepoint, peek_iconv_codepoint, peek_codepoint): Refactor to push error handling to the leaves, where errors happen. Just return the (possibly substituted) codepoint, without an error code; if there's really an error, we should raise it. (scm_getc, scm_peek_char): Adapt.
This commit is contained in:
parent
1953d29038
commit
08c67dbef8
1 changed files with 88 additions and 113 deletions
201
libguile/ports.c
201
libguile/ports.c
|
@ -1598,27 +1598,27 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
|
||||||
return codepoint;
|
return codepoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Peek a UTF-8 sequence from PORT. On success, return 0, set
|
/* Peek a UTF-8 sequence from PORT. On success, return the codepoint
|
||||||
*CODEPOINT to the codepoint that was read, and set *LEN to the length
|
that was read, and set *LEN to the length in bytes. If there was a
|
||||||
in bytes. Return `EILSEQ' on error, setting *LEN to the shortest
|
decoding error and the port conversion strategy was `substitute',
|
||||||
prefix that cannot begin a valid UTF-8 sequence. */
|
then return #\? and set *LEN to the length of the shortest prefix
|
||||||
static int
|
that cannot begin a valid UTF-8 sequence. Otherwise signal an
|
||||||
peek_utf8_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
error. */
|
||||||
|
static scm_t_wchar
|
||||||
|
peek_utf8_codepoint (SCM port, size_t *len)
|
||||||
{
|
{
|
||||||
|
#define DECODING_ERROR(bytes) \
|
||||||
|
do { *len = bytes; goto decoding_error; } while (0)
|
||||||
|
#define RETURN(bytes, codepoint) \
|
||||||
|
do { *len = bytes; return codepoint; } while (0)
|
||||||
|
|
||||||
int first_byte;
|
int first_byte;
|
||||||
|
|
||||||
first_byte = peek_byte_or_eof (port);
|
first_byte = peek_byte_or_eof (port);
|
||||||
if (first_byte == EOF)
|
if (first_byte == EOF)
|
||||||
{
|
RETURN (0, EOF);
|
||||||
*codepoint = EOF;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
else if (first_byte < 0x80)
|
else if (first_byte < 0x80)
|
||||||
{
|
RETURN (1, first_byte);
|
||||||
*codepoint = first_byte;
|
|
||||||
*len = 1;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
else if (first_byte >= 0xc2 && first_byte <= 0xdf)
|
else if (first_byte >= 0xc2 && first_byte <= 0xdf)
|
||||||
{
|
{
|
||||||
SCM read_buf = scm_fill_input (port, 2);
|
SCM read_buf = scm_fill_input (port, 2);
|
||||||
|
@ -1626,14 +1626,9 @@ peek_utf8_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
||||||
const scm_t_uint8 *ptr = scm_port_buffer_take_pointer (read_buf);
|
const scm_t_uint8 *ptr = scm_port_buffer_take_pointer (read_buf);
|
||||||
|
|
||||||
if (can_take < 2 || (ptr[1] & 0xc0) != 0x80)
|
if (can_take < 2 || (ptr[1] & 0xc0) != 0x80)
|
||||||
{
|
DECODING_ERROR (1);
|
||||||
*len = 1;
|
|
||||||
return EILSEQ;
|
|
||||||
}
|
|
||||||
|
|
||||||
*codepoint = (first_byte & 0x1f) << 6UL | (ptr[1] & 0x3f);
|
RETURN (2, (first_byte & 0x1f) << 6UL | (ptr[1] & 0x3f));
|
||||||
*len = 2;
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
else if ((first_byte & 0xf0) == 0xe0)
|
else if ((first_byte & 0xf0) == 0xe0)
|
||||||
{
|
{
|
||||||
|
@ -1644,22 +1639,15 @@ peek_utf8_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
||||||
if (can_take < 2 || (ptr[1] & 0xc0) != 0x80
|
if (can_take < 2 || (ptr[1] & 0xc0) != 0x80
|
||||||
|| (ptr[0] == 0xe0 && ptr[1] < 0xa0)
|
|| (ptr[0] == 0xe0 && ptr[1] < 0xa0)
|
||||||
|| (ptr[0] == 0xed && ptr[1] > 0x9f))
|
|| (ptr[0] == 0xed && ptr[1] > 0x9f))
|
||||||
{
|
DECODING_ERROR (1);
|
||||||
*len = 1;
|
|
||||||
return EILSEQ;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (can_take < 3 || (ptr[2] & 0xc0) != 0x80)
|
if (can_take < 3 || (ptr[2] & 0xc0) != 0x80)
|
||||||
{
|
DECODING_ERROR (2);
|
||||||
*len = 2;
|
|
||||||
return EILSEQ;
|
|
||||||
}
|
|
||||||
|
|
||||||
*codepoint = ((scm_t_wchar) ptr[0] & 0x0f) << 12UL
|
RETURN (3,
|
||||||
| ((scm_t_wchar) ptr[1] & 0x3f) << 6UL
|
((scm_t_wchar) ptr[0] & 0x0f) << 12UL
|
||||||
| (ptr[2] & 0x3f);
|
| ((scm_t_wchar) ptr[1] & 0x3f) << 6UL
|
||||||
*len = 3;
|
| (ptr[2] & 0x3f));
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
else if (first_byte >= 0xf0 && first_byte <= 0xf4)
|
else if (first_byte >= 0xf0 && first_byte <= 0xf4)
|
||||||
{
|
{
|
||||||
|
@ -1670,56 +1658,55 @@ peek_utf8_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
||||||
if (can_take < 2 || (ptr[1] & 0xc0) != 0x80
|
if (can_take < 2 || (ptr[1] & 0xc0) != 0x80
|
||||||
|| (ptr[0] == 0xf0 && ptr[1] < 0x90)
|
|| (ptr[0] == 0xf0 && ptr[1] < 0x90)
|
||||||
|| (ptr[0] == 0xf4 && ptr[1] > 0x8f))
|
|| (ptr[0] == 0xf4 && ptr[1] > 0x8f))
|
||||||
{
|
DECODING_ERROR (1);
|
||||||
*len = 1;
|
|
||||||
return EILSEQ;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (can_take < 3 || (ptr[2] & 0xc0) != 0x80)
|
if (can_take < 3 || (ptr[2] & 0xc0) != 0x80)
|
||||||
{
|
DECODING_ERROR (2);
|
||||||
*len = 2;
|
|
||||||
return EILSEQ;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (can_take < 4 || (ptr[3] & 0xc0) != 0x80)
|
if (can_take < 4 || (ptr[3] & 0xc0) != 0x80)
|
||||||
{
|
DECODING_ERROR (3);
|
||||||
*len = 3;
|
|
||||||
return EILSEQ;
|
|
||||||
}
|
|
||||||
|
|
||||||
*codepoint = ((scm_t_wchar) ptr[0] & 0x07) << 18UL
|
RETURN (4,
|
||||||
| ((scm_t_wchar) ptr[1] & 0x3f) << 12UL
|
((scm_t_wchar) ptr[0] & 0x07) << 18UL
|
||||||
| ((scm_t_wchar) ptr[2] & 0x3f) << 6UL
|
| ((scm_t_wchar) ptr[1] & 0x3f) << 12UL
|
||||||
| (ptr[3] & 0x3f);
|
| ((scm_t_wchar) ptr[2] & 0x3f) << 6UL
|
||||||
*len = 4;
|
| (ptr[3] & 0x3f));
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
DECODING_ERROR (1);
|
||||||
*len = 1;
|
|
||||||
return EILSEQ;
|
decoding_error:
|
||||||
}
|
if (scm_is_eq (SCM_PTAB_ENTRY (port)->conversion_strategy, sym_substitute))
|
||||||
|
/* *len already set. */
|
||||||
|
return '?';
|
||||||
|
|
||||||
|
scm_decoding_error ("peek-char", EILSEQ, "input decoding error", port);
|
||||||
|
/* Not reached. */
|
||||||
|
return 0;
|
||||||
|
#undef DECODING_ERROR
|
||||||
|
#undef RETURN
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Peek an ISO-8859-1 codepoint (a byte) from PORT. On success, return
|
/* Peek an ISO-8859-1 codepoint (a byte) from PORT. On success, return
|
||||||
0, set *CODEPOINT to the codepoint that was peeked, and set *LEN to
|
the codepoint, and set *LEN to 1. Otherwise on EOF set *LEN to 0. */
|
||||||
the length in bytes. No encoding error is possible. */
|
static scm_t_wchar
|
||||||
static int
|
peek_latin1_codepoint (SCM port, size_t *len)
|
||||||
peek_latin1_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
|
||||||
{
|
{
|
||||||
*codepoint = peek_byte_or_eof (port);
|
scm_t_wchar ret = peek_byte_or_eof (port);
|
||||||
if (*codepoint == EOF)
|
|
||||||
*len = 0;
|
*len = ret == EOF ? 0 : 1;
|
||||||
else
|
|
||||||
*len = 1;
|
return ret;
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Peek a codepoint from PORT, decoding it through iconv. On success,
|
/* Peek a codepoint from PORT, decoding it through iconv. On success,
|
||||||
return 0, set *CODEPOINT to the codepoint that was peeked, and set
|
return the codepoint and set *LEN to the length in bytes. If there
|
||||||
*LEN to the length in bytes. Return `EILSEQ' on decoding error. */
|
was a decoding error and the port conversion strategy was
|
||||||
static int
|
`substitute', then return #\? and set *LEN to the number of input
|
||||||
peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
bytes examined before the error was detected. Otherwise
|
||||||
|
signal an error. */
|
||||||
|
static scm_t_wchar
|
||||||
|
peek_iconv_codepoint (SCM port, size_t *len)
|
||||||
{
|
{
|
||||||
scm_t_iconv_descriptors *id;
|
scm_t_iconv_descriptors *id;
|
||||||
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
|
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
|
||||||
|
@ -1736,16 +1723,13 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
||||||
|
|
||||||
if (scm_port_buffer_can_take (read_buf) <= input_size)
|
if (scm_port_buffer_can_take (read_buf) <= input_size)
|
||||||
{
|
{
|
||||||
|
*len = input_size;
|
||||||
if (input_size == 0)
|
if (input_size == 0)
|
||||||
/* Normal EOF. */
|
/* Normal EOF. */
|
||||||
{
|
return EOF;
|
||||||
*codepoint = (scm_t_wchar) EOF;
|
|
||||||
*len = 0;
|
/* EOF found in the middle of a multibyte character. */
|
||||||
return 0;
|
goto decoding_error;
|
||||||
}
|
|
||||||
else
|
|
||||||
/* EOF found in the middle of a multibyte character. */
|
|
||||||
return EILSEQ;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
input_size++;
|
input_size++;
|
||||||
|
@ -1764,8 +1748,9 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
||||||
/* The input byte sequence did not form a complete
|
/* The input byte sequence did not form a complete
|
||||||
character. Read another byte and try again. */
|
character. Read another byte and try again. */
|
||||||
continue;
|
continue;
|
||||||
else
|
|
||||||
return err;
|
*len = input_size;
|
||||||
|
goto decoding_error;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -1779,36 +1764,35 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
||||||
|
|
||||||
/* iconv generated output. Convert the UTF8_BUF sequence
|
/* iconv generated output. Convert the UTF8_BUF sequence
|
||||||
to a Unicode code point. */
|
to a Unicode code point. */
|
||||||
*codepoint = utf8_to_codepoint (utf8_buf, output_size);
|
|
||||||
*len = input_size;
|
*len = input_size;
|
||||||
return 0;
|
return utf8_to_codepoint (utf8_buf, output_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
decoding_error:
|
||||||
|
if (scm_is_eq (SCM_PTAB_ENTRY (port)->conversion_strategy, sym_substitute))
|
||||||
|
return '?';
|
||||||
|
|
||||||
|
scm_decoding_error ("peek-char", EILSEQ, "input decoding error",
|
||||||
|
port);
|
||||||
|
/* Not reached. */
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Peek a codepoint from PORT and return it in *CODEPOINT. Set *LEN to
|
/* Peek a codepoint from PORT and return it, or EOF at end of input.
|
||||||
the length in bytes of that representation. Return 0 on success and
|
Set *LEN to the length in bytes of that representation. On a decoding
|
||||||
an errno value on error. */
|
error, the per-encoding helper either substitutes #\? or signals it. */
|
||||||
static SCM_C_INLINE int
|
static SCM_C_INLINE scm_t_wchar
|
||||||
peek_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
peek_codepoint (SCM port, size_t *len)
|
||||||
{
|
{
|
||||||
int err;
|
SCM encoding = SCM_PTAB_ENTRY (port)->encoding;
|
||||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
|
||||||
|
|
||||||
if (scm_is_eq (pt->encoding, sym_UTF_8))
|
if (scm_is_eq (encoding, sym_UTF_8))
|
||||||
err = peek_utf8_codepoint (port, codepoint, len);
|
return peek_utf8_codepoint (port, len);
|
||||||
else if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
|
else if (scm_is_eq (encoding, sym_ISO_8859_1))
|
||||||
err = peek_latin1_codepoint (port, codepoint, len);
|
return peek_latin1_codepoint (port, len);
|
||||||
else
|
else
|
||||||
err = peek_iconv_codepoint (port, codepoint, len);
|
return peek_iconv_codepoint (port, len);
|
||||||
|
|
||||||
if (err != 0 && scm_is_eq (pt->conversion_strategy, sym_substitute))
|
|
||||||
{
|
|
||||||
*codepoint = '?';
|
|
||||||
err = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Read a codepoint from PORT and return it. */
|
/* Read a codepoint from PORT and return it. */
|
||||||
|
@ -1816,13 +1800,10 @@ scm_t_wchar
|
||||||
scm_getc (SCM port)
|
scm_getc (SCM port)
|
||||||
#define FUNC_NAME "scm_getc"
|
#define FUNC_NAME "scm_getc"
|
||||||
{
|
{
|
||||||
int err;
|
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
scm_t_wchar codepoint = EOF;
|
scm_t_wchar codepoint;
|
||||||
|
|
||||||
err = peek_codepoint (port, &codepoint, &len);
|
codepoint = peek_codepoint (port, &len);
|
||||||
if (SCM_UNLIKELY (err != 0))
|
|
||||||
scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
|
|
||||||
scm_port_buffer_did_take (SCM_PTAB_ENTRY (port)->read_buf, len);
|
scm_port_buffer_did_take (SCM_PTAB_ENTRY (port)->read_buf, len);
|
||||||
if (codepoint == EOF)
|
if (codepoint == EOF)
|
||||||
scm_i_clear_pending_eof (port);
|
scm_i_clear_pending_eof (port);
|
||||||
|
@ -2009,7 +1990,6 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
|
||||||
"sequence when the error is raised.\n")
|
"sequence when the error is raised.\n")
|
||||||
#define FUNC_NAME s_scm_peek_char
|
#define FUNC_NAME s_scm_peek_char
|
||||||
{
|
{
|
||||||
int err;
|
|
||||||
scm_t_wchar c;
|
scm_t_wchar c;
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
|
|
||||||
|
@ -2017,14 +1997,9 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
|
||||||
port = scm_current_input_port ();
|
port = scm_current_input_port ();
|
||||||
SCM_VALIDATE_OPINPORT (1, port);
|
SCM_VALIDATE_OPINPORT (1, port);
|
||||||
|
|
||||||
err = peek_codepoint (port, &c, &len);
|
c = peek_codepoint (port, &len);
|
||||||
|
|
||||||
if (err == 0)
|
return c == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (c);
|
||||||
return c == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (c);
|
|
||||||
|
|
||||||
scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
|
|
||||||
/* Not reached. */
|
|
||||||
return SCM_BOOL_F;
|
|
||||||
}
|
}
|
||||||
#undef FUNC_NAME
|
#undef FUNC_NAME
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue