mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-05-20 19:50:24 +02:00
Fix `get_utf8_codepoint' to not consume valid starting bytes.
Thanks to Mark H. Weaver for pointing this out. * libguile/ports.c (CONSUME_PEEKED_BYTE): New macro. (get_utf8_codepoint): New variable `pt'. Use `scm_peek_byte_or_eof'/`CONSUME_PEEKED_BYTE' pairs instead of `scm_get_byte_or_eof'. * test-suite/tests/ports.test ("string ports")[#xc2 #x41 #x42, #xe0 #xa0 #x41 #x42, #xf0 #x88 #x88 #x88]: Fix to conform to Unicode 6.0.0. [#xe0 #x88 #x88]: Remove test. [#xf0 #x80 #x80 #x41]: New test.
This commit is contained in:
parent
452c5ad912
commit
7be1705dbd
2 changed files with 67 additions and 64 deletions
|
@ -1127,10 +1127,14 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||||
#define ASSERT_NOT_EOF(b) \
|
#define ASSERT_NOT_EOF(b) \
|
||||||
if (SCM_UNLIKELY ((b) == EOF)) \
|
if (SCM_UNLIKELY ((b) == EOF)) \
|
||||||
goto invalid_seq
|
goto invalid_seq
|
||||||
|
#define CONSUME_PEEKED_BYTE() \
|
||||||
|
pt->read_pos++
|
||||||
|
|
||||||
int byte;
|
int byte;
|
||||||
|
scm_t_port *pt;
|
||||||
|
|
||||||
*len = 0;
|
*len = 0;
|
||||||
|
pt = SCM_PTAB_ENTRY (port);
|
||||||
|
|
||||||
byte = scm_get_byte_or_eof (port);
|
byte = scm_get_byte_or_eof (port);
|
||||||
if (byte == EOF)
|
if (byte == EOF)
|
||||||
|
@ -1148,49 +1152,44 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||||
else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
|
else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
|
||||||
{
|
{
|
||||||
/* 2-byte form. */
|
/* 2-byte form. */
|
||||||
byte = scm_get_byte_or_eof (port);
|
byte = scm_peek_byte_or_eof (port);
|
||||||
ASSERT_NOT_EOF (byte);
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
buf[1] = (scm_t_uint8) byte;
|
|
||||||
*len = 2;
|
|
||||||
|
|
||||||
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
||||||
goto invalid_seq;
|
goto invalid_seq;
|
||||||
|
|
||||||
|
CONSUME_PEEKED_BYTE ();
|
||||||
|
buf[1] = (scm_t_uint8) byte;
|
||||||
|
*len = 2;
|
||||||
|
|
||||||
*codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
|
*codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
|
||||||
| (buf[1] & 0x3f);
|
| (buf[1] & 0x3f);
|
||||||
}
|
}
|
||||||
else if ((buf[0] & 0xf0) == 0xe0)
|
else if ((buf[0] & 0xf0) == 0xe0)
|
||||||
{
|
{
|
||||||
/* 3-byte form. */
|
/* 3-byte form. */
|
||||||
byte = scm_get_byte_or_eof (port);
|
byte = scm_peek_byte_or_eof (port);
|
||||||
if (SCM_UNLIKELY (byte == EOF))
|
ASSERT_NOT_EOF (byte);
|
||||||
goto invalid_seq;
|
|
||||||
|
|
||||||
buf[1] = (scm_t_uint8) byte;
|
|
||||||
*len = 2;
|
|
||||||
|
|
||||||
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80
|
||||||
|| (buf[0] == 0xe0 && byte < 0xa0)
|
|| (buf[0] == 0xe0 && byte < 0xa0)
|
||||||
|| (buf[0] == 0xed && byte > 0x9f)))
|
|| (buf[0] == 0xed && byte > 0x9f)))
|
||||||
{
|
|
||||||
/* Swallow the 3rd byte. */
|
|
||||||
byte = scm_get_byte_or_eof (port);
|
|
||||||
ASSERT_NOT_EOF (byte);
|
|
||||||
*len = 3, buf[2] = byte;
|
|
||||||
goto invalid_seq;
|
goto invalid_seq;
|
||||||
}
|
|
||||||
|
|
||||||
|
CONSUME_PEEKED_BYTE ();
|
||||||
|
buf[1] = (scm_t_uint8) byte;
|
||||||
|
*len = 2;
|
||||||
|
|
||||||
byte = scm_get_byte_or_eof (port);
|
byte = scm_peek_byte_or_eof (port);
|
||||||
ASSERT_NOT_EOF (byte);
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
buf[2] = (scm_t_uint8) byte;
|
|
||||||
*len = 3;
|
|
||||||
|
|
||||||
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
||||||
goto invalid_seq;
|
goto invalid_seq;
|
||||||
|
|
||||||
|
CONSUME_PEEKED_BYTE ();
|
||||||
|
buf[2] = (scm_t_uint8) byte;
|
||||||
|
*len = 3;
|
||||||
|
|
||||||
*codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
|
*codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
|
||||||
| ((scm_t_wchar) buf[1] & 0x3f) << 6UL
|
| ((scm_t_wchar) buf[1] & 0x3f) << 6UL
|
||||||
| (buf[2] & 0x3f);
|
| (buf[2] & 0x3f);
|
||||||
|
@ -1198,51 +1197,38 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||||
else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
|
else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
|
||||||
{
|
{
|
||||||
/* 4-byte form. */
|
/* 4-byte form. */
|
||||||
byte = scm_get_byte_or_eof (port);
|
byte = scm_peek_byte_or_eof (port);
|
||||||
ASSERT_NOT_EOF (byte);
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
buf[1] = (scm_t_uint8) byte;
|
|
||||||
*len = 2;
|
|
||||||
|
|
||||||
if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
|
if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
|
||||||
|| (buf[0] == 0xf0 && byte < 0x90)
|
|| (buf[0] == 0xf0 && byte < 0x90)
|
||||||
|| (buf[0] == 0xf4 && byte > 0x8f)))
|
|| (buf[0] == 0xf4 && byte > 0x8f)))
|
||||||
{
|
|
||||||
/* Swallow the 3rd and 4th bytes. */
|
|
||||||
byte = scm_get_byte_or_eof (port);
|
|
||||||
ASSERT_NOT_EOF (byte);
|
|
||||||
*len = 3, buf[2] = byte;
|
|
||||||
|
|
||||||
byte = scm_get_byte_or_eof (port);
|
|
||||||
ASSERT_NOT_EOF (byte);
|
|
||||||
*len = 4, buf[3] = byte;
|
|
||||||
goto invalid_seq;
|
goto invalid_seq;
|
||||||
}
|
|
||||||
|
|
||||||
byte = scm_get_byte_or_eof (port);
|
CONSUME_PEEKED_BYTE ();
|
||||||
|
buf[1] = (scm_t_uint8) byte;
|
||||||
|
*len = 2;
|
||||||
|
|
||||||
|
byte = scm_peek_byte_or_eof (port);
|
||||||
ASSERT_NOT_EOF (byte);
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
||||||
|
goto invalid_seq;
|
||||||
|
|
||||||
|
CONSUME_PEEKED_BYTE ();
|
||||||
buf[2] = (scm_t_uint8) byte;
|
buf[2] = (scm_t_uint8) byte;
|
||||||
*len = 3;
|
*len = 3;
|
||||||
|
|
||||||
|
byte = scm_peek_byte_or_eof (port);
|
||||||
|
ASSERT_NOT_EOF (byte);
|
||||||
|
|
||||||
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
||||||
{
|
|
||||||
/* Swallow the 4th byte. */
|
|
||||||
byte = scm_get_byte_or_eof (port);
|
|
||||||
ASSERT_NOT_EOF (byte);
|
|
||||||
*len = 4, buf[3] = byte;
|
|
||||||
goto invalid_seq;
|
goto invalid_seq;
|
||||||
}
|
|
||||||
|
|
||||||
byte = scm_get_byte_or_eof (port);
|
|
||||||
ASSERT_NOT_EOF (byte);
|
|
||||||
|
|
||||||
|
CONSUME_PEEKED_BYTE ();
|
||||||
buf[3] = (scm_t_uint8) byte;
|
buf[3] = (scm_t_uint8) byte;
|
||||||
*len = 4;
|
*len = 4;
|
||||||
|
|
||||||
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
|
|
||||||
goto invalid_seq;
|
|
||||||
|
|
||||||
*codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
|
*codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
|
||||||
| ((scm_t_wchar) buf[1] & 0x3f) << 12UL
|
| ((scm_t_wchar) buf[1] & 0x3f) << 12UL
|
||||||
| ((scm_t_wchar) buf[2] & 0x3f) << 6UL
|
| ((scm_t_wchar) buf[2] & 0x3f) << 6UL
|
||||||
|
@ -1254,8 +1240,14 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
invalid_seq:
|
invalid_seq:
|
||||||
|
/* Here we could choose to consume the faulty byte when it's not a
|
||||||
|
valid starting byte, but it's not a requirement. What Section 3.9
|
||||||
|
of Unicode 6.0.0 mandates, though, is to not consume a byte that
|
||||||
|
would otherwise be a valid starting byte. */
|
||||||
|
|
||||||
return EILSEQ;
|
return EILSEQ;
|
||||||
|
|
||||||
|
#undef CONSUME_PEEKED_BYTE
|
||||||
#undef ASSERT_NOT_EOF
|
#undef ASSERT_NOT_EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -572,29 +572,40 @@
|
||||||
eof))
|
eof))
|
||||||
|
|
||||||
(test-decoding-error (#xc2 #x41 #x42) "UTF-8"
|
(test-decoding-error (#xc2 #x41 #x42) "UTF-8"
|
||||||
;; FIXME: This is the behavior of glibc/libiconv but it does not
|
;; Section 3.9 of Unicode 6.0.0 reads:
|
||||||
;; conform to the Unicode 6.0.0 recommendation: according to it,
|
|
||||||
;; the #\A should not be swallowed (Section 3.9 reads:
|
|
||||||
;; "If the converter encounters an ill-formed UTF-8 code unit
|
;; "If the converter encounters an ill-formed UTF-8 code unit
|
||||||
;; sequence which starts with a valid first byte, but which does
|
;; sequence which starts with a valid first byte, but which does
|
||||||
;; not continue with valid successor bytes (see Table 3-7), it
|
;; not continue with valid successor bytes (see Table 3-7), it
|
||||||
;; must not consume the successor bytes".)
|
;; must not consume the successor bytes".
|
||||||
|
;; Glibc/libiconv do not conform to it and instead swallow the
|
||||||
(error ;; 41: should be in the 80..BF range
|
;; #x41. This example appears literally in Section 3.9.
|
||||||
|
(error ;; 41: invalid successor
|
||||||
|
#\A ;; 41: valid starting byte
|
||||||
#\B
|
#\B
|
||||||
eof))
|
eof))
|
||||||
|
|
||||||
(test-decoding-error (#xe0 #x88 #x88) "UTF-8"
|
(test-decoding-error (#xf0 #x80 #x80 #x41) "UTF-8"
|
||||||
|
;; According to Unicode 6.0.0, Section 3.9, "the only formal
|
||||||
|
;; requirement mandated by Unicode conformance for a converter is
|
||||||
|
;; that the <41> be processed and correctly interpreted as
|
||||||
|
;; <U+0041>".
|
||||||
(error ;; 2nd byte should be in the A0..BF range
|
(error ;; 2nd byte should be in the 90..BF range
|
||||||
|
error ;; 80: not a valid starting byte
|
||||||
|
error ;; 80: not a valid starting byte
|
||||||
|
#\A
|
||||||
eof))
|
eof))
|
||||||
|
|
||||||
(test-decoding-error (#xe0 #xa0 #x41 #x42) "UTF-8"
|
(test-decoding-error (#xe0 #xa0 #x41 #x42) "UTF-8"
|
||||||
(error ;; 3rd byte should be in the 80..BF range
|
(error ;; 3rd byte should be in the 80..BF range
|
||||||
|
#\A
|
||||||
#\B
|
#\B
|
||||||
eof))
|
eof))
|
||||||
|
|
||||||
(test-decoding-error (#xf0 #x88 #x88 #x88) "UTF-8"
|
(test-decoding-error (#xf0 #x88 #x88 #x88) "UTF-8"
|
||||||
(error ;; 2nd byte should be in the 90..BF range
|
(error ;; 2nd byte should be in the 90..BF range
|
||||||
|
error ;; 88: not a valid starting byte
|
||||||
|
error ;; 88: not a valid starting byte
|
||||||
|
error ;; 88: not a valid starting byte
|
||||||
eof))))
|
eof))))
|
||||||
|
|
||||||
(with-test-prefix "call-with-output-string"
|
(with-test-prefix "call-with-output-string"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue