Special-case UTF-8 ports to bypass `iconv' entirely.

* libguile/ports.c (update_port_lf): Handle EOF. (get_utf8_codepoint, get_iconv_codepoint): New functions. (get_codepoint): Use them. (scm_i_set_port_encoding_x): Don't open conversion descriptors when ENCODING is "UTF-8". * libguile/print.c (display_string_as_utf8, display_string_using_iconv): New functions. (display_string): Use them. * test-suite/tests/ports.test ("string ports")[#xc2 #x41 #x42]: Add a note that this behavior does not conform to the Unicode 6.0.0 recommendation.
2025-07-12 12:10:30 +02:00 · 2011-05-06 17:54:09 +02:00 · 2011-05-06 17:54:09 +02:00 · 7b292a9d34
commit 7b292a9d34
parent 1f78c6691f
3 changed files with 287 additions and 63 deletions
--- a/libguile/ports.c
+++ b/libguile/ports.c
@ -1057,6 +1057,7 @@ update_port_lf (scm_t_wchar c, SCM port)
  switch (c)
    {
    case '\a':
    case EOF:
      break;
    case '\b':
      SCM_DECCOL (port);
@ -1115,23 +1116,162 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
  return codepoint;
 }
-/* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
+/* Read a UTF-8 sequence from PORT.  On success, return 0 and set
-   with the byte representation of the codepoint in PORT's encoding, and
+   *CODEPOINT to the codepoint that was read, fill BUF with its UTF-8
-   set *LEN to the length in bytes of that representation.  Return 0 on
+   representation, and set *LEN to the length in bytes.  Return
-   success and an errno value on error.  */
+   `EILSEQ' on error.  */
 static int
-get_codepoint (SCM port, scm_t_wchar *codepoint,
+get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
-	       char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+		    scm_t_uint8 buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
 {
 /* Treat EOF in the middle of a multi-byte sequence as an invalid
    sequence.  */
 #define ASSERT_NOT_EOF(b)			\
  if (SCM_UNLIKELY ((b) == EOF))		\
    goto invalid_seq
  int byte;
  *len = 0;
  byte = scm_get_byte_or_eof (port);
  if (byte == EOF)
    {
      /* EOF before any byte was read is not an error: it is reported
         through *CODEPOINT, and *LEN stays at 0.  */
      *codepoint = EOF;
      return 0;
    }
  buf[0] = (scm_t_uint8) byte;
  *len = 1;
  /* Dispatch on the lead byte.  Bytes 0x80..0xc1 and 0xf5..0xff match
     none of the branches below and are rejected by the final `else'.  */
  if (buf[0] <= 0x7f)
    /* 1-byte form.  */
    *codepoint = buf[0];
  else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
    {
      /* 2-byte form.  */
      byte = scm_get_byte_or_eof (port);
      ASSERT_NOT_EOF (byte);
      buf[1] = (scm_t_uint8) byte;
      *len = 2;
      /* A continuation byte must have the form 10xxxxxx.  Note that the
         offending byte has already been consumed and stored in BUF at
         this point.  */
      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
 	goto invalid_seq;
      *codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
 	| (buf[1] & 0x3f);
    }
  else if ((buf[0] & 0xf0) == 0xe0)
    {
      /* 3-byte form.  */
      byte = scm_get_byte_or_eof (port);
      if (SCM_UNLIKELY (byte == EOF))
 	goto invalid_seq;
      buf[1] = (scm_t_uint8) byte;
      *len = 2;
      /* Besides requiring a continuation byte, reject the ranges that
         would decode to an overlong form (0xe0 followed by a byte below
         0xa0, i.e. a value under U+0800) or to a surrogate (0xed
         followed by a byte above 0x9f, i.e. U+D800..U+DFFF).  */
      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80
 			|| (buf[0] == 0xe0 && byte < 0xa0)
 			|| (buf[0] == 0xed && byte > 0x9f)))
 	{
 	  /* Swallow the 3rd byte.  */
 	  byte = scm_get_byte_or_eof (port);
 	  ASSERT_NOT_EOF (byte);
 	  *len = 3, buf[2] = byte;
 	  goto invalid_seq;
 	}
      byte = scm_get_byte_or_eof (port);
      ASSERT_NOT_EOF (byte);
      buf[2] = (scm_t_uint8) byte;
      *len = 3;
      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
 	goto invalid_seq;
      *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
 	| ((scm_t_wchar) buf[1] & 0x3f) << 6UL
 	| (buf[2] & 0x3f);
    }
  else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
    {
      /* 4-byte form.  */
      byte = scm_get_byte_or_eof (port);
      ASSERT_NOT_EOF (byte);
      buf[1] = (scm_t_uint8) byte;
      *len = 2;
      /* Besides requiring a continuation byte, reject overlong forms
         (0xf0 followed by a byte below 0x90, i.e. a value under
         U+10000) and values above U+10FFFF (0xf4 followed by a byte
         above 0x8f).  */
      if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
 			|| (buf[0] == 0xf0 && byte < 0x90)
 			|| (buf[0] == 0xf4 && byte > 0x8f)))
 	{
 	  /* Swallow the 3rd and 4th bytes.  */
 	  byte = scm_get_byte_or_eof (port);
 	  ASSERT_NOT_EOF (byte);
 	  *len = 3, buf[2] = byte;
 	  byte = scm_get_byte_or_eof (port);
 	  ASSERT_NOT_EOF (byte);
 	  *len = 4, buf[3] = byte;
 	  goto invalid_seq;
 	}
      byte = scm_get_byte_or_eof (port);
      ASSERT_NOT_EOF (byte);
      buf[2] = (scm_t_uint8) byte;
      *len = 3;
      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
 	{
 	  /* Swallow the 4th byte.  */
 	  byte = scm_get_byte_or_eof (port);
 	  ASSERT_NOT_EOF (byte);
 	  *len = 4, buf[3] = byte;
 	  goto invalid_seq;
 	}
      byte = scm_get_byte_or_eof (port);
      ASSERT_NOT_EOF (byte);
      buf[3] = (scm_t_uint8) byte;
      *len = 4;
      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
 	goto invalid_seq;
      *codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
 	| ((scm_t_wchar) buf[1] & 0x3f) << 12UL
 	| ((scm_t_wchar) buf[2] & 0x3f) << 6UL
 	| (buf[3] & 0x3f);
    }
  else
    goto invalid_seq;
  return 0;
  /* Error exit: *CODEPOINT is left untouched; BUF and *LEN describe the
     bytes that were consumed and stored before the failure.  */
 invalid_seq:
  return EILSEQ;
 #undef ASSERT_NOT_EOF
 }
 /* Likewise, read a byte sequence from PORT, passing it through its
   input conversion descriptor.  */
 static int
 get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
 		     char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
 {
  scm_t_port *pt;
  int err, byte_read;
  size_t bytes_consumed, output_size;
  char *output;
  scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
  scm_t_port *pt = SCM_PTAB_ENTRY (port);
-  if (SCM_UNLIKELY (pt->input_cd == (iconv_t) -1))
+  pt = SCM_PTAB_ENTRY (port);
    /* Initialize the conversion descriptors.  */
    scm_i_set_port_encoding_x (port, pt->encoding);
  for (output_size = 0, output = (char *) utf8_buf,
 	 bytes_consumed = 0, err = 0;
@ -1177,30 +1317,45 @@ get_codepoint (SCM port, scm_t_wchar *codepoint,
  if (SCM_UNLIKELY (output_size == 0))
    /* An unterminated sequence.  */
    err = EILSEQ;
-
+  else if (SCM_LIKELY (err == 0))
  if (SCM_UNLIKELY (err != 0))
    {
      /* Reset the `iconv' state.  */
      iconv (pt->input_cd, NULL, NULL, NULL, NULL);
      if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
 	{
 	  *codepoint = '?';
 	  err = 0;
 	}
      /* Fail when the strategy is SCM_ICONVEH_ERROR or
 	 SCM_ICONVEH_ESCAPE_SEQUENCE (the latter doesn't make sense for
 	 input encoding errors.)  */
    }
  else
    {
      /* Convert the UTF8_BUF sequence to a Unicode code point.  */
      *codepoint = utf8_to_codepoint (utf8_buf, output_size);
-      update_port_lf (*codepoint, port);
+      *len = bytes_consumed;
    }
-  *len = bytes_consumed;
+  return err;
 }
 /* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
   with the byte representation of the codepoint in PORT's encoding, and
   set *LEN to the length in bytes of that representation.  Return 0 on
   success and an errno value on error.  */
 static int
 get_codepoint (SCM port, scm_t_wchar *codepoint,
 	       char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
 {
  int err;
  scm_t_port *pt = SCM_PTAB_ENTRY (port);
  if (pt->input_cd == (iconv_t) -1)
    /* Initialize the conversion descriptors, if needed.  */
    scm_i_set_port_encoding_x (port, pt->encoding);
  /* FIXME: In 2.1, add a flag to determine whether a port is UTF-8.  */
  /* An input descriptor that is still unset after the initialization
     attempt above is taken to mean that the port is UTF-8, which is
     decoded directly instead of going through `iconv'.  */
  if (pt->input_cd == (iconv_t) -1)
    err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len);
  else
    err = get_iconv_codepoint (port, codepoint, buf, len);
  /* On success, let update_port_lf account for the character that was
     read.  On a decoding error, the port's SCM_ICONVEH_QUESTION_MARK
     handler turns the error into a successful read of `?'; with any
     other handler ERR is returned to the caller as is.  */
  if (SCM_LIKELY (err == 0))
    update_port_lf (*codepoint, port);
  else if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
    {
      *codepoint = '?';
      err = 0;
      update_port_lf (*codepoint, port);
    }
  return err;
 }
@ -2031,28 +2186,35 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
  if (encoding == NULL)
    encoding = "ISO-8859-1";
-  pt->encoding = scm_gc_strdup (encoding, "port");
+  if (pt->encoding != encoding)
    pt->encoding = scm_gc_strdup (encoding, "port");
-  if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+  /* If ENCODING is UTF-8, then no conversion descriptor is opened
     because we do I/O ourselves.  This saves 100+ KiB for each
     descriptor.  */
  if (strcmp (encoding, "UTF-8"))
    {
-      /* Open an input iconv conversion descriptor, from ENCODING
+      if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
 	 to UTF-8.  We choose UTF-8, not UTF-32, because iconv
 	 implementations can typically convert from anything to
 	 UTF-8, but not to UTF-32 (see
 	 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>).  */
      new_input_cd = iconv_open ("UTF-8", encoding);
      if (new_input_cd == (iconv_t) -1)
 	goto invalid_encoding;
    }
  if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
    {
      new_output_cd = iconv_open (encoding, "UTF-8");
      if (new_output_cd == (iconv_t) -1)
 	{
-	  if (new_input_cd != (iconv_t) -1)
+	  /* Open an input iconv conversion descriptor, from ENCODING
-	    iconv_close (new_input_cd);
+	     to UTF-8.  We choose UTF-8, not UTF-32, because iconv
-	  goto invalid_encoding;
+	     implementations can typically convert from anything to
 	     UTF-8, but not to UTF-32 (see
 	     <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>).  */
 	  new_input_cd = iconv_open ("UTF-8", encoding);
 	  if (new_input_cd == (iconv_t) -1)
 	    goto invalid_encoding;
 	}
      if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
 	{
 	  new_output_cd = iconv_open (encoding, "UTF-8");
 	  if (new_output_cd == (iconv_t) -1)
 	    {
 	      if (new_input_cd != (iconv_t) -1)
 		iconv_close (new_input_cd);
 	      goto invalid_encoding;
 	    }
 	}
    }
--- a/libguile/print.c
+++ b/libguile/print.c
@ -821,31 +821,57 @@ codepoint_to_utf8 (scm_t_wchar ch, scm_t_uint8 utf8[4])
  return len;
 }
 /* Display the LEN codepoints in STR to PORT according to STRATEGY;
   return the number of codepoints successfully displayed.  If NARROW_P,
   then STR is interpreted as a sequence of `char', denoting a Latin-1
   string; otherwise it's interpreted as a sequence of
   `scm_t_wchar'.  */
 static size_t
 display_string (const void *str, int narrow_p,
 		size_t len, SCM port,
 		scm_t_string_failed_conversion_handler strategy)
 {
 #define STR_REF(s, x)				\
  (narrow_p					\
   ? (scm_t_wchar) ((unsigned char *) (s))[x]	\
   : ((scm_t_wchar *) (s))[x])
 /* Write STR to PORT as UTF-8.  STR is a LEN-codepoint string; it is
   narrow if NARROW_P is true, wide otherwise.  Return LEN.  */
 static size_t
 display_string_as_utf8 (const void *str, int narrow_p, size_t len,
 			SCM port)
 {
  /* Number of codepoints of STR already written to PORT.  */
  size_t printed = 0;
  /* The string is converted and written in chunks, through a 256-byte
     stack buffer.  */
  while (len > printed)
    {
      size_t utf8_len, i;
      char *input, utf8_buf[256];
      /* Convert STR to UTF-8.  */
      /* The `utf8_len + 4' bound stops filling the buffer while there is
         still room for one more 4-byte encoding, the size of the output
         array that codepoint_to_utf8 takes.  STR_REF picks the narrow
         or wide element according to NARROW_P.  */
      for (i = printed, utf8_len = 0, input = utf8_buf;
 	   i < len && utf8_len + 4 < sizeof (utf8_buf);
 	   i++)
 	{
 	  utf8_len += codepoint_to_utf8 (STR_REF (str, i),
 					 (scm_t_uint8 *) input);
 	  input = utf8_buf + utf8_len;
 	}
      /* INPUT was successfully converted, entirely; print the
 	 result.  */
      scm_lfwrite (utf8_buf, utf8_len, port);
      /* I is the index of the first codepoint not yet converted, so
         this advances PRINTED to I.  */
      printed += i - printed;
    }
  assert (printed == len);
  return len;
 }
 /* Convert STR through PORT's output conversion descriptor and write the
   output to PORT.  Return the number of codepoints written.  */
 static size_t
 display_string_using_iconv (const void *str, int narrow_p, size_t len,
 			    SCM port,
 			    scm_t_string_failed_conversion_handler strategy)
 {
  size_t printed;
  scm_t_port *pt;
  pt = SCM_PTAB_ENTRY (port);
  if (SCM_UNLIKELY (pt->output_cd == (iconv_t) -1))
    /* Initialize the conversion descriptors.  */
    scm_i_set_port_encoding_x (port, pt->encoding);
  printed = 0;
  while (len > printed)
@ -928,7 +954,35 @@ display_string (const void *str, int narrow_p,
    }
  return printed;
 }
 #undef STR_REF
 /* Display the LEN codepoints in STR to PORT according to STRATEGY;
   return the number of codepoints successfully displayed.  If NARROW_P,
   then STR is interpreted as a sequence of `char', denoting a Latin-1
   string; otherwise it's interpreted as a sequence of
   `scm_t_wchar'.  */
 static size_t
 display_string (const void *str, int narrow_p,
 		size_t len, SCM port,
 		scm_t_string_failed_conversion_handler strategy)
 {
  scm_t_port *pt;
  pt = SCM_PTAB_ENTRY (port);
  if (pt->output_cd == (iconv_t) -1)
    /* Initialize the conversion descriptors, if needed.  */
    scm_i_set_port_encoding_x (port, pt->encoding);
  /* FIXME: In 2.1, add a flag to determine whether a port is UTF-8.  */
  /* An output descriptor that is still unset after the initialization
     attempt above is taken to mean that the port is UTF-8; the string
     is then encoded directly, and STRATEGY is not consulted.  Otherwise
     the conversion goes through the port's `iconv' descriptor.  */
  if (pt->output_cd == (iconv_t) -1)
    return display_string_as_utf8 (str, narrow_p, len, port);
  else
    return display_string_using_iconv (str, narrow_p, len,
 				       port, strategy);
 }
 /* Attempt to display CH to PORT according to STRATEGY.  Return non-zero
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@ -572,6 +572,14 @@
       eof))
    (test-decoding-error (#xc2 #x41 #x42) "UTF-8"
      ;; FIXME: This is the behavior of glibc/libiconv but it does not
      ;; conform to the Unicode 6.0.0 recommendation: according to it,
      ;; the #\A should not be swallowed (Section 3.9 reads:
      ;; "If the converter encounters an ill-formed UTF-8 code unit
      ;; sequence which starts with a valid first byte, but which does
      ;; not continue with valid successor bytes (see Table 3-7), it
      ;; must not consume the successor bytes".)
      (error                ;; 41: should be in the 80..BF range
       #\B
       eof))