Rewrite read-char', display', etc. using iconv calls instead of libunistring.

Thanks to Bruno Haible for his suggestions. See <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>, for details. * libguile/ports.c (register_finalizer_for_port): Always register a finalizer for PORT. (finalize_port): Close ENTRY->input_cd and ENTRY->output_cd. (scm_new_port_table_entry): Initialize the `input_cd' and `output_cd' fields. (utf8_to_codepoint): New function. (get_codepoint): Rewrite to use `iconv' instead of libunistring. (scm_i_set_port_encoding_x): Initialize the `input_cd' and `output_cd' fields. (update_port_lf): Move upward. Use `switch' instead of `if's. * libguile/ports.h (scm_t_port)[input_cd, output_cd]: New fields. * libguile/print.c (codepoint_to_utf8, display_string): New functions. (display_character): Use `display_string'. (write_combining_character): Likewise. (iprin1): Use `display_string' instead of `scm_lfwrite_str', and `display_character' instead of `scm_putc'. (write_character): Likewise. (write_character_escaped): New function. * test-suite/tests/encoding-escapes.test ("display output escapes")["Rashomon"]: Use lower-case escapes. ["fake escape"]: New test.
2025-05-20 11:40:18 +02:00 · 2011-01-23 00:18:27 +01:00 · 2011-01-23 00:18:27 +01:00 · f4bc4e5934
commit f4bc4e5934
parent 8e43ed5d0b
4 changed files with 483 additions and 311 deletions
--- a/libguile/ports.c
+++ b/libguile/ports.c
@ -1,5 +1,6 @@
-/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
- * 
+/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
+ *   2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+ *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 3 of
@ -30,6 +31,7 @@
 #include <errno.h>
 #include <fcntl.h>  /* for chsize on mingw */
 #include <assert.h>
+#include <iconv.h>
 #include <uniconv.h>
 #include <unistr.h>
 #include <striconveh.h>
@ -515,22 +517,21 @@ scm_i_pthread_mutex_t scm_i_port_table_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;

 static void finalize_port (GC_PTR, GC_PTR);

-/* Register a finalizer for PORT, if needed by its port type.  */
+/* Register a finalizer for PORT.  */
 static SCM_C_INLINE_KEYWORD void
 register_finalizer_for_port (SCM port)
 {
  long port_type;
+  GC_finalization_proc prev_finalizer;
+  GC_PTR prev_finalization_data;

  port_type = SCM_TC2PTOBNUM (SCM_CELL_TYPE (port));
-  if (scm_ptobs[port_type].free)
-    {
-      GC_finalization_proc prev_finalizer;
-      GC_PTR prev_finalization_data;

-      GC_REGISTER_FINALIZER_NO_ORDER (SCM2PTR (port), finalize_port, 0,
-				      &prev_finalizer,
-				      &prev_finalization_data);
-    }
+  /* Register a finalizer for PORT so that its iconv CDs get freed and
+     optionally its type's `free' function gets called.  */
+  GC_REGISTER_FINALIZER_NO_ORDER (SCM2PTR (port), finalize_port, 0,
+				  &prev_finalizer,
+				  &prev_finalization_data);
 }

 /* Finalize the object (a port) pointed to by PTR.  */
@ -550,6 +551,8 @@ finalize_port (GC_PTR ptr, GC_PTR data)
 	register_finalizer_for_port (port);
      else
 	{
+	  scm_t_port *entry;
+
 	  port_type = SCM_TC2PTOBNUM (SCM_CELL_TYPE (port));
 	  if (port_type >= scm_numptob)
 	    abort ();
@ -559,6 +562,13 @@ finalize_port (GC_PTR ptr, GC_PTR data)
 	       is for explicit `close-port' by user.  */
 	    scm_ptobs[port_type].free (port);

+	  entry = SCM_PTAB_ENTRY (port);
+
+	  if (entry->input_cd != (iconv_t) -1)
+	    iconv_close (entry->input_cd);
+	  if (entry->output_cd != (iconv_t) -1)
+	    iconv_close (entry->output_cd);
+
 	  SCM_SETSTREAM (port, 0);
 	  SCM_CLR_PORT_OPEN_FLAG (port);

@ -594,6 +604,11 @@ scm_new_port_table_entry (scm_t_bits tag)
    entry->encoding = NULL;
  else
    entry->encoding = scm_gc_strdup (enc, "port");
+
+  /* The conversion descriptors will be opened lazily.  */
+  entry->input_cd = (iconv_t) -1;
+  entry->output_cd = (iconv_t) -1;
+
  entry->ilseq_handler = scm_i_get_conversion_strategy (SCM_BOOL_F);

  SCM_SET_CELL_TYPE (z, tag);
@ -1028,100 +1043,11 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
 }
 #undef FUNC_NAME

-#define SCM_MBCHAR_BUF_SIZE (4)
-
-/* Read a codepoint from PORT and return it.  Fill BUF with the byte
-   representation of the codepoint in PORT's encoding, and set *LEN to
-   the length in bytes of that representation.  Raise an error on
-   failure.  */
-static scm_t_wchar
-get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+/* Update the line and column number of PORT after consumption of C.  */
+static inline void
+update_port_lf (scm_t_wchar c, SCM port)
 {
-  int c;
-  size_t bufcount = 0;
-  scm_t_uint32 result_buf;
-  scm_t_wchar codepoint = 0;
-  scm_t_uint32 *u32;
-  size_t u32len;
-  scm_t_port *pt = SCM_PTAB_ENTRY (port);
-
-  c = scm_get_byte_or_eof (port);
-  if (c == EOF)
-    return (scm_t_wchar) EOF;
-
-  buf[0] = c;
-  bufcount++;
-
-  if (pt->encoding == NULL)
-    {
-      /* The encoding is Latin-1: bytes are characters.  */
-      codepoint = (unsigned char) buf[0];
-      goto success;
-    }
-
-  for (;;)
-    {
-      u32len = sizeof (result_buf) / sizeof (scm_t_uint32);
-      u32 = u32_conv_from_encoding (pt->encoding,
-                                    (enum iconv_ilseq_handler) pt->ilseq_handler,
-				    buf, bufcount, NULL, &result_buf, &u32len);
-      if (u32 == NULL || u32len == 0)
-	{
-	  if (errno == ENOMEM)
-	    scm_memory_error ("Input decoding");
-
-	  /* Otherwise errno is EILSEQ or EINVAL, so perhaps more
-             bytes are needed.  Keep looping.  */
-	}
-      else
-	{
-	  /* Complete codepoint found. */
-	  codepoint = u32[0];
-
-	  if (SCM_UNLIKELY (u32 != &result_buf))
-	    /* libunistring up to 0.9.3 (included) would always heap-allocate
-	       the result even when a large-enough RESULT_BUF is supplied, see
-	       <http://lists.gnu.org/archive/html/bug-libunistring/2010-07/msg00003.html>.  */
-	    free (u32);
-
-	  goto success;
-	}
-
-      if (bufcount == SCM_MBCHAR_BUF_SIZE)
-	{
-	  /* We've read several bytes and didn't find a good
-	     codepoint.  Give up.  */
-	  goto failure;
-	}
-
-      c = scm_get_byte_or_eof (port);
-
-      if (c == EOF)
-	{
-	  /* EOF before a complete character was read.  Push it all
-	     back and return EOF. */
-	  while (bufcount > 0)
-	    {
-	      /* FIXME: this will probably cause errors in the port column. */
-	      scm_unget_byte (buf[bufcount-1], port);
-	      bufcount --;
-	    }
-          return EOF;
-	}
-      
-      if (c == '\n')
-	{
-          /* It is always invalid to have EOL in the middle of a
-             multibyte character.  */
-	  scm_unget_byte ('\n', port);
-	  goto failure;
-	}
-	
-      buf[bufcount++] = c;
-    }
-
- success:
-  switch (codepoint)
+  switch (c)
    {
    case '\a':
      break;
@ -1130,7 +1056,7 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
      break;
    case '\n':
      SCM_INCLINE (port);
-        break;
+      break;
    case '\r':
      SCM_ZEROCOL (port);
      break;
@ -1141,18 +1067,120 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
      SCM_INCCOL (port);
      break;
    }
+}

-  *len = bufcount;
+#define SCM_MBCHAR_BUF_SIZE (4)
+
+/* Convert the SIZE-byte UTF-8 sequence in UTF8_BUF to a codepoint.
+   UTF8_BUF is assumed to contain a valid UTF-8 sequence.  */
+static scm_t_wchar
+utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
+{
+  scm_t_wchar codepoint;
+
+  if (utf8_buf[0] <= 0x7f)
+    {
+      assert (size == 1);
+      codepoint = utf8_buf[0];
+    }
+  else if ((utf8_buf[0] & 0xe0) == 0xc0)
+    {
+      assert (size == 2);
+      codepoint = ((scm_t_wchar) utf8_buf[0] & 0x1f) << 6UL
+	| (utf8_buf[1] & 0x3f);
+    }
+  else if ((utf8_buf[0] & 0xf0) == 0xe0)
+    {
+      assert (size == 3);
+      codepoint = ((scm_t_wchar) utf8_buf[0] & 0x0f) << 12UL
+	| ((scm_t_wchar) utf8_buf[1] & 0x3f) << 6UL
+	| (utf8_buf[2] & 0x3f);
+    }
+  else
+    {
+      assert (size == 4);
+      codepoint = ((scm_t_wchar) utf8_buf[0] & 0x07) << 18UL
+	| ((scm_t_wchar) utf8_buf[1] & 0x3f) << 12UL
+	| ((scm_t_wchar) utf8_buf[2] & 0x3f) << 6UL
+	| (utf8_buf[3] & 0x3f);
+    }
+
+  return codepoint;
+}
+
+/* Read a codepoint from PORT and return it.  Fill BUF with the byte
+   representation of the codepoint in PORT's encoding, and set *LEN to
+   the length in bytes of that representation.  Raise an error on
+   failure.  */
+static scm_t_wchar
+get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+  int err, byte_read;
+  size_t bytes_consumed, output_size;
+  scm_t_wchar codepoint;
+  char *output;
+  scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
+  scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+  if (SCM_UNLIKELY (pt->input_cd == (iconv_t) -1))
+    /* Initialize the conversion descriptors.  */
+    scm_i_set_port_encoding_x (port, pt->encoding);
+
+  for (output_size = 0, output = (char *) utf8_buf,
+	 bytes_consumed = 0, err = 0;
+       err == 0 && output_size == 0
+	 && (bytes_consumed == 0 || byte_read != EOF);
+       bytes_consumed++)
+    {
+      char *input;
+      size_t input_left, output_left, done;
+
+      byte_read = scm_get_byte_or_eof (port);
+      if (byte_read == EOF)
+	{
+	  if (bytes_consumed == 0)
+	    return (scm_t_wchar) EOF;
+	  else
+	    continue;
+	}
+
+      buf[bytes_consumed] = byte_read;
+
+      input = buf;
+      input_left = bytes_consumed + 1;
+      output_left = sizeof (utf8_buf);
+
+      done = iconv (pt->input_cd, &input, &input_left,
+		    &output, &output_left);
+      if (done == (size_t) -1)
+	{
+	  err = errno;
+	  if (err == EINVAL)
+	    /* Missing input: keep trying.  */
+	    err = 0;
+	}
+      else
+	output_size = sizeof (utf8_buf) - output_left;
+    }
+
+  if (err != 0)
+    goto failure;
+
+  /* Convert the UTF8_BUF sequence to a Unicode code point.  */
+  codepoint = utf8_to_codepoint (utf8_buf, output_size);
+  update_port_lf (codepoint, port);
+
+  *len = bytes_consumed;

  return codepoint;

 failure:
  {
    char *err_buf;
-    SCM err_str = scm_i_make_string (bufcount, &err_buf);
-    memcpy (err_buf, buf, bufcount);
+    SCM err_str = scm_i_make_string (bytes_consumed, &err_buf);
+    memcpy (err_buf, buf, bytes_consumed);

-    if (errno == EILSEQ)
+    if (err == EILSEQ)
      scm_misc_error (NULL, "input encoding error for ~s: ~s",
 		      scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
 				  err_str));
@ -1205,23 +1233,6 @@ scm_fill_input (SCM port)
 * This function differs from scm_c_write; it updates port line and
 * column. */

-static void
-update_port_lf (scm_t_wchar c, SCM port)
-{
-  if (c == '\a')
-    ;                           /* Do nothing. */
-  else if (c == '\b')
-    SCM_DECCOL (port);
-  else if (c == '\n')
-    SCM_INCLINE (port);
-  else if (c == '\r')
-    SCM_ZEROCOL (port);
-  else if (c == '\t')
-    SCM_TABCOL (port);
-  else
-    SCM_INCCOL (port);
-}
-
 void
 scm_lfwrite (const char *ptr, size_t size, SCM port)
 {
@ -1278,6 +1289,7 @@ scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
 }

 /* Write a scheme string STR to PORT.  */
+/* FIXME: Get rid of it.  */
 void
 scm_lfwrite_str (SCM str, SCM port)
 {
@ -2060,12 +2072,7 @@ scm_i_set_port_encoding_x (SCM port, const char *enc)
    {
      valid_enc = find_valid_encoding (enc);
      if (valid_enc == NULL)
-        {
-          SCM err;
-          err = scm_from_locale_string (enc);
-          scm_misc_error (NULL, "invalid or unknown character encoding ~s",
-                          scm_list_1 (err));
-        }
+	goto invalid_encoding;
    }

  if (scm_is_false (port))
@ -2087,13 +2094,62 @@ scm_i_set_port_encoding_x (SCM port, const char *enc)
    }
  else
    {
+      iconv_t new_input_cd, new_output_cd;
+
+      new_input_cd = (iconv_t) -1;
+      new_output_cd = (iconv_t) -1;
+
      /* Set the character encoding for this port.  */
      pt = SCM_PTAB_ENTRY (port);
      if (valid_enc == NULL)
        pt->encoding = NULL;
      else
        pt->encoding = scm_gc_strdup (valid_enc, "port");
+
+      if (valid_enc == NULL)
+	valid_enc = "ISO-8859-1";
+
+      if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+	{
+	  /* Open an input iconv conversion descriptor, from VALID_ENC
+	     to UTF-8.  We choose UTF-8, not UTF-32, because iconv
+	     implementations can typically convert from anything to
+	     UTF-8, but not to UTF-32 (see
+	     <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>).  */
+	  new_input_cd = iconv_open ("UTF-8", valid_enc);
+	  if (new_input_cd == (iconv_t) -1)
+	    goto invalid_encoding;
+	}
+
+      if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
+	{
+	  new_output_cd = iconv_open (valid_enc, "UTF-8");
+	  if (new_output_cd == (iconv_t) -1)
+	    {
+	      if (new_input_cd != (iconv_t) -1)
+		iconv_close (new_input_cd);
+	      goto invalid_encoding;
+	    }
+	}
+
+      if (pt->input_cd != (iconv_t) -1)
+	iconv_close (pt->input_cd);
+      if (pt->output_cd != (iconv_t) -1)
+	iconv_close (pt->output_cd);
+
+      pt->input_cd = new_input_cd;
+      pt->output_cd = new_output_cd;
    }
+
+  return;
+
+ invalid_encoding:
+  {
+    SCM err;
+    err = scm_from_locale_string (enc);
+    scm_misc_error (NULL, "invalid or unknown character encoding ~s",
+		    scm_list_1 (err));
+  }
 }

 SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
--- a/libguile/ports.h
+++ b/libguile/ports.h
@ -3,7 +3,7 @@
 #ifndef SCM_PORTS_H
 #define SCM_PORTS_H

-/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
@ -109,6 +109,10 @@ typedef struct
  /* a buffer for un-read chars and strings.  */
  unsigned char *putback_buf;
  size_t putback_buf_size;        /* allocated size of putback_buf.  */
+
+  /* input/output iconv conversion descriptors */
+  void *input_cd;
+  void *output_cd;
 } scm_t_port;


--- a/libguile/print.c
+++ b/libguile/print.c
@ -1,5 +1,6 @@
-/* Copyright (C) 1995-1999,2000,2001, 2002, 2003, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
- * 
+/* Copyright (C) 1995-1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008,
+ *   2009, 2010, 2011 Free Software Foundation, Inc.
+ *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 3 of
@ -23,6 +24,10 @@
 #endif

 #include <errno.h>
+#include <iconv.h>
+#include <stdio.h>
+#include <assert.h>
+
 #include <uniconv.h>
 #include <unictype.h>

@ -56,10 +61,16 @@

 /* Character printers.  */

+static size_t display_string (const void *, int, size_t, SCM,
+			      scm_t_string_failed_conversion_handler);
+
 static int display_character (scm_t_wchar, SCM,
 			      scm_t_string_failed_conversion_handler);
+
 static void write_character (scm_t_wchar, SCM, int);

+static void write_character_escaped (scm_t_wchar, int, SCM);
+


 /* {Names of immediate symbols}
@ -541,16 +552,31 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
            {
              size_t len, i;

-              scm_putc ('"', port);
+              display_character ('"', port, iconveh_question_mark);
              len = scm_i_string_length (exp);
              for (i = 0; i < len; ++i)
 		write_character (scm_i_string_ref (exp, i), port, 1);

-              scm_putc ('"', port);
+              display_character ('"', port, iconveh_question_mark);
              scm_remember_upto_here_1 (exp);
            }
          else
-            scm_lfwrite_str (exp, port);
+	    {
+	      size_t len, printed;
+
+	      len = scm_i_string_length (exp);
+	      printed = display_string (scm_i_string_data (exp),
+					scm_i_is_narrow_string (exp),
+					len, port,
+					scm_i_get_conversion_strategy (port));
+	      if (SCM_UNLIKELY (printed < len))
+		/* FIXME: Provide the error location.  */
+		scm_encoding_error (__func__, errno,
+				    "cannot convert to output locale",
+				    "UTF-32", scm_i_get_port_encoding (port),
+				    exp);
+	    }
+
          scm_remember_upto_here_1 (exp);
          break;
 	case scm_tc7_symbol:
@ -740,6 +766,154 @@ scm_prin1 (SCM exp, SCM port, int writingp)
    }
 }

+/* Convert codepoint CH to UTF-8 and store the result in UTF8.  Return
+   the number of bytes of the UTF-8-encoded string.  */
+static size_t
+codepoint_to_utf8 (scm_t_wchar ch, scm_t_uint8 utf8[4])
+{
+  size_t len;
+  scm_t_uint32 codepoint;
+
+  codepoint = (scm_t_uint32) ch;
+
+  if (codepoint <= 0x7f)
+    {
+      len = 1;
+      utf8[0] = (scm_t_uint8) codepoint;
+    }
+  else if (codepoint <= 0x7ffUL)
+    {
+      len = 2;
+      utf8[0] = 0xc0 | (codepoint >> 6);
+      utf8[1] = 0x80 | (codepoint & 0x3f);
+    }
+  else if (codepoint <= 0xffffUL)
+    {
+      len = 3;
+      utf8[0] = 0xe0 | (codepoint >> 12);
+      utf8[1] = 0x80 | ((codepoint >> 6) & 0x3f);
+      utf8[2] = 0x80 | (codepoint & 0x3f);
+    }
+  else
+    {
+      len = 4;
+      utf8[0] = 0xf0 | (codepoint >> 18);
+      utf8[1] = 0x80 | ((codepoint >> 12) & 0x3f);
+      utf8[2] = 0x80 | ((codepoint >> 6) & 0x3f);
+      utf8[3] = 0x80 | (codepoint & 0x3f);
+    }
+
+  return len;
+}
+
+/* Display the LEN codepoints in STR to PORT according to STRATEGY;
+   return the number of codepoints successfully displayed.  If NARROW_P,
+   then STR is interpreted as a sequence of `char', denoting a Latin-1
+   string; otherwise it's interpreted as a sequence of
+   `scm_t_wchar'.  */
+static size_t
+display_string (const void *str, int narrow_p,
+		size_t len, SCM port,
+		scm_t_string_failed_conversion_handler strategy)
+
+{
+#define STR_REF(s, x)				\
+  (narrow_p					\
+   ? (scm_t_wchar) ((unsigned char *) (s))[x]	\
+   : ((scm_t_wchar *) (s))[x])
+
+  size_t printed;
+  scm_t_port *pt;
+
+  pt = SCM_PTAB_ENTRY (port);
+
+  if (SCM_UNLIKELY (pt->output_cd == (iconv_t) -1))
+    /* Initialize the conversion descriptors.  */
+    scm_i_set_port_encoding_x (port, pt->encoding);
+
+  printed = 0;
+
+  while (len > printed)
+    {
+      size_t done, utf8_len, input_left, output_left, i;
+      size_t codepoints_read, output_len;
+      char *input, *output;
+      char utf8_buf[256], encoded_output[256];
+      size_t offsets[256];
+
+      /* Convert STR to UTF-8.  */
+      for (i = printed, utf8_len = 0, input = utf8_buf;
+	   i < len && utf8_len + 4 < sizeof (utf8_buf);
+	   i++)
+	{
+	  offsets[utf8_len] = i;
+	  utf8_len += codepoint_to_utf8 (STR_REF (str, i),
+					 (scm_t_uint8 *) input);
+	  input = utf8_buf + utf8_len;
+	}
+
+      input = utf8_buf;
+      input_left = utf8_len;
+
+      output = encoded_output;
+      output_left = sizeof (encoded_output);
+
+      done = iconv (pt->output_cd, &input, &input_left,
+		    &output, &output_left);
+
+      output_len = sizeof (encoded_output) - output_left;
+
+      if (SCM_UNLIKELY (done == (size_t) -1))
+	{
+	  /* Reset the `iconv' state.  */
+	  iconv (pt->output_cd, NULL, NULL, NULL, NULL);
+
+	  if (errno == EILSEQ &&
+	      strategy != SCM_FAILED_CONVERSION_ERROR)
+	    {
+	      /* Conversion failed somewhere in INPUT and we want to
+		 escape or substitute the offending input character.  */
+
+	      /* Print the OUTPUT_LEN bytes successfully converted.  */
+	      scm_lfwrite (encoded_output, output_len, port);
+
+	      /* See how many input codepoints these OUTPUT_LEN bytes
+		 corresponds to.  */
+	      codepoints_read = offsets[input - utf8_buf] - printed;
+	      printed += codepoints_read;
+
+	      if (strategy == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
+		{
+		  scm_t_wchar ch;
+
+		  /* Find CH, the offending codepoint, and escape it.  */
+		  ch = STR_REF (str, offsets[input - utf8_buf]);
+		  write_character_escaped (ch, 1, port);
+		}
+	      else
+		/* STRATEGY is `SCM_FAILED_CONVERSION_QUESTION_MARK'.  */
+		display_string ("?", 1, 1, port, strategy);
+
+	      printed++;
+	    }
+	  else
+	    /* Something bad happened that we can't handle: bail out.  */
+	    break;
+	}
+      else
+	{
+	  /* INPUT was successfully converted, entirely; print the
+	     result.  */
+	  scm_lfwrite (encoded_output, output_len, port);
+	  codepoints_read = i - printed;
+	  printed += codepoints_read;
+	}
+    }
+
+  return printed;
+#undef STR_REF
+}
+
 /* Attempt to display CH to PORT according to STRATEGY.  Return non-zero
   if CH was successfully displayed, zero otherwise (e.g., if it was not
   representable in PORT's encoding.)  */
@ -747,62 +921,7 @@ static int
 display_character (scm_t_wchar ch, SCM port,
 		   scm_t_string_failed_conversion_handler strategy)
 {
-  int printed;
-  const char *encoding;
-
-  encoding = scm_i_get_port_encoding (port);
-  if (encoding == NULL)
-    {
-      if (ch <= 0xff)
-	{
-	  scm_putc (ch, port);
-	  printed = 1;
-	}
-      else
-	printed = 0;
-    }
-  else
-    {
-      size_t len;
-      char locale_encoded[8 * sizeof (ch)], *result;
-
-      len = sizeof (locale_encoded);
-      result = u32_conv_to_encoding (encoding, strategy,
-				     (scm_t_uint32 *) &ch, 1,
-				     NULL, locale_encoded, &len);
-      if (result != NULL)
-	{
-	  /* CH is graphic; print it.  */
-
-	  if (strategy == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
-	    {
-	      /* Apply the same escaping syntax as in `write_character'.  */
-	      if (SCM_R6RS_ESCAPES_P)
-		{
-		  /* LOCALE_ENCODED is large enough to store an R6RS
-		     `\xNNNN;' escape sequence.  However, libunistring
-		     up to 0.9.3 (included) always returns a
-		     heap-allocated RESULT.  */
-		  if (SCM_UNLIKELY (result != locale_encoded))
-		    result = scm_realloc (result, len * 7);
-
-		  scm_i_unistring_escapes_to_r6rs_escapes (result, &len);
-		}
-	      else
-		scm_i_unistring_escapes_to_guile_escapes (result, &len);
-	    }
-
-	  scm_lfwrite (result, len, port);
-	  printed = 1;
-
-	  if (SCM_UNLIKELY (result != locale_encoded))
-	    free (result);
-	}
-      else
-	printed = 0;
-    }
-
-  return printed;
+  return display_string (&ch, 0, 1, port, strategy) == 1;
 }

 /* Attempt to pretty-print CH, a combining character, to PORT.  Return
@ -811,39 +930,101 @@ display_character (scm_t_wchar ch, SCM port,
 static int
 write_combining_character (scm_t_wchar ch, SCM port)
 {
-  int printed;
-  const char *encoding;
+  scm_t_wchar str[2];

-  encoding = scm_i_get_port_encoding (port);
-  if (encoding != NULL)
+  str[0] = SCM_CODEPOINT_DOTTED_CIRCLE;
+  str[1] = ch;
+
+  return display_string (str, 0, 2, port, iconveh_error) == 2;
+}
+
+/* Write CH to PORT in its escaped form, using the string escape syntax
+   if STRING_ESCAPES_P is non-zero.  */
+static void
+write_character_escaped (scm_t_wchar ch, int string_escapes_p, SCM port)
+{
+  if (string_escapes_p)
    {
-      scm_t_wchar str[2];
-      char locale_encoded[sizeof (str)], *result;
-      size_t len;
+      /* Represent CH using the in-string escape syntax.  */

-      str[0] = SCM_CODEPOINT_DOTTED_CIRCLE;
-      str[1] = ch;
+      static const char hex[] = "0123456789abcdef";
+      static const char escapes[7] = "abtnvfr";
+      char buf[9];

-      len = sizeof (locale_encoded);
-      result = u32_conv_to_encoding (encoding, iconveh_error,
-				     (scm_t_uint32 *) str, 2,
-				     NULL, locale_encoded, &len);
-      if (result != NULL)
+      if (ch >= 0x07 && ch <= 0x0D && ch != 0x0A)
 	{
-	  scm_lfwrite (result, len, port);
-	  printed = 1;
-	  if (SCM_UNLIKELY (result != locale_encoded))
-	    free (result);
+	  /* Use special escapes for some C0 controls.  */
+	  buf[0] = '\\';
+	  buf[1] = escapes[ch - 0x07];
+	  scm_lfwrite (buf, 2, port);
+	}
+      else if (!SCM_R6RS_ESCAPES_P)
+	{
+	  if (ch <= 0xFF)
+	    {
+	      buf[0] = '\\';
+	      buf[1] = 'x';
+	      buf[2] = hex[ch / 16];
+	      buf[3] = hex[ch % 16];
+	      scm_lfwrite (buf, 4, port);
+	    }
+	  else if (ch <= 0xFFFF)
+	    {
+	      buf[0] = '\\';
+	      buf[1] = 'u';
+	      buf[2] = hex[(ch & 0xF000) >> 12];
+	      buf[3] = hex[(ch & 0xF00) >> 8];
+	      buf[4] = hex[(ch & 0xF0) >> 4];
+	      buf[5] = hex[(ch & 0xF)];
+	      scm_lfwrite (buf, 6, port);
+	    }
+	  else if (ch > 0xFFFF)
+	    {
+	      buf[0] = '\\';
+	      buf[1] = 'U';
+	      buf[2] = hex[(ch & 0xF00000) >> 20];
+	      buf[3] = hex[(ch & 0xF0000) >> 16];
+	      buf[4] = hex[(ch & 0xF000) >> 12];
+	      buf[5] = hex[(ch & 0xF00) >> 8];
+	      buf[6] = hex[(ch & 0xF0) >> 4];
+	      buf[7] = hex[(ch & 0xF)];
+	      scm_lfwrite (buf, 8, port);
+	    }
 	}
      else
-	/* Can't write the result to PORT.  */
-	printed = 0;
+	{
+	  /* Print an R6RS variable-length hex escape: "\xNNNN;".  */
+	  scm_t_wchar ch2 = ch;
+
+	  int i = 8;
+	  buf[i] = ';';
+	  i --;
+	  if (ch == 0)
+	    buf[i--] = '0';
+	  else
+	    while (ch2 > 0)
+	      {
+		buf[i] = hex[ch2 & 0xF];
+		ch2 >>= 4;
+		i --;
+	      }
+	  buf[i] = 'x';
+	  i --;
+	  buf[i] = '\\';
+	  scm_lfwrite (buf + i, 9 - i, port);
+	}
    }
  else
-    /* PORT is Latin-1-encoded and can't display the fancy things.  */
-    printed = 0;
+    {
+      /* Represent CH using the character escape syntax.  */
+      const char *name;

-  return printed;
+      name = scm_i_charname (SCM_MAKE_CHAR (ch));
+      if (name != NULL)
+	scm_puts (name, port);
+      else
+	PRINT_CHAR_ESCAPE (ch, port);
+    }
 }

 /* Write CH to PORT, escaping it if it's non-graphic or not
@ -854,25 +1035,28 @@ static void
 write_character (scm_t_wchar ch, SCM port, int string_escapes_p)
 {
  int printed = 0;
+  scm_t_string_failed_conversion_handler strategy;
+
+  strategy = scm_i_get_conversion_strategy (port);

  if (string_escapes_p)
    {
      /* Check if CH deserves special treatment.  */
      if (ch == '"' || ch == '\\')
 	{
-	  scm_putc ('\\', port);
-	  scm_putc (ch, port);
+	  display_character ('\\', port, iconveh_question_mark);
+	  display_character (ch, port, strategy);
 	  printed = 1;
 	}
      else if (ch == ' ' || ch == '\n')
 	{
-	  scm_putc (ch, port);
+	  display_character (ch, port, strategy);
 	  printed = 1;
 	}
    }
  else
    {
-      scm_puts ("#\\", port);
+      display_string ("#\\", 1, 2, port, iconveh_question_mark);

      if (uc_combining_class (ch) != UC_CCC_NR)
 	/* Character is a combining character, so attempt to
@ -891,93 +1075,8 @@ write_character (scm_t_wchar ch, SCM port, int string_escapes_p)
    printed = display_character (ch, port, iconveh_error);

  if (!printed)
-    {
-      /* CH isn't graphic or cannot be represented in PORT's
-	 encoding.  */
-
-      if (string_escapes_p)
-	{
-	  /* Represent CH using the in-string escape syntax.  */
-
-	  static const char hex[] = "0123456789abcdef";
-          static const char escapes[7] = "abtnvfr";
-	  char buf[9];
-
-          if (ch >= 0x07 && ch <= 0x0D && ch != 0x0A)
-            {
-              /* Use special escapes for some C0 controls.  */
-              buf[0] = '\\';
-              buf[1] = escapes[ch - 0x07];
-              scm_lfwrite (buf, 2, port);
-            }
-          else if (!SCM_R6RS_ESCAPES_P)
-	    {
-	      if (ch <= 0xFF)
-		{
-		  buf[0] = '\\';
-		  buf[1] = 'x';
-		  buf[2] = hex[ch / 16];
-		  buf[3] = hex[ch % 16];
-		  scm_lfwrite (buf, 4, port);
-		}
-	      else if (ch <= 0xFFFF)
-		{
-		  buf[0] = '\\';
-		  buf[1] = 'u';
-		  buf[2] = hex[(ch & 0xF000) >> 12];
-		  buf[3] = hex[(ch & 0xF00) >> 8];
-		  buf[4] = hex[(ch & 0xF0) >> 4];
-		  buf[5] = hex[(ch & 0xF)];
-		  scm_lfwrite (buf, 6, port);
-		}
-	      else if (ch > 0xFFFF)
-		{
-		  buf[0] = '\\';
-		  buf[1] = 'U';
-		  buf[2] = hex[(ch & 0xF00000) >> 20];
-		  buf[3] = hex[(ch & 0xF0000) >> 16];
-		  buf[4] = hex[(ch & 0xF000) >> 12];
-		  buf[5] = hex[(ch & 0xF00) >> 8];
-		  buf[6] = hex[(ch & 0xF0) >> 4];
-		  buf[7] = hex[(ch & 0xF)];
-		  scm_lfwrite (buf, 8, port);
-		}
-	    }
-	  else
-	    {
-	      /* Print an R6RS variable-length hex escape: "\xNNNN;".  */
-	      scm_t_wchar ch2 = ch;
-
-	      int i = 8;
-	      buf[i] = ';';
-	      i --;
-	      if (ch == 0)
-		buf[i--] = '0';
-	      else
-		while (ch2 > 0)
-		  {
-		    buf[i] = hex[ch2 & 0xF];
-		    ch2 >>= 4;
-		    i --;
-		  }
-	      buf[i] = 'x';
-	      i --;
-	      buf[i] = '\\';
-	      scm_lfwrite (buf + i, 9 - i, port);
-	    }
-	}
-      else
-	{
-	  /* Represent CH using the character escape syntax.  */
-	  const char *name;
-
-	  name = scm_i_charname (SCM_MAKE_CHAR (ch));
-	  if (name != NULL)
-	    scm_puts (name, port);
-	  else
-	    PRINT_CHAR_ESCAPE (ch, port);
-	}
-    }
+    /* CH isn't graphic or cannot be represented in PORT's encoding.  */
+    write_character_escaped (ch, string_escapes_p, port);
 }

 /* Print an integer.
--- a/test-suite/tests/encoding-escapes.test
+++ b/test-suite/tests/encoding-escapes.test
@ -1,6 +1,6 @@
 ;;;; encoding-escapes.test --- test suite for Guile's string encodings -*- mode: scheme; coding: utf-8 -*-
 ;;;;
-;;;; Copyright (C) 2009, 2010 Free Software Foundation, Inc.
+;;;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
 ;;;;
 ;;;; This library is free software; you can redistribute it and/or
 ;;;; modify it under the terms of the GNU Lesser General Public
@ -118,8 +118,21 @@
      (set-port-encoding! pt "ASCII")
      (set-port-conversion-strategy! pt 'escape)
      (display s4 pt)
-      (string=? "\\u7F85\\u751F\\u9580"
-                (get-output-string pt)))))
+      (string=? "\\u7f85\\u751f\\u9580"
+                (get-output-string pt))))
+
+  (pass-if "fake escape"
+           ;; The input string below contains something that looks like
+           ;; an escape in libunistring syntax, but which should be left
+           ;; as is in the output.  See
+           ;; <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
+           ;; for background info.
+           (let ((pt (open-output-string)))
+	     (set-port-encoding! pt "ASCII")
+             (set-port-conversion-strategy! pt 'escape)
+             (display "λ -- \\u0012" pt)
+             (string=? "\\u03bb -- \\u0012"
+                       (get-output-string pt)))))

 (with-test-prefix "input escapes"