1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-05-20 11:40:18 +02:00

Rewrite read-char', display', etc. using iconv calls instead of libunistring.

Thanks to Bruno Haible for his suggestions.  See
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>,
for details.

* libguile/ports.c (register_finalizer_for_port): Always register a
  finalizer for PORT.
  (finalize_port): Close ENTRY->input_cd and ENTRY->output_cd.
  (scm_new_port_table_entry): Initialize the `input_cd' and `output_cd'
  fields.
  (utf8_to_codepoint): New function.
  (get_codepoint): Rewrite to use `iconv' instead of libunistring.
  (scm_i_set_port_encoding_x): Initialize the `input_cd' and `output_cd'
  fields.
  (update_port_lf): Move upward.  Use `switch' instead of `if's.

* libguile/ports.h (scm_t_port)[input_cd, output_cd]: New fields.

* libguile/print.c (codepoint_to_utf8, display_string): New functions.
  (display_character): Use `display_string'.
  (write_combining_character): Likewise.
  (iprin1): Use `display_string' instead of `scm_lfwrite_str', and
  `display_character' instead of `scm_putc'.
  (write_character): Likewise.
  (write_character_escaped): New function.

* test-suite/tests/encoding-escapes.test ("display output
  escapes")["Rashomon"]: Use lower-case escapes.
  ["fake escape"]: New test.
This commit is contained in:
Ludovic Courtès 2011-01-23 00:18:27 +01:00
parent 8e43ed5d0b
commit f4bc4e5934
4 changed files with 483 additions and 311 deletions

View file

@ -1,5 +1,6 @@
/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
*
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
* 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 3 of
@ -30,6 +31,7 @@
#include <errno.h>
#include <fcntl.h> /* for chsize on mingw */
#include <assert.h>
#include <iconv.h>
#include <uniconv.h>
#include <unistr.h>
#include <striconveh.h>
@ -515,22 +517,21 @@ scm_i_pthread_mutex_t scm_i_port_table_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
static void finalize_port (GC_PTR, GC_PTR);
/* Register a finalizer for PORT, if needed by its port type. */
/* Register a finalizer for PORT. */
static SCM_C_INLINE_KEYWORD void
register_finalizer_for_port (SCM port)
{
long port_type;
GC_finalization_proc prev_finalizer;
GC_PTR prev_finalization_data;
port_type = SCM_TC2PTOBNUM (SCM_CELL_TYPE (port));
if (scm_ptobs[port_type].free)
{
GC_finalization_proc prev_finalizer;
GC_PTR prev_finalization_data;
GC_REGISTER_FINALIZER_NO_ORDER (SCM2PTR (port), finalize_port, 0,
&prev_finalizer,
&prev_finalization_data);
}
/* Register a finalizer for PORT so that its iconv CDs get freed and
optionally its type's `free' function gets called. */
GC_REGISTER_FINALIZER_NO_ORDER (SCM2PTR (port), finalize_port, 0,
&prev_finalizer,
&prev_finalization_data);
}
/* Finalize the object (a port) pointed to by PTR. */
@ -550,6 +551,8 @@ finalize_port (GC_PTR ptr, GC_PTR data)
register_finalizer_for_port (port);
else
{
scm_t_port *entry;
port_type = SCM_TC2PTOBNUM (SCM_CELL_TYPE (port));
if (port_type >= scm_numptob)
abort ();
@ -559,6 +562,13 @@ finalize_port (GC_PTR ptr, GC_PTR data)
is for explicit `close-port' by user. */
scm_ptobs[port_type].free (port);
entry = SCM_PTAB_ENTRY (port);
if (entry->input_cd != (iconv_t) -1)
iconv_close (entry->input_cd);
if (entry->output_cd != (iconv_t) -1)
iconv_close (entry->output_cd);
SCM_SETSTREAM (port, 0);
SCM_CLR_PORT_OPEN_FLAG (port);
@ -594,6 +604,11 @@ scm_new_port_table_entry (scm_t_bits tag)
entry->encoding = NULL;
else
entry->encoding = scm_gc_strdup (enc, "port");
/* The conversion descriptors will be opened lazily. */
entry->input_cd = (iconv_t) -1;
entry->output_cd = (iconv_t) -1;
entry->ilseq_handler = scm_i_get_conversion_strategy (SCM_BOOL_F);
SCM_SET_CELL_TYPE (z, tag);
@ -1028,100 +1043,11 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
}
#undef FUNC_NAME
#define SCM_MBCHAR_BUF_SIZE (4)
/* Read a codepoint from PORT and return it. Fill BUF with the byte
representation of the codepoint in PORT's encoding, and set *LEN to
the length in bytes of that representation. Raise an error on
failure. */
static scm_t_wchar
get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
/* Update the line and column number of PORT after consumption of C. */
static inline void
update_port_lf (scm_t_wchar c, SCM port)
{
int c;
size_t bufcount = 0;
scm_t_uint32 result_buf;
scm_t_wchar codepoint = 0;
scm_t_uint32 *u32;
size_t u32len;
scm_t_port *pt = SCM_PTAB_ENTRY (port);
c = scm_get_byte_or_eof (port);
if (c == EOF)
return (scm_t_wchar) EOF;
buf[0] = c;
bufcount++;
if (pt->encoding == NULL)
{
/* The encoding is Latin-1: bytes are characters. */
codepoint = (unsigned char) buf[0];
goto success;
}
for (;;)
{
u32len = sizeof (result_buf) / sizeof (scm_t_uint32);
u32 = u32_conv_from_encoding (pt->encoding,
(enum iconv_ilseq_handler) pt->ilseq_handler,
buf, bufcount, NULL, &result_buf, &u32len);
if (u32 == NULL || u32len == 0)
{
if (errno == ENOMEM)
scm_memory_error ("Input decoding");
/* Otherwise errno is EILSEQ or EINVAL, so perhaps more
bytes are needed. Keep looping. */
}
else
{
/* Complete codepoint found. */
codepoint = u32[0];
if (SCM_UNLIKELY (u32 != &result_buf))
/* libunistring up to 0.9.3 (included) would always heap-allocate
the result even when a large-enough RESULT_BUF is supplied, see
<http://lists.gnu.org/archive/html/bug-libunistring/2010-07/msg00003.html>. */
free (u32);
goto success;
}
if (bufcount == SCM_MBCHAR_BUF_SIZE)
{
/* We've read several bytes and didn't find a good
codepoint. Give up. */
goto failure;
}
c = scm_get_byte_or_eof (port);
if (c == EOF)
{
/* EOF before a complete character was read. Push it all
back and return EOF. */
while (bufcount > 0)
{
/* FIXME: this will probably cause errors in the port column. */
scm_unget_byte (buf[bufcount-1], port);
bufcount --;
}
return EOF;
}
if (c == '\n')
{
/* It is always invalid to have EOL in the middle of a
multibyte character. */
scm_unget_byte ('\n', port);
goto failure;
}
buf[bufcount++] = c;
}
success:
switch (codepoint)
switch (c)
{
case '\a':
break;
@ -1130,7 +1056,7 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
break;
case '\n':
SCM_INCLINE (port);
break;
break;
case '\r':
SCM_ZEROCOL (port);
break;
@ -1141,18 +1067,120 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
SCM_INCCOL (port);
break;
}
}
*len = bufcount;
#define SCM_MBCHAR_BUF_SIZE (4)
/* Convert the SIZE-byte UTF-8 sequence in UTF8_BUF to a codepoint.
UTF8_BUF is assumed to contain a valid UTF-8 sequence. */
static scm_t_wchar
utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
{
scm_t_wchar codepoint;
if (utf8_buf[0] <= 0x7f)
{
assert (size == 1);
codepoint = utf8_buf[0];
}
else if ((utf8_buf[0] & 0xe0) == 0xc0)
{
assert (size == 2);
codepoint = ((scm_t_wchar) utf8_buf[0] & 0x1f) << 6UL
| (utf8_buf[1] & 0x3f);
}
else if ((utf8_buf[0] & 0xf0) == 0xe0)
{
assert (size == 3);
codepoint = ((scm_t_wchar) utf8_buf[0] & 0x0f) << 12UL
| ((scm_t_wchar) utf8_buf[1] & 0x3f) << 6UL
| (utf8_buf[2] & 0x3f);
}
else
{
assert (size == 4);
codepoint = ((scm_t_wchar) utf8_buf[0] & 0x07) << 18UL
| ((scm_t_wchar) utf8_buf[1] & 0x3f) << 12UL
| ((scm_t_wchar) utf8_buf[2] & 0x3f) << 6UL
| (utf8_buf[3] & 0x3f);
}
return codepoint;
}
/* Read a codepoint from PORT and return it. Fill BUF with the byte
representation of the codepoint in PORT's encoding, and set *LEN to
the length in bytes of that representation. Raise an error on
failure. */
static scm_t_wchar
get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
{
int err, byte_read;
size_t bytes_consumed, output_size;
scm_t_wchar codepoint;
char *output;
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
scm_t_port *pt = SCM_PTAB_ENTRY (port);
if (SCM_UNLIKELY (pt->input_cd == (iconv_t) -1))
/* Initialize the conversion descriptors. */
scm_i_set_port_encoding_x (port, pt->encoding);
for (output_size = 0, output = (char *) utf8_buf,
bytes_consumed = 0, err = 0;
err == 0 && output_size == 0
&& (bytes_consumed == 0 || byte_read != EOF);
bytes_consumed++)
{
char *input;
size_t input_left, output_left, done;
byte_read = scm_get_byte_or_eof (port);
if (byte_read == EOF)
{
if (bytes_consumed == 0)
return (scm_t_wchar) EOF;
else
continue;
}
buf[bytes_consumed] = byte_read;
input = buf;
input_left = bytes_consumed + 1;
output_left = sizeof (utf8_buf);
done = iconv (pt->input_cd, &input, &input_left,
&output, &output_left);
if (done == (size_t) -1)
{
err = errno;
if (err == EINVAL)
/* Missing input: keep trying. */
err = 0;
}
else
output_size = sizeof (utf8_buf) - output_left;
}
if (err != 0)
goto failure;
/* Convert the UTF8_BUF sequence to a Unicode code point. */
codepoint = utf8_to_codepoint (utf8_buf, output_size);
update_port_lf (codepoint, port);
*len = bytes_consumed;
return codepoint;
failure:
{
char *err_buf;
SCM err_str = scm_i_make_string (bufcount, &err_buf);
memcpy (err_buf, buf, bufcount);
SCM err_str = scm_i_make_string (bytes_consumed, &err_buf);
memcpy (err_buf, buf, bytes_consumed);
if (errno == EILSEQ)
if (err == EILSEQ)
scm_misc_error (NULL, "input encoding error for ~s: ~s",
scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
err_str));
@ -1205,23 +1233,6 @@ scm_fill_input (SCM port)
* This function differs from scm_c_write; it updates port line and
* column. */
static void
update_port_lf (scm_t_wchar c, SCM port)
{
if (c == '\a')
; /* Do nothing. */
else if (c == '\b')
SCM_DECCOL (port);
else if (c == '\n')
SCM_INCLINE (port);
else if (c == '\r')
SCM_ZEROCOL (port);
else if (c == '\t')
SCM_TABCOL (port);
else
SCM_INCCOL (port);
}
void
scm_lfwrite (const char *ptr, size_t size, SCM port)
{
@ -1278,6 +1289,7 @@ scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
}
/* Write a scheme string STR to PORT. */
/* FIXME: Get rid of it. */
void
scm_lfwrite_str (SCM str, SCM port)
{
@ -2060,12 +2072,7 @@ scm_i_set_port_encoding_x (SCM port, const char *enc)
{
valid_enc = find_valid_encoding (enc);
if (valid_enc == NULL)
{
SCM err;
err = scm_from_locale_string (enc);
scm_misc_error (NULL, "invalid or unknown character encoding ~s",
scm_list_1 (err));
}
goto invalid_encoding;
}
if (scm_is_false (port))
@ -2087,13 +2094,62 @@ scm_i_set_port_encoding_x (SCM port, const char *enc)
}
else
{
iconv_t new_input_cd, new_output_cd;
new_input_cd = (iconv_t) -1;
new_output_cd = (iconv_t) -1;
/* Set the character encoding for this port. */
pt = SCM_PTAB_ENTRY (port);
if (valid_enc == NULL)
pt->encoding = NULL;
else
pt->encoding = scm_gc_strdup (valid_enc, "port");
if (valid_enc == NULL)
valid_enc = "ISO-8859-1";
if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
{
/* Open an input iconv conversion descriptor, from VALID_ENC
to UTF-8. We choose UTF-8, not UTF-32, because iconv
implementations can typically convert from anything to
UTF-8, but not to UTF-32 (see
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
new_input_cd = iconv_open ("UTF-8", valid_enc);
if (new_input_cd == (iconv_t) -1)
goto invalid_encoding;
}
if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
{
new_output_cd = iconv_open (valid_enc, "UTF-8");
if (new_output_cd == (iconv_t) -1)
{
if (new_input_cd != (iconv_t) -1)
iconv_close (new_input_cd);
goto invalid_encoding;
}
}
if (pt->input_cd != (iconv_t) -1)
iconv_close (pt->input_cd);
if (pt->output_cd != (iconv_t) -1)
iconv_close (pt->output_cd);
pt->input_cd = new_input_cd;
pt->output_cd = new_output_cd;
}
return;
invalid_encoding:
{
SCM err;
err = scm_from_locale_string (enc);
scm_misc_error (NULL, "invalid or unknown character encoding ~s",
scm_list_1 (err));
}
}
SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,

View file

@ -3,7 +3,7 @@
#ifndef SCM_PORTS_H
#define SCM_PORTS_H
/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@ -109,6 +109,10 @@ typedef struct
/* a buffer for un-read chars and strings. */
unsigned char *putback_buf;
size_t putback_buf_size; /* allocated size of putback_buf. */
/* input/output iconv conversion descriptors */
void *input_cd;
void *output_cd;
} scm_t_port;

View file

@ -1,5 +1,6 @@
/* Copyright (C) 1995-1999,2000,2001, 2002, 2003, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
*
/* Copyright (C) 1995-1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008,
* 2009, 2010, 2011 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 3 of
@ -23,6 +24,10 @@
#endif
#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <assert.h>
#include <uniconv.h>
#include <unictype.h>
@ -56,10 +61,16 @@
/* Character printers. */
static size_t display_string (const void *, int, size_t, SCM,
scm_t_string_failed_conversion_handler);
static int display_character (scm_t_wchar, SCM,
scm_t_string_failed_conversion_handler);
static void write_character (scm_t_wchar, SCM, int);
static void write_character_escaped (scm_t_wchar, int, SCM);
/* {Names of immediate symbols}
@ -541,16 +552,31 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
{
size_t len, i;
scm_putc ('"', port);
display_character ('"', port, iconveh_question_mark);
len = scm_i_string_length (exp);
for (i = 0; i < len; ++i)
write_character (scm_i_string_ref (exp, i), port, 1);
scm_putc ('"', port);
display_character ('"', port, iconveh_question_mark);
scm_remember_upto_here_1 (exp);
}
else
scm_lfwrite_str (exp, port);
{
size_t len, printed;
len = scm_i_string_length (exp);
printed = display_string (scm_i_string_data (exp),
scm_i_is_narrow_string (exp),
len, port,
scm_i_get_conversion_strategy (port));
if (SCM_UNLIKELY (printed < len))
/* FIXME: Provide the error location. */
scm_encoding_error (__func__, errno,
"cannot convert to output locale",
"UTF-32", scm_i_get_port_encoding (port),
exp);
}
scm_remember_upto_here_1 (exp);
break;
case scm_tc7_symbol:
@ -740,6 +766,154 @@ scm_prin1 (SCM exp, SCM port, int writingp)
}
}
/* Convert codepoint CH to UTF-8 and store the result in UTF8. Return
the number of bytes of the UTF-8-encoded string. */
static size_t
codepoint_to_utf8 (scm_t_wchar ch, scm_t_uint8 utf8[4])
{
size_t len;
scm_t_uint32 codepoint;
codepoint = (scm_t_uint32) ch;
if (codepoint <= 0x7f)
{
len = 1;
utf8[0] = (scm_t_uint8) codepoint;
}
else if (codepoint <= 0x7ffUL)
{
len = 2;
utf8[0] = 0xc0 | (codepoint >> 6);
utf8[1] = 0x80 | (codepoint & 0x3f);
}
else if (codepoint <= 0xffffUL)
{
len = 3;
utf8[0] = 0xe0 | (codepoint >> 12);
utf8[1] = 0x80 | ((codepoint >> 6) & 0x3f);
utf8[2] = 0x80 | (codepoint & 0x3f);
}
else
{
len = 4;
utf8[0] = 0xf0 | (codepoint >> 18);
utf8[1] = 0x80 | ((codepoint >> 12) & 0x3f);
utf8[2] = 0x80 | ((codepoint >> 6) & 0x3f);
utf8[3] = 0x80 | (codepoint & 0x3f);
}
return len;
}
/* Display the LEN codepoints in STR to PORT according to STRATEGY;
return the number of codepoints successfully displayed. If NARROW_P,
then STR is interpreted as a sequence of `char', denoting a Latin-1
string; otherwise it's interpreted as a sequence of
`scm_t_wchar'. */
static size_t
display_string (const void *str, int narrow_p,
size_t len, SCM port,
scm_t_string_failed_conversion_handler strategy)
{
#define STR_REF(s, x) \
(narrow_p \
? (scm_t_wchar) ((unsigned char *) (s))[x] \
: ((scm_t_wchar *) (s))[x])
size_t printed;
scm_t_port *pt;
pt = SCM_PTAB_ENTRY (port);
if (SCM_UNLIKELY (pt->output_cd == (iconv_t) -1))
/* Initialize the conversion descriptors. */
scm_i_set_port_encoding_x (port, pt->encoding);
printed = 0;
while (len > printed)
{
size_t done, utf8_len, input_left, output_left, i;
size_t codepoints_read, output_len;
char *input, *output;
char utf8_buf[256], encoded_output[256];
size_t offsets[256];
/* Convert STR to UTF-8. */
for (i = printed, utf8_len = 0, input = utf8_buf;
i < len && utf8_len + 4 < sizeof (utf8_buf);
i++)
{
offsets[utf8_len] = i;
utf8_len += codepoint_to_utf8 (STR_REF (str, i),
(scm_t_uint8 *) input);
input = utf8_buf + utf8_len;
}
input = utf8_buf;
input_left = utf8_len;
output = encoded_output;
output_left = sizeof (encoded_output);
done = iconv (pt->output_cd, &input, &input_left,
&output, &output_left);
output_len = sizeof (encoded_output) - output_left;
if (SCM_UNLIKELY (done == (size_t) -1))
{
/* Reset the `iconv' state. */
iconv (pt->output_cd, NULL, NULL, NULL, NULL);
if (errno == EILSEQ &&
strategy != SCM_FAILED_CONVERSION_ERROR)
{
/* Conversion failed somewhere in INPUT and we want to
escape or substitute the offending input character. */
/* Print the OUTPUT_LEN bytes successfully converted. */
scm_lfwrite (encoded_output, output_len, port);
/* See how many input codepoints these OUTPUT_LEN bytes
corresponds to. */
codepoints_read = offsets[input - utf8_buf] - printed;
printed += codepoints_read;
if (strategy == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
{
scm_t_wchar ch;
/* Find CH, the offending codepoint, and escape it. */
ch = STR_REF (str, offsets[input - utf8_buf]);
write_character_escaped (ch, 1, port);
}
else
/* STRATEGY is `SCM_FAILED_CONVERSION_QUESTION_MARK'. */
display_string ("?", 1, 1, port, strategy);
printed++;
}
else
/* Something bad happened that we can't handle: bail out. */
break;
}
else
{
/* INPUT was successfully converted, entirely; print the
result. */
scm_lfwrite (encoded_output, output_len, port);
codepoints_read = i - printed;
printed += codepoints_read;
}
}
return printed;
#undef STR_REF
}
/* Attempt to display CH to PORT according to STRATEGY. Return non-zero
if CH was successfully displayed, zero otherwise (e.g., if it was not
representable in PORT's encoding.) */
@ -747,62 +921,7 @@ static int
display_character (scm_t_wchar ch, SCM port,
scm_t_string_failed_conversion_handler strategy)
{
int printed;
const char *encoding;
encoding = scm_i_get_port_encoding (port);
if (encoding == NULL)
{
if (ch <= 0xff)
{
scm_putc (ch, port);
printed = 1;
}
else
printed = 0;
}
else
{
size_t len;
char locale_encoded[8 * sizeof (ch)], *result;
len = sizeof (locale_encoded);
result = u32_conv_to_encoding (encoding, strategy,
(scm_t_uint32 *) &ch, 1,
NULL, locale_encoded, &len);
if (result != NULL)
{
/* CH is graphic; print it. */
if (strategy == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
{
/* Apply the same escaping syntax as in `write_character'. */
if (SCM_R6RS_ESCAPES_P)
{
/* LOCALE_ENCODED is large enough to store an R6RS
`\xNNNN;' escape sequence. However, libunistring
up to 0.9.3 (included) always returns a
heap-allocated RESULT. */
if (SCM_UNLIKELY (result != locale_encoded))
result = scm_realloc (result, len * 7);
scm_i_unistring_escapes_to_r6rs_escapes (result, &len);
}
else
scm_i_unistring_escapes_to_guile_escapes (result, &len);
}
scm_lfwrite (result, len, port);
printed = 1;
if (SCM_UNLIKELY (result != locale_encoded))
free (result);
}
else
printed = 0;
}
return printed;
return display_string (&ch, 0, 1, port, strategy) == 1;
}
/* Attempt to pretty-print CH, a combining character, to PORT. Return
@ -811,39 +930,101 @@ display_character (scm_t_wchar ch, SCM port,
static int
write_combining_character (scm_t_wchar ch, SCM port)
{
int printed;
const char *encoding;
scm_t_wchar str[2];
encoding = scm_i_get_port_encoding (port);
if (encoding != NULL)
str[0] = SCM_CODEPOINT_DOTTED_CIRCLE;
str[1] = ch;
return display_string (str, 0, 2, port, iconveh_error) == 2;
}
/* Write CH to PORT in its escaped form, using the string escape syntax
if STRING_ESCAPES_P is non-zero. */
static void
write_character_escaped (scm_t_wchar ch, int string_escapes_p, SCM port)
{
if (string_escapes_p)
{
scm_t_wchar str[2];
char locale_encoded[sizeof (str)], *result;
size_t len;
/* Represent CH using the in-string escape syntax. */
str[0] = SCM_CODEPOINT_DOTTED_CIRCLE;
str[1] = ch;
static const char hex[] = "0123456789abcdef";
static const char escapes[7] = "abtnvfr";
char buf[9];
len = sizeof (locale_encoded);
result = u32_conv_to_encoding (encoding, iconveh_error,
(scm_t_uint32 *) str, 2,
NULL, locale_encoded, &len);
if (result != NULL)
if (ch >= 0x07 && ch <= 0x0D && ch != 0x0A)
{
scm_lfwrite (result, len, port);
printed = 1;
if (SCM_UNLIKELY (result != locale_encoded))
free (result);
/* Use special escapes for some C0 controls. */
buf[0] = '\\';
buf[1] = escapes[ch - 0x07];
scm_lfwrite (buf, 2, port);
}
else if (!SCM_R6RS_ESCAPES_P)
{
if (ch <= 0xFF)
{
buf[0] = '\\';
buf[1] = 'x';
buf[2] = hex[ch / 16];
buf[3] = hex[ch % 16];
scm_lfwrite (buf, 4, port);
}
else if (ch <= 0xFFFF)
{
buf[0] = '\\';
buf[1] = 'u';
buf[2] = hex[(ch & 0xF000) >> 12];
buf[3] = hex[(ch & 0xF00) >> 8];
buf[4] = hex[(ch & 0xF0) >> 4];
buf[5] = hex[(ch & 0xF)];
scm_lfwrite (buf, 6, port);
}
else if (ch > 0xFFFF)
{
buf[0] = '\\';
buf[1] = 'U';
buf[2] = hex[(ch & 0xF00000) >> 20];
buf[3] = hex[(ch & 0xF0000) >> 16];
buf[4] = hex[(ch & 0xF000) >> 12];
buf[5] = hex[(ch & 0xF00) >> 8];
buf[6] = hex[(ch & 0xF0) >> 4];
buf[7] = hex[(ch & 0xF)];
scm_lfwrite (buf, 8, port);
}
}
else
/* Can't write the result to PORT. */
printed = 0;
{
/* Print an R6RS variable-length hex escape: "\xNNNN;". */
scm_t_wchar ch2 = ch;
int i = 8;
buf[i] = ';';
i --;
if (ch == 0)
buf[i--] = '0';
else
while (ch2 > 0)
{
buf[i] = hex[ch2 & 0xF];
ch2 >>= 4;
i --;
}
buf[i] = 'x';
i --;
buf[i] = '\\';
scm_lfwrite (buf + i, 9 - i, port);
}
}
else
/* PORT is Latin-1-encoded and can't display the fancy things. */
printed = 0;
{
/* Represent CH using the character escape syntax. */
const char *name;
return printed;
name = scm_i_charname (SCM_MAKE_CHAR (ch));
if (name != NULL)
scm_puts (name, port);
else
PRINT_CHAR_ESCAPE (ch, port);
}
}
/* Write CH to PORT, escaping it if it's non-graphic or not
@ -854,25 +1035,28 @@ static void
write_character (scm_t_wchar ch, SCM port, int string_escapes_p)
{
int printed = 0;
scm_t_string_failed_conversion_handler strategy;
strategy = scm_i_get_conversion_strategy (port);
if (string_escapes_p)
{
/* Check if CH deserves special treatment. */
if (ch == '"' || ch == '\\')
{
scm_putc ('\\', port);
scm_putc (ch, port);
display_character ('\\', port, iconveh_question_mark);
display_character (ch, port, strategy);
printed = 1;
}
else if (ch == ' ' || ch == '\n')
{
scm_putc (ch, port);
display_character (ch, port, strategy);
printed = 1;
}
}
else
{
scm_puts ("#\\", port);
display_string ("#\\", 1, 2, port, iconveh_question_mark);
if (uc_combining_class (ch) != UC_CCC_NR)
/* Character is a combining character, so attempt to
@ -891,93 +1075,8 @@ write_character (scm_t_wchar ch, SCM port, int string_escapes_p)
printed = display_character (ch, port, iconveh_error);
if (!printed)
{
/* CH isn't graphic or cannot be represented in PORT's
encoding. */
if (string_escapes_p)
{
/* Represent CH using the in-string escape syntax. */
static const char hex[] = "0123456789abcdef";
static const char escapes[7] = "abtnvfr";
char buf[9];
if (ch >= 0x07 && ch <= 0x0D && ch != 0x0A)
{
/* Use special escapes for some C0 controls. */
buf[0] = '\\';
buf[1] = escapes[ch - 0x07];
scm_lfwrite (buf, 2, port);
}
else if (!SCM_R6RS_ESCAPES_P)
{
if (ch <= 0xFF)
{
buf[0] = '\\';
buf[1] = 'x';
buf[2] = hex[ch / 16];
buf[3] = hex[ch % 16];
scm_lfwrite (buf, 4, port);
}
else if (ch <= 0xFFFF)
{
buf[0] = '\\';
buf[1] = 'u';
buf[2] = hex[(ch & 0xF000) >> 12];
buf[3] = hex[(ch & 0xF00) >> 8];
buf[4] = hex[(ch & 0xF0) >> 4];
buf[5] = hex[(ch & 0xF)];
scm_lfwrite (buf, 6, port);
}
else if (ch > 0xFFFF)
{
buf[0] = '\\';
buf[1] = 'U';
buf[2] = hex[(ch & 0xF00000) >> 20];
buf[3] = hex[(ch & 0xF0000) >> 16];
buf[4] = hex[(ch & 0xF000) >> 12];
buf[5] = hex[(ch & 0xF00) >> 8];
buf[6] = hex[(ch & 0xF0) >> 4];
buf[7] = hex[(ch & 0xF)];
scm_lfwrite (buf, 8, port);
}
}
else
{
/* Print an R6RS variable-length hex escape: "\xNNNN;". */
scm_t_wchar ch2 = ch;
int i = 8;
buf[i] = ';';
i --;
if (ch == 0)
buf[i--] = '0';
else
while (ch2 > 0)
{
buf[i] = hex[ch2 & 0xF];
ch2 >>= 4;
i --;
}
buf[i] = 'x';
i --;
buf[i] = '\\';
scm_lfwrite (buf + i, 9 - i, port);
}
}
else
{
/* Represent CH using the character escape syntax. */
const char *name;
name = scm_i_charname (SCM_MAKE_CHAR (ch));
if (name != NULL)
scm_puts (name, port);
else
PRINT_CHAR_ESCAPE (ch, port);
}
}
/* CH isn't graphic or cannot be represented in PORT's encoding. */
write_character_escaped (ch, string_escapes_p, port);
}
/* Print an integer.

View file

@ -1,6 +1,6 @@
;;;; encoding-escapes.test --- test suite for Guile's string encodings -*- mode: scheme; coding: utf-8 -*-
;;;;
;;;; Copyright (C) 2009, 2010 Free Software Foundation, Inc.
;;;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@ -118,8 +118,21 @@
(set-port-encoding! pt "ASCII")
(set-port-conversion-strategy! pt 'escape)
(display s4 pt)
(string=? "\\u7F85\\u751F\\u9580"
(get-output-string pt)))))
(string=? "\\u7f85\\u751f\\u9580"
(get-output-string pt))))
(pass-if "fake escape"
;; The input string below contains something that looks like
;; an escape in libunistring syntax, but which should be left
;; as is in the output. See
;; <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
;; for background info.
(let ((pt (open-output-string)))
(set-port-encoding! pt "ASCII")
(set-port-conversion-strategy! pt 'escape)
(display "λ -- \\u0012" pt)
(string=? "\\u03bb -- \\u0012"
(get-output-string pt)))))
(with-test-prefix "input escapes"