mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-05-20 11:40:18 +02:00
Rewrite read-char',
display', etc. using iconv calls instead of libunistring.
Thanks to Bruno Haible for his suggestions. See <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>, for details. * libguile/ports.c (register_finalizer_for_port): Always register a finalizer for PORT. (finalize_port): Close ENTRY->input_cd and ENTRY->output_cd. (scm_new_port_table_entry): Initialize the `input_cd' and `output_cd' fields. (utf8_to_codepoint): New function. (get_codepoint): Rewrite to use `iconv' instead of libunistring. (scm_i_set_port_encoding_x): Initialize the `input_cd' and `output_cd' fields. (update_port_lf): Move upward. Use `switch' instead of `if's. * libguile/ports.h (scm_t_port)[input_cd, output_cd]: New fields. * libguile/print.c (codepoint_to_utf8, display_string): New functions. (display_character): Use `display_string'. (write_combining_character): Likewise. (iprin1): Use `display_string' instead of `scm_lfwrite_str', and `display_character' instead of `scm_putc'. (write_character): Likewise. (write_character_escaped): New function. * test-suite/tests/encoding-escapes.test ("display output escapes")["Rashomon"]: Use lower-case escapes. ["fake escape"]: New test.
This commit is contained in:
parent
8e43ed5d0b
commit
f4bc4e5934
4 changed files with 483 additions and 311 deletions
320
libguile/ports.c
320
libguile/ports.c
|
@ -1,5 +1,6 @@
|
|||
/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
|
||||
*
|
||||
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
|
||||
* 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public License
|
||||
* as published by the Free Software Foundation; either version 3 of
|
||||
|
@ -30,6 +31,7 @@
|
|||
#include <errno.h>
|
||||
#include <fcntl.h> /* for chsize on mingw */
|
||||
#include <assert.h>
|
||||
#include <iconv.h>
|
||||
#include <uniconv.h>
|
||||
#include <unistr.h>
|
||||
#include <striconveh.h>
|
||||
|
@ -515,22 +517,21 @@ scm_i_pthread_mutex_t scm_i_port_table_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
|
|||
|
||||
static void finalize_port (GC_PTR, GC_PTR);
|
||||
|
||||
/* Register a finalizer for PORT, if needed by its port type. */
|
||||
/* Register a finalizer for PORT. */
|
||||
static SCM_C_INLINE_KEYWORD void
|
||||
register_finalizer_for_port (SCM port)
|
||||
{
|
||||
long port_type;
|
||||
GC_finalization_proc prev_finalizer;
|
||||
GC_PTR prev_finalization_data;
|
||||
|
||||
port_type = SCM_TC2PTOBNUM (SCM_CELL_TYPE (port));
|
||||
if (scm_ptobs[port_type].free)
|
||||
{
|
||||
GC_finalization_proc prev_finalizer;
|
||||
GC_PTR prev_finalization_data;
|
||||
|
||||
GC_REGISTER_FINALIZER_NO_ORDER (SCM2PTR (port), finalize_port, 0,
|
||||
&prev_finalizer,
|
||||
&prev_finalization_data);
|
||||
}
|
||||
/* Register a finalizer for PORT so that its iconv CDs get freed and
|
||||
optionally its type's `free' function gets called. */
|
||||
GC_REGISTER_FINALIZER_NO_ORDER (SCM2PTR (port), finalize_port, 0,
|
||||
&prev_finalizer,
|
||||
&prev_finalization_data);
|
||||
}
|
||||
|
||||
/* Finalize the object (a port) pointed to by PTR. */
|
||||
|
@ -550,6 +551,8 @@ finalize_port (GC_PTR ptr, GC_PTR data)
|
|||
register_finalizer_for_port (port);
|
||||
else
|
||||
{
|
||||
scm_t_port *entry;
|
||||
|
||||
port_type = SCM_TC2PTOBNUM (SCM_CELL_TYPE (port));
|
||||
if (port_type >= scm_numptob)
|
||||
abort ();
|
||||
|
@ -559,6 +562,13 @@ finalize_port (GC_PTR ptr, GC_PTR data)
|
|||
is for explicit `close-port' by user. */
|
||||
scm_ptobs[port_type].free (port);
|
||||
|
||||
entry = SCM_PTAB_ENTRY (port);
|
||||
|
||||
if (entry->input_cd != (iconv_t) -1)
|
||||
iconv_close (entry->input_cd);
|
||||
if (entry->output_cd != (iconv_t) -1)
|
||||
iconv_close (entry->output_cd);
|
||||
|
||||
SCM_SETSTREAM (port, 0);
|
||||
SCM_CLR_PORT_OPEN_FLAG (port);
|
||||
|
||||
|
@ -594,6 +604,11 @@ scm_new_port_table_entry (scm_t_bits tag)
|
|||
entry->encoding = NULL;
|
||||
else
|
||||
entry->encoding = scm_gc_strdup (enc, "port");
|
||||
|
||||
/* The conversion descriptors will be opened lazily. */
|
||||
entry->input_cd = (iconv_t) -1;
|
||||
entry->output_cd = (iconv_t) -1;
|
||||
|
||||
entry->ilseq_handler = scm_i_get_conversion_strategy (SCM_BOOL_F);
|
||||
|
||||
SCM_SET_CELL_TYPE (z, tag);
|
||||
|
@ -1028,100 +1043,11 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
|
|||
}
|
||||
#undef FUNC_NAME
|
||||
|
||||
#define SCM_MBCHAR_BUF_SIZE (4)
|
||||
|
||||
/* Read a codepoint from PORT and return it. Fill BUF with the byte
|
||||
representation of the codepoint in PORT's encoding, and set *LEN to
|
||||
the length in bytes of that representation. Raise an error on
|
||||
failure. */
|
||||
static scm_t_wchar
|
||||
get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
||||
/* Update the line and column number of PORT after consumption of C. */
|
||||
static inline void
|
||||
update_port_lf (scm_t_wchar c, SCM port)
|
||||
{
|
||||
int c;
|
||||
size_t bufcount = 0;
|
||||
scm_t_uint32 result_buf;
|
||||
scm_t_wchar codepoint = 0;
|
||||
scm_t_uint32 *u32;
|
||||
size_t u32len;
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
|
||||
c = scm_get_byte_or_eof (port);
|
||||
if (c == EOF)
|
||||
return (scm_t_wchar) EOF;
|
||||
|
||||
buf[0] = c;
|
||||
bufcount++;
|
||||
|
||||
if (pt->encoding == NULL)
|
||||
{
|
||||
/* The encoding is Latin-1: bytes are characters. */
|
||||
codepoint = (unsigned char) buf[0];
|
||||
goto success;
|
||||
}
|
||||
|
||||
for (;;)
|
||||
{
|
||||
u32len = sizeof (result_buf) / sizeof (scm_t_uint32);
|
||||
u32 = u32_conv_from_encoding (pt->encoding,
|
||||
(enum iconv_ilseq_handler) pt->ilseq_handler,
|
||||
buf, bufcount, NULL, &result_buf, &u32len);
|
||||
if (u32 == NULL || u32len == 0)
|
||||
{
|
||||
if (errno == ENOMEM)
|
||||
scm_memory_error ("Input decoding");
|
||||
|
||||
/* Otherwise errno is EILSEQ or EINVAL, so perhaps more
|
||||
bytes are needed. Keep looping. */
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Complete codepoint found. */
|
||||
codepoint = u32[0];
|
||||
|
||||
if (SCM_UNLIKELY (u32 != &result_buf))
|
||||
/* libunistring up to 0.9.3 (included) would always heap-allocate
|
||||
the result even when a large-enough RESULT_BUF is supplied, see
|
||||
<http://lists.gnu.org/archive/html/bug-libunistring/2010-07/msg00003.html>. */
|
||||
free (u32);
|
||||
|
||||
goto success;
|
||||
}
|
||||
|
||||
if (bufcount == SCM_MBCHAR_BUF_SIZE)
|
||||
{
|
||||
/* We've read several bytes and didn't find a good
|
||||
codepoint. Give up. */
|
||||
goto failure;
|
||||
}
|
||||
|
||||
c = scm_get_byte_or_eof (port);
|
||||
|
||||
if (c == EOF)
|
||||
{
|
||||
/* EOF before a complete character was read. Push it all
|
||||
back and return EOF. */
|
||||
while (bufcount > 0)
|
||||
{
|
||||
/* FIXME: this will probably cause errors in the port column. */
|
||||
scm_unget_byte (buf[bufcount-1], port);
|
||||
bufcount --;
|
||||
}
|
||||
return EOF;
|
||||
}
|
||||
|
||||
if (c == '\n')
|
||||
{
|
||||
/* It is always invalid to have EOL in the middle of a
|
||||
multibyte character. */
|
||||
scm_unget_byte ('\n', port);
|
||||
goto failure;
|
||||
}
|
||||
|
||||
buf[bufcount++] = c;
|
||||
}
|
||||
|
||||
success:
|
||||
switch (codepoint)
|
||||
switch (c)
|
||||
{
|
||||
case '\a':
|
||||
break;
|
||||
|
@ -1130,7 +1056,7 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
|||
break;
|
||||
case '\n':
|
||||
SCM_INCLINE (port);
|
||||
break;
|
||||
break;
|
||||
case '\r':
|
||||
SCM_ZEROCOL (port);
|
||||
break;
|
||||
|
@ -1141,18 +1067,120 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
|||
SCM_INCCOL (port);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
*len = bufcount;
|
||||
#define SCM_MBCHAR_BUF_SIZE (4)
|
||||
|
||||
/* Convert the SIZE-byte UTF-8 sequence in UTF8_BUF to a codepoint.
|
||||
UTF8_BUF is assumed to contain a valid UTF-8 sequence. */
|
||||
static scm_t_wchar
|
||||
utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
|
||||
{
|
||||
scm_t_wchar codepoint;
|
||||
|
||||
if (utf8_buf[0] <= 0x7f)
|
||||
{
|
||||
assert (size == 1);
|
||||
codepoint = utf8_buf[0];
|
||||
}
|
||||
else if ((utf8_buf[0] & 0xe0) == 0xc0)
|
||||
{
|
||||
assert (size == 2);
|
||||
codepoint = ((scm_t_wchar) utf8_buf[0] & 0x1f) << 6UL
|
||||
| (utf8_buf[1] & 0x3f);
|
||||
}
|
||||
else if ((utf8_buf[0] & 0xf0) == 0xe0)
|
||||
{
|
||||
assert (size == 3);
|
||||
codepoint = ((scm_t_wchar) utf8_buf[0] & 0x0f) << 12UL
|
||||
| ((scm_t_wchar) utf8_buf[1] & 0x3f) << 6UL
|
||||
| (utf8_buf[2] & 0x3f);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert (size == 4);
|
||||
codepoint = ((scm_t_wchar) utf8_buf[0] & 0x07) << 18UL
|
||||
| ((scm_t_wchar) utf8_buf[1] & 0x3f) << 12UL
|
||||
| ((scm_t_wchar) utf8_buf[2] & 0x3f) << 6UL
|
||||
| (utf8_buf[3] & 0x3f);
|
||||
}
|
||||
|
||||
return codepoint;
|
||||
}
|
||||
|
||||
/* Read a codepoint from PORT and return it. Fill BUF with the byte
|
||||
representation of the codepoint in PORT's encoding, and set *LEN to
|
||||
the length in bytes of that representation. Raise an error on
|
||||
failure. */
|
||||
static scm_t_wchar
|
||||
get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
||||
{
|
||||
int err, byte_read;
|
||||
size_t bytes_consumed, output_size;
|
||||
scm_t_wchar codepoint;
|
||||
char *output;
|
||||
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
|
||||
if (SCM_UNLIKELY (pt->input_cd == (iconv_t) -1))
|
||||
/* Initialize the conversion descriptors. */
|
||||
scm_i_set_port_encoding_x (port, pt->encoding);
|
||||
|
||||
for (output_size = 0, output = (char *) utf8_buf,
|
||||
bytes_consumed = 0, err = 0;
|
||||
err == 0 && output_size == 0
|
||||
&& (bytes_consumed == 0 || byte_read != EOF);
|
||||
bytes_consumed++)
|
||||
{
|
||||
char *input;
|
||||
size_t input_left, output_left, done;
|
||||
|
||||
byte_read = scm_get_byte_or_eof (port);
|
||||
if (byte_read == EOF)
|
||||
{
|
||||
if (bytes_consumed == 0)
|
||||
return (scm_t_wchar) EOF;
|
||||
else
|
||||
continue;
|
||||
}
|
||||
|
||||
buf[bytes_consumed] = byte_read;
|
||||
|
||||
input = buf;
|
||||
input_left = bytes_consumed + 1;
|
||||
output_left = sizeof (utf8_buf);
|
||||
|
||||
done = iconv (pt->input_cd, &input, &input_left,
|
||||
&output, &output_left);
|
||||
if (done == (size_t) -1)
|
||||
{
|
||||
err = errno;
|
||||
if (err == EINVAL)
|
||||
/* Missing input: keep trying. */
|
||||
err = 0;
|
||||
}
|
||||
else
|
||||
output_size = sizeof (utf8_buf) - output_left;
|
||||
}
|
||||
|
||||
if (err != 0)
|
||||
goto failure;
|
||||
|
||||
/* Convert the UTF8_BUF sequence to a Unicode code point. */
|
||||
codepoint = utf8_to_codepoint (utf8_buf, output_size);
|
||||
update_port_lf (codepoint, port);
|
||||
|
||||
*len = bytes_consumed;
|
||||
|
||||
return codepoint;
|
||||
|
||||
failure:
|
||||
{
|
||||
char *err_buf;
|
||||
SCM err_str = scm_i_make_string (bufcount, &err_buf);
|
||||
memcpy (err_buf, buf, bufcount);
|
||||
SCM err_str = scm_i_make_string (bytes_consumed, &err_buf);
|
||||
memcpy (err_buf, buf, bytes_consumed);
|
||||
|
||||
if (errno == EILSEQ)
|
||||
if (err == EILSEQ)
|
||||
scm_misc_error (NULL, "input encoding error for ~s: ~s",
|
||||
scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
|
||||
err_str));
|
||||
|
@ -1205,23 +1233,6 @@ scm_fill_input (SCM port)
|
|||
* This function differs from scm_c_write; it updates port line and
|
||||
* column. */
|
||||
|
||||
static void
|
||||
update_port_lf (scm_t_wchar c, SCM port)
|
||||
{
|
||||
if (c == '\a')
|
||||
; /* Do nothing. */
|
||||
else if (c == '\b')
|
||||
SCM_DECCOL (port);
|
||||
else if (c == '\n')
|
||||
SCM_INCLINE (port);
|
||||
else if (c == '\r')
|
||||
SCM_ZEROCOL (port);
|
||||
else if (c == '\t')
|
||||
SCM_TABCOL (port);
|
||||
else
|
||||
SCM_INCCOL (port);
|
||||
}
|
||||
|
||||
void
|
||||
scm_lfwrite (const char *ptr, size_t size, SCM port)
|
||||
{
|
||||
|
@ -1278,6 +1289,7 @@ scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
|
|||
}
|
||||
|
||||
/* Write a scheme string STR to PORT. */
|
||||
/* FIXME: Get rid of it. */
|
||||
void
|
||||
scm_lfwrite_str (SCM str, SCM port)
|
||||
{
|
||||
|
@ -2060,12 +2072,7 @@ scm_i_set_port_encoding_x (SCM port, const char *enc)
|
|||
{
|
||||
valid_enc = find_valid_encoding (enc);
|
||||
if (valid_enc == NULL)
|
||||
{
|
||||
SCM err;
|
||||
err = scm_from_locale_string (enc);
|
||||
scm_misc_error (NULL, "invalid or unknown character encoding ~s",
|
||||
scm_list_1 (err));
|
||||
}
|
||||
goto invalid_encoding;
|
||||
}
|
||||
|
||||
if (scm_is_false (port))
|
||||
|
@ -2087,13 +2094,62 @@ scm_i_set_port_encoding_x (SCM port, const char *enc)
|
|||
}
|
||||
else
|
||||
{
|
||||
iconv_t new_input_cd, new_output_cd;
|
||||
|
||||
new_input_cd = (iconv_t) -1;
|
||||
new_output_cd = (iconv_t) -1;
|
||||
|
||||
/* Set the character encoding for this port. */
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
if (valid_enc == NULL)
|
||||
pt->encoding = NULL;
|
||||
else
|
||||
pt->encoding = scm_gc_strdup (valid_enc, "port");
|
||||
|
||||
if (valid_enc == NULL)
|
||||
valid_enc = "ISO-8859-1";
|
||||
|
||||
if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
|
||||
{
|
||||
/* Open an input iconv conversion descriptor, from VALID_ENC
|
||||
to UTF-8. We choose UTF-8, not UTF-32, because iconv
|
||||
implementations can typically convert from anything to
|
||||
UTF-8, but not to UTF-32 (see
|
||||
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
|
||||
new_input_cd = iconv_open ("UTF-8", valid_enc);
|
||||
if (new_input_cd == (iconv_t) -1)
|
||||
goto invalid_encoding;
|
||||
}
|
||||
|
||||
if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
|
||||
{
|
||||
new_output_cd = iconv_open (valid_enc, "UTF-8");
|
||||
if (new_output_cd == (iconv_t) -1)
|
||||
{
|
||||
if (new_input_cd != (iconv_t) -1)
|
||||
iconv_close (new_input_cd);
|
||||
goto invalid_encoding;
|
||||
}
|
||||
}
|
||||
|
||||
if (pt->input_cd != (iconv_t) -1)
|
||||
iconv_close (pt->input_cd);
|
||||
if (pt->output_cd != (iconv_t) -1)
|
||||
iconv_close (pt->output_cd);
|
||||
|
||||
pt->input_cd = new_input_cd;
|
||||
pt->output_cd = new_output_cd;
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
invalid_encoding:
|
||||
{
|
||||
SCM err;
|
||||
err = scm_from_locale_string (enc);
|
||||
scm_misc_error (NULL, "invalid or unknown character encoding ~s",
|
||||
scm_list_1 (err));
|
||||
}
|
||||
}
|
||||
|
||||
SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#ifndef SCM_PORTS_H
|
||||
#define SCM_PORTS_H
|
||||
|
||||
/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
|
||||
/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public License
|
||||
|
@ -109,6 +109,10 @@ typedef struct
|
|||
/* a buffer for un-read chars and strings. */
|
||||
unsigned char *putback_buf;
|
||||
size_t putback_buf_size; /* allocated size of putback_buf. */
|
||||
|
||||
/* input/output iconv conversion descriptors */
|
||||
void *input_cd;
|
||||
void *output_cd;
|
||||
} scm_t_port;
|
||||
|
||||
|
||||
|
|
449
libguile/print.c
449
libguile/print.c
|
@ -1,5 +1,6 @@
|
|||
/* Copyright (C) 1995-1999,2000,2001, 2002, 2003, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
|
||||
*
|
||||
/* Copyright (C) 1995-1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008,
|
||||
* 2009, 2010, 2011 Free Software Foundation, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public License
|
||||
* as published by the Free Software Foundation; either version 3 of
|
||||
|
@ -23,6 +24,10 @@
|
|||
#endif
|
||||
|
||||
#include <errno.h>
|
||||
#include <iconv.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <uniconv.h>
|
||||
#include <unictype.h>
|
||||
|
||||
|
@ -56,10 +61,16 @@
|
|||
|
||||
/* Character printers. */
|
||||
|
||||
static size_t display_string (const void *, int, size_t, SCM,
|
||||
scm_t_string_failed_conversion_handler);
|
||||
|
||||
static int display_character (scm_t_wchar, SCM,
|
||||
scm_t_string_failed_conversion_handler);
|
||||
|
||||
static void write_character (scm_t_wchar, SCM, int);
|
||||
|
||||
static void write_character_escaped (scm_t_wchar, int, SCM);
|
||||
|
||||
|
||||
|
||||
/* {Names of immediate symbols}
|
||||
|
@ -541,16 +552,31 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
|
|||
{
|
||||
size_t len, i;
|
||||
|
||||
scm_putc ('"', port);
|
||||
display_character ('"', port, iconveh_question_mark);
|
||||
len = scm_i_string_length (exp);
|
||||
for (i = 0; i < len; ++i)
|
||||
write_character (scm_i_string_ref (exp, i), port, 1);
|
||||
|
||||
scm_putc ('"', port);
|
||||
display_character ('"', port, iconveh_question_mark);
|
||||
scm_remember_upto_here_1 (exp);
|
||||
}
|
||||
else
|
||||
scm_lfwrite_str (exp, port);
|
||||
{
|
||||
size_t len, printed;
|
||||
|
||||
len = scm_i_string_length (exp);
|
||||
printed = display_string (scm_i_string_data (exp),
|
||||
scm_i_is_narrow_string (exp),
|
||||
len, port,
|
||||
scm_i_get_conversion_strategy (port));
|
||||
if (SCM_UNLIKELY (printed < len))
|
||||
/* FIXME: Provide the error location. */
|
||||
scm_encoding_error (__func__, errno,
|
||||
"cannot convert to output locale",
|
||||
"UTF-32", scm_i_get_port_encoding (port),
|
||||
exp);
|
||||
}
|
||||
|
||||
scm_remember_upto_here_1 (exp);
|
||||
break;
|
||||
case scm_tc7_symbol:
|
||||
|
@ -740,6 +766,154 @@ scm_prin1 (SCM exp, SCM port, int writingp)
|
|||
}
|
||||
}
|
||||
|
||||
/* Convert codepoint CH to UTF-8 and store the result in UTF8. Return
|
||||
the number of bytes of the UTF-8-encoded string. */
|
||||
static size_t
|
||||
codepoint_to_utf8 (scm_t_wchar ch, scm_t_uint8 utf8[4])
|
||||
{
|
||||
size_t len;
|
||||
scm_t_uint32 codepoint;
|
||||
|
||||
codepoint = (scm_t_uint32) ch;
|
||||
|
||||
if (codepoint <= 0x7f)
|
||||
{
|
||||
len = 1;
|
||||
utf8[0] = (scm_t_uint8) codepoint;
|
||||
}
|
||||
else if (codepoint <= 0x7ffUL)
|
||||
{
|
||||
len = 2;
|
||||
utf8[0] = 0xc0 | (codepoint >> 6);
|
||||
utf8[1] = 0x80 | (codepoint & 0x3f);
|
||||
}
|
||||
else if (codepoint <= 0xffffUL)
|
||||
{
|
||||
len = 3;
|
||||
utf8[0] = 0xe0 | (codepoint >> 12);
|
||||
utf8[1] = 0x80 | ((codepoint >> 6) & 0x3f);
|
||||
utf8[2] = 0x80 | (codepoint & 0x3f);
|
||||
}
|
||||
else
|
||||
{
|
||||
len = 4;
|
||||
utf8[0] = 0xf0 | (codepoint >> 18);
|
||||
utf8[1] = 0x80 | ((codepoint >> 12) & 0x3f);
|
||||
utf8[2] = 0x80 | ((codepoint >> 6) & 0x3f);
|
||||
utf8[3] = 0x80 | (codepoint & 0x3f);
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/* Display the LEN codepoints in STR to PORT according to STRATEGY;
|
||||
return the number of codepoints successfully displayed. If NARROW_P,
|
||||
then STR is interpreted as a sequence of `char', denoting a Latin-1
|
||||
string; otherwise it's interpreted as a sequence of
|
||||
`scm_t_wchar'. */
|
||||
static size_t
|
||||
display_string (const void *str, int narrow_p,
|
||||
size_t len, SCM port,
|
||||
scm_t_string_failed_conversion_handler strategy)
|
||||
|
||||
{
|
||||
#define STR_REF(s, x) \
|
||||
(narrow_p \
|
||||
? (scm_t_wchar) ((unsigned char *) (s))[x] \
|
||||
: ((scm_t_wchar *) (s))[x])
|
||||
|
||||
size_t printed;
|
||||
scm_t_port *pt;
|
||||
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
|
||||
if (SCM_UNLIKELY (pt->output_cd == (iconv_t) -1))
|
||||
/* Initialize the conversion descriptors. */
|
||||
scm_i_set_port_encoding_x (port, pt->encoding);
|
||||
|
||||
printed = 0;
|
||||
|
||||
while (len > printed)
|
||||
{
|
||||
size_t done, utf8_len, input_left, output_left, i;
|
||||
size_t codepoints_read, output_len;
|
||||
char *input, *output;
|
||||
char utf8_buf[256], encoded_output[256];
|
||||
size_t offsets[256];
|
||||
|
||||
/* Convert STR to UTF-8. */
|
||||
for (i = printed, utf8_len = 0, input = utf8_buf;
|
||||
i < len && utf8_len + 4 < sizeof (utf8_buf);
|
||||
i++)
|
||||
{
|
||||
offsets[utf8_len] = i;
|
||||
utf8_len += codepoint_to_utf8 (STR_REF (str, i),
|
||||
(scm_t_uint8 *) input);
|
||||
input = utf8_buf + utf8_len;
|
||||
}
|
||||
|
||||
input = utf8_buf;
|
||||
input_left = utf8_len;
|
||||
|
||||
output = encoded_output;
|
||||
output_left = sizeof (encoded_output);
|
||||
|
||||
done = iconv (pt->output_cd, &input, &input_left,
|
||||
&output, &output_left);
|
||||
|
||||
output_len = sizeof (encoded_output) - output_left;
|
||||
|
||||
if (SCM_UNLIKELY (done == (size_t) -1))
|
||||
{
|
||||
/* Reset the `iconv' state. */
|
||||
iconv (pt->output_cd, NULL, NULL, NULL, NULL);
|
||||
|
||||
if (errno == EILSEQ &&
|
||||
strategy != SCM_FAILED_CONVERSION_ERROR)
|
||||
{
|
||||
/* Conversion failed somewhere in INPUT and we want to
|
||||
escape or substitute the offending input character. */
|
||||
|
||||
/* Print the OUTPUT_LEN bytes successfully converted. */
|
||||
scm_lfwrite (encoded_output, output_len, port);
|
||||
|
||||
/* See how many input codepoints these OUTPUT_LEN bytes
|
||||
corresponds to. */
|
||||
codepoints_read = offsets[input - utf8_buf] - printed;
|
||||
printed += codepoints_read;
|
||||
|
||||
if (strategy == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
|
||||
{
|
||||
scm_t_wchar ch;
|
||||
|
||||
/* Find CH, the offending codepoint, and escape it. */
|
||||
ch = STR_REF (str, offsets[input - utf8_buf]);
|
||||
write_character_escaped (ch, 1, port);
|
||||
}
|
||||
else
|
||||
/* STRATEGY is `SCM_FAILED_CONVERSION_QUESTION_MARK'. */
|
||||
display_string ("?", 1, 1, port, strategy);
|
||||
|
||||
printed++;
|
||||
}
|
||||
else
|
||||
/* Something bad happened that we can't handle: bail out. */
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* INPUT was successfully converted, entirely; print the
|
||||
result. */
|
||||
scm_lfwrite (encoded_output, output_len, port);
|
||||
codepoints_read = i - printed;
|
||||
printed += codepoints_read;
|
||||
}
|
||||
}
|
||||
|
||||
return printed;
|
||||
#undef STR_REF
|
||||
}
|
||||
|
||||
/* Attempt to display CH to PORT according to STRATEGY. Return non-zero
|
||||
if CH was successfully displayed, zero otherwise (e.g., if it was not
|
||||
representable in PORT's encoding.) */
|
||||
|
@ -747,62 +921,7 @@ static int
|
|||
display_character (scm_t_wchar ch, SCM port,
|
||||
scm_t_string_failed_conversion_handler strategy)
|
||||
{
|
||||
int printed;
|
||||
const char *encoding;
|
||||
|
||||
encoding = scm_i_get_port_encoding (port);
|
||||
if (encoding == NULL)
|
||||
{
|
||||
if (ch <= 0xff)
|
||||
{
|
||||
scm_putc (ch, port);
|
||||
printed = 1;
|
||||
}
|
||||
else
|
||||
printed = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t len;
|
||||
char locale_encoded[8 * sizeof (ch)], *result;
|
||||
|
||||
len = sizeof (locale_encoded);
|
||||
result = u32_conv_to_encoding (encoding, strategy,
|
||||
(scm_t_uint32 *) &ch, 1,
|
||||
NULL, locale_encoded, &len);
|
||||
if (result != NULL)
|
||||
{
|
||||
/* CH is graphic; print it. */
|
||||
|
||||
if (strategy == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
|
||||
{
|
||||
/* Apply the same escaping syntax as in `write_character'. */
|
||||
if (SCM_R6RS_ESCAPES_P)
|
||||
{
|
||||
/* LOCALE_ENCODED is large enough to store an R6RS
|
||||
`\xNNNN;' escape sequence. However, libunistring
|
||||
up to 0.9.3 (included) always returns a
|
||||
heap-allocated RESULT. */
|
||||
if (SCM_UNLIKELY (result != locale_encoded))
|
||||
result = scm_realloc (result, len * 7);
|
||||
|
||||
scm_i_unistring_escapes_to_r6rs_escapes (result, &len);
|
||||
}
|
||||
else
|
||||
scm_i_unistring_escapes_to_guile_escapes (result, &len);
|
||||
}
|
||||
|
||||
scm_lfwrite (result, len, port);
|
||||
printed = 1;
|
||||
|
||||
if (SCM_UNLIKELY (result != locale_encoded))
|
||||
free (result);
|
||||
}
|
||||
else
|
||||
printed = 0;
|
||||
}
|
||||
|
||||
return printed;
|
||||
return display_string (&ch, 0, 1, port, strategy) == 1;
|
||||
}
|
||||
|
||||
/* Attempt to pretty-print CH, a combining character, to PORT. Return
|
||||
|
@ -811,39 +930,101 @@ display_character (scm_t_wchar ch, SCM port,
|
|||
static int
|
||||
write_combining_character (scm_t_wchar ch, SCM port)
|
||||
{
|
||||
int printed;
|
||||
const char *encoding;
|
||||
scm_t_wchar str[2];
|
||||
|
||||
encoding = scm_i_get_port_encoding (port);
|
||||
if (encoding != NULL)
|
||||
str[0] = SCM_CODEPOINT_DOTTED_CIRCLE;
|
||||
str[1] = ch;
|
||||
|
||||
return display_string (str, 0, 2, port, iconveh_error) == 2;
|
||||
}
|
||||
|
||||
/* Write CH to PORT in its escaped form, using the string escape syntax
|
||||
if STRING_ESCAPES_P is non-zero. */
|
||||
static void
|
||||
write_character_escaped (scm_t_wchar ch, int string_escapes_p, SCM port)
|
||||
{
|
||||
if (string_escapes_p)
|
||||
{
|
||||
scm_t_wchar str[2];
|
||||
char locale_encoded[sizeof (str)], *result;
|
||||
size_t len;
|
||||
/* Represent CH using the in-string escape syntax. */
|
||||
|
||||
str[0] = SCM_CODEPOINT_DOTTED_CIRCLE;
|
||||
str[1] = ch;
|
||||
static const char hex[] = "0123456789abcdef";
|
||||
static const char escapes[7] = "abtnvfr";
|
||||
char buf[9];
|
||||
|
||||
len = sizeof (locale_encoded);
|
||||
result = u32_conv_to_encoding (encoding, iconveh_error,
|
||||
(scm_t_uint32 *) str, 2,
|
||||
NULL, locale_encoded, &len);
|
||||
if (result != NULL)
|
||||
if (ch >= 0x07 && ch <= 0x0D && ch != 0x0A)
|
||||
{
|
||||
scm_lfwrite (result, len, port);
|
||||
printed = 1;
|
||||
if (SCM_UNLIKELY (result != locale_encoded))
|
||||
free (result);
|
||||
/* Use special escapes for some C0 controls. */
|
||||
buf[0] = '\\';
|
||||
buf[1] = escapes[ch - 0x07];
|
||||
scm_lfwrite (buf, 2, port);
|
||||
}
|
||||
else if (!SCM_R6RS_ESCAPES_P)
|
||||
{
|
||||
if (ch <= 0xFF)
|
||||
{
|
||||
buf[0] = '\\';
|
||||
buf[1] = 'x';
|
||||
buf[2] = hex[ch / 16];
|
||||
buf[3] = hex[ch % 16];
|
||||
scm_lfwrite (buf, 4, port);
|
||||
}
|
||||
else if (ch <= 0xFFFF)
|
||||
{
|
||||
buf[0] = '\\';
|
||||
buf[1] = 'u';
|
||||
buf[2] = hex[(ch & 0xF000) >> 12];
|
||||
buf[3] = hex[(ch & 0xF00) >> 8];
|
||||
buf[4] = hex[(ch & 0xF0) >> 4];
|
||||
buf[5] = hex[(ch & 0xF)];
|
||||
scm_lfwrite (buf, 6, port);
|
||||
}
|
||||
else if (ch > 0xFFFF)
|
||||
{
|
||||
buf[0] = '\\';
|
||||
buf[1] = 'U';
|
||||
buf[2] = hex[(ch & 0xF00000) >> 20];
|
||||
buf[3] = hex[(ch & 0xF0000) >> 16];
|
||||
buf[4] = hex[(ch & 0xF000) >> 12];
|
||||
buf[5] = hex[(ch & 0xF00) >> 8];
|
||||
buf[6] = hex[(ch & 0xF0) >> 4];
|
||||
buf[7] = hex[(ch & 0xF)];
|
||||
scm_lfwrite (buf, 8, port);
|
||||
}
|
||||
}
|
||||
else
|
||||
/* Can't write the result to PORT. */
|
||||
printed = 0;
|
||||
{
|
||||
/* Print an R6RS variable-length hex escape: "\xNNNN;". */
|
||||
scm_t_wchar ch2 = ch;
|
||||
|
||||
int i = 8;
|
||||
buf[i] = ';';
|
||||
i --;
|
||||
if (ch == 0)
|
||||
buf[i--] = '0';
|
||||
else
|
||||
while (ch2 > 0)
|
||||
{
|
||||
buf[i] = hex[ch2 & 0xF];
|
||||
ch2 >>= 4;
|
||||
i --;
|
||||
}
|
||||
buf[i] = 'x';
|
||||
i --;
|
||||
buf[i] = '\\';
|
||||
scm_lfwrite (buf + i, 9 - i, port);
|
||||
}
|
||||
}
|
||||
else
|
||||
/* PORT is Latin-1-encoded and can't display the fancy things. */
|
||||
printed = 0;
|
||||
{
|
||||
/* Represent CH using the character escape syntax. */
|
||||
const char *name;
|
||||
|
||||
return printed;
|
||||
name = scm_i_charname (SCM_MAKE_CHAR (ch));
|
||||
if (name != NULL)
|
||||
scm_puts (name, port);
|
||||
else
|
||||
PRINT_CHAR_ESCAPE (ch, port);
|
||||
}
|
||||
}
|
||||
|
||||
/* Write CH to PORT, escaping it if it's non-graphic or not
|
||||
|
@ -854,25 +1035,28 @@ static void
|
|||
write_character (scm_t_wchar ch, SCM port, int string_escapes_p)
|
||||
{
|
||||
int printed = 0;
|
||||
scm_t_string_failed_conversion_handler strategy;
|
||||
|
||||
strategy = scm_i_get_conversion_strategy (port);
|
||||
|
||||
if (string_escapes_p)
|
||||
{
|
||||
/* Check if CH deserves special treatment. */
|
||||
if (ch == '"' || ch == '\\')
|
||||
{
|
||||
scm_putc ('\\', port);
|
||||
scm_putc (ch, port);
|
||||
display_character ('\\', port, iconveh_question_mark);
|
||||
display_character (ch, port, strategy);
|
||||
printed = 1;
|
||||
}
|
||||
else if (ch == ' ' || ch == '\n')
|
||||
{
|
||||
scm_putc (ch, port);
|
||||
display_character (ch, port, strategy);
|
||||
printed = 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
scm_puts ("#\\", port);
|
||||
display_string ("#\\", 1, 2, port, iconveh_question_mark);
|
||||
|
||||
if (uc_combining_class (ch) != UC_CCC_NR)
|
||||
/* Character is a combining character, so attempt to
|
||||
|
@ -891,93 +1075,8 @@ write_character (scm_t_wchar ch, SCM port, int string_escapes_p)
|
|||
printed = display_character (ch, port, iconveh_error);
|
||||
|
||||
if (!printed)
|
||||
{
|
||||
/* CH isn't graphic or cannot be represented in PORT's
|
||||
encoding. */
|
||||
|
||||
if (string_escapes_p)
|
||||
{
|
||||
/* Represent CH using the in-string escape syntax. */
|
||||
|
||||
static const char hex[] = "0123456789abcdef";
|
||||
static const char escapes[7] = "abtnvfr";
|
||||
char buf[9];
|
||||
|
||||
if (ch >= 0x07 && ch <= 0x0D && ch != 0x0A)
|
||||
{
|
||||
/* Use special escapes for some C0 controls. */
|
||||
buf[0] = '\\';
|
||||
buf[1] = escapes[ch - 0x07];
|
||||
scm_lfwrite (buf, 2, port);
|
||||
}
|
||||
else if (!SCM_R6RS_ESCAPES_P)
|
||||
{
|
||||
if (ch <= 0xFF)
|
||||
{
|
||||
buf[0] = '\\';
|
||||
buf[1] = 'x';
|
||||
buf[2] = hex[ch / 16];
|
||||
buf[3] = hex[ch % 16];
|
||||
scm_lfwrite (buf, 4, port);
|
||||
}
|
||||
else if (ch <= 0xFFFF)
|
||||
{
|
||||
buf[0] = '\\';
|
||||
buf[1] = 'u';
|
||||
buf[2] = hex[(ch & 0xF000) >> 12];
|
||||
buf[3] = hex[(ch & 0xF00) >> 8];
|
||||
buf[4] = hex[(ch & 0xF0) >> 4];
|
||||
buf[5] = hex[(ch & 0xF)];
|
||||
scm_lfwrite (buf, 6, port);
|
||||
}
|
||||
else if (ch > 0xFFFF)
|
||||
{
|
||||
buf[0] = '\\';
|
||||
buf[1] = 'U';
|
||||
buf[2] = hex[(ch & 0xF00000) >> 20];
|
||||
buf[3] = hex[(ch & 0xF0000) >> 16];
|
||||
buf[4] = hex[(ch & 0xF000) >> 12];
|
||||
buf[5] = hex[(ch & 0xF00) >> 8];
|
||||
buf[6] = hex[(ch & 0xF0) >> 4];
|
||||
buf[7] = hex[(ch & 0xF)];
|
||||
scm_lfwrite (buf, 8, port);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Print an R6RS variable-length hex escape: "\xNNNN;". */
|
||||
scm_t_wchar ch2 = ch;
|
||||
|
||||
int i = 8;
|
||||
buf[i] = ';';
|
||||
i --;
|
||||
if (ch == 0)
|
||||
buf[i--] = '0';
|
||||
else
|
||||
while (ch2 > 0)
|
||||
{
|
||||
buf[i] = hex[ch2 & 0xF];
|
||||
ch2 >>= 4;
|
||||
i --;
|
||||
}
|
||||
buf[i] = 'x';
|
||||
i --;
|
||||
buf[i] = '\\';
|
||||
scm_lfwrite (buf + i, 9 - i, port);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Represent CH using the character escape syntax. */
|
||||
const char *name;
|
||||
|
||||
name = scm_i_charname (SCM_MAKE_CHAR (ch));
|
||||
if (name != NULL)
|
||||
scm_puts (name, port);
|
||||
else
|
||||
PRINT_CHAR_ESCAPE (ch, port);
|
||||
}
|
||||
}
|
||||
/* CH isn't graphic or cannot be represented in PORT's encoding. */
|
||||
write_character_escaped (ch, string_escapes_p, port);
|
||||
}
|
||||
|
||||
/* Print an integer.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
;;;; encoding-escapes.test --- test suite for Guile's string encodings -*- mode: scheme; coding: utf-8 -*-
|
||||
;;;;
|
||||
;;;; Copyright (C) 2009, 2010 Free Software Foundation, Inc.
|
||||
;;;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
|
||||
;;;;
|
||||
;;;; This library is free software; you can redistribute it and/or
|
||||
;;;; modify it under the terms of the GNU Lesser General Public
|
||||
|
@ -118,8 +118,21 @@
|
|||
(set-port-encoding! pt "ASCII")
|
||||
(set-port-conversion-strategy! pt 'escape)
|
||||
(display s4 pt)
|
||||
(string=? "\\u7F85\\u751F\\u9580"
|
||||
(get-output-string pt)))))
|
||||
(string=? "\\u7f85\\u751f\\u9580"
|
||||
(get-output-string pt))))
|
||||
|
||||
(pass-if "fake escape"
|
||||
;; The input string below contains something that looks like
|
||||
;; an escape in libunistring syntax, but which should be left
|
||||
;; as is in the output. See
|
||||
;; <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
|
||||
;; for background info.
|
||||
(let ((pt (open-output-string)))
|
||||
(set-port-encoding! pt "ASCII")
|
||||
(set-port-conversion-strategy! pt 'escape)
|
||||
(display "λ -- \\u0012" pt)
|
||||
(string=? "\\u03bb -- \\u0012"
|
||||
(get-output-string pt)))))
|
||||
|
||||
(with-test-prefix "input escapes"
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue