mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-05-20 03:30:27 +02:00
Port encoding internally represented as symbol
* libguile/ports-internal.h (scm_t_port_internal): Remove encoding_mode member. * libguile/ports.h (scm_t_port): "encoding" member is now a SCM symbol. * libguile/ports.c (scm_init_ports): Define symbols for the encodings that we handle explicitly. (encoding_matches): Adapt to check against an encoding as a symbol. (canonicalize_encoding): Return an encoding as a symbol. (scm_c_make_port_with_encoding, scm_i_set_default_port_encoding) (decide_utf16_encoding, decide_utf32_encoding) (scm_i_port_iconv_descriptors, scm_i_set_port_encoding_x) (scm_port_encoding, peek_codepoint, scm_ungetc): Adapt to encoding change. * libguile/print.c (display_string_using_iconv, display_string): * libguile/read.c (scm_read_character): * libguile/strings.c (scm_from_port_stringn, scm_to_port_stringn): Adapt to port encoding change.
This commit is contained in:
parent
422f65fe09
commit
d8711b9759
6 changed files with 80 additions and 84 deletions
|
@ -221,7 +221,6 @@ struct scm_port_internal
|
|||
{
|
||||
unsigned at_stream_start_for_bom_read : 1;
|
||||
unsigned at_stream_start_for_bom_write : 1;
|
||||
scm_t_port_encoding_mode encoding_mode;
|
||||
scm_t_iconv_descriptors *iconv_descriptors;
|
||||
SCM alist;
|
||||
};
|
||||
|
|
119
libguile/ports.c
119
libguile/ports.c
|
@ -91,6 +91,18 @@
|
|||
#endif
|
||||
|
||||
|
||||
|
||||
/* We need these symbols early, before (ice-9 ports) loads in the
|
||||
snarfed definitions, so we can't use SCM_SYMBOL. */
|
||||
static SCM sym_UTF_8;
|
||||
static SCM sym_ISO_8859_1;
|
||||
static SCM sym_UTF_16;
|
||||
static SCM sym_UTF_16LE;
|
||||
static SCM sym_UTF_16BE;
|
||||
static SCM sym_UTF_32;
|
||||
static SCM sym_UTF_32LE;
|
||||
static SCM sym_UTF_32BE;
|
||||
|
||||
/* Port encodings are case-insensitive ASCII strings. */
|
||||
static char
|
||||
ascii_toupper (char c)
|
||||
|
@ -103,8 +115,10 @@ ascii_toupper (char c)
|
|||
on ports or in the default encoding fluid are in upper-case, and can
|
||||
be compared with strcmp. */
|
||||
static int
|
||||
encoding_matches (const char *enc, const char *upper)
|
||||
encoding_matches (const char *enc, SCM upper_symbol)
|
||||
{
|
||||
const char *upper = scm_i_symbol_chars (upper_symbol);
|
||||
|
||||
if (!enc)
|
||||
enc = "ISO-8859-1";
|
||||
|
||||
|
@ -115,14 +129,16 @@ encoding_matches (const char *enc, const char *upper)
|
|||
return !*upper;
|
||||
}
|
||||
|
||||
static char*
|
||||
static SCM
|
||||
canonicalize_encoding (const char *enc)
|
||||
{
|
||||
char *ret;
|
||||
int i;
|
||||
|
||||
if (!enc)
|
||||
return "ISO-8859-1";
|
||||
if (!enc || encoding_matches (enc, sym_ISO_8859_1))
|
||||
return sym_ISO_8859_1;
|
||||
if (encoding_matches (enc, sym_UTF_8))
|
||||
return sym_UTF_8;
|
||||
|
||||
ret = scm_gc_strdup (enc, "port");
|
||||
|
||||
|
@ -136,7 +152,7 @@ canonicalize_encoding (const char *enc)
|
|||
ret[i] = ascii_toupper (ret[i]);
|
||||
}
|
||||
|
||||
return ret;
|
||||
return scm_from_latin1_symbol (ret);
|
||||
}
|
||||
|
||||
|
||||
|
@ -758,22 +774,7 @@ scm_c_make_port_with_encoding (scm_t_bits tag, unsigned long mode_bits,
|
|||
entry->rw_random = ptob->seek != NULL;
|
||||
entry->port = ret;
|
||||
entry->stream = stream;
|
||||
|
||||
if (encoding_matches (encoding, "UTF-8"))
|
||||
{
|
||||
pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
|
||||
entry->encoding = "UTF-8";
|
||||
}
|
||||
else if (encoding_matches (encoding, "ISO-8859-1"))
|
||||
{
|
||||
pti->encoding_mode = SCM_PORT_ENCODING_MODE_LATIN1;
|
||||
entry->encoding = "ISO-8859-1";
|
||||
}
|
||||
else
|
||||
{
|
||||
pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
|
||||
entry->encoding = canonicalize_encoding (encoding);
|
||||
}
|
||||
entry->encoding = canonicalize_encoding (encoding);
|
||||
|
||||
entry->ilseq_handler = handler;
|
||||
pti->iconv_descriptors = NULL;
|
||||
|
@ -970,11 +971,11 @@ static SCM default_port_encoding_var;
|
|||
void
|
||||
scm_i_set_default_port_encoding (const char *encoding)
|
||||
{
|
||||
if (encoding_matches (encoding, "ISO-8859-1"))
|
||||
if (encoding_matches (encoding, sym_ISO_8859_1))
|
||||
scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), SCM_BOOL_F);
|
||||
else
|
||||
scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var),
|
||||
scm_from_latin1_string (canonicalize_encoding (encoding)));
|
||||
scm_symbol_to_string (canonicalize_encoding (encoding)));
|
||||
}
|
||||
|
||||
/* Return the name of the default encoding for newly created ports. */
|
||||
|
@ -1079,29 +1080,29 @@ static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00};
|
|||
to determine the byte order. Otherwise we choose big endian, as
|
||||
recommended by the Unicode Standard. Note that the BOM (if any) is
|
||||
not consumed here. */
|
||||
static const char *
|
||||
static SCM
|
||||
decide_utf16_encoding (SCM port, scm_t_port_rw_active mode)
|
||||
{
|
||||
if (mode == SCM_PORT_READ
|
||||
&& SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
|
||||
&& looking_at_bytes (port, scm_utf16le_bom, sizeof scm_utf16le_bom))
|
||||
return "UTF-16LE";
|
||||
return sym_UTF_16LE;
|
||||
else
|
||||
return "UTF-16BE";
|
||||
return sym_UTF_16BE;
|
||||
}
|
||||
|
||||
/* Decide what byte order to use for a UTF-32 port. Return "UTF-32BE"
|
||||
or "UTF-32LE". See the comment above 'decide_utf16_encoding' for
|
||||
details. */
|
||||
static const char *
|
||||
static SCM
|
||||
decide_utf32_encoding (SCM port, scm_t_port_rw_active mode)
|
||||
{
|
||||
if (mode == SCM_PORT_READ
|
||||
&& SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
|
||||
&& looking_at_bytes (port, scm_utf32le_bom, sizeof scm_utf32le_bom))
|
||||
return "UTF-32LE";
|
||||
return sym_UTF_32LE;
|
||||
else
|
||||
return "UTF-32BE";
|
||||
return sym_UTF_32BE;
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -1189,29 +1190,24 @@ close_iconv_descriptors (scm_t_iconv_descriptors *id)
|
|||
scm_t_iconv_descriptors *
|
||||
scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode)
|
||||
{
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
|
||||
|
||||
assert (pti->encoding_mode == SCM_PORT_ENCODING_MODE_ICONV);
|
||||
|
||||
if (!pti->iconv_descriptors)
|
||||
{
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
const char *precise_encoding;
|
||||
|
||||
if (!pt->encoding)
|
||||
pt->encoding = "ISO-8859-1";
|
||||
SCM precise_encoding;
|
||||
|
||||
/* If the specified encoding is UTF-16 or UTF-32, then make
|
||||
that more precise by deciding what byte order to use. */
|
||||
if (strcmp (pt->encoding, "UTF-16") == 0)
|
||||
if (scm_is_eq (pt->encoding, sym_UTF_16))
|
||||
precise_encoding = decide_utf16_encoding (port, mode);
|
||||
else if (strcmp (pt->encoding, "UTF-32") == 0)
|
||||
else if (scm_is_eq (pt->encoding, sym_UTF_32))
|
||||
precise_encoding = decide_utf32_encoding (port, mode);
|
||||
else
|
||||
precise_encoding = pt->encoding;
|
||||
|
||||
pti->iconv_descriptors =
|
||||
open_iconv_descriptors (precise_encoding,
|
||||
open_iconv_descriptors (scm_i_symbol_chars (precise_encoding),
|
||||
SCM_INPUT_PORT_P (port),
|
||||
SCM_OUTPUT_PORT_P (port));
|
||||
}
|
||||
|
@ -1239,22 +1235,7 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
|
|||
position. */
|
||||
pti->at_stream_start_for_bom_read = 1;
|
||||
pti->at_stream_start_for_bom_write = 1;
|
||||
|
||||
if (encoding_matches (encoding, "UTF-8"))
|
||||
{
|
||||
pt->encoding = "UTF-8";
|
||||
pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
|
||||
}
|
||||
else if (encoding_matches (encoding, "ISO-8859-1"))
|
||||
{
|
||||
pt->encoding = "ISO-8859-1";
|
||||
pti->encoding_mode = SCM_PORT_ENCODING_MODE_LATIN1;
|
||||
}
|
||||
else
|
||||
{
|
||||
pt->encoding = canonicalize_encoding (encoding);
|
||||
pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
|
||||
}
|
||||
pt->encoding = canonicalize_encoding (encoding);
|
||||
|
||||
pti->iconv_descriptors = NULL;
|
||||
if (prev)
|
||||
|
@ -1269,7 +1250,7 @@ SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
|
|||
{
|
||||
SCM_VALIDATE_PORT (1, port);
|
||||
|
||||
return scm_from_latin1_string (SCM_PTAB_ENTRY (port)->encoding);
|
||||
return scm_symbol_to_string (SCM_PTAB_ENTRY (port)->encoding);
|
||||
}
|
||||
#undef FUNC_NAME
|
||||
|
||||
|
@ -1856,9 +1837,9 @@ peek_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
|||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
|
||||
|
||||
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
|
||||
if (scm_is_eq (pt->encoding, sym_UTF_8))
|
||||
err = peek_utf8_codepoint (port, codepoint, len);
|
||||
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
|
||||
else if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
|
||||
err = peek_latin1_codepoint (port, codepoint, len);
|
||||
else
|
||||
err = peek_iconv_codepoint (port, codepoint, len);
|
||||
|
@ -1876,9 +1857,9 @@ peek_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
|
|||
then silently consume it and read another code point. */
|
||||
if (SCM_UNLIKELY
|
||||
(*codepoint == SCM_UNICODE_BOM
|
||||
&& (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
|
||||
|| strcmp (pt->encoding, "UTF-16") == 0
|
||||
|| strcmp (pt->encoding, "UTF-32") == 0)))
|
||||
&& (scm_is_eq (pt->encoding, sym_UTF_8)
|
||||
|| scm_is_eq (pt->encoding, sym_UTF_16)
|
||||
|| scm_is_eq (pt->encoding, sym_UTF_32))))
|
||||
{
|
||||
scm_port_buffer_did_take (pt->read_buf, *len);
|
||||
return peek_codepoint (port, codepoint, len);
|
||||
|
@ -2022,14 +2003,13 @@ scm_ungetc (scm_t_wchar c, SCM port)
|
|||
#define FUNC_NAME "scm_ungetc"
|
||||
{
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
|
||||
char *result;
|
||||
char result_buf[10];
|
||||
size_t len;
|
||||
|
||||
len = sizeof (result_buf);
|
||||
|
||||
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
|
||||
if (scm_is_eq (pt->encoding, sym_UTF_8))
|
||||
{
|
||||
if (c < 0x80)
|
||||
{
|
||||
|
@ -2041,14 +2021,14 @@ scm_ungetc (scm_t_wchar c, SCM port)
|
|||
result =
|
||||
(char *) u32_to_u8 ((uint32_t *) &c, 1, (uint8_t *) result_buf, &len);
|
||||
}
|
||||
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1 && c <= 0xff)
|
||||
else if (scm_is_eq (pt->encoding, sym_ISO_8859_1) && c <= 0xff)
|
||||
{
|
||||
result_buf[0] = (char) c;
|
||||
result = result_buf;
|
||||
len = 1;
|
||||
}
|
||||
else
|
||||
result = u32_conv_to_encoding (pt->encoding,
|
||||
result = u32_conv_to_encoding (scm_i_symbol_chars (pt->encoding),
|
||||
(enum iconv_ilseq_handler) pt->ilseq_handler,
|
||||
(uint32_t *) &c, 1, NULL,
|
||||
result_buf, &len);
|
||||
|
@ -3163,6 +3143,15 @@ scm_init_ice_9_ports (void)
|
|||
void
|
||||
scm_init_ports (void)
|
||||
{
|
||||
sym_UTF_8 = scm_from_latin1_symbol ("UTF-8");
|
||||
sym_ISO_8859_1 = scm_from_latin1_symbol ("ISO-8859-1");
|
||||
sym_UTF_16 = scm_from_latin1_symbol ("UTF-16");
|
||||
sym_UTF_16LE = scm_from_latin1_symbol ("UTF-16LE");
|
||||
sym_UTF_16BE = scm_from_latin1_symbol ("UTF-16BE");
|
||||
sym_UTF_32 = scm_from_latin1_symbol ("UTF-32");
|
||||
sym_UTF_32LE = scm_from_latin1_symbol ("UTF-32LE");
|
||||
sym_UTF_32BE = scm_from_latin1_symbol ("UTF-32BE");
|
||||
|
||||
trampoline_to_c_read_subr =
|
||||
scm_c_make_gsubr ("port-read", 4, 0, 0,
|
||||
(scm_t_subr) trampoline_to_c_read);
|
||||
|
|
|
@ -114,7 +114,7 @@ typedef struct
|
|||
int rw_random;
|
||||
|
||||
/* Character encoding support. */
|
||||
char *encoding;
|
||||
SCM encoding; /* A symbol of upper-case ASCII. */
|
||||
scm_t_string_failed_conversion_handler ilseq_handler;
|
||||
} scm_t_port;
|
||||
|
||||
|
|
|
@ -64,6 +64,11 @@
|
|||
#define PORT_CONVERSION_HANDLER(port) \
|
||||
SCM_PTAB_ENTRY (port)->ilseq_handler
|
||||
|
||||
SCM_SYMBOL (sym_UTF_8, "UTF-8");
|
||||
SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
|
||||
SCM_SYMBOL (sym_UTF_16, "UTF-16");
|
||||
SCM_SYMBOL (sym_UTF_32, "UTF-32");
|
||||
|
||||
static size_t display_string (const void *, int, size_t, SCM,
|
||||
scm_t_string_failed_conversion_handler);
|
||||
|
||||
|
@ -1036,8 +1041,8 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
|
|||
pti->at_stream_start_for_bom_read = 0;
|
||||
|
||||
/* Write a BOM if appropriate. */
|
||||
if (SCM_UNLIKELY (strcmp(pt->encoding, "UTF-16") == 0
|
||||
|| strcmp(pt->encoding, "UTF-32") == 0))
|
||||
if (SCM_UNLIKELY (scm_is_eq (pt->encoding, sym_UTF_16)
|
||||
|| scm_is_eq (pt->encoding, sym_UTF_32)))
|
||||
display_character (SCM_UNICODE_BOM, port, iconveh_error);
|
||||
}
|
||||
|
||||
|
@ -1135,13 +1140,13 @@ display_string (const void *str, int narrow_p,
|
|||
size_t len, SCM port,
|
||||
scm_t_string_failed_conversion_handler strategy)
|
||||
{
|
||||
scm_t_port_internal *pti;
|
||||
scm_t_port *pt;
|
||||
|
||||
pti = SCM_PORT_GET_INTERNAL (port);
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
|
||||
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
|
||||
if (scm_is_eq (pt->encoding, sym_UTF_8))
|
||||
return display_string_as_utf8 (str, narrow_p, len, port);
|
||||
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
|
||||
else if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
|
||||
return display_string_as_latin1 (str, narrow_p, len, port, strategy);
|
||||
else
|
||||
return display_string_using_iconv (str, narrow_p, len, port, strategy);
|
||||
|
|
|
@ -64,6 +64,7 @@ SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
|
|||
SCM_SYMBOL (scm_keyword_prefix, "prefix");
|
||||
SCM_SYMBOL (scm_keyword_postfix, "postfix");
|
||||
SCM_SYMBOL (sym_nil, "nil");
|
||||
SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
|
||||
|
||||
/* SRFI-105 curly infix expression support */
|
||||
SCM_SYMBOL (sym_nfx, "$nfx$");
|
||||
|
@ -1040,7 +1041,7 @@ scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
|
|||
size_t charname_len, bytes_read;
|
||||
scm_t_wchar cp;
|
||||
int overflow;
|
||||
scm_t_port_internal *pti;
|
||||
scm_t_port *pt;
|
||||
|
||||
overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
|
||||
&bytes_read);
|
||||
|
@ -1058,14 +1059,14 @@ scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
|
|||
return (SCM_MAKE_CHAR (chr));
|
||||
}
|
||||
|
||||
pti = SCM_PORT_GET_INTERNAL (port);
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
|
||||
/* Simple ASCII characters can be processed immediately. Also, simple
|
||||
ISO-8859-1 characters can be processed immediately if the encoding for this
|
||||
port is ISO-8859-1. */
|
||||
if (bytes_read == 1 &&
|
||||
((unsigned char) buffer[0] <= 127
|
||||
|| pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
|
||||
|| scm_is_eq (pt->encoding, sym_ISO_8859_1)))
|
||||
{
|
||||
SCM_COL (port) += 1;
|
||||
return SCM_MAKE_CHAR (buffer[0]);
|
||||
|
|
|
@ -51,6 +51,8 @@
|
|||
/* {Strings}
|
||||
*/
|
||||
|
||||
SCM_SYMBOL (sym_UTF_8, "UTF-8");
|
||||
SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
|
||||
|
||||
/* Stringbufs
|
||||
*
|
||||
|
@ -1758,16 +1760,16 @@ SCM
|
|||
scm_from_port_stringn (const char *str, size_t len, SCM port)
|
||||
{
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
|
||||
|
||||
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
|
||||
if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
|
||||
return scm_from_latin1_stringn (str, len);
|
||||
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
|
||||
else if (scm_is_eq (pt->encoding, sym_UTF_8)
|
||||
&& (pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR
|
||||
|| (u8_check ((uint8_t *) str, len) == NULL)))
|
||||
return scm_from_utf8_stringn (str, len);
|
||||
else
|
||||
return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
|
||||
return scm_from_stringn (str, len, scm_i_symbol_chars (pt->encoding),
|
||||
pt->ilseq_handler);
|
||||
}
|
||||
|
||||
/* Create a new scheme string from the C string STR. The memory of
|
||||
|
@ -2165,15 +2167,15 @@ char *
|
|||
scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
|
||||
{
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
|
||||
|
||||
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
|
||||
if (scm_is_eq (pt->encoding, sym_ISO_8859_1)
|
||||
&& pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
|
||||
return scm_to_latin1_stringn (str, lenp);
|
||||
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
|
||||
else if (scm_is_eq (pt->encoding, sym_UTF_8))
|
||||
return scm_to_utf8_stringn (str, lenp);
|
||||
else
|
||||
return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
|
||||
return scm_to_stringn (str, lenp, scm_i_symbol_chars (pt->encoding),
|
||||
pt->ilseq_handler);
|
||||
}
|
||||
|
||||
/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue