1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-05-20 03:30:27 +02:00

Port encoding internally represented as symbol

* libguile/ports-internal.h (scm_t_port_internal): Remove encoding_mode
  member.
* libguile/ports.h (scm_t_port): "encoding" member is now a SCM symbol.
* libguile/ports.c (scm_init_ports): Define symbols for the encodings
  that we handle explicitly.
  (encoding_matches): Adapt to check against an encoding as a symbol.
  (canonicalize_encoding): Return an encoding as a symbol.
  (scm_c_make_port_with_encoding, scm_i_set_default_port_encoding)
  (decide_utf16_encoding, decide_utf32_encoding)
  (scm_i_port_iconv_descriptors, scm_i_set_port_encoding_x)
  (scm_port_encoding, peek_codepoint, scm_ungetc): Adapt to encoding
  change.

* libguile/print.c (display_string_using_iconv, display_string):
* libguile/read.c (scm_read_character):
* libguile/strings.c (scm_from_port_stringn, scm_to_port_stringn): Adapt
  to port encoding change.
This commit is contained in:
Andy Wingo 2016-05-03 10:52:54 +02:00
parent 422f65fe09
commit d8711b9759
6 changed files with 80 additions and 84 deletions

View file

@ -221,7 +221,6 @@ struct scm_port_internal
{
unsigned at_stream_start_for_bom_read : 1;
unsigned at_stream_start_for_bom_write : 1;
scm_t_port_encoding_mode encoding_mode;
scm_t_iconv_descriptors *iconv_descriptors;
SCM alist;
};

View file

@ -91,6 +91,18 @@
#endif
/* We need these symbols early, before (ice-9 ports) loads in the
snarfed definitions, so we can't use SCM_SYMBOL. */
static SCM sym_UTF_8;
static SCM sym_ISO_8859_1;
static SCM sym_UTF_16;
static SCM sym_UTF_16LE;
static SCM sym_UTF_16BE;
static SCM sym_UTF_32;
static SCM sym_UTF_32LE;
static SCM sym_UTF_32BE;
/* Port encodings are case-insensitive ASCII strings. */
static char
ascii_toupper (char c)
@ -103,8 +115,10 @@ ascii_toupper (char c)
on ports or in the default encoding fluid are in upper-case, and can
be compared with strcmp. */
static int
encoding_matches (const char *enc, const char *upper)
encoding_matches (const char *enc, SCM upper_symbol)
{
const char *upper = scm_i_symbol_chars (upper_symbol);
if (!enc)
enc = "ISO-8859-1";
@ -115,14 +129,16 @@ encoding_matches (const char *enc, const char *upper)
return !*upper;
}
static char*
static SCM
canonicalize_encoding (const char *enc)
{
char *ret;
int i;
if (!enc)
return "ISO-8859-1";
if (!enc || encoding_matches (enc, sym_ISO_8859_1))
return sym_ISO_8859_1;
if (encoding_matches (enc, sym_UTF_8))
return sym_UTF_8;
ret = scm_gc_strdup (enc, "port");
@ -136,7 +152,7 @@ canonicalize_encoding (const char *enc)
ret[i] = ascii_toupper (ret[i]);
}
return ret;
return scm_from_latin1_symbol (ret);
}
@ -758,22 +774,7 @@ scm_c_make_port_with_encoding (scm_t_bits tag, unsigned long mode_bits,
entry->rw_random = ptob->seek != NULL;
entry->port = ret;
entry->stream = stream;
if (encoding_matches (encoding, "UTF-8"))
{
pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
entry->encoding = "UTF-8";
}
else if (encoding_matches (encoding, "ISO-8859-1"))
{
pti->encoding_mode = SCM_PORT_ENCODING_MODE_LATIN1;
entry->encoding = "ISO-8859-1";
}
else
{
pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
entry->encoding = canonicalize_encoding (encoding);
}
entry->encoding = canonicalize_encoding (encoding);
entry->ilseq_handler = handler;
pti->iconv_descriptors = NULL;
@ -970,11 +971,11 @@ static SCM default_port_encoding_var;
void
scm_i_set_default_port_encoding (const char *encoding)
{
if (encoding_matches (encoding, "ISO-8859-1"))
if (encoding_matches (encoding, sym_ISO_8859_1))
scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), SCM_BOOL_F);
else
scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var),
scm_from_latin1_string (canonicalize_encoding (encoding)));
scm_symbol_to_string (canonicalize_encoding (encoding)));
}
/* Return the name of the default encoding for newly created ports. */
@ -1079,29 +1080,29 @@ static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00};
to determine the byte order. Otherwise we choose big endian, as
recommended by the Unicode Standard. Note that the BOM (if any) is
not consumed here. */
static const char *
static SCM
decide_utf16_encoding (SCM port, scm_t_port_rw_active mode)
{
if (mode == SCM_PORT_READ
&& SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
&& looking_at_bytes (port, scm_utf16le_bom, sizeof scm_utf16le_bom))
return "UTF-16LE";
return sym_UTF_16LE;
else
return "UTF-16BE";
return sym_UTF_16BE;
}
/* Decide what byte order to use for a UTF-32 port. Return "UTF-32BE"
or "UTF-32LE". See the comment above 'decide_utf16_encoding' for
details. */
static const char *
static SCM
decide_utf32_encoding (SCM port, scm_t_port_rw_active mode)
{
if (mode == SCM_PORT_READ
&& SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
&& looking_at_bytes (port, scm_utf32le_bom, sizeof scm_utf32le_bom))
return "UTF-32LE";
return sym_UTF_32LE;
else
return "UTF-32BE";
return sym_UTF_32BE;
}
static void
@ -1189,29 +1190,24 @@ close_iconv_descriptors (scm_t_iconv_descriptors *id)
scm_t_iconv_descriptors *
scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
assert (pti->encoding_mode == SCM_PORT_ENCODING_MODE_ICONV);
if (!pti->iconv_descriptors)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
const char *precise_encoding;
if (!pt->encoding)
pt->encoding = "ISO-8859-1";
SCM precise_encoding;
/* If the specified encoding is UTF-16 or UTF-32, then make
that more precise by deciding what byte order to use. */
if (strcmp (pt->encoding, "UTF-16") == 0)
if (scm_is_eq (pt->encoding, sym_UTF_16))
precise_encoding = decide_utf16_encoding (port, mode);
else if (strcmp (pt->encoding, "UTF-32") == 0)
else if (scm_is_eq (pt->encoding, sym_UTF_32))
precise_encoding = decide_utf32_encoding (port, mode);
else
precise_encoding = pt->encoding;
pti->iconv_descriptors =
open_iconv_descriptors (precise_encoding,
open_iconv_descriptors (scm_i_symbol_chars (precise_encoding),
SCM_INPUT_PORT_P (port),
SCM_OUTPUT_PORT_P (port));
}
@ -1239,22 +1235,7 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
position. */
pti->at_stream_start_for_bom_read = 1;
pti->at_stream_start_for_bom_write = 1;
if (encoding_matches (encoding, "UTF-8"))
{
pt->encoding = "UTF-8";
pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
}
else if (encoding_matches (encoding, "ISO-8859-1"))
{
pt->encoding = "ISO-8859-1";
pti->encoding_mode = SCM_PORT_ENCODING_MODE_LATIN1;
}
else
{
pt->encoding = canonicalize_encoding (encoding);
pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
}
pt->encoding = canonicalize_encoding (encoding);
pti->iconv_descriptors = NULL;
if (prev)
@ -1269,7 +1250,7 @@ SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
{
SCM_VALIDATE_PORT (1, port);
return scm_from_latin1_string (SCM_PTAB_ENTRY (port)->encoding);
return scm_symbol_to_string (SCM_PTAB_ENTRY (port)->encoding);
}
#undef FUNC_NAME
@ -1856,9 +1837,9 @@ peek_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
if (scm_is_eq (pt->encoding, sym_UTF_8))
err = peek_utf8_codepoint (port, codepoint, len);
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
else if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
err = peek_latin1_codepoint (port, codepoint, len);
else
err = peek_iconv_codepoint (port, codepoint, len);
@ -1876,9 +1857,9 @@ peek_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
then silently consume it and read another code point. */
if (SCM_UNLIKELY
(*codepoint == SCM_UNICODE_BOM
&& (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
|| strcmp (pt->encoding, "UTF-16") == 0
|| strcmp (pt->encoding, "UTF-32") == 0)))
&& (scm_is_eq (pt->encoding, sym_UTF_8)
|| scm_is_eq (pt->encoding, sym_UTF_16)
|| scm_is_eq (pt->encoding, sym_UTF_32))))
{
scm_port_buffer_did_take (pt->read_buf, *len);
return peek_codepoint (port, codepoint, len);
@ -2022,14 +2003,13 @@ scm_ungetc (scm_t_wchar c, SCM port)
#define FUNC_NAME "scm_ungetc"
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
char *result;
char result_buf[10];
size_t len;
len = sizeof (result_buf);
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
if (scm_is_eq (pt->encoding, sym_UTF_8))
{
if (c < 0x80)
{
@ -2041,14 +2021,14 @@ scm_ungetc (scm_t_wchar c, SCM port)
result =
(char *) u32_to_u8 ((uint32_t *) &c, 1, (uint8_t *) result_buf, &len);
}
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1 && c <= 0xff)
else if (scm_is_eq (pt->encoding, sym_ISO_8859_1) && c <= 0xff)
{
result_buf[0] = (char) c;
result = result_buf;
len = 1;
}
else
result = u32_conv_to_encoding (pt->encoding,
result = u32_conv_to_encoding (scm_i_symbol_chars (pt->encoding),
(enum iconv_ilseq_handler) pt->ilseq_handler,
(uint32_t *) &c, 1, NULL,
result_buf, &len);
@ -3163,6 +3143,15 @@ scm_init_ice_9_ports (void)
void
scm_init_ports (void)
{
sym_UTF_8 = scm_from_latin1_symbol ("UTF-8");
sym_ISO_8859_1 = scm_from_latin1_symbol ("ISO-8859-1");
sym_UTF_16 = scm_from_latin1_symbol ("UTF-16");
sym_UTF_16LE = scm_from_latin1_symbol ("UTF-16LE");
sym_UTF_16BE = scm_from_latin1_symbol ("UTF-16BE");
sym_UTF_32 = scm_from_latin1_symbol ("UTF-32");
sym_UTF_32LE = scm_from_latin1_symbol ("UTF-32LE");
sym_UTF_32BE = scm_from_latin1_symbol ("UTF-32BE");
trampoline_to_c_read_subr =
scm_c_make_gsubr ("port-read", 4, 0, 0,
(scm_t_subr) trampoline_to_c_read);

View file

@ -114,7 +114,7 @@ typedef struct
int rw_random;
/* Character encoding support. */
char *encoding;
SCM encoding; /* A symbol of upper-case ASCII. */
scm_t_string_failed_conversion_handler ilseq_handler;
} scm_t_port;

View file

@ -64,6 +64,11 @@
#define PORT_CONVERSION_HANDLER(port) \
SCM_PTAB_ENTRY (port)->ilseq_handler
SCM_SYMBOL (sym_UTF_8, "UTF-8");
SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
SCM_SYMBOL (sym_UTF_16, "UTF-16");
SCM_SYMBOL (sym_UTF_32, "UTF-32");
static size_t display_string (const void *, int, size_t, SCM,
scm_t_string_failed_conversion_handler);
@ -1036,8 +1041,8 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
pti->at_stream_start_for_bom_read = 0;
/* Write a BOM if appropriate. */
if (SCM_UNLIKELY (strcmp(pt->encoding, "UTF-16") == 0
|| strcmp(pt->encoding, "UTF-32") == 0))
if (SCM_UNLIKELY (scm_is_eq (pt->encoding, sym_UTF_16)
|| scm_is_eq (pt->encoding, sym_UTF_32)))
display_character (SCM_UNICODE_BOM, port, iconveh_error);
}
@ -1135,13 +1140,13 @@ display_string (const void *str, int narrow_p,
size_t len, SCM port,
scm_t_string_failed_conversion_handler strategy)
{
scm_t_port_internal *pti;
scm_t_port *pt;
pti = SCM_PORT_GET_INTERNAL (port);
pt = SCM_PTAB_ENTRY (port);
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
if (scm_is_eq (pt->encoding, sym_UTF_8))
return display_string_as_utf8 (str, narrow_p, len, port);
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
else if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
return display_string_as_latin1 (str, narrow_p, len, port, strategy);
else
return display_string_using_iconv (str, narrow_p, len, port, strategy);

View file

@ -64,6 +64,7 @@ SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
SCM_SYMBOL (scm_keyword_prefix, "prefix");
SCM_SYMBOL (scm_keyword_postfix, "postfix");
SCM_SYMBOL (sym_nil, "nil");
SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
/* SRFI-105 curly infix expression support */
SCM_SYMBOL (sym_nfx, "$nfx$");
@ -1040,7 +1041,7 @@ scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
size_t charname_len, bytes_read;
scm_t_wchar cp;
int overflow;
scm_t_port_internal *pti;
scm_t_port *pt;
overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
&bytes_read);
@ -1058,14 +1059,14 @@ scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
return (SCM_MAKE_CHAR (chr));
}
pti = SCM_PORT_GET_INTERNAL (port);
pt = SCM_PTAB_ENTRY (port);
/* Simple ASCII characters can be processed immediately. Also, simple
ISO-8859-1 characters can be processed immediately if the encoding for this
port is ISO-8859-1. */
if (bytes_read == 1 &&
((unsigned char) buffer[0] <= 127
|| pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
|| scm_is_eq (pt->encoding, sym_ISO_8859_1)))
{
SCM_COL (port) += 1;
return SCM_MAKE_CHAR (buffer[0]);

View file

@ -51,6 +51,8 @@
/* {Strings}
*/
SCM_SYMBOL (sym_UTF_8, "UTF-8");
SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
/* Stringbufs
*
@ -1758,16 +1760,16 @@ SCM
scm_from_port_stringn (const char *str, size_t len, SCM port)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
return scm_from_latin1_stringn (str, len);
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
else if (scm_is_eq (pt->encoding, sym_UTF_8)
&& (pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR
|| (u8_check ((uint8_t *) str, len) == NULL)))
return scm_from_utf8_stringn (str, len);
else
return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
return scm_from_stringn (str, len, scm_i_symbol_chars (pt->encoding),
pt->ilseq_handler);
}
/* Create a new scheme string from the C string STR. The memory of
@ -2165,15 +2167,15 @@ char *
scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
if (scm_is_eq (pt->encoding, sym_ISO_8859_1)
&& pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
return scm_to_latin1_stringn (str, lenp);
else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
else if (scm_is_eq (pt->encoding, sym_UTF_8))
return scm_to_utf8_stringn (str, lenp);
else
return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
return scm_to_stringn (str, lenp, scm_i_symbol_chars (pt->encoding),
pt->ilseq_handler);
}
/* Return a malloc(3)-allocated buffer containing the contents of STR encoded