1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-05 11:40:20 +02:00

Add full Unicode capability to ports and the default reader

Ports are given two additional properties: a character encoding and
a conversion failure strategy.  These properties have getters and setters.
The new properties are used to convert any locale text to/from the
internal representation of strings.

If unspecified, ports use a default value. The default value of these
properties is held in a fluid.  The default character encoding can be
modified by calling setlocale.

ISO-8859-1 is treated specially.  Since it is a native encoding of
strings, it can be processed more quickly.  Source code is assumed to be
ISO-8859-1 unless otherwise specified.  The encoding of a source code
file can be given as 'coding: XXXXX' in a magic comment at the top of a
file.

The C functions that deal with encoding often use a null pointer
as shorthand for the native Latin-1 encoding, for efficiency's sake.

* test-suite/tests/encoding-iso88591.test: new tests
* test-suite/tests/encoding-iso88597.test: new tests
* test-suite/tests/encoding-utf8.test: new tests
* test-suite/tests/encoding-escapes.test: new tests
* test-suite/tests/numbers.test: declare 'binary' encoding
* test-suite/tests/ports.test: declare 'binary' encoding
* test-suite/tests/r6rs-ports.test: declare 'binary' encoding

* module/system/base/compile.scm (compile-file): use source-code
  file's self-declared encoding when compiling files

* libguile/strports.c: store string ports in locale encoding
  (scm_strport_to_locale_u8vector, scm_call_with_output_locale_u8vector)
  (scm_open_input_locale_u8vector, scm_get_output_locale_u8vector):
  new functions

* libguile/strings.h: new declaration for scm_i_string_contains_char

* libguile/strings.c (scm_i_string_contains_char): new function
  (scm_from_stringn, scm_to_stringn):  use NULL for Latin-1
  (scm_from_locale_stringn, scm_to_locale_stringn): respect character
  encoding of input and output ports

* libguile/read.h: declaration for scm_scan_for_encoding

* libguile/read.c:
  (read_token): now takes scheme string instead of C string/length
  (read_complete_token): new function
  (scm_read_sexp, scm_read_number, scm_read_mixed_case_symbol)
  (scm_read_number_and_radix, scm_read_quote, scm_read_semicolon_comment)
  (scm_read_srfi4_vector, scm_read_bytevector, scm_read_guile_bit_vector)
  (scm_read_scsh_block_comment, scm_read_commented_expression)
  (scm_read_extended_symbol, scm_read_sharp_extension, scm_read_shart)
  (scm_read_expression): use scm_t_wchar for char type, use read_complete_token
  (scm_scan_for_encoding): new function to find a file's character encoding
  (scm_file_encoding): new function to find a port's character encoding

* libguile/rdelim.c: don't unpack strings

* libguile/print.h: declaration for modified function
  scm_i_charprint

* libguile/print.c: use locale when printing characters and
  strings
  (scm_i_charprint): input parameter is now scm_t_wchar
  (scm_simple_format): don't unpack strings

* libguile/posix.h: new declaration for scm_setbinary.

* libguile/posix.c (scm_setlocale): set default and stdio port
  encodings based on the locale's character encoding
  (scm_setbinary): new function

* libguile/ports.h (scm_t_port): add encoding and failed
  conversion handler to port type.  Declarations for new or modified
  functions scm_getc, scm_unget_byte, scm_ungetc,
  scm_i_get_port_encoding, scm_i_set_port_encoding_x,
  scm_port_encoding, scm_set_port_encoding_x,
  scm_i_get_conversion_strategy, scm_i_set_conversion_strategy_x,
  scm_port_conversion_strategy, scm_set_port_conversion_strategy_x.

* libguile/ports.c: assign the current ports to zero on startup so
  we can see if they've been set.
  (scm_current_input_port, scm_current_output_port,
  scm_current_error_port): return #f if the port is not yet
  initialized
  (scm_new_port_table_entry): set up a new port's encoding and
  illegal sequence handler based on the thread's current defaults
  (scm_i_remove_port): free port encoding name when port is removed
  (scm_i_mode_bits_n): now takes a scheme string instead of a c
  string and length.  All callers changed.
  (SCM_MBCHAR_BUF_SIZE): new const
  (scm_getc): new function, since the scm_getc in inline.h is now
  scm_get_byte_or_eof.  This pulls one codepoint from a port.
  (scm_lfwrite_substr, scm_lfwrite_str): now uses port's encoding
  (scm_unget_byte): new function, incorportaing the low-level functionality
  of scm_ungetc
  (scm_ungetc): uses scm_unget_byte

* libguile/numbers.h (scm_t_wchar): compilation order problem with
  scm_t_wchar being use in functions in multiple headers.  Forward
  declare scm_t_wchar.

* libguile/load.c (scm_primitive_load): scan for file encoding at
  top of file and use it to set the load port's encoding

* libguile/inline.h (scm_get_byte_or_eof): new function
  incorporating most of the functionality of scm_getc.

* libguile/fports.c (fport_fill_input): now returns scm_t_wchar

* libguile/chars.h (scm_t_wchar): avoid compilation order problem
  with declaration of scm_t_wchar
This commit is contained in:
Michael Gran 2009-08-25 07:54:37 -07:00
parent 9db8cf1634
commit 889975e51a
26 changed files with 1705 additions and 316 deletions

View file

@ -24,7 +24,11 @@
#include "libguile/__scm.h"
#include "libguile/numbers.h"
#ifndef SCM_T_WCHAR_DEFINED
typedef scm_t_int32 scm_t_wchar;
#define SCM_T_WCHAR_DEFINED
#endif /* SCM_T_WCHAR_DEFINED */
/* Immediate Characters

View file

@ -587,7 +587,7 @@ static void fport_flush (SCM port);
/* fill a port's read-buffer with a single read. returns the first
char or EOF if end of file. */
static int
static scm_t_wchar
fport_fill_input (SCM port)
{
long count;
@ -601,7 +601,7 @@ fport_fill_input (SCM port)
if (count == -1)
scm_syserror ("fport_fill_input");
if (count == 0)
return EOF;
return (scm_t_wchar) EOF;
else
{
pt->read_pos = pt->read_buf;

View file

@ -86,7 +86,7 @@ SCM_API void scm_array_handle_set (scm_t_array_handle *h, ssize_t pos, SCM val);
SCM_API int scm_is_pair (SCM x);
SCM_API int scm_getc (SCM port);
SCM_API int scm_get_byte_or_eof (SCM port);
SCM_API void scm_putc (char c, SCM port);
SCM_API void scm_puts (const char *str_data, SCM port);
@ -290,7 +290,7 @@ scm_is_pair (SCM x)
SCM_C_EXTERN_INLINE
#endif
int
scm_getc (SCM port)
scm_get_byte_or_eof (SCM port)
{
int c;
scm_t_port *pt = SCM_PTAB_ENTRY (port);
@ -310,27 +310,6 @@ scm_getc (SCM port)
c = *(pt->read_pos++);
switch (c)
{
case '\a':
break;
case '\b':
SCM_DECCOL (port);
break;
case '\n':
SCM_INCLINE (port);
break;
case '\r':
SCM_ZEROCOL (port);
break;
case '\t':
SCM_TABCOL (port);
break;
default:
SCM_INCCOL (port);
break;
}
return c;
}

View file

@ -85,6 +85,7 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
#define FUNC_NAME s_scm_primitive_load
{
SCM hook = *scm_loc_load_hook;
char *encoding;
SCM_VALIDATE_STRING (1, filename);
if (scm_is_true (hook) && scm_is_false (scm_procedure_p (hook)))
SCM_MISC_ERROR ("value of %load-hook is neither a procedure nor #f",
@ -97,7 +98,15 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
SCM port = scm_open_file (filename, scm_from_locale_string ("r"));
scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
scm_i_dynwind_current_load_port (port);
encoding = scm_scan_for_encoding (port);
if (encoding)
{
scm_i_set_port_encoding_x (port, encoding);
free (encoding);
}
else
/* The file has no encoding declaraed. We'll presume Latin-1. */
scm_i_set_port_encoding_x (port, NULL);
while (1)
{
SCM reader, form;

View file

@ -28,6 +28,11 @@
#include "libguile/__scm.h"
#include "libguile/print.h"
#ifndef SCM_T_WCHAR_DEFINED
typedef scm_t_int32 scm_t_wchar;
#define SCM_T_WCHAR_DEFINED
#endif /* SCM_T_WCHAR_DEFINED */
#if SCM_HAVE_FLOATINGPOINT_H
# include <floatingpoint.h>
#endif
@ -174,7 +179,6 @@ typedef struct scm_t_complex
double imag;
} scm_t_complex;
typedef scm_t_int32 scm_t_wchar;

View file

@ -30,6 +30,9 @@
#include <errno.h>
#include <fcntl.h> /* for chsize on mingw */
#include <assert.h>
#include <uniconv.h>
#include <unistr.h>
#include <striconveh.h>
#include "libguile/_scm.h"
#include "libguile/async.h"
@ -51,6 +54,7 @@
#include "libguile/vectors.h"
#include "libguile/weaks.h"
#include "libguile/fluids.h"
#include "libguile/eq.h"
#ifdef HAVE_STRING_H
#include <string.h>
@ -358,10 +362,10 @@ SCM_DEFINE (scm_drain_input, "drain-input", 1, 0, 0,
/* Standard ports --- current input, output, error, and more(!). */
static SCM cur_inport_fluid;
static SCM cur_outport_fluid;
static SCM cur_errport_fluid;
static SCM cur_loadport_fluid;
static SCM cur_inport_fluid = 0;
static SCM cur_outport_fluid = 0;
static SCM cur_errport_fluid = 0;
static SCM cur_loadport_fluid = 0;
SCM_DEFINE (scm_current_input_port, "current-input-port", 0, 0, 0,
(),
@ -370,7 +374,10 @@ SCM_DEFINE (scm_current_input_port, "current-input-port", 0, 0, 0,
"returns the @dfn{standard input} in Unix and C terminology.")
#define FUNC_NAME s_scm_current_input_port
{
return scm_fluid_ref (cur_inport_fluid);
if (cur_inport_fluid)
return scm_fluid_ref (cur_inport_fluid);
else
return SCM_BOOL_F;
}
#undef FUNC_NAME
@ -382,7 +389,10 @@ SCM_DEFINE (scm_current_output_port, "current-output-port", 0, 0, 0,
"Unix and C terminology.")
#define FUNC_NAME s_scm_current_output_port
{
return scm_fluid_ref (cur_outport_fluid);
if (cur_outport_fluid)
return scm_fluid_ref (cur_outport_fluid);
else
return SCM_BOOL_F;
}
#undef FUNC_NAME
@ -392,7 +402,10 @@ SCM_DEFINE (scm_current_error_port, "current-error-port", 0, 0, 0,
"@dfn{standard error} in Unix and C terminology).")
#define FUNC_NAME s_scm_current_error_port
{
return scm_fluid_ref (cur_errport_fluid);
if (cur_errport_fluid)
return scm_fluid_ref (cur_errport_fluid);
else
return SCM_BOOL_F;
}
#undef FUNC_NAME
@ -509,10 +522,18 @@ scm_new_port_table_entry (scm_t_bits tag)
SCM z = scm_cons (SCM_EOL, SCM_EOL);
scm_t_port *entry = (scm_t_port *) scm_gc_calloc (sizeof (scm_t_port), "port");
const char *enc;
entry->file_name = SCM_BOOL_F;
entry->rw_active = SCM_PORT_NEITHER;
entry->port = z;
/* Initialize this port with the thread's current default
encoding. */
if ((enc = scm_i_get_port_encoding (SCM_BOOL_F)) == NULL)
entry->encoding = NULL;
else
entry->encoding = strdup (enc);
entry->ilseq_handler = scm_i_get_conversion_strategy (SCM_BOOL_F);
SCM_SET_CELL_TYPE (z, tag);
SCM_SETPTAB_ENTRY (z, entry);
@ -549,6 +570,11 @@ scm_i_remove_port (SCM port)
scm_t_port *p = SCM_PTAB_ENTRY (port);
if (p->putback_buf)
scm_gc_free (p->putback_buf, p->putback_buf_size, "putback buffer");
if (p->encoding)
{
free (p->encoding);
p->encoding = NULL;
}
scm_gc_free (p, sizeof (scm_t_port), "port");
SCM_SETPTAB_ENTRY (port, 0);
@ -632,21 +658,22 @@ SCM_DEFINE (scm_set_port_revealed_x, "set-port-revealed!", 2, 0, 0,
*/
static long
scm_i_mode_bits_n (const char *modes, size_t n)
scm_i_mode_bits_n (SCM modes)
{
return (SCM_OPN
| (memchr (modes, 'r', n) || memchr (modes, '+', n) ? SCM_RDNG : 0)
| ( memchr (modes, 'w', n)
|| memchr (modes, 'a', n)
|| memchr (modes, '+', n) ? SCM_WRTNG : 0)
| (memchr (modes, '0', n) ? SCM_BUF0 : 0)
| (memchr (modes, 'l', n) ? SCM_BUFLINE : 0));
| (scm_i_string_contains_char (modes, 'r')
|| scm_i_string_contains_char (modes, '+') ? SCM_RDNG : 0)
| (scm_i_string_contains_char (modes, 'w')
|| scm_i_string_contains_char (modes, 'a')
|| scm_i_string_contains_char (modes, '+') ? SCM_WRTNG : 0)
| (scm_i_string_contains_char (modes, '0') ? SCM_BUF0 : 0)
| (scm_i_string_contains_char (modes, 'l') ? SCM_BUFLINE : 0));
}
long
scm_mode_bits (char *modes)
{
return scm_i_mode_bits_n (modes, strlen (modes));
return scm_i_mode_bits (scm_from_locale_string (modes));
}
long
@ -657,8 +684,7 @@ scm_i_mode_bits (SCM modes)
if (!scm_is_string (modes))
scm_wrong_type_arg_msg (NULL, 0, modes, "string");
bits = scm_i_mode_bits_n (scm_i_string_chars (modes),
scm_i_string_length (modes));
bits = scm_i_mode_bits_n (modes);
scm_remember_upto_here_1 (modes);
return bits;
}
@ -929,7 +955,7 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
"characters are available, the end-of-file object is returned.")
#define FUNC_NAME s_scm_read_char
{
int c;
scm_t_wchar c;
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
SCM_VALIDATE_OPINPORT (1, port);
@ -940,6 +966,133 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
}
#undef FUNC_NAME
#define SCM_MBCHAR_BUF_SIZE (4)
/* Get one codepoint from a file, using the port's encoding. */
scm_t_wchar
scm_getc (SCM port)
{
int c;
unsigned int bufcount = 0;
char buf[SCM_MBCHAR_BUF_SIZE];
scm_t_wchar codepoint = 0;
scm_t_uint32 *u32;
size_t u32len;
scm_t_port *pt = SCM_PTAB_ENTRY (port);
c = scm_get_byte_or_eof (port);
if (c == EOF)
return (scm_t_wchar) EOF;
buf[0] = c;
bufcount++;
if (pt->encoding == NULL)
{
/* The encoding is Latin-1: bytes are characters. */
codepoint = buf[0];
goto success;
}
for (;;)
{
u32 = u32_conv_from_encoding (pt->encoding,
(enum iconv_ilseq_handler) pt->ilseq_handler,
buf, bufcount, NULL, NULL, &u32len);
if (u32 == NULL || u32len == 0)
{
if (errno == ENOMEM)
scm_memory_error ("Input decoding");
/* Otherwise errno is EILSEQ or EINVAL, so perhaps more
bytes are needed. Keep looping. */
}
else
{
/* Complete codepoint found. */
codepoint = u32[0];
free (u32);
goto success;
}
if (bufcount == SCM_MBCHAR_BUF_SIZE)
{
/* We've read several bytes and didn't find a good
codepoint. Give up. */
goto failure;
}
c = scm_get_byte_or_eof (port);
if (c == EOF)
{
/* EOF before a complete character was read. Push it all
back and return EOF. */
while (bufcount > 0)
{
/* FIXME: this will probably cause errors in the port column. */
scm_unget_byte (buf[bufcount-1], port);
bufcount --;
}
return EOF;
}
if (c == '\n')
{
/* It is always invalid to have EOL in the middle of a
multibyte character. */
scm_unget_byte ('\n', port);
goto failure;
}
buf[bufcount++] = c;
}
success:
switch (codepoint)
{
case '\a':
break;
case '\b':
SCM_DECCOL (port);
break;
case '\n':
SCM_INCLINE (port);
break;
case '\r':
SCM_ZEROCOL (port);
break;
case '\t':
SCM_TABCOL (port);
break;
default:
SCM_INCCOL (port);
break;
}
return codepoint;
failure:
{
char *err_buf;
SCM err_str = scm_i_make_string (bufcount, &err_buf);
memcpy (err_buf, buf, bufcount);
if (errno == EILSEQ)
scm_misc_error (NULL, "input encoding error for ~s: ~s",
scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
err_str));
else
scm_misc_error (NULL, "input encoding error (invalid) for ~s: ~s\n",
scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
err_str));
}
/* Never gets here. */
return 0;
}
/* this should only be called when the read buffer is empty. it
tries to refill the read buffer. it returns the first char from
the port, which is either EOF or *(pt->read_pos). */
@ -1027,7 +1180,7 @@ scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
stringbuf_write_mutex. So, one shouldn't use scm_lfwrite_substr
if the stringbuf write mutex may still be held elsewhere. */
buf = scm_to_stringn (scm_c_substring (str, start, end), &len,
NULL, SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
pt->encoding, pt->ilseq_handler);
ptob->write (port, buf, len);
free (buf);
@ -1056,7 +1209,7 @@ scm_lfwrite_str (SCM str, SCM port)
scm_end_input (port);
buf = scm_to_stringn (str, &len,
NULL, SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
pt->encoding, pt->ilseq_handler);
ptob->write (port, buf, len);
free (buf);
@ -1257,8 +1410,8 @@ scm_end_input (SCM port)
void
scm_ungetc (int c, SCM port)
#define FUNC_NAME "scm_ungetc"
scm_unget_byte (int c, SCM port)
#define FUNC_NAME "scm_unget_byte"
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
@ -1317,6 +1470,25 @@ scm_ungetc (int c, SCM port)
if (pt->rw_random)
pt->rw_active = SCM_PORT_READ;
}
#undef FUNC_NAME
void
scm_ungetc (scm_t_wchar c, SCM port)
#define FUNC_NAME "scm_ungetc"
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_wchar *wbuf;
SCM str = scm_i_make_wide_string (1, &wbuf);
char *buf;
size_t len;
int i;
wbuf[0] = c;
buf = scm_to_stringn (str, &len, pt->encoding, pt->ilseq_handler);
for (i = len - 1; i >= 0; i--)
scm_unget_byte (buf[i], port);
if (c == '\n')
{
@ -1363,7 +1535,7 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
"to @code{read-char} would have hung.")
#define FUNC_NAME s_scm_peek_char
{
int c, column;
scm_t_wchar c, column;
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
else
@ -1409,13 +1581,17 @@ SCM_DEFINE (scm_unread_string, "unread-string", 2, 0, 0,
"@var{port} is not supplied, the current-input-port is used.")
#define FUNC_NAME s_scm_unread_string
{
int n;
SCM_VALIDATE_STRING (1, str);
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
else
SCM_VALIDATE_OPINPORT (2, port);
scm_ungets (scm_i_string_chars (str), scm_i_string_length (str), port);
n = scm_i_string_length (str);
while (n--)
scm_ungetc (scm_i_string_ref (str, n), port);
return str;
}
@ -1670,6 +1846,328 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0,
}
#undef FUNC_NAME
/* The default port encoding for this locale. New ports will have this
encoding. If it is a string, that is the encoding. If it #f, it
is in the native (Latin-1) encoding. */
SCM_GLOBAL_VARIABLE (scm_port_encoding_var, "%port-encoding");
static int scm_port_encoding_init = 0;
/* Return a C string representation of the current encoding. */
const char *
scm_i_get_port_encoding (SCM port)
{
SCM encoding;
if (scm_is_false (port))
{
if (!scm_port_encoding_init)
return NULL;
else if (!scm_is_fluid (SCM_VARIABLE_REF (scm_port_encoding_var)))
return NULL;
else
{
encoding = scm_fluid_ref (SCM_VARIABLE_REF (scm_port_encoding_var));
if (!scm_is_string (encoding))
return NULL;
else
return scm_i_string_chars (encoding);
}
}
else
{
scm_t_port *pt;
pt = SCM_PTAB_ENTRY (port);
if (pt->encoding)
return pt->encoding;
else
return NULL;
}
}
/* Returns ENC is if is a recognized encoding. If it isn't, it tries
to find an alias of ENC that is valid. Otherwise, it returns
NULL. */
static const char *
find_valid_encoding (const char *enc)
{
int isvalid = 0;
const char str[] = " ";
scm_t_uint32 *u32;
size_t u32len;
u32 = u32_conv_from_encoding (enc, iconveh_error, str, 1,
NULL, NULL, &u32len);
isvalid = (u32 != NULL);
free (u32);
if (isvalid)
return enc;
return NULL;
}
void
scm_i_set_port_encoding_x (SCM port, const char *enc)
{
const char *valid_enc;
scm_t_port *pt;
/* Null is shorthand for the native, Latin-1 encoding. */
if (enc == NULL)
valid_enc = NULL;
else
{
valid_enc = find_valid_encoding (enc);
if (valid_enc == NULL)
{
SCM err;
err = scm_from_locale_string (enc);
scm_misc_error (NULL, "invalid or unknown character encoding ~s",
scm_list_1 (err));
}
}
if (scm_is_false (port))
{
/* Set the default encoding for future ports. */
if (!scm_port_encoding_init
|| !scm_is_fluid (SCM_VARIABLE_REF (scm_port_encoding_var)))
scm_misc_error (NULL, "tried to set port encoding fluid before it is initialized",
SCM_EOL);
if (valid_enc == NULL
|| !strcmp (valid_enc, "ASCII")
|| !strcmp (valid_enc, "ANSI_X3.4-1968")
|| !strcmp (valid_enc, "ISO-8859-1"))
scm_fluid_set_x (SCM_VARIABLE_REF (scm_port_encoding_var), SCM_BOOL_F);
else
scm_fluid_set_x (SCM_VARIABLE_REF (scm_port_encoding_var),
scm_from_locale_string (valid_enc));
}
else
{
/* Set the character encoding for this port. */
pt = SCM_PTAB_ENTRY (port);
if (pt->encoding)
free (pt->encoding);
if (valid_enc == NULL)
pt->encoding = NULL;
else
pt->encoding = strdup (valid_enc);
}
}
SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
(SCM port),
"Returns, as a string, the character encoding that @var{port}\n"
"uses to interpret its input and output.\n")
#define FUNC_NAME s_scm_port_encoding
{
scm_t_port *pt;
const char *enc;
SCM_VALIDATE_PORT (1, port);
pt = SCM_PTAB_ENTRY (port);
enc = scm_i_get_port_encoding (port);
if (enc)
return scm_from_locale_string (pt->encoding);
else
return scm_from_locale_string ("NONE");
}
#undef FUNC_NAME
SCM_DEFINE (scm_set_port_encoding_x, "set-port-encoding!", 2, 0, 0,
(SCM port, SCM enc),
"Sets the character encoding that will be used to interpret all\n"
"port I/O. New ports are created with the encoding\n"
"appropriate for the current locale if @code{setlocale} has \n"
"been called or ISO-8859-1 otherwise\n"
"and this procedure can be used to modify that encoding.\n")
#define FUNC_NAME s_scm_set_port_encoding_x
{
char *enc_str;
const char *valid_enc_str;
SCM_VALIDATE_PORT (1, port);
SCM_VALIDATE_STRING (2, enc);
enc_str = scm_to_locale_string (enc);
valid_enc_str = find_valid_encoding (enc_str);
if (valid_enc_str == NULL)
{
free (enc_str);
scm_misc_error (FUNC_NAME, "invalid or unknown character encoding ~s",
scm_list_1 (enc));
}
else
{
scm_i_set_port_encoding_x (port, valid_enc_str);
free (enc_str);
}
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
/* This determines how conversions handle unconvertible characters. */
SCM_GLOBAL_VARIABLE (scm_conversion_strategy, "%port-conversion-strategy");
static int scm_conversion_strategy_init = 0;
scm_t_string_failed_conversion_handler
scm_i_get_conversion_strategy (SCM port)
{
SCM encoding;
if (scm_is_false (port))
{
if (!scm_conversion_strategy_init
|| !scm_is_fluid (SCM_VARIABLE_REF (scm_conversion_strategy)))
return SCM_FAILED_CONVERSION_QUESTION_MARK;
else
{
encoding = scm_fluid_ref (SCM_VARIABLE_REF (scm_conversion_strategy));
if (scm_is_false (encoding))
return SCM_FAILED_CONVERSION_QUESTION_MARK;
else
return (scm_t_string_failed_conversion_handler) scm_to_int (encoding);
}
}
else
{
scm_t_port *pt;
pt = SCM_PTAB_ENTRY (port);
return pt->ilseq_handler;
}
}
void
scm_i_set_conversion_strategy_x (SCM port,
scm_t_string_failed_conversion_handler handler)
{
SCM strategy;
scm_t_port *pt;
strategy = scm_from_int ((int) handler);
if (scm_is_false (port))
{
/* Set the default encoding for future ports. */
if (!scm_conversion_strategy
|| !scm_is_fluid (SCM_VARIABLE_REF (scm_conversion_strategy)))
scm_misc_error (NULL, "tried to set conversion strategy fluid before it is initialized",
SCM_EOL);
scm_fluid_set_x (SCM_VARIABLE_REF (scm_conversion_strategy), strategy);
}
else
{
/* Set the character encoding for this port. */
pt = SCM_PTAB_ENTRY (port);
pt->ilseq_handler = handler;
}
}
SCM_DEFINE (scm_port_conversion_strategy, "port-conversion-strategy",
1, 0, 0, (SCM port),
"Returns the behavior of the port when handling a character that\n"
"is not representable in the port's current encoding.\n"
"It returns the symbol @code{error} if unrepresentable characters\n"
"should cause exceptions, @code{substitute} if the port should\n"
"try to replace unrepresentable characters with question marks or\n"
"approximate characters, or @code{escape} if unrepresentable\n"
"characters should be converted to string escapes.\n"
"\n"
"If @var{port} is @code{#f}, then the current default behavior\n"
"will be returned. New ports will have this default behavior\n"
"when they are created.\n")
#define FUNC_NAME s_scm_port_conversion_strategy
{
scm_t_string_failed_conversion_handler h;
SCM_VALIDATE_OPPORT (1, port);
if (!scm_is_false (port))
{
SCM_VALIDATE_OPPORT (1, port);
}
h = scm_i_get_conversion_strategy (port);
if (h == SCM_FAILED_CONVERSION_ERROR)
return scm_from_locale_symbol ("error");
else if (h == SCM_FAILED_CONVERSION_QUESTION_MARK)
return scm_from_locale_symbol ("substitute");
else if (h == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
return scm_from_locale_symbol ("escape");
else
abort ();
/* Never gets here. */
return SCM_UNDEFINED;
}
#undef FUNC_NAME
SCM_DEFINE (scm_set_port_conversion_strategy_x, "set-port-conversion-strategy!",
2, 0, 0,
(SCM port, SCM sym),
"Sets the behavior of the interpreter when outputting a character\n"
"that is not representable in the port's current encoding.\n"
"@var{sym} can be either @code{'error}, @code{'substitute}, or\n"
"@code{'escape}. If it is @code{'error}, an error will be thrown\n"
"when an unconvertible character is encountered. If it is\n"
"@code{'substitute}, then unconvertible characters will \n"
"be replaced with approximate characters, or with question marks\n"
"if no approximately correct character is available.\n"
"If it is @code{'escape},\n"
"it will appear as a hex escape when output.\n"
"\n"
"If @var{port} is an open port, the conversion error behavior\n"
"is set for that port. If it is @code{#f}, it is set as the\n"
"default behavior for any future ports that get created in\n"
"this thread.\n")
#define FUNC_NAME s_scm_set_port_conversion_strategy_x
{
SCM err;
SCM qm;
SCM esc;
if (!scm_is_false (port))
{
SCM_VALIDATE_OPPORT (1, port);
}
err = scm_from_locale_symbol ("error");
if (scm_is_true (scm_eqv_p (sym, err)))
{
scm_i_set_conversion_strategy_x (port, SCM_FAILED_CONVERSION_ERROR);
return SCM_UNSPECIFIED;
}
qm = scm_from_locale_symbol ("substitute");
if (scm_is_true (scm_eqv_p (sym, qm)))
{
scm_i_set_conversion_strategy_x (port,
SCM_FAILED_CONVERSION_QUESTION_MARK);
return SCM_UNSPECIFIED;
}
esc = scm_from_locale_symbol ("escape");
if (scm_is_true (scm_eqv_p (sym, esc)))
{
scm_i_set_conversion_strategy_x (port,
SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
return SCM_UNSPECIFIED;
}
SCM_MISC_ERROR ("unknown conversion behavior ~s", scm_list_1 (sym));
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
void
scm_print_port_mode (SCM exp, SCM port)
{
@ -1780,8 +2278,17 @@ scm_init_ports ()
cur_loadport_fluid = scm_permanent_object (scm_make_fluid ());
scm_i_port_weak_hash = scm_permanent_object (scm_make_weak_key_hash_table (SCM_I_MAKINUM(31)));
#include "libguile/ports.x"
SCM_VARIABLE_SET (scm_port_encoding_var, scm_make_fluid ());
scm_fluid_set_x (SCM_VARIABLE_REF (scm_port_encoding_var), SCM_BOOL_F);
scm_port_encoding_init = 1;
SCM_VARIABLE_SET (scm_conversion_strategy, scm_make_fluid ());
scm_fluid_set_x (SCM_VARIABLE_REF (scm_conversion_strategy),
scm_from_int ((int) SCM_FAILED_CONVERSION_QUESTION_MARK));
scm_conversion_strategy_init = 1;
}
/*

View file

@ -28,7 +28,7 @@
#include "libguile/print.h"
#include "libguile/struct.h"
#include "libguile/threads.h"
#include "libguile/strings.h"
@ -56,6 +56,10 @@ typedef struct
long line_number; /* debugging support. */
int column_number; /* debugging support. */
/* Character encoding support */
char *encoding;
scm_t_string_failed_conversion_handler ilseq_handler;
/* port buffers. the buffer(s) are set up for all ports.
in the case of string ports, the buffer is the string itself.
in the case of unbuffered file ports, the buffer is a
@ -266,6 +270,7 @@ SCM_API SCM scm_eof_object_p (SCM x);
SCM_API SCM scm_force_output (SCM port);
SCM_API SCM scm_flush_all_ports (void);
SCM_API SCM scm_read_char (SCM port);
SCM_API scm_t_wchar scm_getc (SCM port);
SCM_API size_t scm_c_read (SCM port, void *buffer, size_t size);
SCM_API void scm_c_write (SCM port, const void *buffer, size_t size);
SCM_API void scm_lfwrite (const char *ptr, size_t size, SCM port);
@ -275,7 +280,8 @@ SCM_INTERNAL void scm_lfwrite_substr (SCM str, size_t start, size_t end,
SCM_API void scm_flush (SCM port);
SCM_API void scm_end_input (SCM port);
SCM_API int scm_fill_input (SCM port);
SCM_API void scm_ungetc (int c, SCM port);
SCM_INTERNAL void scm_unget_byte (int c, SCM port);
SCM_API void scm_ungetc (scm_t_wchar c, SCM port);
SCM_API void scm_ungets (const char *s, int n, SCM port);
SCM_API SCM scm_peek_char (SCM port);
SCM_API SCM scm_unread_char (SCM cobj, SCM port);
@ -288,6 +294,15 @@ SCM_API SCM scm_port_column (SCM port);
SCM_API SCM scm_set_port_column_x (SCM port, SCM line);
SCM_API SCM scm_port_filename (SCM port);
SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename);
SCM_INTERNAL const char *scm_i_get_port_encoding (SCM port);
SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);
SCM_API SCM scm_port_encoding (SCM port);
SCM_API SCM scm_set_port_encoding_x (SCM port, SCM encoding);
SCM_INTERNAL scm_t_string_failed_conversion_handler scm_i_get_conversion_strategy (SCM port);
SCM_INTERNAL void scm_i_set_conversion_strategy_x (SCM port,
scm_t_string_failed_conversion_handler h);
SCM_API SCM scm_port_conversion_strategy (SCM port);
SCM_API SCM scm_set_port_conversion_strategy_x (SCM port, SCM behavior);
SCM_API int scm_port_print (SCM exp, SCM port, scm_print_state *);
SCM_API void scm_print_port_mode (SCM exp, SCM port);
SCM_API void scm_ports_prehistory (void);
@ -295,7 +310,6 @@ SCM_API SCM scm_void_port (char * mode_str);
SCM_API SCM scm_sys_make_void_port (SCM mode);
SCM_INTERNAL void scm_init_ports (void);
#if SCM_ENABLE_DEPRECATED==1
SCM_API scm_t_port * scm_add_to_port_table (SCM port);
#endif

View file

@ -25,6 +25,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <uniconv.h>
#include "libguile/_scm.h"
#include "libguile/dynwind.h"
@ -1528,12 +1529,17 @@ SCM_DEFINE (scm_setlocale, "setlocale", 1, 1, 0,
"Otherwise the specified locale category is set to the string\n"
"@var{locale} and the new value is returned as a\n"
"system-dependent string. If @var{locale} is an empty string,\n"
"the locale will be set using environment variables.")
"the locale will be set using environment variables.\n"
"\n"
"When the locale is changed, the character encoding of the new\n"
"locale (UTF-8, ISO-8859-1, etc.) is used for the current\n"
"input, output, and error ports\n")
#define FUNC_NAME s_scm_setlocale
{
int c_category;
char *clocale;
char *rv;
const char *enc;
scm_dynwind_begin (0);
@ -1562,15 +1568,47 @@ SCM_DEFINE (scm_setlocale, "setlocale", 1, 1, 0,
SCM_SYSERROR;
}
/* Recompute the standard SRFI-14 character sets in a locale-dependent
(actually charset-dependent) way. */
scm_srfi_14_compute_char_sets ();
enc = locale_charset ();
/* Set the default encoding for new ports. */
scm_i_set_port_encoding_x (SCM_BOOL_F, enc);
/* Set the encoding for the stdio ports. */
scm_i_set_port_encoding_x (scm_current_input_port (), enc);
scm_i_set_port_encoding_x (scm_current_output_port (), enc);
scm_i_set_port_encoding_x (scm_current_error_port (), enc);
scm_dynwind_end ();
return scm_from_locale_string (rv);
}
#undef FUNC_NAME
#endif /* HAVE_SETLOCALE */
SCM_DEFINE (scm_setbinary, "setbinary", 0, 0, 0,
(void),
"Sets the encoding for the current input, output, and error\n"
"ports to ISO-8859-1. That character encoding allows\n"
"ports to operate on binary data.\n"
"\n"
"It also sets the default encoding for newly created ports\n"
"to ISO-8859-1.\n"
"\n"
"The previous default encoding for new ports is returned\n")
#define FUNC_NAME s_scm_setbinary
{
const char *enc = scm_i_get_port_encoding (SCM_BOOL_F);
/* Set the default encoding for new ports. */
scm_i_set_port_encoding_x (SCM_BOOL_F, NULL);
/* Set the encoding for the stdio ports. */
scm_i_set_port_encoding_x (scm_current_input_port (), NULL);
scm_i_set_port_encoding_x (scm_current_output_port (), NULL);
scm_i_set_port_encoding_x (scm_current_error_port (), NULL);
if (enc)
return scm_from_locale_string (enc);
return scm_from_locale_string ("ISO-8859-1");
}
#undef FUNC_NAME
#ifdef HAVE_MKNOD
SCM_DEFINE (scm_mknod, "mknod", 4, 0, 0,

View file

@ -74,6 +74,7 @@ SCM_API SCM scm_access (SCM path, SCM how);
SCM_API SCM scm_getpid (void);
SCM_API SCM scm_putenv (SCM str);
SCM_API SCM scm_setlocale (SCM category, SCM locale);
SCM_API SCM scm_setbinary (void);
SCM_API SCM scm_mknod (SCM path, SCM type, SCM perms, SCM dev);
SCM_API SCM scm_nice (SCM incr);
SCM_API SCM scm_sync (void);

View file

@ -463,20 +463,45 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
| UC_CATEGORY_MASK_S))
/* Print the character if is graphic character. */
{
if (i<256)
/* Character is graphic. Print it. */
scm_putc (i, port);
scm_t_wchar *wbuf;
SCM wstr = scm_i_make_wide_string (1, &wbuf);
char *buf;
size_t len;
const char *enc;
enc = scm_i_get_port_encoding (port);
wbuf[0] = i;
if (enc == NULL && i <= 0xFF)
{
/* Character is graphic and Latin-1. Print it */
scm_lfwrite_str (wstr, port);
}
else
/* Character is graphic but unrepresentable in
this port's encoding. */
scm_intprint (i, 8, port);
{
buf = u32_conv_to_encoding (enc,
iconveh_error,
(scm_t_uint32 *) wbuf,
1,
NULL,
NULL, &len);
if (buf != NULL)
{
/* Character is graphic. Print it. */
scm_lfwrite_str (wstr, port);
free (buf);
}
else
/* Character is graphic but unrepresentable in
this port's encoding. */
scm_intprint (i, 8, port);
}
}
else
/* Character is a non-graphical character. */
scm_intprint (i, 8, port);
}
else
scm_putc (i, port);
scm_i_charprint (i, port);
}
else if (SCM_IFLAGP (exp)
&& ((size_t) SCM_IFLAGNUM (exp) < (sizeof iflagnames / sizeof (char *))))
@ -608,21 +633,32 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
SCM wstr = scm_i_make_wide_string (1, &wbuf);
char *buf;
size_t len;
wbuf[0] = ch;
buf = u32_conv_to_encoding ("ISO-8859-1",
iconveh_error,
(scm_t_uint32 *) wbuf,
1, NULL, NULL, &len);
if (buf != NULL)
if (scm_i_get_port_encoding (port))
{
/* Character is graphic and representable in
this encoding. Print it. */
scm_lfwrite_str (wstr, port);
free (buf);
printed = 1;
wstr = scm_i_make_wide_string (1, &wbuf);
wbuf[0] = ch;
buf = u32_conv_to_encoding (scm_i_get_port_encoding (port),
iconveh_error,
(scm_t_uint32 *) wbuf,
1 ,
NULL,
NULL, &len);
if (buf != NULL)
{
/* Character is graphic and representable in
this encoding. Print it. */
scm_lfwrite_str (wstr, port);
free (buf);
printed = 1;
}
}
else
if (ch <= 0xFF)
{
scm_putc (ch, port);
printed = 1;
}
}
if (!printed)
@ -835,7 +871,7 @@ scm_prin1 (SCM exp, SCM port, int writingp)
/* Print a character.
*/
void
scm_i_charprint (scm_t_uint32 ch, SCM port)
scm_i_charprint (scm_t_wchar ch, SCM port)
{
scm_t_wchar *wbuf;
SCM wstr = scm_i_make_wide_string (1, &wbuf);
@ -1057,9 +1093,7 @@ SCM_DEFINE (scm_simple_format, "simple-format", 2, 0, 1,
SCM port, answer = SCM_UNSPECIFIED;
int fReturnString = 0;
int writingp;
const char *start;
const char *end;
const char *p;
size_t start, p, end;
if (scm_is_eq (destination, SCM_BOOL_T))
{
@ -1082,15 +1116,16 @@ SCM_DEFINE (scm_simple_format, "simple-format", 2, 0, 1,
SCM_VALIDATE_STRING (2, message);
SCM_VALIDATE_REST_ARGUMENT (args);
start = scm_i_string_chars (message);
end = start + scm_i_string_length (message);
p = 0;
start = 0;
end = scm_i_string_length (message);
for (p = start; p != end; ++p)
if (*p == '~')
if (scm_i_string_ref (message, p) == '~')
{
if (++p == end)
break;
switch (*p)
switch (scm_i_string_ref (message, p))
{
case 'A': case 'a':
writingp = 0;
@ -1099,33 +1134,33 @@ SCM_DEFINE (scm_simple_format, "simple-format", 2, 0, 1,
writingp = 1;
break;
case '~':
scm_lfwrite (start, p - start, port);
scm_lfwrite_substr (message, start, p, port);
start = p + 1;
continue;
case '%':
scm_lfwrite (start, p - start - 1, port);
scm_lfwrite_substr (message, start, p - 1, port);
scm_newline (port);
start = p + 1;
continue;
default:
SCM_MISC_ERROR ("FORMAT: Unsupported format option ~~~A - use (ice-9 format) instead",
scm_list_1 (SCM_MAKE_CHAR (*p)));
scm_list_1 (SCM_MAKE_CHAR (scm_i_string_ref (message, p))));
}
if (!scm_is_pair (args))
SCM_MISC_ERROR ("FORMAT: Missing argument for ~~~A",
scm_list_1 (SCM_MAKE_CHAR (*p)));
scm_list_1 (SCM_MAKE_CHAR (scm_i_string_ref (message, p))));
scm_lfwrite (start, p - start - 1, port);
scm_lfwrite_substr (message, start, p - 1, port);
/* we pass destination here */
scm_prin1 (SCM_CAR (args), destination, writingp);
args = SCM_CDR (args);
start = p + 1;
}
scm_lfwrite (start, p - start, port);
scm_lfwrite_substr (message, start, p, port);
if (!scm_is_eq (args, SCM_EOL))
SCM_MISC_ERROR ("FORMAT: ~A superfluous arguments",
scm_list_1 (scm_length (args)));

View file

@ -25,6 +25,7 @@
#include "libguile/__scm.h"
#include "libguile/chars.h"
#include "libguile/options.h"
@ -77,7 +78,7 @@ SCM_API SCM scm_print_options (SCM setting);
SCM_API SCM scm_make_print_state (void);
SCM_API void scm_free_print_state (SCM print_state);
SCM_INTERNAL SCM scm_i_port_with_print_state (SCM port, SCM print_state);
SCM_INTERNAL void scm_i_charprint (scm_t_uint32 c, SCM port);
SCM_INTERNAL void scm_i_charprint (scm_t_wchar c, SCM port);
SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port);
SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port);
SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port);

View file

@ -59,12 +59,10 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0,
size_t j;
size_t cstart;
size_t cend;
int c;
const char *cdelims;
scm_t_wchar c;
size_t num_delims;
SCM_VALIDATE_STRING (1, delims);
cdelims = scm_i_string_chars (delims);
num_delims = scm_i_string_length (delims);
SCM_VALIDATE_STRING (2, str);
@ -83,7 +81,7 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0,
c = scm_getc (port);
for (k = 0; k < num_delims; k++)
{
if (cdelims[k] == c)
if (scm_i_string_ref (delims, k) == c)
{
if (scm_is_false (gobble))
scm_ungetc (c, port);

View file

@ -27,6 +27,8 @@
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <unistd.h>
#include <unicase.h>
#include "libguile/_scm.h"
#include "libguile/bytevectors.h"
@ -177,11 +179,6 @@ static SCM *scm_read_hash_procedures;
(((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
|| ((_chr) == 'd') || ((_chr) == 'l'))
/* An inlinable version of `scm_c_downcase ()'. */
#define CHAR_DOWNCASE(_chr) \
(((_chr) <= UCHAR_MAX) ? tolower ((int) (_chr)) : (_chr))
/* Read an SCSH block comment. */
static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
static SCM scm_read_commented_expression (int chr, SCM port);
@ -189,41 +186,69 @@ static SCM scm_read_commented_expression (int chr, SCM port);
/* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
zero if the whole token fits in BUF, non-zero otherwise. */
static inline int
read_token (SCM port, char *buf, size_t buf_size, size_t *read)
read_token (SCM port, SCM buf, size_t *read)
{
scm_t_wchar chr;
*read = 0;
while (*read < buf_size)
buf = scm_i_string_start_writing (buf);
while (*read < scm_i_string_length (buf))
{
int chr;
chr = scm_getc (port);
chr = (SCM_CASE_INSENSITIVE_P ? CHAR_DOWNCASE (chr) : chr);
if (chr == EOF)
return 0;
else if (CHAR_IS_DELIMITER (chr))
{
scm_i_string_stop_writing ();
return 0;
}
chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
if (CHAR_IS_DELIMITER (chr))
{
scm_i_string_stop_writing ();
scm_ungetc (chr, port);
return 0;
}
else
{
*buf = (char) chr;
buf++, (*read)++;
}
scm_i_string_set_x (buf, *read, chr);
(*read)++;
}
scm_i_string_stop_writing ();
return 1;
}
static SCM
read_complete_token (SCM port, size_t *read)
{
SCM buffer, str = SCM_EOL;
size_t len;
int overflow;
buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
overflow = read_token (port, buffer, read);
if (!overflow)
return scm_i_substring (buffer, 0, *read);
str = scm_string_copy (buffer);
do
{
overflow = read_token (port, buffer, &len);
str = scm_string_append (scm_list_2 (str, buffer));
*read += len;
}
while (overflow);
return scm_i_substring (str, 0, *read);
}
/* Skip whitespace from PORT and return the first non-whitespace character
read. Raise an error on end-of-file. */
static int
flush_ws (SCM port, const char *eoferr)
{
register int c;
register scm_t_wchar c;
while (1)
switch (c = scm_getc (port))
{
@ -292,7 +317,7 @@ static SCM recsexpr (SCM obj, long line, int column, SCM filename);
static SCM
scm_read_sexp (int chr, SCM port)
scm_read_sexp (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_i_lreadparen"
{
register int c;
@ -553,107 +578,52 @@ scm_read_string (int chr, SCM port)
static SCM
scm_read_number (int chr, SCM port)
scm_read_number (scm_t_wchar chr, SCM port)
{
SCM result, str = SCM_EOL;
char buffer[READER_BUFFER_SIZE];
SCM result;
SCM buffer;
size_t read;
int overflow = 0;
scm_ungetc (chr, port);
do
{
overflow = read_token (port, buffer, sizeof (buffer), &read);
if ((overflow) || (scm_is_pair (str)))
str = scm_cons (scm_from_locale_stringn (buffer, read), str);
}
while (overflow);
if (scm_is_pair (str))
{
/* The slow path. */
str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
result = scm_string_to_number (str, SCM_UNDEFINED);
if (!scm_is_true (result))
/* Return a symbol instead of a number. */
result = scm_string_to_symbol (str);
}
else
{
result = scm_c_locale_stringn_to_number (buffer, read, 10);
if (!scm_is_true (result))
/* Return a symbol instead of a number. */
result = scm_from_locale_symboln (buffer, read);
}
buffer = read_complete_token (port, &read);
result = scm_string_to_number (buffer, SCM_UNDEFINED);
if (!scm_is_true (result))
/* Return a symbol instead of a number. */
result = scm_string_to_symbol (buffer);
return result;
}
static SCM
scm_read_mixed_case_symbol (int chr, SCM port)
scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
{
SCM result, str = SCM_EOL;
int overflow = 0, ends_with_colon = 0;
char buffer[READER_BUFFER_SIZE];
SCM result;
int ends_with_colon = 0;
SCM buffer;
size_t read = 0;
int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
scm_ungetc (chr, port);
do
{
overflow = read_token (port, buffer, sizeof (buffer), &read);
buffer = read_complete_token (port, &read);
if (read > 0)
ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
if (read > 0)
ends_with_colon = (buffer[read - 1] == ':');
if ((overflow) || (scm_is_pair (str)))
str = scm_cons (scm_from_locale_stringn (buffer, read), str);
}
while (overflow);
if (scm_is_pair (str))
{
size_t len;
str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
len = scm_c_string_length (str);
/* Per SRFI-88, `:' alone is an identifier, not a keyword. */
if (postfix && ends_with_colon && (len > 1))
{
/* Strip off colon. */
str = scm_c_substring (str, 0, len-1);
result = scm_string_to_symbol (str);
result = scm_symbol_to_keyword (result);
}
else
result = scm_string_to_symbol (str);
}
if (postfix && ends_with_colon && (read > 1))
result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
else
{
/* For symbols smaller than `sizeof (buffer)', we don't need to recur
to Scheme strings. Therefore, we only create one Scheme object (a
symbol) per symbol read. */
if (postfix && ends_with_colon && (read > 1))
result = scm_from_locale_keywordn (buffer, read - 1);
else
result = scm_from_locale_symboln (buffer, read);
}
result = scm_string_to_symbol (buffer);
return result;
}
static SCM
scm_read_number_and_radix (int chr, SCM port)
scm_read_number_and_radix (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
SCM result, str = SCM_EOL;
SCM result;
size_t read;
char buffer[READER_BUFFER_SIZE];
SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
unsigned int radix;
int overflow = 0;
switch (chr)
{
@ -683,22 +653,8 @@ scm_read_number_and_radix (int chr, SCM port)
radix = 10;
}
do
{
overflow = read_token (port, buffer, sizeof (buffer), &read);
if ((overflow) || (scm_is_pair (str)))
str = scm_cons (scm_from_locale_stringn (buffer, read), str);
}
while (overflow);
if (scm_is_pair (str))
{
str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
result = scm_string_to_number (str, scm_from_uint (radix));
}
else
result = scm_c_locale_stringn_to_number (buffer, read, radix);
buffer = read_complete_token (port, &read);
result = scm_string_to_number (buffer, scm_from_uint (radix));
if (scm_is_true (result))
return result;
@ -728,7 +684,7 @@ scm_read_quote (int chr, SCM port)
case ',':
{
int c;
scm_t_wchar c;
c = scm_getc (port);
if ('@' == c)
@ -827,7 +783,10 @@ scm_read_semicolon_comment (int chr, SCM port)
{
int c;
for (c = scm_getc (port);
/* We use the get_byte here because there is no need to get the
locale correct with comment input. This presumes that newline
always represents itself no matter what the encoding is. */
for (c = scm_get_byte_or_eof (port);
(c != EOF) && (c != '\n');
c = scm_getc (port));
@ -855,14 +814,19 @@ scm_read_boolean (int chr, SCM port)
}
static SCM
scm_read_character (int chr, SCM port)
scm_read_character (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
SCM ch;
char charname[READER_CHAR_NAME_MAX_SIZE];
SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
size_t charname_len;
scm_t_wchar cp;
int overflow;
if (read_token (port, charname, sizeof (charname), &charname_len))
overflow = read_token (port, charname, &charname_len);
charname = scm_c_substring (charname, 0, charname_len);
if (overflow)
goto char_error;
if (charname_len == 0)
@ -877,28 +841,33 @@ scm_read_character (int chr, SCM port)
}
if (charname_len == 1)
return SCM_MAKE_CHAR (charname[0]);
return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
if (*charname >= '0' && *charname < '8')
cp = scm_i_string_ref (charname, 0);
if (cp >= '0' && cp < '8')
{
/* Dirk:FIXME:: This type of character syntax is not R5RS
* compliant. Further, it should be verified that the constant
* does only consist of octal digits. Finally, it should be
* checked whether the resulting fixnum is in the range of
* characters. */
SCM p = scm_c_locale_stringn_to_number (charname, charname_len, 8);
SCM p = scm_string_to_number (charname, scm_from_uint (8));
if (SCM_I_INUMP (p))
return SCM_MAKE_CHAR (SCM_I_INUM (p));
}
ch = scm_i_charname_to_char (charname, charname_len);
/* The names of characters should never have non-Latin1
characters. */
if (scm_i_is_narrow_string (charname)
|| scm_i_try_narrow_string (charname))
ch = scm_i_charname_to_char (scm_i_string_chars (charname),
charname_len);
if (scm_is_true (ch))
return ch;
char_error:
scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
scm_list_1 (scm_from_locale_stringn (charname,
charname_len)));
scm_list_1 (charname));
return SCM_UNSPECIFIED;
}
@ -940,7 +909,7 @@ scm_read_srfi4_vector (int chr, SCM port)
}
static SCM
scm_read_bytevector (int chr, SCM port)
scm_read_bytevector (scm_t_wchar chr, SCM port)
{
chr = scm_getc (port);
if (chr != 'u')
@ -964,7 +933,7 @@ scm_read_bytevector (int chr, SCM port)
}
static SCM
scm_read_guile_bit_vector (int chr, SCM port)
scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
{
/* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
terribly inefficient but who cares? */
@ -984,13 +953,17 @@ scm_read_guile_bit_vector (int chr, SCM port)
}
static inline SCM
scm_read_scsh_block_comment (int chr, SCM port)
scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
{
int bang_seen = 0;
/* We can use the get_byte here because there is no need to get the
locale correct when reading comments. This presumes that
hash and exclamation points always represent themselves no
matter what the source encoding is.*/
for (;;)
{
int c = scm_getc (port);
int c = scm_get_byte_or_eof (port);
if (c == EOF)
scm_i_input_error ("skip_block_comment", port,
@ -1008,9 +981,9 @@ scm_read_scsh_block_comment (int chr, SCM port)
}
static SCM
scm_read_commented_expression (int chr, SCM port)
scm_read_commented_expression (scm_t_wchar chr, SCM port)
{
int c;
scm_t_wchar c;
c = flush_ws (port, (char *) NULL);
if (EOF == c)
@ -1022,19 +995,18 @@ scm_read_commented_expression (int chr, SCM port)
}
static SCM
scm_read_extended_symbol (int chr, SCM port)
scm_read_extended_symbol (scm_t_wchar chr, SCM port)
{
/* Guile's extended symbol read syntax looks like this:
#{This is all a symbol name}#
So here, CHR is expected to be `{'. */
SCM result;
int saw_brace = 0, finished = 0;
size_t len = 0;
char buf[1024];
SCM buf = scm_i_make_string (1024, NULL);
result = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
buf = scm_i_string_start_writing (buf);
while ((chr = scm_getc (port)) != EOF)
{
@ -1048,32 +1020,30 @@ scm_read_extended_symbol (int chr, SCM port)
else
{
saw_brace = 0;
buf[len++] = '}';
buf[len++] = chr;
scm_i_string_set_x (buf, len++, '}');
scm_i_string_set_x (buf, len++, chr);
}
}
else if (chr == '}')
saw_brace = 1;
else
buf[len++] = chr;
scm_i_string_set_x (buf, len++, chr);
if (len >= sizeof (buf) - 2)
if (len >= scm_i_string_length (buf) - 2)
{
scm_string_append (scm_list_2 (result,
scm_from_locale_stringn (buf, len)));
scm_i_string_stop_writing ();
SCM addy = scm_i_make_string (1024, NULL);
buf = scm_string_append (scm_list_2 (buf, addy));
len = 0;
buf = scm_i_string_start_writing (buf);
}
if (finished)
break;
}
scm_i_string_stop_writing ();
if (len)
result = scm_string_append (scm_list_2
(result,
scm_from_locale_stringn (buf, len)));
return (scm_string_to_symbol (result));
return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
}
@ -1109,7 +1079,7 @@ scm_read_sharp_extension (int chr, SCM port)
/* The reader for the sharp `#' character. It basically dispatches reads
among the above token readers. */
static SCM
scm_read_sharp (int chr, SCM port)
scm_read_sharp (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
SCM result;
@ -1161,7 +1131,7 @@ scm_read_sharp (int chr, SCM port)
{
/* When next char is '(', it really is an old-style
uniform array. */
int next_c = scm_getc (port);
scm_t_wchar next_c = scm_getc (port);
if (next_c != EOF)
scm_ungetc (next_c, port);
if (next_c == '(')
@ -1209,7 +1179,7 @@ scm_read_expression (SCM port)
{
while (1)
{
register int chr;
register scm_t_wchar chr;
chr = scm_getc (port);
@ -1420,6 +1390,127 @@ scm_get_hash_procedure (int c)
}
}
#define SCM_ENCODING_SEARCH_SIZE (500)
/* Search the first few hundred characters of a file for
an emacs-like coding declaration. */
char *
scm_scan_for_encoding (SCM port)
{
char header[SCM_ENCODING_SEARCH_SIZE+1];
size_t bytes_read;
char *encoding = NULL;
int utf8_bom = 0;
char *pos;
int i;
int in_comment;
bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
if (bytes_read > 3
&& header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
utf8_bom = 1;
/* search past "coding[:=]" */
pos = header;
while (1)
{
if ((pos = strstr(pos, "coding")) == NULL)
return NULL;
pos += strlen("coding");
if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
(*pos == ':' || *pos == '='))
{
pos ++;
break;
}
}
/* skip spaces */
while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
(*pos == ' ' || *pos == '\t'))
pos ++;
/* grab the next token */
i = 0;
while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
&& (isalnum(pos[i]) || pos[i] == '_' || pos[i] == '-' || pos[i] == '.'))
i++;
if (i == 0)
return NULL;
encoding = scm_malloc (i+1);
memcpy (encoding, pos, i);
encoding[i] ='\0';
for (i = 0; i < strlen(encoding); i++)
encoding[i] = toupper(encoding[i]);
/* push backwards to make sure we were in a comment */
in_comment = 0;
while (pos - i - header > 0)
{
if (*(pos - i) == '\n')
{
/* This wasn't in a semicolon comment. Check for a
hash-bang comment. */
char *beg = strstr (header, "#!");
char *end = strstr (header, "!#");
if (beg < pos && pos < end)
in_comment = 1;
break;
}
if (*(pos - i) == ';')
{
in_comment = 1;
break;
}
i ++;
}
if (!in_comment)
{
/* This wasn't in a comment */
free (encoding);
return NULL;
}
if (utf8_bom && strcmp(encoding, "UTF-8"))
scm_misc_error (NULL,
"the port input declares the encoding ~s but is encoded as UTF-8",
scm_list_1 (scm_from_locale_string (encoding)));
return encoding;
}
SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
(SCM port),
"Scans the port for an EMACS-like character coding declaration\n"
"near the top of the contents of a port with random-acessible contents.\n"
"The coding declaration is of the form\n"
"@code{coding: XXXXX} and must appear in a scheme comment.\n"
"\n"
"Returns a string containing the character encoding of the file\n"
"if a declaration was found, or @code{#f} otherwise.\n")
#define FUNC_NAME s_scm_file_encoding
{
char *enc;
SCM s_enc;
enc = scm_scan_for_encoding (port);
if (enc == NULL)
return SCM_BOOL_F;
else
{
s_enc = scm_from_locale_string (enc);
free (enc);
return s_enc;
}
return SCM_BOOL_F;
}
#undef FUNC_NAME
void
scm_init_read ()
{

View file

@ -56,6 +56,8 @@ SCM_API SCM scm_read_options (SCM setting);
SCM_API SCM scm_read (SCM port);
SCM_API size_t scm_read_token (int ic, SCM * tok_buf, SCM port, int weird);
SCM_API SCM scm_read_hash_extend (SCM chr, SCM proc);
SCM_INTERNAL char *scm_scan_for_encoding (SCM port);
SCM_API SCM scm_file_encoding (SCM port);
SCM_INTERNAL void scm_i_input_error (const char *func, SCM port,
const char *message, SCM arg)

View file

@ -28,6 +28,8 @@
#include <unistr.h>
#include <uniconv.h>
#include "striconveh.h"
#include "libguile/_scm.h"
#include "libguile/chars.h"
#include "libguile/root.h"
@ -632,6 +634,37 @@ scm_i_string_ref (SCM str, size_t x)
return scm_i_string_wide_chars (str)[x];
}
/* Returns index+1 of the first char in STR that matches C, or
0 if the char is not found. */
int
scm_i_string_contains_char (SCM str, char ch)
{
size_t i;
size_t len = scm_i_string_length (str);
i = 0;
if (scm_i_is_narrow_string (str))
{
while (i < len)
{
if (scm_i_string_chars (str)[i] == ch)
return i+1;
i++;
}
}
else
{
while (i < len)
{
if (scm_i_string_wide_chars (str)[i]
== (unsigned char) ch)
return i+1;
i++;
}
}
return 0;
}
int
scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
{
@ -1443,31 +1476,6 @@ scm_is_string (SCM obj)
return IS_STRING (obj);
}
SCM
scm_from_locale_stringn (const char *str, size_t len)
{
SCM res;
char *dst;
if (len == (size_t) -1)
len = strlen (str);
if (len == 0)
return scm_nullstr;
res = scm_i_make_string (len, &dst);
memcpy (dst, str, len);
return res;
}
SCM
scm_from_locale_string (const char *str)
{
if (str == NULL)
return scm_nullstr;
return scm_from_locale_stringn (str, -1);
}
static SCM
scm_from_stringn (const char *str, size_t len, const char *encoding,
scm_t_string_failed_conversion_handler handler)
@ -1477,6 +1485,15 @@ scm_from_stringn (const char *str, size_t len, const char *encoding,
int wide = 0;
SCM res;
if (encoding == NULL)
{
/* If encoding is null, use Latin-1. */
char *buf;
res = scm_i_make_string (len, &buf);
memcpy (buf, str, len);
return res;
}
u32len = 0;
u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
(enum iconv_ilseq_handler)
@ -1491,12 +1508,9 @@ scm_from_stringn (const char *str, size_t len, const char *encoding,
scm_memory_error ("locale string conversion");
else
{
/* There are invalid sequences in the input string. Since
it is partially nonsense, what is the best strategy for
printing it in the error message? */
/* There are invalid sequences in the input string. */
SCM errstr;
char *dst;
/* We'll just print it unconverted and hope for the best. */
errstr = scm_i_make_string (len, &dst);
memcpy (dst, str, len);
scm_misc_error (NULL, "input locale conversion error from ~s: ~s",
@ -1534,6 +1548,44 @@ scm_from_stringn (const char *str, size_t len, const char *encoding,
return res;
}
SCM
scm_from_locale_stringn (const char *str, size_t len)
{
const char *enc;
scm_t_string_failed_conversion_handler hndl;
SCM inport;
scm_t_port *pt;
if (len == (size_t) -1)
len = strlen (str);
if (len == 0)
return scm_nullstr;
inport = scm_current_input_port ();
if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
{
pt = SCM_PTAB_ENTRY (inport);
enc = pt->encoding;
hndl = pt->ilseq_handler;
}
else
{
enc = NULL;
hndl = SCM_FAILED_CONVERSION_ERROR;
}
return scm_from_stringn (str, len, enc, hndl);
}
SCM
scm_from_locale_string (const char *str)
{
if (str == NULL)
return scm_nullstr;
return scm_from_locale_stringn (str, -1);
}
SCM
scm_i_from_utf8_string (const scm_t_uint8 *str)
{
@ -1630,13 +1682,22 @@ unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
char *
scm_to_locale_stringn (SCM str, size_t * lenp)
{
SCM outport;
scm_t_port *pt;
const char *enc;
/* In the future, enc will hold the port's encoding. */
enc = NULL;
outport = scm_current_output_port ();
if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
{
pt = SCM_PTAB_ENTRY (outport);
enc = pt->encoding;
}
else
enc = NULL;
return scm_to_stringn (str, lenp, enc,
SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
return scm_to_stringn (str, lenp,
enc,
scm_i_get_conversion_strategy (SCM_BOOL_F));
}
/* Low-level scheme to C string conversion function. */
@ -1646,6 +1707,8 @@ scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
{
char *buf;
size_t ilen, len, i;
int ret;
const char *enc;
if (!scm_is_string (str))
scm_wrong_type_arg_msg (NULL, 0, str, "string");
@ -1667,8 +1730,10 @@ scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
"string contains #\\nul character: ~S",
scm_list_1 (str));
if (scm_i_is_narrow_string (str))
if (scm_i_is_narrow_string (str) && (encoding == NULL))
{
/* If using native Latin-1 encoding, just copy the string
contents. */
if (lenp)
{
buf = scm_malloc (ilen);
@ -1688,17 +1753,41 @@ scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
buf = NULL;
len = 0;
buf = u32_conv_to_encoding (encoding ? encoding : "ISO-8859-1",
(enum iconv_ilseq_handler) handler,
(scm_t_uint32 *) scm_i_string_wide_chars (str),
ilen, NULL, NULL, &len);
if (buf == NULL)
scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
scm_list_2 (scm_from_locale_string (encoding), str));
enc = encoding;
if (enc == NULL)
enc = "ISO-8859-1";
if (scm_i_is_narrow_string (str))
{
ret = mem_iconveh (scm_i_string_chars (str), ilen,
"ISO-8859-1", enc,
(enum iconv_ilseq_handler) handler, NULL,
&buf, &len);
if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
unistring_escapes_to_guile_escapes (&buf, &len);
if (ret == 0 && handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
unistring_escapes_to_guile_escapes (&buf, &len);
if (ret != 0)
{
scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
scm_list_2 (scm_from_locale_string (enc),
str));
}
}
else
{
buf = u32_conv_to_encoding (enc,
(enum iconv_ilseq_handler) handler,
(scm_t_uint32 *) scm_i_string_wide_chars (str),
ilen,
NULL,
NULL, &len);
if (buf == NULL)
{
scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
scm_list_2 (scm_from_locale_string (enc),
str));
}
}
if (lenp)
*lenp = len;
else

View file

@ -154,6 +154,7 @@ SCM_INTERNAL SCM scm_i_string_start_writing (SCM str);
SCM_INTERNAL void scm_i_string_stop_writing (void);
SCM_INTERNAL int scm_i_is_narrow_string (SCM str);
SCM_INTERNAL scm_t_wchar scm_i_string_ref (SCM str, size_t x);
SCM_INTERNAL int scm_i_string_contains_char (SCM str, char c);
SCM_INTERNAL int scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr);
SCM_INTERNAL void scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr);
/* internal functions related to symbols. */

View file

@ -39,6 +39,7 @@
#include "libguile/modules.h"
#include "libguile/validate.h"
#include "libguile/deprecation.h"
#include "libguile/srfi-4.h"
#include "libguile/strports.h"
@ -289,42 +290,33 @@ st_truncate (SCM port, scm_t_off length)
}
SCM
scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
scm_i_mkstrport (SCM pos, const char *locale_str, size_t str_len, long modes, const char *caller)
{
SCM z;
SCM z, str;
scm_t_port *pt;
size_t str_len, c_pos;
size_t c_pos;
char *buf;
SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller);
/* Because ports are inherently 8-bit, strings need to be converted
to a locale representation for storage. But, since string ports
rely on string functionality for their memory management, we need
to create a new string that has the 8-bit locale representation
of the underlying string. This violates the guideline that the
internal encoding of characters in strings is in unicode
codepoints. */
str = scm_i_make_string (str_len, &buf);
memcpy (buf, locale_str, str_len);
str_len = scm_i_string_length (str);
c_pos = scm_to_unsigned_integer (pos, 0, str_len);
if (!((modes & SCM_WRTNG) || (modes & SCM_RDNG)))
scm_misc_error ("scm_mkstrport", "port must read or write", SCM_EOL);
/* XXX
Make a new string to isolate us from changes to the original.
This is done so that we can rely on scm_i_string_chars to stay in
place even across SCM_TICKs.
Additionally, when we are going to write to the string, we make a
copy so that we can write to it without having to use
scm_i_string_writable_chars.
*/
if (modes & SCM_WRTNG)
str = scm_c_substring_copy (str, 0, str_len);
else
str = scm_c_substring (str, 0, str_len);
scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
z = scm_new_port_table_entry (scm_tc16_strport);
pt = SCM_PTAB_ENTRY(z);
SCM_SETSTREAM (z, SCM_UNPACK (str));
SCM_SET_CELL_TYPE(z, scm_tc16_strport|modes);
/* see above why we can use scm_i_string_chars here. */
pt->write_buf = pt->read_buf = (unsigned char *) scm_i_string_chars (str);
pt->read_pos = pt->write_pos = pt->read_buf + c_pos;
pt->write_buf_size = pt->read_buf_size = str_len;
@ -340,22 +332,60 @@ scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
return z;
}
SCM
scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
{
SCM z;
size_t str_len;
char *buf;
SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller);
/* Because ports are inherently 8-bit, strings need to be converted
to a locale representation for storage. But, since string ports
rely on string functionality for their memory management, we need
to create a new string that has the 8-bit locale representation
of the underlying string. This violates the guideline that the
internal encoding of characters in strings is in unicode
codepoints. */
buf = scm_to_locale_stringn (str, &str_len);
z = scm_i_mkstrport (pos, buf, str_len, modes, caller);
free (buf);
return z;
}
/* create a new string from a string port's buffer. */
SCM scm_strport_to_string (SCM port)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
SCM str;
char *dst;
if (pt->rw_active == SCM_PORT_WRITE)
st_flush (port);
str = scm_i_make_string (pt->read_buf_size, &dst);
memcpy (dst, (char *) pt->read_buf, pt->read_buf_size);
str = scm_from_locale_stringn ((char *)pt->read_buf, pt->read_buf_size);
scm_remember_upto_here_1 (port);
return str;
}
/* Create a vector containing the locale representation of the string in the
port's buffer. */
SCM scm_strport_to_locale_u8vector (SCM port)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
SCM vec;
char *buf;
if (pt->rw_active == SCM_PORT_WRITE)
st_flush (port);
buf = scm_malloc (pt->read_buf_size);
memcpy (buf, pt->read_buf, pt->read_buf_size);
vec = scm_take_u8vector ((unsigned char *) buf, pt->read_buf_size);
scm_remember_upto_here_1 (port);
return vec;
}
SCM_DEFINE (scm_object_to_string, "object->string", 1, 1, 0,
(SCM obj, SCM printer),
"Return a Scheme string obtained by printing @var{obj}.\n"
@ -380,6 +410,25 @@ SCM_DEFINE (scm_object_to_string, "object->string", 1, 1, 0,
}
#undef FUNC_NAME
SCM_DEFINE (scm_call_with_output_locale_u8vector, "call-with-output-locale-u8vector", 1, 0, 0,
(SCM proc),
"Calls the one-argument procedure @var{proc} with a newly created output\n"
"port. When the function returns, a vector containing the bytes of a\n"
"locale representation of the characters written into the port is returned\n")
#define FUNC_NAME s_scm_call_with_output_locale_u8vector
{
SCM p;
p = scm_mkstrport (SCM_INUM0,
scm_make_string (SCM_INUM0, SCM_UNDEFINED),
SCM_OPN | SCM_WRTNG,
FUNC_NAME);
scm_call_1 (proc, p);
return scm_get_output_locale_u8vector (p);
}
#undef FUNC_NAME
SCM_DEFINE (scm_call_with_output_string, "call-with-output-string", 1, 0, 0,
(SCM proc),
"Calls the one-argument procedure @var{proc} with a newly created output\n"
@ -424,6 +473,27 @@ SCM_DEFINE (scm_open_input_string, "open-input-string", 1, 0, 0,
}
#undef FUNC_NAME
SCM_DEFINE (scm_open_input_locale_u8vector, "open-input-locale-u8vector", 1, 0, 0,
(SCM vec),
"Take a u8vector containing the bytes of a string encoded in the\n"
"current locale and return an input port that delivers characters\n"
"from the string. The port can be closed by\n"
"@code{close-input-port}, though its storage will be reclaimed\n"
"by the garbage collector if it becomes inaccessible.")
#define FUNC_NAME s_scm_open_input_locale_u8vector
{
scm_t_array_handle hnd;
ssize_t inc;
size_t len;
const scm_t_uint8 *buf;
buf = scm_u8vector_elements (vec, &hnd, &len, &inc);
SCM p = scm_i_mkstrport(SCM_INUM0, (const char *) buf, len, SCM_OPN | SCM_RDNG, FUNC_NAME);
scm_array_handle_release (&hnd);
return p;
}
#undef FUNC_NAME
SCM_DEFINE (scm_open_output_string, "open-output-string", 0, 0, 0,
(void),
"Return an output port that will accumulate characters for\n"
@ -456,11 +526,26 @@ SCM_DEFINE (scm_get_output_string, "get-output-string", 1, 0, 0,
#undef FUNC_NAME
SCM_DEFINE (scm_get_output_locale_u8vector, "get-output-locale-u8vector", 1, 0, 0,
(SCM port),
"Given an output port created by @code{open-output-string},\n"
"return a u8 vector containing the characters of the string\n"
"encoded in the current locale.")
#define FUNC_NAME s_scm_get_output_locale_u8vector
{
SCM_VALIDATE_OPOUTSTRPORT (1, port);
return scm_strport_to_locale_u8vector (port);
}
#undef FUNC_NAME
/* Given a null-terminated string EXPR containing a Scheme expression
read it, and return it as an SCM value. */
SCM
scm_c_read_string (const char *expr)
{
/* FIXME: the c string gets packed into a string, only to get
immediately unpacked in scm_mkstrport. */
SCM port = scm_mkstrport (SCM_INUM0,
scm_from_locale_string (expr),
SCM_OPN | SCM_RDNG,

View file

@ -44,13 +44,19 @@ SCM_API scm_t_bits scm_tc16_strport;
SCM_API SCM scm_mkstrport (SCM pos, SCM str, long modes, const char * caller);
SCM_INTERNAL SCM scm_i_mkstrport (SCM pos, const char *locale_str, size_t str_len,
long modes, const char *caller);
SCM_API SCM scm_strport_to_string (SCM port);
SCM_API SCM scm_strport_to_locale_u8vector (SCM port);
SCM_API SCM scm_object_to_string (SCM obj, SCM printer);
SCM_API SCM scm_call_with_output_string (SCM proc);
SCM_API SCM scm_call_with_output_locale_u8vector (SCM proc);
SCM_API SCM scm_call_with_input_string (SCM str, SCM proc);
SCM_API SCM scm_open_input_string (SCM str);
SCM_API SCM scm_open_input_locale_u8vector (SCM str);
SCM_API SCM scm_open_output_string (void);
SCM_API SCM scm_get_output_string (SCM port);
SCM_API SCM scm_get_output_locale_u8vector (SCM port);
SCM_API SCM scm_c_read_string (const char *expr);
SCM_API SCM scm_c_eval_string (const char *expr);
SCM_API SCM scm_c_eval_string_in_module (const char *expr, SCM module);

View file

@ -145,8 +145,11 @@
(from (current-language))
(to 'objcode)
(opts '()))
(let ((comp (or output-file (compiled-file-name file)))
(in (open-input-file file)))
(let* ((comp (or output-file (compiled-file-name file)))
(in (open-input-file file))
(enc (file-encoding in)))
(if enc
(set-port-encoding! in enc))
(ensure-writable-dir (dirname comp))
(call-with-output-file/atomic comp
(lambda (port)

View file

@ -0,0 +1,139 @@
;;;; encoding-escapes.test --- test suite for Guile's string encodings -*- mode: scheme; coding: utf-8 -*-
;;;;
;;;; Copyright (C) 2009 Free Software Foundation, Inc.
;;;;
;;;; This program is free software; you can redistribute it and/or modify
;;;; it under the terms of the GNU General Public License as published by
;;;; the Free Software Foundation; either version 2, or (at your option)
;;;; any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this software; see the file COPYING. If not, write to
;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;;;; Boston, MA 02110-1301 USA
(define-module (test-strings)
#:use-module (test-suite lib)
#:use-module (srfi srfi-1))
(define exception:conversion
(cons 'misc-error "^cannot convert to output locale"))
;; Create a string from integer char values, eg. (string-ints 65) => "A"
(define (string-ints . args)
(apply string (map integer->char args)))
(define s1 "última")
(define s2 "cédula")
(define s3 "años")
(define s4 "羅生門")
(with-test-prefix "internal encoding"
(pass-if "ultima"
(string=? s1 (string-ints #xfa #x6c #x74 #x69 #x6d #x61)))
(pass-if "cedula"
(string=? s2 (string-ints #x63 #xe9 #x64 #x75 #x6c #x61)))
(pass-if "anos"
(string=? s3 (string-ints #x61 #xf1 #x6f #x73)))
(pass-if "Rashomon"
(string=? s4 (string-ints #x7f85 #x751f #x9580))))
(with-test-prefix "chars"
(pass-if "ultima"
(list= eqv? (string->list s1)
(list #\372 #\l #\t #\i #\m #\a)))
(pass-if "cedula"
(list= eqv? (string->list s2)
(list #\c #\351 #\d #\u #\l #\a)))
(pass-if "anos"
(list= eqv? (string->list s3)
(list #\a #\361 #\o #\s)))
(pass-if "Rashomon"
(list= eqv? (string->list s4)
(list #\77605 #\72437 #\112600))))
;; Check that an error is flagged on display output when the output
;; error strategy is 'error
(with-test-prefix "display output errors"
(pass-if-exception "ultima"
exception:conversion
(let ((pt (open-output-string)))
(set-port-encoding! pt "ASCII")
(set-port-conversion-strategy! pt 'error)
(display s1 pt)))
(pass-if-exception "Rashomon"
exception:conversion
(let ((pt (open-output-string)))
(set-port-encoding! pt "ASCII")
(set-port-conversion-strategy! pt 'error)
(display s4 pt))))
;; Check that questions marks or substitutions appear when the conversion
;; mode is substitute
(with-test-prefix "display output substitutions"
(pass-if "ultima"
(let ((pt (open-output-string)))
(set-port-encoding! pt "ASCII")
(set-port-conversion-strategy! pt 'substitute)
(display s1 pt)
(string=? "?ltima"
(get-output-string pt))))
(pass-if "Rashomon"
(let ((pt (open-output-string)))
(set-port-encoding! pt "ASCII")
(set-port-conversion-strategy! pt 'substitute)
(display s4 pt)
(string=? "???"
(get-output-string pt)))))
;; Check that hex escapes appear in the write output and that no error
;; is thrown. The output error strategy should be irrelevant here.
(with-test-prefix "display output escapes"
(pass-if "ultima"
(let ((pt (open-output-string)))
(set-port-encoding! pt "ASCII")
(set-port-conversion-strategy! pt 'escape)
(display s1 pt)
(string=? "\\xfaltima"
(get-output-string pt))))
(pass-if "Rashomon"
(let ((pt (open-output-string)))
(set-port-encoding! pt "ASCII")
(set-port-conversion-strategy! pt 'escape)
(display s4 pt)
(string=? "\\u7F85\\u751F\\u9580"
(get-output-string pt)))))
(setlocale LC_ALL "en_US.utf8")
(with-test-prefix "input escapes"
(pass-if "última"
(string=? "última"
(with-input-from-string "\"\\xfaltima\"" read)))
(pass-if "羅生門"
(string=? "羅生門"
(with-input-from-string "\"\\u7F85\\u751F\\u9580\"" read))))

View file

@ -0,0 +1,135 @@
;;;; strings.test --- test suite for Guile's string functions -*- mode: scheme; coding: iso-8859-1 -*-
;;;;
;;;; Copyright (C) 2009 Free Software Foundation, Inc.
;;;;
;;;; This program is free software; you can redistribute it and/or modify
;;;; it under the terms of the GNU General Public License as published by
;;;; the Free Software Foundation; either version 2, or (at your option)
;;;; any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this software; see the file COPYING. If not, write to
;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;;;; Boston, MA 02110-1301 USA
(define-module (test-strings)
#:use-module (test-suite lib)
#:use-module (srfi srfi-1))
(define exception:conversion
(cons 'misc-error "^cannot convert to output locale"))
;; Create a string from integer char values, eg. (string-ints 65) => "A"
(define (string-ints . args)
(apply string (map integer->char args)))
(setlocale LC_ALL "")
(define s1 "última")
(define s2 "cédula")
(define s3 "años")
(define s4 "¿Cómo?")
(with-test-prefix "string length"
(pass-if "última"
(eq? (string-length s1) 6))
(pass-if "cédula"
(eq? (string-length s2) 6))
(pass-if "años"
(eq? (string-length s3) 4))
(pass-if "¿Cómo?"
(eq? (string-length s4) 6)))
(with-test-prefix "internal encoding"
(pass-if "última"
(string=? s1 (string-ints #xfa #x6c #x74 #x69 #x6d #x61)))
(pass-if "cédula"
(string=? s2 (string-ints #x63 #xe9 #x64 #x75 #x6c #x61)))
(pass-if "años"
(string=? s3 (string-ints #x61 #xf1 #x6f #x73)))
(pass-if "¿Cómo?"
(string=? s4 (string-ints #xbf #x43 #xf3 #x6d #x6f #x3f))))
(with-test-prefix "chars"
(pass-if "última"
(list= eqv? (string->list s1)
(list #\ú #\l #\t #\i #\m #\a)))
(pass-if "cédula"
(list= eqv? (string->list s2)
(list #\c #\é #\d #\u #\l #\a)))
(pass-if "años"
(list= eqv? (string->list s3)
(list #\a #\ñ #\o #\s)))
(pass-if "¿Cómo?"
(list= eqv? (string->list s4)
(list #\¿ #\C #\ó #\m #\o #\?))))
;; Check that the output is in ISO-8859-1 encoding
(with-test-prefix "display"
(pass-if "s1"
(let ((pt (open-output-string)))
(set-port-encoding! pt "ISO-8859-1")
(display s1 pt)
(list= eqv?
(list #xfa #x6c #x74 #x69 #x6d #x61)
(u8vector->list
(get-output-locale-u8vector pt)))))
(pass-if "s2"
(let ((pt (open-output-string)))
(set-port-encoding! pt "ISO-8859-1")
(display s2 pt)
(list= eqv?
(list #x63 #xe9 #x64 #x75 #x6c #x61)
(u8vector->list
(get-output-locale-u8vector pt))))))
(with-test-prefix "symbols == strings"
(pass-if "última"
(eq? (string->symbol s1) 'última))
(pass-if "cédula"
(eq? (string->symbol s2) 'cédula))
(pass-if "años"
(eq? (string->symbol s3) 'años))
(pass-if "¿Cómo?"
(eq? (string->symbol s4) '¿Cómo?)))
(with-test-prefix "non-ascii variable names"
(pass-if "1"
(let ((á 1)
(ñ 2))
(eq? (+ á ñ) 3))))
(with-test-prefix "output errors"
(pass-if-exception "char 256" exception:conversion
(let ((pt (open-output-string)))
(set-port-encoding! pt "ISO-8859-1")
(set-port-conversion-strategy! pt 'error)
(display (string-ints 256) pt))))
;; Reset locales
(setlocale LC_ALL "C")

View file

@ -0,0 +1,136 @@
;;;; strings.test --- test suite for Guile's string functions -*- mode: scheme; coding: iso-8859-7 -*-
;;;;
;;;; Copyright (C) 2009 Free Software Foundation, Inc.
;;;;
;;;; This program is free software; you can redistribute it and/or modify
;;;; it under the terms of the GNU General Public License as published by
;;;; the Free Software Foundation; either version 2, or (at your option)
;;;; any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this software; see the file COPYING. If not, write to
;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;;;; Boston, MA 02110-1301 USA
(define-module (test-strings)
#:use-module (test-suite lib)
#:use-module (srfi srfi-1))
(define exception:conversion
(cons 'misc-error "^cannot convert to output locale"))
;; Create a string from integer char values, eg. (string-ints 65) => "A"
(define (string-ints . args)
(apply string (map integer->char args)))
(setlocale LC_ALL "")
(define s1 "Ðåñß")
(define s2 "ôçò")
(define s3 "êñéôéêÞò")
(define s4 "êáé")
(with-test-prefix "string length"
(pass-if "s1"
(eq? (string-length s1) 4))
(pass-if "s2"
(eq? (string-length s2) 3))
(pass-if "s3"
(eq? (string-length s3) 8))
(pass-if "s4"
(eq? (string-length s4) 3)))
(with-test-prefix "internal encoding"
(pass-if "s1"
(string=? s1 (string-ints #x03a0 #x03b5 #x03c1 #x03af)))
(pass-if "s2"
(string=? s2 (string-ints #x03c4 #x03b7 #x03c2)))
(pass-if "s3"
(string=? s3 (string-ints #x03ba #x03c1 #x03b9 #x03c4 #x03b9 #x03ba #x03ae #x03c2)))
(pass-if "s4"
(string=? s4 (string-ints #x03ba #x03b1 #x03b9))))
(with-test-prefix "chars"
(pass-if "s1"
(list= eqv? (string->list s1)
(list #\Ð #\å #\ñ #\ß)))
(pass-if "s2"
(list= eqv? (string->list s2)
(list #\ô #\ç #\ò)))
(pass-if "s3"
(list= eqv? (string->list s3)
(list #\ê #\ñ #\é #\ô #\é #\ê #\Þ #\ò)))
(pass-if "s4"
(list= eqv? (string->list s4)
(list #\ê #\á #\é))))
;; Testing that the display of the string is output in the ISO-8859-7
;; encoding
(with-test-prefix "display"
(pass-if "s1"
(let ((pt (open-output-string)))
(set-port-encoding! pt "ISO-8859-7")
(display s1 pt)
(list= eqv?
(list #xd0 #xe5 #xf1 #xdf)
(u8vector->list
(get-output-locale-u8vector pt)))))
(pass-if "s2"
(let ((pt (open-output-string)))
(set-port-encoding! pt "ISO-8859-7")
(display s2 pt)
(list= eqv?
(list #xf4 #xe7 #xf2)
(u8vector->list
(get-output-locale-u8vector pt))))))
(with-test-prefix "symbols == strings"
(pass-if "Ðåñß"
(eq? (string->symbol s1) 'Ðåñß))
(pass-if "ôçò"
(eq? (string->symbol s2) 'ôçò))
(pass-if "êñéôéêÞò"
(eq? (string->symbol s3) 'êñéôéêÞò))
(pass-if "êáé"
(eq? (string->symbol s4) 'êáé)))
(with-test-prefix "non-ascii variable names"
(pass-if "1"
(let ((á 1)
(ñ 2))
(eq? (+ á ñ) 3))))
(with-test-prefix "output errors"
(pass-if-exception "char #x0400"
exception:conversion
(let ((pt (open-output-string)))
(set-port-encoding! pt "ISO-8859-7")
(set-port-conversion-strategy! pt 'error)
(display (string-ints #x0400) pt))))
;; Reset locale
(setlocale LC_ALL "C")

View file

@ -0,0 +1,105 @@
;;;; strings.test --- test suite for Guile's string functions -*- mode: scheme; coding: utf-8 -*-
;;;;
;;;; Copyright (C) 2009 Free Software Foundation, Inc.
;;;;
;;;; This program is free software; you can redistribute it and/or modify
;;;; it under the terms of the GNU General Public License as published by
;;;; the Free Software Foundation; either version 2, or (at your option)
;;;; any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this software; see the file COPYING. If not, write to
;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;;;; Boston, MA 02110-1301 USA
(define-module (test-strings)
#:use-module (test-suite lib)
#:use-module (srfi srfi-1))
(define exception:conversion
(cons 'misc-error "^cannot convert to output locale"))
;; Create a string from integer char values, eg. (string-ints 65) => "A"
(define (string-ints . args)
(apply string (map integer->char args)))
(setlocale LC_ALL "")
(define s1 "última")
(define s2 "cédula")
(define s3 "años")
(define s4 "羅生門")
(with-test-prefix "string length"
(pass-if "última"
(eq? (string-length s1) 6))
(pass-if "cédula"
(eq? (string-length s2) 6))
(pass-if "años"
(eq? (string-length s3) 4))
(pass-if "羅生門"
(eq? (string-length s4) 3)))
(with-test-prefix "internal encoding"
(pass-if "última"
(string=? s1 (string-ints #xfa #x6c #x74 #x69 #x6d #x61)))
(pass-if "cédula"
(string=? s2 (string-ints #x63 #xe9 #x64 #x75 #x6c #x61)))
(pass-if "años"
(string=? s3 (string-ints #x61 #xf1 #x6f #x73)))
(pass-if "羅生門"
(string=? s4 (string-ints #x7f85 #x751f #x9580))))
(with-test-prefix "chars"
(pass-if "última"
(list= eqv? (string->list s1)
(list #\ú #\l #\t #\i #\m #\a)))
(pass-if "cédula"
(list= eqv? (string->list s2)
(list #\c #\é #\d #\u #\l #\a)))
(pass-if "años"
(list= eqv? (string->list s3)
(list #\a #\ñ #\o #\s)))
(pass-if "羅生門"
(list= eqv? (string->list s4)
(list #\羅 #\生 #\門))))
(with-test-prefix "symbols == strings"
(pass-if "última"
(eq? (string->symbol s1) 'última))
(pass-if "cédula"
(eq? (string->symbol s2) 'cédula))
(pass-if "años"
(eq? (string->symbol s3) 'años))
(pass-if "羅生門"
(eq? (string->symbol s4) '羅生門)))
(with-test-prefix "non-ascii variable names"
(pass-if "1"
(let ((芥川龍之介 1)
(ñ 2))
(eq? (+ 芥川龍之介 ñ) 3))))

View file

@ -22,6 +22,7 @@
;;;
;;; miscellaneous
;;;
(setbinary)
(define exception:numerical-overflow
(cons 'numerical-overflow "^Numerical overflow"))

View file

@ -33,6 +33,9 @@
;;;; Some general utilities for testing ports.
;;; Make sure we are set up for 8-bit data
(setbinary)
;;; Read from PORT until EOF, and return the result as a string.
(define (read-all port)
(let loop ((chars '()))

View file

@ -27,6 +27,9 @@
;;; All these tests assume Guile 1.8's port system, where characters are
;;; treated as octets.
;;; Set the default encoding of future ports to be binary
(setbinary)
(with-test-prefix "7.2.5 End-of-File Object"