mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-30 03:40:34 +02:00
Rework text encoding to be more Scheme-friendly
* libguile/ports.c (scm_port_clear_stream_start_for_bom_write): Instead of returning the BOM, take an optional buffer in which to write the BOM. Return number of bytes written. (port_clear_stream_start_for_bom_write): Remove. (scm_i_write): Adapt scm_port_clear_stream_start_for_bom_write call. (try_encode_char_to_iconv_buf, encode_latin1_chars_to_latin1_buf): (encode_latin1_chars_to_utf8_buf, encode_latin1_chars_to_iconv_buf): (encode_latin1_chars, encode_utf32_chars_to_latin1_buf): (encode_utf32_chars_to_utf8_buf, encode_utf32_chars_to_iconv_buf): (encode_utf32_chars, port_encode_chars): New helpers. (scm_port_encode_chars): New procedure. (scm_c_put_latin1_chars, scm_c_put_utf32_chars): Rework to use new encoding helpers. (scm_lfwrite): Use scm_c_put_latin1_chars.
This commit is contained in:
parent
0e888cd00b
commit
5bec3261b4
1 changed files with 302 additions and 297 deletions
599
libguile/ports.c
599
libguile/ports.c
|
@ -1497,7 +1497,6 @@ scm_i_read_bytes (SCM port, SCM dst, size_t start, size_t count)
|
||||||
caller. */
|
caller. */
|
||||||
enum bom_io_mode { BOM_IO_TEXT, BOM_IO_BINARY };
|
enum bom_io_mode { BOM_IO_TEXT, BOM_IO_BINARY };
|
||||||
static size_t port_clear_stream_start_for_bom_read (SCM, enum bom_io_mode);
|
static size_t port_clear_stream_start_for_bom_read (SCM, enum bom_io_mode);
|
||||||
static void port_clear_stream_start_for_bom_write (SCM, enum bom_io_mode);
|
|
||||||
|
|
||||||
/* Used by an application to read arbitrary number of bytes from an SCM
|
/* Used by an application to read arbitrary number of bytes from an SCM
|
||||||
port. Same semantics as libc read, except that scm_c_read_bytes only
|
port. Same semantics as libc read, except that scm_c_read_bytes only
|
||||||
|
@ -2455,10 +2454,10 @@ SCM_DEFINE (scm_port_clear_stream_start_for_bom_read,
|
||||||
}
|
}
|
||||||
#undef FUNC_NAME
|
#undef FUNC_NAME
|
||||||
|
|
||||||
SCM_INTERNAL SCM scm_port_clear_stream_start_for_bom_write (SCM port);
|
SCM_INTERNAL SCM scm_port_clear_stream_start_for_bom_write (SCM, SCM);
|
||||||
SCM_DEFINE (scm_port_clear_stream_start_for_bom_write,
|
SCM_DEFINE (scm_port_clear_stream_start_for_bom_write,
|
||||||
"port-clear-stream-start-for-bom-write", 1, 0, 0,
|
"port-clear-stream-start-for-bom-write", 1, 1, 0,
|
||||||
(SCM port),
|
(SCM port, SCM buf),
|
||||||
"")
|
"")
|
||||||
#define FUNC_NAME s_scm_port_clear_stream_start_for_bom_write
|
#define FUNC_NAME s_scm_port_clear_stream_start_for_bom_write
|
||||||
{
|
{
|
||||||
|
@ -2468,65 +2467,57 @@ SCM_DEFINE (scm_port_clear_stream_start_for_bom_write,
|
||||||
|
|
||||||
pt = SCM_PORT (port);
|
pt = SCM_PORT (port);
|
||||||
if (!pt->at_stream_start_for_bom_write)
|
if (!pt->at_stream_start_for_bom_write)
|
||||||
return SCM_BOOL_F;
|
return SCM_INUM0;
|
||||||
|
|
||||||
pt->at_stream_start_for_bom_write = 0;
|
pt->at_stream_start_for_bom_write = 0;
|
||||||
if (pt->rw_random)
|
if (pt->rw_random)
|
||||||
pt->at_stream_start_for_bom_read = 0;
|
pt->at_stream_start_for_bom_read = 0;
|
||||||
|
|
||||||
/* Record that we're no longer at stream start. */
|
if (SCM_UNBNDP (buf))
|
||||||
pt->at_stream_start_for_bom_write = 0;
|
return SCM_INUM0;
|
||||||
if (pt->rw_random)
|
|
||||||
pt->at_stream_start_for_bom_read = 0;
|
|
||||||
|
|
||||||
/* Return a BOM if appropriate. */
|
/* Write a BOM if appropriate. */
|
||||||
if (scm_is_eq (pt->encoding, sym_UTF_16))
|
if (scm_is_eq (pt->encoding, sym_UTF_16))
|
||||||
{
|
{
|
||||||
SCM precise_encoding;
|
SCM precise_encoding;
|
||||||
SCM bom = scm_c_make_bytevector (sizeof (scm_utf16be_bom));
|
size_t ret;
|
||||||
|
|
||||||
scm_port_acquire_iconv_descriptors (port, NULL, NULL);
|
scm_port_acquire_iconv_descriptors (port, NULL, NULL);
|
||||||
precise_encoding = pt->precise_encoding;
|
precise_encoding = pt->precise_encoding;
|
||||||
scm_port_release_iconv_descriptors (port);
|
scm_port_release_iconv_descriptors (port);
|
||||||
memcpy (SCM_BYTEVECTOR_CONTENTS (bom),
|
|
||||||
scm_is_eq (precise_encoding, sym_UTF_16LE)
|
if (scm_is_eq (precise_encoding, sym_UTF_16LE))
|
||||||
? scm_utf16le_bom : scm_utf16be_bom,
|
ret = scm_port_buffer_put (buf, scm_utf16le_bom,
|
||||||
SCM_BYTEVECTOR_LENGTH (bom));
|
sizeof (scm_utf16le_bom));
|
||||||
return bom;
|
else
|
||||||
|
ret = scm_port_buffer_put (buf, scm_utf16be_bom,
|
||||||
|
sizeof (scm_utf16be_bom));
|
||||||
|
|
||||||
|
return scm_from_size_t (ret);
|
||||||
}
|
}
|
||||||
else if (scm_is_eq (pt->encoding, sym_UTF_32))
|
else if (scm_is_eq (pt->encoding, sym_UTF_32))
|
||||||
{
|
{
|
||||||
SCM precise_encoding;
|
SCM precise_encoding;
|
||||||
SCM bom = scm_c_make_bytevector (sizeof (scm_utf32be_bom));
|
size_t ret;
|
||||||
|
|
||||||
scm_port_acquire_iconv_descriptors (port, NULL, NULL);
|
scm_port_acquire_iconv_descriptors (port, NULL, NULL);
|
||||||
precise_encoding = pt->precise_encoding;
|
precise_encoding = pt->precise_encoding;
|
||||||
scm_port_release_iconv_descriptors (port);
|
scm_port_release_iconv_descriptors (port);
|
||||||
memcpy (SCM_BYTEVECTOR_CONTENTS (bom),
|
|
||||||
scm_is_eq (precise_encoding, sym_UTF_32LE)
|
if (scm_is_eq (precise_encoding, sym_UTF_32LE))
|
||||||
? scm_utf32le_bom : scm_utf32be_bom,
|
ret = scm_port_buffer_put (buf, scm_utf32le_bom,
|
||||||
SCM_BYTEVECTOR_LENGTH (bom));
|
sizeof (scm_utf32le_bom));
|
||||||
return bom;
|
else
|
||||||
|
ret = scm_port_buffer_put (buf, scm_utf32be_bom,
|
||||||
|
sizeof (scm_utf32be_bom));
|
||||||
|
|
||||||
|
return scm_from_size_t (ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
return SCM_BOOL_F;
|
return SCM_INUM0;
|
||||||
}
|
}
|
||||||
#undef FUNC_NAME
|
#undef FUNC_NAME
|
||||||
|
|
||||||
static void
|
|
||||||
port_clear_stream_start_for_bom_write (SCM port, enum bom_io_mode io_mode)
|
|
||||||
{
|
|
||||||
scm_t_port *pt = SCM_PORT (port);
|
|
||||||
SCM bom;
|
|
||||||
|
|
||||||
/* Fast path. */
|
|
||||||
if (!pt->at_stream_start_for_bom_write)
|
|
||||||
return;
|
|
||||||
|
|
||||||
bom = scm_port_clear_stream_start_for_bom_write (port);
|
|
||||||
|
|
||||||
if (io_mode == BOM_IO_TEXT && scm_is_true (bom))
|
|
||||||
scm_c_write_bytes (port, bom, 0, SCM_BYTEVECTOR_LENGTH (bom));
|
|
||||||
}
|
|
||||||
|
|
||||||
SCM
|
SCM
|
||||||
scm_fill_input (SCM port, size_t minimum_size)
|
scm_fill_input (SCM port, size_t minimum_size)
|
||||||
{
|
{
|
||||||
|
@ -2752,10 +2743,7 @@ scm_i_write (SCM port, SCM buf)
|
||||||
{
|
{
|
||||||
size_t start, count;
|
size_t start, count;
|
||||||
|
|
||||||
/* The default is BOM_IO_TEXT. Binary output procedures should
|
scm_port_clear_stream_start_for_bom_write (port, SCM_UNDEFINED);
|
||||||
port_clear_stream_start_for_bom_write with BOM_IO_BINARY before
|
|
||||||
filling the input buffers. */
|
|
||||||
port_clear_stream_start_for_bom_write (port, BOM_IO_TEXT);
|
|
||||||
|
|
||||||
/* Update cursors before attempting to write, assuming that I/O errors
|
/* Update cursors before attempting to write, assuming that I/O errors
|
||||||
are sticky. That way if the write throws an error, causing the
|
are sticky. That way if the write throws an error, causing the
|
||||||
|
@ -2982,257 +2970,294 @@ codepoint_to_utf8 (scm_t_uint32 codepoint, scm_t_uint8 utf8[UTF8_BUFFER_SIZE])
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* We writing, we always iconv from UTF-8. Also in this function we
|
static size_t
|
||||||
only see complete codepoints. */
|
try_encode_char_to_iconv_buf (SCM port, SCM buf, scm_t_uint32 ch)
|
||||||
static void
|
|
||||||
put_utf8_chars_to_iconv_port (SCM port, const scm_t_uint8 *buf, size_t len)
|
|
||||||
{
|
{
|
||||||
SCM bv = scm_port_buffer_bytevector (scm_port_auxiliary_write_buffer (port));
|
scm_t_uint8 utf8[UTF8_BUFFER_SIZE];
|
||||||
scm_t_uint8 *aux = (scm_t_uint8*) SCM_BYTEVECTOR_CONTENTS (bv);
|
size_t utf8_len = codepoint_to_utf8 (ch, utf8);
|
||||||
size_t aux_len = SCM_BYTEVECTOR_LENGTH (bv);
|
scm_t_uint8 *aux = scm_port_buffer_put_pointer (buf);
|
||||||
|
size_t can_put = scm_port_buffer_can_put (buf);
|
||||||
iconv_t output_cd;
|
iconv_t output_cd;
|
||||||
scm_t_wchar bad_codepoint;
|
|
||||||
int saved_errno;
|
int saved_errno;
|
||||||
|
|
||||||
while (len)
|
char *input = (char *) utf8;
|
||||||
|
size_t input_left = utf8_len;
|
||||||
|
char *output = (char *) aux;
|
||||||
|
size_t output_left = can_put;
|
||||||
|
size_t res;
|
||||||
|
|
||||||
|
scm_port_acquire_iconv_descriptors (port, NULL, &output_cd);
|
||||||
|
res = iconv (output_cd, &input, &input_left, &output, &output_left);
|
||||||
|
saved_errno = errno;
|
||||||
|
/* Emit bytes needed to get back to initial state, if needed. */
|
||||||
|
iconv (output_cd, NULL, NULL, &output, &output_left);
|
||||||
|
scm_port_release_iconv_descriptors (port);
|
||||||
|
|
||||||
|
if (res != (size_t) -1)
|
||||||
{
|
{
|
||||||
char *input, *output;
|
/* Success. */
|
||||||
size_t done, input_left, output_left;
|
scm_port_buffer_did_put (buf, can_put - output_left);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
input = (char *) buf;
|
if (saved_errno == E2BIG)
|
||||||
input_left = len;
|
/* No space to encode the character; try again next time. */
|
||||||
output = (char *) aux;
|
return 0;
|
||||||
output_left = aux_len;
|
|
||||||
|
|
||||||
|
/* Otherwise, re-set the output buffer and try to escape or substitute
|
||||||
|
the character, as appropriate. */
|
||||||
|
output = (char *) aux;
|
||||||
|
output_left = can_put;
|
||||||
|
|
||||||
|
/* The source buffer is valid UTF-8, so we shouldn't get EILSEQ
|
||||||
|
because of the input encoding; if we get EILSEQ, that means the
|
||||||
|
codepoint is not accessible in the target encoding. We have whole
|
||||||
|
codepoints in the source buffer, so we shouldn't get EINVAL. We
|
||||||
|
already handled E2BIG. The descriptor should be valid so we
|
||||||
|
shouldn't get EBADF. In summary, we only need to handle EILSEQ. */
|
||||||
|
|
||||||
|
if (scm_is_eq (SCM_PORT (port)->conversion_strategy, sym_escape))
|
||||||
|
{
|
||||||
|
scm_t_uint8 escape[ESCAPE_BUFFER_SIZE];
|
||||||
|
input = (char *) escape;
|
||||||
|
input_left = encode_escape_sequence (ch, escape);
|
||||||
scm_port_acquire_iconv_descriptors (port, NULL, &output_cd);
|
scm_port_acquire_iconv_descriptors (port, NULL, &output_cd);
|
||||||
done = iconv (output_cd, &input, &input_left, &output, &output_left);
|
res = iconv (output_cd, &input, &input_left, &output, &output_left);
|
||||||
saved_errno = errno;
|
saved_errno = errno;
|
||||||
/* Emit bytes needed to get back to initial state, if needed. */
|
iconv (output_cd, NULL, NULL, &output, &output_left);
|
||||||
if (done != (size_t) -1)
|
|
||||||
iconv (output_cd, NULL, NULL, &output, &output_left);
|
|
||||||
scm_port_release_iconv_descriptors (port);
|
scm_port_release_iconv_descriptors (port);
|
||||||
|
|
||||||
buf += (len - input_left);
|
|
||||||
len -= (len - input_left);
|
|
||||||
scm_c_write_bytes (port, bv, 0, aux_len - output_left);
|
|
||||||
|
|
||||||
if (done == (size_t) -1)
|
|
||||||
{
|
|
||||||
scm_t_port *pt = SCM_PORT (port);
|
|
||||||
|
|
||||||
/* The source buffer is valid UTF-8, so we shouldn't get
|
|
||||||
EILSEQ because of the input encoding; if we get EILSEQ,
|
|
||||||
that means the codepoint is not accessible in the target
|
|
||||||
encoding. We have whole codepoints in the source buffer,
|
|
||||||
so we shouldn't get EINVAL. We can get E2BIG, meaning we
|
|
||||||
just need to process the next chunk. The descriptor should
|
|
||||||
be valid so we shouldn't get EBADF. In summary, we should
|
|
||||||
only do E2BIG and EILSEQ. */
|
|
||||||
|
|
||||||
if (saved_errno == E2BIG)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
bad_codepoint = utf8_to_codepoint (buf, len);
|
|
||||||
|
|
||||||
if (saved_errno != EILSEQ)
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
/* Advance the input past the utf8 sequence. */
|
|
||||||
{
|
|
||||||
size_t advance = codepoint_to_utf8 (bad_codepoint, aux);
|
|
||||||
buf += advance;
|
|
||||||
len -= advance;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Convert substitutes or escapes into the aux buf. */
|
|
||||||
output = (char *) aux;
|
|
||||||
output_left = aux_len;
|
|
||||||
|
|
||||||
/* Substitute or escape. Note that this re-sets "done",
|
|
||||||
"saved_errno", "output", and "output_left". */
|
|
||||||
if (scm_is_eq (pt->conversion_strategy, sym_escape))
|
|
||||||
{
|
|
||||||
scm_t_uint8 escape[ESCAPE_BUFFER_SIZE];
|
|
||||||
input = (char *) escape;
|
|
||||||
input_left = encode_escape_sequence (bad_codepoint, escape);
|
|
||||||
scm_port_acquire_iconv_descriptors (port, NULL, &output_cd);
|
|
||||||
done = iconv (output_cd, &input, &input_left, &output, &output_left);
|
|
||||||
saved_errno = errno;
|
|
||||||
scm_port_release_iconv_descriptors (port);
|
|
||||||
}
|
|
||||||
else if (scm_is_eq (pt->conversion_strategy, sym_substitute))
|
|
||||||
{
|
|
||||||
scm_t_uint8 substitute[2] = "?";
|
|
||||||
input = (char *) substitute;
|
|
||||||
input_left = 1;
|
|
||||||
scm_port_acquire_iconv_descriptors (port, NULL, &output_cd);
|
|
||||||
done = iconv (output_cd, &input, &input_left, &output, &output_left);
|
|
||||||
saved_errno = errno;
|
|
||||||
scm_port_release_iconv_descriptors (port);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* This catches both the "error" conversion strategy case, and
|
|
||||||
any error while substituting or escaping the character. */
|
|
||||||
if (done == (size_t) -1)
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
/* The substitution or escape succeeded; print it. */
|
|
||||||
scm_c_write_bytes (port, bv, 0, aux_len - output_left);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
else if (scm_is_eq (SCM_PORT (port)->conversion_strategy, sym_substitute))
|
||||||
return;
|
|
||||||
|
|
||||||
error:
|
|
||||||
scm_encoding_error ("put-char", saved_errno,
|
|
||||||
"conversion to port encoding failed",
|
|
||||||
port, SCM_MAKE_CHAR (bad_codepoint));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
put_latin1_chars_to_utf8_port (SCM port, const scm_t_uint8 *buf, size_t len)
|
|
||||||
{
|
|
||||||
SCM bv = scm_port_buffer_bytevector (scm_port_auxiliary_write_buffer (port));
|
|
||||||
scm_t_uint8 *utf8 = (scm_t_uint8*) SCM_BYTEVECTOR_CONTENTS (bv);
|
|
||||||
size_t utf8_len = SCM_BYTEVECTOR_LENGTH (bv);
|
|
||||||
|
|
||||||
while (len)
|
|
||||||
{
|
{
|
||||||
size_t read, written;
|
scm_t_uint8 substitute[2] = "?";
|
||||||
for (read = 0, written = 0;
|
input = (char *) substitute;
|
||||||
read < len && written + UTF8_BUFFER_SIZE < utf8_len;
|
input_left = 1;
|
||||||
read++)
|
scm_port_acquire_iconv_descriptors (port, NULL, &output_cd);
|
||||||
written += codepoint_to_utf8 (buf[read], utf8 + written);
|
res = iconv (output_cd, &input, &input_left, &output, &output_left);
|
||||||
|
saved_errno = errno;
|
||||||
buf += read;
|
iconv (output_cd, NULL, NULL, &output, &output_left);
|
||||||
len -= read;
|
scm_port_release_iconv_descriptors (port);
|
||||||
scm_c_write_bytes (port, bv, 0, written);
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
if (res != (size_t) -1)
|
||||||
put_latin1_chars_to_iconv_port (SCM port, const scm_t_uint8 *buf, size_t len)
|
|
||||||
{
|
|
||||||
scm_t_uint8 utf8[AUXILIARY_WRITE_BUFFER_SIZE];
|
|
||||||
size_t utf8_len = AUXILIARY_WRITE_BUFFER_SIZE;
|
|
||||||
|
|
||||||
/* Convert through UTF-8, as most non-GNU iconvs can only convert
|
|
||||||
between a limited number of encodings. */
|
|
||||||
while (len)
|
|
||||||
{
|
{
|
||||||
size_t read, written;
|
scm_port_buffer_did_put (buf, can_put - output_left);
|
||||||
for (read = 0, written = 0;
|
return 1;
|
||||||
read < len && written + UTF8_BUFFER_SIZE < utf8_len;
|
|
||||||
read++)
|
|
||||||
written += codepoint_to_utf8 (buf[read], utf8 + written);
|
|
||||||
|
|
||||||
buf += read;
|
|
||||||
len -= read;
|
|
||||||
put_utf8_chars_to_iconv_port (port, utf8, written);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* No space to write the substitution or escape, or maybe there was an
|
||||||
|
error. If there are buffered bytes, the caller should flush and
|
||||||
|
try again; otherwise the caller should raise an error. */
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static size_t
|
||||||
put_utf32_chars_to_latin1_port (SCM port, const scm_t_uint32 *buf, size_t len)
|
encode_latin1_chars_to_latin1_buf (SCM port, SCM buf,
|
||||||
|
const scm_t_uint8 *chars, size_t count)
|
||||||
{
|
{
|
||||||
SCM bv = scm_port_buffer_bytevector (scm_port_auxiliary_write_buffer (port));
|
return scm_port_buffer_put (buf, chars, count);
|
||||||
scm_t_uint8 *latin1 = (scm_t_uint8*) SCM_BYTEVECTOR_CONTENTS (bv);
|
|
||||||
size_t latin1_len = SCM_BYTEVECTOR_LENGTH (bv);
|
|
||||||
|
|
||||||
while (len)
|
|
||||||
{
|
|
||||||
size_t read = 0, written = 0;
|
|
||||||
while (read < len && written + ESCAPE_BUFFER_SIZE <= latin1_len)
|
|
||||||
{
|
|
||||||
scm_t_port *pt = SCM_PORT (port);
|
|
||||||
scm_t_uint32 ch = buf[read++];
|
|
||||||
if (ch <= 0xff)
|
|
||||||
latin1[written++] = ch;
|
|
||||||
else if (scm_is_eq (pt->conversion_strategy, sym_substitute))
|
|
||||||
latin1[written++] = '?';
|
|
||||||
else if (scm_is_eq (pt->conversion_strategy, sym_escape))
|
|
||||||
written += encode_escape_sequence (ch, latin1 + written);
|
|
||||||
else
|
|
||||||
{
|
|
||||||
scm_c_write_bytes (port, bv, 0, written);
|
|
||||||
scm_encoding_error ("put-char", EILSEQ,
|
|
||||||
"conversion to port encoding failed",
|
|
||||||
port, SCM_MAKE_CHAR (ch));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
buf += read;
|
|
||||||
len -= read;
|
|
||||||
scm_c_write_bytes (port, bv, 0, written);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static size_t
|
||||||
put_utf32_chars_to_utf8_port (SCM port, const scm_t_uint32 *buf, size_t len)
|
encode_latin1_chars_to_utf8_buf (SCM port, SCM buf, const scm_t_uint8 *chars,
|
||||||
|
size_t count)
|
||||||
{
|
{
|
||||||
SCM bv = scm_port_buffer_bytevector (scm_port_auxiliary_write_buffer (port));
|
scm_t_uint8 *dst = scm_port_buffer_put_pointer (buf);
|
||||||
scm_t_uint8 *utf8 = (scm_t_uint8*) SCM_BYTEVECTOR_CONTENTS (bv);
|
size_t buf_size = scm_port_buffer_can_put (buf);
|
||||||
size_t utf8_len = SCM_BYTEVECTOR_LENGTH (bv);
|
size_t read, written;
|
||||||
|
for (read = 0, written = 0;
|
||||||
while (len)
|
read < count && written + UTF8_BUFFER_SIZE < buf_size;
|
||||||
{
|
read++)
|
||||||
size_t read, written;
|
written += codepoint_to_utf8 (chars[read], dst + written);
|
||||||
for (read = 0, written = 0;
|
scm_port_buffer_did_put (buf, written);
|
||||||
read < len && written + UTF8_BUFFER_SIZE < utf8_len;
|
return read;
|
||||||
read++)
|
|
||||||
written += codepoint_to_utf8 (buf[read], utf8 + written);
|
|
||||||
|
|
||||||
buf += read;
|
|
||||||
len -= read;
|
|
||||||
scm_c_write_bytes (port, bv, 0, written);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static size_t
|
||||||
put_utf32_chars_to_iconv_port (SCM port, const scm_t_uint32 *buf, size_t len)
|
encode_latin1_chars_to_iconv_buf (SCM port, SCM buf, const scm_t_uint8 *chars,
|
||||||
|
size_t count)
|
||||||
{
|
{
|
||||||
scm_t_uint8 utf8[AUXILIARY_WRITE_BUFFER_SIZE];
|
size_t read;
|
||||||
size_t utf8_len = AUXILIARY_WRITE_BUFFER_SIZE;
|
for (read = 0; read < count; read++)
|
||||||
|
if (!try_encode_char_to_iconv_buf (port, buf, chars[read]))
|
||||||
/* Convert through UTF-8, as most non-GNU iconvs can only convert
|
break;
|
||||||
between a limited number of encodings. */
|
return read;
|
||||||
while (len)
|
|
||||||
{
|
|
||||||
size_t read, written;
|
|
||||||
for (read = 0, written = 0;
|
|
||||||
read < len && written + UTF8_BUFFER_SIZE < utf8_len;
|
|
||||||
read++)
|
|
||||||
written += codepoint_to_utf8 (buf[read], utf8 + written);
|
|
||||||
|
|
||||||
buf += read;
|
|
||||||
len -= read;
|
|
||||||
put_utf8_chars_to_iconv_port (port, utf8, written);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
static size_t
|
||||||
scm_c_put_latin1_chars (SCM port, const scm_t_uint8 *buf, size_t len)
|
encode_latin1_chars (SCM port, SCM buf, const scm_t_uint8 *chars, size_t count)
|
||||||
{
|
{
|
||||||
scm_t_port *pt = SCM_PORT (port);
|
scm_t_port *pt = SCM_PORT (port);
|
||||||
SCM position, saved_line;
|
SCM position;
|
||||||
size_t i;
|
size_t ret, i;
|
||||||
|
|
||||||
if (len == 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
port_clear_stream_start_for_bom_write (port, BOM_IO_TEXT);
|
|
||||||
|
|
||||||
if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
|
if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
|
||||||
scm_c_write (port, buf, len);
|
ret = encode_latin1_chars_to_latin1_buf (port, buf, chars, count);
|
||||||
else if (scm_is_eq (pt->encoding, sym_UTF_8))
|
else if (scm_is_eq (pt->encoding, sym_UTF_8))
|
||||||
put_latin1_chars_to_utf8_port (port, buf, len);
|
ret = encode_latin1_chars_to_utf8_buf (port, buf, chars, count);
|
||||||
else
|
else
|
||||||
put_latin1_chars_to_iconv_port (port, buf, len);
|
ret = encode_latin1_chars_to_iconv_buf (port, buf, chars, count);
|
||||||
|
|
||||||
|
if (ret == 0 && count > 0)
|
||||||
|
scm_encoding_error ("put-char", EILSEQ,
|
||||||
|
"conversion to port encoding failed",
|
||||||
|
port, SCM_MAKE_CHAR (chars[0]));
|
||||||
|
|
||||||
position = pt->position;
|
position = pt->position;
|
||||||
saved_line = scm_port_position_line (position);
|
for (i = 0; i < ret; i++)
|
||||||
for (i = 0; i < len; i++)
|
update_port_position (position, chars[i]);
|
||||||
update_port_position (position, buf[i]);
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
encode_utf32_chars_to_latin1_buf (SCM port, SCM buf,
|
||||||
|
const scm_t_uint32 *chars, size_t count)
|
||||||
|
{
|
||||||
|
scm_t_port *pt = SCM_PORT (port);
|
||||||
|
scm_t_uint8 *dst = scm_port_buffer_put_pointer (buf);
|
||||||
|
size_t buf_size = scm_port_buffer_can_put (buf);
|
||||||
|
size_t read, written;
|
||||||
|
for (read = 0, written = 0; read < count && written < buf_size; read++)
|
||||||
|
{
|
||||||
|
scm_t_uint32 ch = chars[read];
|
||||||
|
if (ch <= 0xff)
|
||||||
|
dst[written++] = ch;
|
||||||
|
else if (scm_is_eq (pt->conversion_strategy, sym_substitute))
|
||||||
|
dst[written++] = '?';
|
||||||
|
else if (scm_is_eq (pt->conversion_strategy, sym_escape))
|
||||||
|
{
|
||||||
|
scm_t_uint8 escape[ESCAPE_BUFFER_SIZE];
|
||||||
|
size_t escape_len = encode_escape_sequence (ch, escape);
|
||||||
|
if (escape_len > buf_size - written)
|
||||||
|
break;
|
||||||
|
memcpy (dst + written, escape, escape_len);
|
||||||
|
written += escape_len;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
scm_port_buffer_did_put (buf, written);
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
encode_utf32_chars_to_utf8_buf (SCM port, SCM buf, const scm_t_uint32 *chars,
|
||||||
|
size_t count)
|
||||||
|
{
|
||||||
|
scm_t_uint8 *dst = scm_port_buffer_put_pointer (buf);
|
||||||
|
size_t buf_size = scm_port_buffer_can_put (buf);
|
||||||
|
size_t read, written;
|
||||||
|
for (read = 0, written = 0;
|
||||||
|
read < count && written + UTF8_BUFFER_SIZE < buf_size;
|
||||||
|
read++)
|
||||||
|
written += codepoint_to_utf8 (chars[read], dst + written);
|
||||||
|
scm_port_buffer_did_put (buf, written);
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
encode_utf32_chars_to_iconv_buf (SCM port, SCM buf, const scm_t_uint32 *chars,
|
||||||
|
size_t count)
|
||||||
|
{
|
||||||
|
size_t read;
|
||||||
|
for (read = 0; read < count; read++)
|
||||||
|
if (!try_encode_char_to_iconv_buf (port, buf, chars[read]))
|
||||||
|
break;
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
encode_utf32_chars (SCM port, SCM buf, const scm_t_uint32 *chars, size_t count)
|
||||||
|
{
|
||||||
|
scm_t_port *pt = SCM_PORT (port);
|
||||||
|
SCM position;
|
||||||
|
size_t ret, i;
|
||||||
|
|
||||||
|
if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
|
||||||
|
ret = encode_utf32_chars_to_latin1_buf (port, buf, chars, count);
|
||||||
|
else if (scm_is_eq (pt->encoding, sym_UTF_8))
|
||||||
|
ret = encode_utf32_chars_to_utf8_buf (port, buf, chars, count);
|
||||||
|
else
|
||||||
|
ret = encode_utf32_chars_to_iconv_buf (port, buf, chars, count);
|
||||||
|
|
||||||
|
if (ret == 0 && count > 0)
|
||||||
|
scm_encoding_error ("put-char", EILSEQ,
|
||||||
|
"conversion to port encoding failed",
|
||||||
|
port, SCM_MAKE_CHAR (chars[0]));
|
||||||
|
|
||||||
|
position = pt->position;
|
||||||
|
for (i = 0; i < ret; i++)
|
||||||
|
update_port_position (position, chars[i]);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
port_encode_chars (SCM port, SCM buf, SCM str, size_t start, size_t count)
|
||||||
|
{
|
||||||
|
if (count == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (scm_i_is_narrow_string (str))
|
||||||
|
{
|
||||||
|
const char *chars = scm_i_string_chars (str);
|
||||||
|
return encode_latin1_chars (port, buf,
|
||||||
|
((const scm_t_uint8 *) chars) + start,
|
||||||
|
count);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const scm_t_wchar *chars = scm_i_string_wide_chars (str);
|
||||||
|
return encode_utf32_chars (port, buf,
|
||||||
|
((const scm_t_uint32 *) chars) + start,
|
||||||
|
count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
SCM scm_port_encode_chars (SCM, SCM, SCM, SCM, SCM);
|
||||||
|
SCM_DEFINE (scm_port_encode_chars, "port-encode-chars", 5, 0, 0,
|
||||||
|
(SCM port, SCM buf, SCM str, SCM start, SCM count),
|
||||||
|
"")
|
||||||
|
#define FUNC_NAME s_scm_port_encode_chars
|
||||||
|
{
|
||||||
|
size_t c_start, c_count, c_len, encoded;
|
||||||
|
|
||||||
|
SCM_VALIDATE_OPOUTPORT (1, port);
|
||||||
|
SCM_VALIDATE_VECTOR (2, buf);
|
||||||
|
SCM_VALIDATE_STRING (3, str);
|
||||||
|
c_len = scm_i_string_length (str);
|
||||||
|
SCM_VALIDATE_SIZE_COPY (4, start, c_start);
|
||||||
|
SCM_ASSERT_RANGE (4, start, c_start <= c_len);
|
||||||
|
SCM_VALIDATE_SIZE_COPY (5, count, c_count);
|
||||||
|
SCM_ASSERT_RANGE (5, count, c_count <= c_len - c_start);
|
||||||
|
|
||||||
|
encoded = port_encode_chars (port, buf, str, c_start, c_count);
|
||||||
|
|
||||||
|
return scm_from_size_t (encoded);
|
||||||
|
}
|
||||||
|
#undef FUNC_NAME
|
||||||
|
|
||||||
|
void
|
||||||
|
scm_c_put_latin1_chars (SCM port, const scm_t_uint8 *chars, size_t len)
|
||||||
|
{
|
||||||
|
SCM aux_buf = scm_port_auxiliary_write_buffer (port);
|
||||||
|
SCM aux_bv = scm_port_buffer_bytevector (aux_buf);
|
||||||
|
SCM position = SCM_PORT (port)->position;
|
||||||
|
SCM saved_line = scm_port_position_line (position);
|
||||||
|
|
||||||
|
scm_port_clear_stream_start_for_bom_write (port, aux_buf);
|
||||||
|
|
||||||
|
while (len)
|
||||||
|
{
|
||||||
|
size_t encoded = encode_latin1_chars (port, aux_buf, chars, len);
|
||||||
|
assert(encoded <= len);
|
||||||
|
scm_c_write_bytes (port, aux_bv, 0,
|
||||||
|
scm_to_size_t (scm_port_buffer_end (aux_buf)));
|
||||||
|
scm_port_buffer_reset (aux_buf);
|
||||||
|
chars += encoded;
|
||||||
|
len -= encoded;
|
||||||
|
}
|
||||||
|
|
||||||
/* Handle line buffering. */
|
/* Handle line buffering. */
|
||||||
if ((SCM_CELL_WORD_0 (port) & SCM_BUFLINE) &&
|
if ((SCM_CELL_WORD_0 (port) & SCM_BUFLINE) &&
|
||||||
|
@ -3241,28 +3266,25 @@ scm_c_put_latin1_chars (SCM port, const scm_t_uint8 *buf, size_t len)
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
scm_c_put_utf32_chars (SCM port, const scm_t_uint32 *buf, size_t len)
|
scm_c_put_utf32_chars (SCM port, const scm_t_uint32 *chars, size_t len)
|
||||||
{
|
{
|
||||||
scm_t_port *pt = SCM_PORT (port);
|
SCM aux_buf = scm_port_auxiliary_write_buffer (port);
|
||||||
SCM position, saved_line;
|
SCM aux_bv = scm_port_buffer_bytevector (aux_buf);
|
||||||
size_t i;
|
SCM position = SCM_PORT (port)->position;
|
||||||
|
SCM saved_line = scm_port_position_line (position);
|
||||||
|
|
||||||
if (len == 0)
|
scm_port_clear_stream_start_for_bom_write (port, aux_buf);
|
||||||
return;
|
|
||||||
|
|
||||||
port_clear_stream_start_for_bom_write (port, BOM_IO_TEXT);
|
while (len)
|
||||||
|
{
|
||||||
if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
|
size_t encoded = encode_utf32_chars (port, aux_buf, chars, len);
|
||||||
put_utf32_chars_to_latin1_port (port, buf, len);
|
assert(encoded <= len);
|
||||||
else if (scm_is_eq (pt->encoding, sym_UTF_8))
|
scm_c_write_bytes (port, aux_bv, 0,
|
||||||
put_utf32_chars_to_utf8_port (port, buf, len);
|
scm_to_size_t (scm_port_buffer_end (aux_buf)));
|
||||||
else
|
scm_port_buffer_reset (aux_buf);
|
||||||
put_utf32_chars_to_iconv_port (port, buf, len);
|
chars += encoded;
|
||||||
|
len -= encoded;
|
||||||
position = pt->position;
|
}
|
||||||
saved_line = scm_port_position_line (position);
|
|
||||||
for (i = 0; i < len; i++)
|
|
||||||
update_port_position (position, buf[i]);
|
|
||||||
|
|
||||||
/* Handle line buffering. */
|
/* Handle line buffering. */
|
||||||
if ((SCM_CELL_WORD_0 (port) & SCM_BUFLINE) &&
|
if ((SCM_CELL_WORD_0 (port) & SCM_BUFLINE) &&
|
||||||
|
@ -3346,7 +3368,7 @@ SCM_DEFINE (scm_put_string, "put-string", 2, 2, 0,
|
||||||
{
|
{
|
||||||
size_t c_start, c_count, c_len;
|
size_t c_start, c_count, c_len;
|
||||||
|
|
||||||
SCM_VALIDATE_OPINPORT (1, port);
|
SCM_VALIDATE_OPOUTPORT (1, port);
|
||||||
SCM_VALIDATE_STRING (2, string);
|
SCM_VALIDATE_STRING (2, string);
|
||||||
c_len = scm_i_string_length (string);
|
c_len = scm_i_string_length (string);
|
||||||
c_start = SCM_UNBNDP (start) ? 0 : scm_to_size_t (start);
|
c_start = SCM_UNBNDP (start) ? 0 : scm_to_size_t (start);
|
||||||
|
@ -3381,24 +3403,7 @@ scm_puts (const char *s, SCM port)
|
||||||
void
|
void
|
||||||
scm_lfwrite (const char *ptr, size_t size, SCM port)
|
scm_lfwrite (const char *ptr, size_t size, SCM port)
|
||||||
{
|
{
|
||||||
SCM position, saved_line;
|
scm_c_put_latin1_chars (port, (const scm_t_uint8 *) ptr, size);
|
||||||
|
|
||||||
if (size == 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
port_clear_stream_start_for_bom_write (port, BOM_IO_TEXT);
|
|
||||||
|
|
||||||
scm_c_write (port, ptr, size);
|
|
||||||
|
|
||||||
position = SCM_PORT (port)->position;
|
|
||||||
saved_line = scm_port_position_line (position);
|
|
||||||
for (; size; ptr++, size--)
|
|
||||||
update_port_position (position, (scm_t_wchar) (unsigned char) *ptr);
|
|
||||||
|
|
||||||
/* Handle line buffering. */
|
|
||||||
if ((SCM_CELL_WORD_0 (port) & SCM_BUFLINE) &&
|
|
||||||
!scm_is_eq (saved_line, scm_port_position_line (position)))
|
|
||||||
scm_flush (port);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Write STR to PORT from START inclusive to END exclusive. */
|
/* Write STR to PORT from START inclusive to END exclusive. */
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue