1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-06 04:00:26 +02:00

Add Unicode strings and symbols

This adds full Unicode strings as a datatype, and it adds some
minimal functionality.  The terminal and port encoding is assumed
to be ISO-8859-1.  Non-ISO-8859-1 characters are written or
input as string character escapes.

The string character escapes now have 3 forms: \xXX \uXXXX and
\UXXXXXX, for unprintable characters that have 2, 4 or 6 hex digits.

The process for writing to strings has been modified.  There is now a
function scm_i_string_start_writing that does the copy-on-write
conversion if necessary.

To compile strings that may be wide, the VM storage of strings and
string-likes has changed.

Most string-using functions have not yet been updated and may break
when used with wide strings.


        * module/language/assembly/compile-bytecode.scm (write-bytecode):
        use variable width string bytecode format

        * module/language/assembly.scm (byte-length): use variable width
        bytecode format

        * libguile/vm-i-loader.c (load-string, load-symbol):
        (load-keyword, define): use variable-width bytecode format

        * libguile/vm-engine.h (FETCH_WIDTH): new macro

        * libguile/strings.h: new declarations

        * libguile/strings.c (make_wide_stringbuf): new function
        (widen_stringbuf): new function
        (scm_i_make_wide_string): new function
        (scm_i_is_narrow_string): new function
        (scm_i_string_wide_chars): new function
        (scm_i_string_start_writing): new function
        (scm_i_string_ref): new function
        (scm_i_string_set_x): new function
        (scm_i_is_narrow_symbol): new function
        (scm_i_symbol_wide_chars, scm_i_symbol_ref): new function
        (scm_string_width): new function
        (unistring_escapes_to_guile_escapes): new function
        (scm_to_stringn): new function
        (scm_i_stringbuf_free): modify for wide strings
        (scm_i_substring_copy): modify for wide strings
        (scm_i_string_chars, scm_string_append): modify for wide strings
        (scm_i_make_symbol, scm_to_locale_stringn): modify for wide strings
        (scm_string_dump, scm_symbol_dump, scm_to_locale_stringbuf):
        (scm_string, scm_i_deprecated_string_chars): modify for wide strings
        (scm_from_locale_string, scm_from_locale_stringn): add null test

        * libguile/srfi-13.c: add calls for scm_i_string_start_writing for
        each call of scm_i_string_stop_writing
        (scm_string_for_each): modify for wide strings

        * libguile/socket.c: add calls for scm_i_string_start_writing for each
        call of scm_i_string_stop_writing

        * libguile/rw.c: add calls for scm_i_string_start_writing for each
        call of scm_i_string_stop_writing

        * libguile/read.c (scm_read_string): allow reading of wide strings

        * libguile/print.h: add declaration for scm_charprint

        * libguile/print.c (iprin1): print wide strings and add new string
        escapes
        (scm_charprint): new function

        * libguile/ports.h: new declarations for scm_lfwrite_substr and
        scm_lfwrite_str

        * libguile/ports.c (update_port_lf): new function
        (scm_lfwrite): use update_port_lf
        (scm_lfwrite_substr): new function
        (scm_lfwrite_str): new function

        * test-suite/tests/asm-to-bytecode.test ("compiler"): add string
        width byte to sting-like asm tests
This commit is contained in:
Michael Gran 2009-08-08 02:35:00 -07:00
parent a876e7dcea
commit 9c44cd4559
15 changed files with 1046 additions and 306 deletions

View file

@ -969,6 +969,34 @@ scm_fill_input (SCM port)
* This function differs from scm_c_write; it updates port line and
* column. */
static void
update_port_lf (scm_t_wchar c, SCM port)
{
if (c == '\a')
{
}
else if (c == '\b')
{
SCM_DECCOL (port);
}
else if (c == '\n')
{
SCM_INCLINE (port);
}
else if (c == '\r')
{
SCM_ZEROCOL (port);
}
else if (c == '\t')
{
SCM_TABCOL (port);
}
else
{
SCM_INCCOL (port);
}
}
void
scm_lfwrite (const char *ptr, size_t size, SCM port)
{
@ -980,30 +1008,54 @@ scm_lfwrite (const char *ptr, size_t size, SCM port)
ptob->write (port, ptr, size);
for (; size; ptr++, size--) {
if (*ptr == '\a') {
}
else if (*ptr == '\b') {
SCM_DECCOL(port);
}
else if (*ptr == '\n') {
SCM_INCLINE(port);
}
else if (*ptr == '\r') {
SCM_ZEROCOL(port);
}
else if (*ptr == '\t') {
SCM_TABCOL(port);
}
else {
SCM_INCCOL(port);
}
for (; size; ptr++, size--)
update_port_lf ((scm_t_wchar) (unsigned char) *ptr, port);
if (pt->rw_random)
pt->rw_active = SCM_PORT_WRITE;
}
/* Write a scheme string STR to PORT from START inclusive to END
exclusive. */
void
scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
{
size_t i, size = scm_i_string_length (str);
scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_ptob_descriptor *ptob = &scm_ptobs[SCM_PTOBNUM (port)];
scm_t_wchar p;
char *buf;
size_t len;
if (pt->rw_active == SCM_PORT_READ)
scm_end_input (port);
if (end == -1)
end = size;
size = end - start;
buf = scm_to_stringn (scm_c_substring (str, start, end), &len,
NULL, iconveh_escape_sequence);
ptob->write (port, buf, len);
free (buf);
for (i = 0; i < size; i++)
{
p = scm_i_string_ref (str, i + start);
update_port_lf (p, port);
}
if (pt->rw_random)
pt->rw_active = SCM_PORT_WRITE;
}
/* Write a scheme string STR to PORT. */
void
scm_lfwrite_str (SCM str, SCM port)
{
scm_lfwrite_substr (str, 0, -1, port);
}
/* scm_c_read
*
* Used by an application to read arbitrary number of bytes from an

View file

@ -269,6 +269,9 @@ SCM_API SCM scm_read_char (SCM port);
SCM_API size_t scm_c_read (SCM port, void *buffer, size_t size);
SCM_API void scm_c_write (SCM port, const void *buffer, size_t size);
SCM_API void scm_lfwrite (const char *ptr, size_t size, SCM port);
SCM_INTERNAL void scm_lfwrite_str (SCM str, SCM port);
SCM_INTERNAL void scm_lfwrite_substr (SCM str, size_t start, size_t end,
SCM port);
SCM_API void scm_flush (SCM port);
SCM_API void scm_end_input (SCM port);
SCM_API int scm_fill_input (SCM port);

View file

@ -563,37 +563,96 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
if (SCM_WRITINGP (pstate))
{
size_t i, j, len;
const char *data;
static char const hex[] = "0123456789abcdef";
char buf[8];
scm_putc ('"', port);
len = scm_i_string_length (exp);
data = scm_i_string_chars (exp);
for (i = 0, j = 0; i < len; ++i)
for (i = 0; i < len; ++i)
{
unsigned char ch = data[i];
if ((ch < 32 && ch != '\n') || (127 <= ch && ch < 148))
{
static char const hex[]="0123456789abcdef";
char buf[4];
scm_t_wchar ch = scm_i_string_ref (exp, i);
int printed = 0;
scm_lfwrite (data+j, i-j, port);
buf[0] = '\\';
buf[1] = 'x';
buf[2] = hex [ch / 16];
buf[3] = hex [ch % 16];
scm_lfwrite (buf, 4, port);
data = scm_i_string_chars (exp);
j = i+1;
if (ch == ' ' || ch == '\n')
{
scm_putc (ch, port);
printed = 1;
}
else if (ch == '"' || ch == '\\')
{
scm_lfwrite (data+j, i-j, port);
scm_putc ('\\', port);
data = scm_i_string_chars (exp);
j = i;
scm_charprint (ch, port);
printed = 1;
}
else
if (uc_is_general_category_withtable
(ch,
UC_CATEGORY_MASK_L | UC_CATEGORY_MASK_M |
UC_CATEGORY_MASK_N | UC_CATEGORY_MASK_P |
UC_CATEGORY_MASK_S))
{
/* Print the character since it is a graphic
character. */
scm_t_wchar *wbuf;
SCM wstr = scm_i_make_wide_string (1, &wbuf);
char *buf;
size_t len;
wbuf[0] = ch;
buf = u32_conv_to_encoding ("ISO-8859-1",
iconveh_error,
(scm_t_uint32 *) wbuf,
1, NULL, NULL, &len);
if (buf != NULL)
{
/* Character is graphic and representable in
this encoding. Print it. */
scm_lfwrite_str (wstr, port);
free (buf);
printed = 1;
}
}
if (!printed)
{
/* Character is graphic but unrepresentable in
this port's encoding or is not graphic. */
if (ch <= 0xFF)
{
buf[0] = '\\';
buf[1] = 'x';
buf[2] = hex[ch / 16];
buf[3] = hex[ch % 16];
scm_lfwrite (buf, 4, port);
}
else if (ch <= 0xFFFF)
{
buf[0] = '\\';
buf[1] = 'u';
buf[2] = hex[(ch & 0xF000) >> 12];
buf[3] = hex[(ch & 0xF00) >> 8];
buf[4] = hex[(ch & 0xF0) >> 4];
buf[5] = hex[(ch & 0xF)];
scm_lfwrite (buf, 6, port);
j = i + 1;
}
else if (ch > 0xFFFF)
{
buf[0] = '\\';
buf[1] = 'U';
buf[2] = hex[(ch & 0xF00000) >> 20];
buf[3] = hex[(ch & 0xF0000) >> 16];
buf[4] = hex[(ch & 0xF000) >> 12];
buf[5] = hex[(ch & 0xF00) >> 8];
buf[6] = hex[(ch & 0xF0) >> 4];
buf[7] = hex[(ch & 0xF)];
scm_lfwrite (buf, 8, port);
j = i + 1;
}
}
}
scm_lfwrite (data+j, i-j, port);
scm_putc ('"', port);
scm_remember_upto_here_1 (exp);
}
@ -606,8 +665,7 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
if (scm_i_symbol_is_interned (exp))
{
scm_print_symbol_name (scm_i_symbol_chars (exp),
scm_i_symbol_length (exp),
port);
scm_i_symbol_length (exp), port);
scm_remember_upto_here_1 (exp);
}
else
@ -763,6 +821,17 @@ scm_prin1 (SCM exp, SCM port, int writingp)
}
}
/* Print a character.
*/
void
scm_charprint (scm_t_uint32 ch, SCM port)
{
scm_t_wchar *wbuf;
SCM wstr = scm_i_make_wide_string (1, &wbuf);
wbuf[0] = ch;
scm_lfwrite_str (wstr, port);
}
/* Print an integer.
*/

View file

@ -77,6 +77,7 @@ SCM_API SCM scm_print_options (SCM setting);
SCM_API SCM scm_make_print_state (void);
SCM_API void scm_free_print_state (SCM print_state);
SCM_INTERNAL SCM scm_i_port_with_print_state (SCM port, SCM print_state);
SCM_API void scm_charprint (scm_t_uint32 c, SCM port);
SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port);
SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port);
SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port);

View file

@ -387,32 +387,28 @@ scm_read_string (int chr, SCM port)
object (the string returned). */
SCM str = SCM_BOOL_F;
char c_str[READER_STRING_BUFFER_SIZE];
unsigned c_str_len = 0;
int c;
scm_t_wchar c;
str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
while ('"' != (c = scm_getc (port)))
{
if (c == EOF)
str_eof: scm_i_input_error (FUNC_NAME, port,
"end of file in string constant",
SCM_EOL);
if (c_str_len + 1 >= sizeof (c_str))
{
/* Flush the C buffer onto a Scheme string. */
SCM addy;
str_eof:
scm_i_input_error (FUNC_NAME, port,
"end of file in string constant", SCM_EOL);
}
if (str == SCM_BOOL_F)
str = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
if (c_str_len + 1 >= scm_i_string_length (str))
{
SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
addy = scm_from_locale_stringn (c_str, c_str_len);
str = scm_string_append_shared (scm_list_2 (str, addy));
c_str_len = 0;
str = scm_string_append (scm_list_2 (str, addy));
}
if (c == '\\')
{
switch (c = scm_getc (port))
{
case EOF:
@ -452,45 +448,106 @@ scm_read_string (int chr, SCM port)
break;
case 'x':
{
int a, b;
scm_t_wchar a, b;
a = scm_getc (port);
if (a == EOF) goto str_eof;
if (a == EOF)
goto str_eof;
b = scm_getc (port);
if (b == EOF) goto str_eof;
if ('0' <= a && a <= '9') a -= '0';
else if ('A' <= a && a <= 'F') a = a - 'A' + 10;
else if ('a' <= a && a <= 'f') a = a - 'a' + 10;
else goto bad_escaped;
if ('0' <= b && b <= '9') b -= '0';
else if ('A' <= b && b <= 'F') b = b - 'A' + 10;
else if ('a' <= b && b <= 'f') b = b - 'a' + 10;
else goto bad_escaped;
if (b == EOF)
goto str_eof;
if ('0' <= a && a <= '9')
a -= '0';
else if ('A' <= a && a <= 'F')
a = a - 'A' + 10;
else if ('a' <= a && a <= 'f')
a = a - 'a' + 10;
else
{
c = a;
goto bad_escaped;
}
if ('0' <= b && b <= '9')
b -= '0';
else if ('A' <= b && b <= 'F')
b = b - 'A' + 10;
else if ('a' <= b && b <= 'f')
b = b - 'a' + 10;
else
{
c = b;
goto bad_escaped;
}
c = a * 16 + b;
break;
}
case 'u':
{
scm_t_wchar a;
int i;
c = 0;
for (i = 0; i < 4; i++)
{
a = scm_getc (port);
if (a == EOF)
goto str_eof;
if ('0' <= a && a <= '9')
a -= '0';
else if ('A' <= a && a <= 'F')
a = a - 'A' + 10;
else if ('a' <= a && a <= 'f')
a = a - 'a' + 10;
else
{
c = a;
goto bad_escaped;
}
c = c * 16 + a;
}
break;
}
case 'U':
{
scm_t_wchar a;
int i;
c = 0;
for (i = 0; i < 6; i++)
{
a = scm_getc (port);
if (a == EOF)
goto str_eof;
if ('0' <= a && a <= '9')
a -= '0';
else if ('A' <= a && a <= 'F')
a = a - 'A' + 10;
else if ('a' <= a && a <= 'f')
a = a - 'a' + 10;
else
{
c = a;
goto bad_escaped;
}
c = c * 16 + a;
}
break;
}
default:
bad_escaped:
scm_i_input_error (FUNC_NAME, port,
"illegal character in escape sequence: ~S",
scm_list_1 (SCM_MAKE_CHAR (c)));
}
c_str[c_str_len++] = c;
}
str = scm_i_string_start_writing (str);
scm_i_string_set_x (str, c_str_len++, c);
scm_i_string_stop_writing ();
}
if (c_str_len > 0)
{
SCM addy;
addy = scm_from_locale_stringn (c_str, c_str_len);
if (str == SCM_BOOL_F)
str = addy;
else
str = scm_string_append_shared (scm_list_2 (str, addy));
return scm_i_substring_copy (str, 0, c_str_len);
}
else
str = (str == SCM_BOOL_F) ? scm_nullstr : str;
return str;
return scm_nullstr;
}
#undef FUNC_NAME

View file

@ -131,6 +131,7 @@ SCM_DEFINE (scm_read_string_x_partial, "read-string!/partial", 1, 3, 0,
don't touch the file descriptor. otherwise the
"return immediately if something is available" rule may
be violated. */
str = scm_i_string_start_writing (str);
dest = scm_i_string_writable_chars (str) + offset;
chars_read = scm_take_from_input_buffers (port, dest, read_len);
scm_i_string_stop_writing ();
@ -140,6 +141,7 @@ SCM_DEFINE (scm_read_string_x_partial, "read-string!/partial", 1, 3, 0,
if (chars_read == 0 && read_len > 0) /* don't confuse read_len == 0 with
EOF. */
{
str = scm_i_string_start_writing (str);
dest = scm_i_string_writable_chars (str) + offset;
SCM_SYSCALL (chars_read = read (fdes, dest, read_len));
scm_i_string_stop_writing ();

View file

@ -1438,6 +1438,7 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
fd = SCM_FPORT_FDES (sock);
len = scm_i_string_length (buf);
buf = scm_i_string_start_writing (buf);
dest = scm_i_string_writable_chars (buf);
SCM_SYSCALL (rv = recv (fd, dest, len, flg));
scm_i_string_stop_writing ();
@ -1482,6 +1483,7 @@ SCM_DEFINE (scm_send, "send", 2, 1, 0,
fd = SCM_FPORT_FDES (sock);
len = scm_i_string_length (message);
message = scm_i_string_start_writing (message);
src = scm_i_string_writable_chars (message);
SCM_SYSCALL (rv = send (fd, src, len, flg));
scm_i_string_stop_writing ();
@ -1550,6 +1552,7 @@ SCM_DEFINE (scm_recvfrom, "recvfrom!", 2, 3, 0,
/* recvfrom will not necessarily return an address. usually nothing
is returned for stream sockets. */
str = scm_i_string_start_writing (str);
buf = scm_i_string_writable_chars (str);
((struct sockaddr *) &addr)->sa_family = AF_UNSPEC;
SCM_SYSCALL (rv = recvfrom (fd, buf + offset,

View file

@ -549,6 +549,7 @@ SCM_DEFINE (scm_string_copy_x, "string-copy!", 3, 2, 0,
len = cend - cstart;
SCM_ASSERT_RANGE (3, s, len <= scm_i_string_length (target) - ctstart);
target = scm_i_string_start_writing (target);
ctarget = scm_i_string_writable_chars (target);
memmove (ctarget + ctstart, cstr + cstart, len);
scm_i_string_stop_writing ();
@ -985,6 +986,7 @@ SCM_DEFINE (scm_substring_fill_x, "string-fill!", 2, 2, 0,
4, end, cend);
SCM_VALIDATE_CHAR_COPY (2, chr, c);
str = scm_i_string_start_writing (str);
cstr = scm_i_string_writable_chars (str);
for (k = cstart; k < cend; k++)
cstr[k] = c;
@ -2376,6 +2378,7 @@ string_upcase_x (SCM v, size_t start, size_t end)
size_t k;
char *dst;
v = scm_i_string_start_writing (v);
dst = scm_i_string_writable_chars (v);
for (k = start; k < end; ++k)
dst[k] = scm_c_upcase (dst[k]);
@ -2442,6 +2445,7 @@ string_downcase_x (SCM v, size_t start, size_t end)
size_t k;
char *dst;
v = scm_i_string_start_writing (v);
dst = scm_i_string_writable_chars (v);
for (k = start; k < end; ++k)
dst[k] = scm_c_downcase (dst[k]);
@ -2511,6 +2515,7 @@ string_titlecase_x (SCM str, size_t start, size_t end)
size_t i;
int in_word = 0;
str = scm_i_string_start_writing (str);
sz = (unsigned char *) scm_i_string_writable_chars (str);
for(i = start; i < end; i++)
{
@ -2635,6 +2640,7 @@ SCM_DEFINE (scm_string_reverse, "string-reverse", 1, 2, 0,
2, start, cstart,
3, end, cend);
result = scm_string_copy (str);
result = scm_i_string_start_writing (result);
ctarget = scm_i_string_writable_chars (result);
string_reverse_x (ctarget, cstart, cend);
scm_i_string_stop_writing ();
@ -2658,6 +2664,7 @@ SCM_DEFINE (scm_string_reverse_x, "string-reverse!", 1, 2, 0,
2, start, cstart,
3, end, cend);
str = scm_i_string_start_writing (str);
cstr = scm_i_string_writable_chars (str);
string_reverse_x (cstr, cstart, cend);
scm_i_string_stop_writing ();
@ -3018,19 +3025,16 @@ SCM_DEFINE (scm_string_for_each, "string-for-each", 2, 2, 0,
"return value is not specified.")
#define FUNC_NAME s_scm_string_for_each
{
const char *cstr;
size_t cstart, cend;
scm_t_trampoline_1 proc_tramp = scm_trampoline_1 (proc);
SCM_ASSERT (proc_tramp, proc, SCM_ARG1, FUNC_NAME);
MY_VALIDATE_SUBSTRING_SPEC_COPY (2, s, cstr,
MY_VALIDATE_SUBSTRING_SPEC (2, s,
3, start, cstart,
4, end, cend);
while (cstart < cend)
{
unsigned int c = (unsigned char) cstr[cstart];
proc_tramp (proc, SCM_MAKE_CHAR (c));
cstr = scm_i_string_chars (s);
proc_tramp (proc, SCM_MAKE_CHAR (scm_i_string_ref (s, cstart)));
cstart++;
}
@ -3162,6 +3166,7 @@ SCM_DEFINE (scm_string_xcopy_x, "string-xcopy!", 4, 3, 0,
SCM_ASSERT_RANGE (1, tstart,
ctstart + (csto - csfrom) <= scm_i_string_length (target));
target = scm_i_string_start_writing (target);
p = scm_i_string_writable_chars (target) + ctstart;
cs = scm_i_string_chars (s);
while (csfrom < csto)
@ -3200,8 +3205,8 @@ SCM_DEFINE (scm_string_replace, "string-replace", 2, 4, 0,
MY_VALIDATE_SUBSTRING_SPEC (2, s2,
5, start2, cstart2,
6, end2, cend2);
result = scm_i_make_string (cstart1 + (cend2 - cstart2) +
scm_i_string_length (s1) - cend1, &p);
result = scm_i_make_string ((cstart1 + cend2 - cstart2
+ scm_i_string_length (s1) - cend1), &p);
cstr1 = scm_i_string_chars (s1);
cstr2 = scm_i_string_chars (s2);
memmove (p, cstr1, cstart1 * sizeof (char));

View file

@ -24,6 +24,8 @@
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include <unistr.h>
#include "libguile/_scm.h"
#include "libguile/chars.h"
@ -69,10 +71,12 @@
#define STRINGBUF_F_SHARED 0x100
#define STRINGBUF_F_INLINE 0x200
#define STRINGBUF_F_WIDE 0x400
#define STRINGBUF_TAG scm_tc7_stringbuf
#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
#define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
#define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
#define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
@ -82,6 +86,7 @@
#define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
? STRINGBUF_INLINE_CHARS (buf) \
: STRINGBUF_OUTLINE_CHARS (buf))
#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
#define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
? STRINGBUF_INLINE_LENGTH (buf) \
: STRINGBUF_OUTLINE_LENGTH (buf))
@ -126,6 +131,23 @@ make_stringbuf (size_t len)
}
}
static SCM
make_wide_stringbuf (size_t len)
{
scm_t_wchar *mem;
#if SCM_DEBUG
if (len < 1000)
lenhist[len]++;
else
lenhist[1000]++;
#endif
mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
mem[len] = 0;
return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
(scm_t_bits) len, (scm_t_bits) 0);
}
/* Return a new stringbuf whose underlying storage consists of the LEN+1
octets pointed to by STR (the last octet is zero). */
SCM
@ -147,8 +169,58 @@ void
scm_i_stringbuf_free (SCM buf)
{
if (!STRINGBUF_INLINE (buf))
{
if (!STRINGBUF_WIDE (buf))
scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
else
scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
+ 1), "string");
}
}
static void
widen_stringbuf (SCM buf)
{
size_t i, len;
scm_t_wchar *mem;
if (STRINGBUF_WIDE (buf))
return;
if (STRINGBUF_INLINE (buf))
{
len = STRINGBUF_INLINE_LENGTH (buf);
mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
for (i = 0; i < len; i++)
mem[i] =
(scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
mem[len] = 0;
SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
SCM_SET_CELL_WORD_1 (buf, mem);
SCM_SET_CELL_WORD_2 (buf, len);
}
else
{
len = STRINGBUF_OUTLINE_LENGTH (buf);
mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
for (i = 0; i < len; i++)
mem[i] =
(scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
mem[len] = 0;
scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
SCM_SET_CELL_WORD_1 (buf, mem);
SCM_SET_CELL_WORD_2 (buf, len);
}
}
scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
@ -195,6 +267,18 @@ scm_i_make_string (size_t len, char **charsp)
return res;
}
SCM
scm_i_make_wide_string (size_t len, scm_t_wchar ** charsp)
{
SCM buf = make_wide_stringbuf (len);
SCM res;
if (charsp)
*charsp = STRINGBUF_WIDE_CHARS (buf);
res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
(scm_t_bits) 0, (scm_t_bits) len);
return res;
}
static void
validate_substring_args (SCM str, size_t start, size_t end)
{
@ -253,12 +337,24 @@ scm_i_substring_copy (SCM str, size_t start, size_t end)
SCM buf, my_buf;
size_t str_start;
get_str_buf_start (&str, &buf, &str_start);
if (scm_i_is_narrow_string (str))
{
my_buf = make_stringbuf (len);
memcpy (STRINGBUF_CHARS (my_buf),
STRINGBUF_CHARS (buf) + str_start + start, len);
}
else
{
my_buf = make_wide_stringbuf (len);
u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
(scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
+ start), len);
/* Even though this string is wide, the substring may be narrow.
Consider adding code to narrow string. */
}
scm_remember_upto_here_1 (buf);
return scm_double_cell (STRING_TAG, SCM_UNPACK(my_buf),
(scm_t_bits)0, (scm_t_bits) len);
return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
(scm_t_bits) 0, (scm_t_bits) len);
}
SCM
@ -330,17 +426,45 @@ scm_i_string_length (SCM str)
return STRING_LENGTH (str);
}
int
scm_i_is_narrow_string (SCM str)
{
return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
}
const char *
scm_i_string_chars (SCM str)
{
SCM buf;
size_t start;
get_str_buf_start (&str, &buf, &start);
if (scm_i_is_narrow_string (str))
return STRINGBUF_CHARS (buf) + start;
else
scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
scm_list_1 (str));
return NULL;
}
char *
scm_i_string_writable_chars (SCM orig_str)
const scm_t_wchar *
scm_i_string_wide_chars (SCM str)
{
SCM buf;
size_t start;
get_str_buf_start (&str, &buf, &start);
if (!scm_i_is_narrow_string (str))
return STRINGBUF_WIDE_CHARS (buf) + start;
else
scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
scm_list_1 (str));
}
/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
a new string buffer, so that it can be modified without modifying
other strings. */
SCM
scm_i_string_start_writing (SCM orig_str)
{
SCM buf, str = orig_str;
size_t start;
@ -352,18 +476,26 @@ scm_i_string_writable_chars (SCM orig_str)
scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
if (STRINGBUF_SHARED (buf))
{
/* Clone stringbuf. For this, we put all threads to sleep.
*/
/* Clone the stringbuf. */
size_t len = STRING_LENGTH (str);
SCM new_buf;
scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
if (scm_i_is_narrow_string (str))
{
new_buf = make_stringbuf (len);
memcpy (STRINGBUF_CHARS (new_buf),
STRINGBUF_CHARS (buf) + STRING_START (str), len);
}
else
{
new_buf = make_wide_stringbuf (len);
u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
(scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
+ STRING_START (str)), len);
}
scm_i_thread_put_to_sleep ();
SET_STRING_STRINGBUF (str, new_buf);
start -= STRING_START (str);
@ -374,8 +506,39 @@ scm_i_string_writable_chars (SCM orig_str)
scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
}
return orig_str;
}
/* Return a pointer to the chars of a string that fits in a Latin-1
encoding. */
char *
scm_i_string_writable_chars (SCM str)
{
SCM buf;
size_t start;
get_str_buf_start (&str, &buf, &start);
if (scm_i_is_narrow_string (str))
return STRINGBUF_CHARS (buf) + start;
else
scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
scm_list_1 (str));
return NULL;
}
/* Return a pointer to the Unicode codepoints of a string. */
static scm_t_wchar *
scm_i_string_writable_wide_chars (SCM str)
{
SCM buf;
size_t start;
get_str_buf_start (&str, &buf, &start);
if (!scm_i_is_narrow_string (str))
return STRINGBUF_WIDE_CHARS (buf) + start;
else
scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
scm_list_1 (str));
}
void
@ -384,6 +547,34 @@ scm_i_string_stop_writing (void)
scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
}
/* Return the Xth character is C. */
scm_t_wchar
scm_i_string_ref (SCM str, size_t x)
{
if (scm_i_is_narrow_string (str))
return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
else
return scm_i_string_wide_chars (str)[x];
}
void
scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
{
if (chr > 0xFF && scm_i_is_narrow_string (str))
widen_stringbuf (STRING_STRINGBUF (str));
if (scm_i_is_narrow_string (str))
{
char *dst = scm_i_string_writable_chars (str);
dst[p] = (char) (unsigned char) chr;
}
else
{
scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
dst[p] = chr;
}
}
/* Symbols.
Basic symbol creation and accessing is done here, the rest is in
@ -418,11 +609,22 @@ scm_i_make_symbol (SCM name, scm_t_bits flags,
else
{
/* make new buf. */
if (scm_i_is_narrow_string (name))
{
SCM new_buf = make_stringbuf (length);
memcpy (STRINGBUF_CHARS (new_buf),
STRINGBUF_CHARS (buf) + start, length);
buf = new_buf;
}
else
{
SCM new_buf = make_wide_stringbuf (length);
u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
(scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
length);
buf = new_buf;
}
}
return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
(scm_t_bits) hash, SCM_UNPACK (props));
}
@ -466,11 +668,40 @@ scm_c_symbol_length (SCM sym)
}
#undef FUNC_NAME
int
scm_i_is_narrow_symbol (SCM sym)
{
SCM buf;
buf = SYMBOL_STRINGBUF (sym);
return !STRINGBUF_WIDE (buf);
}
const char *
scm_i_symbol_chars (SCM sym)
{
SCM buf = SYMBOL_STRINGBUF (sym);
SCM buf;
buf = SYMBOL_STRINGBUF (sym);
if (!STRINGBUF_WIDE (buf))
return STRINGBUF_CHARS (buf);
else
scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
scm_list_1 (sym));
}
/* Return a pointer to the Unicode codepoints of a symbol's name. */
const scm_t_wchar *
scm_i_symbol_wide_chars (SCM sym)
{
SCM buf;
buf = SYMBOL_STRINGBUF (sym);
if (STRINGBUF_WIDE (buf))
return STRINGBUF_WIDE_CHARS (buf);
else
scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
scm_list_1 (sym));
}
SCM
@ -496,6 +727,15 @@ scm_i_symbol_substring (SCM sym, size_t start, size_t end)
(scm_t_bits)start, (scm_t_bits) end - start);
}
scm_t_wchar
scm_i_symbol_ref (SCM sym, size_t x)
{
if (scm_i_is_narrow_symbol (sym))
return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
else
return scm_i_symbol_wide_chars (sym)[x];
}
/* Debugging
*/
@ -505,15 +745,17 @@ SCM scm_sys_string_dump (SCM);
SCM scm_sys_symbol_dump (SCM);
SCM scm_sys_stringbuf_hist (void);
SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0,
(SCM str),
"")
SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str), "")
#define FUNC_NAME s_scm_sys_string_dump
{
SCM_VALIDATE_STRING (1, str);
fprintf (stderr, "%p:\n", str);
fprintf (stderr, " start: %u\n", STRING_START (str));
fprintf (stderr, " len: %u\n", STRING_LENGTH (str));
if (scm_i_is_narrow_string (str))
fprintf (stderr, " format: narrow\n");
else
fprintf (stderr, " format: wide\n");
if (IS_SH_STRING (str))
{
fprintf (stderr, " string: %p\n", SH_STRING_STRING (str));
@ -524,36 +766,54 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0,
{
SCM buf = STRING_STRINGBUF (str);
fprintf (stderr, " buf: %p\n", buf);
if (scm_i_is_narrow_string (str))
fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf));
else
fprintf (stderr, " chars: %p\n", STRINGBUF_WIDE_CHARS (buf));
fprintf (stderr, " length: %u\n", STRINGBUF_LENGTH (buf));
fprintf (stderr, " flags: %x\n", (SCM_CELL_WORD_0 (buf) & 0x300));
if (STRINGBUF_SHARED (buf))
fprintf (stderr, " shared: true\n");
else
fprintf (stderr, " shared: false\n");
if (STRINGBUF_INLINE (buf))
fprintf (stderr, " inline: true\n");
else
fprintf (stderr, " inline: false\n");
}
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0,
(SCM sym),
"")
SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym), "")
#define FUNC_NAME s_scm_sys_symbol_dump
{
SCM_VALIDATE_SYMBOL (1, sym);
fprintf (stderr, "%p:\n", sym);
fprintf (stderr, " hash: %lu\n", scm_i_symbol_hash (sym));
if (scm_i_is_narrow_symbol (sym))
fprintf (stderr, " format: narrow\n");
else
fprintf (stderr, " format: wide\n");
{
SCM buf = SYMBOL_STRINGBUF (sym);
fprintf (stderr, " buf: %p\n", buf);
if (scm_i_is_narrow_symbol (sym))
fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf));
else
fprintf (stderr, " chars: %p\n", STRINGBUF_WIDE_CHARS (buf));
fprintf (stderr, " length: %u\n", STRINGBUF_LENGTH (buf));
fprintf (stderr, " shared: %u\n", STRINGBUF_SHARED (buf));
if (STRINGBUF_SHARED (buf))
fprintf (stderr, " shared: true\n");
else
fprintf (stderr, " shared: false\n");
}
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0,
(void),
"")
SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
#define FUNC_NAME s_scm_sys_stringbuf_hist
{
int i;
@ -589,29 +849,46 @@ SCM_DEFINE (scm_string, "string", 0, 0, 1,
#define FUNC_NAME s_scm_string
{
SCM result;
SCM rest;
size_t len;
char *data;
size_t p = 0;
long i;
/* Verify that this is a list of chars. */
i = scm_ilength (chrs);
len = (size_t) i;
rest = chrs;
SCM_ASSERT (len >= 0, chrs, SCM_ARG1, FUNC_NAME);
while (len > 0 && scm_is_pair (rest))
{
long i = scm_ilength (chrs);
SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
len = i;
}
result = scm_i_make_string (len, &data);
while (len > 0 && scm_is_pair (chrs))
{
SCM elt = SCM_CAR (chrs);
SCM elt = SCM_CAR (rest);
SCM_VALIDATE_CHAR (SCM_ARGn, elt);
*data++ = SCM_CHAR (elt);
chrs = SCM_CDR (chrs);
rest = SCM_CDR (rest);
len--;
scm_remember_upto_here_1 (elt);
}
/* Construct a string containing this list of chars. */
len = (size_t) i;
rest = chrs;
result = scm_i_make_string (len, NULL);
result = scm_i_string_start_writing (result);
while (len > 0 && scm_is_pair (rest))
{
SCM elt = SCM_CAR (rest);
scm_i_string_set_x (result, p, SCM_CHAR (elt));
p++;
rest = SCM_CDR (rest);
len--;
scm_remember_upto_here_1 (elt);
}
scm_i_string_stop_writing ();
if (len > 0)
scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
if (!scm_is_null (chrs))
if (!scm_is_null (rest))
scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
return result;
@ -634,13 +911,16 @@ SCM
scm_c_make_string (size_t len, SCM chr)
#define FUNC_NAME NULL
{
char *dst;
SCM res = scm_i_make_string (len, &dst);
size_t p;
SCM res = scm_i_make_string (len, NULL);
if (!SCM_UNBNDP (chr))
{
SCM_VALIDATE_CHAR (0, chr);
memset (dst, SCM_CHAR (chr), len);
res = scm_i_string_start_writing (res);
for (p = 0; p < len; p++)
scm_i_string_set_x (res, p, SCM_CHAR (chr));
scm_i_string_stop_writing ();
}
return res;
@ -657,6 +937,20 @@ SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
}
#undef FUNC_NAME
SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
(SCM string),
"Return the bytes used to represent a character in @var{string}."
"This will return 1 or 4.")
#define FUNC_NAME s_scm_string_width
{
SCM_VALIDATE_STRING (1, string);
if (!scm_i_is_narrow_string (string))
return scm_from_int (4);
return scm_from_int (1);
}
#undef FUNC_NAME
size_t
scm_c_string_length (SCM string)
{
@ -682,7 +976,10 @@ SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
else
scm_out_of_range (NULL, k);
if (scm_i_is_narrow_string (str))
return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
else
return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
}
#undef FUNC_NAME
@ -691,7 +988,11 @@ scm_c_string_ref (SCM str, size_t p)
{
if (p >= scm_i_string_length (str))
scm_out_of_range (NULL, scm_from_size_t (p));
if (scm_i_is_narrow_string (str))
return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
else
return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
}
SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
@ -713,11 +1014,10 @@ SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
scm_out_of_range (NULL, k);
SCM_VALIDATE_CHAR (3, chr);
{
char *dst = scm_i_string_writable_chars (str);
dst[idx] = SCM_CHAR (chr);
str = scm_i_string_start_writing (str);
scm_i_string_set_x (str, idx, SCM_CHAR (chr));
scm_i_string_stop_writing ();
}
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
@ -727,11 +1027,9 @@ scm_c_string_set_x (SCM str, size_t p, SCM chr)
{
if (p >= scm_i_string_length (str))
scm_out_of_range (NULL, scm_from_size_t (p));
{
char *dst = scm_i_string_writable_chars (str);
dst[p] = SCM_CHAR (chr);
str = scm_i_string_start_writing (str);
scm_i_string_set_x (str, p, SCM_CHAR (chr));
scm_i_string_stop_writing ();
}
}
SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
@ -837,26 +1135,50 @@ SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
#define FUNC_NAME s_scm_string_append
{
SCM res;
size_t i = 0;
size_t len = 0;
int wide = 0;
SCM l, s;
char *data;
scm_t_wchar *wdata;
int i;
SCM_VALIDATE_REST_ARGUMENT (args);
for (l = args; !scm_is_null (l); l = SCM_CDR (l))
{
s = SCM_CAR (l);
SCM_VALIDATE_STRING (SCM_ARGn, s);
i += scm_i_string_length (s);
len += scm_i_string_length (s);
if (!scm_i_is_narrow_string (s))
wide = 1;
}
res = scm_i_make_string (i, &data);
if (!wide)
res = scm_i_make_string (len, &data);
else
res = scm_i_make_wide_string (len, &wdata);
for (l = args; !scm_is_null (l); l = SCM_CDR (l))
{
size_t len;
s = SCM_CAR (l);
SCM_VALIDATE_STRING (SCM_ARGn, s);
len = scm_i_string_length (s);
if (!wide)
{
memcpy (data, scm_i_string_chars (s), len);
data += len;
}
else
{
if (scm_i_is_narrow_string (s))
{
for (i = 0; i < scm_i_string_length (s); i++)
wdata[i] = (unsigned char) scm_i_string_chars (s)[i];
}
else
u32_cpy ((scm_t_uint32 *) wdata,
(scm_t_uint32 *) scm_i_string_wide_chars (s), len);
wdata += len;
}
scm_remember_upto_here_1 (s);
}
return res;
@ -875,8 +1197,11 @@ scm_from_locale_stringn (const char *str, size_t len)
SCM res;
char *dst;
if (len == (size_t)-1)
if (len == (size_t) -1)
len = strlen (str);
if (len == 0)
return scm_nullstr;
res = scm_i_make_string (len, &dst);
memcpy (dst, str, len);
return res;
@ -885,6 +1210,9 @@ scm_from_locale_stringn (const char *str, size_t len)
SCM
scm_from_locale_string (const char *str)
{
if (str == NULL)
return scm_nullstr;
return scm_from_locale_stringn (str, -1);
}
@ -893,21 +1221,20 @@ scm_take_locale_stringn (char *str, size_t len)
{
SCM buf, res;
if (len == (size_t)-1)
if (len == (size_t) -1)
len = strlen (str);
else
{
/* Ensure STR is null terminated. A realloc for 1 extra byte should
often be satisfied from the alignment padding after the block, with
no actual data movement. */
str = scm_realloc (str, len+1);
str = scm_realloc (str, len + 1);
str[len] = '\0';
}
buf = scm_i_take_stringbufn (str, len);
res = scm_double_cell (STRING_TAG,
SCM_UNPACK (buf),
(scm_t_bits) 0, (scm_t_bits) len);
SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
return res;
}
@ -917,33 +1244,143 @@ scm_take_locale_string (char *str)
return scm_take_locale_stringn (str, -1);
}
char *
scm_to_locale_stringn (SCM str, size_t *lenp)
/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
and \UXXXXXX. */
static void
unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
{
char *res;
size_t len;
char *before, *after;
size_t i, j;
before = *bufp;
after = *bufp;
i = 0;
j = 0;
while (i < *lenp)
{
if ((i <= *lenp - 6)
&& before[i] == '\\'
&& before[i + 1] == 'u'
&& before[i + 2] == '0' && before[i + 3] == '0')
{
/* Convert \u00NN to \xNN */
after[j] = '\\';
after[j + 1] = 'x';
after[j + 2] = tolower (before[i + 4]);
after[j + 3] = tolower (before[i + 5]);
i += 6;
j += 4;
}
else if ((i <= *lenp - 10)
&& before[i] == '\\'
&& before[i + 1] == 'U'
&& before[i + 2] == '0' && before[i + 3] == '0')
{
/* Convert \U00NNNNNN to \UNNNNNN */
after[j] = '\\';
after[j + 1] = 'U';
after[j + 2] = tolower (before[i + 4]);
after[j + 3] = tolower (before[i + 5]);
after[j + 4] = tolower (before[i + 6]);
after[j + 5] = tolower (before[i + 7]);
after[j + 6] = tolower (before[i + 8]);
after[j + 7] = tolower (before[i + 9]);
i += 10;
j += 8;
}
else
{
after[j] = before[i];
i++;
j++;
}
}
*lenp = j;
after = scm_realloc (after, j);
}
char *
scm_to_locale_stringn (SCM str, size_t * lenp)
{
const char *enc;
/* In the future, enc will hold the port's encoding. */
enc = NULL;
return scm_to_stringn (str, lenp, enc, iconveh_escape_sequence);
}
/* Low-level scheme to C string conversion function. */
char *
scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
enum iconv_ilseq_handler handler)
{
static const char iso[11] = "ISO-8859-1";
char *buf;
size_t ilen, len, i;
if (!scm_is_string (str))
scm_wrong_type_arg_msg (NULL, 0, str, "string");
len = scm_i_string_length (str);
res = scm_malloc (len + ((lenp==NULL)? 1 : 0));
memcpy (res, scm_i_string_chars (str), len);
ilen = scm_i_string_length (str);
if (ilen == 0)
{
buf = scm_malloc (1);
buf[0] = '\0';
if (lenp)
*lenp = 0;
return buf;
}
if (lenp == NULL)
{
res[len] = '\0';
if (strlen (res) != len)
{
free (res);
for (i = 0; i < ilen; i++)
if (scm_i_string_ref (str, i) == '\0')
scm_misc_error (NULL,
"string contains #\\nul character: ~S",
scm_list_1 (str));
}
if (scm_i_is_narrow_string (str))
{
if (lenp)
{
buf = scm_malloc (ilen);
memcpy (buf, scm_i_string_chars (str), ilen);
*lenp = ilen;
return buf;
}
else
{
buf = scm_malloc (ilen + 1);
memcpy (buf, scm_i_string_chars (str), ilen);
buf[ilen] = '\0';
return buf;
}
}
buf = NULL;
len = 0;
buf = u32_conv_to_encoding (iso,
handler,
(scm_t_uint32 *) scm_i_string_wide_chars (str),
ilen, NULL, NULL, &len);
if (buf == NULL)
scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
scm_list_2 (scm_from_locale_string (iso), str));
if (handler == iconveh_escape_sequence)
unistring_escapes_to_guile_escapes (&buf, &len);
if (lenp)
*lenp = len;
else
{
buf = scm_realloc (buf, len + 1);
buf[len] = '\0';
}
scm_remember_upto_here_1 (str);
return res;
return buf;
}
char *
@ -956,11 +1393,14 @@ size_t
scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
{
size_t len;
char *result = NULL;
if (!scm_is_string (str))
scm_wrong_type_arg_msg (NULL, 0, str, "string");
len = scm_i_string_length (str);
memcpy (buf, scm_i_string_chars (str), (len > max_len)? max_len : len);
result = scm_to_locale_stringn (str, &len);
memcpy (buf, result, (len > max_len) ? max_len : len);
free (result);
scm_remember_upto_here_1 (str);
return len;
}
@ -1081,6 +1521,7 @@ scm_i_deprecated_string_chars (SCM str)
/* The following is still wrong, of course...
*/
str = scm_i_string_start_writing (str);
chars = scm_i_string_writable_chars (str);
scm_i_string_stop_writing ();
return chars;

View file

@ -23,6 +23,7 @@
#include <uniconv.h>
#include "libguile/__scm.h"
@ -46,26 +47,37 @@
Internal, low level interface to the character arrays
- Use scm_i_string_chars to get a pointer to the byte array of a
string for reading. Use scm_i_string_length to get the number of
bytes in that array. The array is not null-terminated.
- Use scm_is_narrow_string to determine is the string is narrow or
wide.
- Use scm_i_string_chars or scm_i_string_wide_chars to get a
pointer to the byte or scm_t_wchar array of a string for reading.
Use scm_i_string_length to get the number of characters in that
array. The array is not null-terminated.
- The array is valid as long as the corresponding SCM object is
protected but only until the next SCM_TICK. During such a 'safe
point', strings might change their representation.
- Use scm_i_string_writable_chars to get the same pointer as with
scm_i_string_chars, but for reading and writing. This is a
potentially costly operation since it implements the
copy-on-write behavior. When done with the writing, call
scm_i_string_stop_writing. You must do this before the next
SCM_TICK. (This means, before calling almost any other scm_
function and you can't allow throws, of course.)
- Use scm_i_string_start_writing to get a version of the string
ready for reading and writing. This is a potentially costly
operation since it implements the copy-on-write behavior. When
done with the writing, call scm_i_string_stop_writing. You must
do this before the next SCM_TICK. (This means, before calling
almost any other scm_ function and you can't allow throws, of
course.)
- New strings can be created with scm_i_make_string. This gives
access to a writable pointer that remains valid as long as nobody
else makes a copy-on-write substring of the string. Do not call
scm_i_string_stop_writing for this pointer.
- New strings can be created with scm_i_make_string or
scm_i_make_wide_string. This gives access to a writable pointer
that remains valid as long as nobody else makes a copy-on-write
substring of the string. Do not call scm_i_string_stop_writing
for this pointer.
- Alternately, scm_i_string_ref and scm_i_string_set_x can be used
to read and write strings without worrying about whether the
string is narrow or wide. scm_i_string_set_x still needs to be
bracketed by scm_i_string_start_writing and
scm_i_string_stop_writing.
Legacy interface
@ -74,13 +86,15 @@
- SCM_STRING_CHARS uses scm_i_string_writable_chars and immediately
calls scm_i_stop_writing, hoping for the best. SCM_STRING_LENGTH
is the same as scm_i_string_length. SCM_STRING_CHARS will throw
an error for for strings that are not null-terminated.
an error for for strings that are not null-terminated. There is
no wide version of this interface.
*/
SCM_API SCM scm_string_p (SCM x);
SCM_API SCM scm_string (SCM chrs);
SCM_API SCM scm_make_string (SCM k, SCM chr);
SCM_API SCM scm_string_length (SCM str);
SCM_API SCM scm_string_width (SCM str);
SCM_API SCM scm_string_ref (SCM str, SCM k);
SCM_API SCM scm_string_set_x (SCM str, SCM k, SCM chr);
SCM_API SCM scm_substring (SCM str, SCM start, SCM end);
@ -106,6 +120,9 @@ SCM_API SCM scm_take_locale_string (char *str);
SCM_API SCM scm_take_locale_stringn (char *str, size_t len);
SCM_API char *scm_to_locale_string (SCM str);
SCM_API char *scm_to_locale_stringn (SCM str, size_t *lenp);
SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp,
const char *encoding,
enum iconv_ilseq_handler handler);
SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len);
SCM_API SCM scm_makfromstrs (int argc, char **argv);
@ -113,15 +130,20 @@ SCM_API SCM scm_makfromstrs (int argc, char **argv);
/* internal accessor functions. Arguments must be valid. */
SCM_INTERNAL SCM scm_i_make_string (size_t len, char **datap);
SCM_INTERNAL SCM scm_i_make_wide_string (size_t len, scm_t_wchar **datap);
SCM_INTERNAL SCM scm_i_substring (SCM str, size_t start, size_t end);
SCM_INTERNAL SCM scm_i_substring_read_only (SCM str, size_t start, size_t end);
SCM_INTERNAL SCM scm_i_substring_shared (SCM str, size_t start, size_t end);
SCM_INTERNAL SCM scm_i_substring_copy (SCM str, size_t start, size_t end);
SCM_INTERNAL size_t scm_i_string_length (SCM str);
SCM_API /* FIXME: not internal */ const char *scm_i_string_chars (SCM str);
SCM_API const scm_t_wchar *scm_i_string_wide_chars (SCM str);
SCM_API /* FIXME: not internal */ char *scm_i_string_writable_chars (SCM str);
SCM_INTERNAL SCM scm_i_string_start_writing (SCM str);
SCM_INTERNAL void scm_i_string_stop_writing (void);
SCM_INTERNAL int scm_i_is_narrow_string (SCM str);
SCM_INTERNAL scm_t_wchar scm_i_string_ref (SCM str, size_t x);
SCM_INTERNAL void scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr);
/* internal functions related to symbols. */
SCM_INTERNAL SCM scm_i_make_symbol (SCM name, scm_t_bits flags,
@ -133,8 +155,11 @@ SCM_INTERNAL SCM
scm_i_c_take_symbol (char *name, size_t len,
scm_t_bits flags, unsigned long hash, SCM props);
SCM_INTERNAL const char *scm_i_symbol_chars (SCM sym);
SCM_INTERNAL const scm_t_wchar *scm_i_symbol_wide_chars (SCM sym);
SCM_INTERNAL size_t scm_i_symbol_length (SCM sym);
SCM_INTERNAL int scm_i_is_narrow_symbol (SCM str);
SCM_INTERNAL SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end);
SCM_INTERNAL scm_t_wchar scm_i_symbol_ref (SCM sym, size_t x);
/* internal GC functions. */

View file

@ -336,6 +336,7 @@ do { \
#define FETCH() (*ip++)
#define FETCH_LENGTH(len) do { len=*ip++; len<<=8; len+=*ip++; len<<=8; len+=*ip++; } while (0)
#define FETCH_WIDTH(width) do { width=*ip++; } while (0)
#undef CLOCK
#if VM_USE_CLOCK

View file

@ -72,31 +72,82 @@ VM_DEFINE_LOADER (82, load_number, "load-number")
VM_DEFINE_LOADER (83, load_string, "load-string")
{
size_t len;
int width;
SCM str;
FETCH_LENGTH (len);
FETCH_WIDTH (width);
SYNC_REGISTER ();
PUSH (scm_from_locale_stringn ((char *)ip, len));
/* Was: scm_makfromstr (ip, len, 0) */
ip += len;
if (width == 1)
{
char *buf;
str = scm_i_make_string (len, &buf);
memcpy (buf, (char *) ip, len);
}
else if (width == 4)
{
scm_t_wchar *wbuf;
str = scm_i_make_wide_string (len, &wbuf);
memcpy ((char *) wbuf, (char *) ip, len * width);
}
else
SCM_MISC_ERROR ("load-string: invalid character width", SCM_EOL);
PUSH (str);
ip += len * width;
NEXT;
}
VM_DEFINE_LOADER (84, load_symbol, "load-symbol")
{
size_t len;
int width;
SCM str;
FETCH_LENGTH (len);
FETCH_WIDTH (width);
SYNC_REGISTER ();
PUSH (scm_from_locale_symboln ((char *)ip, len));
ip += len;
if (width == 1)
{
char *buf;
str = scm_i_make_string (len, &buf);
memcpy (buf, (char *) ip, len);
}
else if (width == 4)
{
scm_t_wchar *wbuf;
str = scm_i_make_wide_string (len, &wbuf);
memcpy ((char *) wbuf, (char *) ip, len * width);
}
else
SCM_MISC_ERROR ("load-symbol: invalid character width", SCM_EOL);
PUSH (scm_string_to_symbol (str));
ip += len * width;
NEXT;
}
VM_DEFINE_LOADER (85, load_keyword, "load-keyword")
{
size_t len;
int width;
SCM str;
FETCH_LENGTH (len);
FETCH_WIDTH (width);
SYNC_REGISTER ();
PUSH (scm_from_locale_keywordn ((char *)ip, len));
ip += len;
if (width == 1)
{
char *buf;
str = scm_i_make_string (len, &buf);
memcpy (buf, (char *) ip, len);
}
else if (width == 4)
{
scm_t_wchar *wbuf;
str = scm_i_make_wide_string (len, &wbuf);
memcpy ((char *) wbuf, (char *) ip, len * width);
}
else
SCM_MISC_ERROR ("load-keyword: invalid character width", SCM_EOL);
PUSH (scm_symbol_to_keyword (scm_string_to_symbol (str)));
ip += len * width;
NEXT;
}
@ -132,13 +183,29 @@ VM_DEFINE_INSTRUCTION (87, link_now, "link-now", 0, 1, 1)
VM_DEFINE_LOADER (88, define, "define")
{
SCM sym;
SCM str, sym;
size_t len;
int width;
FETCH_LENGTH (len);
FETCH_WIDTH (width);
SYNC_REGISTER ();
sym = scm_from_locale_symboln ((char *)ip, len);
ip += len;
if (width == 1)
{
char *buf;
str = scm_i_make_string (len, &buf);
memcpy (buf, (char *) ip, len);
}
else if (width == 4)
{
scm_t_wchar *wbuf;
str = scm_i_make_wide_string (len, &wbuf);
memcpy ((char *) wbuf, (char *) ip, len * width);
}
else
SCM_MISC_ERROR ("load define: invalid character width", SCM_EOL);
sym = scm_string_to_symbol (str);
ip += len * width;
SYNC_REGISTER ();
PUSH (scm_sym2var (sym, scm_current_module_lookup_closure (), SCM_BOOL_T));

View file

@ -34,6 +34,10 @@
;; lengths are encoded in 3 bytes
(define *len-len* 3)
;; the number of bytes per string character is encoded in 1 byte
(define *width-len* 1)
(define (byte-length assembly)
(pmatch assembly
(,label (guard (not (pair? label)))
@ -45,15 +49,15 @@
((load-number ,str)
(+ 1 *len-len* (string-length str)))
((load-string ,str)
(+ 1 *len-len* (string-length str)))
(+ 1 *len-len* *width-len* (* (string-width str) (string-length str))))
((load-symbol ,str)
(+ 1 *len-len* (string-length str)))
(+ 1 *len-len* *width-len* (* (string-width str) (string-length str))))
((load-keyword ,str)
(+ 1 *len-len* (string-length str)))
(+ 1 *len-len* *width-len* (* (string-width str) (string-length str))))
((load-array ,bv)
(+ 1 *len-len* (bytevector-length bv)))
((define ,str)
(+ 1 *len-len* (string-length str)))
(+ 1 *len-len* *width-len* (* (string-width str) (string-length str))))
((load-program ,nargs ,nrest ,nlocs ,labels ,len ,meta . ,code)
(+ 1 *program-header-len* len (if meta (1- (byte-length meta)) 0)))
((,inst . _) (guard (>= (instruction-length inst) 0))

View file

@ -65,6 +65,12 @@
(write-byte (logand (ash x -8) 255))
(write-byte (logand (ash x -16) 255))
(write-byte (logand (ash x -24) 255)))
(define (write-uint32 x) (case byte-order
((1234) (write-uint32-le x))
((4321) (write-uint32-be x))
(else (error "unknown endianness" byte-order))))
(define (write-wide-string s)
(string-for-each (lambda (c) (write-uint32 (char->integer c))) s))
(define (write-loader-len len)
(write-byte (ash len -16))
(write-byte (logand (ash len -8) 255))
@ -72,6 +78,14 @@
(define (write-loader str)
(write-loader-len (string-length str))
(write-string str))
(define (write-sized-loader str)
(let ((len (string-length str))
(wid (string-width str)))
(write-loader-len len)
(write-byte wid)
(if (= wid 4)
(write-wide-string str)
(write-string str))))
(define (write-bytevector bv)
(write-loader-len (bytevector-length bv))
;; Ew!
@ -89,10 +103,6 @@
(write-uint16 (case byte-order
((1234) write-uint16-le)
((4321) write-uint16-be)
(else (error "unknown endianness" byte-order))))
(write-uint32 (case byte-order
((1234) write-uint32-le)
((4321) write-uint32-be)
(else (error "unknown endianness" byte-order)))))
(let ((opcode (instruction->opcode inst))
(len (instruction-length inst)))
@ -126,11 +136,11 @@
((load-unsigned-integer ,str) (write-loader str))
((load-integer ,str) (write-loader str))
((load-number ,str) (write-loader str))
((load-string ,str) (write-loader str))
((load-symbol ,str) (write-loader str))
((load-keyword ,str) (write-loader str))
((load-string ,str) (write-sized-loader str))
((load-symbol ,str) (write-sized-loader str))
((load-keyword ,str) (write-sized-loader str))
((load-array ,bv) (write-bytevector bv))
((define ,str) (write-loader str))
((define ,str) (write-sized-loader str))
((br ,l) (write-break l))
((br-if ,l) (write-break l))
((br-if-not ,l) (write-break l))

View file

@ -79,15 +79,15 @@
(char->integer #\1) (char->integer #\4)))
(comp-test '(load-string "foo")
(vector 'load-string 0 0 3 (char->integer #\f) (char->integer #\o)
(vector 'load-string 0 0 3 1 (char->integer #\f) (char->integer #\o)
(char->integer #\o)))
(comp-test '(load-symbol "foo")
(vector 'load-symbol 0 0 3 (char->integer #\f) (char->integer #\o)
(vector 'load-symbol 0 0 3 1 (char->integer #\f) (char->integer #\o)
(char->integer #\o)))
(comp-test '(load-keyword "qux")
(vector 'load-keyword 0 0 3 (char->integer #\q) (char->integer #\u)
(vector 'load-keyword 0 0 3 1 (char->integer #\q) (char->integer #\u)
(char->integer #\x)))
(comp-test '(load-program 3 2 1 () 3 #f (make-int8 3) (return))