1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-24 20:30:28 +02:00

Add initial support for wide symbols

* libguile/hash.c (scm_i_string_hash): new function
  (scm_hasher): don't unpack string: use scm_i_string_hash

* libguile/hash.h: new declaration for scm_i_string_hash

* libguile/print.c (quote_keywordish_symbol): use symbol accessors
  (scm_i_print_symbol_name): new function
  (scm_print_symbol_name): call scm_i_print_symbol_name
  (iprin1): use scm_i_print_symbol_name to print symbols

* libguile/print.h: new declaration for scm_i_print_symbol_name

* libguile/symbols.c (lookup_interned_symbol): now takes scheme string
  instead of c string; callers changed
  (lookup_interned_symbol): add wide symbol support
  (scm_i_c_mem2symbol): removed
  (scm_i_mem2symbol): removed and replaced with scm_i_str2symbol
  (scm_i_str2symbol): new function
  (scm_i_mem2uninterned_symbol): removed and replaced with
  scm_i_str2uninterned_symbol
  (scm_i_str2uninterned_symbol): new function
  (scm_make_symbol, scm_string_to_symbol, scm_from_locale_symbol)
  (scm_from_locale_symboln): use scm_i_str2symbol

* test-suite/tests/symbols.test: new tests
This commit is contained in:
Michael Gran 2009-08-21 08:57:35 -07:00
parent 90305ce9e4
commit e23106d53e
6 changed files with 112 additions and 111 deletions

View file

@ -50,6 +50,20 @@ scm_string_hash (const unsigned char *str, size_t len)
return h; return h;
} }
unsigned long
scm_i_string_hash (SCM str)
{
size_t len = scm_i_string_length (str);
size_t i = 0;
unsigned long h = 0;
while (len-- > 0)
h = (unsigned long) scm_i_string_ref (str, i++) + h * 37;
scm_remember_upto_here_1 (str);
return h;
}
/* Dirk:FIXME:: why downcase for characters? (2x: scm_hasher, scm_ihashv) */ /* Dirk:FIXME:: why downcase for characters? (2x: scm_hasher, scm_ihashv) */
/* Dirk:FIXME:: scm_hasher could be made static. */ /* Dirk:FIXME:: scm_hasher could be made static. */
@ -115,8 +129,7 @@ scm_hasher(SCM obj, unsigned long n, size_t d)
case scm_tc7_string: case scm_tc7_string:
{ {
unsigned long hash = unsigned long hash =
scm_string_hash ((const unsigned char *) scm_i_string_chars (obj), scm_i_string_hash (obj) % n;
scm_i_string_length (obj)) % n;
scm_remember_upto_here_1 (obj); scm_remember_upto_here_1 (obj);
return hash; return hash;
} }

View file

@ -28,6 +28,7 @@
SCM_API unsigned long scm_string_hash (const unsigned char *str, size_t len); SCM_API unsigned long scm_string_hash (const unsigned char *str, size_t len);
SCM_INTERNAL unsigned long scm_i_string_hash (SCM str);
SCM_API unsigned long scm_hasher (SCM obj, unsigned long n, size_t d); SCM_API unsigned long scm_hasher (SCM obj, unsigned long n, size_t d);
SCM_API unsigned long scm_ihashq (SCM obj, unsigned long n); SCM_API unsigned long scm_ihashq (SCM obj, unsigned long n);
SCM_API SCM scm_hashq (SCM obj, SCM n); SCM_API SCM scm_hashq (SCM obj, SCM n);

View file

@ -295,13 +295,12 @@ print_circref (SCM port, scm_print_state *pstate, SCM ref)
/* Print the name of a symbol. */ /* Print the name of a symbol. */
static int static int
quote_keywordish_symbol (const char *str, size_t len) quote_keywordish_symbol (SCM symbol)
{ {
SCM option; SCM option;
/* LEN is guaranteed to be > 0. if (scm_i_symbol_ref (symbol, 0) != ':'
*/ && scm_i_symbol_ref (symbol, scm_i_symbol_length (symbol) - 1) != ':')
if (str[0] != ':' && str[len-1] != ':')
return 0; return 0;
option = SCM_PRINT_KEYWORD_STYLE; option = SCM_PRINT_KEYWORD_STYLE;
@ -313,7 +312,7 @@ quote_keywordish_symbol (const char *str, size_t len)
} }
void void
scm_print_symbol_name (const char *str, size_t len, SCM port) scm_i_print_symbol_name (SCM str, SCM port)
{ {
/* This points to the first character that has not yet been written to the /* This points to the first character that has not yet been written to the
* port. */ * port. */
@ -334,18 +333,20 @@ scm_print_symbol_name (const char *str, size_t len, SCM port)
* simpler and faster. */ * simpler and faster. */
int maybe_weird = 0; int maybe_weird = 0;
size_t mw_pos = 0; size_t mw_pos = 0;
size_t len = scm_i_symbol_length (str);
scm_t_wchar str0 = scm_i_symbol_ref (str, 0);
if (len == 0 || str[0] == '\'' || str[0] == '`' || str[0] == ',' if (len == 0 || str0 == '\'' || str0 == '`' || str0 == ','
|| quote_keywordish_symbol (str, len) || quote_keywordish_symbol (str)
|| (str[0] == '.' && len == 1) || (str0 == '.' && len == 1)
|| scm_is_true (scm_c_locale_stringn_to_number (str, len, 10))) || scm_is_true (scm_i_string_to_number (scm_symbol_to_string (str), 10)))
{ {
scm_lfwrite ("#{", 2, port); scm_lfwrite ("#{", 2, port);
weird = 1; weird = 1;
} }
for (end = pos; end < len; ++end) for (end = pos; end < len; ++end)
switch (str[end]) switch (scm_i_symbol_ref (str, end))
{ {
#ifdef BRACKETS_AS_PARENS #ifdef BRACKETS_AS_PARENS
case '[': case '[':
@ -370,11 +371,11 @@ scm_print_symbol_name (const char *str, size_t len, SCM port)
weird = 1; weird = 1;
} }
if (pos < end) if (pos < end)
scm_lfwrite (str + pos, end - pos, port); scm_lfwrite_substr (scm_symbol_to_string (str), pos, end, port);
{ {
char buf[2]; char buf[2];
buf[0] = '\\'; buf[0] = '\\';
buf[1] = str[end]; buf[1] = (char) (unsigned char) scm_i_symbol_ref (str, end);
scm_lfwrite (buf, 2, port); scm_lfwrite (buf, 2, port);
} }
pos = end + 1; pos = end + 1;
@ -392,11 +393,18 @@ scm_print_symbol_name (const char *str, size_t len, SCM port)
break; break;
} }
if (pos < end) if (pos < end)
scm_lfwrite (str + pos, end - pos, port); scm_lfwrite_substr (scm_symbol_to_string (str), pos, end, port);
if (weird) if (weird)
scm_lfwrite ("}#", 2, port); scm_lfwrite ("}#", 2, port);
} }
void
scm_print_symbol_name (const char *str, size_t len, SCM port)
{
SCM symbol = scm_from_locale_symboln (str, len);
return scm_i_print_symbol_name (symbol, port);
}
/* Print generally. Handles both write and display according to PSTATE. /* Print generally. Handles both write and display according to PSTATE.
*/ */
SCM_GPROC(s_write, "write", 1, 1, 0, scm_write, g_write); SCM_GPROC(s_write, "write", 1, 1, 0, scm_write, g_write);
@ -665,16 +673,13 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
case scm_tc7_symbol: case scm_tc7_symbol:
if (scm_i_symbol_is_interned (exp)) if (scm_i_symbol_is_interned (exp))
{ {
scm_print_symbol_name (scm_i_symbol_chars (exp), scm_i_print_symbol_name (exp, port);
scm_i_symbol_length (exp), port);
scm_remember_upto_here_1 (exp); scm_remember_upto_here_1 (exp);
} }
else else
{ {
scm_puts ("#<uninterned-symbol ", port); scm_puts ("#<uninterned-symbol ", port);
scm_print_symbol_name (scm_i_symbol_chars (exp), scm_i_print_symbol_name (exp, port);
scm_i_symbol_length (exp),
port);
scm_putc (' ', port); scm_putc (' ', port);
scm_uintprint (SCM_UNPACK (exp), 16, port); scm_uintprint (SCM_UNPACK (exp), 16, port);
scm_putc ('>', port); scm_putc ('>', port);
@ -726,14 +731,16 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
EXIT_NESTED_DATA (pstate); EXIT_NESTED_DATA (pstate);
break; break;
case scm_tcs_subrs: case scm_tcs_subrs:
scm_puts (SCM_SUBR_GENERIC (exp) {
? "#<primitive-generic " SCM name = scm_symbol_to_string (SCM_SUBR_NAME (exp));
: "#<primitive-procedure ", scm_puts (SCM_SUBR_GENERIC (exp)
port); ? "#<primitive-generic "
scm_puts (scm_i_symbol_chars (SCM_SUBR_NAME (exp)), port); : "#<primitive-procedure ",
scm_putc ('>', port); port);
break; scm_lfwrite_str (name, port);
scm_putc ('>', port);
break;
}
case scm_tc7_pws: case scm_tc7_pws:
scm_puts ("#<procedure-with-setter", port); scm_puts ("#<procedure-with-setter", port);
{ {

View file

@ -82,6 +82,7 @@ SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port);
SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port); SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port);
SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port); SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port);
SCM_API void scm_iprlist (char *hdr, SCM exp, int tlr, SCM port, scm_print_state *pstate); SCM_API void scm_iprlist (char *hdr, SCM exp, int tlr, SCM port, scm_print_state *pstate);
SCM_INTERNAL void scm_i_print_symbol_name (SCM sym, SCM port);
SCM_API void scm_print_symbol_name (const char *str, size_t len, SCM port); SCM_API void scm_print_symbol_name (const char *str, size_t len, SCM port);
SCM_API void scm_prin1 (SCM exp, SCM port, int writingp); SCM_API void scm_prin1 (SCM exp, SCM port, int writingp);
SCM_API void scm_iprin1 (SCM exp, SCM port, scm_print_state *pstate); SCM_API void scm_iprin1 (SCM exp, SCM port, scm_print_state *pstate);

View file

@ -89,11 +89,11 @@ scm_i_hash_symbol (SCM obj, unsigned long n, void *closure)
} }
static SCM static SCM
lookup_interned_symbol (const char *name, size_t len, lookup_interned_symbol (SCM name, unsigned long raw_hash)
unsigned long raw_hash)
{ {
/* Try to find the symbol in the symbols table */ /* Try to find the symbol in the symbols table */
SCM l; SCM l;
size_t len = scm_i_string_length (name);
unsigned long hash = raw_hash % SCM_HASHTABLE_N_BUCKETS (symbols); unsigned long hash = raw_hash % SCM_HASHTABLE_N_BUCKETS (symbols);
for (l = SCM_HASHTABLE_BUCKET (symbols, hash); for (l = SCM_HASHTABLE_BUCKET (symbols, hash);
@ -104,15 +104,32 @@ lookup_interned_symbol (const char *name, size_t len,
if (scm_i_symbol_hash (sym) == raw_hash if (scm_i_symbol_hash (sym) == raw_hash
&& scm_i_symbol_length (sym) == len) && scm_i_symbol_length (sym) == len)
{ {
const char *chrs = scm_i_symbol_chars (sym); size_t i = len;
size_t i = len;
while (i != 0) /* Slightly faster path for comparing narrow to narrow. */
{ if (scm_i_is_narrow_string (name) && scm_i_is_narrow_symbol (sym))
--i; {
if (name[i] != chrs[i]) const char *chrs = scm_i_symbol_chars (sym);
goto next_symbol; const char *str = scm_i_string_chars (name);
}
while (i != 0)
{
--i;
if (str[i] != chrs[i])
goto next_symbol;
}
}
else
{
/* Somewhat slower path for comparing narrow to wide or
wide to wide. */
while (i != 0)
{
--i;
if (scm_i_string_ref (name, i) != scm_i_symbol_ref (sym, i))
goto next_symbol;
}
}
return sym; return sym;
} }
@ -142,32 +159,12 @@ intern_symbol (SCM symbol)
} }
static SCM static SCM
scm_i_c_mem2symbol (const char *name, size_t len) scm_i_str2symbol (SCM str)
{ {
SCM symbol; SCM symbol;
size_t raw_hash = scm_string_hash ((const unsigned char *) name, len); size_t raw_hash = scm_i_string_hash (str);
symbol = lookup_interned_symbol (name, len, raw_hash); symbol = lookup_interned_symbol (str, raw_hash);
if (scm_is_false (symbol))
{
/* The symbol was not found, create it. */
symbol = scm_i_c_make_symbol (name, len, 0, raw_hash,
scm_cons (SCM_BOOL_F, SCM_EOL));
intern_symbol (symbol);
}
return symbol;
}
static SCM
scm_i_mem2symbol (SCM str)
{
SCM symbol;
const char *name = scm_i_string_chars (str);
size_t len = scm_i_string_length (str);
size_t raw_hash = scm_string_hash ((const unsigned char *) name, len);
symbol = lookup_interned_symbol (name, len, raw_hash);
if (scm_is_false (symbol)) if (scm_is_false (symbol))
{ {
/* The symbol was not found, create it. */ /* The symbol was not found, create it. */
@ -181,11 +178,9 @@ scm_i_mem2symbol (SCM str)
static SCM static SCM
scm_i_mem2uninterned_symbol (SCM str) scm_i_str2uninterned_symbol (SCM str)
{ {
const char *name = scm_i_string_chars (str); size_t raw_hash = scm_i_string_hash (str);
size_t len = scm_i_string_length (str);
size_t raw_hash = scm_string_hash ((const unsigned char *) name, len);
return scm_i_make_symbol (str, SCM_I_F_SYMBOL_UNINTERNED, return scm_i_make_symbol (str, SCM_I_F_SYMBOL_UNINTERNED,
raw_hash, scm_cons (SCM_BOOL_F, SCM_EOL)); raw_hash, scm_cons (SCM_BOOL_F, SCM_EOL));
@ -220,7 +215,7 @@ SCM_DEFINE (scm_make_symbol, "make-symbol", 1, 0, 0,
#define FUNC_NAME s_scm_make_symbol #define FUNC_NAME s_scm_make_symbol
{ {
SCM_VALIDATE_STRING (1, name); SCM_VALIDATE_STRING (1, name);
return scm_i_mem2uninterned_symbol (name); return scm_i_str2uninterned_symbol (name);
} }
#undef FUNC_NAME #undef FUNC_NAME
@ -282,7 +277,7 @@ SCM_DEFINE (scm_string_to_symbol, "string->symbol", 1, 0, 0,
#define FUNC_NAME s_scm_string_to_symbol #define FUNC_NAME s_scm_string_to_symbol
{ {
SCM_VALIDATE_STRING (1, string); SCM_VALIDATE_STRING (1, string);
return scm_i_mem2symbol (string); return scm_i_str2symbol (string);
} }
#undef FUNC_NAME #undef FUNC_NAME
@ -389,44 +384,23 @@ SCM_DEFINE (scm_symbol_pset_x, "symbol-pset!", 2, 0, 0,
SCM SCM
scm_from_locale_symbol (const char *sym) scm_from_locale_symbol (const char *sym)
{ {
return scm_i_c_mem2symbol (sym, strlen (sym)); return scm_from_locale_symboln (sym, -1);
} }
SCM SCM
scm_from_locale_symboln (const char *sym, size_t len) scm_from_locale_symboln (const char *sym, size_t len)
{ {
return scm_i_c_mem2symbol (sym, len); SCM str = scm_from_locale_stringn (sym, len);
return scm_i_str2symbol (str);
} }
SCM SCM
scm_take_locale_symboln (char *sym, size_t len) scm_take_locale_symboln (char *sym, size_t len)
{ {
SCM res; SCM str;
unsigned long raw_hash;
if (len == (size_t)-1) str = scm_take_locale_stringn (sym, len);
len = strlen (sym); return scm_i_str2symbol (str);
else
{
/* Ensure STR is null terminated. A realloc for 1 extra byte should
often be satisfied from the alignment padding after the block, with
no actual data movement. */
sym = scm_realloc (sym, len+1);
sym[len] = '\0';
}
raw_hash = scm_string_hash ((unsigned char *)sym, len);
res = lookup_interned_symbol (sym, len, raw_hash);
if (scm_is_false (res))
{
res = scm_i_c_take_symbol (sym, len, 0, raw_hash,
scm_cons (SCM_BOOL_F, SCM_EOL));
intern_symbol (res);
}
else
free (sym);
return res;
} }
SCM SCM

View file

@ -61,15 +61,13 @@
(let ((s 'x0123456789012345678901234567890123456789)) (let ((s 'x0123456789012345678901234567890123456789))
(not (assq-ref (%symbol-dump s) 'stringbuf-inline)))) (not (assq-ref (%symbol-dump s) 'stringbuf-inline))))
;; symbol->string isn't ready for UCS-4 yet (pass-if "short UCS-4-encoded symbols are not inlined"
(let ((s (string->symbol "\u0100")))
(not (assq-ref (%symbol-dump s) 'stringbuf-inline))))
;;(pass-if "short UCS-4-encoded symbols are not inlined" (pass-if "long UCS-4-encoded symbols are not inlined"
;; (let ((s (string->symbol "\u0100"))) (let ((s (string->symbol "\u010012345678901234567890123456789")))
;; (not (assq-ref (%symbol-dump s) 'stringbuf-inline)))) (not (assq-ref (%symbol-dump s) 'stringbuf-inline))))
;;(pass-if "long UCS-4-encoded symbols are not inlined"
;; (let ((s (string->symbol "\u010012345678901234567890123456789")))
;; (not (assq-ref (%symbol-dump s) 'stringbuf-inline))))
(with-test-prefix "hashes" (with-test-prefix "hashes"
@ -99,16 +97,13 @@
(let ((s (string->symbol "\xC0\xC1\xC2"))) (let ((s (string->symbol "\xC0\xC1\xC2")))
(not (assq-ref (%symbol-dump s) 'stringbuf-wide)))) (not (assq-ref (%symbol-dump s) 'stringbuf-wide))))
;; symbol->string isn't ready for UCS-4 yet (pass-if "BMP symbols are UCS-4 encoded"
(let ((s (string->symbol "\u0100\u0101\x0102")))
(assq-ref (%symbol-dump s) 'stringbuf-wide)))
;;(pass-if "BMP symbols are UCS-4 encoded" (pass-if "SMP symbols are UCS-4 encoded"
;; (let ((s (string->symbol "\u0100\u0101\x0102"))) (let ((s (string->symbol "\U010300\u010301\x010302")))
;; (assq-ref (%symbol-dump s) 'stringbuf-wide))) (assq-ref (%symbol-dump s) 'stringbuf-wide)))))
;;(pass-if "SMP symbols are UCS-4 encoded"
;; (let ((s (string->symbol "\U010300\u010301\x010302")))
;; (assq-ref (%symbol-dump s) 'stringbuf-wide)))
))
;;; ;;;
;;; symbol? ;;; symbol?
@ -125,6 +120,16 @@
(pass-if "symbol" (pass-if "symbol"
(symbol? 'foo))) (symbol? 'foo)))
;;;
;;; wide symbols
;;;
(with-test-prefix "BMP symbols"
(pass-if "BMP symbol's string"
(and (= 4 (string-length "abc\u0100"))
(string=? "abc\u0100"
(symbol->string (string->symbol "abc\u0100"))))))
;;; ;;;
;;; symbol->string ;;; symbol->string