1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-24 20:30:28 +02:00

Add initial support for wide symbols

* libguile/hash.c (scm_i_string_hash): new function
  (scm_hasher): don't unpack string: use scm_i_string_hash

* libguile/hash.h: new declaration for scm_i_string_hash

* libguile/print.c (quote_keywordish_symbol): use symbol accessors
  (scm_i_print_symbol_name): new function
  (scm_print_symbol_name): call scm_i_print_symbol_name
  (iprin1): use scm_i_print_symbol_name to print symbols

* libguile/print.h: new declaration for scm_i_print_symbol_name

* libguile/symbols.c (lookup_interned_symbol): now takes scheme string
  instead of c string; callers changed
  (lookup_interned_symbol): add wide symbol support
  (scm_i_c_mem2symbol): removed
  (scm_i_mem2symbol): removed and replaced with scm_i_str2symbol
  (scm_i_str2symbol): new function
  (scm_i_mem2uninterned_symbol): removed and replaced with
  scm_i_str2uninterned_symbol
  (scm_i_str2uninterned_symbol): new function
  (scm_make_symbol, scm_string_to_symbol, scm_from_locale_symbol)
  (scm_from_locale_symboln): use scm_i_str2symbol

* test-suite/tests/symbols.test: new tests
This commit is contained in:
Michael Gran 2009-08-21 08:57:35 -07:00
parent 90305ce9e4
commit e23106d53e
6 changed files with 112 additions and 111 deletions

View file

@ -50,6 +50,20 @@ scm_string_hash (const unsigned char *str, size_t len)
return h;
}
unsigned long
scm_i_string_hash (SCM str)
{
size_t len = scm_i_string_length (str);
size_t i = 0;
unsigned long h = 0;
while (len-- > 0)
h = (unsigned long) scm_i_string_ref (str, i++) + h * 37;
scm_remember_upto_here_1 (str);
return h;
}
/* Dirk:FIXME:: why downcase for characters? (2x: scm_hasher, scm_ihashv) */
/* Dirk:FIXME:: scm_hasher could be made static. */
@ -115,8 +129,7 @@ scm_hasher(SCM obj, unsigned long n, size_t d)
case scm_tc7_string:
{
unsigned long hash =
scm_string_hash ((const unsigned char *) scm_i_string_chars (obj),
scm_i_string_length (obj)) % n;
scm_i_string_hash (obj) % n;
scm_remember_upto_here_1 (obj);
return hash;
}

View file

@ -28,6 +28,7 @@
SCM_API unsigned long scm_string_hash (const unsigned char *str, size_t len);
SCM_INTERNAL unsigned long scm_i_string_hash (SCM str);
SCM_API unsigned long scm_hasher (SCM obj, unsigned long n, size_t d);
SCM_API unsigned long scm_ihashq (SCM obj, unsigned long n);
SCM_API SCM scm_hashq (SCM obj, SCM n);

View file

@ -295,13 +295,12 @@ print_circref (SCM port, scm_print_state *pstate, SCM ref)
/* Print the name of a symbol. */
static int
quote_keywordish_symbol (const char *str, size_t len)
quote_keywordish_symbol (SCM symbol)
{
SCM option;
/* LEN is guaranteed to be > 0.
*/
if (str[0] != ':' && str[len-1] != ':')
if (scm_i_symbol_ref (symbol, 0) != ':'
&& scm_i_symbol_ref (symbol, scm_i_symbol_length (symbol) - 1) != ':')
return 0;
option = SCM_PRINT_KEYWORD_STYLE;
@ -313,7 +312,7 @@ quote_keywordish_symbol (const char *str, size_t len)
}
void
scm_print_symbol_name (const char *str, size_t len, SCM port)
scm_i_print_symbol_name (SCM str, SCM port)
{
/* This points to the first character that has not yet been written to the
* port. */
@ -334,18 +333,20 @@ scm_print_symbol_name (const char *str, size_t len, SCM port)
* simpler and faster. */
int maybe_weird = 0;
size_t mw_pos = 0;
size_t len = scm_i_symbol_length (str);
scm_t_wchar str0 = scm_i_symbol_ref (str, 0);
if (len == 0 || str[0] == '\'' || str[0] == '`' || str[0] == ','
|| quote_keywordish_symbol (str, len)
|| (str[0] == '.' && len == 1)
|| scm_is_true (scm_c_locale_stringn_to_number (str, len, 10)))
if (len == 0 || str0 == '\'' || str0 == '`' || str0 == ','
|| quote_keywordish_symbol (str)
|| (str0 == '.' && len == 1)
|| scm_is_true (scm_i_string_to_number (scm_symbol_to_string (str), 10)))
{
scm_lfwrite ("#{", 2, port);
weird = 1;
}
for (end = pos; end < len; ++end)
switch (str[end])
switch (scm_i_symbol_ref (str, end))
{
#ifdef BRACKETS_AS_PARENS
case '[':
@ -370,11 +371,11 @@ scm_print_symbol_name (const char *str, size_t len, SCM port)
weird = 1;
}
if (pos < end)
scm_lfwrite (str + pos, end - pos, port);
scm_lfwrite_substr (scm_symbol_to_string (str), pos, end, port);
{
char buf[2];
buf[0] = '\\';
buf[1] = str[end];
buf[1] = (char) (unsigned char) scm_i_symbol_ref (str, end);
scm_lfwrite (buf, 2, port);
}
pos = end + 1;
@ -392,11 +393,18 @@ scm_print_symbol_name (const char *str, size_t len, SCM port)
break;
}
if (pos < end)
scm_lfwrite (str + pos, end - pos, port);
scm_lfwrite_substr (scm_symbol_to_string (str), pos, end, port);
if (weird)
scm_lfwrite ("}#", 2, port);
}
void
scm_print_symbol_name (const char *str, size_t len, SCM port)
{
SCM symbol = scm_from_locale_symboln (str, len);
return scm_i_print_symbol_name (symbol, port);
}
/* Print generally. Handles both write and display according to PSTATE.
*/
SCM_GPROC(s_write, "write", 1, 1, 0, scm_write, g_write);
@ -665,16 +673,13 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
case scm_tc7_symbol:
if (scm_i_symbol_is_interned (exp))
{
scm_print_symbol_name (scm_i_symbol_chars (exp),
scm_i_symbol_length (exp), port);
scm_i_print_symbol_name (exp, port);
scm_remember_upto_here_1 (exp);
}
else
{
scm_puts ("#<uninterned-symbol ", port);
scm_print_symbol_name (scm_i_symbol_chars (exp),
scm_i_symbol_length (exp),
port);
scm_i_print_symbol_name (exp, port);
scm_putc (' ', port);
scm_uintprint (SCM_UNPACK (exp), 16, port);
scm_putc ('>', port);
@ -726,14 +731,16 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
EXIT_NESTED_DATA (pstate);
break;
case scm_tcs_subrs:
scm_puts (SCM_SUBR_GENERIC (exp)
? "#<primitive-generic "
: "#<primitive-procedure ",
port);
scm_puts (scm_i_symbol_chars (SCM_SUBR_NAME (exp)), port);
scm_putc ('>', port);
break;
{
SCM name = scm_symbol_to_string (SCM_SUBR_NAME (exp));
scm_puts (SCM_SUBR_GENERIC (exp)
? "#<primitive-generic "
: "#<primitive-procedure ",
port);
scm_lfwrite_str (name, port);
scm_putc ('>', port);
break;
}
case scm_tc7_pws:
scm_puts ("#<procedure-with-setter", port);
{

View file

@ -82,6 +82,7 @@ SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port);
SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port);
SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port);
SCM_API void scm_iprlist (char *hdr, SCM exp, int tlr, SCM port, scm_print_state *pstate);
SCM_INTERNAL void scm_i_print_symbol_name (SCM sym, SCM port);
SCM_API void scm_print_symbol_name (const char *str, size_t len, SCM port);
SCM_API void scm_prin1 (SCM exp, SCM port, int writingp);
SCM_API void scm_iprin1 (SCM exp, SCM port, scm_print_state *pstate);

View file

@ -89,11 +89,11 @@ scm_i_hash_symbol (SCM obj, unsigned long n, void *closure)
}
static SCM
lookup_interned_symbol (const char *name, size_t len,
unsigned long raw_hash)
lookup_interned_symbol (SCM name, unsigned long raw_hash)
{
/* Try to find the symbol in the symbols table */
SCM l;
size_t len = scm_i_string_length (name);
unsigned long hash = raw_hash % SCM_HASHTABLE_N_BUCKETS (symbols);
for (l = SCM_HASHTABLE_BUCKET (symbols, hash);
@ -104,15 +104,32 @@ lookup_interned_symbol (const char *name, size_t len,
if (scm_i_symbol_hash (sym) == raw_hash
&& scm_i_symbol_length (sym) == len)
{
const char *chrs = scm_i_symbol_chars (sym);
size_t i = len;
size_t i = len;
while (i != 0)
{
--i;
if (name[i] != chrs[i])
goto next_symbol;
}
/* Slightly faster path for comparing narrow to narrow. */
if (scm_i_is_narrow_string (name) && scm_i_is_narrow_symbol (sym))
{
const char *chrs = scm_i_symbol_chars (sym);
const char *str = scm_i_string_chars (name);
while (i != 0)
{
--i;
if (str[i] != chrs[i])
goto next_symbol;
}
}
else
{
/* Somewhat slower path for comparing narrow to wide or
wide to wide. */
while (i != 0)
{
--i;
if (scm_i_string_ref (name, i) != scm_i_symbol_ref (sym, i))
goto next_symbol;
}
}
return sym;
}
@ -142,32 +159,12 @@ intern_symbol (SCM symbol)
}
static SCM
scm_i_c_mem2symbol (const char *name, size_t len)
scm_i_str2symbol (SCM str)
{
SCM symbol;
size_t raw_hash = scm_string_hash ((const unsigned char *) name, len);
size_t raw_hash = scm_i_string_hash (str);
symbol = lookup_interned_symbol (name, len, raw_hash);
if (scm_is_false (symbol))
{
/* The symbol was not found, create it. */
symbol = scm_i_c_make_symbol (name, len, 0, raw_hash,
scm_cons (SCM_BOOL_F, SCM_EOL));
intern_symbol (symbol);
}
return symbol;
}
static SCM
scm_i_mem2symbol (SCM str)
{
SCM symbol;
const char *name = scm_i_string_chars (str);
size_t len = scm_i_string_length (str);
size_t raw_hash = scm_string_hash ((const unsigned char *) name, len);
symbol = lookup_interned_symbol (name, len, raw_hash);
symbol = lookup_interned_symbol (str, raw_hash);
if (scm_is_false (symbol))
{
/* The symbol was not found, create it. */
@ -181,11 +178,9 @@ scm_i_mem2symbol (SCM str)
static SCM
scm_i_mem2uninterned_symbol (SCM str)
scm_i_str2uninterned_symbol (SCM str)
{
const char *name = scm_i_string_chars (str);
size_t len = scm_i_string_length (str);
size_t raw_hash = scm_string_hash ((const unsigned char *) name, len);
size_t raw_hash = scm_i_string_hash (str);
return scm_i_make_symbol (str, SCM_I_F_SYMBOL_UNINTERNED,
raw_hash, scm_cons (SCM_BOOL_F, SCM_EOL));
@ -220,7 +215,7 @@ SCM_DEFINE (scm_make_symbol, "make-symbol", 1, 0, 0,
#define FUNC_NAME s_scm_make_symbol
{
SCM_VALIDATE_STRING (1, name);
return scm_i_mem2uninterned_symbol (name);
return scm_i_str2uninterned_symbol (name);
}
#undef FUNC_NAME
@ -282,7 +277,7 @@ SCM_DEFINE (scm_string_to_symbol, "string->symbol", 1, 0, 0,
#define FUNC_NAME s_scm_string_to_symbol
{
SCM_VALIDATE_STRING (1, string);
return scm_i_mem2symbol (string);
return scm_i_str2symbol (string);
}
#undef FUNC_NAME
@ -389,44 +384,23 @@ SCM_DEFINE (scm_symbol_pset_x, "symbol-pset!", 2, 0, 0,
SCM
scm_from_locale_symbol (const char *sym)
{
return scm_i_c_mem2symbol (sym, strlen (sym));
return scm_from_locale_symboln (sym, -1);
}
SCM
scm_from_locale_symboln (const char *sym, size_t len)
{
return scm_i_c_mem2symbol (sym, len);
SCM str = scm_from_locale_stringn (sym, len);
return scm_i_str2symbol (str);
}
SCM
scm_take_locale_symboln (char *sym, size_t len)
{
SCM res;
unsigned long raw_hash;
SCM str;
if (len == (size_t)-1)
len = strlen (sym);
else
{
/* Ensure STR is null terminated. A realloc for 1 extra byte should
often be satisfied from the alignment padding after the block, with
no actual data movement. */
sym = scm_realloc (sym, len+1);
sym[len] = '\0';
}
raw_hash = scm_string_hash ((unsigned char *)sym, len);
res = lookup_interned_symbol (sym, len, raw_hash);
if (scm_is_false (res))
{
res = scm_i_c_take_symbol (sym, len, 0, raw_hash,
scm_cons (SCM_BOOL_F, SCM_EOL));
intern_symbol (res);
}
else
free (sym);
return res;
str = scm_take_locale_stringn (sym, len);
return scm_i_str2symbol (str);
}
SCM

View file

@ -61,15 +61,13 @@
(let ((s 'x0123456789012345678901234567890123456789))
(not (assq-ref (%symbol-dump s) 'stringbuf-inline))))
;; symbol->string isn't ready for UCS-4 yet
(pass-if "short UCS-4-encoded symbols are not inlined"
(let ((s (string->symbol "\u0100")))
(not (assq-ref (%symbol-dump s) 'stringbuf-inline))))
;;(pass-if "short UCS-4-encoded symbols are not inlined"
;; (let ((s (string->symbol "\u0100")))
;; (not (assq-ref (%symbol-dump s) 'stringbuf-inline))))
;;(pass-if "long UCS-4-encoded symbols are not inlined"
;; (let ((s (string->symbol "\u010012345678901234567890123456789")))
;; (not (assq-ref (%symbol-dump s) 'stringbuf-inline))))
(pass-if "long UCS-4-encoded symbols are not inlined"
(let ((s (string->symbol "\u010012345678901234567890123456789")))
(not (assq-ref (%symbol-dump s) 'stringbuf-inline))))
(with-test-prefix "hashes"
@ -99,16 +97,13 @@
(let ((s (string->symbol "\xC0\xC1\xC2")))
(not (assq-ref (%symbol-dump s) 'stringbuf-wide))))
;; symbol->string isn't ready for UCS-4 yet
(pass-if "BMP symbols are UCS-4 encoded"
(let ((s (string->symbol "\u0100\u0101\x0102")))
(assq-ref (%symbol-dump s) 'stringbuf-wide)))
;;(pass-if "BMP symbols are UCS-4 encoded"
;; (let ((s (string->symbol "\u0100\u0101\x0102")))
;; (assq-ref (%symbol-dump s) 'stringbuf-wide)))
;;(pass-if "SMP symbols are UCS-4 encoded"
;; (let ((s (string->symbol "\U010300\u010301\x010302")))
;; (assq-ref (%symbol-dump s) 'stringbuf-wide)))
))
(pass-if "SMP symbols are UCS-4 encoded"
(let ((s (string->symbol "\U010300\u010301\x010302")))
(assq-ref (%symbol-dump s) 'stringbuf-wide)))))
;;;
;;; symbol?
@ -125,6 +120,16 @@
(pass-if "symbol"
(symbol? 'foo)))
;;;
;;; wide symbols
;;;
(with-test-prefix "BMP symbols"
(pass-if "BMP symbol's string"
(and (= 4 (string-length "abc\u0100"))
(string=? "abc\u0100"
(symbol->string (string->symbol "abc\u0100"))))))
;;;
;;; symbol->string