diff --git a/libguile/symbols.c b/libguile/symbols.c index 292941e9d..1cec76f26 100644 --- a/libguile/symbols.c +++ b/libguile/symbols.c @@ -1,4 +1,4 @@ -/* Copyright 1995-1998,2000-2001,2003-2004,2006,2009,2011,2013,2015,2018,2022,2023 +/* Copyright 1995-1998,2000-2001,2003-2004,2006,2009,2011,2013,2015,2018,2022,2023,2025 Free Software Foundation, Inc. This file is part of Guile. @@ -29,6 +29,7 @@ #include "alist.h" #include "boolean.h" #include "chars.h" +#include "ephemerons.h" #include "eval.h" #include "fluids.h" #include "gsubr.h" @@ -46,14 +47,13 @@ #include "threads.h" #include "variable.h" #include "vectors.h" -#include "weak-set.h" #include "symbols.h" -static SCM symbols; +static struct scm_ephemeron_table *symbols; #ifdef GUILE_DEBUG SCM_DEFINE (scm_sys_symbols, "%symbols", 0, 0, 0, @@ -61,7 +61,7 @@ SCM_DEFINE (scm_sys_symbols, "%symbols", 0, 0, 0, "Return the system symbol obarray.") #define FUNC_NAME s_scm_sys_symbols { - return symbols; + return scm_from_ephemeron_table (symbols); } #undef FUNC_NAME #endif @@ -77,187 +77,171 @@ scm_i_hash_symbol (SCM obj, unsigned long n, void *closure) return scm_i_symbol_hash (obj) % n; } -struct string_lookup_data -{ - SCM string; - unsigned long string_hash; -}; - static int -string_lookup_predicate_fn (SCM sym, void *closure) +symbol_equals_string (SCM sym, SCM str, size_t len, unsigned long hash) { - struct string_lookup_data *data = closure; - - if (scm_i_symbol_hash (sym) == data->string_hash - && scm_i_symbol_length (sym) == scm_i_string_length (data->string)) - { - size_t n = scm_i_symbol_length (sym); - while (n--) - if (scm_i_symbol_ref (sym, n) != scm_i_string_ref (data->string, n)) - return 0; - return 1; - } - else + if (scm_i_symbol_hash (sym) != hash) + return 0; + if (scm_i_symbol_length (sym) != len) return 0; -} - -static SCM -lookup_interned_symbol (SCM name, unsigned long raw_hash) -{ - struct string_lookup_data data; - - data.string = name; - data.string_hash = raw_hash; - return scm_c_weak_set_lookup (symbols, raw_hash, - string_lookup_predicate_fn, - &data, SCM_BOOL_F); -} + for (size_t i = 0; i < len; i++) + if (scm_i_symbol_ref (sym, i) != scm_i_string_ref (str, i)) + return 0; -struct latin1_lookup_data -{ - const char *str; - size_t len; - unsigned long string_hash; -}; + return 1; +} static int -latin1_lookup_predicate_fn (SCM sym, void *closure) +symbol_equals_latin1_string (SCM sym, const char *str, size_t len, + unsigned long hash) { - struct latin1_lookup_data *data = closure; - - return scm_i_symbol_hash (sym) == data->string_hash - && scm_i_is_narrow_symbol (sym) - && scm_i_symbol_length (sym) == data->len - && strncmp (scm_i_symbol_chars (sym), data->str, data->len) == 0; + if (scm_i_symbol_hash (sym) != hash) + return 0; + if (scm_i_symbol_length (sym) != len) + return 0; + if (!scm_i_is_narrow_symbol (sym)) + return 0; + + return strncmp (scm_i_symbol_chars (sym), str, len) == 0; } static SCM lookup_interned_latin1_symbol (const char *str, size_t len, unsigned long raw_hash) { - struct latin1_lookup_data data; - - data.str = str; - data.len = len; - data.string_hash = raw_hash; - - return scm_c_weak_set_lookup (symbols, raw_hash, - latin1_lookup_predicate_fn, - &data, SCM_BOOL_F); + size_t bucket = raw_hash % scm_c_ephemeron_table_length (symbols); + for (struct gc_ephemeron *e = scm_c_ephemeron_table_ref (symbols, bucket); + e; + e = scm_c_ephemeron_next (e)) + { + SCM sym = scm_c_ephemeron_key (e); + if (scm_is_true (sym) + && symbol_equals_latin1_string (sym, str, len, raw_hash)) + return sym; + } + return SCM_BOOL_F; } -struct utf8_lookup_data -{ - const char *str; - size_t len; - unsigned long string_hash; -}; - static int -utf8_string_equals_wide_string (const uint8_t *narrow, size_t nlen, - const scm_t_wchar *wide, size_t wlen) +utf8_string_equals_narrow_string (const uint8_t *utf8, size_t ulen, + const char *narrow) { - size_t byte_idx = 0, char_idx = 0; + /* Precondition: utf8,ulen is valid UTF-8. */ + size_t byte_idx = 0; - while (byte_idx < nlen && char_idx < wlen) + while (byte_idx < ulen) { - ucs4_t c; - int nbytes; - - nbytes = u8_mbtoucr (&c, narrow + byte_idx, nlen - byte_idx); - if (nbytes == 0) - break; - else if (nbytes < 0) - /* Bad UTF-8. */ + ucs4_t c = -1; + byte_idx += u8_mbtoucr (&c, utf8 + byte_idx, ulen - byte_idx); + if (c != *narrow) return 0; - else if (c != wide[char_idx]) - return 0; - - byte_idx += nbytes; - char_idx++; + narrow++; } - return byte_idx == nlen && char_idx == wlen; + return 1; } static int -utf8_lookup_predicate_fn (SCM sym, void *closure) +utf8_string_equals_wide_string (const uint8_t *utf8, size_t ulen, + const scm_t_wchar *wide) { - struct utf8_lookup_data *data = closure; + /* Precondition: utf8,ulen is valid UTF-8. */ + size_t byte_idx = 0; + + while (byte_idx < ulen) + { + ucs4_t c = -1; + byte_idx += u8_mbtoucr (&c, utf8 + byte_idx, ulen - byte_idx); + if (c != *wide) + return 0; + wide++; + } - if (scm_i_symbol_hash (sym) != data->string_hash) + return 1; +} + +static int +symbol_equals_utf8_string (SCM sym, const uint8_t *str, size_t len, + unsigned long hash, int codepoint_count) +{ + if (scm_i_symbol_hash (sym) != hash) + return 0; + if (scm_i_symbol_length (sym) != codepoint_count) return 0; if (scm_i_is_narrow_symbol (sym)) - return (scm_i_symbol_length (sym) == data->len - && strncmp (scm_i_symbol_chars (sym), data->str, data->len) == 0); + return utf8_string_equals_narrow_string (str, len, + scm_i_symbol_chars (sym)); else - return utf8_string_equals_wide_string ((const uint8_t *) data->str, - data->len, - scm_i_symbol_wide_chars (sym), - scm_i_symbol_length (sym)); + return utf8_string_equals_wide_string (str, len, + scm_i_symbol_wide_chars (sym)); } static SCM -lookup_interned_utf8_symbol (const char *str, size_t len, +lookup_interned_utf8_symbol (const uint8_t *str, size_t len, unsigned long raw_hash) { - struct utf8_lookup_data data; + int codepoint_count = u8_mbsnlen (str, len); + if (codepoint_count == -1) + /* Bad UTF-8. */ + return SCM_BOOL_F; - data.str = str; - data.len = len; - data.string_hash = raw_hash; - - return scm_c_weak_set_lookup (symbols, raw_hash, - utf8_lookup_predicate_fn, - &data, SCM_BOOL_F); -} + if (codepoint_count == len) + return lookup_interned_latin1_symbol ((const char *) str, len, raw_hash); -static int -symbol_lookup_predicate_fn (SCM sym, void *closure) -{ - SCM other = SCM_PACK_POINTER (closure); - - if (scm_i_symbol_hash (sym) == scm_i_symbol_hash (other) - && scm_i_symbol_length (sym) == scm_i_symbol_length (other)) + size_t bucket = raw_hash % scm_c_ephemeron_table_length (symbols); + for (struct gc_ephemeron *e = scm_c_ephemeron_table_ref (symbols, bucket); + e; + e = scm_c_ephemeron_next (e)) { - if (scm_i_is_narrow_symbol (sym)) - return scm_i_is_narrow_symbol (other) - && (strncmp (scm_i_symbol_chars (sym), - scm_i_symbol_chars (other), - scm_i_symbol_length (other)) == 0); - else - return scm_is_true - (scm_string_equal_p (scm_symbol_to_string (sym), - scm_symbol_to_string (other))); + SCM sym = scm_c_ephemeron_key (e); + if (scm_is_true (sym) + && symbol_equals_utf8_string (sym, str, len, raw_hash, + codepoint_count)) + return sym; } - return 0; + return SCM_BOOL_F; } - + static SCM scm_i_str2symbol (SCM str) { - SCM symbol; unsigned long raw_hash = scm_i_string_hash (str); + size_t bucket = raw_hash % scm_c_ephemeron_table_length (symbols); + size_t len = scm_i_string_length (str); - symbol = lookup_interned_symbol (str, raw_hash); - if (scm_is_true (symbol)) - return symbol; - else + struct gc_ephemeron *chain = scm_c_ephemeron_table_ref (symbols, bucket); + /* First see if a symbol with this name is already interned. */ + for (struct gc_ephemeron *e = chain; e; e = scm_c_ephemeron_next (e)) { - /* The symbol was not found, create it. */ - symbol = scm_i_make_symbol (str, 0, raw_hash); - - /* Might return a different symbol, if another one was interned at - the same time. */ - return scm_c_weak_set_add_x (symbols, raw_hash, - symbol_lookup_predicate_fn, - SCM_UNPACK_POINTER (symbol), symbol); + SCM sym = scm_c_ephemeron_key (e); + if (scm_is_true (sym) && symbol_equals_string (sym, str, len, raw_hash)) + return sym; + } + + /* The symbol was not found, create it. */ + SCM sym = scm_i_make_symbol (str, 0, raw_hash); + struct gc_ephemeron *link = scm_c_make_ephemeron (sym, SCM_BOOL_T); + while (1) + { + struct gc_ephemeron *prev = + scm_c_ephemeron_table_try_push_x (symbols, bucket, link, chain); + if (prev == chain) + return sym; + /* Lost a race, someone else added a symbol in this bucket. Check + the chain and try again. */ + chain = prev; + for (struct gc_ephemeron *e = chain; e; e = scm_c_ephemeron_next (e)) + { + SCM sym = scm_c_ephemeron_key (e); + if (scm_is_true (sym) + && symbol_equals_string (sym, str, len, raw_hash)) + return sym; + } } } - static SCM scm_i_str2uninterned_symbol (SCM str) { @@ -449,7 +433,7 @@ scm_from_utf8_symboln (const char *sym, size_t len) len = strlen (sym); hash = scm_i_utf8_string_hash (sym, len); - ret = lookup_interned_utf8_symbol (sym, len, hash); + ret = lookup_interned_utf8_symbol ((const uint8_t *)sym, len, hash); if (scm_is_false (ret)) { SCM str = scm_from_utf8_stringn (sym, len); @@ -462,7 +446,7 @@ scm_from_utf8_symboln (const char *sym, size_t len) void scm_symbols_prehistory () { - symbols = scm_c_make_weak_set (5000); + symbols = scm_c_make_ephemeron_table (5000); }