mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-06-05 19:50:23 +02:00
* libguile/symbols.c: Rework the symbol table to be an ephemeron table instead of a weak set. It is no longer resizeable; getting that to work will involve some GC cooperation.
459 lines
11 KiB
C
459 lines
11 KiB
C
/* Copyright 1995-1998,2000-2001,2003-2004,2006,2009,2011,2013,2015,2018,2022,2023,2025
|
||
Free Software Foundation, Inc.
|
||
|
||
This file is part of Guile.
|
||
|
||
Guile is free software: you can redistribute it and/or modify it
|
||
under the terms of the GNU Lesser General Public License as published
|
||
by the Free Software Foundation, either version 3 of the License, or
|
||
(at your option) any later version.
|
||
|
||
Guile is distributed in the hope that it will be useful, but WITHOUT
|
||
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
License for more details.
|
||
|
||
You should have received a copy of the GNU Lesser General Public
|
||
License along with Guile. If not, see
|
||
<https://www.gnu.org/licenses/>. */
|
||
|
||
|
||
|
||
#ifdef HAVE_CONFIG_H
|
||
# include <config.h>
|
||
#endif
|
||
|
||
#include <string.h>
|
||
#include <unistr.h>
|
||
|
||
#include "alist.h"
|
||
#include "boolean.h"
|
||
#include "chars.h"
|
||
#include "ephemerons.h"
|
||
#include "eval.h"
|
||
#include "fluids.h"
|
||
#include "gsubr.h"
|
||
#include "hash.h"
|
||
#include "list.h"
|
||
#include "modules.h"
|
||
#include "numbers.h"
|
||
#include "pairs.h"
|
||
#include "private-options.h"
|
||
#include "read.h"
|
||
#include "smob.h"
|
||
#include "srfi-13.h"
|
||
#include "strings.h"
|
||
#include "strorder.h"
|
||
#include "threads.h"
|
||
#include "variable.h"
|
||
#include "vectors.h"
|
||
|
||
#include "symbols.h"
|
||
|
||
|
||
|
||
|
||
/* The global symbol table.  Interned symbols are stored as the keys of
   ephemerons chained in its buckets.  Allocated by
   scm_symbols_prehistory.  */
static struct scm_ephemeron_table *symbols;
|
||
|
||
#ifdef GUILE_DEBUG
SCM_DEFINE (scm_sys_symbols, "%symbols", 0, 0, 0,
            (),
            "Return the system symbol obarray.")
#define FUNC_NAME s_scm_sys_symbols
{
  /* Debug-only accessor: hand the global symbol table to Scheme via
     scm_from_ephemeron_table.  */
  SCM table = scm_from_ephemeron_table (symbols);

  return table;
}
#undef FUNC_NAME
#endif
|
||
|
||
|
||
|
||
/* {Symbols}
|
||
*/
|
||
|
||
unsigned long
|
||
scm_i_hash_symbol (SCM obj, unsigned long n, void *closure)
|
||
{
|
||
return scm_i_symbol_hash (obj) % n;
|
||
}
|
||
|
||
static int
|
||
symbol_equals_string (SCM sym, SCM str, size_t len, unsigned long hash)
|
||
{
|
||
if (scm_i_symbol_hash (sym) != hash)
|
||
return 0;
|
||
if (scm_i_symbol_length (sym) != len)
|
||
return 0;
|
||
|
||
for (size_t i = 0; i < len; i++)
|
||
if (scm_i_symbol_ref (sym, i) != scm_i_string_ref (str, i))
|
||
return 0;
|
||
|
||
return 1;
|
||
}
|
||
|
||
static int
|
||
symbol_equals_latin1_string (SCM sym, const char *str, size_t len,
|
||
unsigned long hash)
|
||
{
|
||
if (scm_i_symbol_hash (sym) != hash)
|
||
return 0;
|
||
if (scm_i_symbol_length (sym) != len)
|
||
return 0;
|
||
if (!scm_i_is_narrow_symbol (sym))
|
||
return 0;
|
||
|
||
return strncmp (scm_i_symbol_chars (sym), str, len) == 0;
|
||
}
|
||
|
||
static SCM
|
||
lookup_interned_latin1_symbol (const char *str, size_t len,
|
||
unsigned long raw_hash)
|
||
{
|
||
size_t bucket = raw_hash % scm_c_ephemeron_table_length (symbols);
|
||
for (struct gc_ephemeron *e = scm_c_ephemeron_table_ref (symbols, bucket);
|
||
e;
|
||
e = scm_c_ephemeron_next (e))
|
||
{
|
||
SCM sym = scm_c_ephemeron_key (e);
|
||
if (scm_is_true (sym)
|
||
&& symbol_equals_latin1_string (sym, str, len, raw_hash))
|
||
return sym;
|
||
}
|
||
return SCM_BOOL_F;
|
||
}
|
||
|
||
static int
|
||
utf8_string_equals_narrow_string (const uint8_t *utf8, size_t ulen,
|
||
const char *narrow)
|
||
{
|
||
/* Precondition: utf8,ulen is valid UTF-8. */
|
||
size_t byte_idx = 0;
|
||
|
||
while (byte_idx < ulen)
|
||
{
|
||
ucs4_t c = -1;
|
||
byte_idx += u8_mbtoucr (&c, utf8 + byte_idx, ulen - byte_idx);
|
||
if (c != *narrow)
|
||
return 0;
|
||
narrow++;
|
||
}
|
||
|
||
return 1;
|
||
}
|
||
|
||
static int
|
||
utf8_string_equals_wide_string (const uint8_t *utf8, size_t ulen,
|
||
const scm_t_wchar *wide)
|
||
{
|
||
/* Precondition: utf8,ulen is valid UTF-8. */
|
||
size_t byte_idx = 0;
|
||
|
||
while (byte_idx < ulen)
|
||
{
|
||
ucs4_t c = -1;
|
||
byte_idx += u8_mbtoucr (&c, utf8 + byte_idx, ulen - byte_idx);
|
||
if (c != *wide)
|
||
return 0;
|
||
wide++;
|
||
}
|
||
|
||
return 1;
|
||
}
|
||
|
||
static int
|
||
symbol_equals_utf8_string (SCM sym, const uint8_t *str, size_t len,
|
||
unsigned long hash, int codepoint_count)
|
||
{
|
||
if (scm_i_symbol_hash (sym) != hash)
|
||
return 0;
|
||
if (scm_i_symbol_length (sym) != codepoint_count)
|
||
return 0;
|
||
|
||
if (scm_i_is_narrow_symbol (sym))
|
||
return utf8_string_equals_narrow_string (str, len,
|
||
scm_i_symbol_chars (sym));
|
||
else
|
||
return utf8_string_equals_wide_string (str, len,
|
||
scm_i_symbol_wide_chars (sym));
|
||
}
|
||
|
||
static SCM
|
||
lookup_interned_utf8_symbol (const uint8_t *str, size_t len,
|
||
unsigned long raw_hash)
|
||
{
|
||
int codepoint_count = u8_mbsnlen (str, len);
|
||
if (codepoint_count == -1)
|
||
/* Bad UTF-8. */
|
||
return SCM_BOOL_F;
|
||
|
||
if (codepoint_count == len)
|
||
return lookup_interned_latin1_symbol ((const char *) str, len, raw_hash);
|
||
|
||
size_t bucket = raw_hash % scm_c_ephemeron_table_length (symbols);
|
||
for (struct gc_ephemeron *e = scm_c_ephemeron_table_ref (symbols, bucket);
|
||
e;
|
||
e = scm_c_ephemeron_next (e))
|
||
{
|
||
SCM sym = scm_c_ephemeron_key (e);
|
||
if (scm_is_true (sym)
|
||
&& symbol_equals_utf8_string (sym, str, len, raw_hash,
|
||
codepoint_count))
|
||
return sym;
|
||
}
|
||
return SCM_BOOL_F;
|
||
}
|
||
|
||
static SCM
|
||
scm_i_str2symbol (SCM str)
|
||
{
|
||
unsigned long raw_hash = scm_i_string_hash (str);
|
||
size_t bucket = raw_hash % scm_c_ephemeron_table_length (symbols);
|
||
size_t len = scm_i_string_length (str);
|
||
|
||
struct gc_ephemeron *chain = scm_c_ephemeron_table_ref (symbols, bucket);
|
||
/* First see if a symbol with this name is already interned. */
|
||
for (struct gc_ephemeron *e = chain; e; e = scm_c_ephemeron_next (e))
|
||
{
|
||
SCM sym = scm_c_ephemeron_key (e);
|
||
if (scm_is_true (sym) && symbol_equals_string (sym, str, len, raw_hash))
|
||
return sym;
|
||
}
|
||
|
||
/* The symbol was not found, create it. */
|
||
SCM sym = scm_i_make_symbol (str, 0, raw_hash);
|
||
struct gc_ephemeron *link = scm_c_make_ephemeron (sym, SCM_BOOL_T);
|
||
while (1)
|
||
{
|
||
struct gc_ephemeron *prev =
|
||
scm_c_ephemeron_table_try_push_x (symbols, bucket, link, chain);
|
||
if (prev == chain)
|
||
return sym;
|
||
/* Lost a race, someone else added a symbol in this bucket. Check
|
||
the chain and try again. */
|
||
chain = prev;
|
||
for (struct gc_ephemeron *e = chain; e; e = scm_c_ephemeron_next (e))
|
||
{
|
||
SCM sym = scm_c_ephemeron_key (e);
|
||
if (scm_is_true (sym)
|
||
&& symbol_equals_string (sym, str, len, raw_hash))
|
||
return sym;
|
||
}
|
||
}
|
||
}
|
||
|
||
static SCM
|
||
scm_i_str2uninterned_symbol (SCM str)
|
||
{
|
||
unsigned long raw_hash = scm_i_string_hash (str);
|
||
|
||
return scm_i_make_symbol (str, SCM_I_F_SYMBOL_UNINTERNED, raw_hash);
|
||
}
|
||
|
||
SCM_DEFINE (scm_symbol_p, "symbol?", 1, 0, 0,
            (SCM obj),
            "Return @code{#t} if @var{obj} is a symbol, otherwise return\n"
            "@code{#f}.")
#define FUNC_NAME s_scm_symbol_p
{
  /* Type predicate: no validation needed, any object is acceptable.  */
  int is_symbol = scm_is_symbol (obj);

  return scm_from_bool (is_symbol);
}
#undef FUNC_NAME
|
||
|
||
SCM_DEFINE (scm_symbol_interned_p, "symbol-interned?", 1, 0, 0,
            (SCM symbol),
            "Return @code{#t} if @var{symbol} is interned, otherwise return\n"
            "@code{#f}.")
#define FUNC_NAME s_scm_symbol_interned_p
{
  int interned;

  /* The argument must be a symbol before its flags are inspected.  */
  SCM_VALIDATE_SYMBOL (1, symbol);
  interned = scm_i_symbol_is_interned (symbol);

  return scm_from_bool (interned);
}
#undef FUNC_NAME
|
||
|
||
SCM_DEFINE (scm_make_symbol, "make-symbol", 1, 0, 0,
            (SCM name),
            "Return a new uninterned symbol with the name @var{name}. "
            "The returned symbol is guaranteed to be unique and future "
            "calls to @code{string->symbol} will not return it.")
#define FUNC_NAME s_scm_make_symbol
{
  SCM fresh;

  SCM_VALIDATE_STRING (1, name);

  /* Bypasses the symbol table entirely, so the result is never shared
     with an interned symbol of the same name.  */
  fresh = scm_i_str2uninterned_symbol (name);

  return fresh;
}
#undef FUNC_NAME
|
||
|
||
SCM_DEFINE (scm_symbol_to_string, "symbol->string", 1, 0, 0,
            (SCM s),
            "Return the name of @var{s} as a string. The resulting\n"
            "string is immutable.")
#define FUNC_NAME s_scm_symbol_to_string
{
  SCM_VALIDATE_SYMBOL (1, s);

  /* The name is the substring covering the symbol's full length.  */
  return scm_i_symbol_substring (s, 0, scm_i_symbol_length (s));
}
#undef FUNC_NAME
|
||
|
||
|
||
SCM_DEFINE (scm_string_to_symbol, "string->symbol", 1, 0, 0,
            (SCM string),
            "Return the symbol whose name is @var{string}.")
#define FUNC_NAME s_scm_string_to_symbol
{
  SCM interned;

  SCM_VALIDATE_STRING (1, string);

  /* Finds the existing interned symbol or interns a new one.  */
  interned = scm_i_str2symbol (string);

  return interned;
}
#undef FUNC_NAME
|
||
|
||
SCM_DEFINE (scm_string_ci_to_symbol, "string-ci->symbol", 1, 0, 0,
            (SCM str),
            "Return the symbol whose name is @var{str}.  @var{str} is\n"
            "converted to lowercase before the conversion is done, if Guile\n"
            "is currently reading symbols case-insensitively.")
#define FUNC_NAME s_scm_string_ci_to_symbol
{
  SCM name = str;

  /* Only fold case when the case-insensitive option is enabled.  */
  if (SCM_CASE_INSENSITIVE_P)
    name = scm_string_downcase (str);

  return scm_string_to_symbol (name);
}
#undef FUNC_NAME
|
||
|
||
/* The default prefix for `gensym'd symbols. */
|
||
/* Assigned the string " g" by scm_init_symbols; scm_gensym falls back
   to it when called without a prefix argument.  */
static SCM default_gensym_prefix;
|
||
|
||
/* NOTE(review): nothing in the visible part of this file references
   this macro -- possibly a leftover; confirm before removing.  */
#define MAX_PREFIX_LENGTH 30
|
||
|
||
SCM_DEFINE (scm_gensym, "gensym", 0, 1, 0,
            (SCM prefix),
            "Create a new symbol with a name constructed from a prefix and\n"
            "a counter value. The string @var{prefix} can be specified as\n"
            "an optional argument. Default prefix is @code{ g}. The counter\n"
            "is increased by 1 at each call. There is no provision for\n"
            "resetting the counter.")
#define FUNC_NAME s_scm_gensym
{
  static int gensym_counter = 0;

  char digits[SCM_INTBUFLEN];
  int serial;

  if (SCM_UNBNDP (prefix))
    prefix = default_gensym_prefix;

  /* Take the next counter value under the mutex so that two threads
     calling gensym at the same moment never get the same number.  */
  scm_i_scm_pthread_mutex_lock (&scm_i_misc_mutex);
  serial = gensym_counter++;
  scm_i_pthread_mutex_unlock (&scm_i_misc_mutex);

  /* The symbol name is PREFIX followed by the counter in decimal.  */
  int digit_count = scm_iint2str (serial, 10, digits);
  SCM suffix = scm_from_latin1_stringn (digits, digit_count);
  SCM name = scm_string_append (scm_list_2 (prefix, suffix));

  return scm_string_to_symbol (name);
}
#undef FUNC_NAME
|
||
|
||
SCM_DEFINE (scm_symbol_hash, "symbol-hash", 1, 0, 0,
            (SCM symbol),
            "Return a hash value for @var{symbol}.")
#define FUNC_NAME s_scm_symbol_hash
{
  unsigned long hash;

  SCM_VALIDATE_SYMBOL (1, symbol);

  /* The hash is read from the symbol and converted to a Scheme
     integer.  */
  hash = scm_i_symbol_hash (symbol);

  return scm_from_ulong (hash);
}
#undef FUNC_NAME
|
||
|
||
SCM
|
||
scm_from_locale_symbol (const char *sym)
|
||
{
|
||
return scm_from_locale_symboln (sym, -1);
|
||
}
|
||
|
||
SCM
|
||
scm_from_locale_symboln (const char *sym, size_t len)
|
||
{
|
||
SCM str = scm_from_locale_stringn (sym, len);
|
||
return scm_i_str2symbol (str);
|
||
}
|
||
|
||
SCM
|
||
scm_take_locale_symboln (char *sym, size_t len)
|
||
{
|
||
SCM str;
|
||
|
||
str = scm_take_locale_stringn (sym, len);
|
||
return scm_i_str2symbol (str);
|
||
}
|
||
|
||
SCM
|
||
scm_take_locale_symbol (char *sym)
|
||
{
|
||
return scm_take_locale_symboln (sym, (size_t)-1);
|
||
}
|
||
|
||
SCM
|
||
scm_from_latin1_symbol (const char *sym)
|
||
{
|
||
return scm_from_latin1_symboln (sym, -1);
|
||
}
|
||
|
||
SCM
|
||
scm_from_latin1_symboln (const char *sym, size_t len)
|
||
{
|
||
unsigned long hash;
|
||
SCM ret;
|
||
|
||
if (len == (size_t) -1)
|
||
len = strlen (sym);
|
||
hash = scm_i_latin1_string_hash (sym, len);
|
||
|
||
ret = lookup_interned_latin1_symbol (sym, len, hash);
|
||
if (scm_is_false (ret))
|
||
{
|
||
SCM str = scm_from_latin1_stringn (sym, len);
|
||
ret = scm_i_str2symbol (str);
|
||
}
|
||
|
||
return ret;
|
||
}
|
||
|
||
SCM
|
||
scm_from_utf8_symbol (const char *sym)
|
||
{
|
||
return scm_from_utf8_symboln (sym, -1);
|
||
}
|
||
|
||
SCM
|
||
scm_from_utf8_symboln (const char *sym, size_t len)
|
||
{
|
||
unsigned long hash;
|
||
SCM ret;
|
||
|
||
if (len == (size_t) -1)
|
||
len = strlen (sym);
|
||
hash = scm_i_utf8_string_hash (sym, len);
|
||
|
||
ret = lookup_interned_utf8_symbol ((const uint8_t *)sym, len, hash);
|
||
if (scm_is_false (ret))
|
||
{
|
||
SCM str = scm_from_utf8_stringn (sym, len);
|
||
ret = scm_i_str2symbol (str);
|
||
}
|
||
|
||
return ret;
|
||
}
|
||
|
||
void
|
||
scm_symbols_prehistory ()
|
||
{
|
||
symbols = scm_c_make_ephemeron_table (5000);
|
||
}
|
||
|
||
|
||
/* Module initialization: pull in the generated "symbols.x" contents and
   set up the default gensym prefix.  */
void
scm_init_symbols (void)
{
#include "symbols.x"

  default_gensym_prefix = scm_from_latin1_string (" g");
}
|