1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-05-02 04:40:29 +02:00
guile/libguile/hash.c
Jan (janneke) Nieuwenhuizen a043eaf349 Support for x86_64-w64-mingw32.
On x86-64-MinGW the size of long is 4.  As long is used for
SCM_FIXNUM_BIT, that would mean incompatible .go files, and waste of
cell space.  So we would like to use long long, but the GMP interface
uses long.

To get around this, the x86-64-MinGW port now requires the use of
mini-GMP.  Mini-GMP has been changed to use intptr_t and uintptr_t.

Likewise, "integers.{h,c}" and "numbers.{h,c}" now use intptr_t instead
of scm_t_inum or long, and uintptr_t instead of unsigned long.

* configure.ac: When x86_64-w64-mingw32, require mini-GMP.
* libguile/mini-gmp.h: Use intptr_t instead of long, uintptr_t instead
of unsigned long throughout.
* libguile/mini-gmp.c: Likewise.
* libguile/scm.h (SCM_INTPTR_T_BIT): New define.
* libguile/numbers.h (SCM_FIXNUM_BIT): Use it.
* libguile/numbers.c (L1, UL1): New macros.  Use them thoughout instead
of 1L, 1UL.
(verify): Use SCM_INTPTR_T_BIT.
(verify): Use SCM_INTPTR_T_MAX and SCM_INTPTR_T_MIN.
(scm_from_inum): Remove macro.
Use intptr_t and uintptr_t instead of scm_t_inum or long, and unsigned
long.
* libguile/numbers.h (scm_from_intptr, scm_from_uintptr, scm_to_intptr,
scm_to_uintptr): New defines.
* libguile/integers.h: Use intptr_t and uintptr_t instead of scm_t_inum
and unsigned long.
* libguile/integers.c (L1) : New macro.  Use it thoughout instead of 1L.
Use intptr_t and uintptr_t instead of long and unsigned long.
(long_magnitude): Rename to...
(intptr_t_magnitude): ...this.  Use intptr_t, uintptr_t.
(negative_long): Rename to...
(negative_t_intptr): ...this.  Use uintptr_t, INTPTR_MIN.
(inum_magnitude): Use intptr_t.
(ulong_to_bignum): Rename to...
(uintptr_t_to_bignum): ...this.  Use uintptr_t.
(long_to_bignum): Rename to...
(intptr_t_to_bignum): ...this.  Use intptr_t.
(long_to_scm): Rename to...
(intptr_t_to_scm): ...this.  Use intptr_to_bignum.
(ulong_to_scm): Rename to...
(uintptr_t_to_scm): ...this.  Use uintptr_to_bignum.
(long_sign): Rename to..
(intptr_t_sign): ...this.  Use SCM_SIZEOF_INTPTR_T.
(bignum_cmp_long): Rename to...
(bignum_cmp_intptr_t): ...this.  Use uintptr_t.
* libguile/array-map.c (array_compare): Use uintptr_t instead of
unsigned long and intptr_t instead of long.
* libguile/arrays.c (make-shared-array): Use ssize_t instead of long.
* libguile/bytevectors.c (is_signed_int32, is_unsigned_int32)
[MINGW32 && __x86_64__]: Use ULL.
(twos_complement): Use uintptr_t instead of unsigned long.
* libguile/hash.c (JENKINS_LOOKUP3_HASHWORD2): Likewise.
(narrow_string_hash, wide_string_hash, scm_i_string_hash,
scm_i_locale_string_hash, scm_i_latin1_string_hash,
scm_i_utf8_string_hash, scm_i_struct_hash, scm_raw_ihashq,
scm_raw_ihash): Use and return uintptr_t instead of unsigned long.
(scm_hashv, scm_hash): Use SCM_UINTPTR_T_MAX.
* libguile/hash.h (scm_i_locale_string_hash, scm_i_latin1_string_hash,
scm_i_utf8_string_hash): update prototypes.
* libguile/scmsigs.c (sigaction): Use intptr_t instead of long.
* libguile/strings.c (scm_i_make_symbol, (scm_i_c_make_symbol): Use
uintptr_t instead of unsigned long.
* libguile/strings.h (scm_i_make_symbol, (scm_i_c_make_symbol): Update
declacations.
* libguile/srfi-60.c: Use scm_to_uintptr, scm_from_intptr and variants
throughout.
* libguile/symbols.c (symbol-hash): Use scm_from_uintptr.

Co-authored-by: Mike Gran <spk121@yahoo.com>
Co-authored-by: Andy Wingo <wingo@pobox.com>
2025-03-30 17:57:12 -07:00

475 lines
14 KiB
C
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* Copyright 1995-1997,2000-2001,2003-2004,2006,2008-2015,2017-2018,2020,2023
Free Software Foundation, Inc.
This file is part of Guile.
Guile is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Guile is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public
License along with Guile. If not, see
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <wchar.h>
#include <math.h>
#include <string.h>
#include <unistr.h>
#include "chars.h"
#include "foreign.h"
#include "gsubr.h"
#include "keywords.h"
#include "numbers.h"
#include "pairs.h"
#include "ports.h"
#include "strings.h"
#include "struct.h"
#include "symbols.h"
#include "syntax.h"
#include "vectors.h"
#include "hash.h"
#ifndef floor
extern double floor();
#endif
/* This hash function is originally from
http://burtleburtle.net/bob/c/lookup3.c by Bob Jenkins, May 2006,
Public Domain. No warranty. */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
#define mix(a,b,c) \
{ \
a -= c; a ^= rot(c, 4); c += b; \
b -= a; b ^= rot(a, 6); a += c; \
c -= b; c ^= rot(b, 8); b += a; \
a -= c; a ^= rot(c,16); c += b; \
b -= a; b ^= rot(a,19); a += c; \
c -= b; c ^= rot(b, 4); b += a; \
}
#define final(a,b,c) \
{ \
c ^= b; c -= rot(b,14); \
a ^= c; a -= rot(c,11); \
b ^= a; b -= rot(a,25); \
c ^= b; c -= rot(b,16); \
a ^= c; a -= rot(c,4); \
b ^= a; b -= rot(a,14); \
c ^= b; c -= rot(b,24); \
}
#define JENKINS_LOOKUP3_HASHWORD2(k, length, ret) \
do { \
uint32_t a, b, c; \
\
/* Set up the internal state. */ \
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47; \
\
/* Handle most of the key. */ \
while (length > 3) \
{ \
a += k[0]; \
b += k[1]; \
c += k[2]; \
mix (a, b, c); \
length -= 3; \
k += 3; \
} \
\
/* Handle the last 3 elements. */ \
switch(length) /* All the case statements fall through. */ \
{ \
case 3 : c += k[2]; \
case 2 : b += k[1]; \
case 1 : a += k[0]; \
final (a, b, c); \
case 0: /* case 0: nothing left to add */ \
break; \
} \
\
/* Scheme can access symbol-hash, which exposes this value. For \
cross-compilation reasons, we ensure that the high 32 bits of \
the hash on a 64-bit system are equal to the hash on a 32-bit \
system. The low 32 bits just add more entropy. */ \
if (sizeof (ret) == 8) \
ret = (((uintptr_t) c) << 32) | b; \
else \
ret = c; \
} while (0)
static uintptr_t
narrow_string_hash (const uint8_t *str, size_t len)
{
uintptr_t ret;
JENKINS_LOOKUP3_HASHWORD2 (str, len, ret);
ret >>= 2; /* Ensure that it fits in a fixnum. */
return ret;
}
static uintptr_t
wide_string_hash (const scm_t_wchar *str, size_t len)
{
uintptr_t ret;
JENKINS_LOOKUP3_HASHWORD2 (str, len, ret);
ret >>= 2; /* Ensure that it fits in a fixnum. */
return ret;
}
uintptr_t
scm_i_string_hash (SCM str)
{
size_t len = scm_i_string_length (str);
if (scm_i_is_narrow_string (str))
return narrow_string_hash ((const uint8_t *) scm_i_string_chars (str),
len);
else
return wide_string_hash (scm_i_string_wide_chars (str), len);
}
uintptr_t
scm_i_locale_string_hash (const char *str, size_t len)
{
return scm_i_string_hash (scm_from_locale_stringn (str, len));
}
uintptr_t
scm_i_latin1_string_hash (const char *str, size_t len)
{
if (len == (size_t) -1)
len = strlen (str);
return narrow_string_hash ((const uint8_t *) str, len);
}
/* A tricky optimization, but probably worth it. */
uintptr_t
scm_i_utf8_string_hash (const char *str, size_t len)
{
const uint8_t *end, *ustr = (const uint8_t *) str;
uintptr_t ret;
/* The length of the string in characters. This name corresponds to
Jenkins' original name. */
size_t length;
uint32_t a, b, c, u32;
if (len == (size_t) -1)
len = strlen (str);
// FIXME: eventually make fewer passes over str
const uint8_t *ustr = (const uint8_t *) str;
if (u8_check (ustr, len) != NULL)
/* Invalid UTF-8; punt. */
return scm_i_string_hash (scm_from_utf8_stringn (str, len));
/* The length of the string in characters. This name corresponds to
Jenkins' original name. */
size_t length = u8_mbsnlen (ustr, len);
if (len == length) // ascii, same as narrow_string_hash above
return narrow_string_hash ((uint8_t *) str, len);
const uint8_t * const end = ustr + len;
uint32_t a, b, c, u32;
uintptr_t ret;
/* Set up the internal state. */
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47;
/* Handle most of the key. */
while (length > 3)
{
ustr += u8_mbtouc (&u32, ustr, end - ustr);
a += u32;
ustr += u8_mbtouc (&u32, ustr, end - ustr);
b += u32;
ustr += u8_mbtouc (&u32, ustr, end - ustr);
c += u32;
mix (a, b, c);
length -= 3;
}
// Similar to narrow_string_hash(). Handle the last 3 chars; length
// cannot be zero because len != length above.
ustr += u8_mbtouc (&u32, ustr, end - ustr);
a += u32;
if (length > 1)
{
ustr += u8_mbtouc (&u32, ustr, end - ustr);
b += u32;
if (length > 2)
{
ustr += u8_mbtouc (&u32, ustr, end - ustr);
c += u32;
}
}
final (a, b, c);
if (sizeof (uintptr_t) == 8)
ret = (((uintptr_t) c) << 32) | b;
else
ret = c;
ret >>= 2; /* Ensure that it fits in a fixnum. */
return ret;
}
static uintptr_t scm_raw_ihashq (scm_t_bits key);
static uintptr_t scm_raw_ihash (SCM obj, size_t depth);
/* Return the hash of struct OBJ. Traverse OBJ's fields to compute the
result, unless DEPTH is zero. Assumes that OBJ is a struct. */
static uintptr_t
scm_i_struct_hash (SCM obj, size_t depth)
{
size_t struct_size, field_num;
uintptr_t hash;
struct_size = SCM_STRUCT_SIZE (obj);
hash = scm_raw_ihashq (SCM_UNPACK (SCM_STRUCT_VTABLE (obj)));
if (depth > 0)
{
for (field_num = 0; field_num < struct_size; field_num++)
if (SCM_STRUCT_FIELD_IS_UNBOXED (obj, field_num))
hash ^= scm_raw_ihashq (SCM_STRUCT_DATA_REF (obj, field_num));
else
hash ^= scm_raw_ihash (SCM_STRUCT_SLOT_REF (obj, field_num),
depth / 2);
}
return hash;
}
/* Thomas Wang's integer hasher, from
http://www.cris.com/~Ttwang/tech/inthash.htm. */
static uintptr_t
scm_raw_ihashq (scm_t_bits key)
{
if (sizeof (key) < 8)
{
key = (key ^ 61) ^ (key >> 16);
key = key + (key << 3);
key = key ^ (key >> 4);
key = key * 0x27d4eb2d;
key = key ^ (key >> 15);
}
else
{
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
key = (key + (key << 3)) + (key << 8); // key * 265
key = key ^ (key >> 14);
key = (key + (key << 2)) + (key << 4); // key * 21
key = key ^ (key >> 28);
key = key + (key << 31);
}
key >>= 2; /* Ensure that it fits in a fixnum. */
return key;
}
/* `depth' is used to limit recursion. */
static uintptr_t
scm_raw_ihash (SCM obj, size_t depth)
{
if (SCM_IMP (obj))
return scm_raw_ihashq (SCM_UNPACK (obj));
switch (SCM_TYP7(obj))
{
/* FIXME: do better for structs, variables, ... Also the hashes
are currently associative, which ain't the right thing. */
case scm_tc7_smob:
return scm_raw_ihashq (SCM_TYP16 (obj));
case scm_tc7_number:
if (scm_is_integer (obj))
{
SCM n = SCM_I_MAKINUM (SCM_MOST_POSITIVE_FIXNUM);
if (scm_is_inexact (obj))
obj = scm_inexact_to_exact (obj);
return scm_raw_ihashq (scm_to_uintptr_t (scm_modulo (obj, n)));
}
else
return scm_i_string_hash (scm_number_to_string (obj, scm_from_int (10)));
case scm_tc7_string:
return scm_i_string_hash (obj);
case scm_tc7_symbol:
return scm_i_symbol_hash (obj);
case scm_tc7_keyword:
return SCM_I_KEYWORD_HASH (obj);
case scm_tc7_pointer:
return scm_raw_ihashq ((uintptr_t) SCM_POINTER_VALUE (obj));
case scm_tc7_wvect:
case scm_tc7_vector:
{
size_t len = SCM_SIMPLE_VECTOR_LENGTH (obj);
size_t i = depth / 2;
uintptr_t h = scm_raw_ihashq (SCM_CELL_WORD_0 (obj));
if (len)
while (i--)
h ^= scm_raw_ihash (scm_c_vector_ref (obj, h % len), i);
return h;
}
case scm_tc7_syntax:
{
uintptr_t h;
h = scm_raw_ihash (scm_syntax_expression (obj), depth);
h ^= scm_raw_ihash (scm_syntax_wrap (obj), depth);
h ^= scm_raw_ihash (scm_syntax_module (obj), depth);
return h;
}
/* The following tc7s have no 'equal?' implementation. Thus, just
fall back to 'hashq'. */
case scm_tc7_variable:
case scm_tc7_hashtable:
case scm_tc7_fluid:
case scm_tc7_dynamic_state:
case scm_tc7_frame:
case scm_tc7_atomic_box:
case scm_tc7_program:
case scm_tc7_vm_cont:
case scm_tc7_weak_set:
case scm_tc7_weak_table:
case scm_tc7_port:
return scm_raw_ihashq (SCM_UNPACK (obj));
case scm_tcs_cons_imcar:
case scm_tcs_cons_nimcar:
if (depth)
return (scm_raw_ihash (SCM_CAR (obj), depth / 2)
^ scm_raw_ihash (SCM_CDR (obj), depth / 2));
else
return scm_raw_ihashq (scm_tc3_cons);
case scm_tcs_struct:
return scm_i_struct_hash (obj, depth);
default:
return scm_raw_ihashq (SCM_CELL_WORD_0 (obj));
}
}
uintptr_t
scm_ihashq (SCM obj, uintptr_t n)
{
return scm_raw_ihashq (SCM_UNPACK (obj)) % n;
}
SCM_DEFINE (scm_hashq, "hashq", 2, 0, 0,
(SCM key, SCM size),
"Determine a hash value for @var{key} that is suitable for\n"
"lookups in a hashtable of size @var{size}, where @code{eq?} is\n"
"used as the equality predicate. The function returns an\n"
"integer in the range 0 to @var{size} - 1. Note that\n"
"@code{hashq} may use internal addresses. Thus two calls to\n"
"hashq where the keys are @code{eq?} are not guaranteed to\n"
"deliver the same value if the key object gets garbage collected\n"
"in between. This can happen, for example with symbols:\n"
"@code{(hashq 'foo n) (gc) (hashq 'foo n)} may produce two\n"
"different values, since @code{foo} will be garbage collected.")
#define FUNC_NAME s_scm_hashq
{
uintptr_t sz = scm_to_unsigned_integer (size, 1, UINTPTR_MAX);
return scm_from_unsigned_integer (scm_ihashq (key, sz));
}
#undef FUNC_NAME
uintptr_t
scm_ihashv (SCM obj, uintptr_t n)
{
if (SCM_NUMP(obj))
return scm_raw_ihash (obj, 10) % n;
else
return scm_raw_ihashq (SCM_UNPACK (obj)) % n;
}
SCM_DEFINE (scm_hashv, "hashv", 2, 0, 0,
(SCM key, SCM size),
"Determine a hash value for @var{key} that is suitable for\n"
"lookups in a hashtable of size @var{size}, where @code{eqv?} is\n"
"used as the equality predicate. The function returns an\n"
"integer in the range 0 to @var{size} - 1. Note that\n"
"@code{(hashv key)} may use internal addresses. Thus two calls\n"
"to hashv where the keys are @code{eqv?} are not guaranteed to\n"
"deliver the same value if the key object gets garbage collected\n"
"in between. This can happen, for example with symbols:\n"
"@code{(hashv 'foo n) (gc) (hashv 'foo n)} may produce two\n"
"different values, since @code{foo} will be garbage collected.")
#define FUNC_NAME s_scm_hashv
{
uintptr_t sz = scm_to_unsigned_integer (size, 1, UINTPTR_MAX);
<<<<<<< HEAD
return scm_from_ulong (scm_ihashv (key, sz));
=======
return scm_from_unsigned_integer (scm_ihashv (key, sz));
>>>>>>> 64dca4d57 (Store hashes as uintptr_t)
}
#undef FUNC_NAME
uintptr_t
scm_ihash (SCM obj, uintptr_t n)
{
return scm_raw_ihash (obj, 10) % n;
}
SCM_DEFINE (scm_hash, "hash", 2, 0, 0,
(SCM key, SCM size),
"Determine a hash value for @var{key} that is suitable for\n"
"lookups in a hashtable of size @var{size}, where @code{equal?}\n"
"is used as the equality predicate. The function returns an\n"
"integer in the range 0 to @var{size} - 1.")
#define FUNC_NAME s_scm_hash
{
uintptr_t sz = scm_to_unsigned_integer (size, 1, UINTPTR_MAX);
return scm_from_ulong (scm_ihash (key, sz));
return scm_from_unsigned_integer (scm_ihash (key, sz));
}
#undef FUNC_NAME
void
scm_init_hash ()
{
#include "hash.x"
}