mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-29 19:30:36 +02:00
When the length is zero, the previous code would include the byte after the end of the string in the hash. Fix that (the wide and narrow hashers also guard against it via "case 0"), and don't bother mutating length for the trailing bytes. Since we already compute the char length, use that to detect all ASCII strings and follow the same narrow string path that we do for latin-1. libguile/hash.c (scm_i_utf8_string_hash): avoid overrun when len == 0.
463 lines
14 KiB
C
463 lines
14 KiB
C
/* Copyright 1995-1997,2000-2001,2003-2004,2006,2008-2015,2017-2018,2020,2023
|
||
Free Software Foundation, Inc.
|
||
|
||
This file is part of Guile.
|
||
|
||
Guile is free software: you can redistribute it and/or modify it
|
||
under the terms of the GNU Lesser General Public License as published
|
||
by the Free Software Foundation, either version 3 of the License, or
|
||
(at your option) any later version.
|
||
|
||
Guile is distributed in the hope that it will be useful, but WITHOUT
|
||
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
License for more details.
|
||
|
||
You should have received a copy of the GNU Lesser General Public
|
||
License along with Guile. If not, see
|
||
<https://www.gnu.org/licenses/>. */
|
||
|
||
|
||
|
||
|
||
#ifdef HAVE_CONFIG_H
|
||
# include <config.h>
|
||
#endif
|
||
|
||
#include <wchar.h>
|
||
#include <math.h>
|
||
#include <string.h>
|
||
#include <unistr.h>
|
||
|
||
#include "chars.h"
|
||
#include "foreign.h"
|
||
#include "gsubr.h"
|
||
#include "keywords.h"
|
||
#include "numbers.h"
|
||
#include "pairs.h"
|
||
#include "ports.h"
|
||
#include "strings.h"
|
||
#include "struct.h"
|
||
#include "symbols.h"
|
||
#include "syntax.h"
|
||
#include "vectors.h"
|
||
|
||
#include "hash.h"
|
||
|
||
|
||
|
||
|
||
#ifndef floor
|
||
extern double floor();
|
||
#endif
|
||
|
||
|
||
/* This hash function is originally from
|
||
http://burtleburtle.net/bob/c/lookup3.c by Bob Jenkins, May 2006,
|
||
Public Domain. No warranty. */
|
||
|
||
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
|
||
#define mix(a,b,c) \
|
||
{ \
|
||
a -= c; a ^= rot(c, 4); c += b; \
|
||
b -= a; b ^= rot(a, 6); a += c; \
|
||
c -= b; c ^= rot(b, 8); b += a; \
|
||
a -= c; a ^= rot(c,16); c += b; \
|
||
b -= a; b ^= rot(a,19); a += c; \
|
||
c -= b; c ^= rot(b, 4); b += a; \
|
||
}
|
||
|
||
#define final(a,b,c) \
|
||
{ \
|
||
c ^= b; c -= rot(b,14); \
|
||
a ^= c; a -= rot(c,11); \
|
||
b ^= a; b -= rot(a,25); \
|
||
c ^= b; c -= rot(b,16); \
|
||
a ^= c; a -= rot(c,4); \
|
||
b ^= a; b -= rot(a,14); \
|
||
c ^= b; c -= rot(b,24); \
|
||
}
|
||
|
||
#define JENKINS_LOOKUP3_HASHWORD2(k, length, ret) \
|
||
do { \
|
||
uint32_t a, b, c; \
|
||
\
|
||
/* Set up the internal state. */ \
|
||
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47; \
|
||
\
|
||
/* Handle most of the key. */ \
|
||
while (length > 3) \
|
||
{ \
|
||
a += k[0]; \
|
||
b += k[1]; \
|
||
c += k[2]; \
|
||
mix (a, b, c); \
|
||
length -= 3; \
|
||
k += 3; \
|
||
} \
|
||
\
|
||
/* Handle the last 3 elements. */ \
|
||
switch(length) /* All the case statements fall through. */ \
|
||
{ \
|
||
case 3 : c += k[2]; \
|
||
case 2 : b += k[1]; \
|
||
case 1 : a += k[0]; \
|
||
final (a, b, c); \
|
||
case 0: /* case 0: nothing left to add */ \
|
||
break; \
|
||
} \
|
||
\
|
||
/* Scheme can access symbol-hash, which exposes this value. For \
|
||
cross-compilation reasons, we ensure that the high 32 bits of \
|
||
the hash on a 64-bit system are equal to the hash on a 32-bit \
|
||
system. The low 32 bits just add more entropy. */ \
|
||
if (sizeof (ret) == 8) \
|
||
ret = (((unsigned long) c) << 32) | b; \
|
||
else \
|
||
ret = c; \
|
||
} while (0)
|
||
|
||
|
||
static unsigned long
|
||
narrow_string_hash (const uint8_t *str, size_t len)
|
||
{
|
||
unsigned long ret;
|
||
JENKINS_LOOKUP3_HASHWORD2 (str, len, ret);
|
||
ret >>= 2; /* Ensure that it fits in a fixnum. */
|
||
return ret;
|
||
}
|
||
|
||
static unsigned long
|
||
wide_string_hash (const scm_t_wchar *str, size_t len)
|
||
{
|
||
unsigned long ret;
|
||
JENKINS_LOOKUP3_HASHWORD2 (str, len, ret);
|
||
ret >>= 2; /* Ensure that it fits in a fixnum. */
|
||
return ret;
|
||
}
|
||
|
||
/* If you change this to a different hash, also update (language cps
|
||
guile-vm). */
|
||
unsigned long
|
||
scm_i_string_hash (SCM str)
|
||
{
|
||
size_t len = scm_i_string_length (str);
|
||
|
||
if (scm_i_is_narrow_string (str))
|
||
return narrow_string_hash ((const uint8_t *) scm_i_string_chars (str),
|
||
len);
|
||
else
|
||
return wide_string_hash (scm_i_string_wide_chars (str), len);
|
||
}
|
||
|
||
unsigned long
|
||
scm_i_locale_string_hash (const char *str, size_t len)
|
||
{
|
||
return scm_i_string_hash (scm_from_locale_stringn (str, len));
|
||
}
|
||
|
||
unsigned long
|
||
scm_i_latin1_string_hash (const char *str, size_t len)
|
||
{
|
||
if (len == (size_t) -1)
|
||
len = strlen (str);
|
||
|
||
return narrow_string_hash ((const uint8_t *) str, len);
|
||
}
|
||
|
||
/* A tricky optimization, but probably worth it. */
|
||
unsigned long
|
||
scm_i_utf8_string_hash (const char *str, size_t len)
|
||
{
|
||
if (len == (size_t) -1)
|
||
len = strlen (str);
|
||
|
||
// FIXME: eventually make fewer passes over str
|
||
|
||
const uint8_t *ustr = (const uint8_t *) str;
|
||
if (u8_check (ustr, len) != NULL)
|
||
/* Invalid UTF-8; punt. */
|
||
return scm_i_string_hash (scm_from_utf8_stringn (str, len));
|
||
|
||
/* The length of the string in characters. This name corresponds to
|
||
Jenkins' original name. */
|
||
size_t length = u8_mbsnlen (ustr, len);
|
||
|
||
if (len == length) // ascii, same as narrow_string_hash above
|
||
return narrow_string_hash ((uint8_t *) str, len);
|
||
|
||
const uint8_t * const end = ustr + len;
|
||
uint32_t a, b, c, u32;
|
||
unsigned long ret;
|
||
|
||
/* Set up the internal state. */
|
||
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47;
|
||
|
||
/* Handle most of the key. */
|
||
while (length > 3)
|
||
{
|
||
ustr += u8_mbtouc (&u32, ustr, end - ustr);
|
||
a += u32;
|
||
ustr += u8_mbtouc (&u32, ustr, end - ustr);
|
||
b += u32;
|
||
ustr += u8_mbtouc (&u32, ustr, end - ustr);
|
||
c += u32;
|
||
mix (a, b, c);
|
||
length -= 3;
|
||
}
|
||
|
||
// Similar to narrow_string_hash(). Handle the last 3 chars; length
|
||
// cannot be zero because len != length above.
|
||
ustr += u8_mbtouc (&u32, ustr, end - ustr);
|
||
a += u32;
|
||
if (length > 1)
|
||
{
|
||
ustr += u8_mbtouc (&u32, ustr, end - ustr);
|
||
b += u32;
|
||
if (length > 2)
|
||
{
|
||
ustr += u8_mbtouc (&u32, ustr, end - ustr);
|
||
c += u32;
|
||
}
|
||
}
|
||
|
||
final (a, b, c);
|
||
|
||
if (sizeof (unsigned long) == 8)
|
||
ret = (((unsigned long) c) << 32) | b;
|
||
else
|
||
ret = c;
|
||
|
||
ret >>= 2; /* Ensure that it fits in a fixnum. */
|
||
return ret;
|
||
}
|
||
|
||
static unsigned long scm_raw_ihashq (scm_t_bits key);
|
||
static unsigned long scm_raw_ihash (SCM obj, size_t depth);
|
||
|
||
/* Return the hash of struct OBJ. Traverse OBJ's fields to compute the
|
||
result, unless DEPTH is zero. Assumes that OBJ is a struct. */
|
||
static unsigned long
|
||
scm_i_struct_hash (SCM obj, size_t depth)
|
||
{
|
||
size_t struct_size, field_num;
|
||
unsigned long hash;
|
||
|
||
struct_size = SCM_STRUCT_SIZE (obj);
|
||
|
||
hash = scm_raw_ihashq (SCM_UNPACK (SCM_STRUCT_VTABLE (obj)));
|
||
if (depth > 0)
|
||
{
|
||
for (field_num = 0; field_num < struct_size; field_num++)
|
||
if (SCM_STRUCT_FIELD_IS_UNBOXED (obj, field_num))
|
||
hash ^= scm_raw_ihashq (SCM_STRUCT_DATA_REF (obj, field_num));
|
||
else
|
||
hash ^= scm_raw_ihash (SCM_STRUCT_SLOT_REF (obj, field_num),
|
||
depth / 2);
|
||
}
|
||
|
||
return hash;
|
||
}
|
||
|
||
/* Thomas Wang's integer hasher, from
|
||
http://www.cris.com/~Ttwang/tech/inthash.htm. */
|
||
static unsigned long
|
||
scm_raw_ihashq (scm_t_bits key)
|
||
{
|
||
if (sizeof (key) < 8)
|
||
{
|
||
key = (key ^ 61) ^ (key >> 16);
|
||
key = key + (key << 3);
|
||
key = key ^ (key >> 4);
|
||
key = key * 0x27d4eb2d;
|
||
key = key ^ (key >> 15);
|
||
}
|
||
else
|
||
{
|
||
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
|
||
key = key ^ (key >> 24);
|
||
key = (key + (key << 3)) + (key << 8); // key * 265
|
||
key = key ^ (key >> 14);
|
||
key = (key + (key << 2)) + (key << 4); // key * 21
|
||
key = key ^ (key >> 28);
|
||
key = key + (key << 31);
|
||
}
|
||
key >>= 2; /* Ensure that it fits in a fixnum. */
|
||
return key;
|
||
}
|
||
|
||
/* `depth' is used to limit recursion. */
|
||
static unsigned long
|
||
scm_raw_ihash (SCM obj, size_t depth)
|
||
{
|
||
if (SCM_IMP (obj))
|
||
return scm_raw_ihashq (SCM_UNPACK (obj));
|
||
|
||
switch (SCM_TYP7(obj))
|
||
{
|
||
/* FIXME: do better for structs, variables, ... Also the hashes
|
||
are currently associative, which ain't the right thing. */
|
||
case scm_tc7_smob:
|
||
return scm_raw_ihashq (SCM_TYP16 (obj));
|
||
case scm_tc7_number:
|
||
if (scm_is_integer (obj))
|
||
{
|
||
SCM n = SCM_I_MAKINUM (SCM_MOST_POSITIVE_FIXNUM);
|
||
if (scm_is_inexact (obj))
|
||
obj = scm_inexact_to_exact (obj);
|
||
return scm_raw_ihashq (scm_to_ulong (scm_modulo (obj, n)));
|
||
}
|
||
else
|
||
return scm_i_string_hash (scm_number_to_string (obj, scm_from_int (10)));
|
||
case scm_tc7_string:
|
||
return scm_i_string_hash (obj);
|
||
case scm_tc7_symbol:
|
||
return scm_i_symbol_hash (obj);
|
||
case scm_tc7_keyword:
|
||
return SCM_I_KEYWORD_HASH (obj);
|
||
case scm_tc7_pointer:
|
||
return scm_raw_ihashq ((uintptr_t) SCM_POINTER_VALUE (obj));
|
||
case scm_tc7_wvect:
|
||
case scm_tc7_vector:
|
||
{
|
||
size_t len = SCM_SIMPLE_VECTOR_LENGTH (obj);
|
||
size_t i = depth / 2;
|
||
unsigned long h = scm_raw_ihashq (SCM_CELL_WORD_0 (obj));
|
||
if (len)
|
||
while (i--)
|
||
h ^= scm_raw_ihash (scm_c_vector_ref (obj, h % len), i);
|
||
return h;
|
||
}
|
||
case scm_tc7_syntax:
|
||
{
|
||
unsigned long h;
|
||
h = scm_raw_ihash (scm_syntax_expression (obj), depth);
|
||
h ^= scm_raw_ihash (scm_syntax_wrap (obj), depth);
|
||
h ^= scm_raw_ihash (scm_syntax_module (obj), depth);
|
||
return h;
|
||
}
|
||
|
||
/* The following tc7s have no 'equal?' implementation. Thus, just
|
||
fall back to 'hashq'. */
|
||
case scm_tc7_variable:
|
||
case scm_tc7_hashtable:
|
||
case scm_tc7_fluid:
|
||
case scm_tc7_dynamic_state:
|
||
case scm_tc7_frame:
|
||
case scm_tc7_atomic_box:
|
||
case scm_tc7_program:
|
||
case scm_tc7_vm_cont:
|
||
case scm_tc7_weak_set:
|
||
case scm_tc7_weak_table:
|
||
case scm_tc7_port:
|
||
return scm_raw_ihashq (SCM_UNPACK (obj));
|
||
|
||
case scm_tcs_cons_imcar:
|
||
case scm_tcs_cons_nimcar:
|
||
if (depth)
|
||
return (scm_raw_ihash (SCM_CAR (obj), depth / 2)
|
||
^ scm_raw_ihash (SCM_CDR (obj), depth / 2));
|
||
else
|
||
return scm_raw_ihashq (scm_tc3_cons);
|
||
case scm_tcs_struct:
|
||
return scm_i_struct_hash (obj, depth);
|
||
default:
|
||
return scm_raw_ihashq (SCM_CELL_WORD_0 (obj));
|
||
}
|
||
}
|
||
|
||
|
||
|
||
|
||
unsigned long
|
||
scm_ihashq (SCM obj, unsigned long n)
|
||
{
|
||
return scm_raw_ihashq (SCM_UNPACK (obj)) % n;
|
||
}
|
||
|
||
|
||
SCM_DEFINE (scm_hashq, "hashq", 2, 0, 0,
|
||
(SCM key, SCM size),
|
||
"Determine a hash value for @var{key} that is suitable for\n"
|
||
"lookups in a hashtable of size @var{size}, where @code{eq?} is\n"
|
||
"used as the equality predicate. The function returns an\n"
|
||
"integer in the range 0 to @var{size} - 1. Note that\n"
|
||
"@code{hashq} may use internal addresses. Thus two calls to\n"
|
||
"hashq where the keys are @code{eq?} are not guaranteed to\n"
|
||
"deliver the same value if the key object gets garbage collected\n"
|
||
"in between. This can happen, for example with symbols:\n"
|
||
"@code{(hashq 'foo n) (gc) (hashq 'foo n)} may produce two\n"
|
||
"different values, since @code{foo} will be garbage collected.")
|
||
#define FUNC_NAME s_scm_hashq
|
||
{
|
||
unsigned long sz = scm_to_unsigned_integer (size, 1, ULONG_MAX);
|
||
return scm_from_ulong (scm_ihashq (key, sz));
|
||
}
|
||
#undef FUNC_NAME
|
||
|
||
|
||
|
||
|
||
|
||
unsigned long
|
||
scm_ihashv (SCM obj, unsigned long n)
|
||
{
|
||
if (SCM_NUMP(obj))
|
||
return scm_raw_ihash (obj, 10) % n;
|
||
else
|
||
return scm_raw_ihashq (SCM_UNPACK (obj)) % n;
|
||
}
|
||
|
||
|
||
SCM_DEFINE (scm_hashv, "hashv", 2, 0, 0,
|
||
(SCM key, SCM size),
|
||
"Determine a hash value for @var{key} that is suitable for\n"
|
||
"lookups in a hashtable of size @var{size}, where @code{eqv?} is\n"
|
||
"used as the equality predicate. The function returns an\n"
|
||
"integer in the range 0 to @var{size} - 1. Note that\n"
|
||
"@code{(hashv key)} may use internal addresses. Thus two calls\n"
|
||
"to hashv where the keys are @code{eqv?} are not guaranteed to\n"
|
||
"deliver the same value if the key object gets garbage collected\n"
|
||
"in between. This can happen, for example with symbols:\n"
|
||
"@code{(hashv 'foo n) (gc) (hashv 'foo n)} may produce two\n"
|
||
"different values, since @code{foo} will be garbage collected.")
|
||
#define FUNC_NAME s_scm_hashv
|
||
{
|
||
unsigned long sz = scm_to_unsigned_integer (size, 1, ULONG_MAX);
|
||
return scm_from_ulong (scm_ihashv (key, sz));
|
||
}
|
||
#undef FUNC_NAME
|
||
|
||
|
||
|
||
|
||
|
||
unsigned long
|
||
scm_ihash (SCM obj, unsigned long n)
|
||
{
|
||
return (unsigned long) scm_raw_ihash (obj, 10) % n;
|
||
}
|
||
|
||
SCM_DEFINE (scm_hash, "hash", 2, 0, 0,
|
||
(SCM key, SCM size),
|
||
"Determine a hash value for @var{key} that is suitable for\n"
|
||
"lookups in a hashtable of size @var{size}, where @code{equal?}\n"
|
||
"is used as the equality predicate. The function returns an\n"
|
||
"integer in the range 0 to @var{size} - 1.")
|
||
#define FUNC_NAME s_scm_hash
|
||
{
|
||
unsigned long sz = scm_to_unsigned_integer (size, 1, ULONG_MAX);
|
||
return scm_from_ulong (scm_ihash (key, sz));
|
||
}
|
||
#undef FUNC_NAME
|
||
|
||
|
||
|
||
|
||
|
||
void
|
||
scm_init_hash ()
|
||
{
|
||
#include "hash.x"
|
||
}
|
||
|