mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-30 03:40:34 +02:00
scm_i_utf8_string_hash: compute u8 chars not bytes
Noticed while investigating a migration to utf-8 strings. After making changes that routed non-ascii symbol hashing through this function, encoding-iso88597.test began intermittently failing because it would traverse trailing garbage when u8_strnlen reported 8 chars instead of 4. Change the scm_i_str2symbol and scm_i_str2uninterned_symbol internal hash type to unsigned long to explicitly match the scm_i_string_hash result type. * libguile/hash.c (scm_i_utf8_string_hash): Call u8_mbsnlen not u8_strnlen. * libguile/symbols.c (scm_i_str2symbol, scm_i_str2uninterned_symbol): Use unsigned long for scm_i_string_hash result. * test-suite/standalone/.gitignore: Add test-hashing. * test-suite/standalone/Makefile.am: Add test-hashing. * test-suite/standalone/test-hashing.c: Add.
This commit is contained in:
parent
f0df1ed0fd
commit
ffb95239aa
6 changed files with 86 additions and 3 deletions
12
NEWS
12
NEWS
|
@ -21,6 +21,18 @@ definitely unused---this is notably the case for modules that are only
|
||||||
used at macro-expansion time, such as (srfi srfi-26). In those cases,
|
used at macro-expansion time, such as (srfi srfi-26). In those cases,
|
||||||
the compiler reports it as "possibly unused".
|
the compiler reports it as "possibly unused".
|
||||||
|
|
||||||
|
* Bug fixes
|
||||||
|
|
||||||
|
* Hashing of UTF-8 symbols with non-ASCII characters avoids corruption
|
||||||
|
|
||||||
|
This issue could cause `scm_from_utf8_symbol' and
|
||||||
|
`scm_from_utf8_symboln` to incorrectly conclude that the symbol hadn't
|
||||||
|
already been interned, and then create a new one, which of course
|
||||||
|
wouldn't be `eq?' to the other(s). The incorrect hash was the result of
|
||||||
|
a buffer overrun, and so might vary. This problem affected a number of
|
||||||
|
other operations, given the internal use of those functions.
|
||||||
|
(<https://bugs.gnu.org/56413>)
|
||||||
|
|
||||||
|
|
||||||
Changes in 3.0.9 (since 3.0.8)
|
Changes in 3.0.9 (since 3.0.8)
|
||||||
|
|
||||||
|
|
|
@ -185,7 +185,7 @@ scm_i_utf8_string_hash (const char *str, size_t len)
|
||||||
/* Invalid UTF-8; punt. */
|
/* Invalid UTF-8; punt. */
|
||||||
return scm_i_string_hash (scm_from_utf8_stringn (str, len));
|
return scm_i_string_hash (scm_from_utf8_stringn (str, len));
|
||||||
|
|
||||||
length = u8_strnlen (ustr, len);
|
length = u8_mbsnlen (ustr, len);
|
||||||
|
|
||||||
/* Set up the internal state. */
|
/* Set up the internal state. */
|
||||||
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47;
|
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47;
|
||||||
|
|
|
@ -239,7 +239,7 @@ static SCM
|
||||||
scm_i_str2symbol (SCM str)
|
scm_i_str2symbol (SCM str)
|
||||||
{
|
{
|
||||||
SCM symbol;
|
SCM symbol;
|
||||||
size_t raw_hash = scm_i_string_hash (str);
|
unsigned long raw_hash = scm_i_string_hash (str);
|
||||||
|
|
||||||
symbol = lookup_interned_symbol (str, raw_hash);
|
symbol = lookup_interned_symbol (str, raw_hash);
|
||||||
if (scm_is_true (symbol))
|
if (scm_is_true (symbol))
|
||||||
|
@ -261,7 +261,7 @@ scm_i_str2symbol (SCM str)
|
||||||
static SCM
|
static SCM
|
||||||
scm_i_str2uninterned_symbol (SCM str)
|
scm_i_str2uninterned_symbol (SCM str)
|
||||||
{
|
{
|
||||||
size_t raw_hash = scm_i_string_hash (str);
|
unsigned long raw_hash = scm_i_string_hash (str);
|
||||||
|
|
||||||
return scm_i_make_symbol (str, SCM_I_F_SYMBOL_UNINTERNED, raw_hash);
|
return scm_i_make_symbol (str, SCM_I_F_SYMBOL_UNINTERNED, raw_hash);
|
||||||
}
|
}
|
||||||
|
|
1
test-suite/standalone/.gitignore
vendored
1
test-suite/standalone/.gitignore
vendored
|
@ -1,5 +1,6 @@
|
||||||
/test-conversion
|
/test-conversion
|
||||||
/test-gh
|
/test-gh
|
||||||
|
/test-hashing
|
||||||
/test-list
|
/test-list
|
||||||
/test-num2integral
|
/test-num2integral
|
||||||
/test-round
|
/test-round
|
||||||
|
|
|
@ -167,6 +167,13 @@ test_conversion_LDADD = $(LIBGUILE_LDADD) $(top_builddir)/lib/libgnu.la
|
||||||
check_PROGRAMS += test-conversion
|
check_PROGRAMS += test-conversion
|
||||||
TESTS += test-conversion
|
TESTS += test-conversion
|
||||||
|
|
||||||
|
# test-hashing
|
||||||
|
test_hashing_SOURCES = test-hashing.c
|
||||||
|
test_hashing_CFLAGS = ${test_cflags}
|
||||||
|
test_hashing_LDADD = $(LIBGUILE_LDADD) $(top_builddir)/lib/libgnu.la
|
||||||
|
check_PROGRAMS += test-hashing
|
||||||
|
TESTS += test-hashing
|
||||||
|
|
||||||
# test-loose-ends
|
# test-loose-ends
|
||||||
test_loose_ends_SOURCES = test-loose-ends.c
|
test_loose_ends_SOURCES = test-loose-ends.c
|
||||||
test_loose_ends_CFLAGS = ${test_cflags}
|
test_loose_ends_CFLAGS = ${test_cflags}
|
||||||
|
|
63
test-suite/standalone/test-hashing.c
Normal file
63
test-suite/standalone/test-hashing.c
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
/* Copyright 2022-2023
|
||||||
|
Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This file is part of Guile.
|
||||||
|
|
||||||
|
Guile is free software: you can redistribute it and/or modify it
|
||||||
|
under the terms of the GNU Lesser General Public License as published
|
||||||
|
by the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
Guile is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||||
|
License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with Guile. If not, see
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#if HAVE_CONFIG_H
|
||||||
|
# include <config.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <libguile.h>
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_hashing ()
|
||||||
|
{
|
||||||
|
// Make sure a utf-8 symbol has the expected hash. In addition to
|
||||||
|
// catching algorithmic regressions, this would have caught a
|
||||||
|
// long-standing buffer overflow.
|
||||||
|
|
||||||
|
// Περί
|
||||||
|
char about_u8[] = {0xce, 0xa0, 0xce, 0xb5, 0xcf, 0x81, 0xce, 0xaf, 0};
|
||||||
|
SCM sym = scm_from_utf8_symbol (about_u8);
|
||||||
|
|
||||||
|
// Value determined by calling wide_string_hash on {0x3A0, 0x3B5,
|
||||||
|
// 0x3C1, 0x3AF} via a temporary test program.
|
||||||
|
const unsigned long expect = 4029223418961680680;
|
||||||
|
const unsigned long actual = scm_to_ulong (scm_symbol_hash (sym));
|
||||||
|
|
||||||
|
if (actual != expect)
|
||||||
|
{
|
||||||
|
fprintf (stderr, "fail: unexpected utf-8 symbol hash (%lu != %lu)\n",
|
||||||
|
actual, expect);
|
||||||
|
exit (EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
tests (void *data, int argc, char **argv)
|
||||||
|
{
|
||||||
|
test_hashing ();
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
main (int argc, char *argv[])
|
||||||
|
{
|
||||||
|
scm_boot_guile (argc, argv, tests, NULL);
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue