1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-29 19:30:36 +02:00

scm_i_utf8_string_hash: don't overrun when len is zero

When the length is zero, the previous code would include the byte after
the end of the string in the hash.  Fix that (the wide and narrow
hashers also guard against it via "case 0"), and don't bother mutating
length for the trailing bytes.

Since we already compute the char length, use that to detect all ASCII
strings and follow the same narrow string path that we do for latin-1.

libguile/hash.c (scm_i_utf8_string_hash): avoid overrun when len == 0.
This commit is contained in:
Rob Browning 2024-06-30 22:41:40 -05:00
parent c45e1f57ec
commit 35f13806af

View file

@ -169,25 +169,26 @@ scm_i_latin1_string_hash (const char *str, size_t len)
unsigned long
scm_i_utf8_string_hash (const char *str, size_t len)
{
const uint8_t *end, *ustr = (const uint8_t *) str;
unsigned long ret;
/* The length of the string in characters. This name corresponds to
Jenkins' original name. */
size_t length;
uint32_t a, b, c, u32;
if (len == (size_t) -1)
len = strlen (str);
end = ustr + len;
// FIXME: eventually make fewer passes over str
const uint8_t *ustr = (const uint8_t *) str;
if (u8_check (ustr, len) != NULL)
/* Invalid UTF-8; punt. */
return scm_i_string_hash (scm_from_utf8_stringn (str, len));
length = u8_mbsnlen (ustr, len);
/* The length of the string in characters. This name corresponds to
Jenkins' original name. */
size_t length = u8_mbsnlen (ustr, len);
if (len == length) // ascii, same as narrow_string_hash above
return narrow_string_hash ((uint8_t *) str, len);
const uint8_t * const end = ustr + len;
uint32_t a, b, c, u32;
unsigned long ret;
/* Set up the internal state. */
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47;
@ -205,14 +206,15 @@ scm_i_utf8_string_hash (const char *str, size_t len)
length -= 3;
}
/* Handle the last 3 elements's. */
// Similar to narrow_string_hash(). Handle the last 3 chars; length
// cannot be zero because len != length above.
ustr += u8_mbtouc (&u32, ustr, end - ustr);
a += u32;
if (--length)
if (length > 1)
{
ustr += u8_mbtouc (&u32, ustr, end - ustr);
b += u32;
if (--length)
if (length > 2)
{
ustr += u8_mbtouc (&u32, ustr, end - ustr);
c += u32;