From 35f13806af653ef9ed656708dddcd1d2c8f8da9e Mon Sep 17 00:00:00 2001
From: Rob Browning
Date: Sun, 30 Jun 2024 22:41:40 -0500
Subject: [PATCH] scm_i_utf8_string_hash: don't overrun when len is zero

When the length is zero, the previous code would include the byte after
the end of the string in the hash. Fix that (the wide and narrow hashers
also guard against it via "case 0"), and don't bother mutating length
for the trailing bytes.

Since we already compute the char length, use that to detect all ASCII
strings and follow the same narrow string path that we do for latin-1.

libguile/hash.c (scm_i_utf8_string_hash): avoid overrun when len == 0.
---
 libguile/hash.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/libguile/hash.c b/libguile/hash.c
index ba2a1207d..b7ad03309 100644
--- a/libguile/hash.c
+++ b/libguile/hash.c
@@ -169,25 +169,26 @@ scm_i_latin1_string_hash (const char *str, size_t len)
 unsigned long
 scm_i_utf8_string_hash (const char *str, size_t len)
 {
-  const uint8_t *end, *ustr = (const uint8_t *) str;
-  unsigned long ret;
-
-  /* The length of the string in characters. This name corresponds to
-     Jenkins' original name. */
-  size_t length;
-
-  uint32_t a, b, c, u32;
-
   if (len == (size_t) -1)
     len = strlen (str);
 
-  end = ustr + len;
+  // FIXME: eventually make fewer passes over str
+  const uint8_t *ustr = (const uint8_t *) str;
 
   if (u8_check (ustr, len) != NULL)
     /* Invalid UTF-8; punt. */
     return scm_i_string_hash (scm_from_utf8_stringn (str, len));
 
-  length = u8_mbsnlen (ustr, len);
+  /* The length of the string in characters. This name corresponds to
+     Jenkins' original name. */
+  size_t length = u8_mbsnlen (ustr, len);
+
+  if (len == length) // ascii, same as narrow_string_hash above
+    return narrow_string_hash ((uint8_t *) str, len);
+
+  const uint8_t * const end = ustr + len;
+  uint32_t a, b, c, u32;
+  unsigned long ret;
 
   /* Set up the internal state. */
   a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47;
@@ -205,14 +206,15 @@ scm_i_utf8_string_hash (const char *str, size_t len)
       length -= 3;
     }
 
-  /* Handle the last 3 elements's. */
+  // Similar to narrow_string_hash(). Handle the last 3 chars; length
+  // cannot be zero because len != length above.
   ustr += u8_mbtouc (&u32, ustr, end - ustr);
   a += u32;
-  if (--length)
+  if (length > 1)
     {
       ustr += u8_mbtouc (&u32, ustr, end - ustr);
       b += u32;
-      if (--length)
+      if (length > 2)
         {
           ustr += u8_mbtouc (&u32, ustr, end - ustr);
           c += u32;