#ifndef SWAR_H
#define SWAR_H

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "gc-assert.h"  // for GC_ASSERT and GC_ASSERT_EQ

// Index of the least-significant nonzero byte in a word, or 8 if the word is
// all zeroes.
static inline size_t
count_zero_bytes(uint64_t bytes) {
  return bytes ? (__builtin_ctzll(bytes) / 8) : sizeof(bytes);
}

// Replicate a byte across all eight lanes of a 64-bit word.
static uint64_t
broadcast_byte(uint8_t byte) {
  uint64_t result = byte;
  return result * 0x0101010101010101ULL;
}

// Load an aligned 64-bit word, byte-swapping on big-endian targets so that
// lane 0 always holds the lowest-addressed byte.
static inline uint64_t
load_eight_aligned_bytes(uint8_t *ptr) {
  GC_ASSERT(((uintptr_t)ptr & 7) == 0);
  uint8_t * __attribute__((aligned(8))) aligned_ptr = ptr;
  uint64_t word;
  memcpy(&word, aligned_ptr, 8);
#ifdef WORDS_BIGENDIAN
  word = __builtin_bswap64(word);
#endif
  return word;
}

// Nonzero in each lane whose byte has any of the bits in `mask` set.
static inline uint64_t
match_bytes_against_bits(uint64_t bytes, uint8_t mask) {
  return bytes & broadcast_byte(mask);
}

// Scan forward from `ptr` for a byte with any of the bits in `mask` set,
// eight aligned bytes at a time; returns the offset of the first match found,
// or `limit` if there is none.
static inline size_t
scan_for_byte_with_bits(uint8_t *ptr, size_t limit, uint8_t mask) {
  size_t n = 0;
  size_t unaligned = ((uintptr_t) ptr) & 7;
  if (unaligned) {
    uint64_t bytes =
      load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8);
    uint64_t match = match_bytes_against_bits(bytes, mask);
    if (match)
      return count_zero_bytes(match);
    n += 8 - unaligned;
  }

  for(; n < limit; n += 8) {
    uint64_t bytes = load_eight_aligned_bytes(ptr + n);
    uint64_t match = match_bytes_against_bits(bytes, mask);
    if (match)
      return n + count_zero_bytes(match);
  }

  return limit;
}

// Nonzero in each lane whose byte, restricted to `mask`, equals `tag`.
static inline uint64_t
match_bytes_against_tag(uint64_t bytes, uint8_t mask, uint8_t tag) {
  // Precondition: tag within mask.
  GC_ASSERT_EQ(tag & mask, tag);
  // Precondition: high bit of mask byte is empty, so that we can add without
  // overflow.
  GC_ASSERT_EQ(mask & 0x7f, mask);
  // Precondition: mask is low bits of byte.
  GC_ASSERT(mask);
  GC_ASSERT_EQ(mask & (mask + 1), 0);

  uint64_t vmask = broadcast_byte(mask);
  uint64_t vtest = broadcast_byte(mask + 1);
  uint64_t vtag = broadcast_byte(tag);

  bytes &= vmask;
  // In each lane, (byte ^ tag) is zero iff the masked byte equals the tag;
  // adding `mask` then carries into the test bit (mask + 1) only in
  // non-matching lanes.  Selecting the test bit and flipping it leaves it set
  // precisely in matching lanes.
  uint64_t m = (bytes ^ vtag) + vmask;
  return (m & vtest) ^ vtest;
}

static inline size_t
scan_for_byte_with_tag(uint8_t *ptr, size_t limit, uint8_t mask, uint8_t tag) {
  // The way we handle unaligned reads by padding high bytes with zeroes
  // assumes that all-zeroes is not a matching byte.
  GC_ASSERT(tag);

  size_t n = 0;
  size_t unaligned = ((uintptr_t) ptr) & 7;
  if (unaligned) {
    uint64_t bytes =
      load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8);
    uint64_t match = match_bytes_against_tag(bytes, mask, tag);
    if (match)
      return count_zero_bytes(match);
    n += 8 - unaligned;
  }

  for(; n < limit; n += 8) {
    uint64_t bytes = load_eight_aligned_bytes(ptr + n);
    uint64_t match = match_bytes_against_tag(bytes, mask, tag);
    if (match)
      return n + count_zero_bytes(match);
  }

  return limit;
}

// Nonzero in each lane whose byte, restricted to `mask`, equals `tag1` or
// `tag2`.
static inline uint64_t
match_bytes_against_2_tags(uint64_t bytes, uint8_t mask,
                           uint8_t tag1, uint8_t tag2) {
  // Precondition: tags are within mask.
  GC_ASSERT_EQ(tag1 & mask, tag1);
  GC_ASSERT_EQ(tag2 & mask, tag2);
  // Precondition: high bit of mask byte is empty, so that we can add without
  // overflow.
  GC_ASSERT_EQ(mask & 0x7f, mask);
  // Precondition: mask is low bits of byte.
  GC_ASSERT(mask);
  GC_ASSERT_EQ(mask & (mask + 1), 0);

  uint64_t vmask = broadcast_byte(mask);
  uint64_t vtest = broadcast_byte(mask + 1);
  uint64_t vtag1 = broadcast_byte(tag1);
  uint64_t vtag2 = broadcast_byte(tag2);

  bytes &= vmask;
  uint64_t m1 = (bytes ^ vtag1) + vmask;
  uint64_t m2 = (bytes ^ vtag2) + vmask;
  // The test bit of (m1 & m2) is set only in lanes that match neither tag;
  // flipping it marks lanes equal to either tag1 or tag2.
  return ((m1 & m2) & vtest) ^ vtest;
}
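// Illustrative sketch of how the scan helpers above might be used: suppose a
// metadata byte array keeps a small tag in the low three bits of each byte;
// this returns the offset of the first byte whose tag equals `tag`, or `len`
// if there is none.  The function name and the `metadata`/`len` parameters
// are hypothetical, chosen only for this example, as is the 3-bit tag layout.
static inline size_t
example_scan_metadata_for_tag(uint8_t *metadata, size_t len, uint8_t tag) {
  // Mask 7 selects the low three bits; `tag` must be nonzero and lie within
  // the mask, per the preconditions asserted above.
  return scan_for_byte_with_tag(metadata, len, 7, tag);
}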
static inline size_t
scan_for_byte_with_tags(uint8_t *ptr, size_t limit, uint8_t mask,
                        uint8_t tag1, uint8_t tag2) {
  // The way we handle unaligned reads by padding high bytes with zeroes
  // assumes that all-zeroes is not a matching byte.
  GC_ASSERT(tag1 && tag2);

  size_t n = 0;
  size_t unaligned = ((uintptr_t) ptr) & 7;
  if (unaligned) {
    uint64_t bytes =
      load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8);
    uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2);
    if (match)
      return count_zero_bytes(match);
    n += 8 - unaligned;
  }

  for(; n < limit; n += 8) {
    uint64_t bytes = load_eight_aligned_bytes(ptr + n);
    uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2);
    if (match)
      return n + count_zero_bytes(match);
  }

  return limit;
}

#endif // SWAR_H