From e780d2795933ed836e4ee993ebc3f903e78b0fae Mon Sep 17 00:00:00 2001
From: Andy Wingo
Date: Wed, 5 Mar 2025 10:08:03 +0100
Subject: [PATCH] nofl: Refactor SWAR mark-matching routines

We are going to try to use fewer bits for mark state.
---
 src/nofl-space.h | 23 ++++++++--------
 src/swar.h       | 71 +++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 75 insertions(+), 19 deletions(-)

diff --git a/src/nofl-space.h b/src/nofl-space.h
index 66aa0ac62..f98530b02 100644
--- a/src/nofl-space.h
+++ b/src/nofl-space.h
@@ -150,7 +150,6 @@ struct nofl_block_stack {
 #define NOFL_PAGE_OUT_QUEUE_SIZE 4
 
 struct nofl_space {
-  uint64_t sweep_mask;
   uint8_t live_mask;
   uint8_t marked_mask;
   uint8_t evacuating;
@@ -558,7 +557,7 @@ nofl_clear_memory(uintptr_t addr, size_t size) {
 
 static size_t
 nofl_space_live_object_granules(uint8_t *metadata) {
-  return scan_for_byte(metadata, -1, broadcast_byte(NOFL_METADATA_BYTE_END)) + 1;
+  return scan_for_byte_with_bits(metadata, -1, NOFL_METADATA_BYTE_END) + 1;
 }
 
 static void
@@ -704,7 +703,7 @@ nofl_allocator_finish_hole(struct nofl_allocator *alloc) {
 // reached the end of the block.
 static size_t
 nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
-                                  uintptr_t sweep_mask) {
+                                  uint8_t live_mask) {
   GC_ASSERT(nofl_allocator_has_block(alloc));
   GC_ASSERT_EQ(alloc->alloc, alloc->sweep);
   uintptr_t sweep = alloc->sweep;
@@ -721,7 +720,7 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
   // right after a hole, which can point to either the end of the
   // block or to a live object. Assume that a live object is more
   // common.
-  while (limit_granules && (metadata[0] & sweep_mask)) {
+  while (limit_granules && (metadata[0] & live_mask)) {
     // Object survived collection; skip over it and continue sweeping.
     size_t object_granules = nofl_space_live_object_granules(metadata);
     sweep += object_granules * NOFL_GRANULE_SIZE;
@@ -734,7 +733,8 @@
     return 0;
   }
 
-  size_t hole_granules = scan_for_byte(metadata, limit_granules, sweep_mask);
+  size_t hole_granules = scan_for_byte_with_bits(metadata, limit_granules,
+                                                 live_mask);
   size_t free_bytes = hole_granules * NOFL_GRANULE_SIZE;
   GC_ASSERT(hole_granules);
   GC_ASSERT(hole_granules <= limit_granules);
@@ -754,10 +754,10 @@
 
 static void
 nofl_allocator_finish_sweeping_in_block(struct nofl_allocator *alloc,
-                                        uintptr_t sweep_mask) {
+                                        uint8_t live_mask) {
   do {
     nofl_allocator_finish_hole(alloc);
-  } while (nofl_allocator_next_hole_in_block(alloc, sweep_mask));
+  } while (nofl_allocator_next_hole_in_block(alloc, live_mask));
 }
 
 static void
@@ -771,7 +771,7 @@ nofl_allocator_release_block(struct nofl_allocator *alloc,
   } else if (space->evacuating) {
     nofl_allocator_release_full_evacuation_target(alloc, space);
   } else {
-    nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask);
+    nofl_allocator_finish_sweeping_in_block(alloc, space->live_mask);
     nofl_allocator_release_full_block(alloc, space);
   }
 }
@@ -801,7 +801,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc,
   // Sweep current block for a hole.
   if (nofl_allocator_has_block(alloc)) {
     size_t granules =
-      nofl_allocator_next_hole_in_block(alloc, space->sweep_mask);
+      nofl_allocator_next_hole_in_block(alloc, space->live_mask);
     if (granules)
       return granules;
     else
@@ -819,7 +819,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc,
     alloc->block.summary->holes_with_fragmentation = 0;
     alloc->block.summary->fragmentation_granules = 0;
     size_t granules =
-      nofl_allocator_next_hole_in_block(alloc, space->sweep_mask);
+      nofl_allocator_next_hole_in_block(alloc, space->live_mask);
     if (granules)
       return granules;
     nofl_allocator_release_full_block(alloc, space);
@@ -1134,7 +1134,6 @@ nofl_space_update_mark_patterns(struct nofl_space *space,
   if (advance_mark_mask)
     space->marked_mask = next_marked_mask;
   space->live_mask = survivor_mask | next_marked_mask;
-  space->sweep_mask = broadcast_byte(space->live_mask);
 }
 
 static void
@@ -1206,7 +1205,7 @@ nofl_space_promote_blocks(struct nofl_space *space) {
     block.summary->holes_with_fragmentation = 0;
     block.summary->fragmentation_granules = 0;
     struct nofl_allocator alloc = { block.addr, block.addr, block };
-    nofl_allocator_finish_sweeping_in_block(&alloc, space->sweep_mask);
+    nofl_allocator_finish_sweeping_in_block(&alloc, space->live_mask);
     atomic_fetch_add(&space->old_generation_granules,
                      NOFL_GRANULES_PER_BLOCK - block.summary->hole_granules);
     nofl_block_list_push(&space->old, block);
diff --git a/src/swar.h b/src/swar.h
index 293d99ec2..e516ed83f 100644
--- a/src/swar.h
+++ b/src/swar.h
@@ -26,23 +26,80 @@ load_eight_aligned_bytes(uint8_t *ptr) {
   return word;
 }
 
+static inline uint64_t
+match_bytes_against_bits(uint64_t bytes, uint8_t mask) {
+  return bytes & broadcast_byte(mask);
+}
+
 static size_t
-scan_for_byte(uint8_t *ptr, size_t limit, uint64_t mask) {
+scan_for_byte_with_bits(uint8_t *ptr, size_t limit, uint8_t mask) {
   size_t n = 0;
   size_t unaligned = ((uintptr_t) ptr) & 7;
   if (unaligned) {
     uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8);
-    bytes &= mask;
-    if (bytes)
-      return count_zero_bytes(bytes);
+    uint64_t match = match_bytes_against_bits(bytes, mask);
+    if (match)
+      return count_zero_bytes(match);
     n += 8 - unaligned;
   }
 
   for(; n < limit; n += 8) {
     uint64_t bytes = load_eight_aligned_bytes(ptr + n);
-    bytes &= mask;
-    if (bytes)
-      return n + count_zero_bytes(bytes);
+    uint64_t match = match_bytes_against_bits(bytes, mask);
+    if (match)
+      return n + count_zero_bytes(match);
+  }
+
+  return limit;
+}
+
+static inline uint64_t
+match_bytes_against_2_tags(uint64_t bytes, uint8_t mask, uint8_t tag1,
+                           uint8_t tag2)
+{
+  // Precondition: tags are covered by mask.
+  GC_ASSERT_EQ(tag1 & mask, tag1);
+  GC_ASSERT_EQ(tag2 & mask, tag2);
+  // Precondition: high bit of mask byte is empty, so that we can add without
+  // overflow.
+  GC_ASSERT_EQ(mask & 0x7f, mask);
+  // Precondition: mask is low bits of byte.
+  GC_ASSERT(mask);
+  GC_ASSERT_EQ(mask & (mask + 1), 0);
+
+  uint64_t vmask = broadcast_byte(mask);
+  uint64_t vtest = broadcast_byte(mask + 1);
+  uint64_t vtag1 = broadcast_byte(tag1);
+  uint64_t vtag2 = broadcast_byte(tag2);
+
+  bytes &= vmask;
+  uint64_t m1 = (bytes ^ vtag1) + vmask;
+  uint64_t m2 = (bytes ^ vtag2) + vmask;
+  return ((m1 & m2) & vtest) ^ vtest;
+}
+
+static size_t
+scan_for_byte_with_tags(uint8_t *ptr, size_t limit, uint8_t mask,
+                        uint8_t tag1, uint8_t tag2) {
+  // The way we handle unaligned reads by padding high bytes with zeroes assumes
+  // that all-zeroes is not a matching byte.
+  GC_ASSERT(tag1 && tag2);
+
+  size_t n = 0;
+  size_t unaligned = ((uintptr_t) ptr) & 7;
+  if (unaligned) {
+    uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8);
+    uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2);
+    if (match)
+      return count_zero_bytes(match);
+    n += 8 - unaligned;
+  }
+
+  for(; n < limit; n += 8) {
+    uint64_t bytes = load_eight_aligned_bytes(ptr + n);
+    uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2);
+    if (match)
+      return n + count_zero_bytes(match);
   }
 
   return limit;
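
For illustration only (not part of the patch): a minimal, self-contained sketch
of the two-tag SWAR match that match_bytes_against_2_tags implements. The
helpers broadcast_byte() and count_zero_bytes() are restated here as
assumptions modeled on src/swar.h; the metadata bytes, mask, and tag values are
made up, and the sketch assumes a little-endian host and a compiler providing
__builtin_ctzll (GCC or Clang).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Replicate one byte into all eight lanes of a 64-bit word.
static inline uint64_t broadcast_byte(uint8_t b) {
  return (uint64_t)b * 0x0101010101010101ULL;
}

// Index of the first nonzero byte, counting from the low end; 8 if none.
static inline size_t count_zero_bytes(uint64_t bytes) {
  return bytes ? (__builtin_ctzll(bytes) / 8) : sizeof(bytes);
}

// Same bit trick as the patch's match_bytes_against_2_tags: for each byte,
// set that byte's test bit (mask + 1) iff (byte & mask) equals tag1 or tag2.
// Adding the broadcast mask to (byte ^ tag) carries into the test bit exactly
// when the masked byte differs from the tag, so AND-ing the two mismatch
// words and XOR-ing with the test bits leaves only the matching bytes.
static inline uint64_t match_2_tags(uint64_t bytes, uint8_t mask,
                                    uint8_t tag1, uint8_t tag2) {
  uint64_t vmask = broadcast_byte(mask);
  uint64_t vtest = broadcast_byte(mask + 1);
  uint64_t vtag1 = broadcast_byte(tag1);
  uint64_t vtag2 = broadcast_byte(tag2);
  bytes &= vmask;
  uint64_t m1 = (bytes ^ vtag1) + vmask;
  uint64_t m2 = (bytes ^ vtag2) + vmask;
  return ((m1 & m2) & vtest) ^ vtest;
}

int main(void) {
  // Made-up metadata bytes; pretend the mark state lives in the low two bits.
  uint8_t metadata[8] = { 0x00, 0x03, 0x00, 0x01, 0x02, 0x00, 0x00, 0x01 };
  uint64_t bytes;
  memcpy(&bytes, metadata, sizeof bytes);

  // Find the first byte whose low two bits are 1 or 2.  Bytes 0-2 miss
  // (0x00 and 0x03 masked are neither tag); byte 3 (0x01) is the first hit.
  uint64_t match = match_2_tags(bytes, 0x3, 0x1, 0x2);
  printf("first match at byte %zu\n", count_zero_bytes(match));  // prints 3
  return 0;
}

Compiled and run as-is, this prints "first match at byte 3", mirroring how
scan_for_byte_with_tags reports the offset of the first matching metadata byte.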