
nofl: Refactor SWAR mark-matching routines

We are going to try to use fewer bits for mark state.
Andy Wingo 2025-03-05 10:08:03 +01:00
parent c410992d55
commit e780d27959
2 changed files with 75 additions and 19 deletions


@@ -150,7 +150,6 @@ struct nofl_block_stack {
 #define NOFL_PAGE_OUT_QUEUE_SIZE 4
 
 struct nofl_space {
-  uint64_t sweep_mask;
   uint8_t live_mask;
   uint8_t marked_mask;
   uint8_t evacuating;
@@ -558,7 +557,7 @@ nofl_clear_memory(uintptr_t addr, size_t size) {
 
 static size_t
 nofl_space_live_object_granules(uint8_t *metadata) {
-  return scan_for_byte(metadata, -1, broadcast_byte(NOFL_METADATA_BYTE_END)) + 1;
+  return scan_for_byte_with_bits(metadata, -1, NOFL_METADATA_BYTE_END) + 1;
 }
 
 static void
@@ -704,7 +703,7 @@ nofl_allocator_finish_hole(struct nofl_allocator *alloc) {
 // reached the end of the block.
 static size_t
 nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
-                                  uintptr_t sweep_mask) {
+                                  uint8_t live_mask) {
   GC_ASSERT(nofl_allocator_has_block(alloc));
   GC_ASSERT_EQ(alloc->alloc, alloc->sweep);
   uintptr_t sweep = alloc->sweep;
@@ -721,7 +720,7 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
   // right after a hole, which can point to either the end of the
   // block or to a live object.  Assume that a live object is more
   // common.
-  while (limit_granules && (metadata[0] & sweep_mask)) {
+  while (limit_granules && (metadata[0] & live_mask)) {
     // Object survived collection; skip over it and continue sweeping.
     size_t object_granules = nofl_space_live_object_granules(metadata);
     sweep += object_granules * NOFL_GRANULE_SIZE;
@@ -734,7 +733,8 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
     return 0;
   }
 
-  size_t hole_granules = scan_for_byte(metadata, limit_granules, sweep_mask);
+  size_t hole_granules = scan_for_byte_with_bits(metadata, limit_granules,
+                                                 live_mask);
   size_t free_bytes = hole_granules * NOFL_GRANULE_SIZE;
   GC_ASSERT(hole_granules);
   GC_ASSERT(hole_granules <= limit_granules);
@@ -754,10 +754,10 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
 
 static void
 nofl_allocator_finish_sweeping_in_block(struct nofl_allocator *alloc,
-                                        uintptr_t sweep_mask) {
+                                        uint8_t live_mask) {
   do {
     nofl_allocator_finish_hole(alloc);
-  } while (nofl_allocator_next_hole_in_block(alloc, sweep_mask));
+  } while (nofl_allocator_next_hole_in_block(alloc, live_mask));
 }
 
 static void
@@ -771,7 +771,7 @@ nofl_allocator_release_block(struct nofl_allocator *alloc,
   } else if (space->evacuating) {
     nofl_allocator_release_full_evacuation_target(alloc, space);
   } else {
-    nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask);
+    nofl_allocator_finish_sweeping_in_block(alloc, space->live_mask);
     nofl_allocator_release_full_block(alloc, space);
   }
 }
@@ -801,7 +801,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc,
     // Sweep current block for a hole.
     if (nofl_allocator_has_block(alloc)) {
       size_t granules =
-        nofl_allocator_next_hole_in_block(alloc, space->sweep_mask);
+        nofl_allocator_next_hole_in_block(alloc, space->live_mask);
       if (granules)
         return granules;
       else
@@ -819,7 +819,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc,
         alloc->block.summary->holes_with_fragmentation = 0;
         alloc->block.summary->fragmentation_granules = 0;
         size_t granules =
-          nofl_allocator_next_hole_in_block(alloc, space->sweep_mask);
+          nofl_allocator_next_hole_in_block(alloc, space->live_mask);
         if (granules)
           return granules;
         nofl_allocator_release_full_block(alloc, space);
@@ -1134,7 +1134,6 @@ nofl_space_update_mark_patterns(struct nofl_space *space,
   if (advance_mark_mask)
     space->marked_mask = next_marked_mask;
   space->live_mask = survivor_mask | next_marked_mask;
-  space->sweep_mask = broadcast_byte(space->live_mask);
 }
 
 static void
@@ -1206,7 +1205,7 @@ nofl_space_promote_blocks(struct nofl_space *space) {
     block.summary->holes_with_fragmentation = 0;
     block.summary->fragmentation_granules = 0;
     struct nofl_allocator alloc = { block.addr, block.addr, block };
-    nofl_allocator_finish_sweeping_in_block(&alloc, space->sweep_mask);
+    nofl_allocator_finish_sweeping_in_block(&alloc, space->live_mask);
     atomic_fetch_add(&space->old_generation_granules,
                      NOFL_GRANULES_PER_BLOCK - block.summary->hole_granules);
     nofl_block_list_push(&space->old, block);
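
The effect of this side of the change is that struct nofl_space no longer caches a 64-bit sweep_mask (the 8-bit live mask broadcast into every byte lane); callers now pass the uint8_t live_mask and the scan helper broadcasts it per word. The sketch below is not the refactored helper itself, only a minimal standalone illustration of that kind of word-at-a-time metadata scan; it assumes a little-endian target and GCC/Clang builtins, and broadcast, first_match_index and scan_bits are names local to this example.

// Minimal SWAR scan sketch (illustrative only).
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

// Copy one byte value into all eight byte lanes of a word.
static inline uint64_t broadcast(uint8_t b) {
  return (uint64_t)b * 0x0101010101010101ULL;
}

// Index of the least-significant nonzero byte lane (little endian).
static inline size_t first_match_index(uint64_t w) {
  return (size_t)__builtin_ctzll(w) / 8;
}

// Return the index of the first byte in meta[0..len) that has any bit of
// `mask` set, scanning eight metadata bytes per iteration; len if none.
static size_t scan_bits(const uint8_t *meta, size_t len, uint8_t mask) {
  size_t n = 0;
  for (; n + 8 <= len; n += 8) {
    uint64_t word;
    memcpy(&word, meta + n, 8);            // unaligned-safe load
    uint64_t hit = word & broadcast(mask); // lane nonzero <=> some mask bit set
    if (hit)
      return n + first_match_index(hit);
  }
  for (; n < len; n++)                     // scalar tail
    if (meta[n] & mask)
      return n;
  return len;
}

int main(void) {
  uint8_t meta[32] = {0};
  meta[11] = 0x04;                         // pretend granule 11 carries a mark bit
  printf("%zu\n", scan_bits(meta, sizeof meta, 0x07));  // prints 11
  return 0;
}

The real scan_for_byte_with_bits additionally handles an unaligned start by loading the containing word and shifting zeroes in from the top, as shown in the diff below.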


@@ -26,23 +26,80 @@ load_eight_aligned_bytes(uint8_t *ptr) {
   return word;
 }
 
+static inline uint64_t
+match_bytes_against_bits(uint64_t bytes, uint8_t mask) {
+  return bytes & broadcast_byte(mask);
+}
+
 static size_t
-scan_for_byte(uint8_t *ptr, size_t limit, uint64_t mask) {
+scan_for_byte_with_bits(uint8_t *ptr, size_t limit, uint8_t mask) {
   size_t n = 0;
   size_t unaligned = ((uintptr_t) ptr) & 7;
   if (unaligned) {
     uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8);
-    bytes &= mask;
-    if (bytes)
-      return count_zero_bytes(bytes);
+    uint64_t match = match_bytes_against_bits(bytes, mask);
+    if (match)
+      return count_zero_bytes(match);
     n += 8 - unaligned;
   }
 
   for(; n < limit; n += 8) {
     uint64_t bytes = load_eight_aligned_bytes(ptr + n);
-    bytes &= mask;
-    if (bytes)
-      return n + count_zero_bytes(bytes);
+    uint64_t match = match_bytes_against_bits(bytes, mask);
+    if (match)
+      return n + count_zero_bytes(match);
+  }
+
+  return limit;
+}
+
+static inline uint64_t
+match_bytes_against_2_tags(uint64_t bytes, uint8_t mask, uint8_t tag1,
+                           uint8_t tag2)
+{
+  // Precondition: tags are covered by mask.
+  GC_ASSERT_EQ(tag1 & mask, tag1);
+  GC_ASSERT_EQ(tag2 & mask, tag2);
+  // Precondition: high bit of mask byte is empty, so that we can add without
+  // overflow.
+  GC_ASSERT_EQ(mask & 0x7f, mask);
+  // Precondition: mask is low bits of byte.
+  GC_ASSERT(mask);
+  GC_ASSERT_EQ(mask & (mask + 1), 0);
+
+  uint64_t vmask = broadcast_byte(mask);
+  uint64_t vtest = broadcast_byte(mask + 1);
+  uint64_t vtag1 = broadcast_byte(tag1);
+  uint64_t vtag2 = broadcast_byte(tag2);
+
+  bytes &= vmask;
+  uint64_t m1 = (bytes ^ vtag1) + vmask;
+  uint64_t m2 = (bytes ^ vtag2) + vmask;
+  return ((m1 & m2) & vtest) ^ vtest;
+}
+
+static size_t
+scan_for_byte_with_tags(uint8_t *ptr, size_t limit, uint8_t mask,
+                        uint8_t tag1, uint8_t tag2) {
+  // The way we handle unaligned reads by padding high bytes with zeroes assumes
+  // that all-zeroes is not a matching byte.
+  GC_ASSERT(tag1 && tag2);
+
+  size_t n = 0;
+  size_t unaligned = ((uintptr_t) ptr) & 7;
+  if (unaligned) {
+    uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8);
+    uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2);
+    if (match)
+      return count_zero_bytes(match);
+    n += 8 - unaligned;
+  }
+
+  for(; n < limit; n += 8) {
+    uint64_t bytes = load_eight_aligned_bytes(ptr + n);
+    uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2);
+    if (match)
+      return n + count_zero_bytes(match);
   }
 
   return limit;
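
The arithmetic in match_bytes_against_2_tags is the classic SWAR lane-compare trick: with the mark bits confined to the low bits of each byte (say mask 0x07) and the next bit up (mask + 1) reserved as a test bit, (lane ^ tag) + mask carries into the test bit exactly when the lane differs from the tag; ANDing the two per-tag results and XORing with the broadcast test bit leaves the test bit set only in lanes equal to tag1 or tag2. That is also why scan_for_byte_with_tags asserts tag1 && tag2: the zero bytes shifted in for an unaligned start must never read as a match. Below is a small self-checking sketch of the identity, with helper names local to the example.

// Self-checking sketch of the two-tag SWAR match (illustrative only).
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static inline uint64_t broadcast(uint8_t b) {
  return (uint64_t)b * 0x0101010101010101ULL;
}

// For every byte lane of `bytes`, set the (mask + 1) bit iff the lane,
// restricted to `mask`, equals tag1 or tag2.  Mirrors the logic above.
static uint64_t match2(uint64_t bytes, uint8_t mask, uint8_t tag1, uint8_t tag2) {
  uint64_t vmask = broadcast(mask), vtest = broadcast(mask + 1);
  bytes &= vmask;
  uint64_t m1 = (bytes ^ broadcast(tag1)) + vmask;  // test bit set iff lane != tag1
  uint64_t m2 = (bytes ^ broadcast(tag2)) + vmask;  // test bit set iff lane != tag2
  return ((m1 & m2) & vtest) ^ vtest;               // invert: set iff lane matches either tag
}

int main(void) {
  const uint8_t mask = 0x07, tag1 = 0x01, tag2 = 0x03;
  // Exhaustively check lane 0 against a byte-at-a-time reference.
  for (unsigned b = 0; b < 256; b++) {
    uint64_t got = match2((uint64_t)b, mask, tag1, tag2) & 0xff;
    unsigned want = ((b & mask) == tag1 || (b & mask) == tag2) ? mask + 1u : 0;
    assert(got == want);
  }
  puts("ok");
  return 0;
}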