From 29cf0f40d329cde87ed9cba1e2717701065ae9b1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 5 Mar 2025 17:01:51 +0100 Subject: [PATCH] nofl space: Rework treatment of mark bits to avoid masks This will allow us to free up some metadata bits. --- src/nofl-space.h | 94 ++++++++++++++++++++++++++---------------------- src/swar.h | 50 ++++++++++++++++++++++++-- 2 files changed, 100 insertions(+), 44 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 47c352154..e13572653 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -150,8 +150,8 @@ struct nofl_block_stack { #define NOFL_PAGE_OUT_QUEUE_SIZE 4 struct nofl_space { - uint8_t live_mask; - uint8_t marked_mask; + uint8_t current_mark; + uint8_t survivor_mark; uint8_t evacuating; struct extents *extents; size_t heap_size; @@ -249,10 +249,17 @@ enum nofl_metadata_byte { }; static uint8_t -nofl_rotate_dead_survivor_marked(uint8_t mask) { - uint8_t all = - NOFL_METADATA_BYTE_MARK_0 | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; - return ((mask << 1) | (mask >> 2)) & all; +nofl_advance_current_mark(uint8_t mark) { + switch (mark) { + case NOFL_METADATA_BYTE_MARK_0: + return NOFL_METADATA_BYTE_MARK_1; + case NOFL_METADATA_BYTE_MARK_1: + return NOFL_METADATA_BYTE_MARK_2; + case NOFL_METADATA_BYTE_MARK_2: + return NOFL_METADATA_BYTE_MARK_0; + default: + GC_CRASH(); + } } static struct gc_lock @@ -702,12 +709,23 @@ nofl_allocator_finish_hole(struct nofl_allocator *alloc) { } } +static inline int +nofl_metadata_byte_has_mark(uint8_t byte, uint8_t marked) { + return (byte & NOFL_METADATA_BYTE_MARK_MASK) == marked; +} + +static inline int +nofl_metadata_byte_is_young_or_has_mark(uint8_t byte, uint8_t marked) { + return (nofl_metadata_byte_has_mark(byte, NOFL_METADATA_BYTE_YOUNG) + || nofl_metadata_byte_has_mark(byte, marked)); +} + // Sweep some heap to reclaim free space, advancing alloc->alloc and // alloc->sweep. Return the size of the hole in granules, or 0 if we // reached the end of the block. static size_t nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, - uint8_t live_mask) { + uint8_t survivor_mark) { GC_ASSERT(nofl_allocator_has_block(alloc)); GC_ASSERT_EQ(alloc->alloc, alloc->sweep); uintptr_t sweep = alloc->sweep; @@ -724,7 +742,8 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, // right after a hole, which can point to either the end of the // block or to a live object. Assume that a live object is more // common. - while (limit_granules && (metadata[0] & live_mask)) { + while (limit_granules && + nofl_metadata_byte_has_mark(metadata[0], survivor_mark)) { // Object survived collection; skip over it and continue sweeping. 
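To make the new fields concrete: current_mark is the value the ongoing trace writes into the metadata byte, while survivor_mark is the value that objects kept alive by the previous cycle still carry, so sweeping and tracing now compare the same low field against a single value each instead of testing bit masks. A minimal standalone sketch of the rotation and the predicate, using hypothetical stand-in values for the real NOFL_METADATA_BYTE_* constants:

/* Standalone sketch, not the real definitions: MARK_0/1/2 and MARK_MASK below
   are illustrative stand-ins for the NOFL_METADATA_BYTE_* constants. */
#include <assert.h>
#include <stdint.h>

enum { MARK_0 = 2, MARK_1 = 3, MARK_2 = 4, MARK_MASK = 7 };  /* hypothetical values */

static uint8_t advance_mark(uint8_t mark) {
  /* Same rotation as nofl_advance_current_mark: MARK_0 -> MARK_1 -> MARK_2 -> MARK_0. */
  switch (mark) {
  case MARK_0: return MARK_1;
  case MARK_1: return MARK_2;
  case MARK_2: return MARK_0;
  default: assert(0); return 0;
  }
}

static int has_mark(uint8_t byte, uint8_t mark) {
  /* The whole low field is compared against one value; no bit masks involved. */
  return (byte & MARK_MASK) == mark;
}

int main(void) {
  uint8_t survivor = MARK_0;                /* mark written by the previous cycle */
  uint8_t current = advance_mark(survivor); /* mark the new trace will write */
  uint8_t byte = MARK_0;                    /* object marked last cycle */
  assert(has_mark(byte, survivor));         /* sweeper: still live, skip over it */
  assert(!has_mark(byte, current));         /* tracer: not yet marked this cycle */
  return 0;
}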
size_t object_granules = nofl_space_live_object_granules(metadata); sweep += object_granules * NOFL_GRANULE_SIZE; @@ -737,8 +756,9 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, return 0; } - size_t hole_granules = scan_for_byte_with_bits(metadata, limit_granules, - live_mask); + size_t hole_granules = scan_for_byte_with_tag(metadata, limit_granules, + NOFL_METADATA_BYTE_MARK_MASK, + survivor_mark); size_t free_bytes = hole_granules * NOFL_GRANULE_SIZE; GC_ASSERT(hole_granules); GC_ASSERT(hole_granules <= limit_granules); @@ -758,10 +778,10 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, static void nofl_allocator_finish_sweeping_in_block(struct nofl_allocator *alloc, - uint8_t live_mask) { + uint8_t survivor_mark) { do { nofl_allocator_finish_hole(alloc); - } while (nofl_allocator_next_hole_in_block(alloc, live_mask)); + } while (nofl_allocator_next_hole_in_block(alloc, survivor_mark)); } static void @@ -775,7 +795,7 @@ nofl_allocator_release_block(struct nofl_allocator *alloc, } else if (space->evacuating) { nofl_allocator_release_full_evacuation_target(alloc, space); } else { - nofl_allocator_finish_sweeping_in_block(alloc, space->live_mask); + nofl_allocator_finish_sweeping_in_block(alloc, space->survivor_mark); nofl_allocator_release_full_block(alloc, space); } } @@ -805,7 +825,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, // Sweep current block for a hole. if (nofl_allocator_has_block(alloc)) { size_t granules = - nofl_allocator_next_hole_in_block(alloc, space->live_mask); + nofl_allocator_next_hole_in_block(alloc, space->survivor_mark); if (granules) return granules; else @@ -823,7 +843,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, alloc->block.summary->holes_with_fragmentation = 0; alloc->block.summary->fragmentation_granules = 0; size_t granules = - nofl_allocator_next_hole_in_block(alloc, space->live_mask); + nofl_allocator_next_hole_in_block(alloc, space->survivor_mark); if (granules) return granules; nofl_allocator_release_full_block(alloc, space); @@ -1130,16 +1150,6 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { } } -static void -nofl_space_update_mark_patterns(struct nofl_space *space, - int advance_mark_mask) { - uint8_t survivor_mask = space->marked_mask; - uint8_t next_marked_mask = nofl_rotate_dead_survivor_marked(survivor_mask); - if (advance_mark_mask) - space->marked_mask = next_marked_mask; - space->live_mask = survivor_mask | next_marked_mask; -} - static void nofl_space_clear_block_marks(struct nofl_space *space) { for (size_t s = 0; s < space->nslabs; s++) { @@ -1152,7 +1162,7 @@ static void nofl_space_prepare_gc(struct nofl_space *space, enum gc_collection_kind kind) { int is_minor = kind == GC_COLLECTION_MINOR; if (!is_minor) { - nofl_space_update_mark_patterns(space, 1); + space->current_mark = nofl_advance_current_mark(space->current_mark); nofl_space_clear_block_marks(space); } } @@ -1209,7 +1219,7 @@ nofl_space_promote_blocks(struct nofl_space *space) { block.summary->holes_with_fragmentation = 0; block.summary->fragmentation_granules = 0; struct nofl_allocator alloc = { block.addr, block.addr, block }; - nofl_allocator_finish_sweeping_in_block(&alloc, space->live_mask); + nofl_allocator_finish_sweeping_in_block(&alloc, space->current_mark); atomic_fetch_add(&space->old_generation_granules, NOFL_GRANULES_PER_BLOCK - block.summary->hole_granules); nofl_block_list_push(&space->old, block); @@ -1238,7 +1248,7 @@ nofl_space_verify_sweepable_blocks(struct nofl_space *space, uintptr_t 
limit = addr + NOFL_BLOCK_SIZE; uint8_t *meta = nofl_metadata_byte_for_addr(b.addr); while (addr < limit) { - if (meta[0] & space->live_mask) { + if (nofl_metadata_byte_has_mark(meta[0], space->current_mark)) { struct gc_ref obj = gc_ref(addr); size_t obj_bytes; gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); @@ -1275,8 +1285,7 @@ nofl_space_verify_swept_blocks(struct nofl_space *space, uint8_t *meta = nofl_metadata_byte_for_addr(addr); while (addr < limit) { if (meta[0]) { - GC_ASSERT(meta[0] & space->marked_mask); - GC_ASSERT_EQ(meta[0] & ~(space->marked_mask | NOFL_METADATA_BYTE_END), 0); + GC_ASSERT(nofl_metadata_byte_has_mark(meta[0], space->current_mark)); struct gc_ref obj = gc_ref(addr); size_t obj_bytes; gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); @@ -1381,7 +1390,7 @@ nofl_space_finish_gc(struct nofl_space *space, gc_lock_release(&lock); nofl_space_promote_blocks(space); nofl_space_reset_statistics(space); - nofl_space_update_mark_patterns(space, 0); + space->survivor_mark = space->current_mark; if (GC_DEBUG) nofl_space_verify_before_restart(space); } @@ -1426,7 +1435,7 @@ nofl_space_set_mark_relaxed(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { uint8_t mask = NOFL_METADATA_BYTE_MARK_MASK; atomic_store_explicit(metadata, - (byte & ~mask) | space->marked_mask, + (byte & ~mask) | space->current_mark, memory_order_relaxed); return 1; } @@ -1435,7 +1444,7 @@ static inline int nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { uint8_t mask = NOFL_METADATA_BYTE_MARK_MASK; atomic_store_explicit(metadata, - (byte & ~mask) | space->marked_mask, + (byte & ~mask) | space->current_mark, memory_order_release); return 1; } @@ -1515,7 +1524,7 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, // First check again if someone else tried to evacuate this object and ended // up marking in place instead. byte = atomic_load_explicit(metadata, memory_order_acquire); - if (byte & space->marked_mask) { + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) { // Indeed, already marked in place. gc_atomic_forward_abort(&fwd); return 0; @@ -1581,7 +1590,7 @@ nofl_space_evacuate_or_mark_object(struct nofl_space *space, struct nofl_allocator *evacuate) { uint8_t *metadata = nofl_metadata_byte_for_object(old_ref); uint8_t byte = *metadata; - if (byte & space->marked_mask) + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) return 0; if (nofl_space_should_evacuate(space, byte, old_ref)) @@ -1626,7 +1635,7 @@ nofl_space_forward_or_mark_if_traced(struct nofl_space *space, struct gc_ref ref) { uint8_t *metadata = nofl_metadata_byte_for_object(ref); uint8_t byte = *metadata; - if (byte & space->marked_mask) + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) return 1; if (!nofl_space_should_evacuate(space, byte, ref)) @@ -1663,13 +1672,12 @@ nofl_space_mark_conservative_ref(struct nofl_space *space, uint8_t byte = atomic_load_explicit(loc, memory_order_relaxed); // Already marked object? Nothing to do. - if (byte & space->marked_mask) + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) return gc_ref_null(); // Addr is the not start of an unmarked object? Search backwards if // we have interior pointers, otherwise not an object. 
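One detail of the marking fast path worth spelling out: writing a mark replaces only the low young/mark field, so unrelated metadata bits survive the store, and when collection finishes the survivor mark simply catches up with the current mark. A brief sketch, again using hypothetical stand-in values rather than the real NOFL_METADATA_BYTE_* constants:

/* Sketch only; MARK_MASK, MARK_0/1 and BYTE_END are illustrative stand-ins. */
#include <assert.h>
#include <stdint.h>

enum { MARK_MASK = 7, MARK_0 = 2, MARK_1 = 3, BYTE_END = 16 };

static uint8_t set_mark(uint8_t byte, uint8_t current_mark) {
  /* Clear the young/mark field, keep every other metadata bit, then install
     the current cycle's mark -- the same shape as nofl_space_set_mark. */
  return (uint8_t)((byte & ~MARK_MASK) | current_mark);
}

int main(void) {
  uint8_t survivor_mark = MARK_0, current_mark = MARK_1;
  uint8_t byte = MARK_0 | BYTE_END;      /* survivor from last cycle, END bit set */

  byte = set_mark(byte, current_mark);   /* traced this cycle */
  assert((byte & MARK_MASK) == current_mark);
  assert(byte & BYTE_END);               /* unrelated bits survive the store */

  survivor_mark = current_mark;          /* what nofl_space_finish_gc does, so the
                                            next sweep treats this mark as live */
  assert((byte & MARK_MASK) == survivor_mark);
  return 0;
}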
- uint8_t object_start_mask = space->live_mask | NOFL_METADATA_BYTE_YOUNG; - if (!(byte & object_start_mask)) { + if (!nofl_metadata_byte_is_young_or_has_mark(byte, space->survivor_mark)) { if (!possibly_interior) return gc_ref_null(); @@ -1685,9 +1693,12 @@ nofl_space_mark_conservative_ref(struct nofl_space *space, // Ran into the end of some other allocation? Not an object, then. if (byte & NOFL_METADATA_BYTE_END) return gc_ref_null(); + // Object already marked? Nothing to do. + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) + return gc_ref_null(); // Continue until we find object start. - } while (!(byte & object_start_mask)); + } while (!nofl_metadata_byte_is_young_or_has_mark(byte, space->survivor_mark)); // Found object start, and object is unmarked; adjust addr. addr = block_base + (loc - loc_base) * NOFL_GRANULE_SIZE; @@ -1842,8 +1853,7 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, if (!slabs) return 0; - space->marked_mask = NOFL_METADATA_BYTE_MARK_0; - nofl_space_update_mark_patterns(space, 0); + space->current_mark = space->survivor_mark = NOFL_METADATA_BYTE_MARK_0; space->extents = extents_allocate(10); nofl_space_add_slabs(space, slabs, nslabs); pthread_mutex_init(&space->lock, NULL); diff --git a/src/swar.h b/src/swar.h index e516ed83f..d8598c8b5 100644 --- a/src/swar.h +++ b/src/swar.h @@ -31,7 +31,7 @@ match_bytes_against_bits(uint64_t bytes, uint8_t mask) { return bytes & broadcast_byte(mask); } -static size_t +static inline size_t scan_for_byte_with_bits(uint8_t *ptr, size_t limit, uint8_t mask) { size_t n = 0; size_t unaligned = ((uintptr_t) ptr) & 7; @@ -53,6 +53,52 @@ scan_for_byte_with_bits(uint8_t *ptr, size_t limit, uint8_t mask) { return limit; } +static inline uint64_t +match_bytes_against_tag(uint64_t bytes, uint8_t mask, uint8_t tag) { + // Precondition: tag within mask. + GC_ASSERT_EQ(tag & mask, tag); + // Precondition: high bit of mask byte is empty, so that we can add without + // overflow. + GC_ASSERT_EQ(mask & 0x7f, mask); + // Precondition: mask is low bits of byte. + GC_ASSERT(mask); + GC_ASSERT_EQ(mask & (mask + 1), 0); + + uint64_t vmask = broadcast_byte(mask); + uint64_t vtest = broadcast_byte(mask + 1); + uint64_t vtag = broadcast_byte(tag); + + bytes &= vmask; + uint64_t m = (bytes ^ vtag) + vmask; + return (m & vtest) ^ vtest; +} + +static inline size_t +scan_for_byte_with_tag(uint8_t *ptr, size_t limit, uint8_t mask, uint8_t tag) { + // The way we handle unaligned reads by padding high bytes with zeroes assumes + // that all-zeroes is not a matching byte. 
+ GC_ASSERT(tag); + + size_t n = 0; + size_t unaligned = ((uintptr_t) ptr) & 7; + if (unaligned) { + uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8); + uint64_t match = match_bytes_against_tag(bytes, mask, tag); + if (match) + return count_zero_bytes(match); + n += 8 - unaligned; + } + + for(; n < limit; n += 8) { + uint64_t bytes = load_eight_aligned_bytes(ptr + n); + uint64_t match = match_bytes_against_tag(bytes, mask, tag); + if (match) + return n + count_zero_bytes(match); + } + + return limit; +} + static inline uint64_t match_bytes_against_2_tags(uint64_t bytes, uint8_t mask, uint8_t tag1, uint8_t tag2) @@ -78,7 +124,7 @@ match_bytes_against_2_tags(uint64_t bytes, uint8_t mask, uint8_t tag1, return ((m1 & m2) & vtest) ^ vtest; } -static size_t +static inline size_t scan_for_byte_with_tags(uint8_t *ptr, size_t limit, uint8_t mask, uint8_t tag1, uint8_t tag2) { // The way we handle unaligned reads by padding high bytes with zeroes assumes
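The addition trick in match_bytes_against_tag deserves a worked example. XORing a lane with the tag yields zero exactly when the masked field equals the tag; adding the all-low-bits mask then carries into the "test" bit only in non-matching lanes; masking with the test bit and flipping leaves a nonzero byte precisely at the matches, which scan_for_byte_with_tag turns into a granule index via count_zero_bytes. Below is a self-contained cross-check of that reasoning against a naive per-byte loop; it is a sketch with an illustrative mask and tag, and broadcast_byte mirrors the helper already in swar.h.

/* Self-contained cross-check of the per-lane tag match; sketch only. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t broadcast_byte(uint8_t b) {
  return (uint64_t)b * 0x0101010101010101ULL;
}

/* Same trick as match_bytes_against_tag in swar.h. */
static uint64_t match_tag(uint64_t bytes, uint8_t mask, uint8_t tag) {
  uint64_t vmask = broadcast_byte(mask);
  uint64_t vtest = broadcast_byte(mask + 1);
  uint64_t vtag = broadcast_byte(tag);
  bytes &= vmask;                       /* keep only the low field of each lane */
  uint64_t m = (bytes ^ vtag) + vmask;  /* carries into the test bit iff lane != tag */
  return (m & vtest) ^ vtest;           /* test bit set exactly in matching lanes */
}

int main(void) {
  const uint8_t mask = 7, tag = 3;      /* illustrative field width and tag value */
  uint64_t words[] = { 0x0001020304050607ULL, 0x1312111043837261ULL,
                       0xffffffffffffffffULL, 0x0000000000000000ULL };
  for (size_t w = 0; w < sizeof words / sizeof words[0]; w++) {
    uint64_t match = match_tag(words[w], mask, tag);
    for (int i = 0; i < 8; i++) {
      uint8_t lane = (words[w] >> (i * 8)) & 0xff;
      int expected = (lane & mask) == tag;       /* naive per-byte check */
      int actual = ((match >> (i * 8)) & 0xff) != 0;
      assert(expected == actual);
    }
  }
  printf("ok\n");
  return 0;
}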