
Merged Whippet into libguile/whippet

Andy Wingo 2025-04-11 14:10:41 +02:00
commit db181e67ff
112 changed files with 18115 additions and 0 deletions

16
libguile/whippet/.gitignore vendored Normal file
View file

@@ -0,0 +1,16 @@
/*.o
/*.bdw
/*.semi
/*.mmc
/*.generational-mmc
/*.parallel-mmc
/*.parallel-generational-mmc
/*.stack-conservative-mmc
/*.stack-conservative-generational-mmc
/*.stack-conservative-parallel-mmc
/*.stack-conservative-parallel-generational-mmc
/*.heap-conservative-mmc
/*.heap-conservative-generational-mmc
/*.heap-conservative-parallel-mmc
/*.heap-conservative-parallel-generational-mmc
/.deps/

141
libguile/whippet/Makefile Normal file
View file

@@ -0,0 +1,141 @@
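# Build all benchmark binaries with `make`; build a single configuration
# with e.g. `make bin/mt-gcbench.pcc`, and pick a build flavor with
# BUILD=opt|optdebug|debug (default: opt).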
TESTS = quads mt-gcbench ephemerons finalizers
COLLECTORS = \
bdw \
semi \
\
pcc \
generational-pcc \
\
mmc \
stack-conservative-mmc \
heap-conservative-mmc \
\
parallel-mmc \
stack-conservative-parallel-mmc \
heap-conservative-parallel-mmc \
\
generational-mmc \
stack-conservative-generational-mmc \
heap-conservative-generational-mmc \
\
parallel-generational-mmc \
stack-conservative-parallel-generational-mmc \
heap-conservative-parallel-generational-mmc
DEFAULT_BUILD := opt
BUILD_CFLAGS_opt = -O2 -g -DNDEBUG
BUILD_CFLAGS_optdebug = -Og -g -DGC_DEBUG=1
BUILD_CFLAGS_debug = -O0 -g -DGC_DEBUG=1
BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD)))
USE_LTTNG_0 :=
USE_LTTNG_1 := 1
USE_LTTNG := $(shell pkg-config --exists lttng-ust && echo 1 || echo 0)
LTTNG_CPPFLAGS := $(if $(USE_LTTNG_$(USE_LTTNG)), $(shell pkg-config --cflags lttng-ust),)
LTTNG_LIBS := $(if $(USE_LTTNG_$(USE_LTTNG)), $(shell pkg-config --libs lttng-ust),)
TRACEPOINT_CPPFLAGS = $(if $(USE_LTTNG_$(USE_LTTNG)),$(LTTNG_CPPFLAGS) -DGC_TRACEPOINT_LTTNG=1,)
TRACEPOINT_LIBS = $(LTTNG_LIBS)
CC = gcc
CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS)
CPPFLAGS = -Iapi $(TRACEPOINT_CPPFLAGS)
LDFLAGS = -lpthread -flto=auto $(TRACEPOINT_LIBS)
DEPFLAGS = -MMD -MP -MF $(@:obj/%.o=.deps/%.d)
COMPILE = $(CC) $(CFLAGS) $(CPPFLAGS) $(DEPFLAGS) -o $@
LINK = $(CC) $(LDFLAGS) -o $@
PLATFORM = gnu-linux
ALL_TESTS = $(foreach COLLECTOR,$(COLLECTORS),$(addsuffix .$(COLLECTOR),$(TESTS)))
all: $(ALL_TESTS:%=bin/%)
.deps obj bin: ; mkdir -p $@
include $(wildcard .deps/*)
obj/gc-platform.o: src/gc-platform-$(PLATFORM).c | .deps obj
	$(COMPILE) -c $<
obj/gc-stack.o: src/gc-stack.c | .deps obj
	$(COMPILE) -c $<
obj/gc-options.o: src/gc-options.c | .deps obj
	$(COMPILE) -c $<
obj/gc-tracepoint.o: src/gc-tracepoint.c | .deps obj
	$(COMPILE) -c $<
obj/%.gc-ephemeron.o: src/gc-ephemeron.c | .deps obj
	$(COMPILE) -include benchmarks/$*-embedder.h -c $<
obj/%.gc-finalizer.o: src/gc-finalizer.c | .deps obj
	$(COMPILE) -include benchmarks/$*-embedder.h -c $<
GC_STEM_bdw = bdw
GC_CFLAGS_bdw = -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1
GC_IMPL_CFLAGS_bdw = `pkg-config --cflags bdw-gc`
GC_LIBS_bdw = `pkg-config --libs bdw-gc`
GC_STEM_semi = semi
GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1
GC_LIBS_semi = -lm
GC_STEM_pcc = pcc
GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1
GC_LIBS_pcc = -lm
GC_STEM_generational_pcc = $(GC_STEM_pcc)
GC_CFLAGS_generational_pcc = $(GC_CFLAGS_pcc) -DGC_GENERATIONAL=1
GC_LIBS_generational_pcc = $(GC_LIBS_pcc)
define mmc_variant
GC_STEM_$(1) = mmc
GC_CFLAGS_$(1) = $(2)
GC_LIBS_$(1) = -lm
endef
define generational_mmc_variants
$(call mmc_variant,$(1)mmc,$(2))
$(call mmc_variant,$(1)generational_mmc,$(2) -DGC_GENERATIONAL=1)
endef
define parallel_mmc_variants
$(call generational_mmc_variants,$(1),$(2))
$(call generational_mmc_variants,$(1)parallel_,$(2) -DGC_PARALLEL=1)
endef
define trace_mmc_variants
$(call parallel_mmc_variants,,-DGC_PRECISE_ROOTS=1)
$(call parallel_mmc_variants,stack_conservative_,-DGC_CONSERVATIVE_ROOTS=1)
$(call parallel_mmc_variants,heap_conservative_,-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)
endef
$(eval $(call trace_mmc_variants))
# $(1) is the benchmark, $(2) is the collector configuration
make_gc_var = $$($(1)$(subst -,_,$(2)))
gc_impl = $(call make_gc_var,GC_STEM_,$(1)).c
gc_attrs = $(call make_gc_var,GC_STEM_,$(1))-attrs.h
gc_cflags = $(call make_gc_var,GC_CFLAGS_,$(1))
gc_impl_cflags = $(call make_gc_var,GC_IMPL_CFLAGS_,$(1))
gc_libs = $(call make_gc_var,GC_LIBS_,$(1))
define benchmark_template
obj/$(1).$(2).gc.o: src/$(call gc_impl,$(2)) | .deps obj
	$$(COMPILE) $(call gc_cflags,$(2)) $(call gc_impl_cflags,$(2)) -include benchmarks/$(1)-embedder.h -c $$<
obj/$(1).$(2).o: benchmarks/$(1).c | .deps obj
	$$(COMPILE) $(call gc_cflags,$(2)) -include api/$(call gc_attrs,$(2)) -c $$<
bin/$(1).$(2): obj/$(1).$(2).gc.o obj/$(1).$(2).o obj/gc-stack.o obj/gc-options.o obj/gc-platform.o obj/gc-tracepoint.o obj/$(1).gc-ephemeron.o obj/$(1).gc-finalizer.o | bin
	$$(LINK) $$^ $(call gc_libs,$(2))
endef
$(foreach BENCHMARK,$(TESTS),\
$(foreach COLLECTOR,$(COLLECTORS),\
$(eval $(call benchmark_template,$(BENCHMARK),$(COLLECTOR)))))
.PRECIOUS: $(ALL_TESTS) $(OBJS)
clean:
	rm -f $(ALL_TESTS)
	rm -rf .deps obj bin
# Clear some of the default rules.
.SUFFIXES:
.SECONDARY:
%.c:;
Makefile:;

View file

@@ -0,0 +1,91 @@
# Whippet Garbage Collector
This repository is for development of Whippet, a new garbage collector
implementation, eventually for use in [Guile
Scheme](https://gnu.org/s/guile).
Whippet is an embed-only C library, designed to be copied into a
program's source tree. It exposes an abstract C API for managed memory
allocation, and provides a number of implementations of that API.
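For a rough feel for the user-facing side, a minimal embedder sketch might look
like the following. It is illustrative only: it assumes the embedder has
already implemented the hooks in `gc-embedder-api.h`, selected a collector and
its attrs header at build time, and that `gc_init` returns nonzero on success;
see the manual for the real story.
```
#include <stdio.h>
#include "gc-api.h"
#include "gc-basic-stats.h"

/* Sketch only; error handling and embedder hooks omitted. */
static void* run(struct gc_stack_addr *stack_base, void *arg) {
  (void)arg;
  struct gc_options *options = gc_allocate_options();
  gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED);
  gc_options_set_size(options, GC_OPTION_HEAP_SIZE, 64 * 1024 * 1024);

  struct gc_heap *heap;
  struct gc_mutator *mut;
  struct gc_basic_stats stats;
  if (!gc_init(options, stack_base, &heap, &mut, GC_BASIC_STATS, &stats))
    return NULL;

  /* Allocate a 32-byte object; its fields are traced via the embedder's
     gc_trace_object. */
  void *obj = gc_allocate(mut, 32, GC_ALLOCATION_TAGGED);

  gc_basic_stats_finish(&stats);
  gc_basic_stats_print(&stats, stdout);
  return obj;
}

int main(void) {
  gc_call_with_stack_addr(run, NULL);
  return 0;
}
```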
## Documentation
See the [documentation](./doc/README.md).
## Features
- Per-object pinning (with `mmc` collectors)
- Finalization (supporting resuscitation)
- Ephemerons (except on `bdw`, which has a polyfill)
- Conservative roots (optionally with `mmc` or always with `bdw`)
- Precise roots (optionally with `mmc` or always with `semi` / `pcc`)
- Precise embedder-parameterized heap tracing (except with `bdw`)
- Conservative heap tracing (optionally with `mmc`, always with `bdw`)
- Parallel tracing (except `semi`)
- Parallel mutators (except `semi`)
- Inline allocation / write barrier fast paths (supporting JIT; see the
  sketch after this list)
- One unified API with no-overhead abstraction: switch collectors when
you like
- Three policies for sizing heaps: fixed, proportional to live size, and
[MemBalancer](http://marisa.moe/balancer.html)
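The write-barrier fast path is intended to be inlined at reference stores; a
sketch of its use follows, where the `pair` layout and setter are hypothetical
and not part of the API:
```
#include "gc-api.h"

struct pair { struct gc_ref car, cdr; };  /* hypothetical embedder object */

static void pair_set_car(struct gc_mutator *mut, struct pair *p,
                         struct gc_ref new_val) {
  /* Let the collector note the store of new_val into p->car; the inline
     fast path decides whether the out-of-line barrier must run. */
  gc_write_barrier(mut, gc_ref_from_heap_object(p), sizeof *p,
                   gc_edge(&p->car), new_val);
  p->car = new_val;
}
```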
## Source repository structure
* [api/](./api/): The user-facing API. Also, the "embedder API"; see
the [manual](./doc/manual.md) for more.
* [doc/](./doc/): Documentation, such as it is.
* [src/](./src/): The actual GC implementation, containing a number of
collector implementations. The embedder chooses which collector to
use at compile-time. See the [documentation](./doc/collectors.md)
for more on the different collectors (`semi`, `bdw`, `pcc`, and the
different flavors of `mmc`).
* [benchmarks/](./benchmarks/): Benchmarks. A work in progress.
* [test/](./test/): A dusty attic of minimal testing.
## Status and roadmap
As of January 2025, Whippet is good to go! Of course there will surely
be new features to build as Whippet gets integrated into language
run-times, but the basics are there.
The next phase on the roadmap is support for tracing, and
some performance noodling.
Once that is done, the big task is integrating Whippet into the [Guile
Scheme](https://gnu.org/s/guile) language run-time, replacing BDW-GC.
Fingers crossed!
## About the name
It sounds better than WIP (work-in-progress) garbage collector, doesn't
it? Also apparently a whippet is a kind of dog that is fast for its
size. It would be nice if the Whippet collectors turn out to have this
property.
## License
```
Copyright (c) 2022-2024 Andy Wingo
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
```
Note that some benchmarks have other licenses; see
[`benchmarks/README.md`](./benchmarks/README.md) for more.

View file

@@ -0,0 +1,91 @@
#ifndef BDW_ATTRS_H
#define BDW_ATTRS_H
#include "gc-attrs.h"
#include "gc-assert.h"
static inline enum gc_allocator_kind gc_allocator_kind(void) {
return GC_ALLOCATOR_INLINE_FREELIST;
}
static inline size_t gc_allocator_small_granule_size(void) {
return 2 * sizeof(void *);
}
static inline size_t gc_allocator_large_threshold(void) {
return 256;
}
static inline size_t gc_allocator_allocation_pointer_offset(void) {
GC_CRASH();
}
static inline size_t gc_allocator_allocation_limit_offset(void) {
GC_CRASH();
}
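// Byte offsets into the mutator of the inline freelist heads used by the
// fast path: one bucket per small-granule size class up to the large
// threshold, with the pointerless kinds in a second run of buckets after
// the pointer-bearing ones.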
static inline size_t gc_allocator_freelist_offset(size_t size,
enum gc_allocation_kind kind) {
GC_ASSERT(size);
size_t base;
switch (kind) {
case GC_ALLOCATION_TAGGED:
case GC_ALLOCATION_UNTAGGED_CONSERVATIVE:
base = 0;
break;
case GC_ALLOCATION_UNTAGGED_POINTERLESS:
case GC_ALLOCATION_TAGGED_POINTERLESS:
base = (sizeof(void*) * gc_allocator_large_threshold() /
gc_allocator_small_granule_size());
break;
}
size_t bucket = (size - 1) / gc_allocator_small_granule_size();
return base + sizeof(void*) * bucket;
}
static inline size_t gc_allocator_alloc_table_alignment(void) {
return 0;
}
static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind) {
GC_CRASH();
}
static inline uint8_t gc_allocator_alloc_table_end_pattern(void) {
GC_CRASH();
}
static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t) {
return GC_OLD_GENERATION_CHECK_NONE;
}
static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) {
GC_CRASH();
}
static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) {
GC_CRASH();
}
static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t) {
return GC_WRITE_BARRIER_NONE;
}
static inline size_t gc_write_barrier_field_table_alignment(void) {
GC_CRASH();
}
static inline ptrdiff_t gc_write_barrier_field_table_offset(void) {
GC_CRASH();
}
static inline size_t gc_write_barrier_field_fields_per_byte(void) {
GC_CRASH();
}
static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) {
GC_CRASH();
}
static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) {
return GC_SAFEPOINT_MECHANISM_SIGNAL;
}
static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) {
return GC_COOPERATIVE_SAFEPOINT_NONE;
}
static inline int gc_can_pin_objects(void) {
return 1;
}
#endif // BDW_ATTRS_H

View file

@@ -0,0 +1,19 @@
#ifndef GC_ALLOCATION_KIND_H
#define GC_ALLOCATION_KIND_H
enum gc_allocation_kind {
// An object whose type can be inspected at run-time based on its contents,
// and whose fields can be traced via the gc_trace_object procedure.
GC_ALLOCATION_TAGGED,
// Like GC_ALLOCATION_TAGGED, but not containing any fields that reference
// GC-managed objects. The GC may choose to handle these specially.
GC_ALLOCATION_TAGGED_POINTERLESS,
// A raw allocation whose type cannot be inspected at trace-time, and whose
// fields should be traced conservatively.
GC_ALLOCATION_UNTAGGED_CONSERVATIVE,
// A raw allocation whose type cannot be inspected at trace-time, but
// containing no fields that reference GC-managed objects.
GC_ALLOCATION_UNTAGGED_POINTERLESS
};
#endif // GC_ALLOCATION_KIND_H

View file

@@ -0,0 +1,301 @@
#ifndef GC_API_H_
#define GC_API_H_
#include "gc-config.h"
#include "gc-allocation-kind.h"
#include "gc-assert.h"
#include "gc-attrs.h"
#include "gc-collection-kind.h"
#include "gc-edge.h"
#include "gc-event-listener.h"
#include "gc-inline.h"
#include "gc-options.h"
#include "gc-ref.h"
#include "gc-visibility.h"
#include <stdatomic.h>
#include <stdint.h>
#include <string.h>
struct gc_heap;
struct gc_mutator;
struct gc_stack_addr;
GC_API_ void* gc_call_with_stack_addr(void* (*f)(struct gc_stack_addr *,
void *),
void *data) GC_NEVER_INLINE;
GC_API_ int gc_init(const struct gc_options *options,
struct gc_stack_addr *base, struct gc_heap **heap,
struct gc_mutator **mutator,
struct gc_event_listener event_listener,
void *event_listener_data);
GC_API_ uint64_t gc_allocation_counter(struct gc_heap *heap);
GC_API_ struct gc_heap* gc_mutator_heap(struct gc_mutator *mut);
GC_API_ uintptr_t gc_small_object_nursery_low_address(struct gc_heap *heap);
GC_API_ uintptr_t gc_small_object_nursery_high_address(struct gc_heap *heap);
struct gc_mutator_roots;
GC_API_ void gc_mutator_set_roots(struct gc_mutator *mut,
struct gc_mutator_roots *roots);
struct gc_heap_roots;
GC_API_ void gc_heap_set_roots(struct gc_heap *heap,
struct gc_heap_roots *roots);
struct gc_extern_space;
GC_API_ void gc_heap_set_extern_space(struct gc_heap *heap,
struct gc_extern_space *space);
GC_API_ struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *base,
struct gc_heap *heap);
GC_API_ void gc_finish_for_thread(struct gc_mutator *mut);
GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*),
void *data) GC_NEVER_INLINE;
GC_API_ void gc_collect(struct gc_mutator *mut,
enum gc_collection_kind requested_kind);
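// When the collector exposes a side allocation table (nonzero
// gc_allocator_alloc_table_alignment()), mark the granules covered by a
// fresh allocation: the begin pattern on the first granule and, if an end
// pattern is defined, the end pattern on the last. Otherwise this is a
// no-op.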
static inline void gc_update_alloc_table(struct gc_ref obj, size_t size,
enum gc_allocation_kind kind) GC_ALWAYS_INLINE;
static inline void gc_update_alloc_table(struct gc_ref obj, size_t size,
enum gc_allocation_kind kind) {
size_t alignment = gc_allocator_alloc_table_alignment();
if (!alignment) return;
uintptr_t addr = gc_ref_value(obj);
uintptr_t base = addr & ~(alignment - 1);
size_t granule_size = gc_allocator_small_granule_size();
uintptr_t granule = (addr & (alignment - 1)) / granule_size;
uint8_t *alloc = (uint8_t*)(base + granule);
uint8_t begin_pattern = gc_allocator_alloc_table_begin_pattern(kind);
uint8_t end_pattern = gc_allocator_alloc_table_end_pattern();
if (end_pattern) {
size_t granules = size / granule_size;
if (granules == 1) {
alloc[0] = begin_pattern | end_pattern;
} else {
alloc[0] = begin_pattern;
if (granules > 2)
memset(alloc + 1, 0, granules - 2);
alloc[granules - 1] = end_pattern;
}
} else {
alloc[0] = begin_pattern;
}
}
GC_API_ void* gc_allocate_slow(struct gc_mutator *mut, size_t bytes,
enum gc_allocation_kind kind) GC_NEVER_INLINE;
static inline void*
gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, size_t size,
enum gc_allocation_kind kind) GC_ALWAYS_INLINE;
static inline void* gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut,
size_t size,
enum gc_allocation_kind kind) {
GC_ASSERT(size <= gc_allocator_large_threshold());
size_t granule_size = gc_allocator_small_granule_size();
size_t hp_offset = gc_allocator_allocation_pointer_offset();
size_t limit_offset = gc_allocator_allocation_limit_offset();
uintptr_t base_addr = (uintptr_t)mut;
uintptr_t *hp_loc = (uintptr_t*)(base_addr + hp_offset);
uintptr_t *limit_loc = (uintptr_t*)(base_addr + limit_offset);
size = (size + granule_size - 1) & ~(granule_size - 1);
uintptr_t hp = *hp_loc;
uintptr_t limit = *limit_loc;
uintptr_t new_hp = hp + size;
if (GC_UNLIKELY (new_hp > limit))
return NULL;
*hp_loc = new_hp;
gc_update_alloc_table(gc_ref(hp), size, kind);
return (void*)hp;
}
static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut,
size_t size,
enum gc_allocation_kind kind) GC_ALWAYS_INLINE;
static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut,
size_t size,
enum gc_allocation_kind kind) {
GC_ASSERT(size <= gc_allocator_large_threshold());
size_t freelist_offset = gc_allocator_freelist_offset(size, kind);
uintptr_t base_addr = (uintptr_t)mut;
void **freelist_loc = (void**)(base_addr + freelist_offset);
void *head = *freelist_loc;
if (GC_UNLIKELY(!head))
return NULL;
*freelist_loc = *(void**)head;
gc_update_alloc_table(gc_ref_from_heap_object(head), size, kind);
return head;
}
static inline void* gc_allocate_small_fast(struct gc_mutator *mut, size_t size,
enum gc_allocation_kind kind) GC_ALWAYS_INLINE;
static inline void* gc_allocate_small_fast(struct gc_mutator *mut, size_t size,
enum gc_allocation_kind kind) {
GC_ASSERT(size != 0);
GC_ASSERT(size <= gc_allocator_large_threshold());
switch (gc_allocator_kind()) {
case GC_ALLOCATOR_INLINE_BUMP_POINTER:
return gc_allocate_small_fast_bump_pointer(mut, size, kind);
case GC_ALLOCATOR_INLINE_FREELIST:
return gc_allocate_small_fast_freelist(mut, size, kind);
case GC_ALLOCATOR_INLINE_NONE:
return NULL;
default:
GC_CRASH();
}
}
static inline void* gc_allocate_fast(struct gc_mutator *mut, size_t size,
enum gc_allocation_kind kind) GC_ALWAYS_INLINE;
static inline void* gc_allocate_fast(struct gc_mutator *mut, size_t size,
enum gc_allocation_kind kind) {
GC_ASSERT(size != 0);
if (size > gc_allocator_large_threshold())
return NULL;
return gc_allocate_small_fast(mut, size, kind);
}
static inline void* gc_allocate(struct gc_mutator *mut, size_t size,
enum gc_allocation_kind kind) GC_ALWAYS_INLINE;
static inline void* gc_allocate(struct gc_mutator *mut, size_t size,
enum gc_allocation_kind kind) {
void *ret = gc_allocate_fast(mut, size, kind);
if (GC_LIKELY(ret != NULL))
return ret;
return gc_allocate_slow(mut, size, kind);
}
GC_API_ int gc_object_is_old_generation_slow(struct gc_mutator *mut,
struct gc_ref obj) GC_NEVER_INLINE;
static inline int gc_object_is_old_generation(struct gc_mutator *mut,
struct gc_ref obj,
size_t obj_size) GC_ALWAYS_INLINE;
static inline int gc_object_is_old_generation(struct gc_mutator *mut,
struct gc_ref obj,
size_t obj_size) {
switch (gc_old_generation_check_kind(obj_size)) {
case GC_OLD_GENERATION_CHECK_ALLOC_TABLE: {
size_t alignment = gc_allocator_alloc_table_alignment();
GC_ASSERT(alignment);
uintptr_t addr = gc_ref_value(obj);
uintptr_t base = addr & ~(alignment - 1);
size_t granule_size = gc_allocator_small_granule_size();
uintptr_t granule = (addr & (alignment - 1)) / granule_size;
uint8_t *byte_loc = (uint8_t*)(base + granule);
uint8_t byte = atomic_load_explicit(byte_loc, memory_order_relaxed);
uint8_t mask = gc_old_generation_check_alloc_table_tag_mask();
uint8_t young = gc_old_generation_check_alloc_table_young_tag();
return (byte & mask) != young;
}
case GC_OLD_GENERATION_CHECK_SMALL_OBJECT_NURSERY: {
struct gc_heap *heap = gc_mutator_heap(mut);
// Note that these addresses are fixed and that the embedder might
// want to store them somewhere or inline them into the output of
// JIT-generated code. They may also be power-of-two aligned.
uintptr_t low_addr = gc_small_object_nursery_low_address(heap);
uintptr_t high_addr = gc_small_object_nursery_high_address(heap);
uintptr_t size = high_addr - low_addr;
uintptr_t addr = gc_ref_value(obj);
return addr - low_addr >= size;
}
case GC_OLD_GENERATION_CHECK_SLOW:
return gc_object_is_old_generation_slow(mut, obj);
default:
GC_CRASH();
}
}
GC_API_ void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj,
size_t obj_size, struct gc_edge edge,
struct gc_ref new_val) GC_NEVER_INLINE;
static inline int gc_write_barrier_fast(struct gc_mutator *mut, struct gc_ref obj,
size_t obj_size, struct gc_edge edge,
struct gc_ref new_val) GC_ALWAYS_INLINE;
static inline int gc_write_barrier_fast(struct gc_mutator *mut, struct gc_ref obj,
size_t obj_size, struct gc_edge edge,
struct gc_ref new_val) {
switch (gc_write_barrier_kind(obj_size)) {
case GC_WRITE_BARRIER_NONE:
return 0;
case GC_WRITE_BARRIER_FIELD: {
if (!gc_object_is_old_generation(mut, obj, obj_size))
return 0;
size_t field_table_alignment = gc_write_barrier_field_table_alignment();
size_t fields_per_byte = gc_write_barrier_field_fields_per_byte();
uint8_t first_bit_pattern = gc_write_barrier_field_first_bit_pattern();
ssize_t table_offset = gc_write_barrier_field_table_offset();
uintptr_t addr = gc_edge_address(edge);
uintptr_t base = addr & ~(field_table_alignment - 1);
uintptr_t field = (addr & (field_table_alignment - 1)) / sizeof(uintptr_t);
uintptr_t log_byte = field / fields_per_byte;
uint8_t log_bit = first_bit_pattern << (field % fields_per_byte);
uint8_t *byte_loc = (uint8_t*)(base + table_offset + log_byte);
uint8_t byte = atomic_load_explicit(byte_loc, memory_order_relaxed);
return !(byte & log_bit);
}
case GC_WRITE_BARRIER_SLOW:
return 1;
default:
GC_CRASH();
}
}
static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj,
size_t obj_size, struct gc_edge edge,
struct gc_ref new_val) GC_ALWAYS_INLINE;
static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj,
size_t obj_size, struct gc_edge edge,
struct gc_ref new_val) {
if (GC_UNLIKELY(gc_write_barrier_fast(mut, obj, obj_size, edge, new_val)))
gc_write_barrier_slow(mut, obj, obj_size, edge, new_val);
}
GC_API_ void gc_pin_object(struct gc_mutator *mut, struct gc_ref obj);
GC_API_ void gc_safepoint_slow(struct gc_mutator *mut) GC_NEVER_INLINE;
GC_API_ int* gc_safepoint_flag_loc(struct gc_mutator *mut);
static inline int gc_should_stop_for_safepoint(struct gc_mutator *mut) {
switch (gc_cooperative_safepoint_kind()) {
case GC_COOPERATIVE_SAFEPOINT_NONE:
return 0;
case GC_COOPERATIVE_SAFEPOINT_MUTATOR_FLAG:
case GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG: {
return atomic_load_explicit(gc_safepoint_flag_loc(mut),
memory_order_relaxed);
}
default:
GC_CRASH();
}
}
static inline void gc_safepoint(struct gc_mutator *mut) {
if (GC_UNLIKELY(gc_should_stop_for_safepoint(mut)))
gc_safepoint_slow(mut);
}
#endif // GC_API_H_

View file

@@ -0,0 +1,21 @@
#ifndef GC_ASSERT_H
#define GC_ASSERT_H
#include "gc-config.h"
#define GC_UNLIKELY(e) __builtin_expect(e, 0)
#define GC_LIKELY(e) __builtin_expect(e, 1)
#define GC_CRASH() __builtin_trap()
#if GC_DEBUG
#define GC_ASSERT(x) do { if (GC_UNLIKELY(!(x))) GC_CRASH(); } while (0)
#define GC_UNREACHABLE() GC_CRASH()
#else
#define GC_ASSERT(x) do { } while (0)
#define GC_UNREACHABLE() __builtin_unreachable()
#endif
#define GC_ASSERT_EQ(a, b) GC_ASSERT((a) == (b))
#endif // GC_ASSERT_H

View file

@@ -0,0 +1,69 @@
#ifndef GC_ATTRS_H
#define GC_ATTRS_H
#include "gc-inline.h"
#include "gc-allocation-kind.h"
#include <stddef.h>
#include <stdint.h>
enum gc_allocator_kind {
GC_ALLOCATOR_INLINE_BUMP_POINTER,
GC_ALLOCATOR_INLINE_FREELIST,
GC_ALLOCATOR_INLINE_NONE
};
static inline enum gc_allocator_kind gc_allocator_kind(void) GC_ALWAYS_INLINE;
static inline size_t gc_allocator_large_threshold(void) GC_ALWAYS_INLINE;
static inline size_t gc_allocator_small_granule_size(void) GC_ALWAYS_INLINE;
static inline size_t gc_allocator_allocation_pointer_offset(void) GC_ALWAYS_INLINE;
static inline size_t gc_allocator_allocation_limit_offset(void) GC_ALWAYS_INLINE;
static inline size_t gc_allocator_freelist_offset(size_t size,
enum gc_allocation_kind kind) GC_ALWAYS_INLINE;
static inline size_t gc_allocator_alloc_table_alignment(void) GC_ALWAYS_INLINE;
static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind kind) GC_ALWAYS_INLINE;
static inline uint8_t gc_allocator_alloc_table_end_pattern(void) GC_ALWAYS_INLINE;
enum gc_old_generation_check_kind {
GC_OLD_GENERATION_CHECK_NONE,
GC_OLD_GENERATION_CHECK_ALLOC_TABLE,
GC_OLD_GENERATION_CHECK_SMALL_OBJECT_NURSERY,
GC_OLD_GENERATION_CHECK_SLOW
};
static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t obj_size) GC_ALWAYS_INLINE;
static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) GC_ALWAYS_INLINE;
static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) GC_ALWAYS_INLINE;
enum gc_write_barrier_kind {
GC_WRITE_BARRIER_NONE,
GC_WRITE_BARRIER_FIELD,
GC_WRITE_BARRIER_SLOW
};
static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) GC_ALWAYS_INLINE;
static inline size_t gc_write_barrier_field_table_alignment(void) GC_ALWAYS_INLINE;
static inline ptrdiff_t gc_write_barrier_field_table_offset(void) GC_ALWAYS_INLINE;
static inline size_t gc_write_barrier_field_fields_per_byte(void) GC_ALWAYS_INLINE;
static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) GC_ALWAYS_INLINE;
enum gc_safepoint_mechanism {
GC_SAFEPOINT_MECHANISM_COOPERATIVE,
GC_SAFEPOINT_MECHANISM_SIGNAL,
};
static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) GC_ALWAYS_INLINE;
enum gc_cooperative_safepoint_kind {
GC_COOPERATIVE_SAFEPOINT_NONE,
GC_COOPERATIVE_SAFEPOINT_MUTATOR_FLAG,
GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG,
};
static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) GC_ALWAYS_INLINE;
static inline int gc_can_pin_objects(void) GC_ALWAYS_INLINE;
#endif // GC_ATTRS_H

View file

@@ -0,0 +1,177 @@
#ifndef GC_BASIC_STATS_H
#define GC_BASIC_STATS_H
#include "gc-event-listener.h"
#include "gc-histogram.h"
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
GC_DEFINE_HISTOGRAM(gc_latency, 25, 4);
struct gc_basic_stats {
uint64_t major_collection_count;
uint64_t minor_collection_count;
uint64_t last_time_usec;
uint64_t last_cpu_time_usec;
uint64_t elapsed_mutator_usec;
uint64_t elapsed_collector_usec;
uint64_t cpu_mutator_usec;
uint64_t cpu_collector_usec;
size_t heap_size;
size_t max_heap_size;
size_t max_live_data_size;
struct gc_latency pause_times;
};
static inline uint64_t gc_basic_stats_now(void) {
struct timeval tv;
if (gettimeofday(&tv, NULL) != 0) GC_CRASH();
uint64_t ret = tv.tv_sec;
ret *= 1000 * 1000;
ret += tv.tv_usec;
return ret;
}
static inline uint64_t gc_basic_stats_cpu_time(void) {
struct timespec ts;
clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &ts);
uint64_t ret = ts.tv_sec;
ret *= 1000 * 1000;
ret += ts.tv_nsec / 1000;
return ret;
}
static inline void gc_basic_stats_init(void *data, size_t heap_size) {
struct gc_basic_stats *stats = data;
memset(stats, 0, sizeof(*stats));
stats->last_time_usec = gc_basic_stats_now();
stats->last_cpu_time_usec = gc_basic_stats_cpu_time();
stats->heap_size = stats->max_heap_size = heap_size;
}
static inline void gc_basic_stats_requesting_stop(void *data) {
struct gc_basic_stats *stats = data;
uint64_t now = gc_basic_stats_now();
uint64_t cpu_time = gc_basic_stats_cpu_time();
stats->elapsed_mutator_usec += now - stats->last_time_usec;
stats->cpu_mutator_usec += cpu_time - stats->last_cpu_time_usec;
stats->last_time_usec = now;
stats->last_cpu_time_usec = cpu_time;
}
static inline void gc_basic_stats_waiting_for_stop(void *data) {}
static inline void gc_basic_stats_mutators_stopped(void *data) {}
static inline void gc_basic_stats_prepare_gc(void *data,
enum gc_collection_kind kind) {
struct gc_basic_stats *stats = data;
if (kind == GC_COLLECTION_MINOR)
stats->minor_collection_count++;
else
stats->major_collection_count++;
}
static inline void gc_basic_stats_roots_traced(void *data) {}
static inline void gc_basic_stats_heap_traced(void *data) {}
static inline void gc_basic_stats_ephemerons_traced(void *data) {}
static inline void gc_basic_stats_finalizers_traced(void *data) {}
static inline void gc_basic_stats_restarting_mutators(void *data) {
struct gc_basic_stats *stats = data;
uint64_t now = gc_basic_stats_now();
uint64_t cpu_time = gc_basic_stats_cpu_time();
uint64_t pause_time = now - stats->last_time_usec;
uint64_t pause_cpu_time = cpu_time - stats->last_cpu_time_usec;
stats->elapsed_collector_usec += pause_time;
stats->cpu_collector_usec += pause_cpu_time;
gc_latency_record(&stats->pause_times, pause_time);
stats->last_time_usec = now;
stats->last_cpu_time_usec = cpu_time;
}
static inline void* gc_basic_stats_mutator_added(void *data) {
return NULL;
}
static inline void gc_basic_stats_mutator_cause_gc(void *mutator_data) {}
static inline void gc_basic_stats_mutator_stopping(void *mutator_data) {}
static inline void gc_basic_stats_mutator_stopped(void *mutator_data) {}
static inline void gc_basic_stats_mutator_restarted(void *mutator_data) {}
static inline void gc_basic_stats_mutator_removed(void *mutator_data) {}
static inline void gc_basic_stats_heap_resized(void *data, size_t size) {
struct gc_basic_stats *stats = data;
stats->heap_size = size;
if (size > stats->max_heap_size)
stats->max_heap_size = size;
}
static inline void gc_basic_stats_live_data_size(void *data, size_t size) {
struct gc_basic_stats *stats = data;
if (size > stats->max_live_data_size)
stats->max_live_data_size = size;
}
#define GC_BASIC_STATS \
((struct gc_event_listener) { \
gc_basic_stats_init, \
gc_basic_stats_requesting_stop, \
gc_basic_stats_waiting_for_stop, \
gc_basic_stats_mutators_stopped, \
gc_basic_stats_prepare_gc, \
gc_basic_stats_roots_traced, \
gc_basic_stats_heap_traced, \
gc_basic_stats_ephemerons_traced, \
gc_basic_stats_finalizers_traced, \
gc_basic_stats_restarting_mutators, \
gc_basic_stats_mutator_added, \
gc_basic_stats_mutator_cause_gc, \
gc_basic_stats_mutator_stopping, \
gc_basic_stats_mutator_stopped, \
gc_basic_stats_mutator_restarted, \
gc_basic_stats_mutator_removed, \
gc_basic_stats_heap_resized, \
gc_basic_stats_live_data_size, \
})
static inline void gc_basic_stats_finish(struct gc_basic_stats *stats) {
uint64_t now = gc_basic_stats_now();
uint64_t cpu_time = gc_basic_stats_cpu_time();
stats->elapsed_mutator_usec += now - stats->last_time_usec;
stats->cpu_mutator_usec += cpu_time - stats->last_cpu_time_usec;
stats->last_time_usec = now;
stats->last_cpu_time_usec = cpu_time;
}
static inline void gc_basic_stats_print(struct gc_basic_stats *stats, FILE *f) {
fprintf(f, "Completed %" PRIu64 " major collections (%" PRIu64 " minor).\n",
stats->major_collection_count, stats->minor_collection_count);
uint64_t stopped = stats->elapsed_collector_usec;
uint64_t elapsed = stats->elapsed_mutator_usec + stopped;
uint64_t cpu_stopped = stats->cpu_collector_usec;
uint64_t cpu_total = stats->cpu_mutator_usec + cpu_stopped;
uint64_t ms = 1000; // usec per ms
fprintf(f, "%" PRIu64 ".%.3" PRIu64 " ms total time "
"(%" PRIu64 ".%.3" PRIu64 " stopped); "
"%" PRIu64 ".%.3" PRIu64 " ms CPU time "
"(%" PRIu64 ".%.3" PRIu64 " stopped).\n",
elapsed / ms, elapsed % ms, stopped / ms, stopped % ms,
cpu_total / ms, cpu_total % ms, cpu_stopped / ms, cpu_stopped % ms);
uint64_t pause_median = gc_latency_median(&stats->pause_times);
uint64_t pause_p95 = gc_latency_percentile(&stats->pause_times, 0.95);
uint64_t pause_max = gc_latency_max(&stats->pause_times);
fprintf(f, "%" PRIu64 ".%.3" PRIu64 " ms median pause time, "
"%" PRIu64 ".%.3" PRIu64 " p95, "
"%" PRIu64 ".%.3" PRIu64 " max.\n",
pause_median / ms, pause_median % ms, pause_p95 / ms, pause_p95 % ms,
pause_max / ms, pause_max % ms);
double MB = 1e6;
fprintf(f, "Heap size is %.3f MB (max %.3f MB); peak live data %.3f MB.\n",
stats->heap_size / MB, stats->max_heap_size / MB,
stats->max_live_data_size / MB);
}
#endif // GC_BASIC_STATS_H

View file

@@ -0,0 +1,11 @@
#ifndef GC_COLLECTION_KIND_H
#define GC_COLLECTION_KIND_H
enum gc_collection_kind {
GC_COLLECTION_ANY,
GC_COLLECTION_MINOR,
GC_COLLECTION_MAJOR,
GC_COLLECTION_COMPACTING,
};
#endif // GC_COLLECTION_KIND_H

View file

@@ -0,0 +1,40 @@
#ifndef GC_CONFIG_H
#define GC_CONFIG_H
#ifndef GC_DEBUG
#define GC_DEBUG 0
#endif
#ifndef GC_HAS_IMMEDIATES
#define GC_HAS_IMMEDIATES 1
#endif
#ifndef GC_PARALLEL
#define GC_PARALLEL 0
#endif
#ifndef GC_GENERATIONAL
#define GC_GENERATIONAL 0
#endif
// Though you normally wouldn't configure things this way, it's possible
// to have both precise and conservative roots. However we have to
// either have precise or conservative tracing; not a mix.
#ifndef GC_PRECISE_ROOTS
#define GC_PRECISE_ROOTS 0
#endif
#ifndef GC_CONSERVATIVE_ROOTS
#define GC_CONSERVATIVE_ROOTS 0
#endif
#ifndef GC_CONSERVATIVE_TRACE
#define GC_CONSERVATIVE_TRACE 0
#endif
#ifndef GC_CONCURRENT_TRACE
#define GC_CONCURRENT_TRACE 0
#endif
#endif // GC_CONFIG_H

View file

@@ -0,0 +1,17 @@
#ifndef GC_CONSERVATIVE_REF_H
#define GC_CONSERVATIVE_REF_H
#include <stdint.h>
struct gc_conservative_ref {
uintptr_t value;
};
static inline struct gc_conservative_ref gc_conservative_ref(uintptr_t value) {
return (struct gc_conservative_ref){value};
}
static inline uintptr_t gc_conservative_ref_value(struct gc_conservative_ref ref) {
return ref.value;
}
#endif // GC_CONSERVATIVE_REF_H

View file

@@ -0,0 +1,26 @@
#ifndef GC_EDGE_H
#define GC_EDGE_H
#include "gc-ref.h"
struct gc_edge {
struct gc_ref *dst;
};
static inline struct gc_edge gc_edge(void* addr) {
return (struct gc_edge){addr};
}
static inline struct gc_ref gc_edge_ref(struct gc_edge edge) {
return *edge.dst;
}
static inline struct gc_ref* gc_edge_loc(struct gc_edge edge) {
return edge.dst;
}
static inline uintptr_t gc_edge_address(struct gc_edge edge) {
return (uintptr_t)gc_edge_loc(edge);
}
static inline void gc_edge_update(struct gc_edge edge, struct gc_ref ref) {
*edge.dst = ref;
}
#endif // GC_EDGE_H

View file

@@ -0,0 +1,67 @@
#ifndef GC_EMBEDDER_API_H
#define GC_EMBEDDER_API_H
#include <stddef.h>
#include "gc-config.h"
#include "gc-edge.h"
#include "gc-inline.h"
#include "gc-forwarding.h"
#ifndef GC_EMBEDDER_API
#define GC_EMBEDDER_API static
#endif
struct gc_mutator_roots;
struct gc_heap_roots;
struct gc_atomic_forward;
struct gc_heap;
struct gc_extern_space;
GC_EMBEDDER_API inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement);
GC_EMBEDDER_API inline size_t gc_finalizer_priority_count(void);
GC_EMBEDDER_API inline int gc_extern_space_visit(struct gc_extern_space *space,
struct gc_edge edge,
struct gc_ref ref) GC_ALWAYS_INLINE;
GC_EMBEDDER_API inline void gc_extern_space_start_gc(struct gc_extern_space *space,
int is_minor_gc);
GC_EMBEDDER_API inline void gc_extern_space_finish_gc(struct gc_extern_space *space,
int is_minor_gc);
GC_EMBEDDER_API inline void gc_trace_object(struct gc_ref ref,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *trace_data,
size_t *size) GC_ALWAYS_INLINE;
GC_EMBEDDER_API inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots,
void (*trace_edge)(struct gc_edge edge,
struct gc_heap *heap,
void *trace_data),
struct gc_heap *heap,
void *trace_data);
GC_EMBEDDER_API inline void gc_trace_heap_roots(struct gc_heap_roots *roots,
void (*trace_edge)(struct gc_edge edge,
struct gc_heap *heap,
void *trace_data),
struct gc_heap *heap,
void *trace_data);
GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref);
GC_EMBEDDER_API inline void gc_object_forward_nonatomic(struct gc_ref ref,
struct gc_ref new_ref);
GC_EMBEDDER_API inline struct gc_atomic_forward gc_atomic_forward_begin(struct gc_ref ref);
GC_EMBEDDER_API inline void gc_atomic_forward_acquire(struct gc_atomic_forward *);
GC_EMBEDDER_API inline int gc_atomic_forward_retry_busy(struct gc_atomic_forward *);
GC_EMBEDDER_API inline void gc_atomic_forward_abort(struct gc_atomic_forward *);
GC_EMBEDDER_API inline size_t gc_atomic_forward_object_size(struct gc_atomic_forward *);
GC_EMBEDDER_API inline void gc_atomic_forward_commit(struct gc_atomic_forward *,
struct gc_ref new_ref);
GC_EMBEDDER_API inline uintptr_t gc_atomic_forward_address(struct gc_atomic_forward *);
#endif // GC_EMBEDDER_API_H

View file

@@ -0,0 +1,42 @@
#ifndef GC_EPHEMERON_H_
#define GC_EPHEMERON_H_
#include "gc-edge.h"
#include "gc-ref.h"
#include "gc-visibility.h"
// Ephemerons establish an association between a "key" object and a
// "value" object. If the ephemeron and the key are live, then the
// value is live, and can be retrieved from the ephemeron. Ephemerons
// can be chained together, which allows them to function as links in a
// buckets-and-chains hash table.
//
// This file defines the user-facing API for ephemerons.
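//
// For illustration only (not part of this header), a weak-key table bucket
// could be maintained along these lines, given embedder-provided `bucket`
// storage and a `keys_equal` comparison:
//
//   struct gc_ephemeron *e = gc_allocate_ephemeron(mut);
//   gc_ephemeron_init(mut, e, key, value);
//   gc_ephemeron_chain_push(&bucket, e);
//   ...
//   for (struct gc_ephemeron *l = gc_ephemeron_chain_head(&bucket);
//        l; l = gc_ephemeron_chain_next(l))
//     if (keys_equal(gc_ephemeron_key(l), key))
//       return gc_ephemeron_value(l);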
struct gc_heap;
struct gc_mutator;
struct gc_ephemeron;
GC_API_ size_t gc_ephemeron_size(void);
GC_API_ struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut);
GC_API_ void gc_ephemeron_init(struct gc_mutator *mut,
struct gc_ephemeron *ephemeron,
struct gc_ref key, struct gc_ref value);
GC_API_ struct gc_ref gc_ephemeron_key(struct gc_ephemeron *ephemeron);
GC_API_ struct gc_ref gc_ephemeron_value(struct gc_ephemeron *ephemeron);
GC_API_ struct gc_ephemeron* gc_ephemeron_chain_head(struct gc_ephemeron **loc);
GC_API_ void gc_ephemeron_chain_push(struct gc_ephemeron **loc,
struct gc_ephemeron *ephemeron);
GC_API_ struct gc_ephemeron* gc_ephemeron_chain_next(struct gc_ephemeron *ephemeron);
GC_API_ void gc_ephemeron_mark_dead(struct gc_ephemeron *ephemeron);
GC_API_ void gc_trace_ephemeron(struct gc_ephemeron *ephemeron,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *trace_data);
#endif // GC_EPHEMERON_H_

View file

@@ -0,0 +1,145 @@
#ifndef GC_EVENT_LISTENER_CHAIN_H
#define GC_EVENT_LISTENER_CHAIN_H
#include "gc-event-listener.h"
struct gc_event_listener_chain {
struct gc_event_listener head; void *head_data;
struct gc_event_listener tail; void *tail_data;
};
struct gc_event_listener_chain_mutator {
struct gc_event_listener_chain *chain;
void *head_mutator_data;
void *tail_mutator_data;
};
static inline void gc_event_listener_chain_init(void *data, size_t heap_size) {
struct gc_event_listener_chain *chain = data;
chain->head.init(chain->head_data, heap_size);
chain->tail.init(chain->tail_data, heap_size);
}
static inline void gc_event_listener_chain_requesting_stop(void *data) {
struct gc_event_listener_chain *chain = data;
chain->head.requesting_stop(chain->head_data);
chain->tail.requesting_stop(chain->tail_data);
}
static inline void gc_event_listener_chain_waiting_for_stop(void *data) {
struct gc_event_listener_chain *chain = data;
chain->head.waiting_for_stop(chain->head_data);
chain->tail.waiting_for_stop(chain->tail_data);
}
static inline void gc_event_listener_chain_mutators_stopped(void *data) {
struct gc_event_listener_chain *chain = data;
chain->head.mutators_stopped(chain->head_data);
chain->tail.mutators_stopped(chain->tail_data);
}
static inline void
gc_event_listener_chain_prepare_gc(void *data, enum gc_collection_kind kind) {
struct gc_event_listener_chain *chain = data;
chain->head.prepare_gc(chain->head_data, kind);
chain->tail.prepare_gc(chain->tail_data, kind);
}
static inline void gc_event_listener_chain_roots_traced(void *data) {
struct gc_event_listener_chain *chain = data;
chain->head.roots_traced(chain->head_data);
chain->tail.roots_traced(chain->tail_data);
}
static inline void gc_event_listener_chain_heap_traced(void *data) {
struct gc_event_listener_chain *chain = data;
chain->head.heap_traced(chain->head_data);
chain->tail.heap_traced(chain->tail_data);
}
static inline void gc_event_listener_chain_ephemerons_traced(void *data) {
struct gc_event_listener_chain *chain = data;
chain->head.ephemerons_traced(chain->head_data);
chain->tail.ephemerons_traced(chain->tail_data);
}
static inline void gc_event_listener_chain_finalizers_traced(void *data) {
struct gc_event_listener_chain *chain = data;
chain->head.finalizers_traced(chain->head_data);
chain->tail.finalizers_traced(chain->tail_data);
}
static inline void gc_event_listener_chain_restarting_mutators(void *data) {
struct gc_event_listener_chain *chain = data;
chain->head.restarting_mutators(chain->head_data);
chain->tail.restarting_mutators(chain->tail_data);
}
static inline void* gc_event_listener_chain_mutator_added(void *data) {
struct gc_event_listener_chain *chain = data;
struct gc_event_listener_chain_mutator *mutator = malloc(sizeof(*mutator));
if (!mutator) abort();
mutator->chain = chain;
mutator->head_mutator_data = chain->head.mutator_added(chain->head_data);
mutator->tail_mutator_data = chain->tail.mutator_added(chain->tail_data);
return mutator;
}
static inline void gc_event_listener_chain_mutator_cause_gc(void *mutator_data) {
struct gc_event_listener_chain_mutator *mutator = mutator_data;
mutator->chain->head.mutator_cause_gc(mutator->head_mutator_data);
mutator->chain->tail.mutator_cause_gc(mutator->tail_mutator_data);
}
static inline void gc_event_listener_chain_mutator_stopping(void *mutator_data) {
struct gc_event_listener_chain_mutator *mutator = mutator_data;
mutator->chain->head.mutator_stopping(mutator->head_mutator_data);
mutator->chain->tail.mutator_stopping(mutator->tail_mutator_data);
}
static inline void gc_event_listener_chain_mutator_stopped(void *mutator_data) {
struct gc_event_listener_chain_mutator *mutator = mutator_data;
mutator->chain->head.mutator_stopped(mutator->head_mutator_data);
mutator->chain->tail.mutator_stopped(mutator->tail_mutator_data);
}
static inline void gc_event_listener_chain_mutator_restarted(void *mutator_data) {
struct gc_event_listener_chain_mutator *mutator = mutator_data;
mutator->chain->head.mutator_restarted(mutator->head_mutator_data);
mutator->chain->tail.mutator_restarted(mutator->tail_mutator_data);
}
static inline void gc_event_listener_chain_mutator_removed(void *mutator_data) {
struct gc_event_listener_chain_mutator *mutator = mutator_data;
mutator->chain->head.mutator_removed(mutator->head_mutator_data);
mutator->chain->tail.mutator_removed(mutator->tail_mutator_data);
free(mutator);
}
static inline void gc_event_listener_chain_heap_resized(void *data, size_t size) {
struct gc_event_listener_chain *chain = data;
chain->head.heap_resized(chain->head_data, size);
chain->tail.heap_resized(chain->tail_data, size);
}
static inline void gc_event_listener_chain_live_data_size(void *data, size_t size) {
struct gc_event_listener_chain *chain = data;
chain->head.live_data_size(chain->head_data, size);
chain->tail.live_data_size(chain->tail_data, size);
}
#define GC_EVENT_LISTENER_CHAIN \
((struct gc_event_listener) { \
gc_event_listener_chain_init, \
gc_event_listener_chain_requesting_stop, \
gc_event_listener_chain_waiting_for_stop, \
gc_event_listener_chain_mutators_stopped, \
gc_event_listener_chain_prepare_gc, \
gc_event_listener_chain_roots_traced, \
gc_event_listener_chain_heap_traced, \
gc_event_listener_chain_ephemerons_traced, \
gc_event_listener_chain_finalizers_traced, \
gc_event_listener_chain_restarting_mutators, \
gc_event_listener_chain_mutator_added, \
gc_event_listener_chain_mutator_cause_gc, \
gc_event_listener_chain_mutator_stopping, \
gc_event_listener_chain_mutator_stopped, \
gc_event_listener_chain_mutator_restarted, \
gc_event_listener_chain_mutator_removed, \
gc_event_listener_chain_heap_resized, \
gc_event_listener_chain_live_data_size, \
})
#define GC_EVENT_LISTENER_CHAIN_DATA(head, head_data, tail, tail_data) \
((struct gc_event_listener_chain){head, head_data, tail, tail_data})
#endif // GC_EVENT_LISTENER_CHAIN_H

View file

@@ -0,0 +1,29 @@
#ifndef GC_EVENT_LISTENER_H
#define GC_EVENT_LISTENER_H
#include "gc-collection-kind.h"
struct gc_event_listener {
void (*init)(void *data, size_t heap_size);
void (*requesting_stop)(void *data);
void (*waiting_for_stop)(void *data);
void (*mutators_stopped)(void *data);
void (*prepare_gc)(void *data, enum gc_collection_kind kind);
void (*roots_traced)(void *data);
void (*heap_traced)(void *data);
void (*ephemerons_traced)(void *data);
void (*finalizers_traced)(void *data);
void (*restarting_mutators)(void *data);
void* (*mutator_added)(void *data);
void (*mutator_cause_gc)(void *mutator_data);
void (*mutator_stopping)(void *mutator_data);
void (*mutator_stopped)(void *mutator_data);
void (*mutator_restarted)(void *mutator_data);
void (*mutator_removed)(void *mutator_data);
void (*heap_resized)(void *data, size_t size);
void (*live_data_size)(void *data, size_t size);
};
#endif // GC_EVENT_LISTENER_H

View file

@@ -0,0 +1,81 @@
#ifndef GC_FINALIZER_H_
#define GC_FINALIZER_H_
#include "gc-edge.h"
#include "gc-ref.h"
#include "gc-visibility.h"
// A finalizer allows the embedder to be notified when an object becomes
// unreachable.
//
// A finalizer has a priority. When the heap is created, the embedder
// should declare how many priorities there are. Lower-numbered
// priorities take precedence; if an object has a priority-0 finalizer
// outstanding, that will prevent any finalizer at level 1 (or 2, ...)
// from firing until no priority-0 finalizer remains.
//
// Call gc_attach_finalizer to attach a finalizer to an object.
//
// A finalizer also references an associated GC-managed closure object.
// A finalizer's reference to the closure object is strong: if a
// finalizer's closure references its finalizable object, directly or
// indirectly, the finalizer will never fire.
//
// When an object with a finalizer becomes unreachable, it is added to a
// queue. The embedder can call gc_pop_finalizable to get the next
// finalizable object and its associated closure. At that point the
// embedder can do anything with the object, including keeping it alive.
// Ephemeron associations will still be present while the finalizable
// object is live. Note however that any objects referenced by the
// finalizable object may themselves be already finalized; finalizers
// are enqueued for objects when they become unreachable, which can
// concern whole subgraphs of objects at once.
//
// The usual way for an embedder to know when the queue of finalizable
// objects is non-empty is to call gc_set_finalizer_callback to
// provide a function that will be invoked when there are pending
// finalizers.
//
// Arranging to call gc_pop_finalizable and doing something with the
// finalizable object and closure is the responsibility of the embedder.
// The embedder's finalization action can end up invoking arbitrary
// code, so unless the embedder imposes some kind of restriction on what
// finalizers can do, generally speaking finalizers should be run in a
// dedicated thread instead of recursively from within whatever mutator
// thread caused GC. Setting up such a thread is the responsibility of
// the mutator. gc_pop_finalizable is thread-safe, allowing multiple
// finalization threads if that is appropriate.
//
// gc_allocate_finalizer returns a finalizer, which is a fresh
// GC-managed heap object. The mutator should then directly attach it
// to an object using gc_finalizer_attach. When the finalizer is fired,
// it becomes available to the mutator via gc_pop_finalizable.
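//
// For illustration only (not part of this header), attaching and draining
// finalizers might look roughly like this, with `on_finalizers_ready` and
// `run_finalizer` supplied by the embedder:
//
//   struct gc_finalizer *f = gc_allocate_finalizer(mut);
//   gc_finalizer_attach(mut, f, 0, object, closure);
//   gc_set_finalizer_callback(heap, on_finalizers_ready);
//   ...
//   for (struct gc_finalizer *fin = gc_pop_finalizable(mut);
//        fin; fin = gc_pop_finalizable(mut))
//     run_finalizer(gc_finalizer_object(fin), gc_finalizer_closure(fin));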
struct gc_heap;
struct gc_mutator;
struct gc_finalizer;
GC_API_ size_t gc_finalizer_size(void);
GC_API_ struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut);
GC_API_ void gc_finalizer_attach(struct gc_mutator *mut,
struct gc_finalizer *finalizer,
unsigned priority,
struct gc_ref object, struct gc_ref closure);
GC_API_ struct gc_ref gc_finalizer_object(struct gc_finalizer *finalizer);
GC_API_ struct gc_ref gc_finalizer_closure(struct gc_finalizer *finalizer);
GC_API_ struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut);
typedef void (*gc_finalizer_callback)(struct gc_heap *heap, size_t count);
GC_API_ void gc_set_finalizer_callback(struct gc_heap *heap,
gc_finalizer_callback callback);
GC_API_ void gc_trace_finalizer(struct gc_finalizer *finalizer,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *trace_data);
#endif // GC_FINALIZER_H_

View file

@@ -0,0 +1,20 @@
#ifndef GC_FORWARDING_H
#define GC_FORWARDING_H
#include <stdint.h>
#include "gc-ref.h"
enum gc_forwarding_state {
GC_FORWARDING_STATE_FORWARDED,
GC_FORWARDING_STATE_BUSY,
GC_FORWARDING_STATE_ACQUIRED,
GC_FORWARDING_STATE_NOT_FORWARDED
};
struct gc_atomic_forward {
struct gc_ref ref;
uintptr_t data;
enum gc_forwarding_state state;
};
#endif // GC_FORWARDING_H

View file

@@ -0,0 +1,82 @@
#ifndef GC_HISTOGRAM_H
#define GC_HISTOGRAM_H
#include "gc-assert.h"
#include <stdint.h>
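// Approximately-logarithmic bucketing: values below 2^precision each get
// their own bucket; above that, each power-of-two range is split into
// 2^precision sub-buckets. Indices saturate in a final overflow bucket at
// max_value_bits << precision.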
static inline size_t gc_histogram_bucket(uint64_t max_value_bits,
uint64_t precision,
uint64_t val) {
uint64_t major = val < (1ULL << precision)
? 0ULL
: 64ULL - __builtin_clzl(val) - precision;
uint64_t minor = val < (1 << precision)
? val
: (val >> (major - 1ULL)) & ((1ULL << precision) - 1ULL);
uint64_t idx = (major << precision) | minor;
if (idx >= (max_value_bits << precision))
idx = max_value_bits << precision;
return idx;
}
static inline uint64_t gc_histogram_bucket_min_val(uint64_t precision,
size_t idx) {
uint64_t major = idx >> precision;
uint64_t minor = idx & ((1ULL << precision) - 1ULL);
uint64_t min_val = major
? ((1ULL << precision) | minor) << (major - 1ULL)
: minor;
return min_val;
}
#define GC_DEFINE_HISTOGRAM(name, max_value_bits, precision) \
struct name { uint32_t buckets[((max_value_bits) << (precision)) + 1]; }; \
static inline size_t name##_size(void) { \
return ((max_value_bits) << (precision)) + 1; \
} \
static inline uint64_t name##_bucket_min_val(size_t idx) { \
GC_ASSERT(idx < name##_size()); \
return gc_histogram_bucket_min_val((precision), idx); \
} \
static inline struct name make_##name(void) { \
return (struct name) { { 0, }}; \
} \
static inline void name##_record(struct name *h, uint64_t val) { \
h->buckets[gc_histogram_bucket((max_value_bits), (precision), val)]++; \
} \
static inline uint64_t name##_ref(struct name *h, size_t idx) { \
GC_ASSERT(idx < name##_size()); \
return h->buckets[idx]; \
} \
static inline uint64_t name##_min(struct name *h) { \
for (size_t bucket = 0; bucket < name##_size(); bucket++) \
if (h->buckets[bucket]) return name##_bucket_min_val(bucket); \
return -1; \
} \
static inline uint64_t name##_max(struct name *h) { \
if (h->buckets[name##_size()-1]) return -1LL; \
for (ssize_t bucket = name##_size() - 1; bucket >= 0; bucket--) \
if (h->buckets[bucket]) return name##_bucket_min_val(bucket+1); \
return 0; \
} \
static inline uint64_t name##_count(struct name *h) { \
uint64_t sum = 0; \
for (size_t bucket = 0; bucket < name##_size(); bucket++) \
sum += h->buckets[bucket]; \
return sum; \
} \
static inline uint64_t name##_percentile(struct name *h, double p) { \
uint64_t n = name##_count(h) * p; \
uint64_t sum = 0; \
for (size_t bucket = 0; bucket + 1 < name##_size(); bucket++) { \
sum += h->buckets[bucket]; \
if (sum >= n) return name##_bucket_min_val(bucket+1); \
} \
return -1ULL; \
} \
static inline uint64_t name##_median(struct name *h) { \
return name##_percentile(h, 0.5); \
}
#endif // GC_HISTOGRAM_H

View file

@@ -0,0 +1,7 @@
#ifndef GC_INLINE_H_
#define GC_INLINE_H_
#define GC_ALWAYS_INLINE __attribute__((always_inline))
#define GC_NEVER_INLINE __attribute__((noinline))
#endif // GC_INLINE_H_

View file

@@ -0,0 +1,100 @@
#define LTTNG_UST_TRACEPOINT_PROVIDER whippet
#undef LTTNG_UST_TRACEPOINT_INCLUDE
#define LTTNG_UST_TRACEPOINT_INCLUDE "gc-lttng.h"
#if !defined(_TP_H) || defined(LTTNG_UST_TRACEPOINT_HEADER_MULTI_READ)
#define _TP_H
#include <lttng/tracepoint.h>
LTTNG_UST_TRACEPOINT_ENUM(
whippet, gc_kind,
LTTNG_UST_TP_ENUM_VALUES
(lttng_ust_field_enum_value("MINOR", 1)
lttng_ust_field_enum_value("MAJOR", 2)
lttng_ust_field_enum_value("COMPACTING", 3)))
LTTNG_UST_TRACEPOINT_EVENT_CLASS(
whippet, tracepoint,
LTTNG_UST_TP_ARGS(),
LTTNG_UST_TP_FIELDS())
LTTNG_UST_TRACEPOINT_EVENT_CLASS(
whippet, size_tracepoint,
LTTNG_UST_TP_ARGS(size_t, size),
LTTNG_UST_TP_FIELDS(lttng_ust_field_integer(size_t, size, size)))
/* The tracepoint instances */
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, size_tracepoint, whippet, init,
LTTNG_UST_TP_ARGS(size_t, size))
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, size_tracepoint, whippet, heap_resized,
LTTNG_UST_TP_ARGS(size_t, size))
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, size_tracepoint, whippet, live_data_size,
LTTNG_UST_TP_ARGS(size_t, size))
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, requesting_stop, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, waiting_for_stop, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, mutators_stopped, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT(
whippet, prepare_gc,
LTTNG_UST_TP_ARGS(int, gc_kind),
LTTNG_UST_TP_FIELDS(
lttng_ust_field_enum(whippet, gc_kind, int, gc_kind, gc_kind)))
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, roots_traced, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, heap_traced, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, ephemerons_traced, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, finalizers_traced, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, restarting_mutators, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, mutator_added, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, mutator_cause_gc, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, mutator_stopping, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, mutator_stopped, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, mutator_restarted, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, mutator_removed, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_unpark_all, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_share, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_check_termination_begin, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_check_termination_end, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_steal, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_roots_begin, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_roots_end, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_objects_begin, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_objects_end, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_worker_begin, LTTNG_UST_TP_ARGS())
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(
whippet, tracepoint, whippet, trace_worker_end, LTTNG_UST_TP_ARGS())
#endif /* _TP_H */
#include <lttng/tracepoint-event.h>

View file

@@ -0,0 +1,50 @@
#ifndef GC_NULL_EVENT_LISTENER_H
#define GC_NULL_EVENT_LISTENER_H
#include "gc-event-listener.h"
static inline void gc_null_event_listener_init(void *data, size_t size) {}
static inline void gc_null_event_listener_requesting_stop(void *data) {}
static inline void gc_null_event_listener_waiting_for_stop(void *data) {}
static inline void gc_null_event_listener_mutators_stopped(void *data) {}
static inline void gc_null_event_listener_prepare_gc(void *data,
enum gc_collection_kind) {}
static inline void gc_null_event_listener_roots_traced(void *data) {}
static inline void gc_null_event_listener_heap_traced(void *data) {}
static inline void gc_null_event_listener_ephemerons_traced(void *data) {}
static inline void gc_null_event_listener_finalizers_traced(void *data) {}
static inline void gc_null_event_listener_restarting_mutators(void *data) {}
static inline void* gc_null_event_listener_mutator_added(void *data) { return NULL; }
static inline void gc_null_event_listener_mutator_cause_gc(void *mutator_data) {}
static inline void gc_null_event_listener_mutator_stopping(void *mutator_data) {}
static inline void gc_null_event_listener_mutator_stopped(void *mutator_data) {}
static inline void gc_null_event_listener_mutator_restarted(void *mutator_data) {}
static inline void gc_null_event_listener_mutator_removed(void *mutator_data) {}
static inline void gc_null_event_listener_heap_resized(void *, size_t) {}
static inline void gc_null_event_listener_live_data_size(void *, size_t) {}
#define GC_NULL_EVENT_LISTENER \
((struct gc_event_listener) { \
gc_null_event_listener_init, \
gc_null_event_listener_requesting_stop, \
gc_null_event_listener_waiting_for_stop, \
gc_null_event_listener_mutators_stopped, \
gc_null_event_listener_prepare_gc, \
gc_null_event_listener_roots_traced, \
gc_null_event_listener_heap_traced, \
gc_null_event_listener_ephemerons_traced, \
gc_null_event_listener_finalizers_traced, \
gc_null_event_listener_restarting_mutators, \
gc_null_event_listener_mutator_added, \
gc_null_event_listener_mutator_cause_gc, \
gc_null_event_listener_mutator_stopping, \
gc_null_event_listener_mutator_stopped, \
gc_null_event_listener_mutator_restarted, \
gc_null_event_listener_mutator_removed, \
gc_null_event_listener_heap_resized, \
gc_null_event_listener_live_data_size, \
})
#endif // GC_NULL_EVENT_LISTENER_H

View file

@ -0,0 +1,39 @@
#ifndef GC_OPTIONS_H
#define GC_OPTIONS_H
#include "gc-visibility.h"
enum gc_heap_size_policy {
GC_HEAP_SIZE_FIXED,
GC_HEAP_SIZE_GROWABLE,
GC_HEAP_SIZE_ADAPTIVE,
};
enum {
GC_OPTION_HEAP_SIZE_POLICY,
GC_OPTION_HEAP_SIZE,
GC_OPTION_MAXIMUM_HEAP_SIZE,
GC_OPTION_HEAP_SIZE_MULTIPLIER,
GC_OPTION_HEAP_EXPANSIVENESS,
GC_OPTION_PARALLELISM
};
struct gc_options;
GC_API_ int gc_option_from_string(const char *str);
GC_API_ struct gc_options* gc_allocate_options(void);
GC_API_ int gc_options_set_int(struct gc_options *options, int option,
int value);
GC_API_ int gc_options_set_size(struct gc_options *options, int option,
size_t value);
GC_API_ int gc_options_set_double(struct gc_options *options, int option,
double value);
GC_API_ int gc_options_parse_and_set(struct gc_options *options,
int option, const char *value);
GC_API_ int gc_options_parse_and_set_many(struct gc_options *options,
const char *str);
#endif // GC_OPTIONS_H

View file

@ -0,0 +1,50 @@
#ifndef GC_REF_H
#define GC_REF_H
#include "gc-assert.h"
#include "gc-config.h"
#include <stdint.h>
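// A gc_ref wraps either a pointer to a heap object (low bits zero, as
// objects are at least pointer-aligned) or, when the embedder supports
// immediates (GC_HAS_IMMEDIATES), an immediate value distinguished by its
// nonzero low bits.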
struct gc_ref {
uintptr_t value;
};
static inline struct gc_ref gc_ref(uintptr_t value) {
return (struct gc_ref){value};
}
static inline uintptr_t gc_ref_value(struct gc_ref ref) {
return ref.value;
}
static inline struct gc_ref gc_ref_null(void) {
return gc_ref(0);
}
static inline int gc_ref_is_null(struct gc_ref ref) {
return ref.value == 0;
}
static inline int gc_ref_is_immediate(struct gc_ref ref) {
GC_ASSERT(!gc_ref_is_null(ref));
return GC_HAS_IMMEDIATES && (ref.value & (sizeof(void*) - 1));
}
static inline struct gc_ref gc_ref_immediate(uintptr_t val) {
GC_ASSERT(val & (sizeof(void*) - 1));
GC_ASSERT(GC_HAS_IMMEDIATES);
return gc_ref(val);
}
static inline int gc_ref_is_heap_object(struct gc_ref ref) {
return !gc_ref_is_immediate(ref);
}
static inline struct gc_ref gc_ref_from_heap_object_or_null(void *obj) {
return gc_ref((uintptr_t) obj);
}
static inline struct gc_ref gc_ref_from_heap_object(void *obj) {
GC_ASSERT(obj);
return gc_ref_from_heap_object_or_null(obj);
}
static inline void* gc_ref_heap_object(struct gc_ref ref) {
GC_ASSERT(gc_ref_is_heap_object(ref));
return (void *) gc_ref_value(ref);
}
#endif // GC_REF_H

View file

@ -0,0 +1,17 @@
#ifndef GC_TRACEPOINT_H
#define GC_TRACEPOINT_H
#ifdef GC_TRACEPOINT_LTTNG
#include "gc-lttng.h"
#define GC_TRACEPOINT(...) \
lttng_ust_tracepoint(whippet, __VA_ARGS__)
#else // GC_TRACEPOINT_LTTNG
#define GC_TRACEPOINT(...) do {} while (0)
#endif // GC_TRACEPOINT_LTTNG
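// Usage sketch: GC_TRACEPOINT(trace_roots_begin) emits the corresponding
// event from the "whippet" LTTng provider when GC_TRACEPOINT_LTTNG is
// defined, and compiles away to nothing otherwise.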
#endif // GC_TRACEPOINT_H

View file

@ -0,0 +1,12 @@
#ifndef GC_VISIBILITY_H_
#define GC_VISIBILITY_H_
#define GC_INTERNAL __attribute__((visibility("hidden")))
#define GC_PUBLIC __attribute__((visibility("default")))
// FIXME: Conflict with bdw-gc GC_API. Switch prefix?
#ifndef GC_API_
#define GC_API_ GC_INTERNAL
#endif
#endif // GC_VISIBILITY_H_

View file

@ -0,0 +1,121 @@
#ifndef MMC_ATTRS_H
#define MMC_ATTRS_H
#include "gc-config.h"
#include "gc-assert.h"
#include "gc-attrs.h"
static inline enum gc_allocator_kind gc_allocator_kind(void) {
return GC_ALLOCATOR_INLINE_BUMP_POINTER;
}
static inline size_t gc_allocator_small_granule_size(void) {
return 16;
}
static inline size_t gc_allocator_large_threshold(void) {
return 8192;
}
static inline size_t gc_allocator_allocation_pointer_offset(void) {
return sizeof(uintptr_t) * 0;
}
static inline size_t gc_allocator_allocation_limit_offset(void) {
return sizeof(uintptr_t) * 1;
}
static inline size_t gc_allocator_freelist_offset(size_t size,
enum gc_allocation_kind kind) {
GC_CRASH();
}
static inline size_t gc_allocator_alloc_table_alignment(void) {
return 4 * 1024 * 1024;
}
static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind kind) {
uint8_t young = 1;
uint8_t trace_precisely = 0;
uint8_t trace_none = 8;
uint8_t trace_conservatively = 16;
uint8_t pinned = 16;
if (GC_CONSERVATIVE_TRACE) {
switch (kind) {
case GC_ALLOCATION_TAGGED:
case GC_ALLOCATION_UNTAGGED_CONSERVATIVE:
return young | trace_conservatively;
case GC_ALLOCATION_TAGGED_POINTERLESS:
return young | trace_none;
case GC_ALLOCATION_UNTAGGED_POINTERLESS:
return young | trace_none;
default:
GC_CRASH();
};
} else {
switch (kind) {
case GC_ALLOCATION_TAGGED:
return young | trace_precisely;
case GC_ALLOCATION_TAGGED_POINTERLESS:
return young | trace_none;
case GC_ALLOCATION_UNTAGGED_POINTERLESS:
return young | trace_none | pinned;
case GC_ALLOCATION_UNTAGGED_CONSERVATIVE:
default:
GC_CRASH();
};
}
}
static inline uint8_t gc_allocator_alloc_table_end_pattern(void) {
return 32;
}
static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t obj_size) {
if (GC_GENERATIONAL) {
if (obj_size <= gc_allocator_large_threshold())
return GC_OLD_GENERATION_CHECK_ALLOC_TABLE;
return GC_OLD_GENERATION_CHECK_SLOW;
}
return GC_OLD_GENERATION_CHECK_NONE;
}
static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) {
return 7;
}
static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) {
return 1;
}
static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) {
if (GC_GENERATIONAL) {
if (obj_size <= gc_allocator_large_threshold())
return GC_WRITE_BARRIER_FIELD;
return GC_WRITE_BARRIER_SLOW;
}
return GC_WRITE_BARRIER_NONE;
}
static inline size_t gc_write_barrier_field_table_alignment(void) {
GC_ASSERT(GC_GENERATIONAL);
return gc_allocator_alloc_table_alignment();
}
static inline ptrdiff_t gc_write_barrier_field_table_offset(void) {
GC_ASSERT(GC_GENERATIONAL);
return 0;
}
static inline size_t gc_write_barrier_field_fields_per_byte(void) {
GC_ASSERT(GC_GENERATIONAL);
return 2;
}
static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) {
GC_ASSERT(GC_GENERATIONAL);
return 64; // NOFL_METADATA_BYTE_LOGGED_0
}
static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) {
return GC_SAFEPOINT_MECHANISM_COOPERATIVE;
}
static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) {
return GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG;
}
static inline int gc_can_pin_objects(void) {
return 1;
}
#endif // MMC_ATTRS_H

View file

@ -0,0 +1,92 @@
#ifndef PCC_ATTRS_H
#define PCC_ATTRS_H
#include "gc-config.h"
#include "gc-assert.h"
#include "gc-attrs.h"
static const uintptr_t GC_ALIGNMENT = 8;
static const size_t GC_LARGE_OBJECT_THRESHOLD = 8192;
static inline enum gc_allocator_kind gc_allocator_kind(void) {
return GC_ALLOCATOR_INLINE_BUMP_POINTER;
}
static inline size_t gc_allocator_small_granule_size(void) {
return GC_ALIGNMENT;
}
static inline size_t gc_allocator_large_threshold(void) {
return GC_LARGE_OBJECT_THRESHOLD;
}
static inline size_t gc_allocator_allocation_pointer_offset(void) {
return sizeof(uintptr_t) * 0;
}
static inline size_t gc_allocator_allocation_limit_offset(void) {
return sizeof(uintptr_t) * 1;
}
static inline size_t gc_allocator_freelist_offset(size_t size, enum gc_allocation_kind kind) {
GC_CRASH();
}
static inline size_t gc_allocator_alloc_table_alignment(void) {
return 0;
}
static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind kind) {
GC_CRASH();
}
static inline uint8_t gc_allocator_alloc_table_end_pattern(void) {
GC_CRASH();
}
static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t size) {
if (!GC_GENERATIONAL)
return GC_OLD_GENERATION_CHECK_NONE;
if (size <= gc_allocator_large_threshold())
return GC_OLD_GENERATION_CHECK_SMALL_OBJECT_NURSERY;
return GC_OLD_GENERATION_CHECK_SLOW;
}
static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) {
GC_CRASH();
}
static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) {
GC_CRASH();
}
static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) {
if (!GC_GENERATIONAL)
return GC_WRITE_BARRIER_NONE;
if (obj_size <= gc_allocator_large_threshold())
return GC_WRITE_BARRIER_FIELD;
return GC_WRITE_BARRIER_SLOW;
}
static inline size_t gc_write_barrier_field_table_alignment(void) {
GC_ASSERT(GC_GENERATIONAL);
return 64 * 1024 * 1024;
}
static inline ptrdiff_t gc_write_barrier_field_table_offset(void) {
GC_ASSERT(GC_GENERATIONAL);
return 128 * 1024;
}
static inline size_t gc_write_barrier_field_fields_per_byte(void) {
GC_ASSERT(GC_GENERATIONAL);
return 8;
}
static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) {
GC_ASSERT(GC_GENERATIONAL);
return 1;
}
static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) {
return GC_SAFEPOINT_MECHANISM_COOPERATIVE;
}
static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) {
return GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG;
}
static inline int gc_can_pin_objects(void) {
return 0;
}
#endif // PCC_ATTRS_H

View file

@ -0,0 +1,80 @@
#ifndef SEMI_ATTRS_H
#define SEMI_ATTRS_H
#include "gc-attrs.h"
#include "gc-assert.h"
static const uintptr_t GC_ALIGNMENT = 8;
static const size_t GC_LARGE_OBJECT_THRESHOLD = 8192;
static inline enum gc_allocator_kind gc_allocator_kind(void) {
return GC_ALLOCATOR_INLINE_BUMP_POINTER;
}
static inline size_t gc_allocator_small_granule_size(void) {
return GC_ALIGNMENT;
}
static inline size_t gc_allocator_large_threshold(void) {
return GC_LARGE_OBJECT_THRESHOLD;
}
static inline size_t gc_allocator_allocation_pointer_offset(void) {
return sizeof(uintptr_t) * 0;
}
static inline size_t gc_allocator_allocation_limit_offset(void) {
return sizeof(uintptr_t) * 1;
}
static inline size_t gc_allocator_freelist_offset(size_t size,
enum gc_allocation_kind kind) {
GC_CRASH();
}
static inline size_t gc_allocator_alloc_table_alignment(void) {
return 0;
}
static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind kind) {
GC_CRASH();
}
static inline uint8_t gc_allocator_alloc_table_end_pattern(void) {
GC_CRASH();
}
static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t) {
return GC_OLD_GENERATION_CHECK_NONE;
}
static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) {
GC_CRASH();
}
static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) {
GC_CRASH();
}
static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t) {
return GC_WRITE_BARRIER_NONE;
}
static inline size_t gc_write_barrier_field_table_alignment(void) {
GC_CRASH();
}
static inline ptrdiff_t gc_write_barrier_field_table_offset(void) {
GC_CRASH();
}
static inline size_t gc_write_barrier_field_fields_per_byte(void) {
GC_CRASH();
}
static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) {
GC_CRASH();
}
static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) {
return GC_SAFEPOINT_MECHANISM_COOPERATIVE;
}
static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) {
return GC_COOPERATIVE_SAFEPOINT_NONE;
}
static inline int gc_can_pin_objects(void) {
return 0;
}
#endif // SEMI_ATTRS_H

View file

@ -0,0 +1,35 @@
# Benchmarks
- [`mt-gcbench.c`](./mt-gcbench.c): The multi-threaded [GCBench
benchmark](https://hboehm.info/gc/gc_bench.html). An old but
standard benchmark that allocates different sizes of binary trees.
As parameters it takes a heap multiplier and a number of mutator
threads. We analytically compute the peak amount of live data and
then size the GC heap as a multiplier of that size (see the heap-sizing sketch after this list). It has a peak
heap consumption of 10 MB or so per mutator thread: not very large.
At a 2x heap multiplier, it causes about 30 collections for the `mmc`
collector, and runs somewhere around 200-400 milliseconds in
single-threaded mode, on the machines I have in 2022. For low thread
counts, the GCBench benchmark is small; but then again many Guile
processes also are quite short-lived, so perhaps it is useful to
ensure that small heaps remain lightweight.
To stress `mmc`'s handling of fragmentation, we modified this
benchmark to intersperse pseudorandomly-sized holes between tree
nodes.
- [`quads.c`](./quads.c): A synthetic benchmark that allocates quad
trees. The mutator begins by allocating one long-lived tree of depth
N, and then allocates 13% of the heap in depth-3 trees, 20 times,
simulating a fixed working set and otherwise an allocation-heavy
workload. By observing the times to allocate 13% of the heap in
garbage we can infer mutator overheads, and also note the variance
for the cycles in which GC hits.
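For concreteness, here is a minimal C sketch of the heap-sizing convention these benchmarks share: estimate peak live data, multiply, and request a fixed-size heap via the `gc_options` API. The `setup_heap` helper is illustrative only and not part of any benchmark.
```c
#include "gc-api.h"
#include "gc-basic-stats.h"
/* Size the heap as a multiple of the peak live data, as mt-gcbench.c and
   quads.c do, then initialize the collector. */
static int setup_heap(size_t peak_live_bytes, double multiplier,
                      struct gc_heap **heap, struct gc_mutator **mut,
                      struct gc_basic_stats *stats) {
  size_t heap_size = peak_live_bytes * multiplier;
  struct gc_options *options = gc_allocate_options();
  gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED);
  gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size);
  return gc_init(options, NULL, heap, mut, GC_BASIC_STATS, stats);
}
```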
## License
mt-gcbench.c was originally from https://hboehm.info/gc/gc_bench/, which
has a somewhat unclear license. I have modified GCBench significantly
so that I can slot in different GC implementations. Other files are
distributed under the Whippet license; see the top-level
[README.md](../README.md) for more.

View file

@ -0,0 +1,54 @@
#ifndef EPHEMERONS_EMBEDDER_H
#define EPHEMERONS_EMBEDDER_H
#include <stddef.h>
#include "ephemerons-types.h"
#include "gc-ephemeron.h"
struct gc_heap;
#define DEFINE_METHODS(name, Name, NAME) \
static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \
static inline void visit_##name##_fields(Name *obj,\
void (*visit)(struct gc_edge edge, \
struct gc_heap *heap, \
void *visit_data), \
struct gc_heap *heap, \
void *visit_data) GC_ALWAYS_INLINE;
FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS)
#undef DEFINE_METHODS
static inline size_t small_object_size(SmallObject *obj) { return sizeof(*obj); }
static inline size_t ephemeron_size(Ephemeron *obj) { return gc_ephemeron_size(); }
static inline size_t box_size(Box *obj) { return sizeof(*obj); }
static inline void
visit_small_object_fields(SmallObject *obj,
void (*visit)(struct gc_edge edge, struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data) {}
static inline void
visit_ephemeron_fields(Ephemeron *ephemeron,
void (*visit)(struct gc_edge edge, struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data) {
gc_trace_ephemeron((struct gc_ephemeron*)ephemeron, visit, heap, visit_data);
}
static inline void
visit_box_fields(Box *box,
void (*visit)(struct gc_edge edge, struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data) {
visit(gc_edge(&box->obj), heap, visit_data);
}
#include "simple-gc-embedder.h"
#endif // EPHEMERONS_EMBEDDER_H

View file

@ -0,0 +1,21 @@
#ifndef EPHEMERONS_TYPES_H
#define EPHEMERONS_TYPES_H
#define FOR_EACH_HEAP_OBJECT_KIND(M) \
M(box, Box, BOX) \
M(ephemeron, Ephemeron, EPHEMERON) \
M(small_object, SmallObject, SMALL_OBJECT)
#include "heap-objects.h"
#include "simple-tagging-scheme.h"
struct SmallObject {
struct gc_header header;
};
struct Box {
struct gc_header header;
void *obj;
};
#endif // EPHEMERONS_TYPES_H

View file

@ -0,0 +1,272 @@
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <sys/time.h>
#include "assert.h"
#include "gc-api.h"
#include "gc-basic-stats.h"
#include "gc-ephemeron.h"
#include "simple-roots-api.h"
#include "ephemerons-types.h"
#include "simple-allocator.h"
typedef HANDLE_TO(SmallObject) SmallObjectHandle;
typedef HANDLE_TO(struct gc_ephemeron) EphemeronHandle;
typedef HANDLE_TO(Box) BoxHandle;
static SmallObject* allocate_small_object(struct gc_mutator *mut) {
return gc_allocate_with_kind(mut, ALLOC_KIND_SMALL_OBJECT, sizeof(SmallObject));
}
static Box* allocate_box(struct gc_mutator *mut) {
return gc_allocate_with_kind(mut, ALLOC_KIND_BOX, sizeof(Box));
}
static struct gc_ephemeron* allocate_ephemeron(struct gc_mutator *mut) {
struct gc_ephemeron *ret = gc_allocate_ephemeron(mut);
*tag_word(gc_ref_from_heap_object(ret)) = tag_live(ALLOC_KIND_EPHEMERON);
return ret;
}
/* Get the current time in microseconds */
static unsigned long current_time(void)
{
struct timeval t;
if (gettimeofday(&t, NULL) == -1)
return 0;
return t.tv_sec * 1000 * 1000 + t.tv_usec;
}
struct thread {
struct gc_mutator *mut;
struct gc_mutator_roots roots;
};
static void print_elapsed(const char *what, unsigned long start) {
unsigned long end = current_time();
unsigned long msec = (end - start) / 1000;
unsigned long usec = (end - start) % 1000;
printf("Completed %s in %lu.%.3lu msec\n", what, msec, usec);
}
struct call_with_gc_data {
void* (*f)(struct thread *);
struct gc_heap *heap;
};
static void* call_with_gc_inner(struct gc_stack_addr *addr, void *arg) {
struct call_with_gc_data *data = arg;
struct gc_mutator *mut = gc_init_for_thread(addr, data->heap);
struct thread t = { mut, };
gc_mutator_set_roots(mut, &t.roots);
void *ret = data->f(&t);
gc_finish_for_thread(mut);
return ret;
}
static void* call_with_gc(void* (*f)(struct thread *),
struct gc_heap *heap) {
struct call_with_gc_data data = { f, heap };
return gc_call_with_stack_addr(call_with_gc_inner, &data);
}
#define CHECK(x) \
do { \
if (!(x)) { \
fprintf(stderr, "%s:%d: check failed: %s\n", __FILE__, __LINE__, #x); \
exit(1); \
} \
} while (0)
#define CHECK_EQ(x, y) CHECK((x) == (y))
#define CHECK_NE(x, y) CHECK((x) != (y))
#define CHECK_NULL(x) CHECK_EQ(x, NULL)
#define CHECK_NOT_NULL(x) CHECK_NE(x, NULL)
static size_t ephemeron_chain_length(struct gc_ephemeron **loc,
SmallObject *key) {
struct gc_ephemeron *head = gc_ephemeron_chain_head(loc);
size_t len = 0;
while (head) {
CHECK_EQ(key, (SmallObject*)gc_ref_value(gc_ephemeron_key(head)));
Box *value = gc_ref_heap_object(gc_ephemeron_value(head));
CHECK_NOT_NULL(value);
key = value->obj;
CHECK_NOT_NULL(key);
head = gc_ephemeron_chain_next(head);
len++;
}
return len;
}
static double heap_size;
static double heap_multiplier;
static size_t nthreads;
static void cause_gc(struct gc_mutator *mut) {
// Doing a full collection lets us reason precisely about liveness.
gc_collect(mut, GC_COLLECTION_MAJOR);
}
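// Build a chain of ephemerons: each link's key is the freshly allocated
// current head key, and its value is a box holding the previous key, so
// the whole chain stays reachable exactly as long as the head key is a
// root; dropping the head key should let the entire chain be collected.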
static void make_ephemeron_chain(struct thread *t, EphemeronHandle *head,
SmallObjectHandle *head_key, size_t length) {
BoxHandle tail_box = { NULL };
PUSH_HANDLE(t, tail_box);
CHECK_NULL(HANDLE_REF(*head_key));
HANDLE_SET(*head_key, allocate_small_object(t->mut));
for (size_t i = 0; i < length; i++) {
HANDLE_SET(tail_box, allocate_box(t->mut));
HANDLE_REF(tail_box)->obj = HANDLE_REF(*head_key);
HANDLE_SET(*head_key, allocate_small_object(t->mut));
struct gc_ephemeron *ephemeron = allocate_ephemeron(t->mut);
gc_ephemeron_init(t->mut, ephemeron,
gc_ref_from_heap_object(HANDLE_REF(*head_key)),
gc_ref_from_heap_object(HANDLE_REF(tail_box)));
gc_ephemeron_chain_push(HANDLE_LOC(*head), ephemeron);
}
POP_HANDLE(t);
}
static void* run_one_test(struct thread *t) {
size_t unit_size = gc_ephemeron_size() + sizeof(Box);
size_t list_length = heap_size / nthreads / heap_multiplier / unit_size;
printf("Allocating ephemeron list %zu nodes long. Total size %.3fGB.\n",
list_length, list_length * unit_size / 1e9);
unsigned long thread_start = current_time();
SmallObjectHandle head_key = { NULL };
EphemeronHandle head = { NULL };
PUSH_HANDLE(t, head_key);
PUSH_HANDLE(t, head);
make_ephemeron_chain(t, &head, &head_key, list_length);
size_t measured_length = ephemeron_chain_length(HANDLE_LOC(head),
HANDLE_REF(head_key));
CHECK_EQ(measured_length, list_length);
cause_gc(t->mut);
measured_length = ephemeron_chain_length(HANDLE_LOC(head),
HANDLE_REF(head_key));
CHECK_EQ(measured_length, list_length);
if (!GC_CONSERVATIVE_ROOTS) {
HANDLE_SET(head_key, NULL);
cause_gc(t->mut);
measured_length = ephemeron_chain_length(HANDLE_LOC(head),
HANDLE_REF(head_key));
CHECK_EQ(measured_length, 0);
}
// swap head_key for a key halfway in, cause gc
// check length is expected half-length; warn, or error if precise
// clear and return
print_elapsed("thread", thread_start);
POP_HANDLE(t);
POP_HANDLE(t);
return NULL;
}
static void* run_one_test_in_thread(void *arg) {
struct gc_heap *heap = arg;
return call_with_gc(run_one_test, heap);
}
struct join_data { int status; pthread_t thread; };
static void *join_thread(void *data) {
struct join_data *join_data = data;
void *ret;
join_data->status = pthread_join(join_data->thread, &ret);
return ret;
}
#define MAX_THREAD_COUNT 256
int main(int argc, char *argv[]) {
if (argc < 4 || 5 < argc) {
fprintf(stderr, "usage: %s HEAP_SIZE MULTIPLIER NTHREADS [GC-OPTIONS]\n", argv[0]);
return 1;
}
heap_size = atof(argv[1]);
heap_multiplier = atof(argv[2]);
nthreads = atol(argv[3]);
if (heap_size < 8192) {
fprintf(stderr,
"Heap size should probably be at least 8192, right? '%s'\n",
argv[1]);
return 1;
}
if (!(1.0 < heap_multiplier && heap_multiplier < 100)) {
fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]);
return 1;
}
if (nthreads < 1 || nthreads > MAX_THREAD_COUNT) {
fprintf(stderr, "Expected integer between 1 and %d for thread count, got '%s'\n",
(int)MAX_THREAD_COUNT, argv[3]);
return 1;
}
printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n",
heap_size / 1e9, heap_multiplier);
struct gc_options *options = gc_allocate_options();
gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED);
gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size);
if (argc == 5) {
if (!gc_options_parse_and_set_many(options, argv[4])) {
fprintf(stderr, "Failed to set GC options: '%s'\n", argv[4]);
return 1;
}
}
struct gc_heap *heap;
struct gc_mutator *mut;
struct gc_basic_stats stats;
if (!gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats)) {
fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n",
(size_t)heap_size);
return 1;
}
struct thread main_thread = { mut, };
gc_mutator_set_roots(mut, &main_thread.roots);
pthread_t threads[MAX_THREAD_COUNT];
// Run one of the threads in the main thread.
for (size_t i = 1; i < nthreads; i++) {
int status = pthread_create(&threads[i], NULL, run_one_test_in_thread, heap);
if (status) {
errno = status;
perror("Failed to create thread");
return 1;
}
}
run_one_test(&main_thread);
for (size_t i = 1; i < nthreads; i++) {
struct join_data data = { 0, threads[i] };
gc_call_without_gc(mut, join_thread, &data);
if (data.status) {
errno = data.status;
perror("Failed to join thread");
return 1;
}
}
gc_basic_stats_finish(&stats);
fputs("\n", stdout);
gc_basic_stats_print(&stats, stdout);
return 0;
}

View file

@ -0,0 +1,55 @@
#ifndef FINALIZERS_EMBEDDER_H
#define FINALIZERS_EMBEDDER_H
#include <stddef.h>
#include "finalizers-types.h"
#include "gc-finalizer.h"
struct gc_heap;
#define DEFINE_METHODS(name, Name, NAME) \
static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \
static inline void visit_##name##_fields(Name *obj,\
void (*visit)(struct gc_edge edge, \
struct gc_heap *heap, \
void *visit_data), \
struct gc_heap *heap, \
void *visit_data) GC_ALWAYS_INLINE;
FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS)
#undef DEFINE_METHODS
static inline size_t small_object_size(SmallObject *obj) { return sizeof(*obj); }
static inline size_t finalizer_size(Finalizer *obj) { return gc_finalizer_size(); }
static inline size_t pair_size(Pair *obj) { return sizeof(*obj); }
static inline void
visit_small_object_fields(SmallObject *obj,
void (*visit)(struct gc_edge edge, struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data) {}
static inline void
visit_finalizer_fields(Finalizer *finalizer,
void (*visit)(struct gc_edge edge, struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data) {
gc_trace_finalizer((struct gc_finalizer*)finalizer, visit, heap, visit_data);
}
static inline void
visit_pair_fields(Pair *pair,
void (*visit)(struct gc_edge edge, struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data) {
visit(gc_edge(&pair->car), heap, visit_data);
visit(gc_edge(&pair->cdr), heap, visit_data);
}
#include "simple-gc-embedder.h"
#endif // FINALIZERS_EMBEDDER_H

View file

@ -0,0 +1,22 @@
#ifndef FINALIZERS_TYPES_H
#define FINALIZERS_TYPES_H
#define FOR_EACH_HEAP_OBJECT_KIND(M) \
M(pair, Pair, PAIR) \
M(finalizer, Finalizer, FINALIZER) \
M(small_object, SmallObject, SMALL_OBJECT)
#include "heap-objects.h"
#include "simple-tagging-scheme.h"
struct SmallObject {
struct gc_header header;
};
struct Pair {
struct gc_header header;
void *car;
void *cdr;
};
#endif // FINALIZERS_TYPES_H

View file

@ -0,0 +1,284 @@
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <sys/time.h>
#include "assert.h"
#include "gc-api.h"
#include "gc-basic-stats.h"
#include "gc-finalizer.h"
#include "simple-roots-api.h"
#include "finalizers-types.h"
#include "simple-allocator.h"
typedef HANDLE_TO(SmallObject) SmallObjectHandle;
typedef HANDLE_TO(struct gc_finalizer) FinalizerHandle;
typedef HANDLE_TO(Pair) PairHandle;
static SmallObject* allocate_small_object(struct gc_mutator *mut) {
return gc_allocate_with_kind(mut, ALLOC_KIND_SMALL_OBJECT, sizeof(SmallObject));
}
static Pair* allocate_pair(struct gc_mutator *mut) {
return gc_allocate_with_kind(mut, ALLOC_KIND_PAIR, sizeof(Pair));
}
static struct gc_finalizer* allocate_finalizer(struct gc_mutator *mut) {
struct gc_finalizer *ret = gc_allocate_finalizer(mut);
*tag_word(gc_ref_from_heap_object(ret)) = tag_live(ALLOC_KIND_FINALIZER);
return ret;
}
/* Get the current time in microseconds */
static unsigned long current_time(void)
{
struct timeval t;
if (gettimeofday(&t, NULL) == -1)
return 0;
return t.tv_sec * 1000 * 1000 + t.tv_usec;
}
struct thread {
struct gc_mutator *mut;
struct gc_mutator_roots roots;
};
static void print_elapsed(const char *what, unsigned long start) {
unsigned long end = current_time();
unsigned long msec = (end - start) / 1000;
unsigned long usec = (end - start) % 1000;
printf("Completed %s in %lu.%.3lu msec\n", what, msec, usec);
}
struct call_with_gc_data {
void* (*f)(struct thread *);
struct gc_heap *heap;
};
static void* call_with_gc_inner(struct gc_stack_addr *addr, void *arg) {
struct call_with_gc_data *data = arg;
struct gc_mutator *mut = gc_init_for_thread(addr, data->heap);
struct thread t = { mut, };
gc_mutator_set_roots(mut, &t.roots);
void *ret = data->f(&t);
gc_finish_for_thread(mut);
return ret;
}
static void* call_with_gc(void* (*f)(struct thread *),
struct gc_heap *heap) {
struct call_with_gc_data data = { f, heap };
return gc_call_with_stack_addr(call_with_gc_inner, &data);
}
#define CHECK(x) \
do { \
if (!(x)) { \
fprintf(stderr, "%s:%d: check failed: %s\n", __FILE__, __LINE__, #x); \
exit(1); \
} \
} while (0)
#define CHECK_EQ(x, y) CHECK((x) == (y))
#define CHECK_NE(x, y) CHECK((x) != (y))
#define CHECK_NULL(x) CHECK_EQ(x, NULL)
#define CHECK_NOT_NULL(x) CHECK_NE(x, NULL)
static double heap_size;
static double heap_multiplier;
static size_t nthreads;
static void cause_gc(struct gc_mutator *mut) {
// Doing a full collection lets us reason precisely about liveness.
gc_collect(mut, GC_COLLECTION_MAJOR);
}
static inline void set_car(struct gc_mutator *mut, Pair *obj, void *val) {
void **field = &obj->car;
if (val)
gc_write_barrier(mut, gc_ref_from_heap_object(obj), sizeof(Pair),
gc_edge(field),
gc_ref_from_heap_object(val));
*field = val;
}
static inline void set_cdr(struct gc_mutator *mut, Pair *obj, void *val) {
void **field = &obj->cdr;
if (val)
gc_write_barrier(mut, gc_ref_from_heap_object(obj), sizeof(Pair),
gc_edge(field),
gc_ref_from_heap_object(val));
*field = val;
}
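// Build a list of pairs, attaching to each pair a finalizer whose closure
// is that pair's own car.  Once the list is dropped, each finalizer popped
// from the finalizable queue should report a pair whose car is exactly the
// finalizer's closure, which run_one_test checks below.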
static Pair* make_finalizer_chain(struct thread *t, size_t length) {
PairHandle head = { NULL };
PairHandle tail = { NULL };
PUSH_HANDLE(t, head);
PUSH_HANDLE(t, tail);
for (size_t i = 0; i < length; i++) {
HANDLE_SET(tail, HANDLE_REF(head));
HANDLE_SET(head, allocate_pair(t->mut));
set_car(t->mut, HANDLE_REF(head), allocate_small_object(t->mut));
set_cdr(t->mut, HANDLE_REF(head), HANDLE_REF(tail));
struct gc_finalizer *finalizer = allocate_finalizer(t->mut);
gc_finalizer_attach(t->mut, finalizer, 0,
gc_ref_from_heap_object(HANDLE_REF(head)),
gc_ref_from_heap_object(HANDLE_REF(head)->car));
}
Pair *ret = HANDLE_REF(head);
POP_HANDLE(t);
POP_HANDLE(t);
return ret;
}
static void* run_one_test(struct thread *t) {
size_t unit_size = gc_finalizer_size() + sizeof(Pair);
size_t list_length = heap_size / nthreads / heap_multiplier / unit_size;
ssize_t outstanding = list_length;
printf("Allocating list %zu nodes long. Total size %.3fGB.\n",
list_length, list_length * unit_size / 1e9);
unsigned long thread_start = current_time();
PairHandle chain = { NULL };
PUSH_HANDLE(t, chain);
HANDLE_SET(chain, make_finalizer_chain(t, list_length));
cause_gc(t->mut);
size_t finalized = 0;
for (struct gc_finalizer *f = gc_pop_finalizable(t->mut);
f;
f = gc_pop_finalizable(t->mut)) {
Pair* p = gc_ref_heap_object(gc_finalizer_object(f));
SmallObject* o = gc_ref_heap_object(gc_finalizer_closure(f));
CHECK_EQ(p->car, o);
finalized++;
}
printf("thread %p: GC before clear finalized %zu nodes.\n", t, finalized);
outstanding -= finalized;
HANDLE_SET(chain, NULL);
cause_gc(t->mut);
finalized = 0;
for (struct gc_finalizer *f = gc_pop_finalizable(t->mut);
f;
f = gc_pop_finalizable(t->mut)) {
Pair* p = gc_ref_heap_object(gc_finalizer_object(f));
SmallObject* o = gc_ref_heap_object(gc_finalizer_closure(f));
CHECK_EQ(p->car, o);
finalized++;
}
printf("thread %p: GC after clear finalized %zu nodes.\n", t, finalized);
outstanding -= finalized;
print_elapsed("thread", thread_start);
POP_HANDLE(t);
return (void*)outstanding;
}
static void* run_one_test_in_thread(void *arg) {
struct gc_heap *heap = arg;
return call_with_gc(run_one_test, heap);
}
struct join_data { int status; pthread_t thread; };
static void *join_thread(void *data) {
struct join_data *join_data = data;
void *ret;
join_data->status = pthread_join(join_data->thread, &ret);
return ret;
}
#define MAX_THREAD_COUNT 256
int main(int argc, char *argv[]) {
if (argc < 4 || 5 < argc) {
fprintf(stderr, "usage: %s HEAP_SIZE MULTIPLIER NTHREADS [GC-OPTIONS]\n", argv[0]);
return 1;
}
heap_size = atof(argv[1]);
heap_multiplier = atof(argv[2]);
nthreads = atol(argv[3]);
if (heap_size < 8192) {
fprintf(stderr,
"Heap size should probably be at least 8192, right? '%s'\n",
argv[1]);
return 1;
}
if (!(1.0 < heap_multiplier && heap_multiplier < 100)) {
fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]);
return 1;
}
if (nthreads < 1 || nthreads > MAX_THREAD_COUNT) {
fprintf(stderr, "Expected integer between 1 and %d for thread count, got '%s'\n",
(int)MAX_THREAD_COUNT, argv[3]);
return 1;
}
printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n",
heap_size / 1e9, heap_multiplier);
struct gc_options *options = gc_allocate_options();
gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED);
gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size);
if (argc == 5) {
if (!gc_options_parse_and_set_many(options, argv[4])) {
fprintf(stderr, "Failed to set GC options: '%s'\n", argv[4]);
return 1;
}
}
struct gc_heap *heap;
struct gc_mutator *mut;
struct gc_basic_stats stats;
if (!gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats)) {
fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n",
(size_t)heap_size);
return 1;
}
struct thread main_thread = { mut, };
gc_mutator_set_roots(mut, &main_thread.roots);
pthread_t threads[MAX_THREAD_COUNT];
// Run one of the threads in the main thread.
for (size_t i = 1; i < nthreads; i++) {
int status = pthread_create(&threads[i], NULL, run_one_test_in_thread, heap);
if (status) {
errno = status;
perror("Failed to create thread");
return 1;
}
}
ssize_t outstanding = (ssize_t)run_one_test(&main_thread);
for (size_t i = 1; i < nthreads; i++) {
struct join_data data = { 0, threads[i] };
void *ret = gc_call_without_gc(mut, join_thread, &data);
if (data.status) {
errno = data.status;
perror("Failed to join thread");
return 1;
}
ssize_t thread_outstanding = (ssize_t)ret;
outstanding += thread_outstanding;
}
if (outstanding)
printf("\n\nWARNING: %zd nodes outstanding!!!\n\n", outstanding);
gc_basic_stats_finish(&stats);
fputs("\n", stdout);
gc_basic_stats_print(&stats, stdout);
return 0;
}

View file

@ -0,0 +1,19 @@
#ifndef HEAP_OBJECTS_H
#define HEAP_OBJECTS_H
#include "gc-inline.h"
#include "gc-edge.h"
#define DECLARE_NODE_TYPE(name, Name, NAME) \
struct Name; \
typedef struct Name Name;
FOR_EACH_HEAP_OBJECT_KIND(DECLARE_NODE_TYPE)
#undef DECLARE_NODE_TYPE
#define DEFINE_ENUM(name, Name, NAME) ALLOC_KIND_##NAME,
enum alloc_kind {
FOR_EACH_HEAP_OBJECT_KIND(DEFINE_ENUM)
};
#undef DEFINE_ENUM
#endif // HEAP_OBJECTS_H

View file

@ -0,0 +1,54 @@
#ifndef MT_GCBENCH_EMBEDDER_H
#define MT_GCBENCH_EMBEDDER_H
#include "gc-config.h"
#include "mt-gcbench-types.h"
struct gc_heap;
#define DEFINE_METHODS(name, Name, NAME) \
static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \
static inline void visit_##name##_fields(Name *obj,\
void (*visit)(struct gc_edge edge, \
struct gc_heap *heap, \
void *visit_data), \
struct gc_heap *heap, \
void *visit_data) GC_ALWAYS_INLINE;
FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS)
#undef DEFINE_METHODS
static inline size_t node_size(Node *obj) {
return sizeof(Node);
}
static inline size_t double_array_size(DoubleArray *array) {
return sizeof(*array) + array->length * sizeof(double);
}
static inline size_t hole_size(Hole *hole) {
return sizeof(*hole) + hole->length * sizeof(uintptr_t);
}
static inline void
visit_node_fields(Node *node,
void (*visit)(struct gc_edge edge, struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap, void *visit_data) {
visit(gc_edge(&node->left), heap, visit_data);
visit(gc_edge(&node->right), heap, visit_data);
}
static inline void
visit_double_array_fields(DoubleArray *obj,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap, void *visit_data),
struct gc_heap *heap, void *visit_data) {
}
static inline void
visit_hole_fields(Hole *obj,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap, void *visit_data),
struct gc_heap *heap, void *visit_data) {
if (GC_PRECISE_ROOTS)
GC_CRASH();
}
#include "simple-gc-embedder.h"
#endif // MT_GCBENCH_EMBEDDER_H

View file

@ -0,0 +1,34 @@
#ifndef GCBENCH_TYPES_H
#define GCBENCH_TYPES_H
#include <stddef.h>
#include <stdint.h>
#define FOR_EACH_HEAP_OBJECT_KIND(M) \
M(node, Node, NODE) \
M(double_array, DoubleArray, DOUBLE_ARRAY) \
M(hole, Hole, HOLE)
#include "heap-objects.h"
#include "simple-tagging-scheme.h"
struct Node {
struct gc_header header;
struct Node *left;
struct Node *right;
int i, j;
};
struct DoubleArray {
struct gc_header header;
size_t length;
double values[0];
};
struct Hole {
struct gc_header header;
size_t length;
uintptr_t values[0];
};
#endif // GCBENCH_TYPES_H

View file

@ -0,0 +1,402 @@
// This is adapted from a benchmark written by John Ellis and Pete Kovac
// of Post Communications.
// It was modified by Hans Boehm of Silicon Graphics.
// Translated to C++ 30 May 1997 by William D Clinger of Northeastern Univ.
// Translated to C 15 March 2000 by Hans Boehm, now at HP Labs.
//
// This is no substitute for real applications. No actual application
// is likely to behave in exactly this way. However, this benchmark was
// designed to be more representative of real applications than other
// Java GC benchmarks of which we are aware.
// It attempts to model those properties of allocation requests that
// are important to current GC techniques.
// It is designed to be used either to obtain a single overall performance
// number, or to give a more detailed estimate of how collector
// performance varies with object lifetimes. It prints the time
// required to allocate and collect balanced binary trees of various
// sizes. Smaller trees result in shorter object lifetimes. Each cycle
// allocates roughly the same amount of memory.
// Two data structures are kept around during the entire process, so
// that the measured performance is representative of applications
// that maintain some live in-memory data. One of these is a tree
// containing many pointers. The other is a large array containing
// double precision floating point numbers. Both should be of comparable
// size.
//
// The results are only really meaningful together with a specification
// of how much memory was used. It is possible to trade memory for
// better time performance. This benchmark should be run in a 32 MB
// heap, though we don't currently know how to enforce that uniformly.
//
// Unlike the original Ellis and Kovac benchmark, we do not attempt to
// measure pause times. This facility should eventually be added back
// in. There are several reasons for omitting it for now. The original
// implementation depended on assumptions about the thread scheduler
// that don't hold uniformly. The results really measure both the
// scheduler and GC. Pause time measurements tend to not fit well with
// current benchmark suites. As far as we know, none of the current
// commercial Java implementations seriously attempt to minimize GC pause
// times.
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include "assert.h"
#include "gc-api.h"
#include "gc-basic-stats.h"
#include "mt-gcbench-types.h"
#include "simple-roots-api.h"
#include "simple-allocator.h"
#define MAX_THREAD_COUNT 256
static const int long_lived_tree_depth = 16; // about 4Mb
static const int array_size = 500000; // about 4Mb
static const int min_tree_depth = 4;
static const int max_tree_depth = 16;
typedef HANDLE_TO(Node) NodeHandle;
typedef HANDLE_TO(DoubleArray) DoubleArrayHandle;
static Node* allocate_node(struct gc_mutator *mut) {
// memset to 0 by the collector.
return gc_allocate_with_kind(mut, ALLOC_KIND_NODE, sizeof (Node));
}
static DoubleArray* allocate_double_array(struct gc_mutator *mut,
size_t size) {
// May be uninitialized.
size_t bytes = sizeof(DoubleArray) + sizeof (double) * size;
DoubleArray *ret =
gc_allocate_pointerless_with_kind(mut, ALLOC_KIND_DOUBLE_ARRAY, bytes);
ret->length = size;
return ret;
}
static Hole* allocate_hole(struct gc_mutator *mut, size_t size) {
size_t bytes = sizeof(Hole) + sizeof (uintptr_t) * size;
Hole *ret = gc_allocate_with_kind(mut, ALLOC_KIND_HOLE, bytes);
ret->length = size;
return ret;
}
static unsigned long current_time(void) {
struct timeval t = { 0 };
gettimeofday(&t, NULL);
return t.tv_sec * 1000 * 1000 + t.tv_usec;
}
static double elapsed_millis(unsigned long start) {
return (current_time() - start) * 1e-3;
}
// Nodes used by a tree of a given size
static int tree_size(int i) {
return ((1 << (i + 1)) - 1);
}
// Number of iterations to use for a given tree depth
static int compute_num_iters(int i) {
return 2 * tree_size(max_tree_depth + 2) / tree_size(i);
}
// A power-law distribution. Each integer was selected by starting at 0, taking
// a random number in [0,1), and then accepting the integer if the random number
// was less than 0.15, or trying again with the next integer otherwise. Useful
// for modelling allocation sizes or number of garbage objects to allocate
// between live allocations.
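// As a hypothetical regeneration sketch (not part of the benchmark; the
// helper name and use of drand48 are assumptions), the table above could
// be reproduced with:
//
//   static uint8_t power_law_sample(void) {
//     uint8_t i = 0;
//     while (drand48() >= 0.15)  // accept i with probability 0.15, else try i+1
//       i++;
//     return i;
//   }
//
// i.e. each entry is drawn from a geometric distribution with p = 0.15.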
static const uint8_t power_law_distribution[256] = {
1, 15, 3, 12, 2, 8, 4, 0, 18, 7, 9, 8, 15, 2, 36, 5,
1, 9, 6, 11, 9, 19, 2, 0, 0, 3, 9, 6, 3, 2, 1, 1,
6, 1, 8, 4, 2, 0, 5, 3, 7, 0, 0, 3, 0, 4, 1, 7,
1, 8, 2, 2, 2, 14, 0, 7, 8, 0, 2, 1, 4, 12, 7, 5,
0, 3, 4, 13, 10, 2, 3, 7, 0, 8, 0, 23, 0, 16, 1, 1,
6, 28, 1, 18, 0, 3, 6, 5, 8, 6, 14, 5, 2, 5, 0, 11,
0, 18, 4, 16, 1, 4, 3, 13, 3, 23, 7, 4, 10, 5, 3, 13,
0, 14, 5, 5, 2, 5, 0, 16, 2, 0, 1, 1, 0, 0, 4, 2,
7, 7, 0, 5, 7, 2, 1, 24, 27, 3, 7, 1, 0, 8, 1, 4,
0, 3, 0, 7, 7, 3, 9, 2, 9, 2, 5, 10, 1, 1, 12, 6,
2, 9, 5, 0, 4, 6, 0, 7, 2, 1, 5, 4, 1, 0, 1, 15,
4, 0, 15, 4, 0, 0, 32, 18, 2, 2, 1, 7, 8, 3, 11, 1,
2, 7, 11, 1, 9, 1, 2, 6, 11, 17, 1, 2, 5, 1, 14, 3,
6, 1, 1, 15, 3, 1, 0, 6, 10, 8, 1, 3, 2, 7, 0, 1,
0, 11, 3, 3, 5, 8, 2, 0, 0, 7, 12, 2, 5, 20, 3, 7,
4, 4, 5, 22, 1, 5, 2, 7, 15, 2, 4, 6, 11, 8, 12, 1
};
static size_t power_law(size_t *counter) {
return power_law_distribution[(*counter)++ & 0xff];
}
struct thread {
struct gc_mutator *mut;
struct gc_mutator_roots roots;
size_t counter;
};
static void allocate_garbage(struct thread *t) {
size_t hole = power_law(&t->counter);
if (hole) {
allocate_hole(t->mut, hole);
}
}
static inline void set_field(struct gc_mutator *mut, Node *obj,
Node **field, Node *val) {
gc_write_barrier(mut, gc_ref_from_heap_object(obj), sizeof(Node),
gc_edge(field),
gc_ref_from_heap_object(val));
*field = val;
}
// Build tree top down, assigning to older objects.
static void populate(struct thread *t, int depth, Node *node) {
struct gc_mutator *mut = t->mut;
if (depth <= 0)
return;
NodeHandle self = { node };
PUSH_HANDLE(t, self);
allocate_garbage(t);
NodeHandle l = { allocate_node(mut) };
PUSH_HANDLE(t, l);
allocate_garbage(t);
NodeHandle r = { allocate_node(mut) };
PUSH_HANDLE(t, r);
set_field(mut, HANDLE_REF(self), &HANDLE_REF(self)->left, HANDLE_REF(l));
set_field(mut, HANDLE_REF(self), &HANDLE_REF(self)->right, HANDLE_REF(r));
// i is 0 because the memory is zeroed.
HANDLE_REF(self)->j = depth;
populate(t, depth-1, HANDLE_REF(self)->left);
populate(t, depth-1, HANDLE_REF(self)->right);
POP_HANDLE(t);
POP_HANDLE(t);
POP_HANDLE(t);
}
// Build tree bottom-up
static Node* make_tree(struct thread *t, int depth) {
struct gc_mutator *mut = t->mut;
if (depth <= 0)
return allocate_node(mut);
NodeHandle left = { make_tree(t, depth-1) };
PUSH_HANDLE(t, left);
NodeHandle right = { make_tree(t, depth-1) };
PUSH_HANDLE(t, right);
allocate_garbage(t);
Node *result = allocate_node(mut);
result->left = HANDLE_REF(left);
result->right = HANDLE_REF(right);
// i is 0 because the memory is zeroed.
result->j = depth;
POP_HANDLE(t);
POP_HANDLE(t);
return result;
}
static void validate_tree(Node *tree, int depth) {
#ifndef NDEBUG
GC_ASSERT_EQ(tree->i, 0);
GC_ASSERT_EQ(tree->j, depth);
if (depth == 0) {
GC_ASSERT(!tree->left);
GC_ASSERT(!tree->right);
} else {
GC_ASSERT(tree->left);
GC_ASSERT(tree->right);
validate_tree(tree->left, depth - 1);
validate_tree(tree->right, depth - 1);
}
#endif
}
static void time_construction(struct thread *t, int depth) {
struct gc_mutator *mut = t->mut;
int num_iters = compute_num_iters(depth);
NodeHandle temp_tree = { NULL };
PUSH_HANDLE(t, temp_tree);
printf("Creating %d trees of depth %d\n", num_iters, depth);
{
unsigned long start = current_time();
for (int i = 0; i < num_iters; ++i) {
HANDLE_SET(temp_tree, allocate_node(mut));
populate(t, depth, HANDLE_REF(temp_tree));
validate_tree(HANDLE_REF(temp_tree), depth);
HANDLE_SET(temp_tree, NULL);
}
printf("\tTop down construction took %.3f msec\n",
elapsed_millis(start));
}
{
long start = current_time();
for (int i = 0; i < num_iters; ++i) {
HANDLE_SET(temp_tree, make_tree(t, depth));
validate_tree(HANDLE_REF(temp_tree), depth);
HANDLE_SET(temp_tree, NULL);
}
printf("\tBottom up construction took %.3f msec\n",
elapsed_millis(start));
}
POP_HANDLE(t);
}
struct call_with_gc_data {
void* (*f)(struct thread *);
struct gc_heap *heap;
};
static void* call_with_gc_inner(struct gc_stack_addr *addr, void *arg) {
struct call_with_gc_data *data = arg;
struct gc_mutator *mut = gc_init_for_thread(addr, data->heap);
struct thread t = { mut, };
gc_mutator_set_roots(mut, &t.roots);
void *ret = data->f(&t);
gc_finish_for_thread(mut);
return ret;
}
static void* call_with_gc(void* (*f)(struct thread *),
struct gc_heap *heap) {
struct call_with_gc_data data = { f, heap };
return gc_call_with_stack_addr(call_with_gc_inner, &data);
}
static void* run_one_test(struct thread *t) {
NodeHandle long_lived_tree = { NULL };
NodeHandle temp_tree = { NULL };
DoubleArrayHandle array = { NULL };
PUSH_HANDLE(t, long_lived_tree);
PUSH_HANDLE(t, temp_tree);
PUSH_HANDLE(t, array);
// Create a long lived object
printf(" Creating a long-lived binary tree of depth %d\n",
long_lived_tree_depth);
HANDLE_SET(long_lived_tree, allocate_node(t->mut));
populate(t, long_lived_tree_depth, HANDLE_REF(long_lived_tree));
// Create long-lived array, filling half of it
printf(" Creating a long-lived array of %d doubles\n", array_size);
HANDLE_SET(array, allocate_double_array(t->mut, array_size));
for (int i = 0; i < array_size/2; ++i) {
HANDLE_REF(array)->values[i] = 1.0/i;
}
for (int d = min_tree_depth; d <= max_tree_depth; d += 2) {
time_construction(t, d);
}
validate_tree(HANDLE_REF(long_lived_tree), long_lived_tree_depth);
// Fake reference to LongLivedTree and array to keep them from being optimized
// away.
if (HANDLE_REF(long_lived_tree)->i != 0
|| HANDLE_REF(array)->values[1000] != 1.0/1000)
fprintf(stderr, "Failed\n");
POP_HANDLE(t);
POP_HANDLE(t);
POP_HANDLE(t);
return NULL;
}
static void* run_one_test_in_thread(void *arg) {
struct gc_heap *heap = arg;
return call_with_gc(run_one_test, heap);
}
struct join_data { int status; pthread_t thread; };
static void *join_thread(void *data) {
struct join_data *join_data = data;
void *ret;
join_data->status = pthread_join(join_data->thread, &ret);
return ret;
}
int main(int argc, char *argv[]) {
size_t heap_max_live =
tree_size(long_lived_tree_depth) * sizeof(Node) +
tree_size(max_tree_depth) * sizeof(Node) +
sizeof(DoubleArray) + sizeof(double) * array_size;
if (argc < 3 || argc > 4) {
fprintf(stderr, "usage: %s MULTIPLIER NTHREADS [GC-OPTIONS]\n", argv[0]);
return 1;
}
double multiplier = atof(argv[1]);
size_t nthreads = atol(argv[2]);
if (!(0.1 < multiplier && multiplier < 100)) {
fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[1]);
return 1;
}
if (nthreads < 1 || nthreads > MAX_THREAD_COUNT) {
fprintf(stderr, "Expected integer between 1 and %d for thread count, got '%s'\n",
(int)MAX_THREAD_COUNT, argv[2]);
return 1;
}
size_t heap_size = heap_max_live * multiplier * nthreads;
struct gc_options *options = gc_allocate_options();
gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED);
gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size);
if (argc == 4) {
if (!gc_options_parse_and_set_many(options, argv[3])) {
fprintf(stderr, "Failed to set GC options: '%s'\n", argv[3]);
return 1;
}
}
struct gc_heap *heap;
struct gc_mutator *mut;
struct gc_basic_stats stats;
if (!gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats)) {
fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n",
heap_size);
return 1;
}
struct thread main_thread = { mut, };
gc_mutator_set_roots(mut, &main_thread.roots);
printf("Garbage Collector Test\n");
printf(" Live storage will peak at %zd bytes.\n\n", heap_max_live);
pthread_t threads[MAX_THREAD_COUNT];
// Run one of the threads in the main thread.
for (size_t i = 1; i < nthreads; i++) {
int status = pthread_create(&threads[i], NULL, run_one_test_in_thread, heap);
if (status) {
errno = status;
perror("Failed to create thread");
return 1;
}
}
run_one_test(&main_thread);
for (size_t i = 1; i < nthreads; i++) {
struct join_data data = { 0, threads[i] };
gc_call_without_gc(mut, join_thread, &data);
if (data.status) {
errno = data.status;
perror("Failed to join thread");
return 1;
}
}
gc_basic_stats_finish(&stats);
fputs("\n", stdout);
gc_basic_stats_print(&stats, stdout);
}

View file

@ -0,0 +1,37 @@
#ifndef QUADS_EMBEDDER_H
#define QUADS_EMBEDDER_H
#include <stddef.h>
#include "quads-types.h"
struct gc_heap;
#define DEFINE_METHODS(name, Name, NAME) \
static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \
static inline void visit_##name##_fields(Name *obj,\
void (*visit)(struct gc_edge edge, \
struct gc_heap *heap, \
void *visit_data), \
struct gc_heap *heap, \
void *visit_data) GC_ALWAYS_INLINE;
FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS)
#undef DEFINE_METHODS
static inline size_t quad_size(Quad *obj) {
return sizeof(Quad);
}
static inline void
visit_quad_fields(Quad *quad,
void (*visit)(struct gc_edge edge, struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data) {
for (size_t i = 0; i < 4; i++)
visit(gc_edge(&quad->kids[i]), heap, visit_data);
}
#include "simple-gc-embedder.h"
#endif // QUADS_EMBEDDER_H

View file

@ -0,0 +1,15 @@
#ifndef QUADS_TYPES_H
#define QUADS_TYPES_H
#define FOR_EACH_HEAP_OBJECT_KIND(M) \
M(quad, Quad, QUAD)
#include "heap-objects.h"
#include "simple-tagging-scheme.h"
struct Quad {
struct gc_header header;
struct Quad *kids[4];
};
#endif // QUADS_TYPES_H

View file

@ -0,0 +1,181 @@
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <sys/time.h>
#include "assert.h"
#include "gc-api.h"
#include "gc-basic-stats.h"
#include "simple-roots-api.h"
#include "quads-types.h"
#include "simple-allocator.h"
typedef HANDLE_TO(Quad) QuadHandle;
static Quad* allocate_quad(struct gc_mutator *mut) {
// memset to 0 by the collector.
return gc_allocate_with_kind(mut, ALLOC_KIND_QUAD, sizeof (Quad));
}
/* Get the current time in microseconds */
static unsigned long current_time(void)
{
struct timeval t;
if (gettimeofday(&t, NULL) == -1)
return 0;
return t.tv_sec * 1000 * 1000 + t.tv_usec;
}
struct thread {
struct gc_mutator *mut;
struct gc_mutator_roots roots;
size_t counter;
};
// Build tree bottom-up
static Quad* make_tree(struct thread *t, int depth) {
if (depth<=0) {
return allocate_quad(t->mut);
} else {
QuadHandle kids[4] = { { NULL }, };
for (size_t i = 0; i < 4; i++) {
HANDLE_SET(kids[i], make_tree(t, depth-1));
PUSH_HANDLE(t, kids[i]);
}
Quad *result = allocate_quad(t->mut);
for (size_t i = 0; i < 4; i++)
result->kids[i] = HANDLE_REF(kids[i]);
for (size_t i = 0; i < 4; i++)
POP_HANDLE(t);
return result;
}
}
static void validate_tree(Quad *tree, int depth) {
for (size_t i = 0; i < 4; i++) {
if (depth == 0) {
if (tree->kids[i])
abort();
} else {
if (!tree->kids[i])
abort();
validate_tree(tree->kids[i], depth - 1);
}
}
}
static void print_elapsed(const char *what, unsigned long start) {
unsigned long end = current_time();
unsigned long msec = (end - start) / 1000;
unsigned long usec = (end - start) % 1000;
printf("Completed %s in %lu.%.3lu msec\n", what, msec, usec);
}
static size_t parse_size(char *arg, const char *what) {
long val = atol(arg);
if (val <= 0) {
fprintf(stderr, "Failed to parse %s '%s'\n", what, arg);
exit(1);
}
return val;
}
static size_t tree_size(size_t depth) {
size_t nquads = 0;
size_t leaf_count = 1;
for (size_t i = 0; i <= depth; i++) {
if (nquads > ((size_t)-1) - leaf_count) {
fprintf(stderr,
"error: address space too small for quad tree of depth %zu\n",
depth);
exit(1);
}
nquads += leaf_count;
leaf_count *= 4;
}
return nquads;
}
#define MAX_THREAD_COUNT 256
int main(int argc, char *argv[]) {
if (argc < 3 || 4 < argc) {
fprintf(stderr, "usage: %s DEPTH MULTIPLIER [GC-OPTIONS]\n", argv[0]);
return 1;
}
size_t depth = parse_size(argv[1], "depth");
double multiplier = atof(argv[2]);
if (!(1.0 < multiplier && multiplier < 100)) {
fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]);
return 1;
}
size_t nquads = tree_size(depth);
size_t tree_bytes = nquads * sizeof(Quad);
size_t heap_size = tree_bytes * multiplier;
printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n",
heap_size / 1e9, multiplier);
struct gc_options *options = gc_allocate_options();
gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED);
gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size);
if (argc == 4) {
if (!gc_options_parse_and_set_many(options, argv[3])) {
fprintf(stderr, "Failed to set GC options: '%s'\n", argv[3]);
return 1;
}
}
struct gc_heap *heap;
struct gc_mutator *mut;
struct gc_basic_stats stats;
if (!gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats)) {
fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n",
heap_size);
return 1;
}
struct thread t = { mut, };
gc_mutator_set_roots(mut, &t.roots);
QuadHandle quad = { NULL };
PUSH_HANDLE(&t, quad);
printf("Making quad tree of depth %zu (%zu nodes). Total size %.3fGB.\n",
depth, nquads, (nquads * sizeof(Quad)) / 1e9);
unsigned long start = current_time();
HANDLE_SET(quad, make_tree(&t, depth));
print_elapsed("construction", start);
validate_tree(HANDLE_REF(quad), depth);
size_t garbage_step = heap_size / 7.5;
printf("Allocating %.3f GB of garbage, 20 times, validating live tree each time.\n",
garbage_step / 1e9);
unsigned long garbage_start = current_time();
for (size_t i = 0; i < 20; i++) {
size_t garbage_depth = 3;
start = current_time();
for (size_t i = garbage_step/(tree_size(garbage_depth)*4*sizeof(Quad*)); i; i--)
make_tree(&t, garbage_depth);
print_elapsed("allocating garbage", start);
start = current_time();
validate_tree(HANDLE_REF(quad), depth);
}
print_elapsed("allocation loop", garbage_start);
gc_basic_stats_finish(&stats);
fputs("\n", stdout);
gc_basic_stats_print(&stats, stdout);
POP_HANDLE(&t);
return 0;
}

View file

@ -0,0 +1,21 @@
#ifndef SIMPLE_ALLOCATOR_H
#define SIMPLE_ALLOCATOR_H
#include "simple-tagging-scheme.h"
#include "gc-api.h"
static inline void*
gc_allocate_with_kind(struct gc_mutator *mut, enum alloc_kind kind, size_t bytes) {
void *obj = gc_allocate(mut, bytes, GC_ALLOCATION_TAGGED);
*tag_word(gc_ref_from_heap_object(obj)) = tag_live(kind);
return obj;
}
static inline void*
gc_allocate_pointerless_with_kind(struct gc_mutator *mut, enum alloc_kind kind, size_t bytes) {
void *obj = gc_allocate(mut, bytes, GC_ALLOCATION_TAGGED_POINTERLESS);
*tag_word(gc_ref_from_heap_object(obj)) = tag_live(kind);
return obj;
}
#endif // SIMPLE_ALLOCATOR_H

View file

@ -0,0 +1,183 @@
#include <stdatomic.h>
#include "simple-tagging-scheme.h"
#include "simple-roots-types.h"
#include "gc-config.h"
#include "gc-embedder-api.h"
#define GC_EMBEDDER_EPHEMERON_HEADER struct gc_header header;
#define GC_EMBEDDER_FINALIZER_HEADER struct gc_header header;
static inline size_t gc_finalizer_priority_count(void) { return 2; }
static inline int
gc_is_valid_conservative_ref_displacement(uintptr_t displacement) {
#if GC_CONSERVATIVE_ROOTS || GC_CONSERVATIVE_TRACE
// Here is where you would allow tagged heap object references.
return displacement == 0;
#else
// Shouldn't get here.
GC_CRASH();
#endif
}
// No external objects in simple benchmarks.
static inline int gc_extern_space_visit(struct gc_extern_space *space,
struct gc_edge edge,
struct gc_ref ref) {
GC_CRASH();
}
static inline void gc_extern_space_start_gc(struct gc_extern_space *space,
int is_minor_gc) {
}
static inline void gc_extern_space_finish_gc(struct gc_extern_space *space,
int is_minor_gc) {
}
static inline void gc_trace_object(struct gc_ref ref,
void (*trace_edge)(struct gc_edge edge,
struct gc_heap *heap,
void *trace_data),
struct gc_heap *heap,
void *trace_data,
size_t *size) {
#if GC_CONSERVATIVE_TRACE
// Shouldn't get here.
GC_CRASH();
#else
switch (tag_live_alloc_kind(*tag_word(ref))) {
#define SCAN_OBJECT(name, Name, NAME) \
case ALLOC_KIND_##NAME: \
if (trace_edge) \
visit_##name##_fields(gc_ref_heap_object(ref), trace_edge, \
heap, trace_data); \
if (size) \
*size = name##_size(gc_ref_heap_object(ref)); \
break;
FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT)
#undef SCAN_OBJECT
default:
GC_CRASH();
}
#endif
}
static inline void visit_roots(struct handle *roots,
void (*trace_edge)(struct gc_edge edge,
struct gc_heap *heap,
void *trace_data),
struct gc_heap *heap,
void *trace_data) {
for (struct handle *h = roots; h; h = h->next)
trace_edge(gc_edge(&h->v), heap, trace_data);
}
static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots,
void (*trace_edge)(struct gc_edge edge,
struct gc_heap *heap,
void *trace_data),
struct gc_heap *heap,
void *trace_data) {
if (roots)
visit_roots(roots->roots, trace_edge, heap, trace_data);
}
static inline void gc_trace_heap_roots(struct gc_heap_roots *roots,
void (*trace_edge)(struct gc_edge edge,
struct gc_heap *heap,
void *trace_data),
struct gc_heap *heap,
void *trace_data) {
if (roots)
visit_roots(roots->roots, trace_edge, heap, trace_data);
}
static inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref) {
uintptr_t tag = *tag_word(ref);
return (tag & gcobj_not_forwarded_bit) ? 0 : tag;
}
static inline void gc_object_forward_nonatomic(struct gc_ref ref,
struct gc_ref new_ref) {
*tag_word(ref) = gc_ref_value(new_ref);
}
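// Two-phase atomic forwarding, used by parallel copying collectors: a
// tracer claims an object by compare-and-swapping its tag word to
// gcobj_busy, copies the object, installs the saved tag into the copy,
// and finally publishes the copy's address in the old tag word.  Tracers
// that observe gcobj_busy spin via gc_atomic_forward_retry_busy until
// either the forwarding address or (on abort) the original tag appears.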
static inline struct gc_atomic_forward
gc_atomic_forward_begin(struct gc_ref ref) {
uintptr_t tag = atomic_load_explicit(tag_word(ref), memory_order_acquire);
enum gc_forwarding_state state;
if (tag == gcobj_busy)
state = GC_FORWARDING_STATE_BUSY;
else if (tag & gcobj_not_forwarded_bit)
state = GC_FORWARDING_STATE_NOT_FORWARDED;
else
state = GC_FORWARDING_STATE_FORWARDED;
return (struct gc_atomic_forward){ ref, tag, state };
}
static inline int
gc_atomic_forward_retry_busy(struct gc_atomic_forward *fwd) {
GC_ASSERT(fwd->state == GC_FORWARDING_STATE_BUSY);
uintptr_t tag = atomic_load_explicit(tag_word(fwd->ref),
memory_order_acquire);
if (tag == gcobj_busy)
return 0;
if (tag & gcobj_not_forwarded_bit) {
fwd->state = GC_FORWARDING_STATE_NOT_FORWARDED;
fwd->data = tag;
} else {
fwd->state = GC_FORWARDING_STATE_FORWARDED;
fwd->data = tag;
}
return 1;
}
static inline void
gc_atomic_forward_acquire(struct gc_atomic_forward *fwd) {
GC_ASSERT(fwd->state == GC_FORWARDING_STATE_NOT_FORWARDED);
if (atomic_compare_exchange_strong(tag_word(fwd->ref), &fwd->data,
gcobj_busy))
fwd->state = GC_FORWARDING_STATE_ACQUIRED;
else if (fwd->data == gcobj_busy)
fwd->state = GC_FORWARDING_STATE_BUSY;
else {
GC_ASSERT((fwd->data & gcobj_not_forwarded_bit) == 0);
fwd->state = GC_FORWARDING_STATE_FORWARDED;
}
}
static inline void
gc_atomic_forward_abort(struct gc_atomic_forward *fwd) {
GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED);
atomic_store_explicit(tag_word(fwd->ref), fwd->data, memory_order_release);
fwd->state = GC_FORWARDING_STATE_NOT_FORWARDED;
}
static inline size_t
gc_atomic_forward_object_size(struct gc_atomic_forward *fwd) {
GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED);
switch (tag_live_alloc_kind(fwd->data)) {
#define OBJECT_SIZE(name, Name, NAME) \
case ALLOC_KIND_##NAME: \
return name##_size(gc_ref_heap_object(fwd->ref));
FOR_EACH_HEAP_OBJECT_KIND(OBJECT_SIZE)
#undef OBJECT_SIZE
default:
GC_CRASH();
}
}
static inline void
gc_atomic_forward_commit(struct gc_atomic_forward *fwd, struct gc_ref new_ref) {
GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED);
*tag_word(new_ref) = fwd->data;
atomic_store_explicit(tag_word(fwd->ref), gc_ref_value(new_ref),
memory_order_release);
fwd->state = GC_FORWARDING_STATE_FORWARDED;
}
static inline uintptr_t
gc_atomic_forward_address(struct gc_atomic_forward *fwd) {
GC_ASSERT(fwd->state == GC_FORWARDING_STATE_FORWARDED);
return fwd->data;
}

View file

@ -0,0 +1,26 @@
#ifndef SIMPLE_ROOTS_API_H
#define SIMPLE_ROOTS_API_H
#include "gc-config.h"
#include "simple-roots-types.h"
#define HANDLE_TO(T) union { T* v; struct handle handle; }
#define HANDLE_LOC(h) &(h).v
#define HANDLE_REF(h) (h).v
#define HANDLE_SET(h,val) do { (h).v = val; } while (0)
#define PUSH_HANDLE(cx, h) push_handle(&(cx)->roots.roots, &h.handle)
#define POP_HANDLE(cx) pop_handle(&(cx)->roots.roots)
static inline void push_handle(struct handle **roots, struct handle *handle) {
if (GC_PRECISE_ROOTS) {
handle->next = *roots;
*roots = handle;
}
}
static inline void pop_handle(struct handle **roots) {
if (GC_PRECISE_ROOTS)
*roots = (*roots)->next;
}
#endif // SIMPLE_ROOTS_API_H

View file

@ -0,0 +1,17 @@
#ifndef SIMPLE_ROOTS_TYPES_H
#define SIMPLE_ROOTS_TYPES_H
struct handle {
void *v;
struct handle *next;
};
struct gc_heap_roots {
struct handle *roots;
};
struct gc_mutator_roots {
struct handle *roots;
};
#endif // SIMPLE_ROOTS_TYPES_H

View file

@ -0,0 +1,29 @@
#ifndef SIMPLE_TAGGING_SCHEME_H
#define SIMPLE_TAGGING_SCHEME_H
#include <stdint.h>
struct gc_header {
uintptr_t tag;
};
// Alloc kind is in bits 1-7, for live objects.
static const uintptr_t gcobj_alloc_kind_mask = 0x7f;
static const uintptr_t gcobj_alloc_kind_shift = 1;
static const uintptr_t gcobj_forwarded_mask = 0x1;
static const uintptr_t gcobj_not_forwarded_bit = 0x1;
static const uintptr_t gcobj_busy = 0;
static inline uint8_t tag_live_alloc_kind(uintptr_t tag) {
return (tag >> gcobj_alloc_kind_shift) & gcobj_alloc_kind_mask;
}
static inline uintptr_t tag_live(uint8_t alloc_kind) {
return ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift)
| gcobj_not_forwarded_bit;
}
static inline uintptr_t* tag_word(struct gc_ref ref) {
struct gc_header *header = gc_ref_heap_object(ref);
return &header->tag;
}
#endif // SIMPLE_TAGGING_SCHEME_H

160
libguile/whippet/ctf_to_json.py Executable file
View file

@ -0,0 +1,160 @@
#!/usr/bin/env python3
# Any copyright is dedicated to the Public Domain.
# https://creativecommons.org/publicdomain/zero/1.0/
#
# Originally written by Andy Wingo <wingo@igalia.com>.
import bt2 # From the babeltrace2 package.
import sys
import json
from enum import Enum
# Usage: ./ctf_to_json.py ~/lttng-traces/name-of-your-trace > foo.json
#
# Convert a Common Trace Format (CTF) trace, for example as produced by
# LTTng, to the JSON-based Trace Event Format (TEF), for example as
# consumed by `chrome://tracing`, `https://ui.perfetto.dev/`, or
# `https://profiler.firefox.com`.
# The Trace Event Format is documented here:
#
# https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0
# By default, events are emitted as EventPhase.INSTANT. We also support
# rewriting the event stream so as to generate EventPhase.BEGIN /
# EventPhase.END events for specific named events.
synthetic_events = {
'gc': ['whippet:mutator_cause_gc',
'whippet:restarting_mutators'],
'stop-the-world': ['whippet:requesting_stop',
'whippet:mutators_stopped'],
'trace': ['whippet:prepare_gc',
'whippet:restarting_mutators'],
'mutator-stopped': ['whippet:mutator_stopping',
'whippet:mutator_restarted'],
'trace-roots': ['whippet:trace_roots_begin',
'whippet:trace_roots_end'],
'trace-check-termination': ['whippet:trace_check_termination_begin',
'whippet:trace_check_termination_end'],
'trace-objects': ['whippet:trace_objects_begin',
'whippet:trace_objects_end'],
'trace-worker': ['whippet:trace_worker_begin',
'whippet:trace_worker_end']
}
class EventPhase(Enum):
BEGIN = 'B'
END = 'E'
COMPLETE = 'X'
INSTANT = 'i'
COUNTER = 'C'
NESTABLE_START = 'b'
NESTABLE_INSTANT = 'n'
NESTABLE_END = 'e'
FLOW_START = 's'
FLOW_STEP = 't'
FLOW_END = 'f'
SAMPLE = 'P'
OBJECT_CREATED = 'N'
OBJECT_SNAPSHOT = 'O'
OBJECT_DESTROYED = 'D'
METADATA = 'M'
MEMORY_DUMP_GLOBAL = 'V'
    MEMORY_DUMP_PROCESS = 'v'
MARK = 'R'
CLOCK_SYNC = 'c'
CONTEXT_BEGIN = '('
CONTEXT_END = ')'
base_time = None
def event_us(msg):
assert(msg.default_clock_snapshot.clock_class.name == 'monotonic')
assert(msg.default_clock_snapshot.clock_class.frequency == 1e9)
global base_time
ns = msg.default_clock_snapshot.value
if base_time is None:
base_time = ns
return (ns - base_time) * 1e-3
def lower(x):
if isinstance(x, str) or isinstance(x, int) or isinstance(x, float):
return x
if isinstance(x, dict) or isinstance(x, bt2._StructureFieldConst):
return {lower(k):lower(v) for k, v in x.items()}
if isinstance(x, bt2._BoolValueConst) or isinstance(x, bt2._BoolFieldConst):
return bool(x)
if isinstance(x, bt2._EnumerationFieldConst):
return repr(x)
if isinstance(x, bt2._IntegerValueConst) or isinstance(x, bt2._IntegerFieldConst):
return int(x)
if isinstance(x, bt2._RealValueConst) or isinstance(x, bt2._RealFieldConst):
return float(x)
if isinstance(x, bt2._StringValueConst) or isinstance(x, bt2._StringFieldConst):
return str(x)
raise ValueError("Unexpected value from trace", x)
# Specific Whippet events.
synthetic_begin = {}
synthetic_end = {}
for synthetic, [begin, end] in synthetic_events.items():
synthetic_begin[begin] = []
synthetic_end[end] = []
for synthetic, [begin, end] in synthetic_events.items():
synthetic_begin[begin].append(synthetic)
synthetic_end[end].append(synthetic)
def put(str):
sys.stdout.write(str)
need_comma = False
def print_event(ev):
global need_comma
if need_comma:
sys.stdout.write(',\n ')
else:
need_comma = True
# It appears to be faster to make a string, then print the string,
# than to call json.dump with a file object.
# json.dump(ev, sys.stdout, ensure_ascii=False, check_circular=False)
put(json.dumps(ev, ensure_ascii=False, check_circular=False))
def emit_event(msg, name, phase):
ev = {'name': name,
'cat': 'whippet',
'ph': phase.value,
'ts': event_us(msg),
'pid': lower(msg.event.common_context_field['vpid']),
'tid': lower(msg.event.common_context_field['vtid']),
'args': lower(msg.event.payload_field)}
print_event(ev)
def emit_begin_event(msg, name):
emit_event(msg, name, EventPhase.BEGIN)
def emit_end_event(msg, name):
emit_event(msg, name, EventPhase.END)
def emit_events(msg):
emit_event(msg, msg.event.name, EventPhase.INSTANT)
for begin in synthetic_begin.get(msg.event.name, []):
emit_begin_event(msg, begin)
for end in synthetic_end.get(msg.event.name, []):
emit_end_event(msg, end)
def ctf_to_json(path):
msg_it = bt2.TraceCollectionMessageIterator(path)
put('{\n')
put(' "traceEvents": [\n ')
for msg in msg_it:
if hasattr(msg, 'event'):
emit_events(msg)
put('\n')
put('\n ],\n')
put(' "displayTimeUnit": "ns"\n')
put('}\n')
if len(sys.argv) != 2:
sys.stderr.write(
'usage: ' + sys.argv[0] + ' ~/lttng-traces/name-of-your-trace\n')
sys.exit(1)
else:
ctf_to_json(sys.argv[1])

View file

@ -0,0 +1,13 @@
# Whippet documentation
* [Manual](./manual.md): How do you get your program to use
Whippet? What is the API?
* [Collector implementations](./collectors.md): There are a number of
implementations of the Whippet API with differing performance
characteristics and which impose different requirements on the
embedder.
* [Guile](./guile.md): Some notes on a potential rebase of Guile on
top of Whippet.

View file

@ -0,0 +1,26 @@
# Boehm-Demers-Weiser collector
Whippet's `bdw` collector is backed by a third-party garbage collector,
the [Boehm-Demers-Weiser collector](https://github.com/ivmai/bdwgc).
BDW-GC is a mark-sweep collector with conservative root-finding,
conservative heap tracing, and parallel tracing.
Whereas the other Whippet collectors rely on mutators to
[periodically check if they need to
stop](https://github.com/wingo/whippet/blob/main/doc/manual.md#safepoints),
`bdw` stops mutators with a POSIX signal.  Also, it doesn't really
support ephemerons (the Whippet `bdw` collector simulates them using
finalizers), and both ephemerons and finalizers only approximate the
Whippet behavior, because they are implemented in terms of what BDW-GC
provides.
`bdw` supports the `fixed` and `growable` heap-sizing policies, but not
`adaptive`, as BDW-GC can't reliably return memory to the OS. Also,
[`growable` has an effective limit of a 3x heap
multiplier](https://github.com/wingo/whippet/blob/main/src/bdw.c#L478).
Oh well!
It's a bit of an oddball from a Whippet perspective, but useful as a
migration path if you have an embedder that is already using BDW-GC.
And, it is a useful performance comparison.

View file

@ -0,0 +1,148 @@
# Mostly-marking collector
The `mmc` collector is mainly a mark-region collector, inspired by
[Immix](http://users.cecs.anu.edu.au/~steveb/pubs/papers/immix-pldi-2008.pdf).
To a first approximation, `mmc` is a whole-heap Immix collector with a
large object space on the side.
When tracing, `mmc` mostly marks objects in place. If the heap is
too fragmented, it can compact the heap by choosing to evacuate
sparsely-populated heap blocks instead of marking in place. However
evacuation is strictly optional, which means that `mmc` is also
compatible with conservative root-finding, making it a good replacement
for embedders that currently use the [Boehm-Demers-Weiser
collector](./collector-bdw.md).
## Differences from Immix
The original Immix divides the heap into 32kB blocks, and then divides
those blocks into 128B lines. An Immix allocation can span lines but
not blocks; allocations larger than 8kB go into a separate large object
space. Mutators request blocks from the global store and allocate into
those blocks using bump-pointer allocation. When all blocks are
consumed, Immix stops the world and traces the object graph, marking
objects but also the lines that objects are on. After marking, blocks
contain some lines with live objects and others that are completely
free. Spans of free lines are called holes. When a mutator gets a
recycled block from the global block store, it allocates into those
holes. For an exposition of Immix, see the lovely detailed [Rust
implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf).
The essential difference of `mmc` from Immix stems from a simple
observation: Immix needs a side table of line mark bytes and also a mark
bit or bits in each object (or in a side table). But if instead you
choose to store mark bytes instead of bits (for concurrency reasons) in
a side table, with one mark byte per granule (unit of allocation,
perhaps 16 bytes), then you effectively have a line mark table where the
granule size is the line size. You can bump-pointer allocate into holes
in the mark byte table.
You might think this is a bad tradeoff, and perhaps it is: I don't know
yet. If your granule size is two pointers, then one mark byte per
granule is 6.25% overhead on 64-bit, or 12.5% on 32-bit. Especially on
32-bit, it's a lot! On the other hand, instead of the worst case of one
survivor object wasting a line (or two, in the case of conservative line
marking), granule-size-is-line-size instead wastes nothing. Also, you
don't need GC bits in the object itself, and you can use the mark byte
array to record the object end, so that finding holes in a block can
just read the mark table and can avoid looking at object memory.
## Optional features
The `mmc` collector has a few feature flags that can be turned on or
off. If you use the [standard embedder makefile include](../embed.mk),
then there is a name for each combination of features: `mmc` has no
additional features, `parallel-mmc` enables parallel marking,
`parallel-generational-mmc` enables generations,
`stack-conservative-parallel-generational-mmc` uses conservative
root-finding, and `heap-conservative-parallel-generational-mmc`
additionally traces the heap conservatively. You can leave off
components of the name to get a collector without those features.
Underneath this corresponds to some pre-processor definitions passed to
the compiler on the command line.
### Generations
`mmc` supports generational tracing via the [sticky mark-bit
algorithm](https://wingolog.org/archives/2022/10/22/the-sticky-mark-bit-algorithm).
This requires that the embedder emit [write
barriers](https://github.com/wingo/whippet/blob/main/doc/manual.md#write-barriers);
if your embedder cannot ensure write barriers are always invoked, then
generational collection is not for you. (We could perhaps relax this a
bit, following what [Ruby developers
did](http://rvm.jp/~ko1/activities/rgengc_ismm.pdf).)
The write barrier is currently a card-marking barrier emitted on stores,
with one card byte per 256 object bytes, where the card location can be
computed from the object address because blocks are allocated in
two-megabyte aligned slabs.
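As a rough sketch of the address arithmetic this implies (the constant
names, the card-table location, and the exact metadata layout are
hypothetical, not the actual `mmc` internals):
```c
#include <stdint.h>
#include <stddef.h>
/* Hypothetical constants: two-megabyte aligned slabs, one card byte per
   256 object bytes, card table kept at the base of the slab. */
#define SLAB_SIZE         (2 * 1024 * 1024)
#define CARD_SIZE         256
#define CARD_TABLE_OFFSET 0
static inline void mark_card_for_store(uintptr_t obj_addr) {
  uintptr_t slab_base = obj_addr & ~(uintptr_t)(SLAB_SIZE - 1);
  size_t card_index = (obj_addr - slab_base) / CARD_SIZE;
  uint8_t *card_table = (uint8_t *)(slab_base + CARD_TABLE_OFFSET);
  card_table[card_index] = 1;  /* dirty: may contain an old-to-new edge */
}
```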
### Parallel tracing
You almost certainly want this on!  `parallel-mmc` uses the
[fine-grained work-stealing parallel tracer](../src/parallel-tracer.h).
Each trace worker maintains a [local queue of objects that need
tracing](../src/local-worklist.h), which currently has a capacity of
1024 entries. If the local queue becomes full, the worker will publish
3/4 of those entries to the worker's [shared
worklist](../src/shared-worklist.h). When a worker runs out of local
work, it will first try to remove work from its own shared worklist,
then will try to steal from other workers.
The memory used for the external worklist is dynamically allocated from
the OS and is not currently counted as contributing to the heap size.
If you absolutely need to avoid dynamic allocation during GC, `mmc`
(even `serial-mmc`) would need some work for your use case, to allocate
a fixed-size space for a marking queue and to gracefully handle mark
queue overflow.
### Conservative stack scanning
With `semi` and `pcc`, embedders must precisely enumerate the set of
*roots*: the edges into the heap from outside. Commonly, roots include
global variables, as well as working variables from each mutator's
stack. `mmc` can optionally mark mutator stacks *conservatively*:
treating each word on the stack as if it may be an object reference, and
marking any object at that address.
After all these years, *whether* to mark stacks conservatively or not is
still an open research question. Conservative stack scanning can retain
too much data if an integer is confused for an object reference and
removes a layer of correctness-by-construction from a system. Sometimes
conservative stack-scanning is required, for example if your embedder
cannot enumerate roots precisely. But there are reasons to consider it
even if you can do precise roots: conservative scanning removes the need
for the compiler to produce a stack map to store the precise root
enumeration at every safepoint; it removes the need to look up a stack
map when tracing; and it allows C or C++ support code to avoid having to
place roots in traceable locations published to the garbage collector.
And the [performance question is still
open](https://dl.acm.org/doi/10.1145/2660193.2660198).
Anyway. `mmc` can scan roots conservatively. Those roots are pinned
for the collection; even if the collection will compact via evacuation,
referents of conservative roots won't be moved. Objects not directly
referenced by roots can be evacuated, however.
### Conservative heap scanning
In addition to stack and global references, the Boehm-Demers-Weiser
collector scans heap objects conservatively as well, treating each word
of each heap object as if it were a reference. `mmc` can do that, if
the embedder is unable to provide a `gc_trace_object` implementation.
However this is generally a performance lose, and it prevents
evacuation.
## Other implementation tidbits
`mmc` does lazy sweeping: as a mutator grabs a fresh block, it
reclaims memory that was unmarked in the previous collection before
making the memory available for allocation. This makes sweeping
naturally cache-friendly and parallel.
The mark byte array facilitates conservative collection by being an
oracle for "does this address start an object".
For a detailed introduction, see [Whippet: Towards a new local
maximum](https://wingolog.org/archives/2023/02/07/whippet-towards-a-new-local-maximum),
a talk given at FOSDEM 2023.

View file

@ -0,0 +1,84 @@
# Parallel copying collector
Whippet's `pcc` collector is a copying collector, like the more simple
[`semi`](./collector-semi.md), but supporting multiple mutator threads,
multiple tracing threads, and using an external FIFO worklist instead of
a Cheney worklist.
Like `semi`, `pcc` traces by evacuation: it moves all live objects on
every collection. (Exception: objects larger than 8192 bytes are
placed into a partitioned space which traces by marking in place instead
of copying.) Evacuation requires precise roots, so if your embedder
does not support precise roots, `pcc` is not for you.
Again like `semi`, `pcc` generally requires a heap size at least twice
as large as the maximum live heap size, and performs best with ample
heap sizes; between 3× and 5× is best.
Overall, `pcc` is a better version of `semi`. It should have broadly
the same performance characteristics with a single mutator and with
parallelism disabled, additionally allowing multiple mutators, and
scaling better with multiple tracing threads.
`pcc` has a generational configuration, conventionally referred to as
`generational-pcc`, in which both the nursery and the old generation are
copy spaces. Objects stay in the nursery for one cycle before moving on
to the old generation. This configuration is a bit new (January 2025)
and still needs some tuning.
## Implementation notes
Unlike `semi` which has a single global bump-pointer allocation region,
`pcc` structures the heap into 64-kB blocks. In this way it supports
multiple mutator threads: mutators do local bump-pointer allocation into
their own block, and when their block is full, they fetch another from
the global store.
The block size is 64 kB, but really it's 128 kB, because each block has
two halves: the active region and the copy reserve. Dividing each block
in two allows the collector to easily grow and shrink the heap while
ensuring there is always enough reserve space.
Blocks are allocated in 64-MB aligned slabs, so there are 512 blocks in
a slab. The first block in a slab is used by the collector itself, to
keep metadata for the rest of the blocks, for example a chain pointer
allowing blocks to be collected in lists, a saved allocation pointer for
partially-filled blocks, whether the block is paged in or out, and so
on.
`pcc` supports tracing in parallel. This mechanism works somewhat like
allocation, in which multiple trace workers compete to evacuate objects
into their local allocation buffers; when an allocation buffer is full,
the trace worker grabs another, just like mutators do.
Unlike the simple semi-space collector which uses a Cheney grey
worklist, `pcc` uses an external worklist. If parallelism is disabled
at compile-time, it uses a simple first-in, first-out queue of objects
to be traced. Like a Cheney worklist, this should result in objects
being copied in breadth-first order. The literature would suggest that
depth-first is generally better for locality, but that preserving
allocation order is generally best. This is something to experiment
with in the future.
If parallelism is enabled, as it is by default, `pcc` uses a
[fine-grained work-stealing parallel tracer](../src/parallel-tracer.h).
Each trace worker maintains a [local queue of objects that need
tracing](../src/local-worklist.h), which currently has 1024 entries. If
the local queue becomes full, the worker will publish 3/4 of those
entries to the worker's [shared worklist](../src/shared-worklist.h).
When a worker runs out of local work, it will first try to remove work
from its own shared worklist, then will try to steal from other workers.
If only one tracing thread is enabled at run-time (`parallelism=1`) (or
if parallelism is disabled at compile-time), `pcc` will evacuate by
non-atomic forwarding, but if multiple threads compete to evacuate
objects, `pcc` uses [atomic compare-and-swap instead of simple
forwarding pointer updates](./manual.md#forwarding-objects). This
imposes a roughly 30% performance penalty, but having multiple tracing
threads is generally worth it, unless the object graph is itself serial.
The memory used for the external worklist is dynamically allocated from
the OS and is not currently counted as contributing to the heap size.
If you are targeting a microcontroller or something, probably you need
to choose a different kind of collector that never dynamically
allocates, such as `semi`.

View file

@ -0,0 +1,23 @@
# Semi-space collector
The `semi` collector is simple. It is mostly useful as a first
collector to try out, to make sure that a mutator correctly records all
roots: because `semi` moves every live object on every collection, it is
very effective at shaking out mutator bugs.
If your embedder chooses to not precisely record roots, for example
instead choosing to conservatively scan the stack, then the semi-space
collector is not for you: `semi` requires precise roots.
For more on semi-space collectors, see
https://wingolog.org/archives/2022/12/10/a-simple-semi-space-collector.
Whippet's `semi` collector incorporates a large-object space, which
marks objects in place instead of moving. Otherwise, `semi` generally
requires a heap size at least twice as large as the maximum live heap
size, and performs best with ample heap sizes; between 3× and 5× is
best.
The semi-space collector doesn't support multiple mutator threads. If
you want a copying collector for a multi-threaded mutator, look at
[pcc](./collector-pcc.md).

View file

@ -0,0 +1,43 @@
# Whippet collectors
Whippet has four collectors currently:
- [Semi-space collector (`semi`)](./collector-semi.md): For
single-threaded embedders who are not too tight on memory.
- [Parallel copying collector (`pcc`)](./collector-pcc.md): Like
`semi`, but with support for multiple mutator and tracing threads and
generational collection.
- [Mostly marking collector (`mmc`)](./collector-mmc.md):
Immix-inspired collector. Optionally parallel, conservative (stack
and/or heap), and/or generational.
- [Boehm-Demers-Weiser collector (`bdw`)](./collector-bdw.md):
Conservative mark-sweep collector, implemented by
  the Boehm-Demers-Weiser library.
## How to choose?
If you are migrating an embedder off BDW-GC, then it could be reasonable
to first go to `bdw`, then `stack-conservative-parallel-mmc`.
If you have an embedder with precise roots, use `pcc`. That will shake
out mutator/embedder bugs. Then if memory is tight, switch to
`parallel-mmc`, possibly `parallel-generational-mmc`.
If you are aiming for maximum simplicity and minimal code size (ten
kilobytes or so), use `semi`.
If you are writing a new project, you have a choice as to whether to pay
the development cost of precise roots or not. If you choose to not have
precise roots, then go for `stack-conservative-parallel-mmc` directly.
## More collectors
It would be nice to have a generational GC that uses the space from
`parallel-mmc` for the old generation but a pcc-style copying nursery.
We have `generational-pcc` now, so this should be possible.
Support for concurrent marking in `mmc` would be good as well, perhaps
with a SATB barrier. (Or, if you are the sort of person to bet on
conservative stack scanning, perhaps a retreating-wavefront barrier
would be more appropriate.)
Contributions are welcome, provided they add no new dependencies!

View file

@ -0,0 +1,26 @@
# Whippet and Guile
If the `mmc` collector works out, it could replace Guile's garbage
collector. Guile currently uses BDW-GC. Guile has a widely used C API
and implements part of its run-time in C. For this reason it may be
infeasible to require precise enumeration of GC roots -- we may need to
allow GC roots to be conservatively identified from data sections and
from stacks. Such conservative roots would be pinned, but other objects
can be moved by the collector if it chooses to do so. We assume that
object references within a heap object can be precisely identified.
(However, Guile currently uses BDW-GC in its default configuration,
which scans for references conservatively even on the heap.)
The existing C API allows direct access to mutable object fields,
without the mediation of read or write barriers. Therefore it may be
impossible to switch to collector strategies that need barriers, such as
generational or concurrent collectors. However, we shouldn't write off
this possibility entirely; an ideal replacement for Guile's GC will
offer the possibility of migration to other GC designs without imposing
new requirements on C API users in the initial phase.
In this regard, the Whippet experiment also has the goal of identifying
a smallish GC abstraction in Guile, so that we might consider evolving
GC implementation in the future without too much pain. If we switch
away from BDW-GC, we should be able to evaluate that it's a win for a
large majority of use cases.

View file

@ -0,0 +1,718 @@
# Whippet user's guide
Whippet is an embed-only library: it should be copied into the source
tree of the program that uses it. The program's build system needs to
be wired up to compile Whippet, then link it into the program that uses
it.
## Subtree merges
One way to get Whippet is just to manually copy the files present in a
Whippet checkout into your project. However probably the best way is to
perform a [subtree
merge](https://docs.github.com/en/get-started/using-git/about-git-subtree-merges)
of Whippet into your project's Git repository, so that you can easily
update your copy of Whippet in the future.
Performing the first subtree merge is annoying and full of arcane
incantations. Follow the [subtree merge
page](https://docs.github.com/en/get-started/using-git/about-git-subtree-merges)
for full details, but for a cheat sheet, you might do something like
this to copy Whippet into the `whippet/` directory of your project root:
```
git remote add whippet https://github.com/wingo/whippet
git fetch whippet
git merge -s ours --no-commit --allow-unrelated-histories whippet/main
git read-tree --prefix=whippet/ -u whippet/main
git commit -m 'Added initial Whippet merge'
```
Then to later update your copy of whippet, assuming you still have the
`whippet` remote, just do:
```
git pull -s subtree whippet main
```
## `gc-embedder-api.h`
To determine the live set of objects, a tracing garbage collector starts
with a set of root objects, and then transitively visits all reachable
object edges. Exactly how it goes about doing this depends on the
program that is using the garbage collector; different programs will
have different object representations, different strategies for
recording roots, and so on.
To traverse the heap in a program-specific way but without imposing an
abstraction overhead, Whippet requires that a number of data types and
inline functions be implemented by the program, for use by Whippet
itself. This is the *embedder API*, and this document describes what
Whippet requires from a program.
A program should provide a header file implementing the API in
[`gc-embedder-api.h`](../api/gc-embedder-api.h). This header should only be
included when compiling Whippet itself; it is not part of the API that
Whippet exposes to the program.
### Identifying roots
The collector uses two opaque struct types, `struct gc_mutator_roots`
and `struct gc_heap_roots`, that are used by the program to record
object roots. Probably you should put the definition of these data
types in a separate header that is included both by Whippet, via the
embedder API, and via users of Whippet, so that programs can populate
the root set. In any case the embedder-API use of these structs is via
`gc_trace_mutator_roots` and `gc_trace_heap_roots`, two functions that
are passed a trace visitor function `trace_edge`, and which should call
that function on all edges from a given mutator or heap. (Usually
mutator roots are per-thread roots, such as from the stack, and heap
roots are global roots.)
### Tracing objects
The `gc_trace_object` is responsible for calling the `trace_edge`
visitor function on all outgoing edges in an object. It also includes a
`size` out-parameter, for when the collector wants to measure the size
of an object. `trace_edge` and `size` may be `NULL`, in which case no
tracing or size computation should be performed.
### Tracing ephemerons and finalizers
Most kinds of GC-managed object are defined by the program, but the GC
itself has support for two specific object kinds: ephemerons and
finalizers. If the program allocates ephemerons, it should trace them
in the `gc_trace_object` function by calling `gc_trace_ephemeron` from
[`gc-ephemeron.h`](../api/gc-ephemeron.h).  Likewise if the program
allocates finalizers, it should trace them by calling
`gc_trace_finalizer` from [`gc-finalizer.h`](../api/gc-finalizer.h).
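As a sketch, an embedder with its own object-kind tag might dispatch like
this; the kind enum, the field visitors, and the exact signatures of
`gc_trace_ephemeron`, `gc_trace_finalizer`, and the size helpers are
assumptions to be checked against the headers:
```c
// Sketch: gc_trace_object handing ephemerons and finalizers off to the
// collector's own tracing routines.  KIND_* and the *_program_object
// helpers are hypothetical embedder code.
static inline void gc_trace_object(struct gc_ref ref,
                                   void (*trace_edge)(struct gc_edge edge,
                                                      struct gc_heap *heap,
                                                      void *trace_data),
                                   struct gc_heap *heap, void *trace_data,
                                   size_t *size) {
  switch (object_kind(ref)) {
  case KIND_EPHEMERON:
    if (trace_edge)
      gc_trace_ephemeron(gc_ref_heap_object(ref), trace_edge, heap, trace_data);
    if (size) *size = gc_ephemeron_size();  // assumed size helper
    break;
  case KIND_FINALIZER:
    if (trace_edge)
      gc_trace_finalizer(gc_ref_heap_object(ref), trace_edge, heap, trace_data);
    if (size) *size = gc_finalizer_size();  // assumed size helper
    break;
  default:  // ordinary program-defined objects
    trace_program_object(ref, trace_edge, heap, trace_data, size);
    break;
  }
}
```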
### Forwarding objects
When built with a collector that moves objects, the embedder must also
allow for forwarding pointers to be installed in an object. There are
two forwarding APIs: one that is atomic and one that isn't.
The nonatomic API is relatively simple; there is a
`gc_object_forwarded_nonatomic` function that returns an embedded
forwarding address, or 0 if the object is not yet forwarded, and
`gc_object_forward_nonatomic`, which installs a forwarding pointer.
The atomic API is gnarly. It is used by parallel collectors, in which
multiple collector threads can race to evacuate an object.
There is a state machine associated with the `gc_atomic_forward`
structure from [`gc-forwarding.h`](../api/gc-forwarding.h); the embedder API
implements the state changes. The collector calls
`gc_atomic_forward_begin` on an object to begin a forwarding attempt,
and the resulting `gc_atomic_forward` can be in the `NOT_FORWARDED`,
`FORWARDED`, or `BUSY` state.
If the `gc_atomic_forward`'s state is `BUSY`, the collector will call
`gc_atomic_forward_retry_busy`; a return value of 0 means the object is
still busy, because another thread is attempting to forward it.
Otherwise the forwarding state becomes either `FORWARDED`, if the other
thread succeeded in forwarding it, or goes back to `NOT_FORWARDED`,
indicating that the other thread failed to forward it.
If the forwarding state is `FORWARDED`, the collector will call
`gc_atomic_forward_address` to get the new address.
If the forwarding state is `NOT_FORWARDED`, the collector may begin a
forwarding attempt by calling `gc_atomic_forward_acquire`. The
resulting state is `ACQUIRED` on success, or `BUSY` if another thread
acquired the object in the meantime, or `FORWARDED` if another thread
acquired and completed the forwarding attempt.
An `ACQUIRED` object can then be forwarded via
`gc_atomic_forward_commit`, or the forwarding attempt can be aborted via
`gc_atomic_forward_abort`. Also, when an object is acquired, the
collector may call `gc_atomic_forward_object_size` to compute how many
bytes to copy. (The collector may choose instead to record object sizes
in a different way.)
All of these `gc_atomic_forward` functions are to be implemented by the
embedder. Some programs may allocate a dedicated forwarding word in all
objects; some will manage to store the forwarding word in an initial
"tag" word, via a specific pattern for the low 3 bits of the tag that no
non-forwarded object will have. The low-bits approach takes advantage
of the collector's minimum object alignment, in which objects are
aligned at least to an 8-byte boundary, so all objects have 0 for the
low 3 bits of their address.
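To make the state machine concrete, here is a sketch of how a copying
collector might drive these functions when evacuating an object.  The
copy and allocation helpers are hypothetical, and the `gc_ref`
constructor taking an address is an assumption; only the
`gc_atomic_forward_*` calls are the embedder API described above.
```c
// Sketch of a collector-side evacuation attempt using the atomic
// forwarding API.  evacuation_allocate() and copy_object_bytes() are
// hypothetical collector internals.
static struct gc_ref evacuate_or_use_forwarded(struct gc_ref old) {
  struct gc_atomic_forward fwd = gc_atomic_forward_begin(old);
  for (;;) {
    switch (fwd.state) {
    case GC_FORWARDING_STATE_BUSY:
      gc_atomic_forward_retry_busy(&fwd);   // spin until no longer busy
      break;
    case GC_FORWARDING_STATE_FORWARDED:
      return gc_ref(gc_atomic_forward_address(&fwd));
    case GC_FORWARDING_STATE_NOT_FORWARDED:
      gc_atomic_forward_acquire(&fwd);      // re-check state on next loop
      break;
    case GC_FORWARDING_STATE_ACQUIRED: {
      size_t bytes = gc_atomic_forward_object_size(&fwd);
      struct gc_ref new_ref = evacuation_allocate(bytes);
      copy_object_bytes(new_ref, old, bytes);
      gc_atomic_forward_commit(&fwd, new_ref);
      return new_ref;
    }
    default:
      GC_CRASH();
    }
  }
}
```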
### Conservative references
Finally, when configured in a mode in which root edges or intra-object
edges are *conservative*, the embedder can filter out which bit patterns
might be an object reference by implementing
`gc_is_valid_conservative_ref_displacement`. Here, the collector masks
off the low bits of a conservative reference, and asks the embedder if a
value with those low bits might point to an object. Usually the
embedder should return 1 only if the displacement is 0, but if the
program allows low-bit tagged pointers, then it should also return 1 for
those pointer tags.
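For example, an embedder whose program uses only untagged pointers and a
single low-bit tag might implement it like this; the tag values are
illustrative, and the exact signature should be checked against
`gc-embedder-api.h`:
```c
// Sketch: accept displacements 0 and 1, reject everything else.  The set
// of valid displacements depends entirely on the embedder's tagging
// scheme; these values are hypothetical.
static inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement) {
  switch (displacement) {
  case 0:  // untagged pointer to the object base
  case 1:  // hypothetical low-bit tag used by the program
    return 1;
  default:
    return 0;
  }
}
```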
### External objects
Sometimes a system will allocate objects outside the GC, for example on
the stack or in static data sections. To support this use case, Whippet
allows the embedder to provide a `struct gc_extern_space`
implementation. Whippet will call `gc_extern_space_start_gc` at the
start of each collection, and `gc_extern_space_finish_gc` at the end.
External objects will be visited by `gc_extern_space_visit`, which should
return nonzero if the object hasn't been seen before and needs to be
traced via `gc_trace_object` (coloring the object grey). Note,
`gc_extern_space_visit` may be called concurrently from many threads; be
prepared!
## Configuration, compilation, and linking
To the user, Whippet presents an abstract API that does not encode the
specificities of any given collector. Whippet currently includes four
implementations of that API: `semi`, a simple semi-space collector;
`pcc`, a parallel copying collector (like semi but multithreaded);
`bdw`, an implementation via the third-party
[Boehm-Demers-Weiser](https://github.com/ivmai/bdwgc) conservative
collector; and `mmc`, a mostly-marking collector inspired by Immix.
The program that embeds Whippet selects the collector implementation at
build-time. For `pcc`, the program can also choose whether to be
generational or not.  For the `mmc` collector, the program configures a
specific collector mode, again at build-time: generational or not,
parallel or not, stack-conservative or not, and heap-conservative or
not. It may be nice in the future to be able to configure these at
run-time, but for the time being they are compile-time options so that
adding new features doesn't change the footprint of a more minimal
collector.
Different collectors have different allocation strategies: for example,
the BDW collector allocates from thread-local freelists, whereas the
semi-space collector has a bump-pointer allocator. A collector may also
expose a write barrier, for example to enable generational collection.
For performance reasons, many of these details can't be hidden behind an
opaque functional API: they must be inlined into call sites. Whippet's
approach is to expose fast paths as part of its inline API,
*parameterized* on attributes of the selected garbage collector.
The goal is to keep the user's code generic and avoid any code
dependency on the choice of garbage collector. Because of inlining,
however, the choice of garbage collector does need to be specified when
compiling user code.
### Compiling the collector
As an embed-only library, Whippet needs to be integrated into the build
system of its host (embedder). There are two build systems supported
currently; we would be happy to add other systems over time.
#### GNU make
At a high level, first the embedder chooses a collector and defines how
to specialize the collector against the embedder. Whippet's `embed.mk`
Makefile snippet then defines how to build the set of object files that
define the collector, and how to specialize the embedder against the
chosen collector.
As an example, say you have a file `program.c`, and you want to compile
it against a Whippet checkout in `whippet/`. Your headers are in
`include/`, and you have written an implementation of the embedder
interface in `host-gc.h`. In that case you would have a Makefile like
this:
```
HOST_DIR:=$(dir $(lastword $(MAKEFILE_LIST)))
WHIPPET_DIR=$(HOST_DIR)whippet/
all: out
# The collector to choose: e.g. semi, bdw, pcc, generational-pcc, mmc,
# parallel-mmc, etc.
GC_COLLECTOR=pcc
include $(WHIPPET_DIR)embed.mk
# Host cflags go here...
HOST_CFLAGS=
# Whippet's embed.mk uses this variable when it compiles code that
# should be specialized against the embedder.
EMBEDDER_TO_GC_CFLAGS=$(HOST_CFLAGS) -include $(HOST_DIR)host-gc.h
program.o: program.c
$(GC_COMPILE) $(HOST_CFLAGS) $(GC_TO_EMBEDDER_CFLAGS) -c $<
program: program.o $(GC_OBJS)
$(GC_LINK) $^ $(GC_LIBS)
```
The optimization settings passed to the C compiler are taken from
`GC_BUILD_CFLAGS`. Embedders can override this variable directly, or
via the shorthand `GC_BUILD` variable. A `GC_BUILD` of `opt` indicates
maximum optimization and no debugging assertions; `optdebug` adds
debugging assertions; and `debug` removes optimizations.
Though Whippet tries to put performance-sensitive interfaces in header
files, users should also compile with link-time optimization (LTO) to
remove any overhead imposed by the division of code into separate
compilation units. `embed.mk` includes the necessary LTO flags in
`GC_CFLAGS` and `GC_LDFLAGS`.
#### GNU Autotools
To use Whippet from an autotools project, the basic idea is to include a
`Makefile.am` snippet from the subdirectory containing the Whippet
checkout. That will build `libwhippet.la`, which you should link into
your binary. There are some `m4` autoconf macros that need to be
invoked, for example to select the collector.
Let us imagine you have checked out Whippet in `whippet/`. Let us also
assume for the moment that we are going to build `mt-gcbench`, a program
included in Whippet itself.
A top-level autoconf file (`configure.ac`) might look like this:
```autoconf
AC_PREREQ([2.69])
AC_INIT([whippet-autotools-example],[0.1.0])
AC_CONFIG_SRCDIR([whippet/benchmarks/mt-gcbench.c])
AC_CONFIG_AUX_DIR([build-aux])
AC_CONFIG_MACRO_DIRS([m4 whippet])
AM_INIT_AUTOMAKE([subdir-objects foreign])
WHIPPET_ENABLE_LTO
LT_INIT
WARN_CFLAGS=-Wall
AC_ARG_ENABLE([Werror],
AS_HELP_STRING([--disable-Werror],
[Don't stop the build on errors]),
[],
WARN_CFLAGS="-Wall -Werror")
CFLAGS="$CFLAGS $WARN_CFLAGS"
WHIPPET_PKG
AC_CONFIG_FILES(Makefile)
AC_OUTPUT
```
Then your `Makefile.am` might look like this:
```automake
noinst_LTLIBRARIES =
WHIPPET_EMBEDDER_CPPFLAGS = -include $(srcdir)/whippet/benchmarks/mt-gcbench-embedder.h
include whippet/embed.am
noinst_PROGRAMS = whippet/benchmarks/mt-gcbench
whippet_benchmarks_mt_gcbench_SOURCES = \
whippet/benchmarks/heap-objects.h \
whippet/benchmarks/mt-gcbench-embedder.h \
whippet/benchmarks/mt-gcbench-types.h \
whippet/benchmarks/mt-gcbench.c \
whippet/benchmarks/simple-allocator.h \
whippet/benchmarks/simple-gc-embedder.h \
whippet/benchmarks/simple-roots-api.h \
whippet/benchmarks/simple-roots-types.h \
whippet/benchmarks/simple-tagging-scheme.h
AM_CFLAGS = $(WHIPPET_CPPFLAGS) $(WHIPPET_CFLAGS) $(WHIPPET_TO_EMBEDDER_CPPFLAGS)
LDADD = libwhippet.la
```
We have to list all the little header files it uses because, well,
autotools.
To actually build, you do the usual autotools dance:
```bash
autoreconf -vif && ./configure && make
```
See `./configure --help` for a list of user-facing options. Before the
`WHIPPET_PKG`, you can run e.g. `WHIPPET_PKG_COLLECTOR(mmc)` to set the
default collector to `mmc`; if you don't do that, the default collector
is `pcc`. There are also `WHIPPET_PKG_DEBUG`, `WHIPPET_PKG_TRACING`,
and `WHIPPET_PKG_PLATFORM`; see [`whippet.m4`](../whippet.m4) for more
details. See also
[`whippet-autotools`](https://github.com/wingo/whippet-autotools) for an
example of how this works.
#### Compile-time options
There are a number of pre-processor definitions that can parameterize
the collector at build-time:
* `GC_DEBUG`: If nonzero, then enable debugging assertions.
* `NDEBUG`: This one is a bit weird; if not defined, then enable
debugging assertions and some debugging printouts. Probably
Whippet's use of `NDEBUG` should be folded in to `GC_DEBUG`.
* `GC_PARALLEL`: If nonzero, then enable parallelism in the collector.
Defaults to 0.
* `GC_GENERATIONAL`: If nonzero, then enable generational collection.
Defaults to zero.
* `GC_PRECISE_ROOTS`: If nonzero, then collect precise roots via
`gc_heap_roots` and `gc_mutator_roots`. Defaults to zero.
* `GC_CONSERVATIVE_ROOTS`: If nonzero, then scan the stack and static
data sections for conservative roots. Defaults to zero. Not
mutually exclusive with `GC_PRECISE_ROOTS`.
* `GC_CONSERVATIVE_TRACE`: If nonzero, heap edges are scanned
conservatively. Defaults to zero.
Some collectors require specific compile-time options. For example, the
semi-space collector has to be able to move all objects; this is not
compatible with conservative roots or heap edges.
#### Tracing support
Whippet includes support for low-overhead run-time tracing via
[LTTng](https://lttng.org/). If the support library `lttng-ust` is
present when Whippet is compiled (as checked via `pkg-config`),
tracepoint support will be present. See
[tracepoints.md](./tracepoints.md) for more information on how to get
performance traces out of Whippet.
## Using the collector
Whew! So you finally built the thing! Did you also link it into your
program? No, because your program isn't written yet? Well this section
is for you: we describe the user-facing API of Whippet, where "user" in
this case denotes the embedding program.
What is the API, you ask? It is in [`gc-api.h`](../api/gc-api.h).
### Heaps and mutators
To start with, you create a *heap*. Usually an application will create
just one heap. A heap has one or more associated *mutators*. A mutator
is a thread-specific handle on the heap. Allocating objects requires a
mutator.
The initial heap and mutator are created via `gc_init`, which takes
three logical input parameters: the *options*, a stack base address, and
an *event listener*. The options specify the initial heap size and so
on. The event listener is mostly for gathering statistics; see below
for more. `gc_init` returns the new heap as an out parameter, and also
returns a mutator for the current thread.
To make a new mutator for a new thread, use `gc_init_for_thread`. When
a thread is finished with its mutator, call `gc_finish_for_thread`.
Each thread that allocates or accesses GC-managed objects should have
its own mutator.
The stack base address allows the collector to scan the mutator's stack,
if conservative root-finding is enabled. It may be omitted in the call
to `gc_init` and `gc_init_for_thread`; passing `NULL` tells Whippet to
ask the platform for the stack bounds of the current thread. Generally
speaking, this works on all platforms for the main thread, but not
necessarily on other threads. The most reliable solution is to
explicitly obtain a base address by trampolining through
`gc_call_with_stack_addr`.
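As a sketch of what a thread entry point might look like; the signatures
of `gc_call_with_stack_addr`, `gc_init_for_thread`, and
`gc_finish_for_thread` are assumptions here, so check `gc-api.h` for the
authoritative declarations:
```c
// Sketch of bringing up a mutator on a new thread via the stack-address
// trampoline.  Signatures are assumed, not authoritative.
struct thread_args { struct gc_heap *heap; };
static void* with_mutator(struct gc_stack_addr *stack_base, void *data) {
  struct thread_args *args = data;
  struct gc_mutator *mut = gc_init_for_thread(stack_base, args->heap);
  // ... allocate and run, reaching safepoints periodically ...
  gc_finish_for_thread(mut);
  return NULL;
}
static void* thread_start(void *data) {
  // Trampoline so the collector learns a reliable stack base for this thread.
  return gc_call_with_stack_addr(with_mutator, data);
}
```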
### Options
There are some run-time parameters that programs and users might want to
set explicitly; these are encapsulated in the *options*. Make an
options object with `gc_allocate_options()`; this object will be
consumed by `gc_init`.  Then, the most convenient thing is to set
those options with `gc_options_parse_and_set_many`, from a string passed
on the command line or in an environment variable, but to get there we
have to explain the low-level API first.  There are a few options that are
defined for all collectors:
* `GC_OPTION_HEAP_SIZE_POLICY`: How should we size the heap? Either
it's `GC_HEAP_SIZE_FIXED` (which is 0), in which the heap size is
fixed at startup; or `GC_HEAP_SIZE_GROWABLE` (1), in which the heap
may grow but will never shrink; or `GC_HEAP_SIZE_ADAPTIVE` (2), in
which we take an
[adaptive](https://wingolog.org/archives/2023/01/27/three-approaches-to-heap-sizing)
approach, depending on the rate of allocation and the cost of
collection. Really you want the adaptive strategy, but if you are
benchmarking you definitely want the fixed policy.
* `GC_OPTION_HEAP_SIZE`: The initial heap size. For a
`GC_HEAP_SIZE_FIXED` policy, this is also the final heap size. In
bytes.
* `GC_OPTION_MAXIMUM_HEAP_SIZE`: For growable and adaptive heaps, the
maximum heap size, in bytes.
* `GC_OPTION_HEAP_SIZE_MULTIPLIER`: For growable heaps, the target heap
multiplier. A heap multiplier of 2.5 means that for 100 MB of live
data, the heap should be 250 MB.
* `GC_OPTION_HEAP_EXPANSIVENESS`: For adaptive heap sizing, an
indication of how much free space will be given to heaps, as a
proportion of the square root of the live data size.
* `GC_OPTION_PARALLELISM`: How many threads to devote to collection
tasks during GC pauses. By default, the current number of
processors, with a maximum of 8.
You can set these options via `gc_option_set_int` and so on; see
[`gc-options.h`](../api/gc-options.h). Or, you can parse options from
strings: `heap-size-policy`, `heap-size`, `maximum-heap-size`, and so
on. Use `gc_option_from_string` to determine if a string is really an
option. Use `gc_option_parse_and_set` to parse a value for an option.
Use `gc_options_parse_and_set_many` to parse a number of comma-delimited
*key=value* settings from a string.
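Putting that together, a program might build its options like this.  This
is a sketch: the environment variable name is made up, and it assumes
`gc_options_parse_and_set_many` reports success with a nonzero return;
check [`gc-options.h`](../api/gc-options.h) for the exact contract.
```c
#include <stdio.h>
#include <stdlib.h>
#include "gc-api.h"
#include "gc-options.h"
// Sketch: start from defaults, prefer adaptive sizing, then let an
// environment variable override individual settings.
static struct gc_options* make_options(void) {
  struct gc_options *options = gc_allocate_options();
  gc_option_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_ADAPTIVE);
  const char *user = getenv("MYPROGRAM_GC_OPTIONS");  // hypothetical variable
  if (user && !gc_options_parse_and_set_many(options, user))
    fprintf(stderr, "failed to parse GC options: %s\n", user);
  return options;
}
```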
### Allocation
So you have a heap and a mutator; great! Let's allocate! Call
`gc_allocate`, passing the mutator and the number of bytes to allocate.
There is also `gc_allocate_fast`, which is an inlined fast-path. If
that returns NULL, you need to call `gc_allocate_slow`. The advantage
of this API is that you can punt some root-saving overhead to the slow
path.
Allocation always succeeds. If it doesn't, it kills your program. The
bytes in the resulting allocation will be initialized to 0.
The allocation fast path is parameterized by collector-specific
attributes. JIT compilers can also read those attributes to emit
appropriate inline code that replicates the logic of `gc_allocate_fast`.
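A sketch of how an embedder might wrap these calls, assuming
`gc_allocate_fast` and `gc_allocate_slow` take just the mutator and a
byte count as described above; check `gc-api.h` for the exact signatures.
```c
// Sketch: inline fast path with a fallback to the slow path.
static inline void* my_alloc(struct gc_mutator *mut, size_t bytes) {
  void *obj = gc_allocate_fast(mut, bytes);
  if (!obj)
    obj = gc_allocate_slow(mut, bytes);  // safepoint; may trigger GC
  return obj;  // memory is zero-initialized; allocation never fails
}
```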
### Write barriers
For some collectors, mutators have to tell the collector whenever they
mutate an object. They tell the collector by calling a *write barrier*;
in Whippet this is currently the case only for generational collectors.
The write barrier is `gc_write_barrier`; see `gc-api.h` for its
parameters.
As with allocation, the fast path for the write barrier is parameterized
by collector-specific attributes, to allow JIT compilers to inline write
barriers.
### Safepoints
Sometimes Whippet will need to synchronize all threads, for example as
part of the "stop" phase of a stop-and-copy semi-space collector.
Whippet stops at *safepoints*. At a safepoint, all mutators must be
able to enumerate all of their edges to live objects.
Whippet has cooperative safepoints: mutators have to periodically call
into the collector to potentially synchronize with other mutators.
`gc_allocate_slow` is a safepoint, so if you have a bunch of threads that are
all allocating, usually safepoints are reached in a more-or-less prompt
fashion. But if a mutator isn't allocating, it either needs to
temporarily mark itself as inactive by trampolining through
`gc_call_without_gc`, or it should arrange to periodically call
`gc_safepoint`. Marking a mutator as inactive is the right strategy
for, for example, system calls that might block.  Periodic safepoints are
better for code that is active but not allocating.
Also, the BDW collector actually uses pre-emptive safepoints: it stops
threads via POSIX signals. `gc_safepoint` is a no-op with BDW.
Embedders can inline safepoint checks. If
`gc_cooperative_safepoint_kind()` is `GC_COOPERATIVE_SAFEPOINT_NONE`,
then the collector doesn't need safepoints, as is the case for `bdw`
which uses signals and `semi` which is single-threaded. If it is
`GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG`, then calling
`gc_safepoint_flag_loc` on a mutator will return the address of an `int`
in memory, which if nonzero when loaded using relaxed atomics indicates
that the mutator should call `gc_safepoint_slow`. Similarly for
`GC_COOPERATIVE_SAFEPOINT_MUTATOR_FLAG`, except that the address is
per-mutator rather than global.
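For either flag kind, an embedder's inlined check might look something
like this sketch; it follows the description above, with the relaxed load
written in the style of the simple embedder shipped with Whippet.
```c
#include <stdatomic.h>
// Sketch: inline cooperative safepoint check.  Works for both the heap
// flag and the per-mutator flag, since gc_safepoint_flag_loc takes the
// mutator in either case; bdw and semi report SAFEPOINT_NONE.
static inline void my_safepoint(struct gc_mutator *mut) {
  if (gc_cooperative_safepoint_kind() == GC_COOPERATIVE_SAFEPOINT_NONE)
    return;
  int *flag = gc_safepoint_flag_loc(mut);
  if (atomic_load_explicit(flag, memory_order_relaxed))
    gc_safepoint_slow(mut);
}
```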
### Pinning
Sometimes a mutator or embedder would like to tell the collector to not
move a particular object. This can happen for example during a foreign
function call, or if the embedder allows programs to access the address
of an object, for example to compute an identity hash code. To support
this use case, some Whippet collectors allow the embedder to *pin*
objects. Call `gc_pin_object` to prevent the collector from relocating
an object.
Pinning is currently supported by the `bdw` collector, which never moves
objects, and also by the various `mmc` collectors, which can move
objects that have no inbound conservative references.
Pinning is not supported on `semi` or `pcc`.
Call `gc_can_pin_objects` to determine whether the current collector can
pin objects.
### Statistics
Sometimes a program would like some information from the GC: how many
bytes and objects have been allocated? How much time has been spent in
the GC? How many times has GC run, and how many of those were minor
collections? What's the maximum pause time? Stuff like that.
Instead of collecting a fixed set of information, Whippet emits
callbacks when the collector reaches specific states. The embedder
provides a *listener* for these events when initializing the collector.
The listener interface is defined in
[`gc-event-listener.h`](../api/gc-event-listener.h). Whippet ships with
two listener implementations,
[`GC_NULL_EVENT_LISTENER`](../api/gc-null-event-listener.h), and
[`GC_BASIC_STATS`](../api/gc-basic-stats.h). Most embedders will want
their own listener, but starting with the basic stats listener is not a
bad option:
```
#include "gc-api.h"
#include "gc-basic-stats.h"
#include <stdio.h>
int main() {
struct gc_options *options = NULL;
struct gc_heap *heap;
struct gc_mutator *mut;
struct gc_basic_stats stats;
gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats);
// ...
gc_basic_stats_finish(&stats);
gc_basic_stats_print(&stats, stdout);
}
```
As you can see, `GC_BASIC_STATS` expands to a `struct gc_event_listener`
definition. We pass an associated pointer to a `struct gc_basic_stats`
instance which will be passed to the listener at every event.
The output of this program might be something like:
```
Completed 19 major collections (0 minor).
654.597 ms total time (385.235 stopped).
Heap size is 167.772 MB (max 167.772 MB); peak live data 55.925 MB.
```
There are currently three different sorts of events: heap events to
track heap growth, collector events to time different parts of
collection, and mutator events to indicate when specific mutators are
stopped.
There are three heap events:
* `init(void* data, size_t heap_size)`: Called during `gc_init`, to
allow the listener to initialize its associated state.
* `heap_resized(void* data, size_t new_size)`: Called if the heap grows
or shrinks.
* `live_data_size(void* data, size_t size)`: Called periodically when
the collector learns about live data size.
The collection events form a kind of state machine, and are called in
this order:
* `requesting_stop(void* data)`: Called when the collector asks
mutators to stop.
* `waiting_for_stop(void* data)`: Called when the collector has done
all the pre-stop work that it is able to and is just waiting on
mutators to stop.
* `mutators_stopped(void* data)`: Called when all mutators have
stopped; the trace phase follows.
* `prepare_gc(void* data, enum gc_collection_kind gc_kind)`: Called
to indicate which kind of collection is happening.
* `roots_traced(void* data)`: Called when roots have been visited.
* `heap_traced(void* data)`: Called when the whole heap has been
traced.
* `ephemerons_traced(void* data)`: Called when the [ephemeron
fixpoint](https://wingolog.org/archives/2023/01/24/parallel-ephemeron-tracing)
has been reached.
* `restarting_mutators(void* data)`: Called right before the collector
restarts mutators.
The collectors in Whippet will call all of these event handlers, but it
may be that they are called conservatively: for example, the
single-mutator, single-collector semi-space collector will never have to
wait for mutators to stop. It will still call the functions, though!
Finally, there are the mutator events:
* `mutator_added(void* data) -> void*`: The only event handler that
returns a value, called when a new mutator is added. The parameter
is the overall event listener data, and the result is
mutator-specific data. The rest of the mutator events pass this
mutator-specific data instead.
* `mutator_cause_gc(void* mutator_data)`: Called when a mutator causes
GC, either via allocation or an explicit `gc_collect` call.
* `mutator_stopping(void* mutator_data)`: Called when a mutator has
received the signal to stop. It may perform some marking work before
it stops.
* `mutator_stopped(void* mutator_data)`: Called when a mutator parks
itself.
* `mutator_restarted(void* mutator_data)`: Called when a mutator
restarts.
* `mutator_removed(void* mutator_data)`: Called when a mutator goes
away.
Note that these event handlers shouldn't really do much.  In
particular, they shouldn't call into the Whippet API, and they shouldn't
even access GC-managed objects. Event listeners are really about
statistics and profiling and aren't a place to mutate the object graph.
### Ephemerons
Whippet supports ephemerons, first-class objects that weakly associate
keys with values.  If an ephemeron's key ever becomes unreachable,
the ephemeron becomes dead and loses its value.
The user-facing API is in [`gc-ephemeron.h`](../api/gc-ephemeron.h). To
allocate an ephemeron, call `gc_allocate_ephemeron`, then initialize its
key and value via `gc_ephemeron_init`. Get the key and value via
`gc_ephemeron_key` and `gc_ephemeron_value`, respectively.
In Whippet, ephemerons can be linked together in a chain. During GC, if
an ephemeron's chain points to a dead ephemeron, that link will be
elided, allowing the dead ephemeron itself to be collected. In that
way, ephemerons can be used to build weak data structures such as weak
maps.
Weak data structures are often shared across multiple threads, so all
routines to access and modify chain links are atomic. Use
`gc_ephemeron_chain_head` to access the head of a storage location that
points to an ephemeron; push a new ephemeron on a location with
`gc_ephemeron_chain_push`; and traverse a chain with
`gc_ephemeron_chain_next`.
An ephemeron association can be removed via `gc_ephemeron_mark_dead`.
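As a sketch of how an embedder might build a bucket of a weak table on
top of these routines; the signatures follow the descriptions above, and
helpers like `gc_ref_value` and `gc_ref_null` are assumptions to check
against `gc-ref.h` and [`gc-ephemeron.h`](../api/gc-ephemeron.h).
```c
// Sketch: each bucket of a weak table is an ephemeron chain.
static void weak_table_insert(struct gc_mutator *mut,
                              struct gc_ephemeron **bucket,
                              struct gc_ref key, struct gc_ref value) {
  struct gc_ephemeron *e = gc_allocate_ephemeron(mut);
  gc_ephemeron_init(mut, e, key, value);
  gc_ephemeron_chain_push(bucket, e);  // atomic; safe to share across threads
}
static struct gc_ref weak_table_lookup(struct gc_ephemeron **bucket,
                                       struct gc_ref key) {
  for (struct gc_ephemeron *e = gc_ephemeron_chain_head(bucket); e;
       e = gc_ephemeron_chain_next(e))
    if (gc_ref_value(gc_ephemeron_key(e)) == gc_ref_value(key))
      return gc_ephemeron_value(e);
  return gc_ref_null();  // assumed "no value" sentinel
}
```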
### Finalizers
A finalizer allows the embedder to be notified when an object becomes
unreachable.
A finalizer has a priority. When the heap is created, the embedder
should declare how many priorities there are. Lower-numbered priorities
take precedence; if an object has a priority-0 finalizer outstanding,
that will prevent any finalizer at level 1 (or 2, ...) from firing
until no priority-0 finalizer remains.
Call `gc_finalizer_attach`, from `gc-finalizer.h`, to attach a finalizer
to an object.
A finalizer also references an associated GC-managed closure object.
A finalizer's reference to the closure object is strong: if a
finalizer's closure references its finalizable object,
directly or indirectly, the finalizer will never fire.
When an object with a finalizer becomes unreachable, it is added to a
queue. The embedder can call `gc_pop_finalizable` to get the next
finalizable object and its associated closure. At that point the
embedder can do anything with the object, including keeping it alive.
Ephemeron associations will still be present while the finalizable
object is live. Note however that any objects referenced by the
finalizable object may themselves be already finalized; finalizers are
enqueued for objects when they become unreachable, which can concern
whole subgraphs of objects at once.
The usual way for an embedder to know when the queue of finalizable
objects is non-empty is to call `gc_set_finalizer_callback` to
provide a function that will be invoked when there are pending
finalizers.
Arranging to call `gc_pop_finalizable` and doing something with the
finalizable object and closure is the responsibility of the embedder.
The embedder's finalization action can end up invoking arbitrary code,
so unless the embedder imposes some kind of restriction on what
finalizers can do, generally speaking finalizers should be run in a
dedicated thread instead of recursively from within whatever mutator
thread caused GC. Setting up such a thread is the responsibility of the
mutator. `gc_pop_finalizable` is thread-safe, allowing multiple
finalization threads if that is appropriate.
`gc_allocate_finalizer` returns a finalizer, which is a fresh GC-managed
heap object. The mutator should then directly attach it to an object
using `gc_finalizer_attach`. When the finalizer is fired, it becomes
available to the mutator via `gc_pop_finalizable`.
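A sketch of the overall flow; the argument lists and the accessors for
the popped finalizer's object and closure are assumptions based on the
text above, so consult [`gc-finalizer.h`](../api/gc-finalizer.h) for the
real declarations.
```c
// Sketch: attach a priority-0 finalizer and drain the finalizable queue,
// for example from a dedicated finalization thread.
static void add_finalizer(struct gc_mutator *mut, struct gc_ref obj,
                          struct gc_ref closure) {
  struct gc_finalizer *f = gc_allocate_finalizer(mut);
  gc_finalizer_attach(mut, f, /*priority=*/0, obj, closure);
}
static void drain_finalizables(struct gc_mutator *mut) {
  for (;;) {
    struct gc_finalizer *f = gc_pop_finalizable(mut);
    if (!f) break;
    struct gc_ref obj = gc_finalizer_object(f);       // assumed accessor
    struct gc_ref closure = gc_finalizer_closure(f);  // assumed accessor
    run_finalization_action(obj, closure);            // hypothetical embedder code
  }
}
```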

Binary image file added (169 KiB); not shown.

View file

@@ -0,0 +1,127 @@
# Whippet performance tracing
Whippet includes support for run-time tracing via
[LTTng](https://LTTng.org) user-space tracepoints. This allows you to
get a detailed look at how Whippet is performing on your system.
Tracing support is currently limited to Linux systems.
## Getting started
First, you need to build Whippet with LTTng support. Usually this is as
easy as building it in an environment where the `lttng-ust` library is
present, as determined by `pkg-config --libs lttng-ust`. You can tell
whether your Whippet build has tracing support by checking whether the
resulting binaries are dynamically linked to `liblttng-ust`.
If we take as an example the `mt-gcbench` test in the Whippet source
tree, we would have:
```
$ ldd bin/mt-gcbench.pcc | grep lttng
...
liblttng-ust.so.1 => ...
...
```
### Capturing traces
Actually capturing traces is a little annoying; it's not as easy as
`perf record`. The [LTTng
documentation](https://lttng.org/docs/v2.13/#doc-controlling-tracing) is
quite thorough, but here is a summary.
First, create your tracing session:
```
$ lttng create
Session auto-20250214-091153 created.
Traces will be output to ~/lttng-traces/auto-20250214-091153
```
You run all these commands as your own user; they don't require root
permissions or system-wide modifications, as all of the Whippet
tracepoints are user-space tracepoints (UST).
Just having an LTTng session created won't do anything though; you need
to configure the session. Monotonic nanosecond-resolution timestamps
are already implicitly part of each event. We also want to have process
and thread IDs for all events:
```
$ lttng add-context --userspace --type=vpid --type=vtid
ust context vpid added to all channels
ust context vtid added to all channels
```
Now enable Whippet events:
```
$ lttng enable-event --userspace 'whippet:*'
ust event whippet:* created in channel channel0
```
And now, start recording:
```
$ lttng start
Tracing started for session auto-20250214-091153
```
With this, traces will be captured for our program of interest:
```
$ bin/mt-gcbench.pcc 2.5 8
...
```
Now stop the trace:
```
$ lttng stop
Waiting for data availability
Tracing stopped for session auto-20250214-091153
```
Whew. If we did it right, our data is now in
`~/lttng-traces/auto-20250214-091153`.
### Visualizing traces
LTTng produces traces in the [Common Trace Format
(CTF)](https://diamon.org/ctf/). My favorite trace viewing tool is the
family of web-based trace viewers derived from `chrome://tracing`. The
best of these appear to be [the Firefox
profiler](https://profiler.firefox.com) and
[Perfetto](https://ui.perfetto.dev). Unfortunately neither of these can
work with CTF directly, so we instead need to run a trace converter.
Oddly, there is no trace converter that can read CTF and write something
that Perfetto (e.g.) can read. However there is a [JSON-based tracing
format that these tools can
read](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw),
and [Python bindings for Babeltrace, a library that works with
CTF](https://babeltrace.org/), so that's what we do:
```
$ python3 ctf_to_json.py ~/lttng-traces/auto-20250214-091153 > trace.json
```
While Firefox Profiler can load this file, it works better on Perfetto,
as the Whippet events are visually rendered on their respective threads.
![Screenshot of part of Perfetto UI showing a minor GC](./perfetto-minor-gc.png)
### Expanding the set of events
As of February 2025, the set of tracepoints includes the [heap
events](https://github.com/wingo/whippet/blob/main/doc/manual.md#statistics)
and some detailed internals of the parallel tracer. We expect this set
of tracepoints to expand over time.
### Overhead of tracepoints
When tracepoints are compiled in but no events are enabled, tracepoints
appear to have no impact on run-time. When event collection is on, for
x86-64 hardware, [emitting a tracepoint event takes about
100ns](https://discuss.systems/@DesnoyersMa/113986344940256872).

207
libguile/whippet/embed.am Normal file
View file

@@ -0,0 +1,207 @@
# Automake snippet for embedding Whippet in an autotools project.
#
# The including Makefile.am needs to do this, assuming Whippet is in the
# whippet/ subdirectory:
#
# noinst_LTLIBRARIES =
# WHIPPET_EMBEDDER_CPPFLAGS = -include src/my-embedder.h
# include whippet/embed.am
#
# my-embedder.h should provide the various hooks that Whippet needs to
# specialize itself to the embedder's object representation.
#
# The result is a libwhippet.la. To compile and link against it:
#
# AM_CFLAGS = $(WHIPPET_CPPFLAGS) $(WHIPPET_CFLAGS) $(WHIPPET_TO_EMBEDDER_CPPFLAGS)
# LDADD = libwhippet.la
# AM_LDFLAGS = $(WHIPPET_TO_EMBEDDER_LDFLAGS)
#
# The assumption is that the embedder will build a single copy of
# Whippet, specialized against a single collector, a single set of
# embedder hooks, and a single target platform. The collector and
# platform should be chosen at configure-time. Because Automake really
# wants the set of source files to be visible to it at automake-time, we
# need to implement these conditions via AM_CONDITIONAL in a
# configure.ac. For example for a parallel-mmc configuration on
# gnu-linux, we would need:
#
# AM_SUBST(WHIPPET_COLLECTOR, parallel-mmc)
# AM_CONDITIONAL(WHIPPET_COLLECTOR_SEMI, 0)
# AM_CONDITIONAL(WHIPPET_COLLECTOR_PCC, 0)
# AM_CONDITIONAL(WHIPPET_COLLECTOR_BDW, 0)
# AM_CONDITIONAL(WHIPPET_COLLECTOR_MMC, 1)
# AM_CONDITIONAL(WHIPPET_PLATFORM_GNU_LINUX, 1)
#
# Then there are other conditionals for compilation options:
#
# AM_CONDITIONAL(WHIPPET_ENABLE_DEBUG, 0)
# AM_CONDITIONAL(WHIPPET_USE_LTTNG, 1)
#
# Finally, LTO should be enabled, for best performance. This should be
# added to CFLAGS at configure-time.
#
# Getting all of this in there is gnarly. See the example configure.ac
# for one take on the topic.
noinst_LTLIBRARIES += libwhippet-common.la libwhippet.la
libwhippet_common_la_SOURCES = \
%D%/src/gc-options-internal.h \
%D%/src/gc-options.c \
%D%/src/gc-stack.c \
%D%/src/gc-stack.h \
%D%/src/gc-tracepoint.c
if WHIPPET_PLATFORM_GNU_LINUX
libwhippet_common_la_SOURCES += %D%/src/gc-platform-gnu-linux.c
endif
libwhippet_la_SOURCES = \
%D%/src/adaptive-heap-sizer.h \
%D%/src/address-hash.h \
%D%/src/address-map.h \
%D%/src/address-set.h \
%D%/src/assert.h \
%D%/src/background-thread.h \
%D%/src/copy-space.h \
%D%/src/debug.h \
%D%/src/extents.h \
%D%/src/field-set.h \
%D%/src/freelist.h \
%D%/src/gc-align.h \
%D%/src/gc-ephemeron-internal.h \
%D%/src/gc-ephemeron.c \
%D%/src/gc-finalizer-internal.h \
%D%/src/gc-finalizer.c \
%D%/src/gc-internal.h \
%D%/src/gc-lock.h \
%D%/src/gc-platform.h \
%D%/src/gc-trace.h \
%D%/src/growable-heap-sizer.h \
%D%/src/heap-sizer.h \
%D%/src/large-object-space.h \
%D%/src/local-worklist.h \
%D%/src/nofl-space.h \
%D%/src/parallel-tracer.h \
%D%/src/root.h \
%D%/src/root-worklist.h \
%D%/src/serial-tracer.h \
%D%/src/shared-worklist.h \
%D%/src/simple-worklist.h \
%D%/src/spin.h \
%D%/src/splay-tree.h \
%D%/src/swar.h \
%D%/src/tracer.h
WHIPPET_CFLAGS_bdw = -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1
WHIPPET_CFLAGS_semi = -DGC_PRECISE_ROOTS=1
WHIPPET_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1
WHIPPET_CFLAGS_generational_pcc = $(WHIPPET_CFLAGS_pcc) -DGC_GENERATIONAL=1
WHIPPET_CFLAGS_mmc = \
-DGC_PRECISE_ROOTS=1
WHIPPET_CFLAGS_generational_mmc = \
-DGC_PRECISE_ROOTS=1 -DGC_GENERATIONAL=1
WHIPPET_CFLAGS_parallel_mmc = \
-DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1
WHIPPET_CFLAGS_parallel_generational_mmc = \
-DGC_PRECISE_ROOTS=1 -DGC_GENERATIONAL=1 -DGC_PARALLEL=1
WHIPPET_CFLAGS_stack_conservative_mmc = \
-DGC_CONSERVATIVE_ROOTS=1
WHIPPET_CFLAGS_stack_conservative_generational_mmc = \
-DGC_CONSERVATIVE_ROOTS=1 -DGC_GENERATIONAL=1
WHIPPET_CFLAGS_stack_conservative_parallel_mmc = \
-DGC_CONSERVATIVE_ROOTS=1 -DGC_PARALLEL=1
WHIPPET_CFLAGS_stack_conservative_parallel_generational_mmc = \
-DGC_CONSERVATIVE_ROOTS=1 -DGC_GENERATIONAL=1 -DGC_PARALLEL=1
WHIPPET_CFLAGS_heap_conservative_mmc = \
-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1
WHIPPET_CFLAGS_heap_conservative_generational_mmc = \
-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_GENERATIONAL=1
WHIPPET_CFLAGS_heap_conservative_parallel_mmc = \
-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_PARALLEL=1
WHIPPET_CFLAGS_heap_conservative_parallel_generational_mmc = \
-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_GENERATIONAL=1 -DGC_PARALLEL=1
WHIPPET_CFLAGS = $(WHIPPET_CFLAGS_$(subst -,_,$(WHIPPET_COLLECTOR)))
WHIPPET_IMPL_CFLAGS =
WHIPPET_LIBS = -lm
WHIPPET_CPPFLAGS = -I$(srcdir)/%D%/api
WHIPPET_TO_EMBEDDER_CPPFLAGS = $(WHIPPET_CPPFLAGS)
if WHIPPET_ENABLE_DEBUG
WHIPPET_CFLAGS += -DGC_DEBUG=1
endif
if WHIPPET_COLLECTOR_SEMI
libwhippet_la_SOURCES += %D%/src/semi.c
WHIPPET_TO_EMBEDDER_CPPFLAGS += -include $(srcdir)/%D%/api/semi-attrs.h
endif
if WHIPPET_COLLECTOR_PCC
libwhippet_la_SOURCES += %D%/src/pcc.c
WHIPPET_TO_EMBEDDER_CPPFLAGS += -include $(srcdir)/%D%/api/pcc-attrs.h
endif
if WHIPPET_COLLECTOR_BDW
libwhippet_la_SOURCES += %D%/src/bdw.c
WHIPPET_IMPL_CFLAGS += $(WHIPPET_BDW_CFLAGS)
WHIPPET_LIBS += $(WHIPPET_BDW_LIBS)
WHIPPET_TO_EMBEDDER_CPPFLAGS += -include $(srcdir)/%D%/api/bdw-attrs.h
endif
if WHIPPET_COLLECTOR_MMC
libwhippet_la_SOURCES += %D%/src/mmc.c
WHIPPET_TO_EMBEDDER_CPPFLAGS += -include $(srcdir)/%D%/api/mmc-attrs.h
endif
# add to cflags: -flto -fvisibility=hidden -fno-strict-aliasing
libwhippet_common_la_CPPFLAGS = $(WHIPPET_CPPFLAGS)
libwhippet_common_la_CFLAGS = -Wall -Wno-unused $(CFLAGS)
libwhippet_common_la_CFLAGS += $(WHIPPET_CFLAGS)
libwhippet_common_la_LDFLAGS = -lpthread $(LDFLAGS)
libwhippet_common_la_LIBADD = $(LIBS)
if WHIPPET_USE_LTTNG
libwhippet_common_la_CPPFLAGS += $(WHIPPET_LTTNG_CFLAGS) -DGC_TRACEPOINT_LTTNG=1
WHIPPET_LIBS += $(WHIPPET_LTTNG_LIBS)
endif
if !WHIPPET_ENABLE_DEBUG
libwhippet_common_la_CFLAGS += -DNDEBUG
endif
libwhippet_la_CPPFLAGS = $(libwhippet_common_la_CPPFLAGS) $(WHIPPET_EMBEDDER_CPPFLAGS)
libwhippet_la_CFLAGS = $(libwhippet_common_la_CFLAGS)
libwhippet_la_CFLAGS += $(WHIPPET_IMPL_CFLAGS)
libwhippet_la_LDFLAGS = $(libwhippet_common_la_LDFLAGS) $(WHIPPET_LIBS)
libwhippet_la_LIBADD = libwhippet-common.la
noinst_HEADERS = \
%D%/api/bdw-attrs.h \
%D%/api/gc-allocation-kind.h \
%D%/api/gc-api.h \
%D%/api/gc-assert.h \
%D%/api/gc-attrs.h \
%D%/api/gc-basic-stats.h \
%D%/api/gc-collection-kind.h \
%D%/api/gc-config.h \
%D%/api/gc-conservative-ref.h \
%D%/api/gc-edge.h \
%D%/api/gc-embedder-api.h \
%D%/api/gc-ephemeron.h \
%D%/api/gc-event-listener-chain.h \
%D%/api/gc-event-listener.h \
%D%/api/gc-finalizer.h \
%D%/api/gc-forwarding.h \
%D%/api/gc-histogram.h \
%D%/api/gc-inline.h \
%D%/api/gc-lttng.h \
%D%/api/gc-null-event-listener.h \
%D%/api/gc-options.h \
%D%/api/gc-ref.h \
%D%/api/gc-tracepoint.h \
%D%/api/gc-visibility.h \
%D%/api/mmc-attrs.h \
%D%/api/pcc-attrs.h \
%D%/api/semi-attrs.h

105
libguile/whippet/embed.mk Normal file
View file

@@ -0,0 +1,105 @@
GC_COLLECTOR ?= semi
DEFAULT_BUILD := opt
BUILD_CFLAGS_opt = -O2 -g -DNDEBUG
BUILD_CFLAGS_optdebug = -O2 -g -DGC_DEBUG=1
BUILD_CFLAGS_debug = -O0 -g -DGC_DEBUG=1
GC_BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(GC_BUILD),$(DEFAULT_BUILD)))
V ?= 1
v_0 = @
v_1 =
GC_USE_LTTNG_0 :=
GC_USE_LTTNG_1 := 1
GC_USE_LTTNG := $(shell pkg-config --exists lttng-ust && echo 1 || echo 0)
GC_LTTNG_CPPFLAGS := $(if $(GC_USE_LTTNG_$(GC_USE_LTTNG)), $(shell pkg-config --cflags lttng-ust),)
GC_LTTNG_LIBS := $(if $(GC_USE_LTTNG_$(GC_USE_LTTNG)), $(shell pkg-config --libs lttng-ust),)
GC_TRACEPOINT_CPPFLAGS = $(if $(GC_USE_LTTNG_$(GC_USE_LTTNG)),$(GC_LTTNG_CPPFLAGS) -DGC_TRACEPOINT_LTTNG=1,)
GC_TRACEPOINT_LIBS = $(GC_LTTNG_LIBS)
GC_V = $(v_$(V))
GC_CC = gcc
GC_CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(GC_BUILD_CFLAGS)
GC_CPPFLAGS = -I$(WHIPPET)api $(GC_TRACEPOINT_CPPFLAGS)
GC_LDFLAGS = -lpthread -flto=auto $(GC_TRACEPOINT_LIBS)
GC_DEPFLAGS =
GC_COMPILE = $(GC_V)$(GC_CC) $(GC_CFLAGS) $(GC_CPPFLAGS) $(GC_DEPFLAGS) -o $@
GC_LINK = $(GC_V)$(GC_CC) $(GC_LDFLAGS) -o $@
GC_PLATFORM = gnu-linux
GC_OBJDIR =
$(GC_OBJDIR)gc-platform.o: $(WHIPPET)src/gc-platform-$(GC_PLATFORM).c
$(GC_COMPILE) -c $<
$(GC_OBJDIR)gc-stack.o: $(WHIPPET)src/gc-stack.c
$(GC_COMPILE) -c $<
$(GC_OBJDIR)gc-options.o: $(WHIPPET)src/gc-options.c
$(GC_COMPILE) -c $<
$(GC_OBJDIR)gc-tracepoint.o: $(WHIPPET)src/gc-tracepoint.c
$(GC_COMPILE) -c $<
$(GC_OBJDIR)gc-ephemeron.o: $(WHIPPET)src/gc-ephemeron.c
$(GC_COMPILE) $(EMBEDDER_TO_GC_CFLAGS) -c $<
$(GC_OBJDIR)gc-finalizer.o: $(WHIPPET)src/gc-finalizer.c
$(GC_COMPILE) $(EMBEDDER_TO_GC_CFLAGS) -c $<
GC_STEM_bdw = bdw
GC_CFLAGS_bdw = -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1
GC_IMPL_CFLAGS_bdw = `pkg-config --cflags bdw-gc`
GC_LIBS_bdw = `pkg-config --libs bdw-gc`
GC_STEM_semi = semi
GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1
GC_LIBS_semi = -lm
GC_STEM_pcc = pcc
GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1
GC_LIBS_pcc = -lm
GC_STEM_generational_pcc = $(GC_STEM_pcc)
GC_CFLAGS_generational_pcc = $(GC_CFLAGS_pcc) -DGC_GENERATIONAL=1
GC_LIBS_generational_pcc = $(GC_LIBS_pcc)
define mmc_variant
GC_STEM_$(1) = mmc
GC_CFLAGS_$(1) = $(2)
GC_LIBS_$(1) = -lm
endef
define generational_mmc_variants
$(call mmc_variant,$(1)mmc,$(2))
$(call mmc_variant,$(1)generational_mmc,$(2) -DGC_GENERATIONAL=1)
endef
define parallel_mmc_variants
$(call generational_mmc_variants,$(1),$(2))
$(call generational_mmc_variants,$(1)parallel_,$(2) -DGC_PARALLEL=1)
endef
define trace_mmc_variants
$(call parallel_mmc_variants,,-DGC_PRECISE_ROOTS=1)
$(call parallel_mmc_variants,stack_conservative_,-DGC_CONSERVATIVE_ROOTS=1)
$(call parallel_mmc_variants,heap_conservative_,-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)
endef
$(eval $(call trace_mmc_variants))
gc_var = $($(1)$(subst -,_,$(2)))
gc_impl = $(call gc_var,GC_STEM_,$(1)).c
gc_attrs = $(call gc_var,GC_STEM_,$(1))-attrs.h
gc_cflags = $(call gc_var,GC_CFLAGS_,$(1))
gc_impl_cflags = $(call gc_var,GC_IMPL_CFLAGS_,$(1))
gc_libs = $(call gc_var,GC_LIBS_,$(1))
GC_IMPL = $(call gc_impl,$(GC_COLLECTOR))
GC_CFLAGS += $(call gc_cflags,$(GC_COLLECTOR))
GC_IMPL_CFLAGS = $(call gc_impl_cflags,$(GC_COLLECTOR))
GC_ATTRS = $(WHIPPET)api/$(call gc_attrs,$(GC_COLLECTOR))
GC_TO_EMBEDDER_CFLAGS = -include $(GC_ATTRS)
GC_LIBS = $(call gc_libs,$(GC_COLLECTOR))
$(GC_OBJDIR)gc-impl.o: $(WHIPPET)src/$(call gc_impl,$(GC_COLLECTOR))
$(GC_COMPILE) $(GC_IMPL_CFLAGS) $(EMBEDDER_TO_GC_CFLAGS) -c $<
GC_OBJS=$(foreach O,gc-platform.o gc-stack.o gc-options.o gc-tracepoint.o gc-ephemeron.o gc-finalizer.o gc-impl.o,$(GC_OBJDIR)$(O))

View file

@@ -0,0 +1,11 @@
(use-modules (guix packages))
(specifications->manifest
'("bash"
"coreutils"
"gcc-toolchain"
"lttng-ust"
"glibc"
"libgc"
"make"
"pkg-config"))

View file

@@ -0,0 +1,171 @@
#ifndef ADAPTIVE_HEAP_SIZER_H
#define ADAPTIVE_HEAP_SIZER_H
#include <math.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"
#include "background-thread.h"
#include "debug.h"
#include "gc-config.h"
#include "gc-platform.h"
#include "heap-sizer.h"
// This is the MemBalancer algorithm from "Optimal Heap Limits for Reducing
// Browser Memory Use" by Marisa Kirisame, Pranav Shenoy, and Pavel Panchekha
// (https://arxiv.org/abs/2204.10455).
//
// This implementation differs slightly in that the constant "c" of the paper
// has been extracted outside the radical, and notionally reversed: it is a
// unitless "expansiveness" parameter whose domain is [0,+∞]. Also there are
// minimum and maximum heap size multipliers, and a minimum amount of free
// space. The initial collection rate is an informed guess. The initial
// allocation rate estimate is high, considering that allocation rates are often
// high on program startup.
struct gc_adaptive_heap_sizer {
uint64_t (*get_allocation_counter)(struct gc_heap *heap);
void (*set_heap_size)(struct gc_heap *heap, size_t size);
struct gc_heap *heap;
uint64_t smoothed_pause_time;
uint64_t smoothed_live_bytes;
uint64_t live_bytes;
double smoothed_allocation_rate;
double collection_smoothing_factor;
double allocation_smoothing_factor;
double minimum_multiplier;
double maximum_multiplier;
double minimum_free_space;
double expansiveness;
#if GC_PARALLEL
pthread_mutex_t lock;
#endif
int background_task_id;
uint64_t last_bytes_allocated;
uint64_t last_heartbeat;
};
static void
gc_adaptive_heap_sizer_lock(struct gc_adaptive_heap_sizer *sizer) {
#if GC_PARALLEL
pthread_mutex_lock(&sizer->lock);
#endif
}
static void
gc_adaptive_heap_sizer_unlock(struct gc_adaptive_heap_sizer *sizer) {
#if GC_PARALLEL
pthread_mutex_unlock(&sizer->lock);
#endif
}
// With lock
static uint64_t
gc_adaptive_heap_sizer_calculate_size(struct gc_adaptive_heap_sizer *sizer) {
double allocation_rate = sizer->smoothed_allocation_rate;
double collection_rate =
(double)sizer->smoothed_pause_time / (double)sizer->smoothed_live_bytes;
double radicand = sizer->live_bytes * allocation_rate / collection_rate;
double multiplier = 1.0 + sizer->expansiveness * sqrt(radicand);
if (isnan(multiplier) || multiplier < sizer->minimum_multiplier)
multiplier = sizer->minimum_multiplier;
else if (multiplier > sizer->maximum_multiplier)
multiplier = sizer->maximum_multiplier;
uint64_t size = sizer->live_bytes * multiplier;
if (size - sizer->live_bytes < sizer->minimum_free_space)
size = sizer->live_bytes + sizer->minimum_free_space;
return size;
}
static uint64_t
gc_adaptive_heap_sizer_set_expansiveness(struct gc_adaptive_heap_sizer *sizer,
double expansiveness) {
gc_adaptive_heap_sizer_lock(sizer);
sizer->expansiveness = expansiveness;
uint64_t heap_size = gc_adaptive_heap_sizer_calculate_size(sizer);
gc_adaptive_heap_sizer_unlock(sizer);
return heap_size;
}
static void
gc_adaptive_heap_sizer_on_gc(struct gc_adaptive_heap_sizer *sizer,
size_t live_bytes, uint64_t pause_ns,
void (*set_heap_size)(struct gc_heap*, size_t)) {
gc_adaptive_heap_sizer_lock(sizer);
sizer->live_bytes = live_bytes;
sizer->smoothed_live_bytes *= 1.0 - sizer->collection_smoothing_factor;
sizer->smoothed_live_bytes += sizer->collection_smoothing_factor * live_bytes;
sizer->smoothed_pause_time *= 1.0 - sizer->collection_smoothing_factor;
sizer->smoothed_pause_time += sizer->collection_smoothing_factor * pause_ns;
set_heap_size(sizer->heap, gc_adaptive_heap_sizer_calculate_size(sizer));
gc_adaptive_heap_sizer_unlock(sizer);
}
static void
gc_adaptive_heap_sizer_background_task(void *data) {
struct gc_adaptive_heap_sizer *sizer = data;
gc_adaptive_heap_sizer_lock(sizer);
uint64_t bytes_allocated =
sizer->get_allocation_counter(sizer->heap);
// bytes_allocated being 0 means the request failed; retry later.
if (bytes_allocated) {
uint64_t heartbeat = gc_platform_monotonic_nanoseconds();
double rate = (double) (bytes_allocated - sizer->last_bytes_allocated) /
(double) (heartbeat - sizer->last_heartbeat);
// Just smooth the rate, under the assumption that the denominator is almost
// always 1.
sizer->smoothed_allocation_rate *= 1.0 - sizer->allocation_smoothing_factor;
sizer->smoothed_allocation_rate += rate * sizer->allocation_smoothing_factor;
sizer->last_heartbeat = heartbeat;
sizer->last_bytes_allocated = bytes_allocated;
sizer->set_heap_size(sizer->heap,
gc_adaptive_heap_sizer_calculate_size(sizer));
}
gc_adaptive_heap_sizer_unlock(sizer);
}
static struct gc_adaptive_heap_sizer*
gc_make_adaptive_heap_sizer(struct gc_heap *heap, double expansiveness,
uint64_t (*get_allocation_counter)(struct gc_heap*),
void (*set_heap_size)(struct gc_heap*, size_t),
struct gc_background_thread *thread) {
struct gc_adaptive_heap_sizer *sizer;
sizer = malloc(sizeof(*sizer));
if (!sizer)
GC_CRASH();
memset(sizer, 0, sizeof(*sizer));
sizer->get_allocation_counter = get_allocation_counter;
sizer->set_heap_size = set_heap_size;
sizer->heap = heap;
// Baseline estimate of GC speed: 10 MB/ms, or 10 bytes/ns. However since we
// observe this speed by separately noisy measurements, we have to provide
// defaults for numerator and denominator; estimate 2ms for initial GC pauses
// for 20 MB of live data during program startup.
sizer->smoothed_pause_time = 2 * 1000 * 1000;
sizer->smoothed_live_bytes = 20 * 1024 * 1024;
// Baseline estimate of allocation rate during startup: 50 MB in 10ms, or 5
// bytes/ns.
sizer->smoothed_allocation_rate = 5;
sizer->collection_smoothing_factor = 0.5;
sizer->allocation_smoothing_factor = 0.95;
sizer->minimum_multiplier = 1.1;
sizer->maximum_multiplier = 5;
sizer->minimum_free_space = 4 * 1024 * 1024;
sizer->expansiveness = expansiveness;
sizer->last_bytes_allocated = get_allocation_counter(heap);
sizer->last_heartbeat = gc_platform_monotonic_nanoseconds();
#if GC_PARALLEL
pthread_mutex_init(&thread->lock, NULL);
sizer->background_task_id =
gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_MIDDLE,
gc_adaptive_heap_sizer_background_task,
sizer);
#else
sizer->background_task_id = -1;
#endif
return sizer;
}
#endif // ADAPTIVE_HEAP_SIZER_H

View file

@@ -0,0 +1,45 @@
#ifndef ADDRESS_HASH_H
#define ADDRESS_HASH_H
#include <stdint.h>
static uintptr_t hash_address(uintptr_t x) {
if (sizeof (x) < 8) {
// Chris Wellon's lowbias32, from https://nullprogram.com/blog/2018/07/31/.
x ^= x >> 16;
x *= 0x7feb352dU;
x ^= x >> 15;
x *= 0x846ca68bU;
x ^= x >> 16;
return x;
} else {
// Sebastiano Vigna's splitmix64 integer mixer, from
// https://prng.di.unimi.it/splitmix64.c.
x ^= x >> 30;
x *= 0xbf58476d1ce4e5b9U;
x ^= x >> 27;
x *= 0x94d049bb133111ebU;
x ^= x >> 31;
return x;
}
}
// Inverse of hash_address from https://nullprogram.com/blog/2018/07/31/.
static uintptr_t unhash_address(uintptr_t x) {
if (sizeof (x) < 8) {
x ^= x >> 16;
x *= 0x43021123U;
x ^= x >> 15 ^ x >> 30;
x *= 0x1d69e2a5U;
x ^= x >> 16;
return x;
} else {
x ^= x >> 31 ^ x >> 62;
x *= 0x319642b2d24d8ec3U;
x ^= x >> 27 ^ x >> 54;
x *= 0x96de1b173f119089U;
x ^= x >> 30 ^ x >> 60;
return x;
}
}
#endif // ADDRESS_HASH_H

View file

@@ -0,0 +1,213 @@
#ifndef ADDRESS_MAP_H
#define ADDRESS_MAP_H
#include <malloc.h>
#include <stdint.h>
#include <string.h>
#include "address-hash.h"
#include "gc-assert.h"
struct hash_map_entry {
uintptr_t k;
uintptr_t v;
};
struct hash_map {
struct hash_map_entry *data;
size_t size; // total number of slots
size_t n_items; // number of items in set
uint8_t *bits; // bitvector indicating set slots
};
static void hash_map_clear(struct hash_map *map) {
memset(map->bits, 0, map->size / 8);
map->n_items = 0;
}
// Size must be a power of 2.
static void hash_map_init(struct hash_map *map, size_t size) {
map->size = size;
map->data = malloc(sizeof(struct hash_map_entry) * size);
if (!map->data) GC_CRASH();
map->bits = malloc(size / 8);
if (!map->bits) GC_CRASH();
hash_map_clear(map);
}
static void hash_map_destroy(struct hash_map *map) {
free(map->data);
free(map->bits);
}
static size_t hash_map_slot_index(struct hash_map *map, size_t idx) {
return idx & (map->size - 1);
}
static struct hash_map_entry* hash_map_slot_entry(struct hash_map *map,
size_t idx) {
return &map->data[hash_map_slot_index(map, idx)];
}
static int hash_map_slot_is_empty(struct hash_map *map, size_t idx) {
idx = hash_map_slot_index(map, idx);
return (map->bits[idx / 8] & (1 << (idx % 8))) == 0;
}
static void hash_map_slot_acquire(struct hash_map *map, size_t idx) {
idx = hash_map_slot_index(map, idx);
map->bits[idx / 8] |= (1 << (idx % 8));
map->n_items++;
}
static void hash_map_slot_release(struct hash_map *map, size_t idx) {
idx = hash_map_slot_index(map, idx);
map->bits[idx / 8] &= ~(1 << (idx % 8));
map->n_items--;
}
static size_t hash_map_slot_distance(struct hash_map *map, size_t idx) {
return hash_map_slot_index(map, idx - hash_map_slot_entry(map, idx)->k);
}
static int hash_map_should_shrink(struct hash_map *map) {
return map->size > 8 && map->n_items <= (map->size >> 3);
}
static int hash_map_should_grow(struct hash_map *map) {
return map->n_items >= map->size - (map->size >> 3);
}
static void hash_map_do_insert(struct hash_map *map, uintptr_t k, uintptr_t v) {
size_t displacement = 0;
while (!hash_map_slot_is_empty(map, k + displacement)
&& displacement < hash_map_slot_distance(map, k + displacement))
displacement++;
while (!hash_map_slot_is_empty(map, k + displacement)
&& displacement == hash_map_slot_distance(map, k + displacement)) {
if (hash_map_slot_entry(map, k + displacement)->k == k) {
hash_map_slot_entry(map, k + displacement)->v = v;
return;
}
displacement++;
}
size_t idx = k + displacement;
size_t slots_to_move = 0;
while (!hash_map_slot_is_empty(map, idx + slots_to_move))
slots_to_move++;
hash_map_slot_acquire(map, idx + slots_to_move);
while (slots_to_move--)
*hash_map_slot_entry(map, idx + slots_to_move + 1) =
*hash_map_slot_entry(map, idx + slots_to_move);
*hash_map_slot_entry(map, idx) = (struct hash_map_entry){ k, v };
}
static void hash_map_populate(struct hash_map *dst, struct hash_map *src) {
for (size_t i = 0; i < src->size; i++)
if (!hash_map_slot_is_empty(src, i))
hash_map_do_insert(dst, hash_map_slot_entry(src, i)->k,
hash_map_slot_entry(src, i)->v);
}
static void hash_map_grow(struct hash_map *map) {
struct hash_map fresh;
hash_map_init(&fresh, map->size << 1);
hash_map_populate(&fresh, map);
hash_map_destroy(map);
memcpy(map, &fresh, sizeof(fresh));
}
static void hash_map_shrink(struct hash_map *map) {
struct hash_map fresh;
hash_map_init(&fresh, map->size >> 1);
hash_map_populate(&fresh, map);
hash_map_destroy(map);
memcpy(map, &fresh, sizeof(fresh));
}
static void hash_map_insert(struct hash_map *map, uintptr_t k, uintptr_t v) {
if (hash_map_should_grow(map))
hash_map_grow(map);
hash_map_do_insert(map, k, v);
}
static void hash_map_remove(struct hash_map *map, uintptr_t k) {
size_t slot = k;
while (!hash_map_slot_is_empty(map, slot) && hash_map_slot_entry(map, slot)->k != k)
slot++;
if (hash_map_slot_is_empty(map, slot))
__builtin_trap();
while (!hash_map_slot_is_empty(map, slot + 1)
&& hash_map_slot_distance(map, slot + 1)) {
*hash_map_slot_entry(map, slot) = *hash_map_slot_entry(map, slot + 1);
slot++;
}
hash_map_slot_release(map, slot);
if (hash_map_should_shrink(map))
hash_map_shrink(map);
}
static int hash_map_contains(struct hash_map *map, uintptr_t k) {
for (size_t slot = k; !hash_map_slot_is_empty(map, slot); slot++) {
if (hash_map_slot_entry(map, slot)->k == k)
return 1;
if (hash_map_slot_distance(map, slot) < (slot - k))
return 0;
}
return 0;
}
static uintptr_t hash_map_lookup(struct hash_map *map, uintptr_t k, uintptr_t default_) {
for (size_t slot = k; !hash_map_slot_is_empty(map, slot); slot++) {
if (hash_map_slot_entry(map, slot)->k == k)
return hash_map_slot_entry(map, slot)->v;
if (hash_map_slot_distance(map, slot) < (slot - k))
break;
}
return default_;
}
static inline void hash_map_for_each (struct hash_map *map,
void (*f)(uintptr_t, uintptr_t, void*),
void *data) __attribute__((always_inline));
static inline void hash_map_for_each(struct hash_map *map,
void (*f)(uintptr_t, uintptr_t, void*),
void *data) {
for (size_t i = 0; i < map->size; i++)
if (!hash_map_slot_is_empty(map, i))
f(hash_map_slot_entry(map, i)->k, hash_map_slot_entry(map, i)->v, data);
}
struct address_map {
struct hash_map hash_map;
};
static void address_map_init(struct address_map *map) {
hash_map_init(&map->hash_map, 8);
}
static void address_map_destroy(struct address_map *map) {
hash_map_destroy(&map->hash_map);
}
static void address_map_clear(struct address_map *map) {
hash_map_clear(&map->hash_map);
}
static void address_map_add(struct address_map *map, uintptr_t addr, uintptr_t v) {
hash_map_insert(&map->hash_map, hash_address(addr), v);
}
static void address_map_remove(struct address_map *map, uintptr_t addr) {
hash_map_remove(&map->hash_map, hash_address(addr));
}
static int address_map_contains(struct address_map *map, uintptr_t addr) {
return hash_map_contains(&map->hash_map, hash_address(addr));
}
static uintptr_t address_map_lookup(struct address_map *map, uintptr_t addr,
uintptr_t default_) {
return hash_map_lookup(&map->hash_map, hash_address(addr), default_);
}
struct address_map_for_each_data {
void (*f)(uintptr_t, uintptr_t, void *);
void *data;
};
static void address_map_do_for_each(uintptr_t k, uintptr_t v, void *data) {
struct address_map_for_each_data *for_each_data = data;
for_each_data->f(unhash_address(k), v, for_each_data->data);
}
static inline void address_map_for_each (struct address_map *map,
void (*f)(uintptr_t, uintptr_t, void*),
void *data) __attribute__((always_inline));
static inline void address_map_for_each (struct address_map *map,
void (*f)(uintptr_t, uintptr_t, void*),
void *data) {
struct address_map_for_each_data for_each_data = { f, data };
hash_map_for_each(&map->hash_map, address_map_do_for_each, &for_each_data);
}
#endif // ADDRESS_MAP_H

View file

@@ -0,0 +1,214 @@
#ifndef ADDRESS_SET_H
#define ADDRESS_SET_H
#include <malloc.h>
#include <stdint.h>
#include <string.h>
#include "address-hash.h"
#include "gc-assert.h"
struct hash_set {
uintptr_t *data;
size_t size; // total number of slots
size_t n_items; // number of items in set
uint8_t *bits; // bitvector indicating set slots
};
static void hash_set_clear(struct hash_set *set) {
memset(set->bits, 0, set->size / 8);
set->n_items = 0;
}
// Size must be a power of 2.
static void hash_set_init(struct hash_set *set, size_t size) {
set->size = size;
set->data = malloc(sizeof(uintptr_t) * size);
if (!set->data) GC_CRASH();
set->bits = malloc(size / 8);
if (!set->bits) GC_CRASH();
hash_set_clear(set);
}
static void hash_set_destroy(struct hash_set *set) {
free(set->data);
free(set->bits);
}
static size_t hash_set_slot_index(struct hash_set *set, size_t idx) {
return idx & (set->size - 1);
}
static int hash_set_slot_is_empty(struct hash_set *set, size_t idx) {
idx = hash_set_slot_index(set, idx);
return (set->bits[idx / 8] & (1 << (idx % 8))) == 0;
}
static uintptr_t hash_set_slot_ref(struct hash_set *set, size_t idx) {
return set->data[hash_set_slot_index(set, idx)];
}
static void hash_set_slot_set(struct hash_set *set, size_t idx, uintptr_t v) {
set->data[hash_set_slot_index(set, idx)] = v;
}
static void hash_set_slot_acquire(struct hash_set *set, size_t idx) {
idx = hash_set_slot_index(set, idx);
set->bits[idx / 8] |= (1 << (idx % 8));
set->n_items++;
}
static void hash_set_slot_release(struct hash_set *set, size_t idx) {
idx = hash_set_slot_index(set, idx);
set->bits[idx / 8] &= ~(1 << (idx % 8));
set->n_items--;
}
static size_t hash_set_slot_distance(struct hash_set *set, size_t idx) {
return hash_set_slot_index(set, idx - hash_set_slot_ref(set, idx));
}
static int hash_set_should_shrink(struct hash_set *set) {
return set->size > 8 && set->n_items <= (set->size >> 3);
}
static int hash_set_should_grow(struct hash_set *set) {
return set->n_items >= set->size - (set->size >> 3);
}
static void hash_set_do_insert(struct hash_set *set, uintptr_t v) {
size_t displacement = 0;
while (!hash_set_slot_is_empty(set, v + displacement)
&& displacement < hash_set_slot_distance(set, v + displacement))
displacement++;
while (!hash_set_slot_is_empty(set, v + displacement)
&& displacement == hash_set_slot_distance(set, v + displacement)) {
if (hash_set_slot_ref(set, v + displacement) == v)
return;
displacement++;
}
size_t idx = v + displacement;
size_t slots_to_move = 0;
while (!hash_set_slot_is_empty(set, idx + slots_to_move))
slots_to_move++;
hash_set_slot_acquire(set, idx + slots_to_move);
while (slots_to_move--)
hash_set_slot_set(set, idx + slots_to_move + 1,
hash_set_slot_ref(set, idx + slots_to_move));
hash_set_slot_set(set, idx, v);
}
static void hash_set_populate(struct hash_set *dst, struct hash_set *src) {
for (size_t i = 0; i < src->size; i++)
if (!hash_set_slot_is_empty(src, i))
hash_set_do_insert(dst, hash_set_slot_ref(src, i));
}
static void hash_set_grow(struct hash_set *set) {
struct hash_set fresh;
hash_set_init(&fresh, set->size << 1);
hash_set_populate(&fresh, set);
hash_set_destroy(set);
memcpy(set, &fresh, sizeof(fresh));
}
static void hash_set_shrink(struct hash_set *set) {
struct hash_set fresh;
hash_set_init(&fresh, set->size >> 1);
hash_set_populate(&fresh, set);
hash_set_destroy(set);
memcpy(set, &fresh, sizeof(fresh));
}
static void hash_set_insert(struct hash_set *set, uintptr_t v) {
if (hash_set_should_grow(set))
hash_set_grow(set);
hash_set_do_insert(set, v);
}
static void hash_set_remove(struct hash_set *set, uintptr_t v) {
size_t slot = v;
while (!hash_set_slot_is_empty(set, slot) && hash_set_slot_ref(set, slot) != v)
slot++;
if (hash_set_slot_is_empty(set, slot))
__builtin_trap();
while (!hash_set_slot_is_empty(set, slot + 1)
&& hash_set_slot_distance(set, slot + 1)) {
hash_set_slot_set(set, slot, hash_set_slot_ref(set, slot + 1));
slot++;
}
hash_set_slot_release(set, slot);
if (hash_set_should_shrink(set))
hash_set_shrink(set);
}
static int hash_set_contains(struct hash_set *set, uintptr_t v) {
for (size_t slot = v; !hash_set_slot_is_empty(set, slot); slot++) {
if (hash_set_slot_ref(set, slot) == v)
return 1;
if (hash_set_slot_distance(set, slot) < (slot - v))
return 0;
}
return 0;
}
static inline void hash_set_find(struct hash_set *set,
int (*f)(uintptr_t, void*), void *data) __attribute__((always_inline));
static inline void hash_set_find(struct hash_set *set,
int (*f)(uintptr_t, void*), void *data) {
for (size_t i = 0; i < set->size; i++)
if (!hash_set_slot_is_empty(set, i))
if (f(hash_set_slot_ref(set, i), data))
return;
}
struct address_set {
struct hash_set hash_set;
};
static void address_set_init(struct address_set *set) {
hash_set_init(&set->hash_set, 8);
}
static void address_set_destroy(struct address_set *set) {
hash_set_destroy(&set->hash_set);
}
static void address_set_clear(struct address_set *set) {
hash_set_clear(&set->hash_set);
}
static void address_set_add(struct address_set *set, uintptr_t addr) {
hash_set_insert(&set->hash_set, hash_address(addr));
}
static void address_set_remove(struct address_set *set, uintptr_t addr) {
hash_set_remove(&set->hash_set, hash_address(addr));
}
static int address_set_contains(struct address_set *set, uintptr_t addr) {
return hash_set_contains(&set->hash_set, hash_address(addr));
}
static void address_set_union(struct address_set *set, struct address_set *other) {
while (set->hash_set.size < other->hash_set.size)
hash_set_grow(&set->hash_set);
hash_set_populate(&set->hash_set, &other->hash_set);
}
struct address_set_for_each_data {
void (*f)(uintptr_t, void *);
void *data;
};
static int address_set_do_for_each(uintptr_t v, void *data) {
struct address_set_for_each_data *for_each_data = data;
for_each_data->f(unhash_address(v), for_each_data->data);
return 0;
}
static inline void address_set_for_each(struct address_set *set,
void (*f)(uintptr_t, void*), void *data) __attribute__((always_inline));
static inline void address_set_for_each(struct address_set *set,
void (*f)(uintptr_t, void*), void *data) {
struct address_set_for_each_data for_each_data = { f, data };
hash_set_find(&set->hash_set, address_set_do_for_each, &for_each_data);
}
struct address_set_find_data {
int (*f)(uintptr_t, void *);
void *data;
};
static int address_set_do_find(uintptr_t v, void *data) {
struct address_set_find_data *find_data = data;
return find_data->f(unhash_address(v), find_data->data);
}
static inline void address_set_find(struct address_set *set,
int (*f)(uintptr_t, void*), void *data) __attribute__((always_inline));
static inline void address_set_find(struct address_set *set,
int (*f)(uintptr_t, void*), void *data) {
struct address_set_find_data find_data = { f, data };
hash_set_find(&set->hash_set, address_set_do_find, &find_data);
}
#endif // ADDRESS_SET_H

View file

@@ -0,0 +1,16 @@
#ifndef ASSERT_H
#define ASSERT_H
#define STATIC_ASSERT_EQ(a, b) _Static_assert((a) == (b), "eq")
#define UNLIKELY(e) __builtin_expect(e, 0)
#define LIKELY(e) __builtin_expect(e, 1)
#ifndef NDEBUG
#define ASSERT(x) do { if (UNLIKELY(!(x))) __builtin_trap(); } while (0)
#else
#define ASSERT(x) do { } while (0)
#endif
#define ASSERT_EQ(a,b) ASSERT((a) == (b))
#endif // ASSERT_H

View file

@@ -0,0 +1,155 @@
#ifndef BACKGROUND_THREAD_H
#define BACKGROUND_THREAD_H
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"
#include "debug.h"
enum {
GC_BACKGROUND_TASK_START = 0,
GC_BACKGROUND_TASK_MIDDLE = 100,
GC_BACKGROUND_TASK_END = 200
};
struct gc_background_task {
int id;
int priority;
void (*run)(void *data);
void *data;
};
enum gc_background_thread_state {
GC_BACKGROUND_THREAD_STARTING,
GC_BACKGROUND_THREAD_RUNNING,
GC_BACKGROUND_THREAD_STOPPING
};
struct gc_background_thread {
size_t count;
size_t capacity;
struct gc_background_task *tasks;
int next_id;
enum gc_background_thread_state state;
pthread_t thread;
pthread_mutex_t lock;
pthread_cond_t cond;
};
static void*
gc_background_thread(void *data) {
struct gc_background_thread *thread = data;
pthread_mutex_lock(&thread->lock);
while (thread->state == GC_BACKGROUND_THREAD_STARTING)
pthread_cond_wait(&thread->cond, &thread->lock);
struct timespec ts;
if (clock_gettime(CLOCK_REALTIME, &ts)) {
perror("background thread: failed to get time!");
return NULL;
}
while (thread->state == GC_BACKGROUND_THREAD_RUNNING) {
ts.tv_sec += 1;
pthread_cond_timedwait(&thread->cond, &thread->lock, &ts);
if (thread->state == GC_BACKGROUND_THREAD_RUNNING)
for (size_t i = 0; i < thread->count; i++)
thread->tasks[i].run(thread->tasks[i].data);
}
pthread_mutex_unlock(&thread->lock);
return NULL;
}
static struct gc_background_thread*
gc_make_background_thread(void) {
struct gc_background_thread *thread;
thread = malloc(sizeof(*thread));
if (!thread)
GC_CRASH();
memset(thread, 0, sizeof(*thread));
thread->tasks = NULL;
thread->count = 0;
thread->capacity = 0;
thread->state = GC_BACKGROUND_THREAD_STARTING;
pthread_mutex_init(&thread->lock, NULL);
pthread_cond_init(&thread->cond, NULL);
if (pthread_create(&thread->thread, NULL, gc_background_thread, thread)) {
perror("spawning background thread failed");
GC_CRASH();
}
return thread;
}
static void
gc_background_thread_start(struct gc_background_thread *thread) {
pthread_mutex_lock(&thread->lock);
GC_ASSERT_EQ(thread->state, GC_BACKGROUND_THREAD_STARTING);
thread->state = GC_BACKGROUND_THREAD_RUNNING;
pthread_mutex_unlock(&thread->lock);
pthread_cond_signal(&thread->cond);
}
static int
gc_background_thread_add_task(struct gc_background_thread *thread,
int priority, void (*run)(void *data),
void *data) {
pthread_mutex_lock(&thread->lock);
if (thread->count == thread->capacity) {
size_t new_capacity = thread->capacity * 2 + 1;
struct gc_background_task *new_tasks =
realloc(thread->tasks, sizeof(struct gc_background_task) * new_capacity);
if (!new_tasks) {
perror("ran out of space for background tasks!");
GC_CRASH();
}
thread->capacity = new_capacity;
thread->tasks = new_tasks;
}
size_t insert = 0;
for (; insert < thread->count; insert++) {
if (priority < thread->tasks[insert].priority)
break;
}
size_t bytes_to_move =
(thread->count - insert) * sizeof(struct gc_background_task);
memmove(&thread->tasks[insert + 1], &thread->tasks[insert], bytes_to_move);
int id = thread->next_id++;
thread->tasks[insert].id = id;
thread->tasks[insert].priority = priority;
thread->tasks[insert].run = run;
thread->tasks[insert].data = data;
thread->count++;
pthread_mutex_unlock(&thread->lock);
return id;
}
static void
gc_background_thread_remove_task(struct gc_background_thread *thread,
int id) {
pthread_mutex_lock(&thread->lock);
size_t remove = 0;
for (; remove < thread->count; remove++) {
if (thread->tasks[remove].id == id)
break;
}
if (remove == thread->count)
GC_CRASH();
size_t bytes_to_move =
(thread->count - (remove + 1)) * sizeof(struct gc_background_task);
memmove(&thread->tasks[remove], &thread->tasks[remove + 1], bytes_to_move);
pthread_mutex_unlock(&thread->lock);
}
static void
gc_destroy_background_thread(struct gc_background_thread *thread) {
pthread_mutex_lock(&thread->lock);
GC_ASSERT(thread->state == GC_BACKGROUND_THREAD_RUNNING);
thread->state = GC_BACKGROUND_THREAD_STOPPING;
pthread_mutex_unlock(&thread->lock);
pthread_cond_signal(&thread->cond);
pthread_join(thread->thread, NULL);
free(thread->tasks);
free(thread);
}
#endif // BACKGROUND_THREAD_H

647
libguile/whippet/src/bdw.c Normal file
View file

@@ -0,0 +1,647 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "gc-api.h"
#include "gc-ephemeron.h"
#include "gc-tracepoint.h"
#define GC_IMPL 1
#include "gc-internal.h"
#include "bdw-attrs.h"
#if GC_PRECISE_ROOTS
#error bdw-gc is a conservative collector
#endif
#if !GC_CONSERVATIVE_ROOTS
#error bdw-gc is a conservative collector
#endif
#if !GC_CONSERVATIVE_TRACE
#error bdw-gc is a conservative collector
#endif
// When pthreads are used, let `libgc' know about it and redirect
// allocation calls such as `GC_MALLOC ()' to (contention-free, faster)
// thread-local allocation.
#define GC_THREADS 1
#define GC_REDIRECT_TO_LOCAL 1
// Don't #define pthread routines to their GC_pthread counterparts.
// Instead we will be careful inside the benchmarks to use API to
// register threads with libgc.
#define GC_NO_THREAD_REDIRECTS 1
#include <gc/gc.h>
#include <gc/gc_inline.h> /* GC_generic_malloc_many */
#include <gc/gc_mark.h> /* GC_generic_malloc */
#define GC_INLINE_GRANULE_WORDS 2
#define GC_INLINE_GRANULE_BYTES (sizeof(void *) * GC_INLINE_GRANULE_WORDS)
/* A freelist set contains GC_INLINE_FREELIST_COUNT pointers to singly
linked lists of objects of different sizes, the ith one containing
objects i + 1 granules in size. This setting of
GC_INLINE_FREELIST_COUNT will hold freelists for allocations of
up to 256 bytes. */
#define GC_INLINE_FREELIST_COUNT (256U / GC_INLINE_GRANULE_BYTES)
struct gc_heap {
struct gc_heap *freelist; // see mark_heap
pthread_mutex_t lock;
struct gc_heap_roots *roots;
struct gc_mutator *mutators;
struct gc_event_listener event_listener;
struct gc_finalizer_state *finalizer_state;
gc_finalizer_callback have_finalizers;
void *event_listener_data;
};
struct gc_mutator {
void *freelists[GC_INLINE_FREELIST_COUNT];
void *pointerless_freelists[GC_INLINE_FREELIST_COUNT];
struct gc_heap *heap;
struct gc_mutator_roots *roots;
struct gc_mutator *next; // with heap lock
struct gc_mutator **prev; // with heap lock
void *event_listener_data;
};
struct gc_heap *__the_bdw_gc_heap;
#define HEAP_EVENT(event, ...) do { \
__the_bdw_gc_heap->event_listener.event(__the_bdw_gc_heap->event_listener_data, \
##__VA_ARGS__); \
GC_TRACEPOINT(event, ##__VA_ARGS__); \
} while (0)
#define MUTATOR_EVENT(mut, event, ...) do { \
__the_bdw_gc_heap->event_listener.event(mut->event_listener_data, \
##__VA_ARGS__); \
GC_TRACEPOINT(event, ##__VA_ARGS__); \
} while (0)
static inline size_t gc_inline_bytes_to_freelist_index(size_t bytes) {
return (bytes - 1U) / GC_INLINE_GRANULE_BYTES;
}
static inline size_t gc_inline_freelist_object_size(size_t idx) {
return (idx + 1U) * GC_INLINE_GRANULE_BYTES;
}
struct gc_heap* gc_mutator_heap(struct gc_mutator *mutator) {
return __the_bdw_gc_heap;
}
uintptr_t gc_small_object_nursery_low_address(struct gc_heap *heap) {
GC_CRASH();
}
uintptr_t gc_small_object_nursery_high_address(struct gc_heap *heap) {
GC_CRASH();
}
// The values of these must match the internal POINTERLESS and NORMAL
// definitions in libgc, for which unfortunately there are no external
// definitions. Alack.
enum gc_inline_kind {
GC_INLINE_KIND_POINTERLESS,
GC_INLINE_KIND_NORMAL
};
static inline void *
allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) {
void *head = *freelist;
if (!head) {
size_t bytes = gc_inline_freelist_object_size(idx);
GC_generic_malloc_many(bytes, kind, freelist);
head = *freelist;
if (GC_UNLIKELY (!head)) {
fprintf(stderr, "ran out of space, heap size %zu\n",
GC_get_heap_size());
GC_CRASH();
}
}
*freelist = *(void **)(head);
if (kind == GC_INLINE_KIND_POINTERLESS)
memset(head, 0, gc_inline_freelist_object_size(idx));
return head;
}
void* gc_allocate_slow(struct gc_mutator *mut, size_t size,
enum gc_allocation_kind kind) {
GC_ASSERT(size != 0);
if (size <= gc_allocator_large_threshold()) {
size_t idx = gc_inline_bytes_to_freelist_index(size);
void **freelists;
enum gc_inline_kind freelist_kind;
switch (kind) {
case GC_ALLOCATION_TAGGED:
case GC_ALLOCATION_UNTAGGED_CONSERVATIVE:
return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL);
case GC_ALLOCATION_TAGGED_POINTERLESS:
case GC_ALLOCATION_UNTAGGED_POINTERLESS:
return allocate_small(&mut->pointerless_freelists[idx], idx,
GC_INLINE_KIND_POINTERLESS);
default:
GC_CRASH();
}
} else {
switch (kind) {
case GC_ALLOCATION_TAGGED:
case GC_ALLOCATION_UNTAGGED_CONSERVATIVE:
return GC_malloc(size);
case GC_ALLOCATION_TAGGED_POINTERLESS:
case GC_ALLOCATION_UNTAGGED_POINTERLESS: {
void *ret = GC_malloc_atomic(size);
memset(ret, 0, size);
return ret;
}
default:
GC_CRASH();
}
}
}
void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) {
// Nothing to do.
}
void gc_collect(struct gc_mutator *mut,
enum gc_collection_kind requested_kind) {
switch (requested_kind) {
case GC_COLLECTION_MINOR:
GC_collect_a_little();
break;
case GC_COLLECTION_ANY:
case GC_COLLECTION_MAJOR:
GC_gcollect();
break;
case GC_COLLECTION_COMPACTING:
GC_gcollect_and_unmap();
break;
default:
GC_CRASH();
}
}
int gc_object_is_old_generation_slow(struct gc_mutator *mut,
struct gc_ref obj) {
return 0;
}
void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj,
size_t obj_size, struct gc_edge edge,
struct gc_ref new_val) {
}
int* gc_safepoint_flag_loc(struct gc_mutator *mut) { GC_CRASH(); }
void gc_safepoint_slow(struct gc_mutator *mut) { GC_CRASH(); }
struct bdw_mark_state {
struct GC_ms_entry *mark_stack_ptr;
struct GC_ms_entry *mark_stack_limit;
};
static void bdw_mark_edge(struct gc_edge edge, struct gc_heap *heap,
void *visit_data) {
struct bdw_mark_state *state = visit_data;
uintptr_t addr = gc_ref_value(gc_edge_ref(edge));
state->mark_stack_ptr = GC_MARK_AND_PUSH ((void *) addr,
state->mark_stack_ptr,
state->mark_stack_limit,
NULL);
}
static int heap_gc_kind;
static int mutator_gc_kind;
static int ephemeron_gc_kind;
static int finalizer_gc_kind;
// In BDW-GC, we can't hook into the mark phase to call
// gc_trace_ephemerons_for_object, so the advertised ephemeron strategy
// doesn't really work. The primitives that we have are mark functions,
// which run during GC and can't allocate; finalizers, which run after
// GC and can allocate but can't add to the connectivity graph; and
// disappearing links, which are cleared at the end of marking, in the
// stop-the-world phase. It does not appear to be possible to implement
// ephemerons using these primitives. Instead fall back to weak-key
// tables.
struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) {
return GC_generic_malloc(gc_ephemeron_size(), ephemeron_gc_kind);
}
unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) {
return GC_get_gc_no();
}
void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron,
struct gc_ref key, struct gc_ref value) {
gc_ephemeron_init_internal(mut->heap, ephemeron, key, value);
if (GC_base((void*)gc_ref_value(key))) {
struct gc_ref *loc = gc_edge_loc(gc_ephemeron_key_edge(ephemeron));
GC_register_disappearing_link((void**)loc);
}
}
int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) {
// Pretend the key is traced, to avoid adding this ephemeron to the
// global table.
return 1;
}
struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) {
return GC_generic_malloc(gc_finalizer_size(), finalizer_gc_kind);
}
static void finalize_object(void *obj, void *data) {
struct gc_finalizer *f = data;
gc_finalizer_externally_fired(__the_bdw_gc_heap->finalizer_state, f);
}
void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer,
unsigned priority, struct gc_ref object,
struct gc_ref closure) {
// Don't bother much about the actual finalizer; just delegate to BDW-GC.
GC_finalization_proc prev = NULL;
void *prev_data = NULL;
gc_finalizer_init_internal(finalizer, object, closure);
gc_finalizer_externally_activated(finalizer);
GC_register_finalizer_no_order(gc_ref_heap_object(object), finalize_object,
finalizer, &prev, &prev_data);
// FIXME: Allow multiple finalizers per object.
GC_ASSERT(prev == NULL);
GC_ASSERT(prev_data == NULL);
}
struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut) {
GC_invoke_finalizers();
return gc_finalizer_state_pop(mut->heap->finalizer_state);
}
void gc_set_finalizer_callback(struct gc_heap *heap,
gc_finalizer_callback callback) {
heap->have_finalizers = callback;
}
static void have_finalizers(void) {
struct gc_heap *heap = __the_bdw_gc_heap;
if (heap->have_finalizers)
heap->have_finalizers(heap, 1);
}
static struct GC_ms_entry *
mark_ephemeron(GC_word *addr, struct GC_ms_entry *mark_stack_ptr,
struct GC_ms_entry *mark_stack_limit, GC_word env) {
struct bdw_mark_state state = {
mark_stack_ptr,
mark_stack_limit,
};
struct gc_ephemeron *ephemeron = (struct gc_ephemeron*) addr;
// If this ephemeron is on a freelist, its first word will be a
// freelist link and everything else will be NULL.
if (!gc_ref_value(gc_edge_ref(gc_ephemeron_value_edge(ephemeron)))) {
bdw_mark_edge(gc_edge(addr), NULL, &state);
return state.mark_stack_ptr;
}
if (!gc_ref_value(gc_edge_ref(gc_ephemeron_key_edge(ephemeron)))) {
// If the key died in a previous collection, the disappearing link
// will have been cleared. Mark the ephemeron as dead.
gc_ephemeron_mark_dead(ephemeron);
}
gc_trace_ephemeron(ephemeron, bdw_mark_edge, NULL, &state);
return state.mark_stack_ptr;
}
static struct GC_ms_entry *
mark_finalizer(GC_word *addr, struct GC_ms_entry *mark_stack_ptr,
struct GC_ms_entry *mark_stack_limit, GC_word env) {
struct bdw_mark_state state = {
mark_stack_ptr,
mark_stack_limit,
};
struct gc_finalizer *finalizer = (struct gc_finalizer*) addr;
// If this ephemeron is on a freelist, its first word will be a
// freelist link and everything else will be NULL.
if (!gc_ref_value(gc_finalizer_object(finalizer))) {
bdw_mark_edge(gc_edge(addr), NULL, &state);
return state.mark_stack_ptr;
}
gc_trace_finalizer(finalizer, bdw_mark_edge, NULL, &state);
return state.mark_stack_ptr;
}
static struct GC_ms_entry *
mark_heap(GC_word *addr, struct GC_ms_entry *mark_stack_ptr,
struct GC_ms_entry *mark_stack_limit, GC_word env) {
struct bdw_mark_state state = {
mark_stack_ptr,
mark_stack_limit,
};
struct gc_heap *heap = (struct gc_heap*) addr;
// If this heap is on a freelist... well probably we are screwed, BDW
// isn't really made to do multiple heaps in a process. But still, in
// this case, the first word is the freelist and the rest are null.
if (heap->freelist) {
bdw_mark_edge(gc_edge(addr), NULL, &state);
return state.mark_stack_ptr;
}
if (heap->roots)
gc_trace_heap_roots(heap->roots, bdw_mark_edge, heap, &state);
gc_visit_finalizer_roots(heap->finalizer_state, bdw_mark_edge, heap, &state);
state.mark_stack_ptr = GC_MARK_AND_PUSH (heap->mutators,
state.mark_stack_ptr,
state.mark_stack_limit,
NULL);
return state.mark_stack_ptr;
}
static struct GC_ms_entry *
mark_mutator(GC_word *addr, struct GC_ms_entry *mark_stack_ptr,
struct GC_ms_entry *mark_stack_limit, GC_word env) {
struct bdw_mark_state state = {
mark_stack_ptr,
mark_stack_limit,
};
struct gc_mutator *mut = (struct gc_mutator*) addr;
// If this mutator is on a freelist, its first word will be a
// freelist link and everything else will be NULL.
if (!mut->heap) {
bdw_mark_edge(gc_edge(addr), NULL, &state);
return state.mark_stack_ptr;
}
for (int i = 0; i < GC_INLINE_FREELIST_COUNT; i++)
state.mark_stack_ptr = GC_MARK_AND_PUSH (mut->freelists[i],
state.mark_stack_ptr,
state.mark_stack_limit,
NULL);
for (int i = 0; i < GC_INLINE_FREELIST_COUNT; i++)
for (void *head = mut->pointerless_freelists[i]; head; head = *(void**)head)
state.mark_stack_ptr = GC_MARK_AND_PUSH (head,
state.mark_stack_ptr,
state.mark_stack_limit,
NULL);
if (mut->roots)
gc_trace_mutator_roots(mut->roots, bdw_mark_edge, mut->heap, &state);
state.mark_stack_ptr = GC_MARK_AND_PUSH (mut->next,
state.mark_stack_ptr,
state.mark_stack_limit,
NULL);
return state.mark_stack_ptr;
}
static inline struct gc_mutator *add_mutator(struct gc_heap *heap) {
struct gc_mutator *ret =
GC_generic_malloc(sizeof(struct gc_mutator), mutator_gc_kind);
ret->heap = heap;
ret->event_listener_data =
heap->event_listener.mutator_added(heap->event_listener_data);
pthread_mutex_lock(&heap->lock);
ret->next = heap->mutators;
ret->prev = &heap->mutators;
if (ret->next)
ret->next->prev = &ret->next;
heap->mutators = ret;
pthread_mutex_unlock(&heap->lock);
return ret;
}
struct gc_options {
struct gc_common_options common;
};
int gc_option_from_string(const char *str) {
return gc_common_option_from_string(str);
}
struct gc_options* gc_allocate_options(void) {
struct gc_options *ret = malloc(sizeof(struct gc_options));
gc_init_common_options(&ret->common);
return ret;
}
int gc_options_set_int(struct gc_options *options, int option, int value) {
return gc_common_options_set_int(&options->common, option, value);
}
int gc_options_set_size(struct gc_options *options, int option,
size_t value) {
return gc_common_options_set_size(&options->common, option, value);
}
int gc_options_set_double(struct gc_options *options, int option,
double value) {
return gc_common_options_set_double(&options->common, option, value);
}
int gc_options_parse_and_set(struct gc_options *options, int option,
const char *value) {
return gc_common_options_parse_and_set(&options->common, option, value);
}
struct gc_pending_ephemerons *
gc_heap_pending_ephemerons(struct gc_heap *heap) {
GC_CRASH();
return NULL;
}
static void on_collection_event(GC_EventType event) {
switch (event) {
case GC_EVENT_START: {
HEAP_EVENT(requesting_stop);
HEAP_EVENT(waiting_for_stop);
break;
}
case GC_EVENT_MARK_START:
HEAP_EVENT(mutators_stopped);
HEAP_EVENT(prepare_gc, GC_COLLECTION_MAJOR);
break;
case GC_EVENT_MARK_END:
HEAP_EVENT(roots_traced);
HEAP_EVENT(heap_traced);
break;
case GC_EVENT_RECLAIM_START:
break;
case GC_EVENT_RECLAIM_END:
// Sloppily attribute finalizers and eager reclamation to
// ephemerons.
HEAP_EVENT(ephemerons_traced);
HEAP_EVENT(live_data_size, GC_get_heap_size() - GC_get_free_bytes());
break;
case GC_EVENT_END:
HEAP_EVENT(restarting_mutators);
break;
case GC_EVENT_PRE_START_WORLD:
case GC_EVENT_POST_STOP_WORLD:
// Can't rely on these, as they are only fired when threads are
// enabled.
break;
case GC_EVENT_THREAD_SUSPENDED:
case GC_EVENT_THREAD_UNSUSPENDED:
// No nice way to map back to the mutator.
break;
default:
break;
}
}
static void on_heap_resize(GC_word size) {
HEAP_EVENT(heap_resized, size);
}
uint64_t gc_allocation_counter(struct gc_heap *heap) {
return GC_get_total_bytes();
}
int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base,
struct gc_heap **heap, struct gc_mutator **mutator,
struct gc_event_listener event_listener,
void *event_listener_data) {
// Root the heap, which will also cause all mutators to be marked.
GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES);
GC_ASSERT_EQ(gc_allocator_large_threshold(),
GC_INLINE_FREELIST_COUNT * GC_INLINE_GRANULE_BYTES);
GC_ASSERT_EQ(__the_bdw_gc_heap, NULL);
if (!options) options = gc_allocate_options();
// Ignore stack base for main thread.
switch (options->common.heap_size_policy) {
case GC_HEAP_SIZE_FIXED:
GC_set_max_heap_size(options->common.heap_size);
break;
case GC_HEAP_SIZE_GROWABLE: {
if (options->common.maximum_heap_size)
GC_set_max_heap_size(options->common.maximum_heap_size);
// BDW uses a pretty weird heap-sizing heuristic:
//
// heap-size = live-data * (1 + (2 / GC_free_space_divisor))
// heap-size-multiplier = heap-size/live-data = 1 + 2/GC_free_space_divisor
// GC_free_space_divisor = 2/(heap-size-multiplier-1)
//
// (Assumption: your heap is mostly "composite", i.e. not
// "atomic". See bdw's alloc.c:min_bytes_allocd.)
double fsd = 2.0/(options->common.heap_size_multiplier - 1);
// But, the divisor is an integer. WTF. This caps the effective
// maximum heap multiplier at 3. Oh well.
GC_set_free_space_divisor(fsd + 0.51);
break;
}
case GC_HEAP_SIZE_ADAPTIVE:
default:
fprintf(stderr, "adaptive heap sizing unsupported by bdw-gc\n");
return 0;
}
GC_set_all_interior_pointers (0);
GC_set_finalize_on_demand (1);
GC_set_finalizer_notifier(have_finalizers);
// Not part of 7.3, sigh. Have to set an env var.
// GC_set_markers_count(options->common.parallelism);
char markers[21] = {0,}; // 21 bytes enough for 2**64 in decimal + NUL.
snprintf(markers, sizeof(markers), "%d", options->common.parallelism);
setenv("GC_MARKERS", markers, 1);
GC_init();
size_t current_heap_size = GC_get_heap_size();
if (options->common.heap_size > current_heap_size)
GC_expand_hp(options->common.heap_size - current_heap_size);
GC_allow_register_threads();
{
int add_size_to_descriptor = 0;
int clear_memory = 1;
heap_gc_kind = GC_new_kind(GC_new_free_list(),
GC_MAKE_PROC(GC_new_proc(mark_heap), 0),
add_size_to_descriptor, clear_memory);
mutator_gc_kind = GC_new_kind(GC_new_free_list(),
GC_MAKE_PROC(GC_new_proc(mark_mutator), 0),
add_size_to_descriptor, clear_memory);
ephemeron_gc_kind = GC_new_kind(GC_new_free_list(),
GC_MAKE_PROC(GC_new_proc(mark_ephemeron), 0),
add_size_to_descriptor, clear_memory);
finalizer_gc_kind = GC_new_kind(GC_new_free_list(),
GC_MAKE_PROC(GC_new_proc(mark_finalizer), 0),
add_size_to_descriptor, clear_memory);
}
*heap = GC_generic_malloc(sizeof(struct gc_heap), heap_gc_kind);
pthread_mutex_init(&(*heap)->lock, NULL);
(*heap)->event_listener = event_listener;
(*heap)->event_listener_data = event_listener_data;
(*heap)->finalizer_state = gc_make_finalizer_state();
__the_bdw_gc_heap = *heap;
HEAP_EVENT(init, GC_get_heap_size());
GC_set_on_collection_event(on_collection_event);
GC_set_on_heap_resize(on_heap_resize);
*mutator = add_mutator(*heap);
// Sanity check.
if (!GC_is_visible (&__the_bdw_gc_heap))
abort ();
return 1;
}
struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base,
struct gc_heap *heap) {
struct GC_stack_base base = { stack_base };
GC_register_my_thread(&base);
return add_mutator(heap);
}
void gc_finish_for_thread(struct gc_mutator *mut) {
pthread_mutex_lock(&mut->heap->lock);
MUTATOR_EVENT(mut, mutator_removed);
*mut->prev = mut->next;
if (mut->next)
mut->next->prev = mut->prev;
pthread_mutex_unlock(&mut->heap->lock);
GC_unregister_my_thread();
}
void* gc_call_without_gc(struct gc_mutator *mut,
void* (*f)(void*),
void *data) {
return GC_do_blocking(f, data);
}
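// Illustrative embedder startup for this BDW backend (a sketch, not in
// the original file; `listener`, `listener_data`, `stack_base`,
// `blocking_fn` and `data` are placeholder names). The stack base is
// ignored for the main thread, as noted in gc_init above.
//
//   struct gc_heap *heap; struct gc_mutator *mut;
//   gc_init(NULL, NULL, &heap, &mut, listener, listener_data);
//   ...
//   // On each additional thread:
//   struct gc_mutator *m = gc_init_for_thread(stack_base, heap);
//   ... allocate; wrap blocking calls in
//       gc_call_without_gc(m, blocking_fn, data) ...
//   gc_finish_for_thread(m);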
void gc_mutator_set_roots(struct gc_mutator *mut,
struct gc_mutator_roots *roots) {
mut->roots = roots;
}
void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) {
heap->roots = roots;
}
void gc_heap_set_extern_space(struct gc_heap *heap,
struct gc_extern_space *space) {
}

View file

@@ -0,0 +1,979 @@
#ifndef COPY_SPACE_H
#define COPY_SPACE_H
#include <pthread.h>
#include <stdlib.h>
#include "gc-api.h"
#define GC_IMPL 1
#include "gc-internal.h"
#include "assert.h"
#include "background-thread.h"
#include "debug.h"
#include "extents.h"
#include "gc-align.h"
#include "gc-attrs.h"
#include "gc-inline.h"
#include "gc-lock.h"
#include "gc-platform.h"
#include "spin.h"
// A copy space: a block-structured space that traces via evacuation.
#define COPY_SPACE_SLAB_SIZE (64 * 1024 * 1024)
#define COPY_SPACE_REGION_SIZE (64 * 1024)
#define COPY_SPACE_BLOCK_SIZE (2 * COPY_SPACE_REGION_SIZE)
#define COPY_SPACE_BLOCKS_PER_SLAB \
(COPY_SPACE_SLAB_SIZE / COPY_SPACE_BLOCK_SIZE)
#define COPY_SPACE_HEADER_BYTES_PER_BLOCK \
(COPY_SPACE_BLOCK_SIZE / COPY_SPACE_BLOCKS_PER_SLAB)
#define COPY_SPACE_HEADER_BLOCKS_PER_SLAB 1
#define COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB \
(COPY_SPACE_BLOCKS_PER_SLAB - COPY_SPACE_HEADER_BLOCKS_PER_SLAB)
#define COPY_SPACE_HEADER_BYTES_PER_SLAB \
(COPY_SPACE_HEADER_BYTES_PER_BLOCK * COPY_SPACE_HEADER_BLOCKS_PER_SLAB)
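// With the constants above (illustrative arithmetic, not in the
// original source): a slab is 64 MB and a block 128 kB, so there are
// 512 blocks per slab and each block header gets 128 kB / 512 = 256
// bytes. The 256-byte slab header plus the 511 non-header block
// headers (512 * 256 bytes = 128 kB in total) fill exactly the one
// block reserved as COPY_SPACE_HEADER_BLOCKS_PER_SLAB.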
struct copy_space_slab;
struct copy_space_slab_header {
union {
struct {
struct copy_space_slab *next;
struct copy_space_slab *prev;
unsigned incore_block_count;
};
uint8_t padding[COPY_SPACE_HEADER_BYTES_PER_SLAB];
};
};
STATIC_ASSERT_EQ(sizeof(struct copy_space_slab_header),
COPY_SPACE_HEADER_BYTES_PER_SLAB);
// Really just the block header.
struct copy_space_block {
union {
struct {
struct copy_space_block *next;
uint8_t in_core;
uint8_t all_zeroes[2];
uint8_t is_survivor[2];
size_t allocated; // For partly-empty blocks.
};
uint8_t padding[COPY_SPACE_HEADER_BYTES_PER_BLOCK];
};
};
STATIC_ASSERT_EQ(sizeof(struct copy_space_block),
COPY_SPACE_HEADER_BYTES_PER_BLOCK);
struct copy_space_region {
char data[COPY_SPACE_REGION_SIZE];
};
struct copy_space_block_payload {
struct copy_space_region regions[2];
};
struct copy_space_slab {
struct copy_space_slab_header header;
struct copy_space_block headers[COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB];
struct copy_space_block_payload blocks[COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB];
};
STATIC_ASSERT_EQ(sizeof(struct copy_space_slab), COPY_SPACE_SLAB_SIZE);
static inline struct copy_space_block*
copy_space_block_for_addr(uintptr_t addr) {
uintptr_t base = align_down(addr, COPY_SPACE_SLAB_SIZE);
struct copy_space_slab *slab = (struct copy_space_slab*) base;
uintptr_t block_idx =
(addr / COPY_SPACE_BLOCK_SIZE) % COPY_SPACE_BLOCKS_PER_SLAB;
return &slab->headers[block_idx - COPY_SPACE_HEADER_BLOCKS_PER_SLAB];
}
static inline struct copy_space_block*
copy_space_block_header(struct copy_space_block_payload *payload) {
return copy_space_block_for_addr((uintptr_t) payload);
}
static inline struct copy_space_block_payload*
copy_space_block_payload(struct copy_space_block *block) {
uintptr_t addr = (uintptr_t) block;
uintptr_t base = align_down(addr, COPY_SPACE_SLAB_SIZE);
struct copy_space_slab *slab = (struct copy_space_slab*) base;
uintptr_t block_idx =
(addr / COPY_SPACE_HEADER_BYTES_PER_BLOCK) % COPY_SPACE_BLOCKS_PER_SLAB;
return &slab->blocks[block_idx - COPY_SPACE_HEADER_BLOCKS_PER_SLAB];
}
static uint8_t
copy_space_object_region(struct gc_ref obj) {
return (gc_ref_value(obj) / COPY_SPACE_REGION_SIZE) & 1;
}
#define COPY_SPACE_PAGE_OUT_QUEUE_SIZE 4
struct copy_space_block_list {
struct copy_space_block *head;
};
struct copy_space_block_stack {
struct copy_space_block_list list;
};
enum copy_space_flags {
COPY_SPACE_ATOMIC_FORWARDING = 1,
COPY_SPACE_ALIGNED = 2,
COPY_SPACE_HAS_FIELD_LOGGING_BITS = 4,
};
struct copy_space {
pthread_mutex_t lock;
struct copy_space_block_stack empty;
struct copy_space_block_stack partly_full;
struct copy_space_block_list full ALIGNED_TO_AVOID_FALSE_SHARING;
size_t allocated_bytes;
size_t fragmentation;
struct copy_space_block_stack paged_out[COPY_SPACE_PAGE_OUT_QUEUE_SIZE]
ALIGNED_TO_AVOID_FALSE_SHARING;
ssize_t bytes_to_page_out ALIGNED_TO_AVOID_FALSE_SHARING;
// The rest of these members are only changed rarely and with the heap
// lock.
uint8_t active_region ALIGNED_TO_AVOID_FALSE_SHARING;
uint8_t atomic_forward;
uint8_t in_gc;
uint32_t flags;
size_t allocated_bytes_at_last_gc;
size_t fragmentation_at_last_gc;
struct extents *extents;
struct copy_space_slab **slabs;
size_t nslabs;
};
enum copy_space_forward_result {
// We went to forward an edge, but the target was already forwarded, so we
// just updated the edge.
COPY_SPACE_FORWARD_UPDATED,
// We went to forward an edge and evacuated the referent to a new location.
COPY_SPACE_FORWARD_EVACUATED,
// We went to forward an edge but failed to acquire memory for its new
// location.
COPY_SPACE_FORWARD_FAILED,
};
struct copy_space_allocator {
uintptr_t hp;
uintptr_t limit;
struct copy_space_block *block;
};
static struct gc_lock
copy_space_lock(struct copy_space *space) {
return gc_lock_acquire(&space->lock);
}
static void
copy_space_block_list_push(struct copy_space_block_list *list,
struct copy_space_block *block) {
struct copy_space_block *next =
atomic_load_explicit(&list->head, memory_order_acquire);
do {
block->next = next;
} while (!atomic_compare_exchange_weak(&list->head, &next, block));
}
static struct copy_space_block*
copy_space_block_list_pop(struct copy_space_block_list *list) {
struct copy_space_block *head =
atomic_load_explicit(&list->head, memory_order_acquire);
struct copy_space_block *next;
do {
if (!head)
return NULL;
} while (!atomic_compare_exchange_weak(&list->head, &head, head->next));
head->next = NULL;
return head;
}
static void
copy_space_block_stack_push(struct copy_space_block_stack *stack,
struct copy_space_block *block,
const struct gc_lock *lock) {
struct copy_space_block *next = stack->list.head;
block->next = next;
stack->list.head = block;
}
static struct copy_space_block*
copy_space_block_stack_pop(struct copy_space_block_stack *stack,
const struct gc_lock *lock) {
struct copy_space_block *head = stack->list.head;
if (head) {
stack->list.head = head->next;
head->next = NULL;
}
return head;
}
static struct copy_space_block*
copy_space_pop_empty_block(struct copy_space *space,
const struct gc_lock *lock) {
struct copy_space_block *ret = copy_space_block_stack_pop(&space->empty,
lock);
if (ret) {
ret->allocated = 0;
ret->is_survivor[space->active_region] = 0;
}
return ret;
}
static void
copy_space_push_empty_block(struct copy_space *space,
struct copy_space_block *block,
const struct gc_lock *lock) {
copy_space_block_stack_push(&space->empty, block, lock);
}
static struct copy_space_block*
copy_space_pop_full_block(struct copy_space *space) {
return copy_space_block_list_pop(&space->full);
}
static void
copy_space_push_full_block(struct copy_space *space,
struct copy_space_block *block) {
if (space->in_gc)
block->is_survivor[space->active_region] = 1;
copy_space_block_list_push(&space->full, block);
}
static struct copy_space_block*
copy_space_pop_partly_full_block(struct copy_space *space,
const struct gc_lock *lock) {
return copy_space_block_stack_pop(&space->partly_full, lock);
}
static void
copy_space_push_partly_full_block(struct copy_space *space,
struct copy_space_block *block,
const struct gc_lock *lock) {
copy_space_block_stack_push(&space->partly_full, block, lock);
}
static void
copy_space_page_out_block(struct copy_space *space,
struct copy_space_block *block,
const struct gc_lock *lock) {
copy_space_block_stack_push
(block->in_core
? &space->paged_out[0]
: &space->paged_out[COPY_SPACE_PAGE_OUT_QUEUE_SIZE-1],
block,
lock);
}
static struct copy_space_block*
copy_space_page_in_block(struct copy_space *space,
const struct gc_lock *lock) {
for (int age = 0; age < COPY_SPACE_PAGE_OUT_QUEUE_SIZE; age++) {
struct copy_space_block *block =
copy_space_block_stack_pop(&space->paged_out[age], lock);
if (block) return block;
}
return NULL;
}
static ssize_t
copy_space_request_release_memory(struct copy_space *space, size_t bytes) {
return atomic_fetch_add(&space->bytes_to_page_out, bytes) + bytes;
}
static int
copy_space_page_out_blocks_until_memory_released(struct copy_space *space) {
ssize_t pending = atomic_load(&space->bytes_to_page_out);
struct gc_lock lock = copy_space_lock(space);
while (pending > 0) {
struct copy_space_block *block = copy_space_pop_empty_block(space, &lock);
if (!block) break;
copy_space_page_out_block(space, block, &lock);
pending = (atomic_fetch_sub(&space->bytes_to_page_out, COPY_SPACE_BLOCK_SIZE)
- COPY_SPACE_BLOCK_SIZE);
}
gc_lock_release(&lock);
return pending <= 0;
}
static ssize_t
copy_space_maybe_reacquire_memory(struct copy_space *space, size_t bytes) {
ssize_t pending =
atomic_fetch_sub(&space->bytes_to_page_out, bytes) - bytes;
struct gc_lock lock = copy_space_lock(space);
while (pending + COPY_SPACE_BLOCK_SIZE <= 0) {
struct copy_space_block *block = copy_space_page_in_block(space, &lock);
if (!block) break;
copy_space_push_empty_block(space, block, &lock);
pending = (atomic_fetch_add(&space->bytes_to_page_out,
COPY_SPACE_BLOCK_SIZE)
+ COPY_SPACE_BLOCK_SIZE);
}
gc_lock_release(&lock);
return pending;
}
static void
copy_space_reacquire_memory(struct copy_space *space, size_t bytes) {
ssize_t pending = copy_space_maybe_reacquire_memory(space, bytes);
GC_ASSERT(pending + COPY_SPACE_BLOCK_SIZE > 0);
}
static inline int
copy_space_contains_address(struct copy_space *space, uintptr_t addr) {
return extents_contain_addr(space->extents, addr);
}
static inline int
copy_space_contains(struct copy_space *space, struct gc_ref ref) {
return copy_space_contains_address(space, gc_ref_value(ref));
}
static int
copy_space_has_field_logging_bits(struct copy_space *space) {
return space->flags & COPY_SPACE_HAS_FIELD_LOGGING_BITS;
}
static size_t
copy_space_field_logging_blocks(struct copy_space *space) {
if (!copy_space_has_field_logging_bits(space))
return 0;
size_t bytes = COPY_SPACE_SLAB_SIZE / sizeof (uintptr_t) / 8;
size_t blocks =
align_up(bytes, COPY_SPACE_BLOCK_SIZE) / COPY_SPACE_BLOCK_SIZE;
return blocks;
}
static uint8_t*
copy_space_field_logged_byte(struct gc_edge edge) {
uintptr_t addr = gc_edge_address(edge);
uintptr_t base = align_down(addr, COPY_SPACE_SLAB_SIZE);
base += offsetof(struct copy_space_slab, blocks);
uintptr_t field = (addr & (COPY_SPACE_SLAB_SIZE - 1)) / sizeof(uintptr_t);
uintptr_t byte = field / 8;
return (uint8_t*) (base + byte);
}
static uint8_t
copy_space_field_logged_bit(struct gc_edge edge) {
// Each byte has 8 bits, covering 8 fields.
size_t field = gc_edge_address(edge) / sizeof(uintptr_t);
return 1 << (field % 8);
}
static void
copy_space_clear_field_logged_bits_for_region(struct copy_space *space,
void *region_base) {
uintptr_t addr = (uintptr_t)region_base;
GC_ASSERT_EQ(addr, align_down(addr, COPY_SPACE_REGION_SIZE));
GC_ASSERT(copy_space_contains_address(space, addr));
if (copy_space_has_field_logging_bits(space))
memset(copy_space_field_logged_byte(gc_edge(region_base)),
0,
COPY_SPACE_REGION_SIZE / sizeof(uintptr_t) / 8);
}
static void
copy_space_clear_field_logged_bits_for_block(struct copy_space *space,
struct copy_space_block *block) {
struct copy_space_block_payload *payload = copy_space_block_payload(block);
copy_space_clear_field_logged_bits_for_region(space, &payload->regions[0]);
copy_space_clear_field_logged_bits_for_region(space, &payload->regions[1]);
}
static inline void
copy_space_allocator_set_block(struct copy_space_allocator *alloc,
struct copy_space_block *block,
int active_region) {
struct copy_space_block_payload *payload = copy_space_block_payload(block);
struct copy_space_region *region = &payload->regions[active_region];
alloc->block = block;
alloc->hp = (uintptr_t)&region[0];
alloc->limit = (uintptr_t)&region[1];
}
static inline int
copy_space_allocator_acquire_block(struct copy_space_allocator *alloc,
struct copy_space_block *block,
int active_region) {
if (block) {
copy_space_allocator_set_block(alloc, block, active_region);
return 1;
}
return 0;
}
static int
copy_space_allocator_acquire_empty_block(struct copy_space_allocator *alloc,
struct copy_space *space) {
struct gc_lock lock = copy_space_lock(space);
struct copy_space_block *block = copy_space_pop_empty_block(space, &lock);
gc_lock_release(&lock);
if (copy_space_allocator_acquire_block(alloc, block, space->active_region)) {
block->in_core = 1;
if (block->all_zeroes[space->active_region]) {
block->all_zeroes[space->active_region] = 0;
} else {
memset((char*)alloc->hp, 0, COPY_SPACE_REGION_SIZE);
copy_space_clear_field_logged_bits_for_region(space, (void*)alloc->hp);
}
return 1;
}
return 0;
}
static int
copy_space_allocator_acquire_partly_full_block(struct copy_space_allocator *alloc,
struct copy_space *space) {
struct gc_lock lock = copy_space_lock(space);
struct copy_space_block *block = copy_space_pop_partly_full_block(space,
&lock);
gc_lock_release(&lock);
if (copy_space_allocator_acquire_block(alloc, block, space->active_region)) {
alloc->hp += block->allocated;
return 1;
}
return 0;
}
static void
copy_space_allocator_release_full_block(struct copy_space_allocator *alloc,
struct copy_space *space) {
size_t fragmentation = alloc->limit - alloc->hp;
size_t allocated = COPY_SPACE_REGION_SIZE - alloc->block->allocated;
atomic_fetch_add_explicit(&space->allocated_bytes, allocated,
memory_order_relaxed);
if (fragmentation)
atomic_fetch_add_explicit(&space->fragmentation, fragmentation,
memory_order_relaxed);
copy_space_push_full_block(space, alloc->block);
alloc->hp = alloc->limit = 0;
alloc->block = NULL;
}
static void
copy_space_allocator_release_partly_full_block(struct copy_space_allocator *alloc,
struct copy_space *space) {
size_t allocated = alloc->hp & (COPY_SPACE_REGION_SIZE - 1);
if (allocated) {
atomic_fetch_add_explicit(&space->allocated_bytes,
allocated - alloc->block->allocated,
memory_order_relaxed);
alloc->block->allocated = allocated;
struct gc_lock lock = copy_space_lock(space);
copy_space_push_partly_full_block(space, alloc->block, &lock);
gc_lock_release(&lock);
} else {
// In this case, hp was bumped all the way to the limit, in which
// case allocated wraps to 0; the block is full.
atomic_fetch_add_explicit(&space->allocated_bytes,
COPY_SPACE_REGION_SIZE - alloc->block->allocated,
memory_order_relaxed);
copy_space_push_full_block(space, alloc->block);
}
alloc->hp = alloc->limit = 0;
alloc->block = NULL;
}
static inline struct gc_ref
copy_space_allocate(struct copy_space_allocator *alloc,
struct copy_space *space,
size_t size) {
GC_ASSERT(size > 0);
GC_ASSERT(size <= gc_allocator_large_threshold());
size = align_up(size, gc_allocator_small_granule_size());
if (alloc->hp + size <= alloc->limit)
goto done;
if (alloc->block)
copy_space_allocator_release_full_block(alloc, space);
while (copy_space_allocator_acquire_partly_full_block(alloc, space)) {
if (alloc->hp + size <= alloc->limit)
goto done;
copy_space_allocator_release_full_block(alloc, space);
}
if (!copy_space_allocator_acquire_empty_block(alloc, space))
return gc_ref_null();
// The newly acquired block is empty and is therefore large enough for
// a small allocation.
done:
struct gc_ref ret = gc_ref(alloc->hp);
alloc->hp += size;
return ret;
}
static struct copy_space_block*
copy_space_append_block_lists(struct copy_space_block *head,
struct copy_space_block *tail) {
if (!head) return tail;
if (tail) {
struct copy_space_block *walk = head;
while (walk->next)
walk = walk->next;
walk->next = tail;
}
return head;
}
static void
copy_space_flip(struct copy_space *space) {
// Mutators stopped, can access nonatomically.
struct copy_space_block* flip = space->full.head;
flip = copy_space_append_block_lists(space->partly_full.list.head, flip);
flip = copy_space_append_block_lists(space->empty.list.head, flip);
space->empty.list.head = flip;
space->partly_full.list.head = NULL;
space->full.head = NULL;
space->allocated_bytes = 0;
space->fragmentation = 0;
space->active_region ^= 1;
space->in_gc = 1;
}
static inline void
copy_space_allocator_init(struct copy_space_allocator *alloc) {
memset(alloc, 0, sizeof(*alloc));
}
static inline void
copy_space_allocator_finish(struct copy_space_allocator *alloc,
struct copy_space *space) {
if (alloc->block)
copy_space_allocator_release_partly_full_block(alloc, space);
}
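// Illustrative allocation sequence (a sketch, not in the original
// header), assuming `space` was set up with copy_space_init below:
//
//   struct copy_space_allocator alloc;
//   copy_space_allocator_init(&alloc);
//   struct gc_ref obj = copy_space_allocate(&alloc, space, 32);
//   if (gc_ref_is_null(obj))
//     ... collect or expand the space, then retry ...
//   copy_space_allocator_finish(&alloc, space);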
static void
copy_space_finish_gc(struct copy_space *space, int is_minor_gc) {
// Mutators stopped, can access nonatomically.
if (is_minor_gc) {
// Avoid mixing survivors and new objects on the same blocks.
struct copy_space_allocator alloc;
copy_space_allocator_init(&alloc);
while (copy_space_allocator_acquire_partly_full_block(&alloc, space))
copy_space_allocator_release_full_block(&alloc, space);
copy_space_allocator_finish(&alloc, space);
}
space->allocated_bytes_at_last_gc = space->allocated_bytes;
space->fragmentation_at_last_gc = space->fragmentation;
space->in_gc = 0;
}
static size_t
copy_space_can_allocate(struct copy_space *space, size_t bytes) {
// With lock!
size_t count = 0;
for (struct copy_space_block *empties = space->empty.list.head;
empties && count < bytes;
empties = empties->next) {
count += COPY_SPACE_REGION_SIZE;
}
return count;
}
static void
copy_space_add_to_allocation_counter(struct copy_space *space,
uint64_t *counter) {
*counter += space->allocated_bytes - space->allocated_bytes_at_last_gc;
}
static void
copy_space_gc_during_evacuation(void *data) {
// If space is really tight and reordering of objects during
// evacuation resulted in more end-of-block fragmentation and thus
// block use than before collection started, we can actually run out
// of memory while collecting. We should probably attempt to expand
// the heap here, at least by a single block; it's better than the
// alternatives.
fprintf(stderr, "Out of memory\n");
GC_CRASH();
}
static inline enum copy_space_forward_result
copy_space_forward_atomic(struct copy_space *space, struct gc_edge edge,
struct gc_ref old_ref,
struct copy_space_allocator *alloc) {
struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref);
retry:
if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED)
gc_atomic_forward_acquire(&fwd);
switch (fwd.state) {
case GC_FORWARDING_STATE_NOT_FORWARDED:
default:
// Impossible.
GC_CRASH();
case GC_FORWARDING_STATE_ACQUIRED: {
// We claimed the object successfully; evacuating is up to us.
size_t bytes = gc_atomic_forward_object_size(&fwd);
struct gc_ref new_ref = copy_space_allocate(alloc, space, bytes);
if (gc_ref_is_null(new_ref)) {
gc_atomic_forward_abort(&fwd);
return COPY_SPACE_FORWARD_FAILED;
}
// Copy object contents before committing, as we don't know what
// part of the object (if any) will be overwritten by the
// commit.
memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), bytes);
gc_atomic_forward_commit(&fwd, new_ref);
gc_edge_update(edge, new_ref);
return COPY_SPACE_FORWARD_EVACUATED;
}
case GC_FORWARDING_STATE_BUSY:
// Someone else claimed this object first. Spin until new address
// known, or evacuation aborts.
for (size_t spin_count = 0;; spin_count++) {
if (gc_atomic_forward_retry_busy(&fwd))
goto retry;
yield_for_spin(spin_count);
}
GC_CRASH(); // Unreachable.
case GC_FORWARDING_STATE_FORWARDED:
// The object has been evacuated already. Update the edge;
// whoever forwarded the object will make sure it's eventually
// traced.
gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd)));
return COPY_SPACE_FORWARD_UPDATED;
}
}
static int
copy_space_forward_if_traced_atomic(struct copy_space *space,
struct gc_edge edge,
struct gc_ref old_ref) {
struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref);
retry:
switch (fwd.state) {
case GC_FORWARDING_STATE_NOT_FORWARDED:
return 0;
case GC_FORWARDING_STATE_BUSY:
// Someone else claimed this object first. Spin until new address
// known.
for (size_t spin_count = 0;; spin_count++) {
if (gc_atomic_forward_retry_busy(&fwd))
goto retry;
yield_for_spin(spin_count);
}
GC_CRASH(); // Unreachable.
case GC_FORWARDING_STATE_FORWARDED:
gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd)));
return 1;
default:
GC_CRASH();
}
}
static inline enum copy_space_forward_result
copy_space_forward_nonatomic(struct copy_space *space, struct gc_edge edge,
struct gc_ref old_ref,
struct copy_space_allocator *alloc) {
uintptr_t forwarded = gc_object_forwarded_nonatomic(old_ref);
if (forwarded) {
gc_edge_update(edge, gc_ref(forwarded));
return COPY_SPACE_FORWARD_UPDATED;
} else {
size_t size;
gc_trace_object(old_ref, NULL, NULL, NULL, &size);
struct gc_ref new_ref = copy_space_allocate(alloc, space, size);
if (gc_ref_is_null(new_ref))
return COPY_SPACE_FORWARD_FAILED;
memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), size);
gc_object_forward_nonatomic(old_ref, new_ref);
gc_edge_update(edge, new_ref);
return COPY_SPACE_FORWARD_EVACUATED;
}
}
static int
copy_space_forward_if_traced_nonatomic(struct copy_space *space,
struct gc_edge edge,
struct gc_ref old_ref) {
uintptr_t forwarded = gc_object_forwarded_nonatomic(old_ref);
if (forwarded) {
gc_edge_update(edge, gc_ref(forwarded));
return 1;
}
return 0;
}
static inline enum copy_space_forward_result
copy_space_forward(struct copy_space *src_space, struct copy_space *dst_space,
struct gc_edge edge,
struct gc_ref old_ref,
struct copy_space_allocator *dst_alloc) {
GC_ASSERT(copy_space_contains(src_space, old_ref));
GC_ASSERT(src_space != dst_space
|| copy_space_object_region(old_ref) != src_space->active_region);
if (GC_PARALLEL && src_space->atomic_forward)
return copy_space_forward_atomic(dst_space, edge, old_ref, dst_alloc);
return copy_space_forward_nonatomic(dst_space, edge, old_ref, dst_alloc);
}
static inline int
copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge,
struct gc_ref old_ref) {
GC_ASSERT(copy_space_contains(space, old_ref));
GC_ASSERT(copy_space_object_region(old_ref) != space->active_region);
if (GC_PARALLEL && space->atomic_forward)
return copy_space_forward_if_traced_atomic(space, edge, old_ref);
return copy_space_forward_if_traced_nonatomic(space, edge, old_ref);
}
static int
copy_space_is_aligned(struct copy_space *space) {
return space->flags & COPY_SPACE_ALIGNED;
}
static int
copy_space_fixed_size(struct copy_space *space) {
// If the extent is aligned, it is fixed.
return copy_space_is_aligned(space);
}
static inline uintptr_t
copy_space_low_aligned_address(struct copy_space *space) {
GC_ASSERT(copy_space_is_aligned(space));
GC_ASSERT_EQ(space->extents->size, 1);
return space->extents->ranges[0].lo_addr;
}
static inline uintptr_t
copy_space_high_aligned_address(struct copy_space *space) {
GC_ASSERT(copy_space_is_aligned(space));
GC_ASSERT_EQ(space->extents->size, 1);
return space->extents->ranges[0].hi_addr;
}
static inline int
copy_space_contains_address_aligned(struct copy_space *space, uintptr_t addr) {
uintptr_t low_addr = copy_space_low_aligned_address(space);
uintptr_t high_addr = copy_space_high_aligned_address(space);
uintptr_t size = high_addr - low_addr;
return (addr - low_addr) < size;
}
static inline int
copy_space_contains_edge_aligned(struct copy_space *space,
struct gc_edge edge) {
return copy_space_contains_address_aligned(space, gc_edge_address(edge));
}
static inline int
copy_space_should_promote(struct copy_space *space, struct gc_ref ref) {
GC_ASSERT(copy_space_contains(space, ref));
uintptr_t addr = gc_ref_value(ref);
struct copy_space_block *block = copy_space_block_for_addr(gc_ref_value(ref));
GC_ASSERT_EQ(copy_space_object_region(ref), space->active_region ^ 1);
return block->is_survivor[space->active_region ^ 1];
}
static int
copy_space_contains_edge(struct copy_space *space, struct gc_edge edge) {
return copy_space_contains_address(space, gc_edge_address(edge));
}
static int
copy_space_remember_edge(struct copy_space *space, struct gc_edge edge) {
GC_ASSERT(copy_space_contains_edge(space, edge));
uint8_t* loc = copy_space_field_logged_byte(edge);
uint8_t bit = copy_space_field_logged_bit(edge);
uint8_t byte = atomic_load_explicit(loc, memory_order_acquire);
do {
if (byte & bit) return 0;
} while (!atomic_compare_exchange_weak_explicit(loc, &byte, byte|bit,
memory_order_acq_rel,
memory_order_acquire));
return 1;
}
static int
copy_space_forget_edge(struct copy_space *space, struct gc_edge edge) {
GC_ASSERT(copy_space_contains_edge(space, edge));
uint8_t* loc = copy_space_field_logged_byte(edge);
uint8_t bit = copy_space_field_logged_bit(edge);
uint8_t byte = atomic_load_explicit(loc, memory_order_acquire);
do {
if (!(byte & bit)) return 0;
} while (!atomic_compare_exchange_weak_explicit(loc, &byte, byte&~bit,
memory_order_acq_rel,
memory_order_acquire));
return 1;
}
static size_t copy_space_is_power_of_two(size_t n) {
GC_ASSERT(n != 0);
return (n & (n - 1)) == 0;
}
static size_t copy_space_round_up_power_of_two(size_t n) {
if (copy_space_is_power_of_two(n))
return n;
return 1ULL << (sizeof(size_t) * 8 - __builtin_clzll(n));
}
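// Illustrative: a 192 MB reservation for an aligned space rounds up to
// 256 MB; exact powers of two are returned unchanged.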
static struct copy_space_slab*
copy_space_allocate_slabs(size_t nslabs, uint32_t flags) {
size_t size = nslabs * COPY_SPACE_SLAB_SIZE;
size_t alignment = COPY_SPACE_SLAB_SIZE;
if (flags & COPY_SPACE_ALIGNED) {
GC_ASSERT(copy_space_is_power_of_two(size));
alignment = size;
}
return gc_platform_acquire_memory(size, alignment);
}
static void
copy_space_add_slabs(struct copy_space *space, struct copy_space_slab *slabs,
size_t nslabs) {
size_t old_size = space->nslabs * sizeof(struct copy_space_slab*);
size_t additional_size = nslabs * sizeof(struct copy_space_slab*);
space->extents = extents_adjoin(space->extents, slabs,
nslabs * sizeof(struct copy_space_slab));
space->slabs = realloc(space->slabs, old_size + additional_size);
if (!space->slabs)
GC_CRASH();
while (nslabs--)
space->slabs[space->nslabs++] = slabs++;
}
static void
copy_space_shrink(struct copy_space *space, size_t bytes) {
ssize_t pending = copy_space_request_release_memory(space, bytes);
copy_space_page_out_blocks_until_memory_released(space);
// It may still be the case that we need to page out more blocks; only
// collection can help us then!
}
static size_t
copy_space_first_payload_block(struct copy_space *space) {
return copy_space_field_logging_blocks(space);
}
static void
copy_space_expand(struct copy_space *space, size_t bytes) {
GC_ASSERT(!copy_space_fixed_size(space));
ssize_t to_acquire = -copy_space_maybe_reacquire_memory(space, bytes);
if (to_acquire <= 0) return;
size_t reserved = align_up(to_acquire, COPY_SPACE_SLAB_SIZE);
size_t nslabs = reserved / COPY_SPACE_SLAB_SIZE;
struct copy_space_slab *slabs =
copy_space_allocate_slabs(nslabs, space->flags);
copy_space_add_slabs(space, slabs, nslabs);
struct gc_lock lock = copy_space_lock(space);
for (size_t slab = 0; slab < nslabs; slab++) {
for (size_t idx = copy_space_first_payload_block(space);
idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB;
idx++) {
struct copy_space_block *block = &slabs[slab].headers[idx];
block->all_zeroes[0] = block->all_zeroes[1] = 1;
block->in_core = 0;
copy_space_page_out_block(space, block, &lock);
reserved -= COPY_SPACE_BLOCK_SIZE;
}
}
gc_lock_release(&lock);
copy_space_reacquire_memory(space, 0);
}
static void
copy_space_advance_page_out_queue(void *data) {
struct copy_space *space = data;
struct gc_lock lock = copy_space_lock(space);
for (int age = COPY_SPACE_PAGE_OUT_QUEUE_SIZE - 3; age >= 0; age--) {
while (1) {
struct copy_space_block *block =
copy_space_block_stack_pop(&space->paged_out[age], &lock);
if (!block) break;
copy_space_block_stack_push(&space->paged_out[age + 1], block, &lock);
}
}
gc_lock_release(&lock);
}
static void
copy_space_page_out_blocks(void *data) {
struct copy_space *space = data;
int age = COPY_SPACE_PAGE_OUT_QUEUE_SIZE - 2;
struct gc_lock lock = copy_space_lock(space);
while (1) {
struct copy_space_block *block =
copy_space_block_stack_pop(&space->paged_out[age], &lock);
if (!block) break;
block->in_core = 0;
block->all_zeroes[0] = block->all_zeroes[1] = 1;
gc_platform_discard_memory(copy_space_block_payload(block),
COPY_SPACE_BLOCK_SIZE);
copy_space_clear_field_logged_bits_for_block(space, block);
copy_space_block_stack_push(&space->paged_out[age + 1], block, &lock);
}
gc_lock_release(&lock);
}
static int
copy_space_init(struct copy_space *space, size_t size, uint32_t flags,
struct gc_background_thread *thread) {
size = align_up(size, COPY_SPACE_BLOCK_SIZE);
size_t reserved = align_up(size, COPY_SPACE_SLAB_SIZE);
if (flags & COPY_SPACE_ALIGNED)
reserved = copy_space_round_up_power_of_two(reserved);
size_t nslabs = reserved / COPY_SPACE_SLAB_SIZE;
struct copy_space_slab *slabs = copy_space_allocate_slabs(nslabs, flags);
if (!slabs)
return 0;
pthread_mutex_init(&space->lock, NULL);
space->empty.list.head = NULL;
space->partly_full.list.head = NULL;
space->full.head = NULL;
for (int age = 0; age < COPY_SPACE_PAGE_OUT_QUEUE_SIZE; age++)
space->paged_out[age].list.head = NULL;
space->allocated_bytes = 0;
space->fragmentation = 0;
space->bytes_to_page_out = 0;
space->active_region = 0;
space->atomic_forward = flags & COPY_SPACE_ATOMIC_FORWARDING;
space->flags = flags;
space->allocated_bytes_at_last_gc = 0;
space->fragmentation_at_last_gc = 0;
space->extents = extents_allocate((flags & COPY_SPACE_ALIGNED) ? 1 : 10);
copy_space_add_slabs(space, slabs, nslabs);
struct gc_lock lock = copy_space_lock(space);
for (size_t slab = 0; slab < nslabs; slab++) {
for (size_t idx = copy_space_first_payload_block(space);
idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB;
idx++) {
struct copy_space_block *block = &slabs[slab].headers[idx];
block->all_zeroes[0] = block->all_zeroes[1] = 1;
block->in_core = 0;
block->is_survivor[0] = block->is_survivor[1] = 0;
if (reserved > size) {
copy_space_page_out_block(space, block, &lock);
reserved -= COPY_SPACE_BLOCK_SIZE;
} else {
copy_space_push_empty_block(space, block, &lock);
}
}
}
gc_lock_release(&lock);
gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_START,
copy_space_advance_page_out_queue,
space);
gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_END,
copy_space_page_out_blocks,
space);
return 1;
}
#endif // COPY_SPACE_H

View file

@@ -0,0 +1,10 @@
#ifndef DEBUG_H
#define DEBUG_H
#ifndef NDEBUG
#define DEBUG(...) fprintf (stderr, "DEBUG: " __VA_ARGS__)
#else
#define DEBUG(...) do { } while (0)
#endif
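// Illustrative usage (not part of the original header); the expansion
// assumes <stdio.h> is visible at the call site:
//
//   DEBUG("paging out %zu blocks\n", count);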
#endif // DEBUG_H

View file

@@ -0,0 +1,88 @@
#ifndef EXTENTS_H
#define EXTENTS_H
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "gc-assert.h"
struct extent_range {
uintptr_t lo_addr;
uintptr_t hi_addr;
};
struct extents {
size_t size;
size_t capacity;
struct extent_range ranges[];
};
static inline int
extents_contain_addr(struct extents *extents, uintptr_t addr) {
size_t lo = 0;
size_t hi = extents->size;
while (lo != hi) {
size_t mid = (lo + hi) / 2;
struct extent_range range = extents->ranges[mid];
if (addr < range.lo_addr) {
hi = mid;
} else if (addr < range.hi_addr) {
return 1;
} else {
lo = mid + 1;
}
}
return 0;
}
static struct extents*
extents_allocate(size_t capacity) {
size_t byte_size =
sizeof(struct extents) + sizeof(struct extent_range) * capacity;
struct extents *ret = malloc(byte_size);
if (!ret) __builtin_trap();
memset(ret, 0, byte_size);
ret->capacity = capacity;
return ret;
}
static struct extents*
extents_insert(struct extents *old, size_t idx, struct extent_range range) {
if (old->size < old->capacity) {
size_t bytes_to_move = sizeof(struct extent_range) * (old->size - idx);
memmove(&old->ranges[idx + 1], &old->ranges[idx], bytes_to_move);
old->ranges[idx] = range;
old->size++;
return old;
} else {
struct extents *new_ = extents_allocate(old->capacity * 2 + 1);
memcpy(&new_->ranges[0], &old->ranges[0],
sizeof(struct extent_range) * idx);
memcpy(&new_->ranges[idx + 1], &old->ranges[idx],
sizeof(struct extent_range) * (old->size - idx));
new_->ranges[idx] = range;
new_->size = old->size + 1;
free(old);
return new_;
}
}
static struct extents*
extents_adjoin(struct extents *extents, void *lo_addr, size_t size) {
size_t i;
struct extent_range range = { (uintptr_t)lo_addr, (uintptr_t)lo_addr + size };
for (i = 0; i < extents->size; i++) {
if (range.hi_addr < extents->ranges[i].lo_addr) {
break;
} else if (range.hi_addr == extents->ranges[i].lo_addr) {
extents->ranges[i].lo_addr = range.lo_addr;
return extents;
} else if (range.lo_addr == extents->ranges[i].hi_addr) {
extents->ranges[i].hi_addr = range.hi_addr;
return extents;
}
}
return extents_insert(extents, i, range);
}
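// Illustrative usage (a sketch, not in the original header): adjoining
// a mapping that starts where an existing range ends coalesces the two
// into one extent, so lookups stay a single binary search.
//
//   struct extents *e = extents_allocate(4);
//   e = extents_adjoin(e, (void*)0x40000000, 1 << 20);  // [lo, lo+1M)
//   e = extents_adjoin(e, (void*)0x40100000, 1 << 20);  // coalesced
//   extents_contain_addr(e, 0x40180000);                // => 1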
#endif // EXTENTS_H

View file

@@ -0,0 +1,229 @@
#ifndef FIELD_SET_H
#define FIELD_SET_H
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>
#include "assert.h"
#include "gc-edge.h"
#include "gc-lock.h"
#include "tracer.h"
#define GC_EDGE_BUFFER_CAPACITY 510
struct gc_edge_buffer {
struct gc_edge_buffer *next;
size_t size;
struct gc_edge edges[GC_EDGE_BUFFER_CAPACITY];
};
// Lock-free.
struct gc_edge_buffer_list {
struct gc_edge_buffer *head;
};
// With a lock.
struct gc_edge_buffer_stack {
struct gc_edge_buffer_list list;
};
struct gc_field_set {
struct gc_edge_buffer_list full;
struct gc_edge_buffer_stack partly_full;
struct gc_edge_buffer_list empty;
size_t count;
pthread_mutex_t lock;
};
struct gc_field_set_writer {
struct gc_edge_buffer *buf;
struct gc_field_set *set;
};
static void
gc_edge_buffer_list_push(struct gc_edge_buffer_list *list,
struct gc_edge_buffer *buf) {
GC_ASSERT(!buf->next);
struct gc_edge_buffer *next =
atomic_load_explicit(&list->head, memory_order_relaxed);
do {
buf->next = next;
} while (!atomic_compare_exchange_weak_explicit(&list->head, &next, buf,
memory_order_acq_rel,
memory_order_acquire));
}
static struct gc_edge_buffer*
gc_edge_buffer_list_pop(struct gc_edge_buffer_list *list) {
struct gc_edge_buffer *head =
atomic_load_explicit(&list->head, memory_order_acquire);
struct gc_edge_buffer *next;
do {
if (!head) return NULL;
next = head->next;
} while (!atomic_compare_exchange_weak_explicit(&list->head, &head, next,
memory_order_acq_rel,
memory_order_acquire));
head->next = NULL;
return head;
}
static void
gc_edge_buffer_stack_push(struct gc_edge_buffer_stack *stack,
struct gc_edge_buffer *buf,
const struct gc_lock *lock) {
GC_ASSERT(!buf->next);
buf->next = stack->list.head;
stack->list.head = buf;
}
static struct gc_edge_buffer*
gc_edge_buffer_stack_pop(struct gc_edge_buffer_stack *stack,
const struct gc_lock *lock) {
struct gc_edge_buffer *head = stack->list.head;
if (head) {
stack->list.head = head->next;
head->next = NULL;
}
return head;
}
static void
gc_field_set_init(struct gc_field_set *set) {
memset(set, 0, sizeof(*set));
pthread_mutex_init(&set->lock, NULL);
}
static struct gc_edge_buffer*
gc_field_set_acquire_buffer(struct gc_field_set *set) {
struct gc_edge_buffer *ret;
ret = gc_edge_buffer_list_pop(&set->empty);
if (ret) return ret;
struct gc_lock lock = gc_lock_acquire(&set->lock);
ret = gc_edge_buffer_stack_pop(&set->partly_full, &lock);
gc_lock_release(&lock);
if (ret) return ret;
// atomic inc count
ret = malloc(sizeof(*ret));
if (!ret) {
perror("Failed to allocate remembered set");
GC_CRASH();
}
memset(ret, 0, sizeof(*ret));
return ret;
}
static void
gc_field_set_release_buffer(struct gc_field_set *set,
struct gc_edge_buffer *buf) {
if (buf->size == GC_EDGE_BUFFER_CAPACITY) {
gc_edge_buffer_list_push(&set->full, buf);
} else {
struct gc_lock lock = gc_lock_acquire(&set->lock);
gc_edge_buffer_stack_push(&set->partly_full, buf, &lock);
gc_lock_release(&lock);
}
}
static void
gc_field_set_add_roots(struct gc_field_set *set, struct gc_tracer *tracer) {
struct gc_edge_buffer *buf;
struct gc_lock lock = gc_lock_acquire(&set->lock);
while ((buf = gc_edge_buffer_stack_pop(&set->partly_full, &lock)))
gc_tracer_add_root(tracer, gc_root_edge_buffer(buf));
while ((buf = gc_edge_buffer_list_pop(&set->full)))
gc_tracer_add_root(tracer, gc_root_edge_buffer(buf));
gc_lock_release(&lock);
}
static void
gc_field_set_clear(struct gc_field_set *set,
void (*forget_edge)(struct gc_edge, struct gc_heap*),
struct gc_heap *heap) {
struct gc_edge_buffer *partly_full = set->partly_full.list.head;
struct gc_edge_buffer *full = set->full.head;
// Clear the full and partly full sets now so that if a collector
// wanted to it could re-add an edge to the remembered set.
set->partly_full.list.head = NULL;
set->full.head = NULL;
struct gc_edge_buffer *buf, *next;
for (buf = partly_full; buf; buf = next) {
next = buf->next;
buf->next = NULL;
if (forget_edge)
for (size_t i = 0; i < buf->size; i++)
forget_edge(buf->edges[i], heap);
buf->size = 0;
gc_edge_buffer_list_push(&set->empty, buf);
}
for (buf = full; buf; buf = next) {
next = buf->next;
buf->next = NULL;
if (forget_edge)
for (size_t i = 0; i < buf->size; i++)
forget_edge(buf->edges[i], heap);
buf->size = 0;
gc_edge_buffer_list_push(&set->empty, buf);
}
}
static inline void
gc_field_set_visit_edge_buffer(struct gc_field_set *set,
struct gc_edge_buffer *buf,
int (*visit)(struct gc_edge,
struct gc_heap*,
void *data),
struct gc_heap *heap,
void *data) GC_ALWAYS_INLINE;
static inline void
gc_field_set_visit_edge_buffer(struct gc_field_set *set,
struct gc_edge_buffer *buf,
int (*visit)(struct gc_edge,
struct gc_heap*,
void *data),
struct gc_heap *heap,
void *data) {
size_t i = 0;
while (i < buf->size) {
if (visit(buf->edges[i], heap, data))
i++;
else
buf->edges[i] = buf->edges[--buf->size];
}
gc_field_set_release_buffer(set, buf);
}
static void
gc_field_set_writer_release_buffer(struct gc_field_set_writer *writer) {
if (writer->buf) {
gc_field_set_release_buffer(writer->set, writer->buf);
writer->buf = NULL;
}
}
static void
gc_field_set_writer_init(struct gc_field_set_writer *writer,
struct gc_field_set *set) {
writer->set = set;
writer->buf = NULL;
}
static void
gc_field_set_writer_add_edge(struct gc_field_set_writer *writer,
struct gc_edge edge) {
struct gc_edge_buffer *buf = writer->buf;
if (GC_UNLIKELY(!buf))
writer->buf = buf = gc_field_set_acquire_buffer(writer->set);
GC_ASSERT(buf->size < GC_EDGE_BUFFER_CAPACITY);
buf->edges[buf->size++] = edge;
if (GC_UNLIKELY(buf->size == GC_EDGE_BUFFER_CAPACITY)) {
gc_edge_buffer_list_push(&writer->set->full, buf);
writer->buf = NULL;
}
}
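// Illustrative write-barrier flow (a sketch, not in the original
// header; `edge` and `tracer` stand in for values supplied by the
// collector): each mutator keeps a private writer so that logging a
// field is usually just an append into a thread-local buffer.
//
//   struct gc_field_set remembered; gc_field_set_init(&remembered);
//   struct gc_field_set_writer w;
//   gc_field_set_writer_init(&w, &remembered);
//   gc_field_set_writer_add_edge(&w, edge);      // on each logged store
//   gc_field_set_writer_release_buffer(&w);      // when the mutator stops
//   gc_field_set_add_roots(&remembered, tracer); // at collection time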
#endif // FIELD_SET_H

View file

@@ -0,0 +1,31 @@
#ifndef FREELIST_H
#define FREELIST_H
// A size-segregated freelist with linear-log buckets à la
// https://pvk.ca/Blog/2015/06/27/linear-log-bucketing-fast-versatile-simple/.
#include "gc-assert.h"
#include "gc-histogram.h"
#include <string.h>
#define DEFINE_FREELIST(name, max_value_bits, precision, node) \
struct name { node buckets[((max_value_bits) << (precision)) + 1]; }; \
static inline size_t name##_num_size_classes(void) { \
return ((max_value_bits) << (precision)) + 1; \
} \
static inline uint64_t name##_bucket_min_val(size_t idx) { \
GC_ASSERT(idx < name##_num_size_classes()); \
return gc_histogram_bucket_min_val((precision), idx); \
} \
static inline void name##_init(struct name *f) { \
memset(f, 0, sizeof(*f)); \
} \
static inline size_t name##_size_class(uint64_t val) { \
return gc_histogram_bucket((max_value_bits), (precision), val); \
} \
static inline node* name##_bucket(struct name *f, uint64_t val) { \
return &f->buckets[name##_size_class(val)]; \
}
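// Illustrative (hypothetical) instantiation, not in the original
// header: a freelist keyed on sizes up to 2^20 bytes with 2 bits of
// sub-power-of-two precision, using an embedder-defined node type.
//
//   struct hole { struct hole *next; };
//   DEFINE_FREELIST(hole_freelist, 20, 2, struct hole*)
//
//   // (20 << 2) + 1 == 81 size classes; hole_freelist_bucket(&f, sz)
//   // returns the list head for the bucket covering sz.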
#endif // FREELIST_H

View file

@@ -0,0 +1,22 @@
#ifndef GC_ALIGN_H
#define GC_ALIGN_H
#ifndef GC_IMPL
#error internal header file, not part of API
#endif
#include <stdint.h>
static inline uintptr_t align_down(uintptr_t addr, size_t align) {
return addr & ~(align - 1);
}
static inline uintptr_t align_up(uintptr_t addr, size_t align) {
return align_down(addr + align - 1, align);
}
// Poor man's equivalent of std::hardware_destructive_interference_size.
#define AVOID_FALSE_SHARING 128
#define ALIGNED_TO_AVOID_FALSE_SHARING \
__attribute__((aligned(AVOID_FALSE_SHARING)))
#endif // GC_ALIGN_H

View file

@@ -0,0 +1,55 @@
#ifndef GC_EPHEMERON_INTERNAL_H
#define GC_EPHEMERON_INTERNAL_H
#ifndef GC_IMPL
#error internal header file, not part of API
#endif
#include "gc-ephemeron.h"
struct gc_pending_ephemerons;
// API implemented by collector, for use by ephemerons:
GC_INTERNAL int gc_visit_ephemeron_key(struct gc_edge edge,
struct gc_heap *heap);
GC_INTERNAL struct gc_pending_ephemerons*
gc_heap_pending_ephemerons(struct gc_heap *heap);
GC_INTERNAL unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap);
// API implemented by ephemerons, for use by collector:
GC_INTERNAL struct gc_edge gc_ephemeron_key_edge(struct gc_ephemeron *eph);
GC_INTERNAL struct gc_edge gc_ephemeron_value_edge(struct gc_ephemeron *eph);
GC_INTERNAL struct gc_pending_ephemerons*
gc_prepare_pending_ephemerons(struct gc_pending_ephemerons *state,
size_t target_size, double slop);
GC_INTERNAL void
gc_resolve_pending_ephemerons(struct gc_ref obj, struct gc_heap *heap);
GC_INTERNAL void
gc_scan_pending_ephemerons(struct gc_pending_ephemerons *state,
struct gc_heap *heap, size_t shard,
size_t nshards);
GC_INTERNAL struct gc_ephemeron*
gc_pop_resolved_ephemerons(struct gc_heap *heap);
GC_INTERNAL void
gc_trace_resolved_ephemerons(struct gc_ephemeron *resolved,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *trace_data);
GC_INTERNAL void
gc_sweep_pending_ephemerons(struct gc_pending_ephemerons *state,
size_t shard, size_t nshards);
GC_INTERNAL void gc_ephemeron_init_internal(struct gc_heap *heap,
struct gc_ephemeron *ephemeron,
struct gc_ref key,
struct gc_ref value);
#endif // GC_EPHEMERON_INTERNAL_H

View file

@@ -0,0 +1,583 @@
#include <math.h>
#include <stdatomic.h>
#include <stdlib.h>
#define GC_IMPL 1
#include "address-hash.h"
#include "debug.h"
#include "gc-embedder-api.h"
#include "gc-ephemeron-internal.h"
// # Overview
//
// An ephemeron is a conjunction consisting of the ephemeron object
// itself, a "key" object, and a "value" object. If the ephemeron and
// the key are live, then the value is kept live and can be looked up
// given the ephemeron object.
//
// Sometimes we write this as E×K⇒V, indicating that you need both E and
// K to get V. We'll use this notation in these comments sometimes.
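// For example (an illustration, not from the original text), a
// weak-key table can represent each entry as an ephemeron E whose K is
// the table key and whose V is the associated value: the value stays
// retrievable exactly as long as both the table (which holds E) and
// the key object are otherwise live.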
//
// The key and the value of an ephemeron are never modified, except
// possibly via forwarding during GC.
//
// If the key of an ephemeron ever becomes unreachable, the ephemeron
// object will be marked as dead by the collector, and neither key nor
// value will be accessible. Users can also explicitly mark an
// ephemeron as dead.
//
// Users can build collections of ephemerons by chaining them together.
// If an ephemeron ever becomes dead, the ephemeron will be removed from
// the chain by the garbage collector.
//
// # Tracing algorithm
//
// Tracing ephemerons is somewhat complicated. Tracing the live objects
// in a heap is usually a parallelizable fan-out kind of operation,
// requiring minimal synchronization between tracing worker threads.
// However with ephemerons, each worker thread may need to check if
// there is a pending ephemeron E for an object K, marking the
// associated V for later traversal by the tracer. Doing this without
// introducing excessive global serialization points is the motivation
// for the complications that follow.
//
// From the viewpoint of the garbage collector, an ephemeron E×K⇒V has 4
// possible states:
//
// - Traced: An E that was already fully traced as of a given GC epoch.
//
// - Claimed: GC discovers E for the first time in a GC epoch
//
// - Pending: K's liveness is unknown
//
// - Resolved: K is live; V needs tracing
//
// The ephemeron state is kept in an atomic variable. The pending and
// resolved states also have associated atomic list link fields as well;
// it doesn't appear possible to coalesce them into a single field
// without introducing serialization. Finally, there is a bit to
// indicate whether a "traced" ephemeron is live or dead, and a field to
// indicate the epoch at which it was last traced.
//
// Here is a diagram of the state transitions:
//
//          ,----->Traced<-----.
//         ,        |  |        .
//        ,         v  /         .
//        |       Claimed         |
//        |   ,-----/   \-----.   |
//        |   v                v  |
//        Pending--------->Resolved
//
// Ephemerons are born in the traced state, for the current GC epoch.
//
// When the tracer sees an ephemeron E in the traced state it checks the
// epoch. If the epoch is up to date, E stays in the traced state and
// we are done.
//
// Otherwise, E transitions from traced to claimed. The thread that
// claims E is then responsible for resetting E's pending and resolved
// links, updating E's epoch, and tracing E's user-controlled chain
// link.
//
// If the claiming thread sees that E was already marked dead by a
// previous GC, or explicitly by the user, the ephemeron then
// transitions from back to traced, ready for the next epoch.
//
// If the claiming thread sees K to already be known to be live, then E
// is added to the global resolved set and E's state becomes resolved.
//
// Otherwise the claiming thread publishes K⇒E to the global pending
// ephemeron table, via the pending link, and E transitions to pending.
//
// A pending ephemeron is a link in a buckets-of-chains concurrent hash
// table. If its K is ever determined to be live, it becomes resolved,
// and is added to a global set of resolved ephemerons. At the end of
// GC, any ephemerons still pending are marked dead, transitioning their
// states to traced.
//
// Note that the claiming thread -- the one that publishes K⇒E to the
// global pending ephemeron table -- needs to re-check that K is still
// untraced after adding K⇒E to the pending table, and move to resolved
// if so.
//
// A resolved ephemeron needs its V to be traced. Incidentally its K
// also needs tracing, to relocate any forwarding pointer. The thread
// that pops an ephemeron from the resolved set is responsible for
// tracing and for moving E's state to traced.
//
// # Concurrency
//
// All operations on ephemerons are wait-free. Sometimes only one
// thread can make progress (for example for an ephemeron in the claimed
// state), but no thread will be stalled waiting on other threads to
// proceed.
//
// There is one interesting (from a concurrency point of view) data
// structure used by the implementation of ephemerons, the singly-linked
// list. Actually there are three of these; one is used as a stack and
// the other two are used as sets.
//
// The resolved set is implemented via a global `struct gc_ephemeron
// *resolved` variable. Resolving an ephemeron does an atomic push to
// this stack, via compare-and-swap (CAS); popping from the stack (also
// via CAS) yields an ephemeron for tracing. Ephemerons are added to
// the resolved set at most once per GC cycle, and the resolved set is
// empty outside of GC.
//
// The operations that are supported on atomic stacks are:
//
// push(LOC, E, OFFSET) -> void
//
// The user-visible chain link and the link for the pending ephemeron
// table are used to build atomic sets. In these you can add an
// ephemeron to the beginning of the list, traverse the list link by
// link to the end (indicated by NULL), and remove any list item.
// Removing a list node proceeds in two phases: one, you mark the node
// for removal, by changing the ephemeron's state; then, possibly on a
// subsequent traversal, any predecessor may forward its link past
// removed nodes. Because node values never change and nodes only go
// from live to dead, the live list tail can always be reached by any
// node, even from dead nodes.
//
// The operations that are supported on these atomic lists:
//
// push(LOC, E, OFFSET) -> void
// pop(LOC, OFFSET) -> ephemeron or null
// follow(LOC, OFFSET, STATE_OFFSET, LIVE_STATE) -> ephemeron or null
//
// These operations are all wait-free. The "push" operation is shared
// between stack and set use cases. "pop" is for stack-like use cases.
// The "follow" operation traverses a list, opportunistically eliding
// nodes that have been marked dead, atomically updating the location
// storing the next item.
//
// There are also accessors on ephemerons to their fields:
//
// key(E) -> value or null
// value(E) -> value or null
//
// These operations retrieve the key and value, respectively, provided
// that the ephemeron is not marked dead.
////////////////////////////////////////////////////////////////////////
// Concurrent operations on ephemeron lists
////////////////////////////////////////////////////////////////////////
static void
ephemeron_list_push(struct gc_ephemeron **loc,
struct gc_ephemeron *head,
struct gc_ephemeron** (*get_next)(struct gc_ephemeron*)) {
struct gc_ephemeron *tail = atomic_load_explicit(loc, memory_order_acquire);
while (1) {
// There must be no concurrent readers of HEAD, a precondition that
// we ensure by only publishing HEAD to LOC at most once per cycle.
// Therefore we can use a normal store for the tail pointer.
*get_next(head) = tail;
if (atomic_compare_exchange_weak(loc, &tail, head))
break;
}
}
static struct gc_ephemeron*
ephemeron_list_pop(struct gc_ephemeron **loc,
struct gc_ephemeron** (*get_next)(struct gc_ephemeron*)) {
struct gc_ephemeron *head = atomic_load_explicit(loc, memory_order_acquire);
while (head) {
// Precondition: the result of get_next on an ephemeron is never
// updated concurrently; OK to load non-atomically.
struct gc_ephemeron *tail = *get_next(head);
if (atomic_compare_exchange_weak(loc, &head, tail))
break;
}
return head;
}
static struct gc_ephemeron*
ephemeron_list_follow(struct gc_ephemeron **loc,
struct gc_ephemeron** (*get_next)(struct gc_ephemeron*),
int (*is_live)(struct gc_ephemeron*)) {
struct gc_ephemeron *head = atomic_load_explicit(loc, memory_order_acquire);
if (!head) return NULL;
while (1) {
struct gc_ephemeron *new_head = head;
// Skip past any dead nodes.
while (new_head && !is_live(new_head))
new_head = atomic_load_explicit(get_next(new_head), memory_order_acquire);
if (// If we didn't have to advance past any dead nodes, no need to
// update LOC.
(head == new_head)
// Otherwise if we succeed in updating LOC, we're done.
|| atomic_compare_exchange_strong(loc, &head, new_head)
// Someone else managed to advance LOC; that's fine too.
|| (head == new_head))
return new_head;
// Otherwise we lost a race; loop and retry.
}
}
////////////////////////////////////////////////////////////////////////
// The ephemeron object type
////////////////////////////////////////////////////////////////////////
#ifndef GC_EMBEDDER_EPHEMERON_HEADER
#error Embedder should define GC_EMBEDDER_EPHEMERON_HEADER
#endif
enum {
EPHEMERON_STATE_TRACED,
EPHEMERON_STATE_CLAIMED,
EPHEMERON_STATE_PENDING,
EPHEMERON_STATE_RESOLVED,
};
struct gc_ephemeron {
GC_EMBEDDER_EPHEMERON_HEADER
uint8_t state;
unsigned epoch;
struct gc_ephemeron *chain;
struct gc_ephemeron *pending;
struct gc_ephemeron *resolved;
struct gc_ref key;
struct gc_ref value;
};
size_t gc_ephemeron_size(void) { return sizeof(struct gc_ephemeron); }
struct gc_edge gc_ephemeron_key_edge(struct gc_ephemeron *e) {
return gc_edge(&e->key);
}
struct gc_edge gc_ephemeron_value_edge(struct gc_ephemeron *e) {
return gc_edge(&e->value);
}
////////////////////////////////////////////////////////////////////////
// Operations on the user-controlled chain field
////////////////////////////////////////////////////////////////////////
static struct gc_ephemeron** ephemeron_chain(struct gc_ephemeron *e) {
return &e->chain;
}
static int ephemeron_is_dead(struct gc_ephemeron *e) {
return !atomic_load_explicit(&e->key.value, memory_order_acquire);
}
static int ephemeron_is_not_dead(struct gc_ephemeron *e) {
return !ephemeron_is_dead(e);
}
void gc_ephemeron_chain_push(struct gc_ephemeron **loc,
struct gc_ephemeron *e) {
ephemeron_list_push(loc, e, ephemeron_chain);
}
static struct gc_ephemeron* follow_chain(struct gc_ephemeron **loc) {
return ephemeron_list_follow(loc, ephemeron_chain, ephemeron_is_not_dead);
}
struct gc_ephemeron* gc_ephemeron_chain_head(struct gc_ephemeron **loc) {
return follow_chain(loc);
}
struct gc_ephemeron* gc_ephemeron_chain_next(struct gc_ephemeron *e) {
return follow_chain(ephemeron_chain(e));
}
void gc_ephemeron_mark_dead(struct gc_ephemeron *e) {
atomic_store_explicit(&e->key.value, 0, memory_order_release);
}
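// Illustrative chain traversal (a sketch, not in the original file):
// an embedder keeping a bucket of ephemerons walks it with the chain
// accessors, which transparently skip links whose keys have died.
//
//   struct gc_ephemeron *head = ...;        // e.g. a hash-table bucket
//   gc_ephemeron_chain_push(&head, e);      // add a new association
//   for (struct gc_ephemeron *link = gc_ephemeron_chain_head(&head);
//        link;
//        link = gc_ephemeron_chain_next(link))
//     ... use gc_ephemeron_key(link) / gc_ephemeron_value(link),
//         defined below ...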
////////////////////////////////////////////////////////////////////////
// Operations on the GC-managed pending link
////////////////////////////////////////////////////////////////////////
static struct gc_ephemeron** ephemeron_pending(struct gc_ephemeron *e) {
return &e->pending;
}
static uint8_t ephemeron_state(struct gc_ephemeron *e) {
return atomic_load_explicit(&e->state, memory_order_acquire);
}
static int ephemeron_is_pending(struct gc_ephemeron *e) {
return ephemeron_state(e) == EPHEMERON_STATE_PENDING;
}
static void push_pending(struct gc_ephemeron **loc, struct gc_ephemeron *e) {
ephemeron_list_push(loc, e, ephemeron_pending);
}
static struct gc_ephemeron* follow_pending(struct gc_ephemeron **loc) {
return ephemeron_list_follow(loc, ephemeron_pending, ephemeron_is_pending);
}
////////////////////////////////////////////////////////////////////////
// Operations on the GC-managed resolved link
////////////////////////////////////////////////////////////////////////
static struct gc_ephemeron** ephemeron_resolved(struct gc_ephemeron *e) {
return &e->resolved;
}
static void push_resolved(struct gc_ephemeron **loc, struct gc_ephemeron *e) {
ephemeron_list_push(loc, e, ephemeron_resolved);
}
static struct gc_ephemeron* pop_resolved(struct gc_ephemeron **loc) {
return ephemeron_list_pop(loc, ephemeron_resolved);
}
////////////////////////////////////////////////////////////////////////
// Access to the association
////////////////////////////////////////////////////////////////////////
struct gc_ref gc_ephemeron_key(struct gc_ephemeron *e) {
return gc_ref(atomic_load_explicit(&e->key.value, memory_order_acquire));
}
struct gc_ref gc_ephemeron_value(struct gc_ephemeron *e) {
return ephemeron_is_dead(e) ? gc_ref_null() : e->value;
}
////////////////////////////////////////////////////////////////////////
// Tracing ephemerons
////////////////////////////////////////////////////////////////////////
struct gc_pending_ephemerons {
struct gc_ephemeron* resolved;
size_t nbuckets;
double scale;
struct gc_ephemeron* buckets[0];
};
static const size_t MIN_PENDING_EPHEMERONS_SIZE = 32;
static size_t pending_ephemerons_byte_size(size_t nbuckets) {
return sizeof(struct gc_pending_ephemerons) +
sizeof(struct gc_ephemeron*) * nbuckets;
}
static struct gc_pending_ephemerons*
gc_make_pending_ephemerons(size_t byte_size) {
size_t nbuckets = byte_size / sizeof(struct gc_ephemeron*);
if (nbuckets < MIN_PENDING_EPHEMERONS_SIZE)
nbuckets = MIN_PENDING_EPHEMERONS_SIZE;
struct gc_pending_ephemerons *ret =
malloc(pending_ephemerons_byte_size(nbuckets));
if (!ret)
return NULL;
ret->resolved = NULL;
ret->nbuckets = nbuckets;
ret->scale = nbuckets / pow(2.0, sizeof(uintptr_t) * 8);
for (size_t i = 0; i < nbuckets; i++)
ret->buckets[i] = NULL;
return ret;
}
struct gc_pending_ephemerons*
gc_prepare_pending_ephemerons(struct gc_pending_ephemerons *state,
size_t target_byte_size, double slop) {
size_t existing =
state ? pending_ephemerons_byte_size(state->nbuckets) : 0;
slop += 1.0;
if (existing * slop > target_byte_size && existing < target_byte_size * slop)
return state;
struct gc_pending_ephemerons *new_state =
gc_make_pending_ephemerons(target_byte_size);
if (!new_state)
return state;
free(state);
return new_state;
}
static struct gc_ephemeron**
pending_ephemeron_bucket(struct gc_pending_ephemerons *state,
struct gc_ref ref) {
uintptr_t hash = hash_address(gc_ref_value(ref));
size_t idx = hash * state->scale;
GC_ASSERT(idx < state->nbuckets);
return &state->buckets[idx];
}
static void
add_pending_ephemeron(struct gc_pending_ephemerons *state,
struct gc_ephemeron *e) {
struct gc_ephemeron **bucket = pending_ephemeron_bucket(state, e->key);
atomic_store_explicit(&e->state, EPHEMERON_STATE_PENDING,
memory_order_release);
push_pending(bucket, e);
}
static void maybe_resolve_ephemeron(struct gc_pending_ephemerons *state,
struct gc_ephemeron *e) {
uint8_t expected = EPHEMERON_STATE_PENDING;
if (atomic_compare_exchange_strong(&e->state, &expected,
EPHEMERON_STATE_RESOLVED))
push_resolved(&state->resolved, e);
}
// Precondition: OBJ has already been copied to tospace, but OBJ is a
// fromspace ref.
void gc_resolve_pending_ephemerons(struct gc_ref obj, struct gc_heap *heap) {
struct gc_pending_ephemerons *state = gc_heap_pending_ephemerons(heap);
struct gc_ephemeron **bucket = pending_ephemeron_bucket(state, obj);
for (struct gc_ephemeron *link = follow_pending(bucket);
link;
link = follow_pending(&link->pending)) {
if (gc_ref_value(obj) == gc_ref_value(link->key)) {
gc_visit_ephemeron_key(gc_ephemeron_key_edge(link), heap);
// PENDING -> RESOLVED, if it was pending.
maybe_resolve_ephemeron(state, link);
}
}
}
void gc_trace_ephemeron(struct gc_ephemeron *e,
void (*visit)(struct gc_edge edge, struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *trace_data) {
unsigned epoch = gc_heap_ephemeron_trace_epoch(heap);
uint8_t expected = EPHEMERON_STATE_TRACED;
// TRACED[_] -> CLAIMED[_].
if (!atomic_compare_exchange_strong(&e->state, &expected,
EPHEMERON_STATE_CLAIMED))
return;
if (e->epoch == epoch) {
// CLAIMED[epoch] -> TRACED[epoch].
atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED,
memory_order_release);
return;
}
// CLAIMED[!epoch] -> CLAIMED[epoch].
e->epoch = epoch;
e->pending = NULL;
e->resolved = NULL;
// Trace chain successors, eliding any intermediate dead links. Note
// that there is a race between trace-time evacuation of the next link
// in the chain and any mutation of that link pointer by the mutator
// (which can only be to advance the chain forward past dead links).
// Collectors using this API have to eliminate this race, for example
// by not evacuating while the mutator is running.
follow_chain(&e->chain);
visit(gc_edge(&e->chain), heap, trace_data);
// Similarly there is a race between the mutator marking an ephemeron
// as dead and here; the consequence would be that we treat an
// ephemeron as live when it's not, but only for this cycle. No big
// deal.
if (ephemeron_is_dead(e)) {
// CLAIMED[epoch] -> TRACED[epoch].
atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED,
memory_order_release);
return;
}
// If K is live, trace V and we are done.
if (gc_visit_ephemeron_key(gc_ephemeron_key_edge(e), heap)) {
visit(gc_ephemeron_value_edge(e), heap, trace_data);
// CLAIMED[epoch] -> TRACED[epoch].
atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED,
memory_order_release);
return;
}
// Otherwise K is not yet traced, so we don't know if it is live.
// Publish the ephemeron to a global table.
struct gc_pending_ephemerons *state = gc_heap_pending_ephemerons(heap);
// CLAIMED[epoch] -> PENDING.
add_pending_ephemeron(state, e);
// Given an ephemeron E×K⇒V, there is a race between marking K and E.
// One thread could go to mark E and see that K is unmarked, so we get
// here. Meanwhile another thread could go to mark K and not see E in
// the global table yet. Therefore after publishing E, we have to
// check the mark on K again.
if (gc_visit_ephemeron_key(gc_ephemeron_key_edge(e), heap))
// K visited by another thread while we published E; PENDING ->
// RESOLVED, if still PENDING.
maybe_resolve_ephemeron(state, e);
}
void
gc_scan_pending_ephemerons(struct gc_pending_ephemerons *state,
struct gc_heap *heap, size_t shard,
size_t nshards) {
GC_ASSERT(shard < nshards);
size_t start = state->nbuckets * 1.0 * shard / nshards;
size_t end = state->nbuckets * 1.0 * (shard + 1) / nshards;
for (size_t idx = start; idx < end; idx++) {
for (struct gc_ephemeron *e = follow_pending(&state->buckets[idx]);
e;
e = follow_pending(&e->pending)) {
if (gc_visit_ephemeron_key(gc_ephemeron_key_edge(e), heap))
// PENDING -> RESOLVED, if PENDING.
maybe_resolve_ephemeron(state, e);
}
}
}
struct gc_ephemeron*
gc_pop_resolved_ephemerons(struct gc_heap *heap) {
struct gc_pending_ephemerons *state = gc_heap_pending_ephemerons(heap);
return atomic_exchange(&state->resolved, NULL);
}
void
gc_trace_resolved_ephemerons(struct gc_ephemeron *resolved,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *trace_data) {
for (; resolved; resolved = resolved->resolved) {
visit(gc_ephemeron_value_edge(resolved), heap, trace_data);
// RESOLVED -> TRACED.
atomic_store_explicit(&resolved->state, EPHEMERON_STATE_TRACED,
memory_order_release);
}
}
void
gc_sweep_pending_ephemerons(struct gc_pending_ephemerons *state,
size_t shard, size_t nshards) {
GC_ASSERT(shard < nshards);
size_t start = state->nbuckets * 1.0 * shard / nshards;
size_t end = state->nbuckets * 1.0 * (shard + 1) / nshards;
for (size_t idx = start; idx < end; idx++) {
struct gc_ephemeron **bucket = &state->buckets[idx];
for (struct gc_ephemeron *e = follow_pending(bucket);
e;
e = follow_pending(&e->pending)) {
// PENDING -> TRACED, but dead.
atomic_store_explicit(&e->key.value, 0, memory_order_release);
atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED,
memory_order_release);
}
atomic_store_explicit(bucket, NULL, memory_order_release);
}
}
////////////////////////////////////////////////////////////////////////
// Allocation & initialization
////////////////////////////////////////////////////////////////////////
void gc_ephemeron_init_internal(struct gc_heap *heap,
struct gc_ephemeron *ephemeron,
struct gc_ref key, struct gc_ref value) {
// Caller responsible for any write barrier, though really the
// assumption is that the ephemeron is younger than the key and the
// value.
ephemeron->state = EPHEMERON_STATE_TRACED;
ephemeron->epoch = gc_heap_ephemeron_trace_epoch(heap) - 1;
ephemeron->chain = NULL;
ephemeron->pending = NULL;
ephemeron->resolved = NULL;
ephemeron->key = key;
ephemeron->value = value;
}
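// A rough sketch of the collector-side fixpoint these functions support
// (single-shard case; trace_edge and trace_data are hypothetical
// stand-ins for collector-specific tracing machinery):
//
//   while (1) {
//     /* Drain the grey worklist, calling
//        gc_resolve_pending_ephemerons() for each newly marked object
//        and gc_trace_ephemeron() for each traced ephemeron. */
//     gc_scan_pending_ephemerons(state, heap, 0, 1);
//     struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap);
//     if (!resolved)
//       break;
//     gc_trace_resolved_ephemerons(resolved, trace_edge, heap, trace_data);
//   }
//   gc_sweep_pending_ephemerons(state, 0, 1);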

View file

@ -0,0 +1,65 @@
#ifndef GC_FINALIZER_INTERNAL_H
#define GC_FINALIZER_INTERNAL_H
#ifndef GC_IMPL
#error internal header file, not part of API
#endif
#include "gc-finalizer.h"
#include "root.h"
struct gc_finalizer_state;
GC_INTERNAL
struct gc_finalizer_state* gc_make_finalizer_state(void);
GC_INTERNAL
void gc_finalizer_init_internal(struct gc_finalizer *f,
struct gc_ref object,
struct gc_ref closure);
GC_INTERNAL
void gc_finalizer_attach_internal(struct gc_finalizer_state *state,
struct gc_finalizer *f,
unsigned priority);
GC_INTERNAL
void gc_finalizer_externally_activated(struct gc_finalizer *f);
GC_INTERNAL
void gc_finalizer_externally_fired(struct gc_finalizer_state *state,
struct gc_finalizer *finalizer);
GC_INTERNAL
struct gc_finalizer* gc_finalizer_state_pop(struct gc_finalizer_state *state);
GC_INTERNAL
void gc_finalizer_fire(struct gc_finalizer **fired_list_loc,
struct gc_finalizer *finalizer);
GC_INTERNAL
void gc_finalizer_state_set_callback(struct gc_finalizer_state *state,
gc_finalizer_callback callback);
GC_INTERNAL
size_t gc_visit_finalizer_roots(struct gc_finalizer_state *state,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data);
GC_INTERNAL
size_t gc_resolve_finalizers(struct gc_finalizer_state *state,
size_t priority,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data);
GC_INTERNAL
void gc_notify_finalizers(struct gc_finalizer_state *state,
struct gc_heap *heap);
#endif // GC_FINALIZER_INTERNAL_H

View file

@ -0,0 +1,307 @@
#include <math.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>
#define GC_IMPL 1
#include "debug.h"
#include "gc-embedder-api.h"
#include "gc-ephemeron-internal.h" // for gc_visit_ephemeron_key
#include "gc-finalizer-internal.h"
// # Overview
//
// See gc-finalizer.h for an overview of finalizers from the user and
// embedder point of view.
//
// ## Tracing
//
// From the perspective of the collector implementation, finalizers are
// GC-managed objects, allowing their size to be accounted for within
// the heap size. They get traced during collection, allowing for
// relocation of their object references, and allowing the finalizer
// object itself to be evacuated if appropriate.
//
// The collector holds on to outstanding finalizers in a *finalizer
// state*, which holds one *finalizer table* for each priority. We
// don't need to look up finalizers by object, so we could just hold
// them in a big list, but to facilitate parallelism we slice them
// across some number of shards, where the "next" pointer is part of the
// finalizer object.
//
// There are a number of ways you could imagine integrating finalizers
// into a system. The way Whippet does it goes like this. See
// https://wingolog.org/archives/2022/10/31/ephemerons-and-finalizers
// and
// https://wingolog.org/archives/2024/07/22/finalizers-guardians-phantom-references-et-cetera
// for some further discussion.
//
// 1. The collector should begin a cycle by adding all shards from all
// priorities to the root set. When the embedder comes across a
// finalizer (as it will, because we added them to the root set),
// it traces it via gc_trace_finalizer(), which will visit the
// finalizer's closure and its "next" pointer.
//
// 2. After the full trace, and then the fix-point on pending
// ephemerons, for each priority from 0 upwards:
//
// i. Visit each finalizable object in the table. If the object
// was as-yet unvisited, then it is unreachable and thus
// finalizable; the finalizer is added to the global "fired"
// list, and changes state from "attached" to "fired".
// Otherwise it is re-added to the finalizer table.
//
// ii. If any finalizer was added to the fired list, then those
// objects were also added to the grey worklist; run tracing
// again until the grey set is empty, including ephemerons.
//
// 3. Finally, call the finalizer callback if the list of fired finalizers is
// nonempty.
//
// ## Concurrency
//
// The finalizer table is lock-free. It keeps a count of active finalizers, and
// chooses a bucket based on the count modulo the number of buckets. Adding a
// finalizer to the table is an atomic push on a linked list. The table is
// completely rebuilt during the GC pause, redistributing survivor entries
// across the buckets, and pushing all finalizable entries onto the single
// "fired" linked list.
//
// The fired list is also lock-free. As noted above, it is built
// during the pause, and mutators pop items off of it atomically.
//
// ## Generations
//
// It would be ideal if a young generation had its own finalizer table.
// Promoting an object would require promoting its finalizer to the old
// finalizer table. Not yet implemented (but would be nice).
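// As a rough sketch (tracing machinery elided; trace_edge and
// trace_data are hypothetical stand-ins), the per-cycle driver
// described above looks something like:
//
//   gc_visit_finalizer_roots(state, trace_edge, heap, trace_data);
//   /* ... full trace, then the ephemeron fixpoint ... */
//   for (size_t prio = 0; prio < gc_finalizer_priority_count(); prio++) {
//     if (gc_resolve_finalizers(state, prio, trace_edge, heap, trace_data))
//       /* ... re-run tracing until the grey set is empty ... */;
//   }
//   gc_notify_finalizers(state, heap);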
#ifndef GC_EMBEDDER_FINALIZER_HEADER
#error Embedder should define GC_EMBEDDER_FINALIZER_HEADER
#endif
enum finalizer_state {
FINALIZER_STATE_INIT = 0, // Finalizer is newborn.
FINALIZER_STATE_ACTIVE, // Finalizer is ours and in the finalizer table.
FINALIZER_STATE_FIRED, // Finalizer is handed back to mutator.
};
struct gc_finalizer {
GC_EMBEDDER_FINALIZER_HEADER
enum finalizer_state state;
struct gc_ref object;
struct gc_ref closure;
struct gc_finalizer *next;
};
// Enough buckets to parallelize closure marking. No need to look up a
// finalizer for a given object.
#define BUCKET_COUNT 32
struct gc_finalizer_table {
size_t finalizer_count;
struct gc_finalizer* buckets[BUCKET_COUNT];
};
struct gc_finalizer_state {
gc_finalizer_callback have_finalizers;
struct gc_finalizer *fired;
size_t fired_this_cycle;
size_t table_count;
struct gc_finalizer_table tables[0];
};
// public
size_t gc_finalizer_size(void) { return sizeof(struct gc_finalizer); }
struct gc_ref gc_finalizer_object(struct gc_finalizer *f) { return f->object; }
struct gc_ref gc_finalizer_closure(struct gc_finalizer *f) { return f->closure; }
// internal
struct gc_finalizer_state* gc_make_finalizer_state(void) {
size_t ntables = gc_finalizer_priority_count();
size_t size = (sizeof(struct gc_finalizer_state) +
sizeof(struct gc_finalizer_table) * ntables);
struct gc_finalizer_state *ret = malloc(size);
if (!ret)
return NULL;
memset(ret, 0, size);
ret->table_count = ntables;
return ret;
}
static void finalizer_list_push(struct gc_finalizer **loc,
struct gc_finalizer *head) {
struct gc_finalizer *tail = atomic_load_explicit(loc, memory_order_acquire);
do {
head->next = tail;
} while (!atomic_compare_exchange_weak(loc, &tail, head));
}
static struct gc_finalizer* finalizer_list_pop(struct gc_finalizer **loc) {
struct gc_finalizer *head = atomic_load_explicit(loc, memory_order_acquire);
do {
if (!head) return NULL;
} while (!atomic_compare_exchange_weak(loc, &head, head->next));
head->next = NULL;
return head;
}
static void add_finalizer_to_table(struct gc_finalizer_table *table,
struct gc_finalizer *f) {
size_t count = atomic_fetch_add_explicit(&table->finalizer_count, 1,
memory_order_relaxed);
struct gc_finalizer **loc = &table->buckets[count % BUCKET_COUNT];
finalizer_list_push(loc, f);
}
// internal
void gc_finalizer_init_internal(struct gc_finalizer *f,
struct gc_ref object,
struct gc_ref closure) {
// Caller responsible for any write barrier, though really the
// assumption is that the finalizer is younger than the object and the
// closure.
if (f->state != FINALIZER_STATE_INIT)
GC_CRASH();
GC_ASSERT(gc_ref_is_null(f->object));
f->object = object;
f->closure = closure;
}
// internal
void gc_finalizer_attach_internal(struct gc_finalizer_state *state,
struct gc_finalizer *f,
unsigned priority) {
// Caller responsible for any write barrier, though really the
// assumption is that the finalizer is younger than the object and the
// closure.
if (f->state != FINALIZER_STATE_INIT)
GC_CRASH();
if (gc_ref_is_null(f->object))
GC_CRASH();
f->state = FINALIZER_STATE_ACTIVE;
GC_ASSERT(priority < state->table_count);
add_finalizer_to_table(&state->tables[priority], f);
}
// internal
struct gc_finalizer* gc_finalizer_state_pop(struct gc_finalizer_state *state) {
return finalizer_list_pop(&state->fired);
}
static void
add_fired_finalizer(struct gc_finalizer_state *state,
struct gc_finalizer *f) {
if (f->state != FINALIZER_STATE_ACTIVE)
GC_CRASH();
f->state = FINALIZER_STATE_FIRED;
finalizer_list_push(&state->fired, f);
}
// internal
void
gc_finalizer_externally_activated(struct gc_finalizer *f) {
if (f->state != FINALIZER_STATE_INIT)
GC_CRASH();
f->state = FINALIZER_STATE_ACTIVE;
}
// internal
void
gc_finalizer_externally_fired(struct gc_finalizer_state *state,
struct gc_finalizer *f) {
add_fired_finalizer(state, f);
}
// internal
size_t gc_visit_finalizer_roots(struct gc_finalizer_state *state,
void (*visit)(struct gc_edge,
struct gc_heap*,
void *),
struct gc_heap *heap,
void *visit_data) {
size_t count = 0;
for (size_t tidx = 0; tidx < state->table_count; tidx++) {
struct gc_finalizer_table *table = &state->tables[tidx];
if (table->finalizer_count) {
count += table->finalizer_count;
for (size_t bidx = 0; bidx < BUCKET_COUNT; bidx++)
visit(gc_edge(&table->buckets[bidx]), heap, visit_data);
}
}
visit(gc_edge(&state->fired), heap, visit_data);
return count;
}
// public
void gc_trace_finalizer(struct gc_finalizer *f,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *trace_data) {
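// While a finalizer is ACTIVE it must not keep its object alive, so the
// object edge is visited only when the finalizer is in its INIT or
// FIRED state.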
if (f->state != FINALIZER_STATE_ACTIVE)
visit(gc_edge(&f->object), heap, trace_data);
visit(gc_edge(&f->closure), heap, trace_data);
visit(gc_edge(&f->next), heap, trace_data);
}
// Sweeping is currently serial. It could run in parallel but we want to
// resolve all finalizers before shading any additional node. Perhaps we should
// relax this restriction though; if the user attaches two finalizers to the
// same object, it's probably OK to only have one finalizer fire per cycle.
// internal
size_t gc_resolve_finalizers(struct gc_finalizer_state *state,
size_t priority,
void (*visit)(struct gc_edge edge,
struct gc_heap *heap,
void *visit_data),
struct gc_heap *heap,
void *visit_data) {
GC_ASSERT(priority < state->table_count);
struct gc_finalizer_table *table = &state->tables[priority];
size_t finalizers_fired = 0;
// Visit each finalizer in the table. If its object was already visited,
// re-add the finalizer to the table. Otherwise enqueue its object edge for
// tracing and mark the finalizer as fired.
if (table->finalizer_count) {
struct gc_finalizer_table scratch = { 0, };
for (size_t bidx = 0; bidx < BUCKET_COUNT; bidx++) {
struct gc_finalizer *next;
for (struct gc_finalizer *f = table->buckets[bidx]; f; f = next) {
next = f->next;
f->next = NULL;
struct gc_edge edge = gc_edge(&f->object);
if (gc_visit_ephemeron_key(edge, heap)) {
add_finalizer_to_table(&scratch, f);
} else {
finalizers_fired++;
visit(edge, heap, visit_data);
add_fired_finalizer(state, f);
}
}
}
memcpy(table, &scratch, sizeof(*table));
}
state->fired_this_cycle += finalizers_fired;
return finalizers_fired;
}
// internal
void gc_notify_finalizers(struct gc_finalizer_state *state,
struct gc_heap *heap) {
if (state->fired_this_cycle && state->have_finalizers) {
state->have_finalizers(heap, state->fired_this_cycle);
state->fired_this_cycle = 0;
}
}
// internal
void gc_finalizer_state_set_callback(struct gc_finalizer_state *state,
gc_finalizer_callback callback) {
state->have_finalizers = callback;
}

View file

@ -0,0 +1,16 @@
#ifndef GC_INTERNAL_H
#define GC_INTERNAL_H
#ifndef GC_IMPL
#error internal header file, not part of API
#endif
#include "gc-ephemeron-internal.h"
#include "gc-finalizer-internal.h"
#include "gc-options-internal.h"
uint64_t gc_heap_total_bytes_allocated(struct gc_heap *heap);
void gc_mutator_adjust_heap_size(struct gc_mutator *mut, uint64_t new_size);
#endif // GC_INTERNAL_H

View file

@ -0,0 +1,24 @@
#ifndef GC_LOCK_H
#define GC_LOCK_H
#include <pthread.h>
#include "gc-assert.h"
struct gc_lock {
pthread_mutex_t *lock;
};
static struct gc_lock
gc_lock_acquire(pthread_mutex_t *lock) {
pthread_mutex_lock(lock);
return (struct gc_lock){ lock };
}
static void
gc_lock_release(struct gc_lock *lock) {
GC_ASSERT(lock->lock);
pthread_mutex_unlock(lock->lock);
lock->lock = NULL;
}
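// Typical usage, given some pthread_mutex_t `mutex`:
//   struct gc_lock lock = gc_lock_acquire(&mutex);
//   /* ... critical section ... */
//   gc_lock_release(&lock);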
#endif // GC_LOCK_H

View file

@ -0,0 +1,32 @@
#ifndef GC_OPTIONS_INTERNAL_H
#define GC_OPTIONS_INTERNAL_H
#ifndef GC_IMPL
#error internal header file, not part of API
#endif
#include "gc-options.h"
struct gc_common_options {
enum gc_heap_size_policy heap_size_policy;
size_t heap_size;
size_t maximum_heap_size;
double heap_size_multiplier;
double heap_expansiveness;
int parallelism;
};
GC_INTERNAL void gc_init_common_options(struct gc_common_options *options);
GC_INTERNAL int gc_common_option_from_string(const char *str);
GC_INTERNAL int gc_common_options_set_int(struct gc_common_options *options,
int option, int value);
GC_INTERNAL int gc_common_options_set_size(struct gc_common_options *options,
int option, size_t value);
GC_INTERNAL int gc_common_options_set_double(struct gc_common_options *options,
int option, double value);
GC_INTERNAL int gc_common_options_parse_and_set(struct gc_common_options *options,
int option, const char *value);
#endif // GC_OPTIONS_INTERNAL_H

View file

@ -0,0 +1,198 @@
#include <limits.h>
#include <malloc.h>
#include <stdlib.h>
#include <string.h>
#define GC_IMPL 1
#include "gc-options-internal.h"
#include "gc-platform.h"
// M(UPPER, lower, repr, type, parser, default, min, max)
#define FOR_EACH_INT_GC_OPTION(M) \
M(HEAP_SIZE_POLICY, heap_size_policy, "heap-size-policy", \
int, heap_size_policy, GC_HEAP_SIZE_FIXED, GC_HEAP_SIZE_FIXED, \
GC_HEAP_SIZE_ADAPTIVE) \
M(PARALLELISM, parallelism, "parallelism", \
int, int, default_parallelism(), 1, 64)
#define FOR_EACH_SIZE_GC_OPTION(M) \
M(HEAP_SIZE, heap_size, "heap-size", \
size, size, 6 * 1024 * 1024, 0, -1) \
M(MAXIMUM_HEAP_SIZE, maximum_heap_size, "maximum-heap-size", \
size, size, 0, 0, -1)
#define FOR_EACH_DOUBLE_GC_OPTION(M) \
M(HEAP_SIZE_MULTIPLIER, heap_size_multiplier, "heap-size-multiplier", \
double, double, 1.75, 1.0, 1e6) \
M(HEAP_EXPANSIVENESS, heap_expansiveness, "heap-expansiveness", \
double, double, 1.0, 0.0, 50.0)
typedef int gc_option_int;
typedef size_t gc_option_size;
typedef double gc_option_double;
#define FOR_EACH_COMMON_GC_OPTION(M) \
FOR_EACH_INT_GC_OPTION(M) \
FOR_EACH_SIZE_GC_OPTION(M) \
FOR_EACH_DOUBLE_GC_OPTION(M)
static int clamp_int(int n, int lo, int hi) {
return n < lo ? lo : n > hi ? hi : n;
}
static size_t clamp_size(size_t n, size_t lo, size_t hi) {
return n < lo ? lo : n > hi ? hi : n;
}
static double clamp_double(double n, double lo, double hi) {
return n < lo ? lo : n > hi ? hi : n;
}
static int default_parallelism(void) {
return clamp_int(gc_platform_processor_count(), 1, 8);
}
void gc_init_common_options(struct gc_common_options *options) {
#define INIT(UPPER, lower, repr, type, parser, default, min, max) \
options->lower = default;
FOR_EACH_COMMON_GC_OPTION(INIT)
#undef INIT
}
int gc_common_option_from_string(const char *str) {
#define GET_OPTION(UPPER, lower, repr, type, parser, default, min, max) \
if (strcmp(str, repr) == 0) return GC_OPTION_##UPPER;
FOR_EACH_COMMON_GC_OPTION(GET_OPTION)
#undef GET_OPTION
return -1;
}
#define SET_OPTION(UPPER, lower, repr, type, parser, default, min, max) \
case GC_OPTION_##UPPER: \
if (value != clamp_##type(value, min, max)) return 0; \
options->lower = value; \
return 1;
#define DEFINE_SETTER(STEM, stem, type) \
int gc_common_options_set_##stem(struct gc_common_options *options, \
int option, type value) { \
switch (option) { \
FOR_EACH_##STEM##_GC_OPTION(SET_OPTION) \
default: return 0; \
} \
}
DEFINE_SETTER(INT, int, int)
DEFINE_SETTER(SIZE, size, size_t)
DEFINE_SETTER(DOUBLE, double, double)
#undef SET_OPTION
#undef DEFINE_SETTER
static int parse_size(const char *arg, size_t *val) {
char *end;
long i = strtol(arg, &end, 0);
if (i < 0 || i == LONG_MAX) return 0;
if (end == arg) return 0;
char delim = *end;
if (delim == 'k' || delim == 'K')
++end, i *= 1024L;
else if (delim == 'm' || delim == 'M')
++end, i *= 1024L * 1024L;
else if (delim == 'g' || delim == 'G')
++end, i *= 1024L * 1024L * 1024L;
else if (delim == 't' || delim == 'T')
++end, i *= 1024L * 1024L * 1024L * 1024L;
if (*end != '\0') return 0;
*val = i;
return 1;
}
static int parse_int(const char *arg, int *val) {
char *end;
long i = strtol(arg, &end, 0);
if (i == LONG_MIN || i == LONG_MAX || end == arg || *end)
return 0;
*val = i;
return 1;
}
static int parse_heap_size_policy(const char *arg, int *val) {
if (strcmp(arg, "fixed") == 0) {
*val = GC_HEAP_SIZE_FIXED;
return 1;
}
if (strcmp(arg, "growable") == 0) {
*val = GC_HEAP_SIZE_GROWABLE;
return 1;
}
if (strcmp(arg, "adaptive") == 0) {
*val = GC_HEAP_SIZE_ADAPTIVE;
return 1;
}
return parse_int(arg, val);
}
static int parse_double(const char *arg, double *val) {
char *end;
double d = strtod(arg, &end);
if (end == arg || *end)
return 0;
*val = d;
return 1;
}
int gc_common_options_parse_and_set(struct gc_common_options *options,
int option, const char *value) {
switch (option) {
#define SET_OPTION(UPPER, lower, repr, type, parser, default, min, max) \
case GC_OPTION_##UPPER: { \
gc_option_##type v; \
if (!parse_##parser(value, &v)) return 0; \
return gc_common_options_set_##type(options, option, v); \
}
FOR_EACH_COMMON_GC_OPTION(SET_OPTION)
default: return 0;
}
}
static int is_lower(char c) { return 'a' <= c && c <= 'z'; }
static int is_digit(char c) { return '0' <= c && c <= '9'; }
static int is_option(char c) { return is_lower(c) || c == '-'; }
static int is_option_end(char c) { return c == '='; }
static int is_value(char c) {
return is_lower(c) || is_digit(c) || c == '-' || c == '+' || c == '.';
}
static int is_value_end(char c) { return c == '\0' || c == ','; }
static char* read_token(char *p, int (*is_tok)(char c), int (*is_end)(char c),
char *delim) {
char c;
for (c = *p; is_tok(c); c = *++p);
if (!is_end(c)) return NULL;
*delim = c;
*p = '\0';
return p + 1;
}
int gc_options_parse_and_set_many(struct gc_options *options,
const char *str) {
if (!*str) return 1;
char *copy = strdup(str);
char *cur = copy;
int ret = 0;
while (1) {
char delim;
char *next = read_token(cur, is_option, is_option_end, &delim);
if (!next) break;
int option = gc_option_from_string(cur);
if (option < 0) break;
cur = next;
next = read_token(cur, is_value, is_value_end, &delim);
if (!next) break;
if (!gc_options_parse_and_set(options, option, cur)) break;
cur = next;
if (delim == '\0') {
ret = 1;
break;
}
}
free(copy);
return ret;
}
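// For example, a hypothetical configuration string:
//   gc_options_parse_and_set_many(options,
//                                 "heap-size=1g,parallelism=4,"
//                                 "heap-size-policy=adaptive");
// Size values accept k/m/g/t suffixes. A malformed token makes the call
// return 0; options parsed before the failure remain set.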

View file

@ -0,0 +1,211 @@
// For pthread_getattr_np.
#define _GNU_SOURCE
#include <errno.h>
#include <link.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>
#define GC_IMPL 1
#include "debug.h"
#include "gc-align.h"
#include "gc-assert.h"
#include "gc-inline.h"
#include "gc-platform.h"
void gc_platform_init(void) {
// Nothing to do.
}
static uintptr_t fallback_current_thread_stack_base(void) GC_NEVER_INLINE;
static uintptr_t fallback_current_thread_stack_base(void) {
// Sloppily assume that there are very few frames between us and the
// thread entry or main function, and that therefore we haven't
// consumed more than a page of stack; we can then just round up the
// stack pointer to the page boundary.
fprintf(stderr,
"Using fallback strategy to capture stack base for thread %p.\n",
(void*)pthread_self());
int local;
uintptr_t hot = (uintptr_t)&local;
size_t page_size = getpagesize();
return (hot + page_size) & ~(page_size - 1);
}
uintptr_t gc_platform_current_thread_stack_base(void) {
pthread_t me = pthread_self();
pthread_attr_t attr;
int err = pthread_getattr_np(me, &attr);
if (err) {
errno = err;
// This case can occur for the main thread when running in a
// filesystem without /proc/stat.
perror("Failed to capture stack base via pthread_getattr_np");
return fallback_current_thread_stack_base();
}
void *stack_low_addr;
size_t stack_size;
err = pthread_attr_getstack(&attr, &stack_low_addr, &stack_size);
pthread_attr_destroy(&attr);
if (err) {
// Should never occur.
errno = err;
perror("pthread_attr_getstack");
return fallback_current_thread_stack_base();
}
return (uintptr_t)stack_low_addr + stack_size;
}
struct visit_data {
void (*f)(uintptr_t start, uintptr_t end, struct gc_heap *heap, void *data);
struct gc_heap *heap;
void *data;
};
static int visit_roots(struct dl_phdr_info *info, size_t size, void *data) {
struct visit_data *visit_data = data;
uintptr_t object_addr = info->dlpi_addr;
const char *object_name = info->dlpi_name;
const ElfW(Phdr) *program_headers = info->dlpi_phdr;
size_t program_headers_count = info->dlpi_phnum;
// From the loader's perspective, an ELF image is broken up into
// "segments", each of which is described by a "program header".
// Treat all writable data segments as potential edges into the
// GC-managed heap.
//
// Note that there are some RELRO segments which are initially
// writable but then remapped read-only. BDW-GC will exclude these,
// but we just punt for the time being and treat them as roots.
for (size_t i = 0; i < program_headers_count; i++) {
const ElfW(Phdr) *p = &program_headers[i];
if (p->p_type == PT_LOAD && (p->p_flags & PF_W)) {
uintptr_t start = p->p_vaddr + object_addr;
uintptr_t end = start + p->p_memsz;
DEBUG("found roots for '%s': [%p,%p)\n", object_name,
(void*)start, (void*)end);
visit_data->f(start, end, visit_data->heap, visit_data->data);
}
}
return 0;
}
void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start,
uintptr_t end,
struct gc_heap*,
void *data),
struct gc_heap *heap,
void *data) {
struct visit_data visit_data = { f, heap, data };
dl_iterate_phdr(visit_roots, &visit_data);
}
int gc_platform_processor_count(void) {
cpu_set_t set;
if (sched_getaffinity(0, sizeof (set), &set) != 0)
return 1;
return CPU_COUNT(&set);
}
uint64_t gc_platform_monotonic_nanoseconds(void) {
struct timespec ts;
if (clock_gettime(CLOCK_MONOTONIC, &ts))
GC_CRASH();
uint64_t s = ts.tv_sec;
uint64_t ns = ts.tv_nsec;
uint64_t ns_per_sec = 1000000000;
return s * ns_per_sec + ns;
}
size_t gc_platform_page_size(void) {
return getpagesize();
}
struct gc_reservation gc_platform_reserve_memory(size_t size,
size_t alignment) {
GC_ASSERT_EQ(size, align_down(size, getpagesize()));
GC_ASSERT_EQ(alignment & (alignment - 1), 0);
GC_ASSERT_EQ(alignment, align_down(alignment, getpagesize()));
size_t extent = size + alignment;
void *mem = mmap(NULL, extent, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (mem == MAP_FAILED) {
perror("failed to reserve address space");
GC_CRASH();
}
uintptr_t base = (uintptr_t) mem;
uintptr_t end = base + extent;
uintptr_t aligned_base = alignment ? align_up(base, alignment) : base;
uintptr_t aligned_end = aligned_base + size;
if (aligned_base - base)
munmap((void*)base, aligned_base - base);
if (end - aligned_end)
munmap((void*)aligned_end, end - aligned_end);
return (struct gc_reservation){aligned_base, size};
}
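// For example, reserving 64 MiB aligned to 2 MiB maps 66 MiB of
// PROT_NONE address space and then unmaps the unaligned head and tail,
// leaving exactly [aligned_base, aligned_base + 64 MiB) reserved.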
void*
gc_platform_acquire_memory_from_reservation(struct gc_reservation reservation,
size_t offset, size_t size) {
GC_ASSERT_EQ(size, align_down(size, getpagesize()));
GC_ASSERT(size <= reservation.size);
GC_ASSERT(offset <= reservation.size - size);
void *mem = mmap((void*)(reservation.base + offset), size,
PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (mem == MAP_FAILED) {
perror("mmap failed");
return NULL;
}
return mem;
}
void
gc_platform_release_reservation(struct gc_reservation reservation) {
if (munmap((void*)reservation.base, reservation.size) != 0)
perror("failed to unmap memory");
}
void*
gc_platform_acquire_memory(size_t size, size_t alignment) {
struct gc_reservation reservation =
gc_platform_reserve_memory(size, alignment);
return gc_platform_acquire_memory_from_reservation(reservation, 0, size);
}
void gc_platform_release_memory(void *ptr, size_t size) {
GC_ASSERT_EQ((uintptr_t)ptr, align_down((uintptr_t)ptr, getpagesize()));
GC_ASSERT_EQ(size, align_down(size, getpagesize()));
if (munmap(ptr, size) != 0)
perror("failed to unmap memory");
}
int gc_platform_populate_memory(void *ptr, size_t size) {
GC_ASSERT_EQ((uintptr_t)ptr, align_down((uintptr_t)ptr, getpagesize()));
GC_ASSERT_EQ(size, align_down(size, getpagesize()));
if (madvise(ptr, size, MADV_WILLNEED) == 0)
return 1;
perror("failed to populate memory");
return 0;
}
int gc_platform_discard_memory(void *ptr, size_t size) {
GC_ASSERT_EQ((uintptr_t)ptr, align_down((uintptr_t)ptr, getpagesize()));
GC_ASSERT_EQ(size, align_down(size, getpagesize()));
if (madvise(ptr, size, MADV_DONTNEED) == 0)
return 1;
perror("failed to discard memory");
return 0;
}

View file

@ -0,0 +1,48 @@
#ifndef GC_PLATFORM_H
#define GC_PLATFORM_H
#ifndef GC_IMPL
#error internal header file, not part of API
#endif
#include <stdint.h>
#include "gc-visibility.h"
struct gc_heap;
GC_INTERNAL void gc_platform_init(void);
GC_INTERNAL uintptr_t gc_platform_current_thread_stack_base(void);
GC_INTERNAL
void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start,
uintptr_t end,
struct gc_heap *heap,
void *data),
struct gc_heap *heap,
void *data);
GC_INTERNAL int gc_platform_processor_count(void);
GC_INTERNAL uint64_t gc_platform_monotonic_nanoseconds(void);
GC_INTERNAL size_t gc_platform_page_size(void);
struct gc_reservation {
uintptr_t base;
size_t size;
};
GC_INTERNAL
struct gc_reservation gc_platform_reserve_memory(size_t size, size_t alignment);
GC_INTERNAL
void*
gc_platform_acquire_memory_from_reservation(struct gc_reservation reservation,
size_t offset, size_t size);
GC_INTERNAL
void gc_platform_release_reservation(struct gc_reservation reservation);
GC_INTERNAL void* gc_platform_acquire_memory(size_t size, size_t alignment);
GC_INTERNAL void gc_platform_release_memory(void *base, size_t size);
GC_INTERNAL int gc_platform_populate_memory(void *addr, size_t size);
GC_INTERNAL int gc_platform_discard_memory(void *addr, size_t size);
#endif // GC_PLATFORM_H

View file

@ -0,0 +1,92 @@
// For pthread_getattr_np.
#define _GNU_SOURCE
#include <pthread.h>
#include <setjmp.h>
#include <stdio.h>
#include <unistd.h>
#define GC_IMPL 1
#include "debug.h"
#include "gc-align.h"
#include "gc-assert.h"
#include "gc-inline.h"
#include "gc-platform.h"
#include "gc-stack.h"
static uintptr_t current_thread_hot_stack_addr(void) {
#ifdef __GNUC__
return (uintptr_t)__builtin_frame_address(0);
#else
uintptr_t local;
return (uintptr_t)&local;
#endif
}
// FIXME: check platform stack growth direction.
#define HOTTER_THAN <=
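// With HOTTER_THAN defined as <=, a "hotter" address is a numerically
// lower one: this assumes a downward-growing stack, which holds on the
// gnu-linux targets this file currently supports.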
static void capture_current_thread_hot_stack_addr(struct gc_stack_addr *addr) {
addr->addr = current_thread_hot_stack_addr();
}
static void capture_current_thread_cold_stack_addr(struct gc_stack_addr *addr) {
addr->addr = gc_platform_current_thread_stack_base();
}
void gc_stack_init(struct gc_stack *stack, struct gc_stack_addr *base) {
if (base)
stack->cold = *base;
else
capture_current_thread_cold_stack_addr(&stack->cold);
stack->hot = stack->cold;
}
void gc_stack_capture_hot(struct gc_stack *stack) {
capture_current_thread_hot_stack_addr(&stack->hot);
setjmp(stack->registers);
GC_ASSERT(stack->hot.addr HOTTER_THAN stack->cold.addr);
}
static void* call_with_stack(void* (*)(struct gc_stack_addr*, void*),
struct gc_stack_addr*, void*) GC_NEVER_INLINE;
static void* call_with_stack(void* (*f)(struct gc_stack_addr *, void *),
struct gc_stack_addr *addr, void *arg) {
return f(addr, arg);
}
void* gc_call_with_stack_addr(void* (*f)(struct gc_stack_addr *base,
void *arg),
void *arg) {
struct gc_stack_addr base;
capture_current_thread_hot_stack_addr(&base);
return call_with_stack(f, &base, arg);
}
void gc_stack_visit(struct gc_stack *stack,
void (*visit)(uintptr_t low, uintptr_t high,
struct gc_heap *heap, void *data),
struct gc_heap *heap,
void *data) {
{
uintptr_t low = (uintptr_t)stack->registers;
GC_ASSERT(low == align_down(low, sizeof(uintptr_t)));
uintptr_t high = low + sizeof(jmp_buf);
DEBUG("found mutator register roots for %p: [%p,%p)\n", stack,
(void*)low, (void*)high);
visit(low, high, heap, data);
}
if (0 HOTTER_THAN 1) {
DEBUG("found mutator stack roots for %p: [%p,%p)\n", stack,
(void*)stack->hot.addr, (void*)stack->cold.addr);
visit(align_up(stack->hot.addr, sizeof(uintptr_t)),
align_down(stack->cold.addr, sizeof(uintptr_t)),
heap, data);
} else {
DEBUG("found mutator stack roots for %p: [%p,%p)\n", stack,
(void*)stack->cold.addr, (void*)stack->hot.addr);
visit(align_up(stack->cold.addr, sizeof(uintptr_t)),
align_down(stack->hot.addr, sizeof(uintptr_t)),
heap, data);
}
}

View file

@ -0,0 +1,33 @@
#ifndef GC_STACK_H
#define GC_STACK_H
#ifndef GC_IMPL
#error internal header file, not part of API
#endif
#include "gc-inline.h"
#include <setjmp.h>
struct gc_stack_addr {
uintptr_t addr;
};
struct gc_stack {
struct gc_stack_addr cold;
struct gc_stack_addr hot;
jmp_buf registers;
};
struct gc_heap;
GC_INTERNAL void gc_stack_init(struct gc_stack *stack,
struct gc_stack_addr *base);
GC_INTERNAL void gc_stack_capture_hot(struct gc_stack *stack);
GC_INTERNAL void gc_stack_visit(struct gc_stack *stack,
void (*visit)(uintptr_t low, uintptr_t high,
struct gc_heap *heap,
void *data),
struct gc_heap *heap,
void *data);
#endif // GC_STACK_H

View file

@ -0,0 +1,56 @@
#ifndef GC_TRACE_H
#define GC_TRACE_H
#ifndef GC_IMPL
#error internal header file, not part of API
#endif
#include "gc-config.h"
#include "gc-assert.h"
#include "gc-conservative-ref.h"
#include "gc-embedder-api.h"
static inline int gc_has_mutator_conservative_roots(void) {
return GC_CONSERVATIVE_ROOTS;
}
static inline int gc_mutator_conservative_roots_may_be_interior(void) {
return 1;
}
static inline int gc_has_global_conservative_roots(void) {
return GC_CONSERVATIVE_ROOTS;
}
static inline int gc_has_conservative_intraheap_edges(void) {
return GC_CONSERVATIVE_TRACE;
}
static inline int gc_has_conservative_roots(void) {
return gc_has_mutator_conservative_roots() ||
gc_has_global_conservative_roots();
}
enum gc_trace_kind {
GC_TRACE_PRECISELY,
GC_TRACE_NONE,
GC_TRACE_CONSERVATIVELY,
GC_TRACE_EPHEMERON,
};
struct gc_trace_plan {
enum gc_trace_kind kind;
size_t size; // For conservative tracing.
};
static inline int
gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref ref,
int possibly_interior) {
// Assume that the minimum page size is 4096, and that the first page
// will contain no heap objects.
if (gc_conservative_ref_value(ref) < 4096)
return 0;
if (possibly_interior)
return 1;
return gc_is_valid_conservative_ref_displacement
(gc_conservative_ref_value(ref) & (sizeof(uintptr_t) - 1));
}
#endif // GC_TRACE_H

View file

@ -0,0 +1,6 @@
#include <assert.h>
#ifdef GC_TRACEPOINT_LTTNG
#define LTTNG_UST_TRACEPOINT_DEFINE
#define LTTNG_UST_TRACEPOINT_CREATE_PROBES
#include "gc-lttng.h"
#endif // GC_TRACEPOINT_LTTNG

View file

@ -0,0 +1,59 @@
#ifndef GROWABLE_HEAP_SIZER_H
#define GROWABLE_HEAP_SIZER_H
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"
#include "heap-sizer.h"
// This is a simple heap-sizing algorithm that will grow the heap if it is
// smaller than a given multiplier of the live data size. It does not shrink
// the heap.
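// For example, with the default multiplier of 1.75 and 100 MB of live
// data after a collection, a 128 MB heap would be grown to 175 MB,
// whereas a 256 MB heap would be left as-is.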
struct gc_growable_heap_sizer {
struct gc_heap *heap;
double multiplier;
pthread_mutex_t lock;
};
static void
gc_growable_heap_sizer_set_multiplier(struct gc_growable_heap_sizer *sizer,
double multiplier) {
pthread_mutex_lock(&sizer->lock);
sizer->multiplier = multiplier;
pthread_mutex_unlock(&sizer->lock);
}
static void
gc_growable_heap_sizer_on_gc(struct gc_growable_heap_sizer *sizer,
size_t heap_size, size_t live_bytes,
uint64_t pause_ns,
void (*set_heap_size)(struct gc_heap*, size_t)) {
pthread_mutex_lock(&sizer->lock);
size_t target_size = live_bytes * sizer->multiplier;
if (target_size > heap_size)
set_heap_size(sizer->heap, target_size);
pthread_mutex_unlock(&sizer->lock);
}
static struct gc_growable_heap_sizer*
gc_make_growable_heap_sizer(struct gc_heap *heap, double multiplier) {
struct gc_growable_heap_sizer *sizer;
sizer = malloc(sizeof(*sizer));
if (!sizer)
GC_CRASH();
memset(sizer, 0, sizeof(*sizer));
sizer->heap = heap;
sizer->multiplier = multiplier;
pthread_mutex_init(&sizer->lock, NULL);
return sizer;
}
static void
gc_destroy_growable_heap_sizer(struct gc_growable_heap_sizer *sizer) {
free(sizer);
}
#endif // GROWABLE_HEAP_SIZER_H

View file

@ -0,0 +1,74 @@
#ifndef HEAP_SIZER_H
#define HEAP_SIZER_H
#include "gc-api.h"
#include "gc-options-internal.h"
#include "growable-heap-sizer.h"
#include "adaptive-heap-sizer.h"
struct gc_heap_sizer {
enum gc_heap_size_policy policy;
union {
struct gc_growable_heap_sizer* growable;
struct gc_adaptive_heap_sizer* adaptive;
};
};
static struct gc_heap_sizer
gc_make_heap_sizer(struct gc_heap *heap,
const struct gc_common_options *options,
uint64_t (*get_allocation_counter_from_thread)(struct gc_heap*),
void (*set_heap_size_from_thread)(struct gc_heap*, size_t),
struct gc_background_thread *thread) {
struct gc_heap_sizer ret = { options->heap_size_policy, };
switch (options->heap_size_policy) {
case GC_HEAP_SIZE_FIXED:
break;
case GC_HEAP_SIZE_GROWABLE:
ret.growable =
gc_make_growable_heap_sizer(heap, options->heap_size_multiplier);
break;
case GC_HEAP_SIZE_ADAPTIVE:
ret.adaptive =
gc_make_adaptive_heap_sizer (heap, options->heap_expansiveness,
get_allocation_counter_from_thread,
set_heap_size_from_thread,
thread);
break;
default:
GC_CRASH();
}
return ret;
}
static void
gc_heap_sizer_on_gc(struct gc_heap_sizer sizer, size_t heap_size,
size_t live_bytes, size_t pause_ns,
void (*set_heap_size)(struct gc_heap*, size_t)) {
switch (sizer.policy) {
case GC_HEAP_SIZE_FIXED:
break;
case GC_HEAP_SIZE_GROWABLE:
gc_growable_heap_sizer_on_gc(sizer.growable, heap_size, live_bytes,
pause_ns, set_heap_size);
break;
case GC_HEAP_SIZE_ADAPTIVE:
if (sizer.adaptive->background_task_id < 0)
gc_adaptive_heap_sizer_background_task(sizer.adaptive);
gc_adaptive_heap_sizer_on_gc(sizer.adaptive, live_bytes, pause_ns,
set_heap_size);
break;
default:
GC_CRASH();
}
}
#endif // HEAP_SIZER_H

View file

@ -0,0 +1,525 @@
#ifndef LARGE_OBJECT_SPACE_H
#define LARGE_OBJECT_SPACE_H
#include <pthread.h>
#include <malloc.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "gc-assert.h"
#include "gc-ref.h"
#include "gc-conservative-ref.h"
#include "gc-trace.h"
#include "address-map.h"
#include "address-set.h"
#include "background-thread.h"
#include "freelist.h"
// A mark-sweep space with generational support.
struct gc_heap;
enum large_object_state {
LARGE_OBJECT_NURSERY = 0,
LARGE_OBJECT_MARKED_BIT = 1,
LARGE_OBJECT_MARK_TOGGLE_BIT = 2,
LARGE_OBJECT_MARK_0 = LARGE_OBJECT_MARKED_BIT,
LARGE_OBJECT_MARK_1 = LARGE_OBJECT_MARKED_BIT | LARGE_OBJECT_MARK_TOGGLE_BIT
};
struct large_object {
uintptr_t addr;
size_t size;
};
struct large_object_node;
struct large_object_live_data {
uint8_t mark;
enum gc_trace_kind trace;
};
struct large_object_dead_data {
uint8_t age;
struct large_object_node **prev;
struct large_object_node *next;
};
struct large_object_data {
uint8_t is_live;
union {
struct large_object_live_data live;
struct large_object_dead_data dead;
};
};
#define SPLAY_TREE_PREFIX large_object_
typedef struct large_object large_object_key_span;
typedef uintptr_t large_object_key;
typedef struct large_object_data large_object_value;
static inline int
large_object_compare(uintptr_t addr, struct large_object obj) {
if (addr < obj.addr) return -1;
if (addr - obj.addr < obj.size) return 0;
return 1;
}
static inline uintptr_t
large_object_span_start(struct large_object obj) {
return obj.addr;
}
#include "splay-tree.h"
DEFINE_FREELIST(large_object_freelist, sizeof(uintptr_t) * 8 - 1, 2,
struct large_object_node*);
struct large_object_space {
// Lock for object_map, quarantine, nursery, and marked.
pthread_mutex_t lock;
// Lock for object_tree.
pthread_mutex_t object_tree_lock;
// Lock for remembered_edges.
pthread_mutex_t remembered_edges_lock;
// Locking order: You must hold the space lock when taking
// object_tree_lock. Take no other lock while holding
// object_tree_lock. remembered_edges_lock is a leaf; take no locks
// when holding it.
// The value for a large_object_node's "mark" field indicating a
// marked object; always nonzero, and alternating between two values
// at every major GC.
uint8_t marked;
// Splay tree of objects, keyed by <addr, size> tuple. Useful when
// looking up object-for-address.
struct large_object_tree object_tree;
// Hash table of objects, where values are pointers to splay tree
// nodes. Useful when you have the object address and just want to
// check something about it (for example its size).
struct address_map object_map;
// In generational configurations, the nursery records all objects
// allocated since the last collection cycle.
struct address_map nursery;
// Size-segregated freelist of dead objects. Allocations are first
// served from the quarantine freelist before falling back to the OS
// if needed. Collected objects spend a second or two in quarantine
// before being returned to the OS. This is an optimization to avoid
// mucking about too much with the TLB and so on.
struct large_object_freelist quarantine;
// Set of edges from lospace that may reference young objects,
// possibly in other spaces.
struct address_set remembered_edges;
size_t page_size;
size_t page_size_log2;
size_t total_pages;
size_t free_pages;
size_t live_pages_at_last_collection;
size_t pages_freed_by_last_collection;
int synchronous_release;
};
static size_t
large_object_space_npages(struct large_object_space *space, size_t bytes) {
return (bytes + space->page_size - 1) >> space->page_size_log2;
}
static size_t
large_object_space_size_at_last_collection(struct large_object_space *space) {
return space->live_pages_at_last_collection << space->page_size_log2;
}
static inline int
large_object_space_contains_with_lock(struct large_object_space *space,
struct gc_ref ref) {
return address_map_contains(&space->object_map, gc_ref_value(ref));
}
static inline int
large_object_space_contains(struct large_object_space *space,
struct gc_ref ref) {
pthread_mutex_lock(&space->lock);
int ret = large_object_space_contains_with_lock(space, ref);
pthread_mutex_unlock(&space->lock);
return ret;
}
static inline struct gc_ref
large_object_space_object_containing_edge(struct large_object_space *space,
struct gc_edge edge) {
pthread_mutex_lock(&space->object_tree_lock);
struct large_object_node *node =
large_object_tree_lookup(&space->object_tree, gc_edge_address(edge));
uintptr_t addr = (node && node->value.is_live) ? node->key.addr : 0;
pthread_mutex_unlock(&space->object_tree_lock);
return gc_ref(addr);
}
static void
large_object_space_start_gc(struct large_object_space *space, int is_minor_gc) {
// Take the space lock to prevent
// large_object_space_process_quarantine from concurrently mutating
// the object map.
pthread_mutex_lock(&space->lock);
if (!is_minor_gc) {
space->marked ^= LARGE_OBJECT_MARK_TOGGLE_BIT;
space->live_pages_at_last_collection = 0;
}
}
static inline struct gc_trace_plan
large_object_space_object_trace_plan(struct large_object_space *space,
struct gc_ref ref) {
uintptr_t node_bits =
address_map_lookup(&space->object_map, gc_ref_value(ref), 0);
GC_ASSERT(node_bits);
struct large_object_node *node = (struct large_object_node*) node_bits;
switch (node->value.live.trace) {
case GC_TRACE_PRECISELY:
return (struct gc_trace_plan){ GC_TRACE_PRECISELY, };
case GC_TRACE_NONE:
return (struct gc_trace_plan){ GC_TRACE_NONE, };
#if GC_CONSERVATIVE_TRACE
case GC_TRACE_CONSERVATIVELY: {
return (struct gc_trace_plan){ GC_TRACE_CONSERVATIVELY, node->key.size };
}
// No large ephemerons.
#endif
default:
GC_CRASH();
}
}
static uint8_t*
large_object_node_mark_loc(struct large_object_node *node) {
GC_ASSERT(node->value.is_live);
return &node->value.live.mark;
}
static uint8_t
large_object_node_get_mark(struct large_object_node *node) {
return atomic_load_explicit(large_object_node_mark_loc(node),
memory_order_acquire);
}
static struct large_object_node*
large_object_space_lookup(struct large_object_space *space, struct gc_ref ref) {
return (struct large_object_node*) address_map_lookup(&space->object_map,
gc_ref_value(ref),
0);
}
static int
large_object_space_mark(struct large_object_space *space, struct gc_ref ref) {
struct large_object_node *node = large_object_space_lookup(space, ref);
if (!node)
return 0;
GC_ASSERT(node->value.is_live);
uint8_t *loc = large_object_node_mark_loc(node);
uint8_t mark = atomic_load_explicit(loc, memory_order_relaxed);
do {
if (mark == space->marked)
return 0;
} while (!atomic_compare_exchange_weak_explicit(loc, &mark, space->marked,
memory_order_acq_rel,
memory_order_acquire));
size_t pages = node->key.size >> space->page_size_log2;
atomic_fetch_add(&space->live_pages_at_last_collection, pages);
return 1;
}
static int
large_object_space_is_marked(struct large_object_space *space,
struct gc_ref ref) {
struct large_object_node *node = large_object_space_lookup(space, ref);
if (!node)
return 0;
GC_ASSERT(node->value.is_live);
return atomic_load_explicit(large_object_node_mark_loc(node),
memory_order_acquire) == space->marked;
}
static int
large_object_space_is_survivor(struct large_object_space *space,
struct gc_ref ref) {
GC_ASSERT(large_object_space_contains(space, ref));
pthread_mutex_lock(&space->lock);
int old = large_object_space_is_marked(space, ref);
pthread_mutex_unlock(&space->lock);
return old;
}
static int
large_object_space_remember_edge(struct large_object_space *space,
struct gc_ref obj,
struct gc_edge edge) {
GC_ASSERT(large_object_space_contains(space, obj));
if (!large_object_space_is_survivor(space, obj))
return 0;
uintptr_t edge_addr = gc_edge_address(edge);
int remembered = 0;
pthread_mutex_lock(&space->remembered_edges_lock);
if (!address_set_contains(&space->remembered_edges, edge_addr)) {
address_set_add(&space->remembered_edges, edge_addr);
remembered = 1;
}
pthread_mutex_unlock(&space->remembered_edges_lock);
return remembered;
}
static void
large_object_space_forget_edge(struct large_object_space *space,
struct gc_edge edge) {
uintptr_t edge_addr = gc_edge_address(edge);
pthread_mutex_lock(&space->remembered_edges_lock);
GC_ASSERT(address_set_contains(&space->remembered_edges, edge_addr));
address_set_remove(&space->remembered_edges, edge_addr);
pthread_mutex_unlock(&space->remembered_edges_lock);
}
static void
large_object_space_clear_remembered_edges(struct large_object_space *space) {
address_set_clear(&space->remembered_edges);
}
static void
large_object_space_add_to_freelist(struct large_object_space *space,
struct large_object_node *node) {
node->value.is_live = 0;
struct large_object_dead_data *data = &node->value.dead;
memset(data, 0, sizeof(*data));
data->age = 0;
struct large_object_node **bucket =
large_object_freelist_bucket(&space->quarantine, node->key.size);
data->next = *bucket;
if (data->next)
data->next->value.dead.prev = &data->next;
data->prev = bucket;
*bucket = node;
}
static void
large_object_space_remove_from_freelist(struct large_object_space *space,
struct large_object_node *node) {
GC_ASSERT(!node->value.is_live);
struct large_object_dead_data *dead = &node->value.dead;
GC_ASSERT(dead->prev);
if (dead->next)
dead->next->value.dead.prev = dead->prev;
*dead->prev = dead->next;
dead->prev = NULL;
dead->next = NULL;
}
static void
large_object_space_sweep_one(uintptr_t addr, uintptr_t node_bits,
void *data) {
struct large_object_space *space = data;
struct large_object_node *node = (struct large_object_node*) node_bits;
if (!node->value.is_live)
return;
GC_ASSERT(node->value.is_live);
uint8_t mark = atomic_load_explicit(large_object_node_mark_loc(node),
memory_order_acquire);
if (mark != space->marked)
large_object_space_add_to_freelist(space, node);
}
static void
large_object_space_process_quarantine(void *data) {
struct large_object_space *space = data;
pthread_mutex_lock(&space->lock);
pthread_mutex_lock(&space->object_tree_lock);
for (size_t idx = 0; idx < large_object_freelist_num_size_classes(); idx++) {
struct large_object_node **link = &space->quarantine.buckets[idx];
for (struct large_object_node *node = *link; node; node = *link) {
GC_ASSERT(!node->value.is_live);
if (++node->value.dead.age < 2) {
link = &node->value.dead.next;
} else {
struct large_object obj = node->key;
large_object_space_remove_from_freelist(space, node);
address_map_remove(&space->object_map, obj.addr);
large_object_tree_remove(&space->object_tree, obj.addr);
gc_platform_release_memory((void*)obj.addr, obj.size);
}
}
}
pthread_mutex_unlock(&space->object_tree_lock);
pthread_mutex_unlock(&space->lock);
}
static void
large_object_space_finish_gc(struct large_object_space *space,
int is_minor_gc) {
if (GC_GENERATIONAL) {
address_map_for_each(is_minor_gc ? &space->nursery : &space->object_map,
large_object_space_sweep_one,
space);
address_map_clear(&space->nursery);
} else {
address_map_for_each(&space->object_map,
large_object_space_sweep_one,
space);
}
size_t free_pages =
space->total_pages - space->live_pages_at_last_collection;
space->pages_freed_by_last_collection = free_pages - space->free_pages;
space->free_pages = free_pages;
pthread_mutex_unlock(&space->lock);
if (space->synchronous_release)
large_object_space_process_quarantine(space);
}
static void
large_object_space_add_to_allocation_counter(struct large_object_space *space,
uint64_t *counter) {
size_t pages = space->total_pages - space->free_pages;
pages -= space->live_pages_at_last_collection;
*counter += pages << space->page_size_log2;
}
static inline struct gc_ref
large_object_space_mark_conservative_ref(struct large_object_space *space,
struct gc_conservative_ref ref,
int possibly_interior) {
uintptr_t addr = gc_conservative_ref_value(ref);
if (!possibly_interior) {
// Addr not aligned on page boundary? Not a large object.
// Otherwise strip the displacement to obtain the true base address.
uintptr_t displacement = addr & (space->page_size - 1);
if (!gc_is_valid_conservative_ref_displacement(displacement))
return gc_ref_null();
addr -= displacement;
}
struct large_object_node *node;
if (possibly_interior) {
pthread_mutex_lock(&space->object_tree_lock);
node = large_object_tree_lookup(&space->object_tree, addr);
pthread_mutex_unlock(&space->object_tree_lock);
} else {
node = large_object_space_lookup(space, gc_ref(addr));
}
if (node && node->value.is_live &&
large_object_space_mark(space, gc_ref(node->key.addr)))
return gc_ref(node->key.addr);
return gc_ref_null();
}
static void*
large_object_space_alloc(struct large_object_space *space, size_t npages,
enum gc_trace_kind trace) {
void *ret = NULL;
pthread_mutex_lock(&space->lock);
size_t size = npages << space->page_size_log2;
for (size_t idx = large_object_freelist_size_class(size);
idx < large_object_freelist_num_size_classes();
idx++) {
struct large_object_node *node = space->quarantine.buckets[idx];
while (node && node->key.size < size)
node = node->value.dead.next;
if (node) {
// We found a suitable hole in quarantine. Unlink it from the
// freelist.
large_object_space_remove_from_freelist(space, node);
// Mark the hole as live.
node->value.is_live = 1;
memset(&node->value.live, 0, sizeof(node->value.live));
node->value.live.mark = LARGE_OBJECT_NURSERY;
node->value.live.trace = trace;
// If the hole is actually too big, trim its tail.
if (node->key.size > size) {
struct large_object tail = {node->key.addr + size, node->key.size - size};
struct large_object_data tail_value = {0,};
node->key.size = size;
pthread_mutex_lock(&space->object_tree_lock);
struct large_object_node *tail_node =
large_object_tree_insert(&space->object_tree, tail, tail_value);
pthread_mutex_unlock(&space->object_tree_lock);
uintptr_t tail_node_bits = (uintptr_t)tail_node;
address_map_add(&space->object_map, tail_node->key.addr,
tail_node_bits);
large_object_space_add_to_freelist(space, tail_node);
}
// Add the object to the nursery.
if (GC_GENERATIONAL)
address_map_add(&space->nursery, node->key.addr, (uintptr_t)node);
space->free_pages -= npages;
ret = (void*)node->key.addr;
memset(ret, 0, size);
break;
}
}
// If we didn't find anything in the quarantine, get fresh pages from the OS.
if (!ret) {
ret = gc_platform_acquire_memory(size, 0);
if (ret) {
uintptr_t addr = (uintptr_t)ret;
struct large_object k = { addr, size };
struct large_object_data v = {0,};
v.is_live = 1;
v.live.mark = LARGE_OBJECT_NURSERY;
v.live.trace = trace;
pthread_mutex_lock(&space->object_tree_lock);
struct large_object_node *node =
large_object_tree_insert(&space->object_tree, k, v);
uintptr_t node_bits = (uintptr_t)node;
address_map_add(&space->object_map, addr, node_bits);
space->total_pages += npages;
pthread_mutex_unlock(&space->object_tree_lock);
}
}
pthread_mutex_unlock(&space->lock);
return ret;
}
static int
large_object_space_init(struct large_object_space *space,
struct gc_heap *heap,
struct gc_background_thread *thread) {
memset(space, 0, sizeof(*space));
pthread_mutex_init(&space->lock, NULL);
pthread_mutex_init(&space->object_tree_lock, NULL);
pthread_mutex_init(&space->remembered_edges_lock, NULL);
space->page_size = getpagesize();
space->page_size_log2 = __builtin_ctz(space->page_size);
space->marked = LARGE_OBJECT_MARK_0;
large_object_tree_init(&space->object_tree);
address_map_init(&space->object_map);
address_map_init(&space->nursery);
large_object_freelist_init(&space->quarantine);
address_set_init(&space->remembered_edges);
if (thread)
gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_START,
large_object_space_process_quarantine,
space);
else
space->synchronous_release = 1;
return 1;
}
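// A rough sketch of a major collection over this space (the rest of the
// heap and error handling elided). The space lock is held from start_gc
// until finish_gc, so queries during the pause use the _with_lock
// variants:
//
//   large_object_space_start_gc(space, /*is_minor_gc=*/0);
//   /* for each traced reference `ref` that may point into this space: */
//   if (large_object_space_contains_with_lock(space, ref))
//     large_object_space_mark(space, ref);
//   /* once tracing has finished: */
//   large_object_space_finish_gc(space, /*is_minor_gc=*/0);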
#endif // LARGE_OBJECT_SPACE_H

View file

@ -0,0 +1,59 @@
#ifndef LOCAL_WORKLIST_H
#define LOCAL_WORKLIST_H
#include "assert.h"
#define LOCAL_WORKLIST_SIZE 1024
#define LOCAL_WORKLIST_MASK (LOCAL_WORKLIST_SIZE - 1)
#define LOCAL_WORKLIST_SHARE_AMOUNT (LOCAL_WORKLIST_SIZE * 3 / 4)
struct local_worklist {
size_t read;
size_t write;
struct gc_ref data[LOCAL_WORKLIST_SIZE];
};
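// The read and write cursors only ever increase; they are reduced
// modulo LOCAL_WORKLIST_SIZE when indexing `data`, so
// local_worklist_size() stays correct across wrap-around as long as at
// most LOCAL_WORKLIST_SIZE entries are in flight.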
static inline void
local_worklist_init(struct local_worklist *q) {
q->read = q->write = 0;
}
static inline void
local_worklist_poison(struct local_worklist *q) {
q->read = 0; q->write = LOCAL_WORKLIST_SIZE;
}
static inline size_t
local_worklist_size(struct local_worklist *q) {
return q->write - q->read;
}
static inline int
local_worklist_empty(struct local_worklist *q) {
return local_worklist_size(q) == 0;
}
static inline int
local_worklist_full(struct local_worklist *q) {
return local_worklist_size(q) >= LOCAL_WORKLIST_SIZE;
}
static inline void
local_worklist_push(struct local_worklist *q, struct gc_ref v) {
ASSERT(!local_worklist_full(q));
q->data[q->write++ & LOCAL_WORKLIST_MASK] = v;
}
static inline struct gc_ref
local_worklist_pop(struct local_worklist *q) {
ASSERT(!local_worklist_empty(q));
return q->data[q->read++ & LOCAL_WORKLIST_MASK];
}
static inline size_t
local_worklist_pop_many(struct local_worklist *q, struct gc_ref **objv,
size_t limit) {
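  // Hand out a pointer directly into the ring instead of copying; the run
  // is capped at the wrap-around point so it stays contiguous.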
size_t avail = local_worklist_size(q);
size_t read = q->read & LOCAL_WORKLIST_MASK;
size_t contig = LOCAL_WORKLIST_SIZE - read;
if (contig < avail) avail = contig;
if (limit < avail) avail = limit;
*objv = q->data + read;
q->read += avail;
return avail;
}
#endif // LOCAL_WORKLIST_H
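The local worklist is a fixed-size, power-of-two ring buffer: the read and write counters only ever grow, and masking them with LOCAL_WORKLIST_MASK maps them into the buffer, so size, emptiness, and fullness all fall out of a single subtraction. The following standalone sketch shows the same indexing scheme with int payloads in place of struct gc_ref and a small RING_SIZE so it can be compiled and run on its own; none of these names are Whippet's.

/* Sketch of the power-of-two ring-buffer indexing used by local_worklist. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define RING_SIZE 8
#define RING_MASK (RING_SIZE - 1)

struct ring { size_t read, write; int data[RING_SIZE]; };

static size_t ring_size(struct ring *q) { return q->write - q->read; }
static void ring_push(struct ring *q, int v) {
  assert(ring_size(q) < RING_SIZE);
  q->data[q->write++ & RING_MASK] = v;
}
static int ring_pop(struct ring *q) {
  assert(ring_size(q));
  return q->data[q->read++ & RING_MASK];
}
/* Like local_worklist_pop_many: return a pointer to a contiguous run of up
 * to LIMIT entries, bounded by the wrap-around point. */
static size_t ring_pop_many(struct ring *q, int **objv, size_t limit) {
  size_t avail = ring_size(q);
  size_t read = q->read & RING_MASK;
  size_t contig = RING_SIZE - read;
  if (contig < avail) avail = contig;
  if (limit < avail) avail = limit;
  *objv = q->data + read;
  q->read += avail;
  return avail;
}

int main(void) {
  struct ring q = { 0, 0, { 0 } };
  for (int i = 0; i < 6; i++) ring_push(&q, i);
  int *chunk;
  size_t n = ring_pop_many(&q, &chunk, 4);
  printf("popped %zu: first=%d last=%d; then %d\n",
         n, chunk[0], chunk[n - 1], ring_pop(&q));
  return 0;
}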

1266
libguile/whippet/src/mmc.c Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,433 @@
#ifndef PARALLEL_TRACER_H
#define PARALLEL_TRACER_H
#include <pthread.h>
#include <stdatomic.h>
#include <sys/mman.h>
#include <unistd.h>
#include "assert.h"
#include "debug.h"
#include "gc-inline.h"
#include "gc-tracepoint.h"
#include "local-worklist.h"
#include "root-worklist.h"
#include "shared-worklist.h"
#include "spin.h"
#include "tracer.h"
#ifdef VERBOSE_LOGGING
#define LOG(...) fprintf (stderr, "LOG: " __VA_ARGS__)
#else
#define LOG(...) do { } while (0)
#endif
enum trace_worker_state {
TRACE_WORKER_STOPPED,
TRACE_WORKER_IDLE,
TRACE_WORKER_TRACING,
TRACE_WORKER_STOPPING,
TRACE_WORKER_DEAD
};
struct gc_heap;
struct gc_trace_worker {
struct gc_heap *heap;
struct gc_tracer *tracer;
size_t id;
size_t steal_id;
pthread_t thread;
enum trace_worker_state state;
pthread_mutex_t lock;
struct shared_worklist shared;
struct local_worklist local;
struct gc_trace_worker_data *data;
};
static inline struct gc_trace_worker_data*
gc_trace_worker_data(struct gc_trace_worker *worker) {
return worker->data;
}
#define TRACE_WORKERS_MAX_COUNT 8
struct gc_tracer {
struct gc_heap *heap;
atomic_size_t active_tracers;
size_t worker_count;
long epoch;
pthread_mutex_t lock;
pthread_cond_t cond;
int trace_roots_only;
struct root_worklist roots;
struct gc_trace_worker workers[TRACE_WORKERS_MAX_COUNT];
};
static int
trace_worker_init(struct gc_trace_worker *worker, struct gc_heap *heap,
struct gc_tracer *tracer, size_t id) {
worker->heap = heap;
worker->tracer = tracer;
worker->id = id;
worker->steal_id = 0;
worker->thread = 0;
worker->state = TRACE_WORKER_STOPPED;
pthread_mutex_init(&worker->lock, NULL);
worker->data = NULL;
local_worklist_init(&worker->local);
return shared_worklist_init(&worker->shared);
}
static void trace_worker_trace(struct gc_trace_worker *worker);
static void*
trace_worker_thread(void *data) {
struct gc_trace_worker *worker = data;
struct gc_tracer *tracer = worker->tracer;
long trace_epoch = 0;
pthread_mutex_lock(&worker->lock);
while (1) {
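    // Trace once per observed epoch bump, then park on the condition
    // variable; spurious wakeups are harmless because the epoch is
    // re-checked before tracing again.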
long epoch = atomic_load_explicit(&tracer->epoch, memory_order_acquire);
if (trace_epoch != epoch) {
trace_epoch = epoch;
trace_worker_trace(worker);
}
pthread_cond_wait(&tracer->cond, &worker->lock);
}
return NULL;
}
static int
trace_worker_spawn(struct gc_trace_worker *worker) {
if (pthread_create(&worker->thread, NULL, trace_worker_thread, worker)) {
perror("spawning tracer thread failed");
return 0;
}
return 1;
}
static int
gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap,
size_t parallelism) {
tracer->heap = heap;
atomic_init(&tracer->active_tracers, 0);
tracer->epoch = 0;
tracer->trace_roots_only = 0;
pthread_mutex_init(&tracer->lock, NULL);
pthread_cond_init(&tracer->cond, NULL);
root_worklist_init(&tracer->roots);
size_t desired_worker_count = parallelism;
ASSERT(desired_worker_count);
if (desired_worker_count > TRACE_WORKERS_MAX_COUNT)
desired_worker_count = TRACE_WORKERS_MAX_COUNT;
if (!trace_worker_init(&tracer->workers[0], heap, tracer, 0))
return 0;
tracer->worker_count++;
for (size_t i = 1; i < desired_worker_count; i++) {
if (!trace_worker_init(&tracer->workers[i], heap, tracer, i))
break;
pthread_mutex_lock(&tracer->workers[i].lock);
if (trace_worker_spawn(&tracer->workers[i]))
tracer->worker_count++;
else
break;
}
return 1;
}
static void gc_tracer_prepare(struct gc_tracer *tracer) {
for (size_t i = 0; i < tracer->worker_count; i++)
tracer->workers[i].steal_id = (i + 1) % tracer->worker_count;
}
static void gc_tracer_release(struct gc_tracer *tracer) {
for (size_t i = 0; i < tracer->worker_count; i++)
shared_worklist_release(&tracer->workers[i].shared);
}
static inline void
gc_tracer_add_root(struct gc_tracer *tracer, struct gc_root root) {
root_worklist_push(&tracer->roots, root);
}
static inline void
tracer_unpark_all_workers(struct gc_tracer *tracer) {
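  // Publish a new epoch and wake every parked worker; workers that are
  // already tracing pick up the new epoch the next time they check.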
long old_epoch =
atomic_fetch_add_explicit(&tracer->epoch, 1, memory_order_acq_rel);
long epoch = old_epoch + 1;
DEBUG("starting trace; %zu workers; epoch=%ld\n", tracer->worker_count,
epoch);
GC_TRACEPOINT(trace_unpark_all);
pthread_cond_broadcast(&tracer->cond);
}
static inline void
tracer_maybe_unpark_workers(struct gc_tracer *tracer) {
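  // Called after publishing work to a shared deque: if not all workers are
  // actively tracing, wake them all so the new work can be stolen.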
size_t active =
atomic_load_explicit(&tracer->active_tracers, memory_order_acquire);
if (active < tracer->worker_count)
tracer_unpark_all_workers(tracer);
}
static inline void
tracer_share(struct gc_trace_worker *worker) {
LOG("tracer #%zu: sharing\n", worker->id);
GC_TRACEPOINT(trace_share);
size_t to_share = LOCAL_WORKLIST_SHARE_AMOUNT;
while (to_share) {
struct gc_ref *objv;
size_t count = local_worklist_pop_many(&worker->local, &objv, to_share);
shared_worklist_push_many(&worker->shared, objv, count);
to_share -= count;
}
tracer_maybe_unpark_workers(worker->tracer);
}
static inline void
gc_trace_worker_enqueue(struct gc_trace_worker *worker, struct gc_ref ref) {
ASSERT(gc_ref_is_heap_object(ref));
if (local_worklist_full(&worker->local))
tracer_share(worker);
local_worklist_push(&worker->local, ref);
}
static struct gc_ref
tracer_steal_from_worker(struct gc_tracer *tracer, size_t id) {
ASSERT(id < tracer->worker_count);
return shared_worklist_steal(&tracer->workers[id].shared);
}
static int
tracer_can_steal_from_worker(struct gc_tracer *tracer, size_t id) {
ASSERT(id < tracer->worker_count);
return shared_worklist_can_steal(&tracer->workers[id].shared);
}
static struct gc_ref
trace_worker_steal_from_any(struct gc_trace_worker *worker,
struct gc_tracer *tracer) {
for (size_t i = 0; i < tracer->worker_count; i++) {
LOG("tracer #%zu: stealing from #%zu\n", worker->id, worker->steal_id);
struct gc_ref obj = tracer_steal_from_worker(tracer, worker->steal_id);
if (!gc_ref_is_null(obj)) {
LOG("tracer #%zu: stealing got %p\n", worker->id,
gc_ref_heap_object(obj));
return obj;
}
worker->steal_id = (worker->steal_id + 1) % tracer->worker_count;
}
LOG("tracer #%zu: failed to steal\n", worker->id);
return gc_ref_null();
}
static int
trace_worker_can_steal_from_any(struct gc_trace_worker *worker,
struct gc_tracer *tracer) {
LOG("tracer #%zu: checking if any worker has tasks\n", worker->id);
for (size_t i = 0; i < tracer->worker_count; i++) {
int res = tracer_can_steal_from_worker(tracer, worker->steal_id);
if (res) {
LOG("tracer #%zu: worker #%zu has tasks!\n", worker->id,
worker->steal_id);
return 1;
}
worker->steal_id = (worker->steal_id + 1) % tracer->worker_count;
}
LOG("tracer #%zu: nothing to steal\n", worker->id);
return 0;
}
static size_t
trace_worker_should_continue(struct gc_trace_worker *worker, size_t spin_count) {
// Helper workers should park themselves immediately if they have no work.
if (worker->id != 0)
return 0;
struct gc_tracer *tracer = worker->tracer;
if (atomic_load_explicit(&tracer->active_tracers, memory_order_acquire) != 1) {
LOG("checking for termination: tracers active, spinning #%zu\n", spin_count);
yield_for_spin(spin_count);
return 1;
}
// All trace workers have exited except us, the main worker. We are
// probably done, but we need to synchronize to be sure that there is no
// work pending, for example if a worker had a spurious wakeup. Skip
// worker 0 (the main worker).
GC_TRACEPOINT(trace_check_termination_begin);
size_t locked = 1;
while (locked < tracer->worker_count) {
if (pthread_mutex_trylock(&tracer->workers[locked].lock) == 0)
locked++;
else
break;
}
int done = (locked == tracer->worker_count) &&
!trace_worker_can_steal_from_any(worker, tracer);
GC_TRACEPOINT(trace_check_termination_end);
if (done)
return 0;
while (locked > 1)
pthread_mutex_unlock(&tracer->workers[--locked].lock);
LOG("checking for termination: failed to lock, spinning #%zu\n", spin_count);
yield_for_spin(spin_count);
return 1;
}
static struct gc_ref
trace_worker_steal(struct gc_trace_worker *worker) {
struct gc_tracer *tracer = worker->tracer;
// It could be that the worker's local trace queue has simply
// overflowed. In that case avoid contention by trying to pop
// something from the worker's own queue.
{
LOG("tracer #%zu: trying to pop worker's own deque\n", worker->id);
struct gc_ref obj = shared_worklist_try_pop(&worker->shared);
if (!gc_ref_is_null(obj))
return obj;
}
GC_TRACEPOINT(trace_steal);
LOG("tracer #%zu: trying to steal\n", worker->id);
struct gc_ref obj = trace_worker_steal_from_any(worker, tracer);
if (!gc_ref_is_null(obj))
return obj;
return gc_ref_null();
}
static void
trace_with_data(struct gc_tracer *tracer,
struct gc_heap *heap,
struct gc_trace_worker *worker,
struct gc_trace_worker_data *data) {
atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_acq_rel);
worker->data = data;
LOG("tracer #%zu: running trace loop\n", worker->id);
{
LOG("tracer #%zu: tracing roots\n", worker->id);
size_t n = 0;
do {
struct gc_root root = root_worklist_pop(&tracer->roots);
if (root.kind == GC_ROOT_KIND_NONE)
break;
trace_root(root, heap, worker);
n++;
} while (1);
LOG("tracer #%zu: done tracing roots, %zu roots traced\n", worker->id, n);
}
if (tracer->trace_roots_only) {
// Unlike the full trace where work is generated during the trace, a
// roots-only trace consumes work monotonically; any object enqueued as a
// result of marking roots isn't ours to deal with. However we do need to
// synchronize with remote workers to ensure they have completed their
// work items.
if (worker->id == 0) {
for (size_t i = 1; i < tracer->worker_count; i++)
pthread_mutex_lock(&tracer->workers[i].lock);
}
} else {
LOG("tracer #%zu: tracing objects\n", worker->id);
GC_TRACEPOINT(trace_objects_begin);
size_t n = 0;
size_t spin_count = 0;
do {
while (1) {
struct gc_ref ref;
if (!local_worklist_empty(&worker->local)) {
ref = local_worklist_pop(&worker->local);
} else {
ref = trace_worker_steal(worker);
if (gc_ref_is_null(ref))
break;
}
trace_one(ref, heap, worker);
n++;
}
} while (trace_worker_should_continue(worker, spin_count++));
GC_TRACEPOINT(trace_objects_end);
LOG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n);
}
worker->data = NULL;
atomic_fetch_sub_explicit(&tracer->active_tracers, 1, memory_order_acq_rel);
}
static void
trace_worker_trace(struct gc_trace_worker *worker) {
GC_TRACEPOINT(trace_worker_begin);
gc_trace_worker_call_with_data(trace_with_data, worker->tracer,
worker->heap, worker);
GC_TRACEPOINT(trace_worker_end);
}
static inline int
gc_tracer_should_parallelize(struct gc_tracer *tracer) {
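  // Heuristic: wake the helpers only if there is enough work to share --
  // more than one pending root, more than one nonempty shared worklist, or
  // a single worklist holding at least the parallel threshold of objects.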
if (root_worklist_size(&tracer->roots) > 1)
return 1;
if (tracer->trace_roots_only)
return 0;
size_t nonempty_worklists = 0;
ssize_t parallel_threshold =
LOCAL_WORKLIST_SIZE - LOCAL_WORKLIST_SHARE_AMOUNT;
for (size_t i = 0; i < tracer->worker_count; i++) {
ssize_t size = shared_worklist_size(&tracer->workers[i].shared);
if (!size)
continue;
nonempty_worklists++;
if (nonempty_worklists > 1)
return 1;
if (size >= parallel_threshold)
return 1;
}
return 0;
}
static inline void
gc_tracer_trace(struct gc_tracer *tracer) {
LOG("starting trace; %zu workers\n", tracer->worker_count);
for (int i = 1; i < tracer->worker_count; i++)
pthread_mutex_unlock(&tracer->workers[i].lock);
if (gc_tracer_should_parallelize(tracer)) {
LOG("waking workers\n");
tracer_unpark_all_workers(tracer);
} else {
LOG("starting in local-only mode\n");
}
trace_worker_trace(&tracer->workers[0]);
root_worklist_reset(&tracer->roots);
LOG("trace finished\n");
}
static inline void
gc_tracer_trace_roots(struct gc_tracer *tracer) {
LOG("starting roots-only trace\n");
GC_TRACEPOINT(trace_roots_begin);
tracer->trace_roots_only = 1;
gc_tracer_trace(tracer);
tracer->trace_roots_only = 0;
GC_TRACEPOINT(trace_roots_end);
GC_ASSERT_EQ(atomic_load(&tracer->active_tracers), 0);
LOG("roots-only trace finished\n");
}
#endif // PARALLEL_TRACER_H
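Worker wakeup in the tracer is epoch-driven: gc_tracer_trace bumps an epoch counter and broadcasts a condition variable, and each parked worker re-runs its trace loop whenever it observes a new epoch, so spurious wakeups cost nothing. The following is a hedged, standalone sketch of that protocol, simplified to one shared lock (the real tracer gives each worker its own lock, which also feeds the termination check); do_work and the other names are illustrative only.

/* Sketch of the epoch/condvar wakeup used by tracer_unpark_all_workers and
 * trace_worker_thread, collapsed to a single lock and a single worker. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_long epoch;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int done;

static void do_work(long e) { printf("worker: tracing for epoch %ld\n", e); }

static void* worker(void *arg) {
  long seen = 0;
  pthread_mutex_lock(&lock);
  while (!done) {
    long e = atomic_load_explicit(&epoch, memory_order_acquire);
    if (e != seen) {
      seen = e;
      do_work(e);
    } else {
      pthread_cond_wait(&cond, &lock);
    }
  }
  pthread_mutex_unlock(&lock);
  return NULL;
}

static void unpark_all(void) {
  pthread_mutex_lock(&lock);
  atomic_fetch_add_explicit(&epoch, 1, memory_order_acq_rel);
  pthread_cond_broadcast(&cond);
  pthread_mutex_unlock(&lock);
}

int main(void) {
  pthread_t t;
  pthread_create(&t, NULL, worker, NULL);
  for (int i = 0; i < 3; i++) {
    unpark_all();
    usleep(1000);
  }
  pthread_mutex_lock(&lock);
  done = 1;
  pthread_cond_broadcast(&cond);
  pthread_mutex_unlock(&lock);
  pthread_join(t, NULL);
  return 0;
}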

1340
libguile/whippet/src/pcc.c Normal file

File diff suppressed because it is too large


@ -0,0 +1,76 @@
#ifndef ROOT_WORKLIST_H
#define ROOT_WORKLIST_H
#include <stdatomic.h>
#include <sys/mman.h>
#include <unistd.h>
#include "assert.h"
#include "debug.h"
#include "gc-inline.h"
#include "gc-ref.h"
#include "root.h"
// A single-producer, multiple-consumer worklist that has two phases:
// one in which roots are added by the producer, then one in which roots
// are consumed from the worklist. Roots are never added once the
// consumer phase starts.
struct root_worklist {
size_t size;
size_t read;
size_t write;
struct gc_root *buf;
};
void
root_worklist_alloc(struct root_worklist *q) {
q->buf = realloc(q->buf, q->size * sizeof(struct gc_root));
if (!q->buf) {
perror("Failed to grow root worklist");
GC_CRASH();
}
}
static void
root_worklist_init(struct root_worklist *q) {
q->size = 16;
q->read = 0;
q->write = 0;
q->buf = NULL;
root_worklist_alloc(q);
}
static inline void
root_worklist_push(struct root_worklist *q, struct gc_root root) {
if (UNLIKELY(q->write == q->size)) {
q->size *= 2;
root_worklist_alloc(q);
}
q->buf[q->write++] = root;
}
// Not atomic.
static inline size_t
root_worklist_size(struct root_worklist *q) {
return q->write - q->read;
}
static inline struct gc_root
root_worklist_pop(struct root_worklist *q) {
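  // Each concurrent consumer claims a distinct index; an index at or past
  // the write pointer means the worklist is drained.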
size_t idx = atomic_fetch_add(&q->read, 1);
if (idx < q->write)
return q->buf[idx];
return (struct gc_root){ GC_ROOT_KIND_NONE, };
}
static void
root_worklist_reset(struct root_worklist *q) {
q->read = q->write = 0;
}
static void
root_worklist_destroy(struct root_worklist *q) {
free(q->buf);
}
#endif // ROOT_WORKLIST_H
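Draining the root worklist is lock-free on the consumer side: each consumer claims the next index with an atomic fetch-and-add and stops once it reads past the write pointer, which never moves during the consumer phase. The standalone sketch below shows the same claim protocol with plain integers instead of struct gc_root; the item type, thread count, and names are illustrative, not Whippet's.

/* Sketch of the two-phase protocol above: one thread pushes all items,
 * then several threads claim items by fetch-adding the read index. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define N_ITEMS 100
#define N_THREADS 4

static int items[N_ITEMS];
static size_t n_items;            /* producer phase: single writer */
static atomic_size_t next_item;   /* consumer phase: shared read index */

static void push(int v) { items[n_items++] = v; }

static void* drain(void *arg) {
  intptr_t sum = 0;
  for (;;) {
    size_t idx = atomic_fetch_add(&next_item, 1);
    if (idx >= n_items)           /* past the write pointer: nothing left */
      break;
    sum += items[idx];
  }
  return (void*)sum;
}

int main(void) {
  for (int i = 1; i <= N_ITEMS; i++)
    push(i);
  pthread_t threads[N_THREADS];
  for (int i = 0; i < N_THREADS; i++)
    pthread_create(&threads[i], NULL, drain, NULL);
  intptr_t total = 0;
  for (int i = 0; i < N_THREADS; i++) {
    void *ret;
    pthread_join(threads[i], &ret);
    total += (intptr_t)ret;
  }
  printf("sum = %ld (expect %d)\n", (long)total, N_ITEMS * (N_ITEMS + 1) / 2);
  return 0;
}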


@ -0,0 +1,81 @@
#ifndef ROOT_H
#define ROOT_H
#include "gc-edge.h"
#include "extents.h"
struct gc_ephemeron;
struct gc_heap;
struct gc_mutator;
struct gc_edge_buffer;
enum gc_root_kind {
GC_ROOT_KIND_NONE,
GC_ROOT_KIND_HEAP,
GC_ROOT_KIND_MUTATOR,
GC_ROOT_KIND_CONSERVATIVE_EDGES,
GC_ROOT_KIND_CONSERVATIVE_POSSIBLY_INTERIOR_EDGES,
GC_ROOT_KIND_RESOLVED_EPHEMERONS,
GC_ROOT_KIND_EDGE,
GC_ROOT_KIND_EDGE_BUFFER,
};
struct gc_root {
enum gc_root_kind kind;
union {
struct gc_heap *heap;
struct gc_mutator *mutator;
struct gc_ephemeron *resolved_ephemerons;
struct extent_range range;
struct gc_edge edge;
struct gc_edge_buffer *edge_buffer;
};
};
static inline struct gc_root
gc_root_heap(struct gc_heap* heap) {
struct gc_root ret = { GC_ROOT_KIND_HEAP };
ret.heap = heap;
return ret;
}
static inline struct gc_root
gc_root_mutator(struct gc_mutator* mutator) {
struct gc_root ret = { GC_ROOT_KIND_MUTATOR };
ret.mutator = mutator;
return ret;
}
static inline struct gc_root
gc_root_conservative_edges(uintptr_t lo_addr, uintptr_t hi_addr,
int possibly_interior) {
enum gc_root_kind kind = possibly_interior
? GC_ROOT_KIND_CONSERVATIVE_POSSIBLY_INTERIOR_EDGES
: GC_ROOT_KIND_CONSERVATIVE_EDGES;
struct gc_root ret = { kind };
ret.range = (struct extent_range) {lo_addr, hi_addr};
return ret;
}
static inline struct gc_root
gc_root_resolved_ephemerons(struct gc_ephemeron* resolved) {
struct gc_root ret = { GC_ROOT_KIND_RESOLVED_EPHEMERONS };
ret.resolved_ephemerons = resolved;
return ret;
}
static inline struct gc_root
gc_root_edge(struct gc_edge edge) {
struct gc_root ret = { GC_ROOT_KIND_EDGE };
ret.edge = edge;
return ret;
}
static inline struct gc_root
gc_root_edge_buffer(struct gc_edge_buffer *buf) {
struct gc_root ret = { GC_ROOT_KIND_EDGE_BUFFER };
ret.edge_buffer = buf;
return ret;
}
#endif // ROOT_H
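A gc_root is a tagged union: each constructor sets the kind and fills in the matching union member, and consumers (such as the trace_root hook invoked from parallel-tracer.h) dispatch on the kind. The sketch below shows the same construct-and-dispatch pattern in a self-contained form; the root kinds, handle_root, and the payload types here are invented for illustration and are not Whippet's.

/* Sketch of the tagged-union root pattern: constructors tag, consumers
 * switch on the tag. Requires C11 for the anonymous union. */
#include <stdio.h>

enum root_kind { ROOT_NONE, ROOT_HEAP, ROOT_EDGE };
struct heap;
struct edge { void **loc; };

struct root {
  enum root_kind kind;
  union { struct heap *heap; struct edge edge; };
};

static struct root root_heap(struct heap *h) {
  struct root r = { ROOT_HEAP }; r.heap = h; return r;
}
static struct root root_edge(struct edge e) {
  struct root r = { ROOT_EDGE }; r.edge = e; return r;
}

static void handle_root(struct root r) {
  switch (r.kind) {
  case ROOT_HEAP: printf("visit global heap roots\n"); break;
  case ROOT_EDGE: printf("visit single edge at %p\n", (void*)r.edge.loc); break;
  default: printf("no root\n"); break;
  }
}

int main(void) {
  void *slot = NULL;
  handle_root(root_heap(NULL));
  handle_root(root_edge((struct edge){ &slot }));
  handle_root((struct root){ ROOT_NONE });
  return 0;
}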

Some files were not shown because too many files have changed in this diff.