From bdf4b27733dd5a7c96e4d0b7a133a92cfb9f9d13 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 21 Feb 2022 21:05:18 +0100 Subject: [PATCH 001/403] Initial commit --- GCBench.c | 296 +++++++++++++++++++++++++++++++++++++ MT_GCBench.c | 341 ++++++++++++++++++++++++++++++++++++++++++ MT_GCBench2.c | 398 ++++++++++++++++++++++++++++++++++++++++++++++++++ Makefile | 29 ++++ bdw.h | 13 ++ 5 files changed, 1077 insertions(+) create mode 100644 GCBench.c create mode 100644 MT_GCBench.c create mode 100644 MT_GCBench2.c create mode 100644 Makefile create mode 100644 bdw.h diff --git a/GCBench.c b/GCBench.c new file mode 100644 index 000000000..c9e77d191 --- /dev/null +++ b/GCBench.c @@ -0,0 +1,296 @@ +// This is adapted from a benchmark written by John Ellis and Pete Kovac +// of Post Communications. +// It was modified by Hans Boehm of Silicon Graphics. +// Translated to C++ 30 May 1997 by William D Clinger of Northeastern Univ. +// Translated to C 15 March 2000 by Hans Boehm, now at HP Labs. +// +// This is no substitute for real applications. No actual application +// is likely to behave in exactly this way. However, this benchmark was +// designed to be more representative of real applications than other +// Java GC benchmarks of which we are aware. +// It attempts to model those properties of allocation requests that +// are important to current GC techniques. +// It is designed to be used either to obtain a single overall performance +// number, or to give a more detailed estimate of how collector +// performance varies with object lifetimes. It prints the time +// required to allocate and collect balanced binary trees of various +// sizes. Smaller trees result in shorter object lifetimes. Each cycle +// allocates roughly the same amount of memory. +// Two data structures are kept around during the entire process, so +// that the measured performance is representative of applications +// that maintain some live in-memory data. One of these is a tree +// containing many pointers. The other is a large array containing +// double precision floating point numbers. Both should be of comparable +// size. +// +// The results are only really meaningful together with a specification +// of how much memory was used. It is possible to trade memory for +// better time performance. This benchmark should be run in a 32 MB +// heap, though we don't currently know how to enforce that uniformly. +// +// Unlike the original Ellis and Kovac benchmark, we do not attempt +// measure pause times. This facility should eventually be added back +// in. There are several reasons for omitting it for now. The original +// implementation depended on assumptions about the thread scheduler +// that don't hold uniformly. The results really measure both the +// scheduler and GC. Pause time measurements tend to not fit well with +// current benchmark suites. As far as we know, none of the current +// commercial Java implementations seriously attempt to minimize GC pause +// times. + +#include +#include +#include + +#ifdef GC +# include "gc.h" +#endif + +#ifdef PROFIL + extern void init_profiling(); + extern dump_profile(); +#endif + +// These macros were a quick hack for the Macintosh. 
+// +// #define currentTime() clock() +// #define elapsedTime(x) ((1000*(x))/CLOCKS_PER_SEC) + +#define currentTime() stats_rtclock() +#define elapsedTime(x) (x) + +/* Get the current time in milliseconds */ + +unsigned +stats_rtclock( void ) +{ + struct timeval t; + struct timezone tz; + + if (gettimeofday( &t, &tz ) == -1) + return 0; + return (t.tv_sec * 1000 + t.tv_usec / 1000); +} + +static const int kStretchTreeDepth = 18; // about 16Mb +static const int kLongLivedTreeDepth = 16; // about 4Mb +static const int kArraySize = 500000; // about 4Mb +static const int kMinTreeDepth = 4; +static const int kMaxTreeDepth = 16; + +typedef struct Node0_struct { + struct Node0_struct * left; + struct Node0_struct * right; + int i, j; +} Node0; + +#ifdef HOLES +# define HOLE() GC_NEW(Node0); +#else +# define HOLE() +#endif + +typedef Node0 *Node; + +void init_Node(Node me, Node l, Node r) { + me -> left = l; + me -> right = r; +} + +#ifndef GC + void destroy_Node(Node me) { + if (me -> left) { + destroy_Node(me -> left); + } + if (me -> right) { + destroy_Node(me -> right); + } + free(me); + } +#endif + +// Nodes used by a tree of a given size +static int TreeSize(int i) { + return ((1 << (i + 1)) - 1); +} + +// Number of iterations to use for a given tree depth +static int NumIters(int i) { + return 2 * TreeSize(kStretchTreeDepth) / TreeSize(i); +} + +// Build tree top down, assigning to older objects. +static void Populate(int iDepth, Node thisNode) { + if (iDepth<=0) { + return; + } else { + iDepth--; +# ifdef GC + thisNode->left = GC_NEW(Node0); HOLE(); + thisNode->right = GC_NEW(Node0); HOLE(); +# else + thisNode->left = calloc(1, sizeof(Node0)); + thisNode->right = calloc(1, sizeof(Node0)); +# endif + Populate (iDepth, thisNode->left); + Populate (iDepth, thisNode->right); + } +} + +// Build tree bottom-up +static Node MakeTree(int iDepth) { + Node result; + if (iDepth<=0) { +# ifndef GC + result = calloc(1, sizeof(Node0)); +# else + result = GC_NEW(Node0); HOLE(); +# endif + /* result is implicitly initialized in both cases. 
*/ + return result; + } else { + Node left = MakeTree(iDepth-1); + Node right = MakeTree(iDepth-1); +# ifndef GC + result = malloc(sizeof(Node0)); +# else + result = GC_NEW(Node0); HOLE(); +# endif + init_Node(result, left, right); + return result; + } +} + +static void PrintDiagnostics() { +#if 0 + long lFreeMemory = Runtime.getRuntime().freeMemory(); + long lTotalMemory = Runtime.getRuntime().totalMemory(); + + System.out.print(" Total memory available=" + + lTotalMemory + " bytes"); + System.out.println(" Free memory=" + lFreeMemory + " bytes"); +#endif +} + +static void TimeConstruction(int depth) { + long tStart, tFinish; + int iNumIters = NumIters(depth); + Node tempTree; + int i; + + printf("Creating %d trees of depth %d\n", iNumIters, depth); + + tStart = currentTime(); + for (i = 0; i < iNumIters; ++i) { +# ifndef GC + tempTree = calloc(1, sizeof(Node0)); +# else + tempTree = GC_NEW(Node0); +# endif + Populate(depth, tempTree); +# ifndef GC + destroy_Node(tempTree); +# endif + tempTree = 0; + } + tFinish = currentTime(); + printf("\tTop down construction took %d msec\n", + elapsedTime(tFinish - tStart)); + + tStart = currentTime(); + for (i = 0; i < iNumIters; ++i) { + tempTree = MakeTree(depth); +# ifndef GC + destroy_Node(tempTree); +# endif + tempTree = 0; + } + tFinish = currentTime(); + printf("\tBottom up construction took %d msec\n", + elapsedTime(tFinish - tStart)); + +} + +int main() { + Node root; + Node longLivedTree; + Node tempTree; + long tStart, tFinish; + long tElapsed; + int i, d; + double *array; + +#ifdef GC + // GC_full_freq = 30; + // GC_free_space_divisor = 16; + // GC_enable_incremental(); +#endif + printf("Garbage Collector Test\n"); + printf(" Live storage will peak at %d bytes.\n\n", + 2 * sizeof(Node0) * TreeSize(kLongLivedTreeDepth) + + sizeof(double) * kArraySize); + printf(" Stretching memory with a binary tree of depth %d\n", + kStretchTreeDepth); + PrintDiagnostics(); +# ifdef PROFIL + init_profiling(); +# endif + + tStart = currentTime(); + + // Stretch the memory space quickly + tempTree = MakeTree(kStretchTreeDepth); +# ifndef GC + destroy_Node(tempTree); +# endif + tempTree = 0; + + // Create a long lived object + printf(" Creating a long-lived binary tree of depth %d\n", + kLongLivedTreeDepth); +# ifndef GC + longLivedTree = calloc(1, sizeof(Node0)); +# else + longLivedTree = GC_NEW(Node0); +# endif + Populate(kLongLivedTreeDepth, longLivedTree); + + // Create long-lived array, filling half of it + printf(" Creating a long-lived array of %d doubles\n", kArraySize); +# ifndef GC + array = malloc(kArraySize * sizeof(double)); +# else +# ifndef NO_PTRFREE + array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); +# else + array = GC_MALLOC(sizeof(double) * kArraySize); +# endif +# endif + for (i = 0; i < kArraySize/2; ++i) { + array[i] = 1.0/i; + } + PrintDiagnostics(); + + for (d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { + TimeConstruction(d); + } + + if (longLivedTree == 0 || array[1000] != 1.0/1000) + fprintf(stderr, "Failed\n"); + // fake reference to LongLivedTree + // and array + // to keep them from being optimized away + + tFinish = currentTime(); + tElapsed = elapsedTime(tFinish-tStart); + PrintDiagnostics(); + printf("Completed in %d msec\n", tElapsed); +# ifdef GC + printf("Completed %d collections\n", GC_gc_no); + printf("Heap size is %d\n", GC_get_heap_size()); +# endif +# ifdef PROFIL + dump_profile(); +# endif +} + diff --git a/MT_GCBench.c b/MT_GCBench.c new file mode 100644 index 000000000..ba3a594f9 --- /dev/null +++ 
b/MT_GCBench.c @@ -0,0 +1,341 @@ +// This is adapted from a benchmark written by John Ellis and Pete Kovac +// of Post Communications. +// It was modified by Hans Boehm of Silicon Graphics. +// Translated to C++ 30 May 1997 by William D Clinger of Northeastern Univ. +// Translated to C 15 March 2000 by Hans Boehm, now at HP Labs. +// Adapted to run NTHREADS client threads concurrently. Each +// thread executes the original benchmark. 12 June 2000 by Hans Boehm. +// +// This is no substitute for real applications. No actual application +// is likely to behave in exactly this way. However, this benchmark was +// designed to be more representative of real applications than other +// Java GC benchmarks of which we are aware. +// It attempts to model those properties of allocation requests that +// are important to current GC techniques. +// It is designed to be used either to obtain a single overall performance +// number, or to give a more detailed estimate of how collector +// performance varies with object lifetimes. It prints the time +// required to allocate and collect balanced binary trees of various +// sizes. Smaller trees result in shorter object lifetimes. Each cycle +// allocates roughly the same amount of memory. +// Two data structures are kept around during the entire process, so +// that the measured performance is representative of applications +// that maintain some live in-memory data. One of these is a tree +// containing many pointers. The other is a large array containing +// double precision floating point numbers. Both should be of comparable +// size. +// +// The results are only really meaningful together with a specification +// of how much memory was used. It is possible to trade memory for +// better time performance. This benchmark should be run in a 32 MB +// heap, though we don't currently know how to enforce that uniformly. +// +// Unlike the original Ellis and Kovac benchmark, we do not attempt +// measure pause times. This facility should eventually be added back +// in. There are several reasons for omitting it for now. The original +// implementation depended on assumptions about the thread scheduler +// that don't hold uniformly. The results really measure both the +// scheduler and GC. Pause time measurements tend to not fit well with +// current benchmark suites. As far as we know, none of the current +// commercial Java implementations seriously attempt to minimize GC pause +// times. + +#include +#include +#include +#include + +#ifdef GC +# ifndef LINUX_THREADS +# define LINUX_THREADS +# endif +# ifndef _REENTRANT +# define _REENTRANT +# endif +# ifdef LOCAL +# define GC_REDIRECT_TO_LOCAL +# include "gc_local_alloc.h" +# endif +# include "gc.h" +#endif + + +#ifndef NTHREADS +# define NTHREADS 1 +#endif + +#ifdef PROFIL + extern void init_profiling(); + extern dump_profile(); +#endif + +// These macros were a quick hack for the Macintosh. 
+// +// #define currentTime() clock() +// #define elapsedTime(x) ((1000*(x))/CLOCKS_PER_SEC) + +#define currentTime() stats_rtclock() +#define elapsedTime(x) (x) + +/* Get the current time in milliseconds */ + +unsigned +stats_rtclock( void ) +{ + struct timeval t; + struct timezone tz; + + if (gettimeofday( &t, &tz ) == -1) + return 0; + return (t.tv_sec * 1000 + t.tv_usec / 1000); +} + +static const int kStretchTreeDepth = 18; // about 16Mb +static const int kLongLivedTreeDepth = 16; // about 4Mb +static const int kArraySize = 500000; // about 4Mb +static const int kMinTreeDepth = 4; +static const int kMaxTreeDepth = 16; + +typedef struct Node0_struct { + struct Node0_struct * left; + struct Node0_struct * right; + int i, j; +} Node0; + +#ifdef HOLES +# define HOLE() GC_NEW(Node0); +#else +# define HOLE() +#endif + +typedef Node0 *Node; + +void init_Node(Node me, Node l, Node r) { + me -> left = l; + me -> right = r; +} + +#ifndef GC + void destroy_Node(Node me) { + if (me -> left) { + destroy_Node(me -> left); + } + if (me -> right) { + destroy_Node(me -> right); + } + free(me); + } +#endif + +// Nodes used by a tree of a given size +static int TreeSize(int i) { + return ((1 << (i + 1)) - 1); +} + +// Number of iterations to use for a given tree depth +static int NumIters(int i) { + return 2 * TreeSize(kStretchTreeDepth) / TreeSize(i); +} + +// Build tree top down, assigning to older objects. +static void Populate(int iDepth, Node thisNode) { + if (iDepth<=0) { + return; + } else { + iDepth--; +# ifdef GC + thisNode->left = GC_NEW(Node0); HOLE(); + thisNode->right = GC_NEW(Node0); HOLE(); +# else + thisNode->left = calloc(1, sizeof(Node0)); + thisNode->right = calloc(1, sizeof(Node0)); +# endif + Populate (iDepth, thisNode->left); + Populate (iDepth, thisNode->right); + } +} + +// Build tree bottom-up +static Node MakeTree(int iDepth) { + Node result; + if (iDepth<=0) { +# ifndef GC + result = calloc(1, sizeof(Node0)); +# else + result = GC_NEW(Node0); HOLE(); +# endif + /* result is implicitly initialized in both cases. 
*/ + return result; + } else { + Node left = MakeTree(iDepth-1); + Node right = MakeTree(iDepth-1); +# ifndef GC + result = malloc(sizeof(Node0)); +# else + result = GC_NEW(Node0); HOLE(); +# endif + init_Node(result, left, right); + return result; + } +} + +static void PrintDiagnostics() { +#if 0 + long lFreeMemory = Runtime.getRuntime().freeMemory(); + long lTotalMemory = Runtime.getRuntime().totalMemory(); + + System.out.print(" Total memory available=" + + lTotalMemory + " bytes"); + System.out.println(" Free memory=" + lFreeMemory + " bytes"); +#endif +} + +static void TimeConstruction(int depth) { + long tStart, tFinish; + int iNumIters = NumIters(depth); + Node tempTree; + int i; + + printf("0x%x: Creating %d trees of depth %d\n", pthread_self(), iNumIters, depth); + + tStart = currentTime(); + for (i = 0; i < iNumIters; ++i) { +# ifndef GC + tempTree = calloc(1, sizeof(Node0)); +# else + tempTree = GC_NEW(Node0); +# endif + Populate(depth, tempTree); +# ifndef GC + destroy_Node(tempTree); +# endif + tempTree = 0; + } + tFinish = currentTime(); + printf("\t0x%x: Top down construction took %d msec\n", + pthread_self(), elapsedTime(tFinish - tStart)); + + tStart = currentTime(); + for (i = 0; i < iNumIters; ++i) { + tempTree = MakeTree(depth); +# ifndef GC + destroy_Node(tempTree); +# endif + tempTree = 0; + } + tFinish = currentTime(); + printf("\t0x%x: Bottom up construction took %d msec\n", + pthread_self(), elapsedTime(tFinish - tStart)); + +} + +void * run_one_test(void * arg) { + int d; + for (d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { + TimeConstruction(d); + } +} + +int main() { + Node root; + Node longLivedTree; + Node tempTree; + long tStart, tFinish; + long tElapsed; + int i; + double *array; + +#ifdef GC + // GC_full_freq = 30; + // GC_free_space_divisor = 16; + // GC_enable_incremental(); +#endif +# if defined(GC) && defined(LOCAL) + GC_thr_init(); +# endif + printf("Garbage Collector Test\n"); + printf(" Live storage will peak at %d bytes.\n\n", + 2 * sizeof(Node0) * TreeSize(kLongLivedTreeDepth) + + sizeof(double) * kArraySize); + printf(" Stretching memory with a binary tree of depth %d\n", + kStretchTreeDepth); + PrintDiagnostics(); +# ifdef PROFIL + init_profiling(); +# endif + + tStart = currentTime(); + + // Stretch the memory space quickly + tempTree = MakeTree(kStretchTreeDepth); +# ifndef GC + destroy_Node(tempTree); +# endif + tempTree = 0; + + // Create a long lived object + printf(" Creating a long-lived binary tree of depth %d\n", + kLongLivedTreeDepth); +# ifndef GC + longLivedTree = calloc(1, sizeof(Node0)); +# else + longLivedTree = GC_NEW(Node0); +# endif + Populate(kLongLivedTreeDepth, longLivedTree); + + // Create long-lived array, filling half of it + printf(" Creating a long-lived array of %d doubles\n", kArraySize); +# ifndef GC + array = malloc(kArraySize * sizeof(double)); +# else +# ifndef NO_PTRFREE + array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); +# else + array = GC_MALLOC(sizeof(double) * kArraySize); +# endif +# endif + for (i = 0; i < kArraySize/2; ++i) { + array[i] = 1.0/i; + } + + { + pthread_t thread[NTHREADS]; + for (i = 1; i < NTHREADS; ++i) { + int code; + + if ((code = pthread_create(thread+i, 0, run_one_test, 0)) != 0) { + fprintf(stderr, "Thread creation failed %u\n", code); + exit(1); + } + } + /* We use the main thread to run one test. This allows */ + /* profiling to work, for example. 
*/ + run_one_test(0); + for (i = 1; i < NTHREADS; ++i) { + int code; + if ((code = pthread_join(thread[i], 0)) != 0) { + fprintf(stderr, "Thread join failed %u\n", code); + } + } + } + PrintDiagnostics(); + + if (longLivedTree == 0 || array[1000] != 1.0/1000) + fprintf(stderr, "Failed\n"); + // fake reference to LongLivedTree + // and array + // to keep them from being optimized away + + tFinish = currentTime(); + tElapsed = elapsedTime(tFinish-tStart); + PrintDiagnostics(); + printf("Completed in %d msec\n", tElapsed); +# ifdef GC + printf("Completed %d collections\n", GC_gc_no); + printf("Heap size is %d\n", GC_get_heap_size()); +# endif +# ifdef PROFIL + dump_profile(); +# endif +} + diff --git a/MT_GCBench2.c b/MT_GCBench2.c new file mode 100644 index 000000000..07fe7e3a5 --- /dev/null +++ b/MT_GCBench2.c @@ -0,0 +1,398 @@ +// This is version 2 of the multithreaded GC Bench. +// Heap expansion is handled differently from version 1, in an attempt +// to make scalability measurements more meaningful. The version with +// N threads now immediately expands the heap to N*32MB. +// +// To run this with BDWGC versions 6 and later with thread local allocation, +// define GC and LOCAL. Without thread-local allocation, define just GC. +// To run it with the University of Tokyo scalable GC, +// define SGC. To run it with malloc and explicit deallocation, define +// none of these. (This should also work for Hoard.) +// +// Note that defining GC or SGC removes the explicit deallocation passes, +// which seems fair. +// +// This is adapted from a benchmark written by John Ellis and Pete Kovac +// of Post Communications. +// It was modified by Hans Boehm of Silicon Graphics. +// Translated to C++ 30 May 1997 by William D Clinger of Northeastern Univ. +// Translated to C 15 March 2000 by Hans Boehm, now at HP Labs. +// Adapted to run NTHREADS client threads concurrently. Each +// thread executes the original benchmark. 12 June 2000 by Hans Boehm. +// Changed heap expansion rule, and made the number of threads run-time +// configurable. 25 Oct 2000 by Hans Boehm. +// +// This is no substitute for real applications. No actual application +// is likely to behave in exactly this way. However, this benchmark was +// designed to be more representative of real applications than other +// Java GC benchmarks of which we were aware at the time. +// It still doesn't seem too bad for something this small. +// It attempts to model those properties of allocation requests that +// are important to current GC techniques. +// It is designed to be used either to obtain a single overall performance +// number, or to give a more detailed estimate of how collector +// performance varies with object lifetimes. It prints the time +// required to allocate and collect balanced binary trees of various +// sizes. Smaller trees result in shorter object lifetimes. Each cycle +// allocates roughly the same amount of memory. +// Two data structures are kept around during the entire process, so +// that the measured performance is representative of applications +// that maintain some live in-memory data. One of these is a tree +// containing many pointers. The other is a large array containing +// double precision floating point numbers. Both should be of comparable +// size. +// +// The results are only really meaningful together with a specification +// of how much memory was used. This versions of the benchmark tries +// to preallocate a sufficiently large heap that expansion should not be +// needed. 
+// +// Unlike the original Ellis and Kovac benchmark, we do not attempt +// measure pause times. This facility should eventually be added back +// in. There are several reasons for omitting it for now. The original +// implementation depended on assumptions about the thread scheduler +// that don't hold uniformly. The results really measure both the +// scheduler and GC. Pause time measurements tend to not fit well with +// current benchmark suites. As far as we know, none of the current +// commercial Java implementations seriously attempt to minimize GC pause +// times. +// +// Since this benchmark has recently been more widely used, some +// anomalous behavious has been uncovered. The user should be aware +// of this: +// 1) Nearly all objects are of the same size. This benchmark is +// not useful for analyzing fragmentation behavior. It is unclear +// whether this is an issue for well-designed allocators. +// 2) Unless HOLES is defined, it tends to drop consecutively allocated +// memory at the same time. Many real applications do exhibit this +// phenomenon, but probably not to this extent. (Defining HOLES tends +// to move the benchmark to the opposite extreme.) +// 3) It appears harder to predict object lifetimes than for most real +// Java programs (see T. Harris, "Dynamic adptive pre-tenuring", +// ISMM '00). + +#include +#include +#include +#include + +#ifdef GC +# ifndef LINUX_THREADS +# define LINUX_THREADS +# endif +# ifndef _REENTRANT +# define _REENTRANT +# endif +# ifdef LOCAL +# define GC_REDIRECT_TO_LOCAL +# include "gc_local_alloc.h" +# endif +# include "gc.h" +#endif +#ifdef SGC +# include "sgc.h" +# define GC +# define pthread_create GC_pthread_create +# define pthread_join GC_pthread_join +#endif + +#define MAX_NTHREADS 1024 + +int nthreads = 0; + +#ifdef PROFIL + extern void init_profiling(); + extern dump_profile(); +#endif + +// These macros were a quick hack for the Macintosh. +// +// #define currentTime() clock() +// #define elapsedTime(x) ((1000*(x))/CLOCKS_PER_SEC) + +#define currentTime() stats_rtclock() +#define elapsedTime(x) (x) + +/* Get the current time in milliseconds */ + +unsigned +stats_rtclock( void ) +{ + struct timeval t; + struct timezone tz; + + if (gettimeofday( &t, &tz ) == -1) + return 0; + return (t.tv_sec * 1000 + t.tv_usec / 1000); +} + +static const int kStretchTreeDepth = 18; // about 16Mb +static const int kLongLivedTreeDepth = 16; // about 4Mb +static const int kArraySize = 500000; // about 4Mb +static const int kMinTreeDepth = 4; +static const int kMaxTreeDepth = 16; + +typedef struct Node0_struct { + struct Node0_struct * left; + struct Node0_struct * right; + int i, j; +} Node0; + +#ifdef HOLES +# define HOLE() GC_NEW(Node0); +#else +# define HOLE() +#endif + +typedef Node0 *Node; + +void init_Node(Node me, Node l, Node r) { + me -> left = l; + me -> right = r; +} + +#ifndef GC + void destroy_Node(Node me) { + if (me -> left) { + destroy_Node(me -> left); + } + if (me -> right) { + destroy_Node(me -> right); + } + free(me); + } +#endif + +// Nodes used by a tree of a given size +static int TreeSize(int i) { + return ((1 << (i + 1)) - 1); +} + +// Number of iterations to use for a given tree depth +static int NumIters(int i) { + return 2 * TreeSize(kStretchTreeDepth) / TreeSize(i); +} + +// Build tree top down, assigning to older objects. 
+static void Populate(int iDepth, Node thisNode) { + if (iDepth<=0) { + return; + } else { + iDepth--; +# ifdef GC + thisNode->left = GC_NEW(Node0); HOLE(); + thisNode->right = GC_NEW(Node0); HOLE(); +# else + thisNode->left = calloc(1, sizeof(Node0)); + thisNode->right = calloc(1, sizeof(Node0)); +# endif + Populate (iDepth, thisNode->left); + Populate (iDepth, thisNode->right); + } +} + +// Build tree bottom-up +static Node MakeTree(int iDepth) { + Node result; + if (iDepth<=0) { +# ifndef GC + result = calloc(1, sizeof(Node0)); +# else + result = GC_NEW(Node0); HOLE(); +# endif + /* result is implicitly initialized in both cases. */ + return result; + } else { + Node left = MakeTree(iDepth-1); + Node right = MakeTree(iDepth-1); +# ifndef GC + result = malloc(sizeof(Node0)); +# else + result = GC_NEW(Node0); HOLE(); +# endif + init_Node(result, left, right); + return result; + } +} + +static void PrintDiagnostics() { +#if 0 + long lFreeMemory = Runtime.getRuntime().freeMemory(); + long lTotalMemory = Runtime.getRuntime().totalMemory(); + + System.out.print(" Total memory available=" + + lTotalMemory + " bytes"); + System.out.println(" Free memory=" + lFreeMemory + " bytes"); +#endif +} + +static void TimeConstruction(int depth) { + long tStart, tFinish; + int iNumIters = NumIters(depth); + Node tempTree; + int i; + + printf("0x%x: Creating %d trees of depth %d\n", pthread_self(), iNumIters, depth); + + tStart = currentTime(); + for (i = 0; i < iNumIters; ++i) { +# ifndef GC + tempTree = calloc(1, sizeof(Node0)); +# else + tempTree = GC_NEW(Node0); +# endif + Populate(depth, tempTree); +# ifndef GC + destroy_Node(tempTree); +# endif + tempTree = 0; + } + tFinish = currentTime(); + printf("\t0x%x: Top down construction took %d msec\n", + pthread_self(), elapsedTime(tFinish - tStart)); + + tStart = currentTime(); + for (i = 0; i < iNumIters; ++i) { + tempTree = MakeTree(depth); +# ifndef GC + destroy_Node(tempTree); +# endif + tempTree = 0; + } + tFinish = currentTime(); + printf("\t0x%x: Bottom up construction took %d msec\n", + pthread_self(), elapsedTime(tFinish - tStart)); + +} + +void * run_one_test(void * arg) { + int d, i; + Node longLivedTree; + double *array; + /* size_t initial_bytes = GC_get_total_bytes(); */ + + // Create a long lived object + printf(" Creating a long-lived binary tree of depth %d\n", + kLongLivedTreeDepth); +# ifndef GC + longLivedTree = calloc(1, sizeof(Node0)); +# else + longLivedTree = GC_NEW(Node0); +# endif + Populate(kLongLivedTreeDepth, longLivedTree); + + // Create long-lived array, filling half of it + printf(" Creating a long-lived array of %d doubles\n", kArraySize); +# ifndef GC + array = malloc(kArraySize * sizeof(double)); +# else +# ifndef NO_PTRFREE + array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); +# else + array = GC_MALLOC(sizeof(double) * kArraySize); +# endif +# endif + for (i = 0; i < kArraySize/2; ++i) { + array[i] = 1.0/i; + } + + for (d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { + TimeConstruction(d); + } + /* printf("Allocated %ld bytes before start, %ld after\n", + initial_bytes, GC_get_total_bytes() - initial_bytes); */ + if (longLivedTree->left -> right == 0 || array[1000] != 1.0/1000) + fprintf(stderr, "Failed\n"); + // fake reference to LongLivedTree + // and array + // to keep them from being optimized away + +} + +int main(int argc, char **argv) { + Node root; + Node tempTree[MAX_NTHREADS]; + long tStart, tFinish; + long tElapsed; + int i; +# ifdef SGC + SGC_attr_t attr; +# endif + + if (1 == argc) { + nthreads = 1; + 
} else if (2 == argc) { + nthreads = atoi(argv[1]); + if (nthreads < 1 || nthreads > MAX_NTHREADS) { + fprintf(stderr, "Invalid # of threads argument\n"); + exit(1); + } + } else { + fprintf(stderr, "Usage: %s [# of threads]\n"); + exit(1); + } +# if defined(SGC) + /* The University of Tokyo collector needs explicit */ + /* initialization. */ + SGC_attr_init(&attr); + SGC_init(nthreads, &attr); +# endif +#ifdef GC + // GC_full_freq = 30; + // GC_free_space_divisor = 16; + // GC_enable_incremental(); +#endif + printf("Garbage Collector Test\n"); + printf(" Live storage will peak at %d bytes or less .\n\n", + 2 * sizeof(Node0) * nthreads + * (TreeSize(kLongLivedTreeDepth) + TreeSize(kMaxTreeDepth)) + + sizeof(double) * kArraySize); + PrintDiagnostics(); + +# ifdef GC + /* GC_expand_hp fails with empty heap */ + GC_malloc(1); + GC_expand_hp(32*1024*1024*nthreads); +# endif + +# ifdef PROFIL + init_profiling(); +# endif + + tStart = currentTime(); + { + pthread_t thread[MAX_NTHREADS]; + for (i = 1; i < nthreads; ++i) { + int code; + + if ((code = pthread_create(thread+i, 0, run_one_test, 0)) != 0) { + fprintf(stderr, "Thread creation failed %u\n", code); + exit(1); + } + } + /* We use the main thread to run one test. This allows */ + /* profiling to work, for example. */ + run_one_test(0); + for (i = 1; i < nthreads; ++i) { + int code; + if ((code = pthread_join(thread[i], 0)) != 0) { + fprintf(stderr, "Thread join failed %u\n", code); + } + } + } + PrintDiagnostics(); + + tFinish = currentTime(); + tElapsed = elapsedTime(tFinish-tStart); + PrintDiagnostics(); + printf("Completed in %d msec\n", tElapsed); +# ifdef GC + printf("Completed %d collections\n", GC_gc_no); + printf("Heap size is %d\n", GC_get_heap_size()); +# endif +# ifdef PROFIL + dump_profile(); +# endif + return 0; +} + diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..2d7c92c9c --- /dev/null +++ b/Makefile @@ -0,0 +1,29 @@ +TESTS=GCBench MT_GCBench MT_GCBench2 +COLLECTORS=bdw + +CC=gcc +CFLAGS=-Wall -O2 -g + +ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) + +all: $(ALL_TESTS) + +bdw-%: bdw.h %.c + $(CC) $(CFLAGS) -lpthread `pkg-config --libs --cflags bdw-gc` -I. -o $@ $*.c bdw.h + +check: $(addprefix test-$(TARGET),$(TARGETS)) + +test-%: $(ALL_TESTS) + @echo "Running unit tests..." + @set -e; for test in $?; do \ + echo "Testing: $$test"; \ + ./$$test; \ + done + @echo "Success." + +.PHONY: check + +.PRECIOUS: $(ALL_TESTS) + +clean: + rm -f $(ALL_TESTS) diff --git a/bdw.h b/bdw.h new file mode 100644 index 000000000..28932aea8 --- /dev/null +++ b/bdw.h @@ -0,0 +1,13 @@ +// When pthreads are used, let `libgc' know about it and redirect +// allocation calls such as `GC_MALLOC ()' to (contention-free, faster) +// thread-local allocation. + +#define GC_THREADS 1 +#define GC_REDIRECT_TO_LOCAL 1 + +// Don't #define pthread routines to their GC_pthread counterparts. +// Instead we will be careful inside the benchmarks to use API to +// register threads with libgc. 
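+//
+// For reference, a sketch of that explicit registration (an editorial
+// illustration, not part of this header), assuming a libgc recent enough
+// to provide GC_get_stack_base and GC_register_my_thread:
+//
+//   struct GC_stack_base sb;
+//   GC_get_stack_base(&sb);
+//   GC_register_my_thread(&sb);
+//   /* ... run the thread body, allocating with GC_MALLOC and friends ... */
+//   GC_unregister_my_thread();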
+#define GC_NO_THREAD_REDIRECTS 1 + +#include From 869a490ba6c57500e85b6ae2534af13c9e59b5ec Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 21 Feb 2022 21:22:21 +0100 Subject: [PATCH 002/403] Refactor gcbench.c --- GCBench.c | 108 +++++++++--------------------------------------------- Makefile | 2 +- 2 files changed, 18 insertions(+), 92 deletions(-) diff --git a/GCBench.c b/GCBench.c index c9e77d191..95b2ff0c6 100644 --- a/GCBench.c +++ b/GCBench.c @@ -42,27 +42,16 @@ #include #include -#ifdef GC -# include "gc.h" +#ifdef GC_BDW +#include "bdw.h" +#else +#error unknown gc #endif -#ifdef PROFIL - extern void init_profiling(); - extern dump_profile(); -#endif - -// These macros were a quick hack for the Macintosh. -// -// #define currentTime() clock() -// #define elapsedTime(x) ((1000*(x))/CLOCKS_PER_SEC) - -#define currentTime() stats_rtclock() #define elapsedTime(x) (x) /* Get the current time in milliseconds */ - -unsigned -stats_rtclock( void ) +static unsigned currentTime(void) { struct timeval t; struct timezone tz; @@ -84,12 +73,6 @@ typedef struct Node0_struct { int i, j; } Node0; -#ifdef HOLES -# define HOLE() GC_NEW(Node0); -#else -# define HOLE() -#endif - typedef Node0 *Node; void init_Node(Node me, Node l, Node r) { @@ -97,18 +80,6 @@ void init_Node(Node me, Node l, Node r) { me -> right = r; } -#ifndef GC - void destroy_Node(Node me) { - if (me -> left) { - destroy_Node(me -> left); - } - if (me -> right) { - destroy_Node(me -> right); - } - free(me); - } -#endif - // Nodes used by a tree of a given size static int TreeSize(int i) { return ((1 << (i + 1)) - 1); @@ -125,13 +96,8 @@ static void Populate(int iDepth, Node thisNode) { return; } else { iDepth--; -# ifdef GC - thisNode->left = GC_NEW(Node0); HOLE(); - thisNode->right = GC_NEW(Node0); HOLE(); -# else - thisNode->left = calloc(1, sizeof(Node0)); - thisNode->right = calloc(1, sizeof(Node0)); -# endif + thisNode->left = GC_NEW(Node0); + thisNode->right = GC_NEW(Node0); Populate (iDepth, thisNode->left); Populate (iDepth, thisNode->right); } @@ -141,21 +107,13 @@ static void Populate(int iDepth, Node thisNode) { static Node MakeTree(int iDepth) { Node result; if (iDepth<=0) { -# ifndef GC - result = calloc(1, sizeof(Node0)); -# else - result = GC_NEW(Node0); HOLE(); -# endif - /* result is implicitly initialized in both cases. */ - return result; + result = GC_NEW(Node0); + /* result is implicitly initialized in both cases. 
*/ + return result; } else { Node left = MakeTree(iDepth-1); Node right = MakeTree(iDepth-1); -# ifndef GC - result = malloc(sizeof(Node0)); -# else - result = GC_NEW(Node0); HOLE(); -# endif + result = GC_NEW(Node0); init_Node(result, left, right); return result; } @@ -182,32 +140,22 @@ static void TimeConstruction(int depth) { tStart = currentTime(); for (i = 0; i < iNumIters; ++i) { -# ifndef GC - tempTree = calloc(1, sizeof(Node0)); -# else - tempTree = GC_NEW(Node0); -# endif + tempTree = GC_NEW(Node0); Populate(depth, tempTree); -# ifndef GC - destroy_Node(tempTree); -# endif tempTree = 0; } tFinish = currentTime(); printf("\tTop down construction took %d msec\n", - elapsedTime(tFinish - tStart)); + tFinish - tStart); tStart = currentTime(); for (i = 0; i < iNumIters; ++i) { tempTree = MakeTree(depth); -# ifndef GC - destroy_Node(tempTree); -# endif tempTree = 0; } tFinish = currentTime(); printf("\tBottom up construction took %d msec\n", - elapsedTime(tFinish - tStart)); + tFinish - tStart); } @@ -220,11 +168,9 @@ int main() { int i, d; double *array; -#ifdef GC // GC_full_freq = 30; // GC_free_space_divisor = 16; // GC_enable_incremental(); -#endif printf("Garbage Collector Test\n"); printf(" Live storage will peak at %d bytes.\n\n", 2 * sizeof(Node0) * TreeSize(kLongLivedTreeDepth) + @@ -240,32 +186,17 @@ int main() { // Stretch the memory space quickly tempTree = MakeTree(kStretchTreeDepth); -# ifndef GC - destroy_Node(tempTree); -# endif tempTree = 0; // Create a long lived object printf(" Creating a long-lived binary tree of depth %d\n", kLongLivedTreeDepth); -# ifndef GC - longLivedTree = calloc(1, sizeof(Node0)); -# else - longLivedTree = GC_NEW(Node0); -# endif + longLivedTree = GC_NEW(Node0); Populate(kLongLivedTreeDepth, longLivedTree); // Create long-lived array, filling half of it printf(" Creating a long-lived array of %d doubles\n", kArraySize); -# ifndef GC - array = malloc(kArraySize * sizeof(double)); -# else -# ifndef NO_PTRFREE - array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); -# else - array = GC_MALLOC(sizeof(double) * kArraySize); -# endif -# endif + array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); for (i = 0; i < kArraySize/2; ++i) { array[i] = 1.0/i; } @@ -282,15 +213,10 @@ int main() { // to keep them from being optimized away tFinish = currentTime(); - tElapsed = elapsedTime(tFinish-tStart); + tElapsed = tFinish - tStart; PrintDiagnostics(); printf("Completed in %d msec\n", tElapsed); -# ifdef GC printf("Completed %d collections\n", GC_gc_no); printf("Heap size is %d\n", GC_get_heap_size()); -# endif -# ifdef PROFIL - dump_profile(); -# endif } diff --git a/Makefile b/Makefile index 2d7c92c9c..819cf3f5d 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) all: $(ALL_TESTS) bdw-%: bdw.h %.c - $(CC) $(CFLAGS) -lpthread `pkg-config --libs --cflags bdw-gc` -I. -o $@ $*.c bdw.h + $(CC) $(CFLAGS) -lpthread `pkg-config --libs --cflags bdw-gc` -I. 
-DGC_BDW -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) From 25213ccdebe2003120507d7a635dfbb534877189 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 23 Feb 2022 20:03:32 +0100 Subject: [PATCH 003/403] Reindent gcbench --- GCBench.c | 208 +++++++++++++++++++++++++++--------------------------- 1 file changed, 103 insertions(+), 105 deletions(-) diff --git a/GCBench.c b/GCBench.c index 95b2ff0c6..5a0f2015f 100644 --- a/GCBench.c +++ b/GCBench.c @@ -48,8 +48,6 @@ #error unknown gc #endif -#define elapsedTime(x) (x) - /* Get the current time in milliseconds */ static unsigned currentTime(void) { @@ -68,155 +66,155 @@ static const int kMinTreeDepth = 4; static const int kMaxTreeDepth = 16; typedef struct Node0_struct { - struct Node0_struct * left; - struct Node0_struct * right; - int i, j; + struct Node0_struct * left; + struct Node0_struct * right; + int i, j; } Node0; typedef Node0 *Node; void init_Node(Node me, Node l, Node r) { - me -> left = l; - me -> right = r; + me -> left = l; + me -> right = r; } // Nodes used by a tree of a given size static int TreeSize(int i) { - return ((1 << (i + 1)) - 1); + return ((1 << (i + 1)) - 1); } // Number of iterations to use for a given tree depth static int NumIters(int i) { - return 2 * TreeSize(kStretchTreeDepth) / TreeSize(i); + return 2 * TreeSize(kStretchTreeDepth) / TreeSize(i); } // Build tree top down, assigning to older objects. static void Populate(int iDepth, Node thisNode) { - if (iDepth<=0) { - return; - } else { - iDepth--; - thisNode->left = GC_NEW(Node0); - thisNode->right = GC_NEW(Node0); - Populate (iDepth, thisNode->left); - Populate (iDepth, thisNode->right); - } + if (iDepth<=0) { + return; + } else { + iDepth--; + thisNode->left = GC_NEW(Node0); + thisNode->right = GC_NEW(Node0); + Populate (iDepth, thisNode->left); + Populate (iDepth, thisNode->right); + } } // Build tree bottom-up static Node MakeTree(int iDepth) { - Node result; - if (iDepth<=0) { - result = GC_NEW(Node0); - /* result is implicitly initialized in both cases. */ - return result; - } else { - Node left = MakeTree(iDepth-1); - Node right = MakeTree(iDepth-1); - result = GC_NEW(Node0); - init_Node(result, left, right); - return result; - } + Node result; + if (iDepth<=0) { + result = GC_NEW(Node0); + /* result is implicitly initialized in both cases. 
*/ + return result; + } else { + Node left = MakeTree(iDepth-1); + Node right = MakeTree(iDepth-1); + result = GC_NEW(Node0); + init_Node(result, left, right); + return result; + } } static void PrintDiagnostics() { #if 0 - long lFreeMemory = Runtime.getRuntime().freeMemory(); - long lTotalMemory = Runtime.getRuntime().totalMemory(); + long lFreeMemory = Runtime.getRuntime().freeMemory(); + long lTotalMemory = Runtime.getRuntime().totalMemory(); - System.out.print(" Total memory available=" - + lTotalMemory + " bytes"); - System.out.println(" Free memory=" + lFreeMemory + " bytes"); + System.out.print(" Total memory available=" + + lTotalMemory + " bytes"); + System.out.println(" Free memory=" + lFreeMemory + " bytes"); #endif } static void TimeConstruction(int depth) { - long tStart, tFinish; - int iNumIters = NumIters(depth); - Node tempTree; - int i; + long tStart, tFinish; + int iNumIters = NumIters(depth); + Node tempTree; + int i; - printf("Creating %d trees of depth %d\n", iNumIters, depth); + printf("Creating %d trees of depth %d\n", iNumIters, depth); - tStart = currentTime(); - for (i = 0; i < iNumIters; ++i) { - tempTree = GC_NEW(Node0); - Populate(depth, tempTree); - tempTree = 0; - } - tFinish = currentTime(); - printf("\tTop down construction took %d msec\n", - tFinish - tStart); + tStart = currentTime(); + for (i = 0; i < iNumIters; ++i) { + tempTree = GC_NEW(Node0); + Populate(depth, tempTree); + tempTree = 0; + } + tFinish = currentTime(); + printf("\tTop down construction took %d msec\n", + tFinish - tStart); - tStart = currentTime(); - for (i = 0; i < iNumIters; ++i) { - tempTree = MakeTree(depth); - tempTree = 0; - } - tFinish = currentTime(); - printf("\tBottom up construction took %d msec\n", - tFinish - tStart); + tStart = currentTime(); + for (i = 0; i < iNumIters; ++i) { + tempTree = MakeTree(depth); + tempTree = 0; + } + tFinish = currentTime(); + printf("\tBottom up construction took %d msec\n", + tFinish - tStart); } int main() { - Node root; - Node longLivedTree; - Node tempTree; - long tStart, tFinish; - long tElapsed; - int i, d; - double *array; + Node root; + Node longLivedTree; + Node tempTree; + long tStart, tFinish; + long tElapsed; + int i, d; + double *array; - // GC_full_freq = 30; - // GC_free_space_divisor = 16; - // GC_enable_incremental(); - printf("Garbage Collector Test\n"); - printf(" Live storage will peak at %d bytes.\n\n", - 2 * sizeof(Node0) * TreeSize(kLongLivedTreeDepth) + - sizeof(double) * kArraySize); - printf(" Stretching memory with a binary tree of depth %d\n", - kStretchTreeDepth); - PrintDiagnostics(); + // GC_full_freq = 30; + // GC_free_space_divisor = 16; + // GC_enable_incremental(); + printf("Garbage Collector Test\n"); + printf(" Live storage will peak at %d bytes.\n\n", + 2 * sizeof(Node0) * TreeSize(kLongLivedTreeDepth) + + sizeof(double) * kArraySize); + printf(" Stretching memory with a binary tree of depth %d\n", + kStretchTreeDepth); + PrintDiagnostics(); # ifdef PROFIL - init_profiling(); + init_profiling(); # endif - tStart = currentTime(); + tStart = currentTime(); - // Stretch the memory space quickly - tempTree = MakeTree(kStretchTreeDepth); - tempTree = 0; + // Stretch the memory space quickly + tempTree = MakeTree(kStretchTreeDepth); + tempTree = 0; - // Create a long lived object - printf(" Creating a long-lived binary tree of depth %d\n", - kLongLivedTreeDepth); - longLivedTree = GC_NEW(Node0); - Populate(kLongLivedTreeDepth, longLivedTree); + // Create a long lived object + printf(" Creating a long-lived 
binary tree of depth %d\n", + kLongLivedTreeDepth); + longLivedTree = GC_NEW(Node0); + Populate(kLongLivedTreeDepth, longLivedTree); - // Create long-lived array, filling half of it - printf(" Creating a long-lived array of %d doubles\n", kArraySize); - array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); - for (i = 0; i < kArraySize/2; ++i) { - array[i] = 1.0/i; - } - PrintDiagnostics(); + // Create long-lived array, filling half of it + printf(" Creating a long-lived array of %d doubles\n", kArraySize); + array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); + for (i = 0; i < kArraySize/2; ++i) { + array[i] = 1.0/i; + } + PrintDiagnostics(); - for (d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { - TimeConstruction(d); - } + for (d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { + TimeConstruction(d); + } - if (longLivedTree == 0 || array[1000] != 1.0/1000) - fprintf(stderr, "Failed\n"); - // fake reference to LongLivedTree - // and array - // to keep them from being optimized away + if (longLivedTree == 0 || array[1000] != 1.0/1000) + fprintf(stderr, "Failed\n"); + // fake reference to LongLivedTree + // and array + // to keep them from being optimized away - tFinish = currentTime(); - tElapsed = tFinish - tStart; - PrintDiagnostics(); - printf("Completed in %d msec\n", tElapsed); - printf("Completed %d collections\n", GC_gc_no); - printf("Heap size is %d\n", GC_get_heap_size()); + tFinish = currentTime(); + tElapsed = tFinish - tStart; + PrintDiagnostics(); + printf("Completed in %d msec\n", tElapsed); + printf("Completed %d collections\n", GC_gc_no); + printf("Heap size is %d\n", GC_get_heap_size()); } From 2fdfefd2fc6b15e7ad044ff8f1c58216827847b3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 23 Feb 2022 21:25:26 +0100 Subject: [PATCH 004/403] handlify --- GCBench.c | 186 +++++++++++++++++++++++++++--------------------------- bdw.h | 52 +++++++++++++++ 2 files changed, 146 insertions(+), 92 deletions(-) diff --git a/GCBench.c b/GCBench.c index 5a0f2015f..8331e82be 100644 --- a/GCBench.c +++ b/GCBench.c @@ -42,6 +42,12 @@ #include #include +typedef struct Node { + struct Node * left; + struct Node * right; + int i, j; +} Node; + #ifdef GC_BDW #include "bdw.h" #else @@ -65,17 +71,9 @@ static const int kArraySize = 500000; // about 4Mb static const int kMinTreeDepth = 4; static const int kMaxTreeDepth = 16; -typedef struct Node0_struct { - struct Node0_struct * left; - struct Node0_struct * right; - int i, j; -} Node0; - -typedef Node0 *Node; - -void init_Node(Node me, Node l, Node r) { - me -> left = l; - me -> right = r; +void init_Node(Node *me, Node *l, Node *r) { + init_field((void**)&me->left, l); + init_field((void**)&me->right, r); } // Nodes used by a tree of a given size @@ -89,132 +87,136 @@ static int NumIters(int i) { } // Build tree top down, assigning to older objects. 
-static void Populate(int iDepth, Node thisNode) { +static void Populate(int iDepth, Node *node) { if (iDepth<=0) { return; } else { iDepth--; - thisNode->left = GC_NEW(Node0); - thisNode->right = GC_NEW(Node0); - Populate (iDepth, thisNode->left); - Populate (iDepth, thisNode->right); + + NodeHandle self = { node }; + PUSH_HANDLE(self); + NodeHandle l = { allocate_node() }; + PUSH_HANDLE(l); + NodeHandle r = { allocate_node() }; + PUSH_HANDLE(r); + set_field((void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); + set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); + Populate (iDepth, HANDLE_REF(self)->left); + Populate (iDepth, HANDLE_REF(self)->right); + POP_HANDLE(r); + POP_HANDLE(l); + POP_HANDLE(self); } } // Build tree bottom-up -static Node MakeTree(int iDepth) { - Node result; +static Node* MakeTree(int iDepth) { if (iDepth<=0) { - result = GC_NEW(Node0); - /* result is implicitly initialized in both cases. */ - return result; + return allocate_node(); } else { - Node left = MakeTree(iDepth-1); - Node right = MakeTree(iDepth-1); - result = GC_NEW(Node0); - init_Node(result, left, right); + NodeHandle left = { MakeTree(iDepth-1) }; + PUSH_HANDLE(left); + NodeHandle right = { MakeTree(iDepth-1) }; + PUSH_HANDLE(right); + Node *result = allocate_node(); + init_Node(result, HANDLE_REF(left), HANDLE_REF(right)); + POP_HANDLE(left); + POP_HANDLE(right); return result; } } -static void PrintDiagnostics() { -#if 0 - long lFreeMemory = Runtime.getRuntime().freeMemory(); - long lTotalMemory = Runtime.getRuntime().totalMemory(); - - System.out.print(" Total memory available=" - + lTotalMemory + " bytes"); - System.out.println(" Free memory=" + lFreeMemory + " bytes"); -#endif -} - static void TimeConstruction(int depth) { - long tStart, tFinish; - int iNumIters = NumIters(depth); - Node tempTree; - int i; + int iNumIters = NumIters(depth); + NodeHandle tempTree = { NULL }; + PUSH_HANDLE(tempTree); printf("Creating %d trees of depth %d\n", iNumIters, depth); - - tStart = currentTime(); - for (i = 0; i < iNumIters; ++i) { - tempTree = GC_NEW(Node0); - Populate(depth, tempTree); - tempTree = 0; - } - tFinish = currentTime(); - printf("\tTop down construction took %d msec\n", - tFinish - tStart); - - tStart = currentTime(); - for (i = 0; i < iNumIters; ++i) { - tempTree = MakeTree(depth); - tempTree = 0; - } - tFinish = currentTime(); - printf("\tBottom up construction took %d msec\n", - tFinish - tStart); + { + long tStart = currentTime(); + for (int i = 0; i < iNumIters; ++i) { + HANDLE_SET(tempTree, allocate_node()); + Populate(depth, HANDLE_REF(tempTree)); + HANDLE_SET(tempTree, NULL); + } + long tFinish = currentTime(); + printf("\tTop down construction took %ld msec\n", + tFinish - tStart); + } + + { + long tStart = currentTime(); + for (int i = 0; i < iNumIters; ++i) { + HANDLE_SET(tempTree, MakeTree(depth)); + HANDLE_SET(tempTree, NULL); + } + long tFinish = currentTime(); + printf("\tBottom up construction took %ld msec\n", + tFinish - tStart); + } + + POP_HANDLE(tempTree); } int main() { - Node root; - Node longLivedTree; - Node tempTree; - long tStart, tFinish; - long tElapsed; - int i, d; - double *array; + NodeHandle root = { NULL }; + NodeHandle longLivedTree = { NULL }; + NodeHandle tempTree = { NULL }; + HANDLE_TO(double) array = { NULL }; + + PUSH_HANDLE(root); + PUSH_HANDLE(longLivedTree); + PUSH_HANDLE(tempTree); + PUSH_HANDLE(array); + + initialize_gc(); - // GC_full_freq = 30; - // GC_free_space_divisor = 16; - // GC_enable_incremental(); printf("Garbage Collector 
Test\n"); - printf(" Live storage will peak at %d bytes.\n\n", - 2 * sizeof(Node0) * TreeSize(kLongLivedTreeDepth) + + printf(" Live storage will peak at %zd bytes.\n\n", + 2 * sizeof(struct Node) * TreeSize(kLongLivedTreeDepth) + sizeof(double) * kArraySize); printf(" Stretching memory with a binary tree of depth %d\n", kStretchTreeDepth); - PrintDiagnostics(); -# ifdef PROFIL - init_profiling(); -# endif + print_start_gc_stats(); - tStart = currentTime(); + long tStart = currentTime(); // Stretch the memory space quickly - tempTree = MakeTree(kStretchTreeDepth); - tempTree = 0; + HANDLE_SET(tempTree, MakeTree(kStretchTreeDepth)); + HANDLE_SET(tempTree, NULL); // Create a long lived object printf(" Creating a long-lived binary tree of depth %d\n", kLongLivedTreeDepth); - longLivedTree = GC_NEW(Node0); - Populate(kLongLivedTreeDepth, longLivedTree); + HANDLE_SET(longLivedTree, allocate_node()); + Populate(kLongLivedTreeDepth, HANDLE_REF(longLivedTree)); // Create long-lived array, filling half of it printf(" Creating a long-lived array of %d doubles\n", kArraySize); - array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); - for (i = 0; i < kArraySize/2; ++i) { - array[i] = 1.0/i; + HANDLE_SET(array, allocate_double_array(kArraySize)); + for (int i = 0; i < kArraySize/2; ++i) { + HANDLE_REF(array)[i] = 1.0/i; } - PrintDiagnostics(); - for (d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { + for (int d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { TimeConstruction(d); } - if (longLivedTree == 0 || array[1000] != 1.0/1000) + if (HANDLE_REF(longLivedTree) == 0 || HANDLE_REF(array)[1000] != 1.0/1000) fprintf(stderr, "Failed\n"); // fake reference to LongLivedTree // and array // to keep them from being optimized away - tFinish = currentTime(); - tElapsed = tFinish - tStart; - PrintDiagnostics(); - printf("Completed in %d msec\n", tElapsed); - printf("Completed %d collections\n", GC_gc_no); - printf("Heap size is %d\n", GC_get_heap_size()); + long tFinish = currentTime(); + long tElapsed = tFinish - tStart; + printf("Completed in %ld msec\n", tElapsed); + print_end_gc_stats(); + + POP_HANDLE(array); + POP_HANDLE(tempTree); + POP_HANDLE(longLivedTree); + POP_HANDLE(root); } diff --git a/bdw.h b/bdw.h index 28932aea8..aea294ac6 100644 --- a/bdw.h +++ b/bdw.h @@ -11,3 +11,55 @@ #define GC_NO_THREAD_REDIRECTS 1 #include + +static Node* allocate_node(void) { + // memset to 0 by the collector. + return GC_malloc (sizeof (Node)); +} + +static double* allocate_double_array(size_t size) { + // note, not memset to 0 by the collector. 
+ return GC_malloc_atomic (sizeof (double) * size); +} + +struct handle { + void *v; +}; + +#define HANDLE_TO(T) union { T* v; struct handle handle; } +#define HANDLE_REF(h) h.v +#define HANDLE_SET(h,val) do { h.v = val; } while (0) +#define PUSH_HANDLE(h) push_handle(&h.handle) +#define POP_HANDLE(h) pop_handle(&h.handle) + +typedef HANDLE_TO(Node) NodeHandle; + +static inline void push_handle(struct handle *handle) { +} + +static inline void pop_handle(struct handle *handle) { +} + +static inline void init_field(void **addr, void *val) { + *addr = val; +} +static inline void set_field(void **addr, void *val) { + *addr = val; +} +static inline void* get_field(void **addr) { + return *addr; +} + +static inline void initialize_gc(void) { + // GC_full_freq = 30; + // GC_free_space_divisor = 16; + // GC_enable_incremental(); +} + +static inline void print_start_gc_stats(void) { +} + +static inline void print_end_gc_stats(void) { + printf("Completed %ld collections\n", (long)GC_get_gc_no()); + printf("Heap size is %ld\n", (long)GC_get_heap_size()); +} From 30b5c8a6c8835cb580d257331ad4e5c768b6b40d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 28 Feb 2022 21:35:28 +0100 Subject: [PATCH 005/403] Use handle API, add semispace collector --- GCBench.c | 169 ++++++++++++++++++++++++++++++----------------- Makefile | 5 +- bdw.h | 46 ++++++++----- semi.h | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 336 insertions(+), 77 deletions(-) create mode 100644 semi.h diff --git a/GCBench.c b/GCBench.c index 8331e82be..a16eb6eb1 100644 --- a/GCBench.c +++ b/GCBench.c @@ -42,17 +42,65 @@ #include #include +#ifdef GC_BDW +#include "bdw.h" +#elif defined(GC_SEMI) +#include "semi.h" +#else +#error unknown gc +#endif + +static const int kStretchTreeDepth = 18; // about 16Mb +static const int kLongLivedTreeDepth = 16; // about 4Mb +static const int kArraySize = 500000; // about 4Mb +static const int kMinTreeDepth = 4; +static const int kMaxTreeDepth = 16; + typedef struct Node { + GC_HEADER; struct Node * left; struct Node * right; int i, j; } Node; -#ifdef GC_BDW -#include "bdw.h" -#else -#error unknown gc -#endif +typedef struct DoubleArray { + GC_HEADER; + size_t length; + double values[0]; +} DoubleArray; + +static inline size_t node_size(void *obj) { + return sizeof(Node); +} +static inline size_t double_array_size(void *obj) { + DoubleArray *array = obj; + return sizeof(*array) + array->length * sizeof(double); +} +static inline void visit_node_fields(struct context *cx, void *obj, + field_visitor visit) { + Node *node = obj; + visit(cx, (void**)&node->left); + visit(cx, (void**)&node->right); +} +static inline void visit_double_array_fields(struct context *cx, void *obj, + field_visitor visit) { +} + +typedef HANDLE_TO(Node) NodeHandle; +typedef HANDLE_TO(DoubleArray) DoubleArrayHandle; + +static Node* allocate_node(struct context *cx) { + // memset to 0 by the collector. + return allocate(cx, NODE, sizeof (Node)); +} + +static struct DoubleArray* allocate_double_array(struct context *cx, + size_t size) { + // note, not memset to 0 by the collector. 
+ DoubleArray *ret = allocate(cx, DOUBLE_ARRAY, sizeof (double) * size); + ret->length = size; + return ret; +} /* Get the current time in milliseconds */ static unsigned currentTime(void) @@ -65,12 +113,6 @@ static unsigned currentTime(void) return (t.tv_sec * 1000 + t.tv_usec / 1000); } -static const int kStretchTreeDepth = 18; // about 16Mb -static const int kLongLivedTreeDepth = 16; // about 4Mb -static const int kArraySize = 500000; // about 4Mb -static const int kMinTreeDepth = 4; -static const int kMaxTreeDepth = 16; - void init_Node(Node *me, Node *l, Node *r) { init_field((void**)&me->left, l); init_field((void**)&me->right, r); @@ -87,57 +129,57 @@ static int NumIters(int i) { } // Build tree top down, assigning to older objects. -static void Populate(int iDepth, Node *node) { +static void Populate(struct context *cx, int iDepth, Node *node) { if (iDepth<=0) { return; } else { iDepth--; NodeHandle self = { node }; - PUSH_HANDLE(self); - NodeHandle l = { allocate_node() }; - PUSH_HANDLE(l); - NodeHandle r = { allocate_node() }; - PUSH_HANDLE(r); + PUSH_HANDLE(cx, self); + NodeHandle l = { allocate_node(cx) }; + PUSH_HANDLE(cx, l); + NodeHandle r = { allocate_node(cx) }; + PUSH_HANDLE(cx, r); set_field((void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); - Populate (iDepth, HANDLE_REF(self)->left); - Populate (iDepth, HANDLE_REF(self)->right); - POP_HANDLE(r); - POP_HANDLE(l); - POP_HANDLE(self); + Populate (cx, iDepth, HANDLE_REF(self)->left); + Populate (cx, iDepth, HANDLE_REF(self)->right); + POP_HANDLE(cx, r); + POP_HANDLE(cx, l); + POP_HANDLE(cx, self); } } // Build tree bottom-up -static Node* MakeTree(int iDepth) { +static Node* MakeTree(struct context *cx, int iDepth) { if (iDepth<=0) { - return allocate_node(); + return allocate_node(cx); } else { - NodeHandle left = { MakeTree(iDepth-1) }; - PUSH_HANDLE(left); - NodeHandle right = { MakeTree(iDepth-1) }; - PUSH_HANDLE(right); - Node *result = allocate_node(); + NodeHandle left = { MakeTree(cx, iDepth-1) }; + PUSH_HANDLE(cx, left); + NodeHandle right = { MakeTree(cx, iDepth-1) }; + PUSH_HANDLE(cx, right); + Node *result = allocate_node(cx); init_Node(result, HANDLE_REF(left), HANDLE_REF(right)); - POP_HANDLE(left); - POP_HANDLE(right); + POP_HANDLE(cx, right); + POP_HANDLE(cx, left); return result; } } -static void TimeConstruction(int depth) { +static void TimeConstruction(struct context *cx, int depth) { int iNumIters = NumIters(depth); NodeHandle tempTree = { NULL }; - PUSH_HANDLE(tempTree); + PUSH_HANDLE(cx, tempTree); printf("Creating %d trees of depth %d\n", iNumIters, depth); { long tStart = currentTime(); for (int i = 0; i < iNumIters; ++i) { - HANDLE_SET(tempTree, allocate_node()); - Populate(depth, HANDLE_REF(tempTree)); + HANDLE_SET(tempTree, allocate_node(cx)); + Populate(cx, depth, HANDLE_REF(tempTree)); HANDLE_SET(tempTree, NULL); } long tFinish = currentTime(); @@ -148,7 +190,7 @@ static void TimeConstruction(int depth) { { long tStart = currentTime(); for (int i = 0; i < iNumIters; ++i) { - HANDLE_SET(tempTree, MakeTree(depth)); + HANDLE_SET(tempTree, MakeTree(cx, depth)); HANDLE_SET(tempTree, NULL); } long tFinish = currentTime(); @@ -156,54 +198,61 @@ static void TimeConstruction(int depth) { tFinish - tStart); } - POP_HANDLE(tempTree); + POP_HANDLE(cx, tempTree); } int main() { + size_t kHeapMaxLive = + 2 * sizeof(struct Node) * TreeSize(kLongLivedTreeDepth) + + sizeof(double) * kArraySize; + double kHeapMultiplier = 3; + size_t kHeapSize = 
kHeapMaxLive * kHeapMultiplier; + + struct context _cx; + struct context *cx = &_cx; + initialize_gc(cx, kHeapSize); + NodeHandle root = { NULL }; NodeHandle longLivedTree = { NULL }; NodeHandle tempTree = { NULL }; - HANDLE_TO(double) array = { NULL }; + DoubleArrayHandle array = { NULL }; - PUSH_HANDLE(root); - PUSH_HANDLE(longLivedTree); - PUSH_HANDLE(tempTree); - PUSH_HANDLE(array); - - initialize_gc(); + PUSH_HANDLE(cx, root); + PUSH_HANDLE(cx, longLivedTree); + PUSH_HANDLE(cx, tempTree); + PUSH_HANDLE(cx, array); printf("Garbage Collector Test\n"); - printf(" Live storage will peak at %zd bytes.\n\n", - 2 * sizeof(struct Node) * TreeSize(kLongLivedTreeDepth) + - sizeof(double) * kArraySize); + printf(" Live storage will peak at %zd bytes.\n\n", kHeapMaxLive); printf(" Stretching memory with a binary tree of depth %d\n", kStretchTreeDepth); - print_start_gc_stats(); + print_start_gc_stats(cx); long tStart = currentTime(); // Stretch the memory space quickly - HANDLE_SET(tempTree, MakeTree(kStretchTreeDepth)); + HANDLE_SET(tempTree, MakeTree(cx, kStretchTreeDepth)); HANDLE_SET(tempTree, NULL); // Create a long lived object printf(" Creating a long-lived binary tree of depth %d\n", kLongLivedTreeDepth); - HANDLE_SET(longLivedTree, allocate_node()); - Populate(kLongLivedTreeDepth, HANDLE_REF(longLivedTree)); + HANDLE_SET(longLivedTree, allocate_node(cx)); + Populate(cx, kLongLivedTreeDepth, HANDLE_REF(longLivedTree)); // Create long-lived array, filling half of it printf(" Creating a long-lived array of %d doubles\n", kArraySize); - HANDLE_SET(array, allocate_double_array(kArraySize)); + HANDLE_SET(array, allocate_double_array(cx, kArraySize)); for (int i = 0; i < kArraySize/2; ++i) { - HANDLE_REF(array)[i] = 1.0/i; + HANDLE_REF(array)->values[i] = 1.0/i; } for (int d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { - TimeConstruction(d); + TimeConstruction(cx, d); } - if (HANDLE_REF(longLivedTree) == 0 || HANDLE_REF(array)[1000] != 1.0/1000) + if (HANDLE_REF(longLivedTree) == 0 + || HANDLE_REF(array)->values[1000] != 1.0/1000) fprintf(stderr, "Failed\n"); // fake reference to LongLivedTree // and array @@ -212,11 +261,11 @@ int main() { long tFinish = currentTime(); long tElapsed = tFinish - tStart; printf("Completed in %ld msec\n", tElapsed); - print_end_gc_stats(); + print_end_gc_stats(cx); - POP_HANDLE(array); - POP_HANDLE(tempTree); - POP_HANDLE(longLivedTree); - POP_HANDLE(root); + POP_HANDLE(cx, array); + POP_HANDLE(cx, tempTree); + POP_HANDLE(cx, longLivedTree); + POP_HANDLE(cx, root); } diff --git a/Makefile b/Makefile index 819cf3f5d..c5573977c 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ TESTS=GCBench MT_GCBench MT_GCBench2 -COLLECTORS=bdw +COLLECTORS=bdw semi CC=gcc CFLAGS=-Wall -O2 -g @@ -11,6 +11,9 @@ all: $(ALL_TESTS) bdw-%: bdw.h %.c $(CC) $(CFLAGS) -lpthread `pkg-config --libs --cflags bdw-gc` -I. -DGC_BDW -o $@ $*.c +semi-%: semi.h %.c + $(CC) $(CFLAGS) -I. -DGC_SEMI -o $@ $*.c + check: $(addprefix test-$(TARGET),$(TARGETS)) test-%: $(ALL_TESTS) diff --git a/bdw.h b/bdw.h index aea294ac6..571f67639 100644 --- a/bdw.h +++ b/bdw.h @@ -12,14 +12,24 @@ #include -static Node* allocate_node(void) { - // memset to 0 by the collector. - return GC_malloc (sizeof (Node)); -} +struct context {}; -static double* allocate_double_array(size_t size) { - // note, not memset to 0 by the collector. 
- return GC_malloc_atomic (sizeof (double) * size); +enum alloc_kind { NODE, DOUBLE_ARRAY }; + +typedef void (*field_visitor)(struct context *, void **ref); + +#define GC_HEADER /**/ + +static inline void* allocate(struct context *cx, enum alloc_kind kind, + size_t size) { + // memset to 0 by the collector. + switch (kind) { + case NODE: + return GC_malloc(size); + case DOUBLE_ARRAY: + return GC_malloc_atomic(size); + } + abort(); } struct handle { @@ -29,15 +39,13 @@ struct handle { #define HANDLE_TO(T) union { T* v; struct handle handle; } #define HANDLE_REF(h) h.v #define HANDLE_SET(h,val) do { h.v = val; } while (0) -#define PUSH_HANDLE(h) push_handle(&h.handle) -#define POP_HANDLE(h) pop_handle(&h.handle) +#define PUSH_HANDLE(cx, h) push_handle(cx, &h.handle) +#define POP_HANDLE(cx, h) pop_handle(cx, &h.handle) -typedef HANDLE_TO(Node) NodeHandle; - -static inline void push_handle(struct handle *handle) { +static inline void push_handle(struct context *cx, struct handle *handle) { } -static inline void pop_handle(struct handle *handle) { +static inline void pop_handle(struct context *cx, struct handle *handle) { } static inline void init_field(void **addr, void *val) { @@ -50,16 +58,22 @@ static inline void* get_field(void **addr) { return *addr; } -static inline void initialize_gc(void) { +static inline void initialize_gc(struct context* cx, size_t heap_size) { // GC_full_freq = 30; // GC_free_space_divisor = 16; // GC_enable_incremental(); + GC_INIT(); + size_t current_heap_size = GC_get_heap_size(); + if (heap_size > current_heap_size) { + GC_set_max_heap_size (heap_size); + GC_expand_hp(heap_size - current_heap_size); + } } -static inline void print_start_gc_stats(void) { +static inline void print_start_gc_stats(struct context *cx) { } -static inline void print_end_gc_stats(void) { +static inline void print_end_gc_stats(struct context *cx) { printf("Completed %ld collections\n", (long)GC_get_gc_no()); printf("Heap size is %ld\n", (long)GC_get_heap_size()); } diff --git a/semi.h b/semi.h new file mode 100644 index 000000000..16bb8566c --- /dev/null +++ b/semi.h @@ -0,0 +1,193 @@ +#include +#include +#include +#include +#include + +struct handle { + void *v; + struct handle *next; +}; + +struct context { + uintptr_t hp; + uintptr_t limit; + uintptr_t base; + size_t size; + struct handle *roots; + long count; +}; + +static const uintptr_t ALIGNMENT = 8; + +static uintptr_t align_up(uintptr_t addr, size_t align) { + return (addr + align - 1) & ~(align-1); +} + +#define GC_HEADER uintptr_t _gc_header + +enum alloc_kind { NODE, DOUBLE_ARRAY }; + +typedef void (*field_visitor)(struct context *, void **ref); + +static inline size_t node_size(void *obj) __attribute__((always_inline)); +static inline size_t double_array_size(void *obj) __attribute__((always_inline)); +static inline void visit_node_fields(struct context *cx, void *obj, field_visitor visit) __attribute__((always_inline)); +static inline void visit_double_array_fields(struct context *cx, void *obj, field_visitor visit) __attribute__((always_inline)); + +static inline void clear_memory(uintptr_t addr, size_t size) { + memset((char*)addr, 0, size); +} + +static void collect(struct context *cx, size_t bytes) __attribute__((noinline)); + +static void process(struct context *cx, void **loc); + +static void flip(struct context *cx) { + uintptr_t split = cx->base + (cx->size >> 1); + if (cx->hp <= split) { + cx->hp = split; + cx->limit = cx->base + cx->size; + } else { + cx->hp = cx->base; + cx->limit = split; + } + cx->count++; +} + 
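// [Editorial note: the comments below are an annotation, not part of
// the patch.] flip() above switches allocation to the other half of
// the single mmap'd region: hp is reset to the start of the inactive
// half and limit to its end, so the bump-pointer allocate() and the
// copy() routine that follows both write into the new tospace.  count
// records completed collections; initialize_gc() sets it to -1 before
// calling flip() once, so it reads 0 until the first real collection.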
+static void* copy(struct context *cx, uintptr_t kind, void *obj) { + size_t size; + switch (kind) { + case NODE: + size = node_size(obj); + break; + case DOUBLE_ARRAY: + size = double_array_size(obj); + break; + default: + abort (); + } + void *new_obj = (void*)cx->hp; + memcpy(new_obj, obj, size); + *(uintptr_t*) obj = cx->hp; + cx->hp += align_up (size, ALIGNMENT); + return new_obj; +} + +static uintptr_t scan(struct context *cx, uintptr_t grey) { + void *obj = (void*)grey; + uintptr_t kind = *(uintptr_t*) obj; + switch (kind) { + case NODE: + visit_node_fields(cx, obj, process); + return grey + align_up (node_size(obj), ALIGNMENT); + break; + case DOUBLE_ARRAY: + visit_double_array_fields(cx, obj, process); + return grey + align_up (double_array_size(obj), ALIGNMENT); + break; + default: + abort (); + } +} + +static void* forward(struct context *cx, void *obj) { + uintptr_t header_word = *(uintptr_t*)obj; + switch (header_word) { + case NODE: + case DOUBLE_ARRAY: + return copy(cx, header_word, obj); + default: + return (void*)header_word; + } +} + +static void process(struct context *cx, void **loc) { + void *obj = *loc; + if (obj != NULL) + *loc = forward(cx, obj); +} +static void collect(struct context *cx, size_t bytes) { + // fprintf(stderr, "start collect #%ld:\n", cx->count); + flip(cx); + uintptr_t grey = cx->hp; + for (struct handle *h = cx->roots; h; h = h->next) + process(cx, &h->v); + // fprintf(stderr, "pushed %zd bytes in roots\n", cx->hp - grey); + while(grey < cx->hp) + grey = scan(cx, grey); + // fprintf(stderr, "%zd bytes copied\n", (cx->size>>1)-(cx->limit-cx->hp)); + + if (cx->limit - cx->hp < bytes) { + fprintf(stderr, "ran out of space, heap size %zu\n", cx->size); + abort(); + } +} + +static inline void* allocate(struct context *cx, enum alloc_kind kind, + size_t size) { + while (1) { + uintptr_t addr = cx->hp; + uintptr_t new_hp = align_up (addr + size, ALIGNMENT); + if (cx->limit < new_hp) { + collect(cx, size); + continue; + } + cx->hp = new_hp; + void *ret = (void *)addr; + uintptr_t *header_word = ret; + *header_word = kind; + if (kind == NODE) + clear_memory(addr + sizeof(uintptr_t), size - sizeof(uintptr_t)); + return ret; + } +} + +#define HANDLE_TO(T) union { T* v; struct handle handle; } +#define HANDLE_REF(h) h.v +#define HANDLE_SET(h,val) do { h.v = val; } while (0) +#define PUSH_HANDLE(cx, h) push_handle(cx, &h.handle) +#define POP_HANDLE(cx, h) pop_handle(cx, &h.handle) + +static inline void push_handle(struct context *cx, struct handle *handle) { + handle->next = cx->roots; + cx->roots = handle; +} + +static inline void pop_handle(struct context *cx, struct handle *handle) { + cx->roots = handle->next; +} + +static inline void init_field(void **addr, void *val) { + *addr = val; +} +static inline void set_field(void **addr, void *val) { + *addr = val; +} +static inline void* get_field(void **addr) { + return *addr; +} + +static inline void initialize_gc(struct context *cx, size_t size) { + size = align_up(size, getpagesize()); + + void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("mmap failed"); + abort(); + } + cx->hp = cx->base = (uintptr_t) mem; + cx->size = size; + cx->count = -1; + flip(cx); + cx->roots = NULL; +} + +static inline void print_start_gc_stats(struct context *cx) { +} + +static inline void print_end_gc_stats(struct context *cx) { + printf("Completed %ld collections\n", cx->count); + printf("Heap size is %zd\n", cx->size); +} From 
2a619ba67dcbddd495e3112fd140824d8fbdada0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 2 Mar 2022 09:17:23 +0100 Subject: [PATCH 006/403] Add README --- Makefile | 2 +- README.md | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 README.md diff --git a/Makefile b/Makefile index c5573977c..3b301d3b7 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -TESTS=GCBench MT_GCBench MT_GCBench2 +TESTS=GCBench # MT_GCBench MT_GCBench2 COLLECTORS=bdw semi CC=gcc diff --git a/README.md b/README.md new file mode 100644 index 000000000..518736410 --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +# GC workbench + +This repository is a workbench for implementing different GCs. It's a +scratch space. + +## License + +GCBench.c, MT_GCBench.c, and MT_GCBench2.c are from +https://hboehm.info/gc/gc_bench/ and have a somewhat unclear license. I +have modified GCBench significantly so that I can slot in different GC +implementations. The GC implementations themselves are available under +a MIT-style license, the text of which follows: + +``` +Copyright (c) 2022 Andy Wingo + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` From 283721b39a8b8aeb9ffaaaddf0ad5750a2712c43 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Mar 2022 15:27:22 +0100 Subject: [PATCH 007/403] Refactor handling of precise and conservative roots --- Makefile | 4 ++-- bdw.h | 18 ++---------------- conservative-roots.h | 7 +++++++ precise-roots.h | 19 +++++++++++++++++++ semi.h | 20 +------------------- 5 files changed, 31 insertions(+), 37 deletions(-) create mode 100644 conservative-roots.h create mode 100644 precise-roots.h diff --git a/Makefile b/Makefile index 3b301d3b7..bd8e4dc8b 100644 --- a/Makefile +++ b/Makefile @@ -8,10 +8,10 @@ ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) all: $(ALL_TESTS) -bdw-%: bdw.h %.c +bdw-%: bdw.h conservative-roots.h %.c $(CC) $(CFLAGS) -lpthread `pkg-config --libs --cflags bdw-gc` -I. -DGC_BDW -o $@ $*.c -semi-%: semi.h %.c +semi-%: semi.h precise-roots.h %.c $(CC) $(CFLAGS) -I. -DGC_SEMI -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/bdw.h b/bdw.h index 571f67639..7a5538d40 100644 --- a/bdw.h +++ b/bdw.h @@ -1,3 +1,5 @@ +#include "conservative-roots.h" + // When pthreads are used, let `libgc' know about it and redirect // allocation calls such as `GC_MALLOC ()' to (contention-free, faster) // thread-local allocation. 
@@ -32,22 +34,6 @@ static inline void* allocate(struct context *cx, enum alloc_kind kind, abort(); } -struct handle { - void *v; -}; - -#define HANDLE_TO(T) union { T* v; struct handle handle; } -#define HANDLE_REF(h) h.v -#define HANDLE_SET(h,val) do { h.v = val; } while (0) -#define PUSH_HANDLE(cx, h) push_handle(cx, &h.handle) -#define POP_HANDLE(cx, h) pop_handle(cx, &h.handle) - -static inline void push_handle(struct context *cx, struct handle *handle) { -} - -static inline void pop_handle(struct context *cx, struct handle *handle) { -} - static inline void init_field(void **addr, void *val) { *addr = val; } diff --git a/conservative-roots.h b/conservative-roots.h new file mode 100644 index 000000000..7f2db0abd --- /dev/null +++ b/conservative-roots.h @@ -0,0 +1,7 @@ +struct handle { void *unused; }; + +#define HANDLE_TO(T) union { T* v; struct handle handle; } +#define HANDLE_REF(h) h.v +#define HANDLE_SET(h,val) do { h.v = val; } while (0) +#define PUSH_HANDLE(cx, h) do { (void) &h; } while (0) +#define POP_HANDLE(cx, h) do { (void) &h; } while (0) diff --git a/precise-roots.h b/precise-roots.h new file mode 100644 index 000000000..919154b99 --- /dev/null +++ b/precise-roots.h @@ -0,0 +1,19 @@ +struct handle { + void *v; + struct handle *next; +}; + +#define HANDLE_TO(T) union { T* v; struct handle handle; } +#define HANDLE_REF(h) h.v +#define HANDLE_SET(h,val) do { h.v = val; } while (0) +#define PUSH_HANDLE(cx, h) push_handle(&cx->roots, &h.handle) +#define POP_HANDLE(cx, h) pop_handle(&cx->roots, &h.handle) + +static inline void push_handle(struct handle **roots, struct handle *handle) { + handle->next = *roots; + *roots = handle; +} + +static inline void pop_handle(struct handle **roots, struct handle *handle) { + *roots = handle->next; +} diff --git a/semi.h b/semi.h index 16bb8566c..37b9f4ef4 100644 --- a/semi.h +++ b/semi.h @@ -4,10 +4,7 @@ #include #include -struct handle { - void *v; - struct handle *next; -}; +#include "precise-roots.h" struct context { uintptr_t hp; @@ -143,21 +140,6 @@ static inline void* allocate(struct context *cx, enum alloc_kind kind, } } -#define HANDLE_TO(T) union { T* v; struct handle handle; } -#define HANDLE_REF(h) h.v -#define HANDLE_SET(h,val) do { h.v = val; } while (0) -#define PUSH_HANDLE(cx, h) push_handle(cx, &h.handle) -#define POP_HANDLE(cx, h) pop_handle(cx, &h.handle) - -static inline void push_handle(struct context *cx, struct handle *handle) { - handle->next = cx->roots; - cx->roots = handle; -} - -static inline void pop_handle(struct context *cx, struct handle *handle) { - cx->roots = handle->next; -} - static inline void init_field(void **addr, void *val) { *addr = val; } From 7b85284a89ccdd1fe34754c3f8c18903254d6f1f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 7 Mar 2022 10:23:05 +0100 Subject: [PATCH 008/403] Add mark-sweep collector --- GCBench.c | 4 +- Makefile | 5 +- mark-sweep.h | 523 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 530 insertions(+), 2 deletions(-) create mode 100644 mark-sweep.h diff --git a/GCBench.c b/GCBench.c index a16eb6eb1..6434fbdfb 100644 --- a/GCBench.c +++ b/GCBench.c @@ -42,10 +42,12 @@ #include #include -#ifdef GC_BDW +#if defined(GC_BDW) #include "bdw.h" #elif defined(GC_SEMI) #include "semi.h" +#elif defined(GC_MARK_SWEEP) +#include "mark-sweep.h" #else #error unknown gc #endif diff --git a/Makefile b/Makefile index bd8e4dc8b..0067cec1a 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ TESTS=GCBench # MT_GCBench MT_GCBench2 -COLLECTORS=bdw semi +COLLECTORS=bdw 
semi mark-sweep CC=gcc CFLAGS=-Wall -O2 -g @@ -14,6 +14,9 @@ bdw-%: bdw.h conservative-roots.h %.c semi-%: semi.h precise-roots.h %.c $(CC) $(CFLAGS) -I. -DGC_SEMI -o $@ $*.c +mark-sweep-%: mark-sweep.h precise-roots.h %.c + $(CC) $(CFLAGS) -I. -DGC_MARK_SWEEP -o $@ $*.c + check: $(addprefix test-$(TARGET),$(TARGETS)) test-%: $(ALL_TESTS) diff --git a/mark-sweep.h b/mark-sweep.h new file mode 100644 index 000000000..9a2d3b422 --- /dev/null +++ b/mark-sweep.h @@ -0,0 +1,523 @@ +#include +#include +#include +#include +#include + +#include "precise-roots.h" + +#define STATIC_ASSERT_EQ(a, b) _Static_assert((a) == (b), "eq") + +#ifndef NDEBUG +#define ASSERT(x) do { if (!(x)) __builtin_trap(); } while (0) +#else +#define ASSERT(x) do { } while (0) +#endif +#define ASSERT_EQ(a,b) ASSERT((a) == (b)) + +#define GRANULE_SIZE 8 +#define GRANULE_SIZE_LOG_2 3 +#define LARGE_OBJECT_THRESHOLD 256 +#define LARGE_OBJECT_GRANULE_THRESHOLD 32 + +STATIC_ASSERT_EQ(GRANULE_SIZE, 1 << GRANULE_SIZE_LOG_2); +STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, + LARGE_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); + +// There are small object pages for allocations of these sizes. +#define FOR_EACH_SMALL_OBJECT_GRANULES(M) \ + M(2) M(3) M(4) M(5) M(6) M(8) M(10) M(16) M(32) + +enum small_object_size { +#define SMALL_OBJECT_GRANULE_SIZE(i) SMALL_OBJECT_##i, + FOR_EACH_SMALL_OBJECT_GRANULES(SMALL_OBJECT_GRANULE_SIZE) +#undef SMALL_OBJECT_GRANULE_SIZE + SMALL_OBJECT_SIZES, + NOT_SMALL_OBJECT = SMALL_OBJECT_SIZES +}; + +static const uint8_t small_object_granule_sizes[] = +{ +#define SMALL_OBJECT_GRANULE_SIZE(i) i, + FOR_EACH_SMALL_OBJECT_GRANULES(SMALL_OBJECT_GRANULE_SIZE) +#undef SMALL_OBJECT_GRANULE_SIZE +}; + +static enum small_object_size granules_to_small_object_size(unsigned granules) { + if (granules <= 1) return NOT_SMALL_OBJECT; +#define TEST_GRANULE_SIZE(i) if (granules <= i) return SMALL_OBJECT_##i; + FOR_EACH_SMALL_OBJECT_GRANULES(TEST_GRANULE_SIZE); +#undef TEST_GRANULE_SIZE + return NOT_SMALL_OBJECT; +} + +static uintptr_t align_up(uintptr_t addr, size_t align) { + return (addr + align - 1) & ~(align-1); +} + +static inline size_t size_to_granules(size_t size) { + return (size + GRANULE_SIZE - 1) >> GRANULE_SIZE_LOG_2; +} + +// Object kind is stored in low bits of first word of all heap objects +// (allocated or free). +enum gcobj_kind { GCOBJ_TINY, GCOBJ }; + +// gcobj_kind is in the low bit of tag. +static const uintptr_t gcobj_kind_bit = (1 << 0); +static inline enum gcobj_kind tag_gcobj_kind(uintptr_t tag) { + return tag & gcobj_kind_bit; +} + +// If bit 1 of a tag is set, the object is potentially live. allocate() +// returns objects with this flag set. When sweep() adds an object to +// the freelist, it gets added as dead (with this flag unset). If +// sweep() ever sees a dead object, then the object represents wasted +// space in the form of fragmentation. +static const uintptr_t gcobj_live_bit = (1 << 1); +static inline int tag_maybe_live(uintptr_t tag) { + return tag & gcobj_live_bit; +} + +// The mark bit is bit 2, and can only ever be set on allocated object +// (i.e. never for objects on a free list). It is cleared by the +// sweeper before the next collection. 
+static const uintptr_t gcobj_mark_bit = (1 << 2); +static inline int tag_marked(uintptr_t tag) { + return tag & gcobj_mark_bit; +} +static inline void tag_set_marked(uintptr_t *tag_loc) { + *tag_loc |= gcobj_mark_bit; +} +static inline void tag_clear_marked(uintptr_t *tag_loc) { + *tag_loc &= ~gcobj_mark_bit; +} + +// Alloc kind is in bits 3-10, for live objects. +static const uintptr_t gcobj_alloc_kind_mask = 0xff; +static const uintptr_t gcobj_alloc_kind_shift = 3; +static inline uint8_t tag_live_alloc_kind(uintptr_t tag) { + return (tag >> gcobj_alloc_kind_shift) & gcobj_alloc_kind_mask; +} + +// For free objects, bits 2 and up are free. Non-tiny objects store the +// object size in granules there. +static const uintptr_t gcobj_free_granules_shift = 2; +static inline uintptr_t tag_free_granules(uintptr_t tag) { + return tag >> gcobj_free_granules_shift; +} + +static inline uintptr_t tag_free(enum gcobj_kind kind, size_t granules) { + return kind | (granules << gcobj_free_granules_shift); +} +static inline uintptr_t tag_live(enum gcobj_kind kind, uint8_t alloc_kind) { + return kind | gcobj_live_bit | + ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift); +} +static inline uintptr_t tag_free_tiny(void) { + return tag_free(GCOBJ_TINY, 0); +} + +// The gcobj_free_tiny and gcobj_free structs define the fields in free +// tiny (1-granule), and non-tiny (2 granules and up) objects. +struct gcobj_free_tiny { + // Low 2 bits of tag are GCOBJ_TINY, which is 0. Bit 2 is live bit; + // never set for free objects. Therefore for free objects, the + // 8-byte-aligned next pointer can alias the tag. + union { + uintptr_t tag; + struct gcobj_free_tiny *next; + }; +}; + +// Objects from 2 granules and up. +struct gcobj_free { + // For free objects, we store the granule size in the tag's payload. + // Next pointer only valid for objects on small freelist. + uintptr_t tag; + struct gcobj_free *next; +}; + +struct gcobj { + union { + uintptr_t tag; + struct gcobj_free_tiny free_tiny; + struct gcobj_free free; + uintptr_t words[0]; + void *pointers[0]; + }; +}; + +static inline enum gcobj_kind gcobj_kind(struct gcobj *obj) { + return tag_gcobj_kind (obj->tag); +} + +struct context { + // Segregated freelists of tiny and small objects. + struct gcobj_free_tiny *tiny_objects; + struct gcobj_free *small_objects[SMALL_OBJECT_SIZES]; + // Unordered list of large objects. 
+ struct gcobj_free *large_objects; + uintptr_t base; + size_t size; + uintptr_t sweep; + struct handle *roots; + long count; +}; + +static inline struct gcobj_free** +get_small_object_freelist(struct context *cx, enum small_object_size kind) { + ASSERT(kind < SMALL_OBJECT_SIZES); + return &cx->small_objects[kind]; +} + +#define GC_HEADER uintptr_t _gc_header + +enum alloc_kind { NODE, DOUBLE_ARRAY }; + +typedef void (*field_visitor)(struct context *, void **ref); + +static inline size_t node_size(void *obj) __attribute__((always_inline)); +static inline size_t double_array_size(void *obj) __attribute__((always_inline)); +static inline void visit_node_fields(struct context *cx, void *obj, field_visitor visit) __attribute__((always_inline)); +static inline void visit_double_array_fields(struct context *cx, void *obj, field_visitor visit) __attribute__((always_inline)); + +static inline void clear_memory(uintptr_t addr, size_t size) { + memset((char*)addr, 0, size); +} + +static void collect(struct context *cx) __attribute__((noinline)); +static void mark(struct context *cx, void *p); + +static inline void visit(struct context *cx, void **loc) { + mark(cx, *loc); +} + +static void mark(struct context *cx, void *p) { + // A production mark implementation would use a worklist, to avoid + // stack overflow. This implementation just uses the call stack. + struct gcobj *obj = p; + if (obj == NULL) + return; + if (tag_marked(obj->tag)) + return; + tag_set_marked(&obj->tag); + switch (tag_live_alloc_kind(obj->tag)) { + case NODE: + visit_node_fields(cx, obj, visit); + break; + case DOUBLE_ARRAY: + visit_double_array_fields(cx, obj, visit); + break; + default: + abort (); + } +} + +static void clear_freelists(struct context *cx) { + cx->tiny_objects = NULL; + for (int i = 0; i < SMALL_OBJECT_SIZES; i++) + cx->small_objects[i] = NULL; + cx->large_objects = NULL; +} + +static void collect(struct context *cx) { + // fprintf(stderr, "start collect #%ld:\n", cx->count); + for (struct handle *h = cx->roots; h; h = h->next) + mark(cx, h->v); + // fprintf(stderr, "done marking\n"); + cx->sweep = cx->base; + clear_freelists(cx); + cx->count++; +} + +static void push_free_tiny(struct gcobj_free_tiny **loc, + struct gcobj_free_tiny *obj) { + // Rely on obj->next having low bits being 0, indicating a non-live + // tiny object. + obj->next = *loc; + *loc = obj; +} + +static void push_free(struct gcobj_free **loc, struct gcobj_free *obj, + size_t granules) { + obj->tag = tag_free(GCOBJ, granules); + obj->next = *loc; + *loc = obj; +} + +static void push_tiny(struct context *cx, void *obj) { + push_free_tiny(&cx->tiny_objects, obj); +} + +static void push_small(struct context *cx, void *region, + enum small_object_size kind, size_t region_granules) { + uintptr_t addr = (uintptr_t) region; + while (region_granules) { + size_t granules = small_object_granule_sizes[kind]; + struct gcobj_free **loc = get_small_object_freelist(cx, kind); + while (granules <= region_granules) { + push_free(loc, (struct gcobj_free*) addr, granules); + region_granules -= granules; + addr += granules * GRANULE_SIZE; + } + if (region_granules == 1) { + // Region is actually a tiny object. + push_free_tiny(&cx->tiny_objects, (struct gcobj_free_tiny *)addr); + return; + } + // Fit any remaining granules into smaller freelists. 
+ kind--; + } +} + +static void push_large(struct context *cx, void *region, size_t granules) { + push_free(&cx->large_objects, region, granules); +} + +static void reclaim(struct context *cx, void *obj, size_t granules) { + if (granules == 1) { + push_tiny(cx, obj); + } else if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) { + push_small(cx, obj, SMALL_OBJECT_SIZES - 1, granules); + } else { + push_large(cx, obj, granules); + } +} + +static void split_large_object(struct context *cx, + struct gcobj_free *large, + size_t granules) { + size_t large_granules = tag_free_granules(large->tag); + ASSERT(large_granules >= granules); + if (large_granules == granules) + return; + + char *tail = ((char*)large) + granules * GRANULE_SIZE; + reclaim(cx, tail, large_granules - granules); +} + +static void unlink_large_object(struct gcobj_free **prev, + struct gcobj_free *large) { + *prev = large->next; +} + +static size_t live_object_granules(struct gcobj *obj) { + enum gcobj_kind size_kind = tag_gcobj_kind(obj->tag); + if (size_kind == GCOBJ_TINY) + return 1; + size_t bytes; + switch (tag_live_alloc_kind (obj->tag)) { + case NODE: + bytes = node_size(obj); + break; + case DOUBLE_ARRAY: + bytes = double_array_size(obj); + break; + default: + abort (); + } + size_t granules = size_to_granules(bytes); + if (granules > LARGE_OBJECT_GRANULE_THRESHOLD) + return granules; + return small_object_granule_sizes[granules_to_small_object_size(granules)]; +} + +static size_t free_object_granules(struct gcobj *obj) { + enum gcobj_kind size_kind = tag_gcobj_kind(obj->tag); + if (size_kind == GCOBJ_TINY) + return 1; + return tag_free_granules(obj->tag); +} + +// Sweep some heap to reclaim free space. Return 1 if there is more +// heap to sweep, or 0 if we reached the end. +static int sweep(struct context *cx) { + // Sweep until we have reclaimed 128 granules (1024 kB), or we reach + // the end of the heap. + ssize_t to_reclaim = 128; + uintptr_t sweep = cx->sweep; + uintptr_t limit = cx->base + cx->size; + + while (to_reclaim > 0 && sweep < limit) { + struct gcobj *obj = (struct gcobj*)sweep; + size_t obj_granules = tag_maybe_live(obj->tag) + ? live_object_granules(obj) : free_object_granules(obj); + sweep += obj_granules * GRANULE_SIZE; + if (tag_maybe_live(obj->tag) && tag_marked(obj->tag)) { + // Object survived collection; clear mark and continue sweeping. + tag_clear_marked(&obj->tag); + } else { + // Found a free object. Combine with any following free objects. + // To avoid fragmentation, don't limit the amount to reclaim. + to_reclaim -= obj_granules; + while (sweep < limit) { + struct gcobj *next = (struct gcobj*)sweep; + if (tag_maybe_live(next->tag) && tag_marked(next->tag)) + break; + size_t next_granules = tag_maybe_live(next->tag) + ? 
live_object_granules(next) : free_object_granules(next); + sweep += next_granules * GRANULE_SIZE; + to_reclaim -= next_granules; + obj_granules += next_granules; + } + memset(((char*)obj) + GRANULE_SIZE, 0, (obj_granules - 1) * GRANULE_SIZE); + reclaim(cx, obj, obj_granules); + } + } + + cx->sweep = sweep; + return sweep < limit; +} + +static void* allocate_large(struct context *cx, enum alloc_kind kind, + size_t granules) { + int swept_from_beginning = 0; + struct gcobj_free *already_scanned = NULL; + while (1) { + do { + struct gcobj_free **prev = &cx->large_objects; + for (struct gcobj_free *large = cx->large_objects; + large != already_scanned; + prev = &large->next, large = large->next) { + if (tag_free_granules(large->tag) >= granules) { + unlink_large_object(prev, large); + split_large_object(cx, large, granules); + large->tag = tag_live(GCOBJ, kind); + return large; + } + } + already_scanned = cx->large_objects; + } while (sweep (cx)); + + // No large object, and we swept across the whole heap. Collect. + if (swept_from_beginning) { + fprintf(stderr, "ran out of space, heap size %zu\n", cx->size); + abort(); + } else { + collect(cx); + swept_from_beginning = 1; + } + } +} + +static void fill_small(struct context *cx, enum small_object_size kind) { + int swept_from_beginning = 0; + while (1) { + // First see if there are small objects already on the freelists + // that can be split. + for (enum small_object_size next_kind = kind; + next_kind < SMALL_OBJECT_SIZES; + next_kind++) { + struct gcobj_free **loc = get_small_object_freelist(cx, next_kind); + if (*loc) { + if (kind != next_kind) { + struct gcobj_free *ret = *loc; + *loc = ret->next; + push_small(cx, ret, kind, + small_object_granule_sizes[next_kind]); + } + return; + } + } + + // Otherwise if there is a large object, take and split it. 
+ struct gcobj_free *large = cx->large_objects; + if (large) { + unlink_large_object(&cx->large_objects, large); + split_large_object(cx, large, LARGE_OBJECT_GRANULE_THRESHOLD); + push_small(cx, large, kind, LARGE_OBJECT_GRANULE_THRESHOLD); + return; + } + + if (!sweep(cx)) { + if (swept_from_beginning) { + fprintf(stderr, "ran out of space, heap size %zu\n", cx->size); + abort(); + } else { + collect(cx); + swept_from_beginning = 1; + } + } + } +} + +static inline void* allocate_small(struct context *cx, + enum alloc_kind alloc_kind, + enum small_object_size small_kind) { + struct gcobj_free **loc = get_small_object_freelist(cx, small_kind); + if (!*loc) + fill_small(cx, small_kind); + struct gcobj_free *ret = *loc; + *loc = ret->next; + ret->tag = tag_live(GCOBJ, alloc_kind); + return (void *) ret; +} + +static inline void fill_tiny(struct context *cx) { + struct gcobj_free **loc = get_small_object_freelist(cx, SMALL_OBJECT_2); + if (!*loc) + fill_small(cx, SMALL_OBJECT_2); + struct gcobj_free *small = *loc; + *loc = small->next; + struct gcobj_free_tiny *ret = (struct gcobj_free_tiny *)small; + reclaim(cx, ret, 1); + reclaim(cx, ret + 1, 1); +} + +static inline void* allocate_tiny(struct context *cx, + enum alloc_kind alloc_kind) { + if (!cx->tiny_objects) + fill_tiny(cx); + + struct gcobj_free_tiny *ret = cx->tiny_objects; + cx->tiny_objects = ret->next; + ret->tag = tag_live(GCOBJ_TINY, alloc_kind); + return ret; +} + +static inline void* allocate(struct context *cx, enum alloc_kind kind, + size_t size) { + size_t granules = size_to_granules(size); + if (granules <= 1) + return allocate_tiny(cx, kind); + if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) + return allocate_small(cx, kind, granules_to_small_object_size(granules)); + return allocate_large(cx, kind, granules); +} + +static inline void init_field(void **addr, void *val) { + *addr = val; +} +static inline void set_field(void **addr, void *val) { + *addr = val; +} +static inline void* get_field(void **addr) { + return *addr; +} + +static inline void initialize_gc(struct context *cx, size_t size) { + size = align_up(size, getpagesize()); + + void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("mmap failed"); + abort(); + } + clear_freelists(cx); + cx->base = (uintptr_t) mem; + cx->size = size; + cx->sweep = cx->base + cx->size; + cx->roots = NULL; + cx->count = 0; + reclaim(cx, mem, size_to_granules(size)); +} + +static inline void print_start_gc_stats(struct context *cx) { +} + +static inline void print_end_gc_stats(struct context *cx) { + printf("Completed %ld collections\n", cx->count); + printf("Heap size is %zd\n", cx->size); +} From 7b60164cacbde69e3420d31508d06660503a6e2f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 7 Mar 2022 11:31:28 +0100 Subject: [PATCH 009/403] Update README --- README.md | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/README.md b/README.md index 518736410..3323d1377 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,80 @@ This repository is a workbench for implementing different GCs. It's a scratch space. +## What's there + +There's just the (modified) GCBench, which is an old but standard +benchmark that allocates different sizes of binary trees. It takes a +heap of 25 MB or so, not very large, and causes somewhere between 20 and +50 collections, running in 100 to 500 milliseconds on 2022 machines. 
+ +Then there are currently three collectors: + + - `bdw.h`: The external BDW-GC conservative parallel stop-the-world + mark-sweep segregated-fits collector with lazy sweeping. + - `semi.h`: Semispace copying collector. + - `mark-sweep.h`: Stop-the-world mark-sweep segregated-fits collector + with lazy sweeping. + +The two latter collectors reserve one word per object on the header, +which makes them collect more frequently than `bdw` because the `Node` +data type takes 32 bytes instead of 24 bytes. + +These collectors are sketches and exercises for improving Guile's +garbage collector. Guile currently uses BDW-GC. In Guile if we have an +object reference we generally have to be able to know what kind of +object it is, because there are few global invariants enforced by +typing. Therefore it is reasonable to consider allowing the GC and the +application to share the first word of an object, for example to store a +mark bit, to allow the application to know what kind an object is, to +allow the GC to find references within the object, to allow the GC to +compute the object's size, and so on. + +The GCBench benchmark is small but then again many Guile processes also +are quite short-lived, so perhaps it is useful to ensure that small +heaps remain lightweight. + +Guile has a widely used C API and implements part of its run-time in C. +For this reason it may be infeasible to require precise enumeration of +GC roots -- we may need to allow GC roots to be conservatively +identified from data sections and from stacks. Such conservative roots +would be pinned, but other objects can be moved by the collector if it +chooses to do so. We assume that object references within a heap object +can be precisely identified. (The current BDW-GC scans for references +conservatively even on the heap.) + +A likely good solution for Guile would be an [Immix +collector](https://www.cs.utexas.edu/users/speedway/DaCapo/papers/immix-pldi-2008.pdf) +with conservative roots, and a parallel stop-the-world mark/evacuate +phase. We would probably follow the [Rust +implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf), +more or less, with support for per-line pinning. In an ideal world we +would work out some kind of generational solution as well, either via a +semispace nursery or via sticky mark bits, but this requires Guile to +use a write barrier -- something that's possible to do within Guile +itself but it's unclear if we can extend this obligation to users of +Guile's C API. + +In any case, these experiments also have the goal of identifying a +smallish GC abstraction in Guile, so that we might consider evolving GC +implementation in the future without too much pain. If we switch away +from BDW-GC, we should be able to evaluate that it's a win for a large +majority of use cases. + +## To do + + - [ ] Implement a parallel marker for the mark-sweep collector. + - [ ] Adapt GCBench for multiple mutator threads. + - [ ] Implement precise non-moving Immix whole-heap collector. + - [ ] Add evacuation to Immix whole-heap collector. + - [ ] Add parallelism to Immix stop-the-world phase. + - [ ] Implement conservative root-finding for the mark-sweep collector. + - [ ] Implement conservative root-finding and pinning for Immix. + - [ ] Implement generational GC with semispace nursery and mark-sweep + old generation. + - [ ] Implement generational GC with semispace nursery and Immix + old generation. 
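(Editorial aside, not part of the patch series.) To make the header-word cost noted above concrete, here is a minimal sketch, assuming an LP64 target with 8-byte pointers; the struct names are hypothetical and only the field layout mirrors `Node` in GCBench.c:

```c
#include <stdint.h>

/* Layout as bdw.h sees it: GC_HEADER expands to nothing. */
struct NodeWithoutHeader {
  struct NodeWithoutHeader *left, *right;   /* 2 x 8 bytes */
  int i, j;                                 /* 2 x 4 bytes */
};

/* Layout as semi.h and mark-sweep.h see it: one word reserved
   for a tag or forwarding pointer. */
struct NodeWithHeader {
  uintptr_t gc_header;
  struct NodeWithHeader *left, *right;
  int i, j;
};

_Static_assert(sizeof(struct NodeWithoutHeader) == 24, "assumes LP64");
_Static_assert(sizeof(struct NodeWithHeader) == 32, "assumes LP64");
```

The extra word per `Node` is what makes the precise collectors fill the same-sized heap sooner and therefore collect more often than `bdw`.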
+ ## License GCBench.c, MT_GCBench.c, and MT_GCBench2.c are from From e492da2d2bf3a8af8751b495f4e9751b59a30833 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 16:42:06 +0100 Subject: [PATCH 010/403] Add heap validation to gcbench * GCBench.c (ValidateTree): New function. --- GCBench.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/GCBench.c b/GCBench.c index 6434fbdfb..903713d3c 100644 --- a/GCBench.c +++ b/GCBench.c @@ -42,6 +42,8 @@ #include #include +#include "assert.h" + #if defined(GC_BDW) #include "bdw.h" #elif defined(GC_SEMI) @@ -170,6 +172,22 @@ static Node* MakeTree(struct context *cx, int iDepth) { } } +static void ValidateTree(Node *tree, int depth) { +#ifndef NDEBUG + ASSERT_EQ(tree->i, 0); + ASSERT_EQ(tree->j, 0); + if (depth == 0) { + ASSERT(!tree->left); + ASSERT(!tree->right); + } else { + ASSERT(tree->left); + ASSERT(tree->right); + ValidateTree(tree->left, depth - 1); + ValidateTree(tree->right, depth - 1); + } +#endif +} + static void TimeConstruction(struct context *cx, int depth) { int iNumIters = NumIters(depth); NodeHandle tempTree = { NULL }; @@ -182,6 +200,7 @@ static void TimeConstruction(struct context *cx, int depth) { for (int i = 0; i < iNumIters; ++i) { HANDLE_SET(tempTree, allocate_node(cx)); Populate(cx, depth, HANDLE_REF(tempTree)); + ValidateTree(HANDLE_REF(tempTree), depth); HANDLE_SET(tempTree, NULL); } long tFinish = currentTime(); @@ -193,6 +212,7 @@ static void TimeConstruction(struct context *cx, int depth) { long tStart = currentTime(); for (int i = 0; i < iNumIters; ++i) { HANDLE_SET(tempTree, MakeTree(cx, depth)); + ValidateTree(HANDLE_REF(tempTree), depth); HANDLE_SET(tempTree, NULL); } long tFinish = currentTime(); @@ -234,6 +254,7 @@ int main() { // Stretch the memory space quickly HANDLE_SET(tempTree, MakeTree(cx, kStretchTreeDepth)); + ValidateTree(HANDLE_REF(tempTree), kStretchTreeDepth); HANDLE_SET(tempTree, NULL); // Create a long lived object @@ -253,6 +274,8 @@ int main() { TimeConstruction(cx, d); } + ValidateTree(HANDLE_REF(longLivedTree), kLongLivedTreeDepth); + if (HANDLE_REF(longLivedTree) == 0 || HANDLE_REF(array)->values[1000] != 1.0/1000) fprintf(stderr, "Failed\n"); From 502c0455a7459fc9187e36fd42e22af026c764ba Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 11 Mar 2022 11:23:43 +0100 Subject: [PATCH 011/403] Fix mark-sweep allocator to clear contents --- mark-sweep.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mark-sweep.h b/mark-sweep.h index 9a2d3b422..8c15da53b 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -384,6 +384,7 @@ static void* allocate_large(struct context *cx, enum alloc_kind kind, unlink_large_object(prev, large); split_large_object(cx, large, granules); large->tag = tag_live(GCOBJ, kind); + large->next = NULL; return large; } } @@ -451,6 +452,7 @@ static inline void* allocate_small(struct context *cx, struct gcobj_free *ret = *loc; *loc = ret->next; ret->tag = tag_live(GCOBJ, alloc_kind); + ret->next = NULL; return (void *) ret; } From d2828975a5f9505beba3c11822fc56dc1ff72ce4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 10:48:42 +0100 Subject: [PATCH 012/403] Switch mark-sweep collector to mark stack Slows down performance though! Have to think here. 
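(Editorial aside, not part of the commit.) A minimal sketch of the worklist-style marking this series moves toward, using hypothetical names (`struct obj`, `mark_ref`, `trace_all`) rather than the `marker_*` API that serial-marker.h introduces below; the fixed 1024-entry array stands in for the growable mmap'd buffer of the real implementation, and later patches in the series experiment with marking on push versus on pop:

```c
#include <stddef.h>
#include <stdlib.h>

/* Toy object graph: just enough structure to show the technique. */
struct obj {
  int marked;
  size_t nfields;
  struct obj *fields[2];
};

/* Fixed-size worklist for the sketch only. */
static struct obj *worklist[1024];
static size_t worklist_len;

static void mark_ref(struct obj *o) {
  if (o && !o->marked) {
    o->marked = 1;                  /* mark when pushing... */
    if (worklist_len == sizeof worklist / sizeof worklist[0])
      abort();                      /* sketch: the real queue grows instead */
    worklist[worklist_len++] = o;   /* ...so each object is pushed at most once */
  }
}

static void trace_all(struct obj **roots, size_t nroots) {
  for (size_t i = 0; i < nroots; i++)
    mark_ref(roots[i]);
  while (worklist_len) {            /* drain iteratively instead of recursing */
    struct obj *o = worklist[--worklist_len];
    for (size_t i = 0; i < o->nfields; i++)
      mark_ref(o->fields[i]);
  }
}
```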
--- Makefile | 2 +- assert.h | 15 ++++++ debug.h | 10 ++++ mark-sweep.h | 45 ++++++++-------- serial-marker.h | 136 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 183 insertions(+), 25 deletions(-) create mode 100644 assert.h create mode 100644 debug.h create mode 100644 serial-marker.h diff --git a/Makefile b/Makefile index 0067cec1a..6e20a089e 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ bdw-%: bdw.h conservative-roots.h %.c semi-%: semi.h precise-roots.h %.c $(CC) $(CFLAGS) -I. -DGC_SEMI -o $@ $*.c -mark-sweep-%: mark-sweep.h precise-roots.h %.c +mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %.c $(CC) $(CFLAGS) -I. -DGC_MARK_SWEEP -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/assert.h b/assert.h new file mode 100644 index 000000000..3133f105c --- /dev/null +++ b/assert.h @@ -0,0 +1,15 @@ +#ifndef ASSERT_H +#define ASSERT_H + +#define STATIC_ASSERT_EQ(a, b) _Static_assert((a) == (b), "eq") + +#define UNLIKELY(e) __builtin_expect(e, 0) + +#ifndef NDEBUG +#define ASSERT(x) do { if (UNLIKELY(!(x))) __builtin_trap(); } while (0) +#else +#define ASSERT(x) do { } while (0) +#endif +#define ASSERT_EQ(a,b) ASSERT((a) == (b)) + +#endif // ASSERT_H diff --git a/debug.h b/debug.h new file mode 100644 index 000000000..7b161c556 --- /dev/null +++ b/debug.h @@ -0,0 +1,10 @@ +#ifndef DEBUG_H +#define DEBUG_H + +#ifndef NDEBUG +#define DEBUG(...) fprintf (stderr, "DEBUG: " __VA_ARGS__) +#else +#define DEBUG(...) do { } while (0) +#endif + +#endif // DEBUG_H diff --git a/mark-sweep.h b/mark-sweep.h index 8c15da53b..747c46c72 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -4,16 +4,9 @@ #include #include +#include "assert.h" #include "precise-roots.h" - -#define STATIC_ASSERT_EQ(a, b) _Static_assert((a) == (b), "eq") - -#ifndef NDEBUG -#define ASSERT(x) do { if (!(x)) __builtin_trap(); } while (0) -#else -#define ASSERT(x) do { } while (0) -#endif -#define ASSERT_EQ(a,b) ASSERT((a) == (b)) +#include "serial-marker.h" #define GRANULE_SIZE 8 #define GRANULE_SIZE_LOG_2 3 @@ -163,8 +156,13 @@ struct context { uintptr_t sweep; struct handle *roots; long count; + struct marker marker; }; +static inline struct marker* context_marker(struct context *cx) { + return &cx->marker; +} + static inline struct gcobj_free** get_small_object_freelist(struct context *cx, enum small_object_size kind) { ASSERT(kind < SMALL_OBJECT_SIZES); @@ -187,27 +185,21 @@ static inline void clear_memory(uintptr_t addr, size_t size) { } static void collect(struct context *cx) __attribute__((noinline)); -static void mark(struct context *cx, void *p); -static inline void visit(struct context *cx, void **loc) { - mark(cx, *loc); +static inline int mark_object(struct gcobj *obj) { + if (tag_marked(obj->tag)) + return 0; + tag_set_marked(&obj->tag); + return 1; } -static void mark(struct context *cx, void *p) { - // A production mark implementation would use a worklist, to avoid - // stack overflow. This implementation just uses the call stack. 
- struct gcobj *obj = p; - if (obj == NULL) - return; - if (tag_marked(obj->tag)) - return; - tag_set_marked(&obj->tag); +static void process(struct context *cx, struct gcobj *obj) { switch (tag_live_alloc_kind(obj->tag)) { case NODE: - visit_node_fields(cx, obj, visit); + visit_node_fields(cx, obj, marker_visit); break; case DOUBLE_ARRAY: - visit_double_array_fields(cx, obj, visit); + visit_double_array_fields(cx, obj, marker_visit); break; default: abort (); @@ -223,8 +215,11 @@ static void clear_freelists(struct context *cx) { static void collect(struct context *cx) { // fprintf(stderr, "start collect #%ld:\n", cx->count); + marker_prepare(cx); for (struct handle *h = cx->roots; h; h = h->next) - mark(cx, h->v); + marker_visit_root(cx, &h->v); + marker_trace(cx, process); + marker_release(cx); // fprintf(stderr, "done marking\n"); cx->sweep = cx->base; clear_freelists(cx); @@ -513,6 +508,8 @@ static inline void initialize_gc(struct context *cx, size_t size) { cx->sweep = cx->base + cx->size; cx->roots = NULL; cx->count = 0; + if (!marker_init(cx)) + abort(); reclaim(cx, mem, size_to_granules(size)); } diff --git a/serial-marker.h b/serial-marker.h new file mode 100644 index 000000000..432080a02 --- /dev/null +++ b/serial-marker.h @@ -0,0 +1,136 @@ +#ifndef SERIAL_TRACE_H +#define SERIAL_TRACE_H + +#include +#include + +#include "assert.h" +#include "debug.h" + +struct mark_stack { + size_t size; + size_t next; + uintptr_t *buf; +}; + +static const size_t mark_stack_max_size = + (1ULL << (sizeof(uintptr_t) * 8 - 1)) / sizeof(uintptr_t); +static const size_t mark_stack_release_byte_threshold = 1 * 1024 * 1024; + +static void* +mark_stack_alloc(size_t size) { + void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("Failed to grow mark stack"); + DEBUG("Failed to allocate %zu bytes", size); + return NULL; + } + return mem; +} + +static int +mark_stack_init(struct mark_stack *stack) { + stack->size = getpagesize(); + stack->next = 0; + stack->buf = mark_stack_alloc(stack->size); + return !!stack->buf; +} + +static int +mark_stack_grow(struct mark_stack *stack) { + uintptr_t size = stack->size; + if (size >= mark_stack_max_size) { + DEBUG("mark stack already at max size of %zu bytes", size); + return 0; + } + size *= 2; + uintptr_t *buf = mark_stack_alloc(size); + if (!buf) + return 0; + memcpy(buf, stack->buf, stack->next * sizeof(uintptr_t)); + munmap(stack->buf, stack->size * sizeof(uintptr_t)); + stack->size = size; + stack->buf = buf; + return 1; +} + +static inline void +mark_stack_push(struct mark_stack *stack, void *p) { + size_t next = stack->next; + if (UNLIKELY(next == stack->size)) { + if (!mark_stack_grow(stack)) + abort(); + } + stack->buf[next] = (uintptr_t)p; + stack->next = next + 1; +} + +static inline void* +mark_stack_pop(struct mark_stack *stack) { + size_t next = stack->next; + if (UNLIKELY(next == 0)) + return NULL; + uintptr_t ret = stack->buf[next - 1]; + stack->next = next - 1; + return (void*)ret; +} + +static void +mark_stack_release(struct mark_stack *stack) { + size_t byte_size = stack->size * sizeof(uintptr_t); + if (byte_size >= mark_stack_release_byte_threshold) + madvise(stack->buf, byte_size, MADV_DONTNEED); +} + +static void +mark_stack_destroy(struct mark_stack *stack) { + size_t byte_size = stack->size * sizeof(uintptr_t); + munmap(stack->buf, byte_size); +} + +struct marker { + struct mark_stack stack; +}; + +struct context; +static inline struct marker* context_marker(struct context 
*cx); + +static int +marker_init(struct context *cx) { + return mark_stack_init(&context_marker(cx)->stack); +} +static void marker_prepare(struct context *cx) {} +static void marker_release(struct context *cx) { + mark_stack_release(&context_marker(cx)->stack); +} + +struct gcobj; +static inline void marker_visit(struct context *cx, void **loc) __attribute__((always_inline)); +static inline void marker_trace(struct context *cx, + void (*)(struct context *, struct gcobj *)) + __attribute__((always_inline)); +static inline int mark_object(struct gcobj *obj) __attribute__((always_inline)); + +static inline void +marker_visit(struct context *cx, void **loc) { + struct gcobj *obj = *loc; + if (obj) { + __builtin_prefetch(obj); + mark_stack_push(&context_marker(cx)->stack, obj); + } +} +static inline void +marker_visit_root(struct context *cx, void **loc) { + marker_visit(cx, loc); +} +static inline void +marker_trace(struct context *cx, + void (*process)(struct context *, struct gcobj *)) { + struct gcobj *obj; + while ((obj = mark_stack_pop(&context_marker(cx)->stack))) + if (mark_object(obj)) + process(cx, obj); +} + +#endif // SERIAL_MARK_H From 45405efe56af4992cf94e2b9187bf82ba35318ee Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 11:09:41 +0100 Subject: [PATCH 013/403] Move to mark queue, is it an improvement? --- serial-marker.h | 113 ++++++++++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 47 deletions(-) diff --git a/serial-marker.h b/serial-marker.h index 432080a02..7459ad0b2 100644 --- a/serial-marker.h +++ b/serial-marker.h @@ -7,22 +7,23 @@ #include "assert.h" #include "debug.h" -struct mark_stack { +struct mark_queue { size_t size; - size_t next; + size_t read; + size_t write; uintptr_t *buf; }; -static const size_t mark_stack_max_size = +static const size_t mark_queue_max_size = (1ULL << (sizeof(uintptr_t) * 8 - 1)) / sizeof(uintptr_t); -static const size_t mark_stack_release_byte_threshold = 1 * 1024 * 1024; +static const size_t mark_queue_release_byte_threshold = 1 * 1024 * 1024; static void* -mark_stack_alloc(size_t size) { - void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, +mark_queue_alloc(size_t size) { + void *mem = mmap(NULL, size * sizeof(uintptr_t), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { - perror("Failed to grow mark stack"); + perror("Failed to grow mark queue"); DEBUG("Failed to allocate %zu bytes", size); return NULL; } @@ -30,67 +31,85 @@ mark_stack_alloc(size_t size) { } static int -mark_stack_init(struct mark_stack *stack) { - stack->size = getpagesize(); - stack->next = 0; - stack->buf = mark_stack_alloc(stack->size); - return !!stack->buf; +mark_queue_init(struct mark_queue *q) { + q->size = getpagesize(); + q->read = 0; + q->write = 0; + q->buf = mark_queue_alloc(q->size); + return !!q->buf; } +static inline uintptr_t +mark_queue_get(struct mark_queue *q, size_t idx) { + return q->buf[idx & (q->size - 1)]; +} + +static inline void +mark_queue_put(struct mark_queue *q, size_t idx, uintptr_t x) { + q->buf[idx & (q->size - 1)] = x; +} + static int -mark_stack_grow(struct mark_stack *stack) { - uintptr_t size = stack->size; - if (size >= mark_stack_max_size) { - DEBUG("mark stack already at max size of %zu bytes", size); +mark_queue_grow(struct mark_queue *q) { + uintptr_t old_size = q->size; + size_t old_read = q->read; + size_t old_write = q->write; + uintptr_t *old_buf = q->buf; + if (old_size >= mark_queue_max_size) { + DEBUG("mark queue already at max size of %zu bytes", 
old_size); return 0; } - size *= 2; - uintptr_t *buf = mark_stack_alloc(size); - if (!buf) + uintptr_t new_size = old_size * 2; + size_t new_read = 0; + size_t new_write = 0; + uintptr_t *new_buf = mark_queue_alloc(new_size); + if (!new_buf) return 0; - memcpy(buf, stack->buf, stack->next * sizeof(uintptr_t)); - munmap(stack->buf, stack->size * sizeof(uintptr_t)); - stack->size = size; - stack->buf = buf; + + while (old_read < old_write) + new_buf[new_write++] = mark_queue_get(q, old_read++); + + munmap(old_buf, old_size * sizeof(uintptr_t)); + + q->size = new_size; + q->read = new_read; + q->write = new_write; + q->buf = new_buf; return 1; } static inline void -mark_stack_push(struct mark_stack *stack, void *p) { - size_t next = stack->next; - if (UNLIKELY(next == stack->size)) { - if (!mark_stack_grow(stack)) +mark_queue_push(struct mark_queue *q, void *p) { + if (UNLIKELY(q->write - q->read == q->size)) { + if (!mark_queue_grow(q)) abort(); } - stack->buf[next] = (uintptr_t)p; - stack->next = next + 1; + mark_queue_put(q, q->write++, (uintptr_t)p); } static inline void* -mark_stack_pop(struct mark_stack *stack) { - size_t next = stack->next; - if (UNLIKELY(next == 0)) +mark_queue_pop(struct mark_queue *q) { + if (UNLIKELY(q->read == q->write)) return NULL; - uintptr_t ret = stack->buf[next - 1]; - stack->next = next - 1; - return (void*)ret; + return (void*)mark_queue_get(q, q->read++); } static void -mark_stack_release(struct mark_stack *stack) { - size_t byte_size = stack->size * sizeof(uintptr_t); - if (byte_size >= mark_stack_release_byte_threshold) - madvise(stack->buf, byte_size, MADV_DONTNEED); +mark_queue_release(struct mark_queue *q) { + size_t byte_size = q->size * sizeof(uintptr_t); + if (byte_size >= mark_queue_release_byte_threshold) + madvise(q->buf, byte_size, MADV_DONTNEED); + q->read = q->write = 0; } static void -mark_stack_destroy(struct mark_stack *stack) { - size_t byte_size = stack->size * sizeof(uintptr_t); - munmap(stack->buf, byte_size); +mark_queue_destroy(struct mark_queue *q) { + size_t byte_size = q->size * sizeof(uintptr_t); + munmap(q->buf, byte_size); } struct marker { - struct mark_stack stack; + struct mark_queue queue; }; struct context; @@ -98,11 +117,11 @@ static inline struct marker* context_marker(struct context *cx); static int marker_init(struct context *cx) { - return mark_stack_init(&context_marker(cx)->stack); + return mark_queue_init(&context_marker(cx)->queue); } static void marker_prepare(struct context *cx) {} static void marker_release(struct context *cx) { - mark_stack_release(&context_marker(cx)->stack); + mark_queue_release(&context_marker(cx)->queue); } struct gcobj; @@ -117,7 +136,7 @@ marker_visit(struct context *cx, void **loc) { struct gcobj *obj = *loc; if (obj) { __builtin_prefetch(obj); - mark_stack_push(&context_marker(cx)->stack, obj); + mark_queue_push(&context_marker(cx)->queue, obj); } } static inline void @@ -128,7 +147,7 @@ static inline void marker_trace(struct context *cx, void (*process)(struct context *, struct gcobj *)) { struct gcobj *obj; - while ((obj = mark_stack_pop(&context_marker(cx)->stack))) + while ((obj = mark_queue_pop(&context_marker(cx)->queue))) if (mark_object(obj)) process(cx, obj); } From cfa7ea31aeebf54b2ae32b00077bf21d3a9cc52f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 11:12:45 +0100 Subject: [PATCH 014/403] Move back to marking objects instead of edges --- serial-marker.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/serial-marker.h 
b/serial-marker.h index 7459ad0b2..64f27ddb7 100644 --- a/serial-marker.h +++ b/serial-marker.h @@ -32,7 +32,7 @@ mark_queue_alloc(size_t size) { static int mark_queue_init(struct mark_queue *q) { - q->size = getpagesize(); + q->size = getpagesize() / sizeof(uintptr_t); q->read = 0; q->write = 0; q->buf = mark_queue_alloc(q->size); @@ -49,6 +49,8 @@ mark_queue_put(struct mark_queue *q, size_t idx, uintptr_t x) { q->buf[idx & (q->size - 1)] = x; } +static int mark_queue_grow(struct mark_queue *q) __attribute__((noinline)); + static int mark_queue_grow(struct mark_queue *q) { uintptr_t old_size = q->size; @@ -134,10 +136,8 @@ static inline int mark_object(struct gcobj *obj) __attribute__((always_inline)); static inline void marker_visit(struct context *cx, void **loc) { struct gcobj *obj = *loc; - if (obj) { - __builtin_prefetch(obj); + if (obj && mark_object(obj)) mark_queue_push(&context_marker(cx)->queue, obj); - } } static inline void marker_visit_root(struct context *cx, void **loc) { @@ -148,8 +148,7 @@ marker_trace(struct context *cx, void (*process)(struct context *, struct gcobj *)) { struct gcobj *obj; while ((obj = mark_queue_pop(&context_marker(cx)->queue))) - if (mark_object(obj)) - process(cx, obj); + process(cx, obj); } #endif // SERIAL_MARK_H From c612ff382552fadee4a5834840d973dfe3e556d0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 11:31:55 +0100 Subject: [PATCH 015/403] Optimize computation of size class from small object granule count --- mark-sweep.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 747c46c72..024da1ebc 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -36,12 +36,21 @@ static const uint8_t small_object_granule_sizes[] = #undef SMALL_OBJECT_GRANULE_SIZE }; +static const enum small_object_size small_object_sizes_for_granules[LARGE_OBJECT_GRANULE_THRESHOLD + 1] = { + NOT_SMALL_OBJECT, NOT_SMALL_OBJECT, SMALL_OBJECT_2, SMALL_OBJECT_3, + SMALL_OBJECT_4, SMALL_OBJECT_5, SMALL_OBJECT_6, SMALL_OBJECT_8, + SMALL_OBJECT_8, SMALL_OBJECT_10, SMALL_OBJECT_10, SMALL_OBJECT_16, + SMALL_OBJECT_16, SMALL_OBJECT_16, SMALL_OBJECT_16, SMALL_OBJECT_16, + SMALL_OBJECT_16, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, + SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, + SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, + SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, + SMALL_OBJECT_32 +}; + static enum small_object_size granules_to_small_object_size(unsigned granules) { - if (granules <= 1) return NOT_SMALL_OBJECT; -#define TEST_GRANULE_SIZE(i) if (granules <= i) return SMALL_OBJECT_##i; - FOR_EACH_SMALL_OBJECT_GRANULES(TEST_GRANULE_SIZE); -#undef TEST_GRANULE_SIZE - return NOT_SMALL_OBJECT; + ASSERT(granules <= LARGE_OBJECT_GRANULE_THRESHOLD); + return small_object_sizes_for_granules[granules]; } static uintptr_t align_up(uintptr_t addr, size_t align) { @@ -494,6 +503,14 @@ static inline void* get_field(void **addr) { } static inline void initialize_gc(struct context *cx, size_t size) { +#define SMALL_OBJECT_GRANULE_SIZE(i) \ + ASSERT_EQ(SMALL_OBJECT_##i, small_object_sizes_for_granules[i]); + FOR_EACH_SMALL_OBJECT_GRANULES(SMALL_OBJECT_GRANULE_SIZE); +#undef SMALL_OBJECT_GRANULE_SIZE + + ASSERT_EQ(SMALL_OBJECT_SIZES - 1, + small_object_sizes_for_granules[LARGE_OBJECT_GRANULE_THRESHOLD]); + size = align_up(size, getpagesize()); void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, From 
fb350fb3ffa163d98eeb494e8a5d3e870ea41009 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 11:55:43 +0100 Subject: [PATCH 016/403] Keep read/write positions when growing queue --- serial-marker.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/serial-marker.h b/serial-marker.h index 64f27ddb7..8440daac5 100644 --- a/serial-marker.h +++ b/serial-marker.h @@ -53,29 +53,27 @@ static int mark_queue_grow(struct mark_queue *q) __attribute__((noinline)); static int mark_queue_grow(struct mark_queue *q) { - uintptr_t old_size = q->size; - size_t old_read = q->read; - size_t old_write = q->write; + size_t old_size = q->size; uintptr_t *old_buf = q->buf; if (old_size >= mark_queue_max_size) { DEBUG("mark queue already at max size of %zu bytes", old_size); return 0; } - uintptr_t new_size = old_size * 2; - size_t new_read = 0; - size_t new_write = 0; + + size_t new_size = old_size * 2; uintptr_t *new_buf = mark_queue_alloc(new_size); if (!new_buf) return 0; - while (old_read < old_write) - new_buf[new_write++] = mark_queue_get(q, old_read++); + size_t old_mask = old_size - 1; + size_t new_mask = new_size - 1; + + for (size_t i = q->read; i < q->write; i++) + new_buf[i & new_mask] = old_buf[i & old_mask]; munmap(old_buf, old_size * sizeof(uintptr_t)); q->size = new_size; - q->read = new_read; - q->write = new_write; q->buf = new_buf; return 1; } From 91a330e310bea70c0a789e36c8baab2ed0827a81 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 15:04:34 +0100 Subject: [PATCH 017/403] More asserts in mark-sweep --- mark-sweep.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 024da1ebc..3b8c4aecd 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -5,6 +5,7 @@ #include #include "assert.h" +#include "debug.h" #include "precise-roots.h" #include "serial-marker.h" @@ -36,7 +37,7 @@ static const uint8_t small_object_granule_sizes[] = #undef SMALL_OBJECT_GRANULE_SIZE }; -static const enum small_object_size small_object_sizes_for_granules[LARGE_OBJECT_GRANULE_THRESHOLD + 1] = { +static const enum small_object_size small_object_sizes_for_granules[LARGE_OBJECT_GRANULE_THRESHOLD + 2] = { NOT_SMALL_OBJECT, NOT_SMALL_OBJECT, SMALL_OBJECT_2, SMALL_OBJECT_3, SMALL_OBJECT_4, SMALL_OBJECT_5, SMALL_OBJECT_6, SMALL_OBJECT_8, SMALL_OBJECT_8, SMALL_OBJECT_10, SMALL_OBJECT_10, SMALL_OBJECT_16, @@ -45,7 +46,7 @@ static const enum small_object_size small_object_sizes_for_granules[LARGE_OBJECT SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, - SMALL_OBJECT_32 + SMALL_OBJECT_32, NOT_SMALL_OBJECT }; static enum small_object_size granules_to_small_object_size(unsigned granules) { @@ -223,13 +224,13 @@ static void clear_freelists(struct context *cx) { } static void collect(struct context *cx) { - // fprintf(stderr, "start collect #%ld:\n", cx->count); + DEBUG("start collect #%ld:\n", cx->count); marker_prepare(cx); for (struct handle *h = cx->roots; h; h = h->next) marker_visit_root(cx, &h->v); marker_trace(cx, process); marker_release(cx); - // fprintf(stderr, "done marking\n"); + DEBUG("done marking\n"); cx->sweep = cx->base; clear_freelists(cx); cx->count++; @@ -504,7 +505,8 @@ static inline void* get_field(void **addr) { static inline void initialize_gc(struct context *cx, size_t size) { #define SMALL_OBJECT_GRANULE_SIZE(i) \ - 
ASSERT_EQ(SMALL_OBJECT_##i, small_object_sizes_for_granules[i]); + ASSERT_EQ(SMALL_OBJECT_##i, small_object_sizes_for_granules[i]); \ + ASSERT_EQ(SMALL_OBJECT_##i + 1, small_object_sizes_for_granules[i+1]); FOR_EACH_SMALL_OBJECT_GRANULES(SMALL_OBJECT_GRANULE_SIZE); #undef SMALL_OBJECT_GRANULE_SIZE From 5c8a8a2d3eba174565b795f246f1c27e9c9b94c2 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 15:22:01 +0100 Subject: [PATCH 018/403] Store mark bits on the side Lets the sweeper avoid chasing pointers, and is more amenable to parallel marking. --- Makefile | 6 +-- mark-sweep.h | 99 ++++++++++++++++++------------------------------- serial-marker.h | 5 ++- 3 files changed, 43 insertions(+), 67 deletions(-) diff --git a/Makefile b/Makefile index 6e20a089e..81723fd93 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) all: $(ALL_TESTS) bdw-%: bdw.h conservative-roots.h %.c - $(CC) $(CFLAGS) -lpthread `pkg-config --libs --cflags bdw-gc` -I. -DGC_BDW -o $@ $*.c + $(CC) $(CFLAGS) -DNDEBUG -lpthread `pkg-config --libs --cflags bdw-gc` -I. -DGC_BDW -o $@ $*.c semi-%: semi.h precise-roots.h %.c - $(CC) $(CFLAGS) -I. -DGC_SEMI -o $@ $*.c + $(CC) $(CFLAGS) -I. -DNDEBUG -DGC_SEMI -o $@ $*.c mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %.c - $(CC) $(CFLAGS) -I. -DGC_MARK_SWEEP -o $@ $*.c + $(CC) $(CFLAGS) -I. -DNDEBUG -DGC_MARK_SWEEP -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/mark-sweep.h b/mark-sweep.h index 3b8c4aecd..1daf90b74 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -72,40 +72,16 @@ static inline enum gcobj_kind tag_gcobj_kind(uintptr_t tag) { return tag & gcobj_kind_bit; } -// If bit 1 of a tag is set, the object is potentially live. allocate() -// returns objects with this flag set. When sweep() adds an object to -// the freelist, it gets added as dead (with this flag unset). If -// sweep() ever sees a dead object, then the object represents wasted -// space in the form of fragmentation. -static const uintptr_t gcobj_live_bit = (1 << 1); -static inline int tag_maybe_live(uintptr_t tag) { - return tag & gcobj_live_bit; -} - -// The mark bit is bit 2, and can only ever be set on allocated object -// (i.e. never for objects on a free list). It is cleared by the -// sweeper before the next collection. -static const uintptr_t gcobj_mark_bit = (1 << 2); -static inline int tag_marked(uintptr_t tag) { - return tag & gcobj_mark_bit; -} -static inline void tag_set_marked(uintptr_t *tag_loc) { - *tag_loc |= gcobj_mark_bit; -} -static inline void tag_clear_marked(uintptr_t *tag_loc) { - *tag_loc &= ~gcobj_mark_bit; -} - -// Alloc kind is in bits 3-10, for live objects. +// Alloc kind is in bits 1-8, for live objects. static const uintptr_t gcobj_alloc_kind_mask = 0xff; -static const uintptr_t gcobj_alloc_kind_shift = 3; +static const uintptr_t gcobj_alloc_kind_shift = 1; static inline uint8_t tag_live_alloc_kind(uintptr_t tag) { return (tag >> gcobj_alloc_kind_shift) & gcobj_alloc_kind_mask; } -// For free objects, bits 2 and up are free. Non-tiny objects store the +// For free objects, bits 1 and up are free. Non-tiny objects store the // object size in granules there. 
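/* Aside: a minimal, self-contained sketch (not taken from the patch itself) of
   the side mark-byte scheme this commit introduces: one byte per 8-byte
   granule, kept in a table at the start of the mapping, so marking and
   sweeping never have to chase object headers. All names below are
   illustrative only. */

#include <stddef.h>
#include <stdint.h>

#define SKETCH_GRANULE_SIZE 8

struct sketch_heap {
  uint8_t *mark_bytes;   /* one byte per granule of payload */
  uintptr_t heap_base;   /* first payload address, after the mark table */
};

/* Map an object address to its mark byte: subtract the payload base and
   divide by the granule size. */
static inline uint8_t *sketch_mark_byte(struct sketch_heap *h, void *obj) {
  uintptr_t granule = ((uintptr_t)obj - h->heap_base) / SKETCH_GRANULE_SIZE;
  return &h->mark_bytes[granule];
}

/* Returns 1 only the first time an object is seen, as mark_object does. */
static inline int sketch_mark(struct sketch_heap *h, void *obj) {
  uint8_t *byte = sketch_mark_byte(h, obj);
  if (*byte) return 0;
  *byte = 1;
  return 1;
}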
-static const uintptr_t gcobj_free_granules_shift = 2; +static const uintptr_t gcobj_free_granules_shift = 1; static inline uintptr_t tag_free_granules(uintptr_t tag) { return tag >> gcobj_free_granules_shift; } @@ -114,8 +90,7 @@ static inline uintptr_t tag_free(enum gcobj_kind kind, size_t granules) { return kind | (granules << gcobj_free_granules_shift); } static inline uintptr_t tag_live(enum gcobj_kind kind, uint8_t alloc_kind) { - return kind | gcobj_live_bit | - ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift); + return kind | ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift); } static inline uintptr_t tag_free_tiny(void) { return tag_free(GCOBJ_TINY, 0); @@ -162,6 +137,8 @@ struct context { // Unordered list of large objects. struct gcobj_free *large_objects; uintptr_t base; + uint8_t *mark_bytes; + uintptr_t heap_base; size_t size; uintptr_t sweep; struct handle *roots; @@ -196,10 +173,17 @@ static inline void clear_memory(uintptr_t addr, size_t size) { static void collect(struct context *cx) __attribute__((noinline)); -static inline int mark_object(struct gcobj *obj) { - if (tag_marked(obj->tag)) +static inline uint8_t* mark_byte(struct context *cx, struct gcobj *obj) { + uintptr_t granule = (((uintptr_t) obj) - cx->heap_base) / GRANULE_SIZE; + ASSERT(granule < (cx->heap_base - cx->base)); + return &cx->mark_bytes[granule]; +} + +static inline int mark_object(struct context *cx, struct gcobj *obj) { + uint8_t *byte = mark_byte(cx, obj); + if (*byte) return 0; - tag_set_marked(&obj->tag); + *byte = 1; return 1; } @@ -231,7 +215,7 @@ static void collect(struct context *cx) { marker_trace(cx, process); marker_release(cx); DEBUG("done marking\n"); - cx->sweep = cx->base; + cx->sweep = cx->heap_base; clear_freelists(cx); cx->count++; } @@ -328,13 +312,6 @@ static size_t live_object_granules(struct gcobj *obj) { return small_object_granule_sizes[granules_to_small_object_size(granules)]; } -static size_t free_object_granules(struct gcobj *obj) { - enum gcobj_kind size_kind = tag_gcobj_kind(obj->tag); - if (size_kind == GCOBJ_TINY) - return 1; - return tag_free_granules(obj->tag); -} - // Sweep some heap to reclaim free space. Return 1 if there is more // heap to sweep, or 0 if we reached the end. static int sweep(struct context *cx) { @@ -345,29 +322,24 @@ static int sweep(struct context *cx) { uintptr_t limit = cx->base + cx->size; while (to_reclaim > 0 && sweep < limit) { - struct gcobj *obj = (struct gcobj*)sweep; - size_t obj_granules = tag_maybe_live(obj->tag) - ? live_object_granules(obj) : free_object_granules(obj); - sweep += obj_granules * GRANULE_SIZE; - if (tag_maybe_live(obj->tag) && tag_marked(obj->tag)) { + uintptr_t sweep_base = sweep; + struct gcobj *obj = (struct gcobj*)sweep_base; + uint8_t* mark = mark_byte(cx, obj); + if (*mark) { // Object survived collection; clear mark and continue sweeping. - tag_clear_marked(&obj->tag); + ASSERT(*mark == 1); + *mark = 0; + sweep += live_object_granules(obj) * GRANULE_SIZE; } else { - // Found a free object. Combine with any following free objects. + // Found a free object. Combine with any following free space. // To avoid fragmentation, don't limit the amount to reclaim. - to_reclaim -= obj_granules; - while (sweep < limit) { - struct gcobj *next = (struct gcobj*)sweep; - if (tag_maybe_live(next->tag) && tag_marked(next->tag)) - break; - size_t next_granules = tag_maybe_live(next->tag) - ? 
live_object_granules(next) : free_object_granules(next); - sweep += next_granules * GRANULE_SIZE; - to_reclaim -= next_granules; - obj_granules += next_granules; - } - memset(((char*)obj) + GRANULE_SIZE, 0, (obj_granules - 1) * GRANULE_SIZE); - reclaim(cx, obj, obj_granules); + do { + sweep += GRANULE_SIZE, to_reclaim--, mark++; + } while (sweep < limit && !*mark); + memset((void*)(sweep_base + GRANULE_SIZE), + 0, + sweep - sweep_base - GRANULE_SIZE); + reclaim(cx, obj, (sweep - sweep_base) >> GRANULE_SIZE_LOG_2); } } @@ -523,13 +495,16 @@ static inline void initialize_gc(struct context *cx, size_t size) { } clear_freelists(cx); cx->base = (uintptr_t) mem; + cx->mark_bytes = mem; + size_t heap_admin_size = align_up(size / GRANULE_SIZE, GRANULE_SIZE); + cx->heap_base = cx->base + heap_admin_size; cx->size = size; cx->sweep = cx->base + cx->size; cx->roots = NULL; cx->count = 0; if (!marker_init(cx)) abort(); - reclaim(cx, mem, size_to_granules(size)); + reclaim(cx, (void*)cx->heap_base, size_to_granules(size - heap_admin_size)); } static inline void print_start_gc_stats(struct context *cx) { diff --git a/serial-marker.h b/serial-marker.h index 8440daac5..755064372 100644 --- a/serial-marker.h +++ b/serial-marker.h @@ -129,12 +129,13 @@ static inline void marker_visit(struct context *cx, void **loc) __attribute__((a static inline void marker_trace(struct context *cx, void (*)(struct context *, struct gcobj *)) __attribute__((always_inline)); -static inline int mark_object(struct gcobj *obj) __attribute__((always_inline)); +static inline int mark_object(struct context *cx, + struct gcobj *obj) __attribute__((always_inline)); static inline void marker_visit(struct context *cx, void **loc) { struct gcobj *obj = *loc; - if (obj && mark_object(obj)) + if (obj && mark_object(cx, obj)) mark_queue_push(&context_marker(cx)->queue, obj); } static inline void From 5edc4fa81a29d1c1d7b3ca4014b6e45e6f2ae6ea Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 15:51:11 +0100 Subject: [PATCH 019/403] More efficient sweep --- mark-sweep.h | 50 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 1daf90b74..b7cc7b610 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -312,6 +312,21 @@ static size_t live_object_granules(struct gcobj *obj) { return small_object_granule_sizes[granules_to_small_object_size(granules)]; } +static size_t next_mark(const uint8_t *mark, size_t limit) { + size_t n = 0; + for (; (((uintptr_t)mark) & 7) && n < limit; n++) + if (mark[n]) + return n; + uintptr_t *word_mark = (uintptr_t *)(mark + n); + for (; n < limit; n += sizeof(uintptr_t), word_mark++) + if (word_mark) + break; + for (; n < limit; n++) + if (mark[n]) + return n; + return limit; +} + // Sweep some heap to reclaim free space. Return 1 if there is more // heap to sweep, or 0 if we reached the end. static int sweep(struct context *cx) { @@ -322,25 +337,26 @@ static int sweep(struct context *cx) { uintptr_t limit = cx->base + cx->size; while (to_reclaim > 0 && sweep < limit) { - uintptr_t sweep_base = sweep; - struct gcobj *obj = (struct gcobj*)sweep_base; - uint8_t* mark = mark_byte(cx, obj); - if (*mark) { - // Object survived collection; clear mark and continue sweeping. - ASSERT(*mark == 1); - *mark = 0; - sweep += live_object_granules(obj) * GRANULE_SIZE; - } else { - // Found a free object. Combine with any following free space. - // To avoid fragmentation, don't limit the amount to reclaim. 
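/* Aside: an illustrative sketch (not from the patches) of the idea behind
   next_mark above: scan the side table for the next set mark byte, so whole
   runs of free granules can be reclaimed without reading any object headers.
   Note that a word-at-a-time fast path must test the loaded word (*word), not
   the pointer itself; the plain byte loop below keeps the sketch obviously
   correct. */

#include <stddef.h>
#include <stdint.h>

static size_t sketch_next_mark(const uint8_t *mark, size_t limit) {
  for (size_t n = 0; n < limit; n++)
    if (mark[n])
      return n;      /* index of the next live granule */
  return limit;      /* no live object before the end of the region */
}

/* The sweeper then alternates: reclaim [0, n) as free space, clear the mark
   byte at n, skip over that live object's granules, and repeat. */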
- do { - sweep += GRANULE_SIZE, to_reclaim--, mark++; - } while (sweep < limit && !*mark); - memset((void*)(sweep_base + GRANULE_SIZE), + uint8_t* mark = mark_byte(cx, (struct gcobj*)sweep); + size_t free_granules = next_mark(mark, + (limit - sweep) >> GRANULE_SIZE_LOG_2); + if (free_granules) { + size_t free_bytes = free_granules * GRANULE_SIZE; + memset((void*)(sweep + GRANULE_SIZE), 0, - sweep - sweep_base - GRANULE_SIZE); - reclaim(cx, obj, (sweep - sweep_base) >> GRANULE_SIZE_LOG_2); + free_bytes - GRANULE_SIZE); + reclaim(cx, (void*)sweep, free_granules); + sweep += free_bytes; + to_reclaim -= free_granules; + + mark += free_granules; + if (sweep == limit) + break; } + // Object survived collection; clear mark and continue sweeping. + ASSERT(*mark == 1); + *mark = 0; + sweep += live_object_granules((struct gcobj *)sweep) * GRANULE_SIZE; } cx->sweep = sweep; From f6ac9d25712d017e1446cf9b292e63d46ee72815 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 16:47:52 +0100 Subject: [PATCH 020/403] Ability to set heap size on command line --- GCBench.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/GCBench.c b/GCBench.c index 903713d3c..f00f71016 100644 --- a/GCBench.c +++ b/GCBench.c @@ -230,6 +230,13 @@ int main() { double kHeapMultiplier = 3; size_t kHeapSize = kHeapMaxLive * kHeapMultiplier; + if (getenv("HEAP_SIZE")) + kHeapSize = atol(getenv("HEAP_SIZE")); + if (!kHeapSize) { + fprintf(stderr, "Failed to parse HEAP_SIZE='%s'\n", getenv("HEAP_SIZE")); + return 1; + } + struct context _cx; struct context *cx = &_cx; initialize_gc(cx, kHeapSize); From 01d3f9627e14d1932b99108f9bdbcce390337107 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 11 Mar 2022 11:47:55 +0100 Subject: [PATCH 021/403] Further accelerate sweeping --- mark-sweep.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index b7cc7b610..0920821db 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -318,8 +318,15 @@ static size_t next_mark(const uint8_t *mark, size_t limit) { if (mark[n]) return n; uintptr_t *word_mark = (uintptr_t *)(mark + n); - for (; n < limit; n += sizeof(uintptr_t), word_mark++) - if (word_mark) + for (; + n + sizeof(uintptr_t) * 4 <= limit; + n += sizeof(uintptr_t) * 4, word_mark += 4) + if (word_mark[0] | word_mark[1] | word_mark[2] | word_mark[3]) + break; + for (; + n + sizeof(uintptr_t) <= limit; + n += sizeof(uintptr_t), word_mark += 1) + if (word_mark[0]) break; for (; n < limit; n++) if (mark[n]) From 77ac5303609d9fa997c9ce0e47f50924e06c581d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Mar 2022 21:06:45 +0100 Subject: [PATCH 022/403] Add beginnings of parallel marker --- GCBench.c | 3 + Makefile | 5 +- mark-sweep.h | 4 + parallel-marker.h | 269 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 parallel-marker.h diff --git a/GCBench.c b/GCBench.c index f00f71016..f229de866 100644 --- a/GCBench.c +++ b/GCBench.c @@ -50,6 +50,9 @@ #include "semi.h" #elif defined(GC_MARK_SWEEP) #include "mark-sweep.h" +#elif defined(GC_PARALLEL_MARK_SWEEP) +#define GC_PARALLEL_MARK 1 +#include "mark-sweep.h" #else #error unknown gc #endif diff --git a/Makefile b/Makefile index 81723fd93..04a23ed6c 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ TESTS=GCBench # MT_GCBench MT_GCBench2 -COLLECTORS=bdw semi mark-sweep +COLLECTORS=bdw semi mark-sweep parallel-mark-sweep CC=gcc CFLAGS=-Wall -O2 -g @@ -17,6 +17,9 @@ semi-%: semi.h precise-roots.h %.c 
mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %.c $(CC) $(CFLAGS) -I. -DNDEBUG -DGC_MARK_SWEEP -o $@ $*.c +parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-marker.h assert.h debug.h %.c + $(CC) $(CFLAGS) -I. -DNDEBUG -DGC_PARALLEL_MARK_SWEEP -o $@ $*.c + check: $(addprefix test-$(TARGET),$(TARGETS)) test-%: $(ALL_TESTS) diff --git a/mark-sweep.h b/mark-sweep.h index 0920821db..f6b000fc2 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -7,7 +7,11 @@ #include "assert.h" #include "debug.h" #include "precise-roots.h" +#ifdef GC_PARALLEL_MARK +#include "parallel-marker.h" +#else #include "serial-marker.h" +#endif #define GRANULE_SIZE 8 #define GRANULE_SIZE_LOG_2 3 diff --git a/parallel-marker.h b/parallel-marker.h new file mode 100644 index 000000000..8bfac725a --- /dev/null +++ b/parallel-marker.h @@ -0,0 +1,269 @@ +#ifndef SERIAL_TRACE_H +#define SERIAL_TRACE_H + +#include +#include +#include + +#include "assert.h" +#include "debug.h" + +// The Chase-Lev work-stealing deque, as initially described in "Dynamic +// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05) +// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf) +// and improved with C11 atomics in "Correct and Efficient Work-Stealing +// for Weak Memory Models" (Lê et al, PPoPP'13) +// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf). + +struct mark_buf { + unsigned log_size; + size_t size; + atomic_uintptr_t *data; +}; + +// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit. +#define mark_buf_min_log_size ((unsigned) 10) +// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit. +#define mark_buf_max_log_size ((unsigned) 28) + +static int +mark_buf_init(struct mark_buf *buf, unsigned log_size) { + ASSERT(log_size >= mark_buf_min_log_size); + ASSERT(log_size <= mark_buf_max_log_size); + size_t size = (1 << log_size) * sizeof(uintptr_t); + void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("Failed to grow work-stealing dequeue"); + DEBUG("Failed to allocate %zu bytes", size, ); + return 0; + } + buf->log_size = log_size; + buf->size = 1 << log_size; + buf->data = mem; + return 1; +} + +static inline size_t +mark_buf_size(struct mark_buf *buf) { + return buf->size; +} + +static inline size_t +mark_buf_byte_size(struct mark_buf *buf) { + return mark_buf_size(buf) * sizeof(uintptr_t); +} + +static void +mark_buf_release(struct mark_buf *buf) { + if (buf->data) + madvise(buf->data, mark_buf_byte_size(buf), MADV_DONTNEED); +} + +static void +mark_buf_destroy(struct mark_buf *buf) { + if (buf->data) { + munmap(buf->data, mark_buf_byte_size(buf)); + buf->data = NULL; + buf->log_size = 0; + buf->size = 0; + } +} + +static inline uintptr_t +mark_buf_get(struct mark_buf *buf, size_t i) { + return atomic_load_explicit(&buf->data[i & (buf->size - 1)], + memory_order_relaxed); +} + +static inline void +mark_buf_put(struct mark_buf *buf, size_t i, uintptr_t o) { + return atomic_store_explicit(&buf->data[i & (buf->size - 1)], + o, + memory_order_relaxed); +} + +static inline int +mark_buf_grow(struct mark_buf *from, struct mark_buf *to, + size_t b, size_t t) { + if (from->log_size == mark_buf_max_log_size) + return 0; + if (!mark_buf_init (to, from->log_size + 1)) + return 0; + for (size_t i=t; ibufs[0], mark_buf_min_log_size); + // Note, this fence isn't in the paper, I added it out of caution. 
+ atomic_thread_fence(memory_order_release); + return ret; +} + +static void +mark_deque_release(struct mark_deque *q) { + for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) + mark_buf_release(&q->bufs[i]); +} + +static void +mark_deque_destroy(struct mark_deque *q) { + for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) + mark_buf_destroy(&q->bufs[i]); +} + +static int +mark_deque_grow(struct mark_deque *q, int cur, size_t b, size_t t) { + if (!mark_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) { + fprintf(stderr, "failed to grow deque!!\n"); + abort(); + } + + cur++; + STORE_RELAXED(&q->active, cur); + return cur; +} + +static void +mark_deque_push(struct mark_deque *q, uintptr_t x) { + size_t b = LOAD_RELAXED(&q->bottom); + size_t t = LOAD_ACQUIRE(&q->top); + int active = LOAD_RELAXED(&q->active); + + if (b - t > mark_buf_size(&q->bufs[active]) - 1) /* Full queue. */ + active = mark_deque_grow(q, active, b, t); + + mark_buf_put(&q->bufs[active], b, x); + atomic_thread_fence(memory_order_release); + STORE_RELAXED(&q->bottom, b + 1); +} + +static uintptr_t +mark_deque_try_pop(struct mark_deque *q) { + size_t b = LOAD_RELAXED(&q->bottom); + b = b - 1; + int active = LOAD_RELAXED(&q->active); + STORE_RELAXED(&q->bottom, b); + atomic_thread_fence(memory_order_seq_cst); + size_t t = LOAD_RELAXED(&q->top); + uintptr_t x; + if (t <= b) { // Non-empty queue. + x = mark_buf_get(&q->bufs[active], b); + if (t == b) { // Single last element in queue. + if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, + memory_order_seq_cst, + memory_order_relaxed)) + // Failed race. + x = mark_deque_empty; + STORE_RELAXED(&q->bottom, b + 1); + } + } else { // Empty queue. + x = mark_deque_empty; + STORE_RELAXED(&q->bottom, b + 1); + } + return x; +} + +static uintptr_t +mark_deque_steal(struct mark_deque *q) { + size_t t = LOAD_ACQUIRE(&q->top); + atomic_thread_fence(memory_order_seq_cst); + size_t b = LOAD_ACQUIRE(&q->bottom); + uintptr_t x = mark_deque_empty; + if (t < b) { // Non-empty queue. + int active = LOAD_CONSUME(&q->active); + x = mark_buf_get(&q->bufs[active], t); + if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, + memory_order_seq_cst, + memory_order_relaxed)) + // Failed race. 
+ return mark_deque_abort; + } + return x; +} + +#undef LOAD_RELAXED +#undef STORE_RELAXED +#undef LOAD_ACQUIRE +#undef STORE_RELEASE +#undef LOAD_CONSUME + +struct marker { + struct mark_deque deque; +}; + +struct context; +static inline struct marker* context_marker(struct context *cx); + +static int +marker_init(struct context *cx) { + return mark_deque_init(&context_marker(cx)->deque); +} +static void marker_prepare(struct context *cx) {} +static void marker_release(struct context *cx) { + mark_deque_release(&context_marker(cx)->deque); +} + +struct gcobj; +static inline void marker_visit(struct context *cx, void **loc) __attribute__((always_inline)); +static inline void marker_trace(struct context *cx, + void (*)(struct context *, struct gcobj *)) + __attribute__((always_inline)); +static inline int mark_object(struct context *cx, + struct gcobj *obj) __attribute__((always_inline)); + +static inline void +marker_visit(struct context *cx, void **loc) { + struct gcobj *obj = *loc; + if (obj && mark_object(cx, obj)) + mark_deque_push(&context_marker(cx)->deque, (uintptr_t)obj); +} +static inline void +marker_visit_root(struct context *cx, void **loc) { + marker_visit(cx, loc); +} +static inline void +marker_trace(struct context *cx, + void (*process)(struct context *, struct gcobj *)) { + while (1) { + uintptr_t addr = mark_deque_steal(&context_marker(cx)->deque); + if (addr == mark_deque_empty) + return; + if (addr == mark_deque_abort) + continue; + process(cx, (struct gcobj*)addr); + } +} + +#endif // SERIAL_MARK_H From f57a1b8a55fd003f6bc71ddca57f164fb0806537 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 11 Mar 2022 10:17:05 +0100 Subject: [PATCH 023/403] Refactor to separate gcbench from gc --- Makefile | 2 +- bdw.h | 11 +++------ gc.h | 17 +++++++++++++ gcbench-types.h | 30 +++++++++++++++++++++++ GCBench.c => gcbench.c | 44 ++++++++++++++------------------- inline.h | 7 ++++++ mark-sweep.h | 38 ++++++++++++----------------- parallel-marker.h | 14 ++++++----- semi.h | 55 ++++++++++++++++++------------------------ serial-marker.h | 15 ++++++------ 10 files changed, 132 insertions(+), 101 deletions(-) create mode 100644 gc.h create mode 100644 gcbench-types.h rename GCBench.c => gcbench.c (90%) create mode 100644 inline.h diff --git a/Makefile b/Makefile index 04a23ed6c..2846749e7 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -TESTS=GCBench # MT_GCBench MT_GCBench2 +TESTS=gcbench # MT_GCBench MT_GCBench2 COLLECTORS=bdw semi mark-sweep parallel-mark-sweep CC=gcc diff --git a/bdw.h b/bdw.h index 7a5538d40..51adea161 100644 --- a/bdw.h +++ b/bdw.h @@ -16,19 +16,16 @@ struct context {}; -enum alloc_kind { NODE, DOUBLE_ARRAY }; - -typedef void (*field_visitor)(struct context *, void **ref); - #define GC_HEADER /**/ static inline void* allocate(struct context *cx, enum alloc_kind kind, size_t size) { - // memset to 0 by the collector. switch (kind) { - case NODE: + case ALLOC_KIND_NODE: + // cleared to 0 by the collector. return GC_malloc(size); - case DOUBLE_ARRAY: + case ALLOC_KIND_DOUBLE_ARRAY: + // warning: not cleared! 
return GC_malloc_atomic(size); } abort(); diff --git a/gc.h b/gc.h new file mode 100644 index 000000000..2c0c59de0 --- /dev/null +++ b/gc.h @@ -0,0 +1,17 @@ +#ifndef GC_H_ +#define GC_H_ + +#if defined(GC_BDW) +#include "bdw.h" +#elif defined(GC_SEMI) +#include "semi.h" +#elif defined(GC_MARK_SWEEP) +#include "mark-sweep.h" +#elif defined(GC_PARALLEL_MARK_SWEEP) +#define GC_PARALLEL_MARK 1 +#include "mark-sweep.h" +#else +#error unknown gc +#endif + +#endif // GC_H_ diff --git a/gcbench-types.h b/gcbench-types.h new file mode 100644 index 000000000..20cef8be4 --- /dev/null +++ b/gcbench-types.h @@ -0,0 +1,30 @@ +#ifndef GCBENCH_TYPES_H +#define GCBENCH_TYPES_H + +#include "inline.h" + +#define FOR_EACH_HEAP_OBJECT_KIND(M) \ + M(node, Node, NODE) \ + M(double_array, DoubleArray, DOUBLE_ARRAY) + +#define DECLARE_NODE_TYPE(name, Name, NAME) \ + struct Name; \ + typedef struct Name Name; +FOR_EACH_HEAP_OBJECT_KIND(DECLARE_NODE_TYPE) +#undef DECLARE_NODE_TYPE + +#define DEFINE_ENUM(name, Name, NAME) ALLOC_KIND_##NAME, +enum alloc_kind { + FOR_EACH_HEAP_OBJECT_KIND(DEFINE_ENUM) +}; +#undef DEFINE_ENUM + +#define DEFINE_METHODS(name, Name, NAME) \ + static inline size_t name##_size(Name *obj) ALWAYS_INLINE; \ + static inline void visit_##name##_fields(Name *obj,\ + void (*visit)(void **loc, void *visit_data), \ + void *visit_data) ALWAYS_INLINE; +FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) +#undef DEFINE_METHODS + +#endif // GCBENCH_TYPES_H diff --git a/GCBench.c b/gcbench.c similarity index 90% rename from GCBench.c rename to gcbench.c index f229de866..6d8acd48a 100644 --- a/GCBench.c +++ b/gcbench.c @@ -43,19 +43,8 @@ #include #include "assert.h" - -#if defined(GC_BDW) -#include "bdw.h" -#elif defined(GC_SEMI) -#include "semi.h" -#elif defined(GC_MARK_SWEEP) -#include "mark-sweep.h" -#elif defined(GC_PARALLEL_MARK_SWEEP) -#define GC_PARALLEL_MARK 1 -#include "mark-sweep.h" -#else -#error unknown gc -#endif +#include "gcbench-types.h" +#include "gc.h" static const int kStretchTreeDepth = 18; // about 16Mb static const int kLongLivedTreeDepth = 16; // about 4Mb @@ -76,21 +65,23 @@ typedef struct DoubleArray { double values[0]; } DoubleArray; -static inline size_t node_size(void *obj) { +static inline size_t node_size(Node *obj) { return sizeof(Node); } -static inline size_t double_array_size(void *obj) { - DoubleArray *array = obj; +static inline size_t double_array_size(DoubleArray *array) { return sizeof(*array) + array->length * sizeof(double); } -static inline void visit_node_fields(struct context *cx, void *obj, - field_visitor visit) { - Node *node = obj; - visit(cx, (void**)&node->left); - visit(cx, (void**)&node->right); +static inline void +visit_node_fields(Node *node, + void (*visit)(void **loc, void *visit_data), + void *visit_data) { + visit((void**)&node->left, visit_data); + visit((void**)&node->right, visit_data); } -static inline void visit_double_array_fields(struct context *cx, void *obj, - field_visitor visit) { +static inline void +visit_double_array_fields(DoubleArray *obj, + void (*visit)(void **loc, void *visit_data), + void *visit_data) { } typedef HANDLE_TO(Node) NodeHandle; @@ -98,13 +89,14 @@ typedef HANDLE_TO(DoubleArray) DoubleArrayHandle; static Node* allocate_node(struct context *cx) { // memset to 0 by the collector. - return allocate(cx, NODE, sizeof (Node)); + return allocate(cx, ALLOC_KIND_NODE, sizeof (Node)); } static struct DoubleArray* allocate_double_array(struct context *cx, size_t size) { - // note, not memset to 0 by the collector. 
- DoubleArray *ret = allocate(cx, DOUBLE_ARRAY, sizeof (double) * size); + // note, we might allow the collector to leave this data uninitialized. + DoubleArray *ret = allocate(cx, ALLOC_KIND_DOUBLE_ARRAY, + sizeof(DoubleArray) + sizeof (double) * size); ret->length = size; return ret; } diff --git a/inline.h b/inline.h new file mode 100644 index 000000000..4e44690f5 --- /dev/null +++ b/inline.h @@ -0,0 +1,7 @@ +#ifndef INLINE_H +#define INLINE_H + +#define ALWAYS_INLINE __attribute__((always_inline)) +#define NEVER_INLINE __attribute__((noinline)) + +#endif // INLINE_H diff --git a/mark-sweep.h b/mark-sweep.h index f6b000fc2..9c586da63 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -6,6 +6,7 @@ #include "assert.h" #include "debug.h" +#include "inline.h" #include "precise-roots.h" #ifdef GC_PARALLEL_MARK #include "parallel-marker.h" @@ -162,20 +163,11 @@ get_small_object_freelist(struct context *cx, enum small_object_size kind) { #define GC_HEADER uintptr_t _gc_header -enum alloc_kind { NODE, DOUBLE_ARRAY }; - -typedef void (*field_visitor)(struct context *, void **ref); - -static inline size_t node_size(void *obj) __attribute__((always_inline)); -static inline size_t double_array_size(void *obj) __attribute__((always_inline)); -static inline void visit_node_fields(struct context *cx, void *obj, field_visitor visit) __attribute__((always_inline)); -static inline void visit_double_array_fields(struct context *cx, void *obj, field_visitor visit) __attribute__((always_inline)); - static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct context *cx) __attribute__((noinline)); +static void collect(struct context *cx) NEVER_INLINE; static inline uint8_t* mark_byte(struct context *cx, struct gcobj *obj) { uintptr_t granule = (((uintptr_t) obj) - cx->heap_base) / GRANULE_SIZE; @@ -193,12 +185,12 @@ static inline int mark_object(struct context *cx, struct gcobj *obj) { static void process(struct context *cx, struct gcobj *obj) { switch (tag_live_alloc_kind(obj->tag)) { - case NODE: - visit_node_fields(cx, obj, marker_visit); - break; - case DOUBLE_ARRAY: - visit_double_array_fields(cx, obj, marker_visit); - break; +#define SCAN_OBJECT(name, Name, NAME) \ + case ALLOC_KIND_##NAME: \ + visit_##name##_fields((Name*)obj, marker_visit, cx); \ + break; + FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) +#undef SCAN_OBJECT default: abort (); } @@ -215,7 +207,7 @@ static void collect(struct context *cx) { DEBUG("start collect #%ld:\n", cx->count); marker_prepare(cx); for (struct handle *h = cx->roots; h; h = h->next) - marker_visit_root(cx, &h->v); + marker_visit_root(&h->v, cx); marker_trace(cx, process); marker_release(cx); DEBUG("done marking\n"); @@ -301,12 +293,12 @@ static size_t live_object_granules(struct gcobj *obj) { return 1; size_t bytes; switch (tag_live_alloc_kind (obj->tag)) { - case NODE: - bytes = node_size(obj); - break; - case DOUBLE_ARRAY: - bytes = double_array_size(obj); - break; +#define COMPUTE_SIZE(name, Name, NAME) \ + case ALLOC_KIND_##NAME: \ + bytes = name##_size((Name*)obj); \ + break; + FOR_EACH_HEAP_OBJECT_KIND(COMPUTE_SIZE) +#undef COMPUTE_SIZE default: abort (); } diff --git a/parallel-marker.h b/parallel-marker.h index 8bfac725a..e8da3e567 100644 --- a/parallel-marker.h +++ b/parallel-marker.h @@ -7,6 +7,7 @@ #include "assert.h" #include "debug.h" +#include "inline.h" // The Chase-Lev work-stealing deque, as initially described in "Dynamic // Circular Work-Stealing Deque" (Chase and Lev, SPAA'05) @@ -236,22 +237,23 
@@ static void marker_release(struct context *cx) { } struct gcobj; -static inline void marker_visit(struct context *cx, void **loc) __attribute__((always_inline)); +static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; static inline void marker_trace(struct context *cx, void (*)(struct context *, struct gcobj *)) - __attribute__((always_inline)); + ALWAYS_INLINE; static inline int mark_object(struct context *cx, - struct gcobj *obj) __attribute__((always_inline)); + struct gcobj *obj) ALWAYS_INLINE; static inline void -marker_visit(struct context *cx, void **loc) { +marker_visit(void **loc, void *mark_data) { + struct context *cx = mark_data; struct gcobj *obj = *loc; if (obj && mark_object(cx, obj)) mark_deque_push(&context_marker(cx)->deque, (uintptr_t)obj); } static inline void -marker_visit_root(struct context *cx, void **loc) { - marker_visit(cx, loc); +marker_visit_root(void **loc, struct context *cx) { + marker_visit(loc, cx); } static inline void marker_trace(struct context *cx, diff --git a/semi.h b/semi.h index 37b9f4ef4..5fdec4a07 100644 --- a/semi.h +++ b/semi.h @@ -23,22 +23,13 @@ static uintptr_t align_up(uintptr_t addr, size_t align) { #define GC_HEADER uintptr_t _gc_header -enum alloc_kind { NODE, DOUBLE_ARRAY }; - -typedef void (*field_visitor)(struct context *, void **ref); - -static inline size_t node_size(void *obj) __attribute__((always_inline)); -static inline size_t double_array_size(void *obj) __attribute__((always_inline)); -static inline void visit_node_fields(struct context *cx, void *obj, field_visitor visit) __attribute__((always_inline)); -static inline void visit_double_array_fields(struct context *cx, void *obj, field_visitor visit) __attribute__((always_inline)); - static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct context *cx, size_t bytes) __attribute__((noinline)); +static void collect(struct context *cx, size_t bytes) NEVER_INLINE; -static void process(struct context *cx, void **loc); +static void visit(void **loc, void *visit_data); static void flip(struct context *cx) { uintptr_t split = cx->base + (cx->size >> 1); @@ -55,12 +46,12 @@ static void flip(struct context *cx) { static void* copy(struct context *cx, uintptr_t kind, void *obj) { size_t size; switch (kind) { - case NODE: - size = node_size(obj); - break; - case DOUBLE_ARRAY: - size = double_array_size(obj); - break; +#define COMPUTE_SIZE(name, Name, NAME) \ + case ALLOC_KIND_##NAME: \ + size = name##_size(obj); \ + break; + FOR_EACH_HEAP_OBJECT_KIND(COMPUTE_SIZE) +#undef COMPUTE_SIZE default: abort (); } @@ -75,14 +66,12 @@ static uintptr_t scan(struct context *cx, uintptr_t grey) { void *obj = (void*)grey; uintptr_t kind = *(uintptr_t*) obj; switch (kind) { - case NODE: - visit_node_fields(cx, obj, process); - return grey + align_up (node_size(obj), ALIGNMENT); - break; - case DOUBLE_ARRAY: - visit_double_array_fields(cx, obj, process); - return grey + align_up (double_array_size(obj), ALIGNMENT); - break; +#define SCAN_OBJECT(name, Name, NAME) \ + case ALLOC_KIND_##NAME: \ + visit_##name##_fields((Name*)obj, visit, cx); \ + return grey + align_up(name##_size((Name*)obj), ALIGNMENT); + FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) +#undef SCAN_OBJECT default: abort (); } @@ -91,15 +80,18 @@ static uintptr_t scan(struct context *cx, uintptr_t grey) { static void* forward(struct context *cx, void *obj) { uintptr_t header_word = *(uintptr_t*)obj; switch (header_word) { - case NODE: - case DOUBLE_ARRAY: 
+#define CASE_ALLOC_KIND(name, Name, NAME) \ + case ALLOC_KIND_##NAME: + FOR_EACH_HEAP_OBJECT_KIND(CASE_ALLOC_KIND) +#undef CASE_ALLOC_KIND return copy(cx, header_word, obj); default: return (void*)header_word; } } -static void process(struct context *cx, void **loc) { +static void visit(void **loc, void *visit_data) { + struct context *cx = visit_data; void *obj = *loc; if (obj != NULL) *loc = forward(cx, obj); @@ -109,7 +101,7 @@ static void collect(struct context *cx, size_t bytes) { flip(cx); uintptr_t grey = cx->hp; for (struct handle *h = cx->roots; h; h = h->next) - process(cx, &h->v); + visit(&h->v, cx); // fprintf(stderr, "pushed %zd bytes in roots\n", cx->hp - grey); while(grey < cx->hp) grey = scan(cx, grey); @@ -134,8 +126,9 @@ static inline void* allocate(struct context *cx, enum alloc_kind kind, void *ret = (void *)addr; uintptr_t *header_word = ret; *header_word = kind; - if (kind == NODE) - clear_memory(addr + sizeof(uintptr_t), size - sizeof(uintptr_t)); + // FIXME: Allow allocator to avoid initializing pointerless memory? + // if (kind == NODE) + clear_memory(addr + sizeof(uintptr_t), size - sizeof(uintptr_t)); return ret; } } diff --git a/serial-marker.h b/serial-marker.h index 755064372..8f7dec01d 100644 --- a/serial-marker.h +++ b/serial-marker.h @@ -49,7 +49,7 @@ mark_queue_put(struct mark_queue *q, size_t idx, uintptr_t x) { q->buf[idx & (q->size - 1)] = x; } -static int mark_queue_grow(struct mark_queue *q) __attribute__((noinline)); +static int mark_queue_grow(struct mark_queue *q) NEVER_INLINE; static int mark_queue_grow(struct mark_queue *q) { @@ -125,22 +125,23 @@ static void marker_release(struct context *cx) { } struct gcobj; -static inline void marker_visit(struct context *cx, void **loc) __attribute__((always_inline)); +static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; static inline void marker_trace(struct context *cx, void (*)(struct context *, struct gcobj *)) - __attribute__((always_inline)); + ALWAYS_INLINE; static inline int mark_object(struct context *cx, - struct gcobj *obj) __attribute__((always_inline)); + struct gcobj *obj) ALWAYS_INLINE; static inline void -marker_visit(struct context *cx, void **loc) { +marker_visit(void **loc, void *mark_data) { + struct context *cx = mark_data; struct gcobj *obj = *loc; if (obj && mark_object(cx, obj)) mark_queue_push(&context_marker(cx)->queue, obj); } static inline void -marker_visit_root(struct context *cx, void **loc) { - marker_visit(cx, loc); +marker_visit_root(void **loc, struct context *cx) { + marker_visit(loc, cx); } static inline void marker_trace(struct context *cx, From df9edfdff21c98820adbfb8b7ed3f96e1ea222d8 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 11 Mar 2022 11:18:05 +0100 Subject: [PATCH 024/403] Remove tiny objects from mark-sweep --- mark-sweep.h | 169 ++++++++++++++------------------------------------- 1 file changed, 45 insertions(+), 124 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 9c586da63..6e0e82db6 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -25,7 +25,7 @@ STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, // There are small object pages for allocations of these sizes. 
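/* Aside: a hand-expanded sketch (not part of any patch) of the X-macro
   pattern used by FOR_EACH_HEAP_OBJECT_KIND above and
   FOR_EACH_SMALL_OBJECT_GRANULES below: one list macro is instantiated
   several times so that enums, tables, and switch statements stay in sync.
   The two kinds listed here are just the benchmark's object types. */

#define SKETCH_FOR_EACH_KIND(M) \
  M(node, Node, NODE)           \
  M(double_array, DoubleArray, DOUBLE_ARRAY)

enum sketch_alloc_kind {
#define SKETCH_DEFINE_ENUM(name, Name, NAME) SKETCH_ALLOC_KIND_##NAME,
  SKETCH_FOR_EACH_KIND(SKETCH_DEFINE_ENUM)
#undef SKETCH_DEFINE_ENUM
};

/* After preprocessing this is simply:
     enum sketch_alloc_kind { SKETCH_ALLOC_KIND_NODE, SKETCH_ALLOC_KIND_DOUBLE_ARRAY, };
   and in the same way SCAN_OBJECT / COMPUTE_SIZE in mark-sweep.h and semi.h
   expand the list into the per-kind cases of a switch. */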
#define FOR_EACH_SMALL_OBJECT_GRANULES(M) \ - M(2) M(3) M(4) M(5) M(6) M(8) M(10) M(16) M(32) + M(1) M(2) M(3) M(4) M(5) M(6) M(8) M(10) M(16) M(32) enum small_object_size { #define SMALL_OBJECT_GRANULE_SIZE(i) SMALL_OBJECT_##i, @@ -43,7 +43,7 @@ static const uint8_t small_object_granule_sizes[] = }; static const enum small_object_size small_object_sizes_for_granules[LARGE_OBJECT_GRANULE_THRESHOLD + 2] = { - NOT_SMALL_OBJECT, NOT_SMALL_OBJECT, SMALL_OBJECT_2, SMALL_OBJECT_3, + SMALL_OBJECT_1, SMALL_OBJECT_1, SMALL_OBJECT_2, SMALL_OBJECT_3, SMALL_OBJECT_4, SMALL_OBJECT_5, SMALL_OBJECT_6, SMALL_OBJECT_8, SMALL_OBJECT_8, SMALL_OBJECT_10, SMALL_OBJECT_10, SMALL_OBJECT_16, SMALL_OBJECT_16, SMALL_OBJECT_16, SMALL_OBJECT_16, SMALL_OBJECT_16, @@ -67,80 +67,41 @@ static inline size_t size_to_granules(size_t size) { return (size + GRANULE_SIZE - 1) >> GRANULE_SIZE_LOG_2; } -// Object kind is stored in low bits of first word of all heap objects -// (allocated or free). -enum gcobj_kind { GCOBJ_TINY, GCOBJ }; - -// gcobj_kind is in the low bit of tag. -static const uintptr_t gcobj_kind_bit = (1 << 0); -static inline enum gcobj_kind tag_gcobj_kind(uintptr_t tag) { - return tag & gcobj_kind_bit; -} - -// Alloc kind is in bits 1-8, for live objects. +// Alloc kind is in bits 0-7, for live objects. static const uintptr_t gcobj_alloc_kind_mask = 0xff; -static const uintptr_t gcobj_alloc_kind_shift = 1; +static const uintptr_t gcobj_alloc_kind_shift = 0; static inline uint8_t tag_live_alloc_kind(uintptr_t tag) { return (tag >> gcobj_alloc_kind_shift) & gcobj_alloc_kind_mask; } - -// For free objects, bits 1 and up are free. Non-tiny objects store the -// object size in granules there. -static const uintptr_t gcobj_free_granules_shift = 1; -static inline uintptr_t tag_free_granules(uintptr_t tag) { - return tag >> gcobj_free_granules_shift; +static inline uintptr_t tag_live(uint8_t alloc_kind) { + return ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift); } -static inline uintptr_t tag_free(enum gcobj_kind kind, size_t granules) { - return kind | (granules << gcobj_free_granules_shift); -} -static inline uintptr_t tag_live(enum gcobj_kind kind, uint8_t alloc_kind) { - return kind | ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift); -} -static inline uintptr_t tag_free_tiny(void) { - return tag_free(GCOBJ_TINY, 0); -} - -// The gcobj_free_tiny and gcobj_free structs define the fields in free -// tiny (1-granule), and non-tiny (2 granules and up) objects. -struct gcobj_free_tiny { - // Low 2 bits of tag are GCOBJ_TINY, which is 0. Bit 2 is live bit; - // never set for free objects. Therefore for free objects, the - // 8-byte-aligned next pointer can alias the tag. - union { - uintptr_t tag; - struct gcobj_free_tiny *next; - }; +struct gcobj_free { + struct gcobj_free *next; }; -// Objects from 2 granules and up. -struct gcobj_free { - // For free objects, we store the granule size in the tag's payload. - // Next pointer only valid for objects on small freelist. - uintptr_t tag; - struct gcobj_free *next; +// Objects larger than LARGE_OBJECT_GRANULE_THRESHOLD. +struct gcobj_free_large { + struct gcobj_free_large *next; + size_t granules; }; struct gcobj { union { uintptr_t tag; - struct gcobj_free_tiny free_tiny; struct gcobj_free free; + struct gcobj_free_large free_large; uintptr_t words[0]; void *pointers[0]; }; }; -static inline enum gcobj_kind gcobj_kind(struct gcobj *obj) { - return tag_gcobj_kind (obj->tag); -} - struct context { - // Segregated freelists of tiny and small objects. 
- struct gcobj_free_tiny *tiny_objects; + // Segregated freelists of small objects. struct gcobj_free *small_objects[SMALL_OBJECT_SIZES]; // Unordered list of large objects. - struct gcobj_free *large_objects; + struct gcobj_free_large *large_objects; uintptr_t base; uint8_t *mark_bytes; uintptr_t heap_base; @@ -197,7 +158,6 @@ static void process(struct context *cx, struct gcobj *obj) { } static void clear_freelists(struct context *cx) { - cx->tiny_objects = NULL; for (int i = 0; i < SMALL_OBJECT_SIZES; i++) cx->small_objects[i] = NULL; cx->large_objects = NULL; @@ -216,25 +176,11 @@ static void collect(struct context *cx) { cx->count++; } -static void push_free_tiny(struct gcobj_free_tiny **loc, - struct gcobj_free_tiny *obj) { - // Rely on obj->next having low bits being 0, indicating a non-live - // tiny object. +static void push_free(struct gcobj_free **loc, struct gcobj_free *obj) { obj->next = *loc; *loc = obj; } -static void push_free(struct gcobj_free **loc, struct gcobj_free *obj, - size_t granules) { - obj->tag = tag_free(GCOBJ, granules); - obj->next = *loc; - *loc = obj; -} - -static void push_tiny(struct context *cx, void *obj) { - push_free_tiny(&cx->tiny_objects, obj); -} - static void push_small(struct context *cx, void *region, enum small_object_size kind, size_t region_granules) { uintptr_t addr = (uintptr_t) region; @@ -242,39 +188,41 @@ static void push_small(struct context *cx, void *region, size_t granules = small_object_granule_sizes[kind]; struct gcobj_free **loc = get_small_object_freelist(cx, kind); while (granules <= region_granules) { - push_free(loc, (struct gcobj_free*) addr, granules); + push_free(loc, (struct gcobj_free*) addr); region_granules -= granules; addr += granules * GRANULE_SIZE; } - if (region_granules == 1) { - // Region is actually a tiny object. - push_free_tiny(&cx->tiny_objects, (struct gcobj_free_tiny *)addr); - return; - } // Fit any remaining granules into smaller freelists. kind--; } } static void push_large(struct context *cx, void *region, size_t granules) { - push_free(&cx->large_objects, region, granules); + struct gcobj_free_large *large = region; + large->next = cx->large_objects; + large->granules = granules; + cx->large_objects = large; } static void reclaim(struct context *cx, void *obj, size_t granules) { - if (granules == 1) { - push_tiny(cx, obj); - } else if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) { + if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) push_small(cx, obj, SMALL_OBJECT_SIZES - 1, granules); - } else { + else push_large(cx, obj, granules); - } } static void split_large_object(struct context *cx, - struct gcobj_free *large, - size_t granules) { - size_t large_granules = tag_free_granules(large->tag); + struct gcobj_free_large *large, + size_t granules) { + size_t large_granules = large->granules; ASSERT(large_granules >= granules); + ASSERT(granules >= LARGE_OBJECT_GRANULE_THRESHOLD); + // Invariant: all words in LARGE are 0 except the two header words. + // LARGE is off the freelist. We return a block of cleared memory, so + // clear those fields now. 
+ large->next = NULL; + large->granules = 0; + if (large_granules == granules) return; @@ -282,15 +230,12 @@ static void split_large_object(struct context *cx, reclaim(cx, tail, large_granules - granules); } -static void unlink_large_object(struct gcobj_free **prev, - struct gcobj_free *large) { +static void unlink_large_object(struct gcobj_free_large **prev, + struct gcobj_free_large *large) { *prev = large->next; } static size_t live_object_granules(struct gcobj *obj) { - enum gcobj_kind size_kind = tag_gcobj_kind(obj->tag); - if (size_kind == GCOBJ_TINY) - return 1; size_t bytes; switch (tag_live_alloc_kind (obj->tag)) { #define COMPUTE_SIZE(name, Name, NAME) \ @@ -369,18 +314,18 @@ static int sweep(struct context *cx) { static void* allocate_large(struct context *cx, enum alloc_kind kind, size_t granules) { int swept_from_beginning = 0; - struct gcobj_free *already_scanned = NULL; + struct gcobj_free_large *already_scanned = NULL; while (1) { do { - struct gcobj_free **prev = &cx->large_objects; - for (struct gcobj_free *large = cx->large_objects; + struct gcobj_free_large **prev = &cx->large_objects; + for (struct gcobj_free_large *large = cx->large_objects; large != already_scanned; prev = &large->next, large = large->next) { - if (tag_free_granules(large->tag) >= granules) { + if (large->granules >= granules) { unlink_large_object(prev, large); split_large_object(cx, large, granules); - large->tag = tag_live(GCOBJ, kind); - large->next = NULL; + struct gcobj *obj = (struct gcobj *)large; + obj->tag = tag_live(kind); return large; } } @@ -419,7 +364,7 @@ static void fill_small(struct context *cx, enum small_object_size kind) { } // Otherwise if there is a large object, take and split it. - struct gcobj_free *large = cx->large_objects; + struct gcobj_free_large *large = cx->large_objects; if (large) { unlink_large_object(&cx->large_objects, large); split_large_object(cx, large, LARGE_OBJECT_GRANULE_THRESHOLD); @@ -447,38 +392,14 @@ static inline void* allocate_small(struct context *cx, fill_small(cx, small_kind); struct gcobj_free *ret = *loc; *loc = ret->next; - ret->tag = tag_live(GCOBJ, alloc_kind); - ret->next = NULL; - return (void *) ret; -} - -static inline void fill_tiny(struct context *cx) { - struct gcobj_free **loc = get_small_object_freelist(cx, SMALL_OBJECT_2); - if (!*loc) - fill_small(cx, SMALL_OBJECT_2); - struct gcobj_free *small = *loc; - *loc = small->next; - struct gcobj_free_tiny *ret = (struct gcobj_free_tiny *)small; - reclaim(cx, ret, 1); - reclaim(cx, ret + 1, 1); -} - -static inline void* allocate_tiny(struct context *cx, - enum alloc_kind alloc_kind) { - if (!cx->tiny_objects) - fill_tiny(cx); - - struct gcobj_free_tiny *ret = cx->tiny_objects; - cx->tiny_objects = ret->next; - ret->tag = tag_live(GCOBJ_TINY, alloc_kind); - return ret; + struct gcobj *obj = (struct gcobj *)ret; + obj->tag = tag_live(alloc_kind); + return obj; } static inline void* allocate(struct context *cx, enum alloc_kind kind, size_t size) { size_t granules = size_to_granules(size); - if (granules <= 1) - return allocate_tiny(cx, kind); if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) return allocate_small(cx, kind, granules_to_small_object_size(granules)); return allocate_large(cx, kind, granules); From 9c89672c886b0fd806e222baa5c52b0524003098 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 11 Mar 2022 11:57:14 +0100 Subject: [PATCH 025/403] Put a local mark queue in front of the work-stealing queue --- mark-sweep.h | 10 +++--- parallel-marker.h | 83 
+++++++++++++++++++++++++++++++++++++++-------- serial-marker.h | 6 ++-- 3 files changed, 77 insertions(+), 22 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 6e0e82db6..6241d8ab6 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -144,11 +144,11 @@ static inline int mark_object(struct context *cx, struct gcobj *obj) { return 1; } -static void process(struct context *cx, struct gcobj *obj) { +static void trace_one(struct gcobj *obj, void *mark_data) { switch (tag_live_alloc_kind(obj->tag)) { #define SCAN_OBJECT(name, Name, NAME) \ case ALLOC_KIND_##NAME: \ - visit_##name##_fields((Name*)obj, marker_visit, cx); \ + visit_##name##_fields((Name*)obj, marker_visit, mark_data); \ break; FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) #undef SCAN_OBJECT @@ -168,7 +168,7 @@ static void collect(struct context *cx) { marker_prepare(cx); for (struct handle *h = cx->roots; h; h = h->next) marker_visit_root(&h->v, cx); - marker_trace(cx, process); + marker_trace(cx, trace_one); marker_release(cx); DEBUG("done marking\n"); cx->sweep = cx->heap_base; @@ -290,9 +290,7 @@ static int sweep(struct context *cx) { (limit - sweep) >> GRANULE_SIZE_LOG_2); if (free_granules) { size_t free_bytes = free_granules * GRANULE_SIZE; - memset((void*)(sweep + GRANULE_SIZE), - 0, - free_bytes - GRANULE_SIZE); + clear_memory(sweep + GRANULE_SIZE, free_bytes - GRANULE_SIZE); reclaim(cx, (void*)sweep, free_granules); sweep += free_bytes; to_reclaim -= free_granules; diff --git a/parallel-marker.h b/parallel-marker.h index e8da3e567..805522ae7 100644 --- a/parallel-marker.h +++ b/parallel-marker.h @@ -36,7 +36,7 @@ mark_buf_init(struct mark_buf *buf, unsigned log_size) { MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { perror("Failed to grow work-stealing dequeue"); - DEBUG("Failed to allocate %zu bytes", size, ); + DEBUG("Failed to allocate %zu bytes", size); return 0; } buf->log_size = log_size; @@ -220,10 +220,49 @@ mark_deque_steal(struct mark_deque *q) { #undef STORE_RELEASE #undef LOAD_CONSUME +#define LOCAL_MARK_QUEUE_SIZE 64 +#define LOCAL_MARK_QUEUE_MASK 63 +struct local_mark_queue { + size_t read; + size_t write; + uintptr_t data[LOCAL_MARK_QUEUE_SIZE]; +}; + +static inline void +local_mark_queue_init(struct local_mark_queue *q) { + q->read = q->write = 0; +} +static inline void +local_mark_queue_poison(struct local_mark_queue *q) { + q->read = 0; q->write = LOCAL_MARK_QUEUE_SIZE; +} +static inline int +local_mark_queue_empty(struct local_mark_queue *q) { + return q->read == q->write; +} +static inline int +local_mark_queue_full(struct local_mark_queue *q) { + return q->read + LOCAL_MARK_QUEUE_SIZE == q->write; +} +static inline void +local_mark_queue_push(struct local_mark_queue *q, uintptr_t v) { + q->data[q->write++ & LOCAL_MARK_QUEUE_MASK] = v; +} +static inline uintptr_t +local_mark_queue_pop(struct local_mark_queue *q) { + return q->data[q->read++ & LOCAL_MARK_QUEUE_MASK]; +} + struct marker { struct mark_deque deque; }; +struct local_marker { + struct local_mark_queue local; + struct mark_deque *deque; + struct context *cx; +}; + struct context; static inline struct marker* context_marker(struct context *cx); @@ -239,32 +278,50 @@ static void marker_release(struct context *cx) { struct gcobj; static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; static inline void marker_trace(struct context *cx, - void (*)(struct context *, struct gcobj *)) + void (*)(struct gcobj *, void *)) ALWAYS_INLINE; static inline int mark_object(struct context *cx, struct gcobj *obj) ALWAYS_INLINE; 
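/* Aside: an illustrative sketch (not from the patch) of the fixed-size local
   queue this commit places in front of the shared work-stealing deque.
   Indices run freely and are masked on access, so empty is read == write and
   full is write - read == SIZE; "poisoning" the queue, as marker_visit_root
   does, makes it look permanently full so that entries overflow straight to
   the deque. */

#include <stddef.h>
#include <stdint.h>

#define SKETCH_LOCAL_SIZE 64                      /* must be a power of two */
#define SKETCH_LOCAL_MASK (SKETCH_LOCAL_SIZE - 1)

struct sketch_local_queue {
  size_t read, write;
  uintptr_t data[SKETCH_LOCAL_SIZE];
};

static inline int sketch_local_empty(struct sketch_local_queue *q) {
  return q->read == q->write;
}
static inline int sketch_local_full(struct sketch_local_queue *q) {
  return q->write - q->read == SKETCH_LOCAL_SIZE;
}
static inline void sketch_local_push(struct sketch_local_queue *q, uintptr_t v) {
  q->data[q->write++ & SKETCH_LOCAL_MASK] = v;    /* caller checks full first */
}
static inline uintptr_t sketch_local_pop(struct sketch_local_queue *q) {
  return q->data[q->read++ & SKETCH_LOCAL_MASK];  /* caller checks empty first */
}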
static inline void marker_visit(void **loc, void *mark_data) { - struct context *cx = mark_data; + struct local_marker *mark = mark_data; struct gcobj *obj = *loc; - if (obj && mark_object(cx, obj)) - mark_deque_push(&context_marker(cx)->deque, (uintptr_t)obj); + if (obj && mark_object(mark->cx, obj)) { + if (!local_mark_queue_full(&mark->local)) + local_mark_queue_push(&mark->local, (uintptr_t)obj); + else + mark_deque_push(mark->deque, (uintptr_t)obj); + } } static inline void marker_visit_root(void **loc, struct context *cx) { - marker_visit(loc, cx); + struct local_marker mark; + local_mark_queue_poison(&mark.local); + mark.deque = &context_marker(cx)->deque; + mark.cx = cx; + marker_visit(loc, &mark); } static inline void marker_trace(struct context *cx, - void (*process)(struct context *, struct gcobj *)) { + void (*trace_one)(struct gcobj *, void *)) { + struct local_marker mark; + local_mark_queue_init(&mark.local); + mark.deque = &context_marker(cx)->deque; + mark.cx = cx; + while (1) { - uintptr_t addr = mark_deque_steal(&context_marker(cx)->deque); - if (addr == mark_deque_empty) - return; - if (addr == mark_deque_abort) - continue; - process(cx, (struct gcobj*)addr); + uintptr_t addr; + if (!local_mark_queue_empty(&mark.local)) { + addr = local_mark_queue_pop(&mark.local); + } else { + addr = mark_deque_steal(mark.deque); + if (addr == mark_deque_empty) + break; + if (addr == mark_deque_abort) + continue; + } + trace_one((struct gcobj*)addr, &mark); } } diff --git a/serial-marker.h b/serial-marker.h index 8f7dec01d..1c2e305a7 100644 --- a/serial-marker.h +++ b/serial-marker.h @@ -127,7 +127,7 @@ static void marker_release(struct context *cx) { struct gcobj; static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; static inline void marker_trace(struct context *cx, - void (*)(struct context *, struct gcobj *)) + void (*)(struct gcobj *, void *)) ALWAYS_INLINE; static inline int mark_object(struct context *cx, struct gcobj *obj) ALWAYS_INLINE; @@ -145,10 +145,10 @@ marker_visit_root(void **loc, struct context *cx) { } static inline void marker_trace(struct context *cx, - void (*process)(struct context *, struct gcobj *)) { + void (*trace_one)(struct gcobj *, void *)) { struct gcobj *obj; while ((obj = mark_queue_pop(&context_marker(cx)->queue))) - process(cx, obj); + trace_one(obj, cx); } #endif // SERIAL_MARK_H From 7ce07de6701ccbf7cbd4be287326fab4cc728127 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 12 Mar 2022 21:09:17 +0100 Subject: [PATCH 026/403] First crack at parallel marking --- Makefile | 6 +- mark-sweep.h | 4 +- parallel-marker.h | 421 +++++++++++++++++++++++++++++++++++++++++++--- serial-marker.h | 7 +- 4 files changed, 406 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index 2846749e7..ecf35b3e3 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ TESTS=gcbench # MT_GCBench MT_GCBench2 COLLECTORS=bdw semi mark-sweep parallel-mark-sweep CC=gcc -CFLAGS=-Wall -O2 -g +CFLAGS=-Wall -O2 -g -fno-strict-aliasing ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) @@ -15,10 +15,10 @@ semi-%: semi.h precise-roots.h %.c $(CC) $(CFLAGS) -I. -DNDEBUG -DGC_SEMI -o $@ $*.c mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %.c - $(CC) $(CFLAGS) -I. -DNDEBUG -DGC_MARK_SWEEP -o $@ $*.c + $(CC) $(CFLAGS) -I. -Wno-unused -DNDEBUG -DGC_MARK_SWEEP -o $@ $*.c parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-marker.h assert.h debug.h %.c - $(CC) $(CFLAGS) -I. 
-DNDEBUG -DGC_PARALLEL_MARK_SWEEP -o $@ $*.c + $(CC) $(CFLAGS) -I. -Wno-unused -DNDEBUG -DGC_PARALLEL_MARK_SWEEP -lpthread -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/mark-sweep.h b/mark-sweep.h index 6241d8ab6..2c90f48eb 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -144,7 +144,7 @@ static inline int mark_object(struct context *cx, struct gcobj *obj) { return 1; } -static void trace_one(struct gcobj *obj, void *mark_data) { +static inline void trace_one(struct gcobj *obj, void *mark_data) { switch (tag_live_alloc_kind(obj->tag)) { #define SCAN_OBJECT(name, Name, NAME) \ case ALLOC_KIND_##NAME: \ @@ -168,7 +168,7 @@ static void collect(struct context *cx) { marker_prepare(cx); for (struct handle *h = cx->roots; h; h = h->next) marker_visit_root(&h->v, cx); - marker_trace(cx, trace_one); + marker_trace(cx); marker_release(cx); DEBUG("done marking\n"); cx->sweep = cx->heap_base; diff --git a/parallel-marker.h b/parallel-marker.h index 805522ae7..f935bc155 100644 --- a/parallel-marker.h +++ b/parallel-marker.h @@ -1,6 +1,7 @@ #ifndef SERIAL_TRACE_H #define SERIAL_TRACE_H +#include #include #include #include @@ -253,23 +254,356 @@ local_mark_queue_pop(struct local_mark_queue *q) { return q->data[q->read++ & LOCAL_MARK_QUEUE_MASK]; } +struct mark_notify { + size_t notifiers; + int pending; + pthread_mutex_t lock; + pthread_cond_t cond; +}; + +static void +mark_notify_init(struct mark_notify *notify) { + notify->notifiers = 0; + notify->pending = 0; + pthread_mutex_init(¬ify->lock, NULL); + pthread_cond_init(¬ify->cond, NULL); +} + +static void +mark_notify_destroy(struct mark_notify *notify) { + pthread_mutex_destroy(¬ify->lock); + pthread_cond_destroy(¬ify->cond); +} + +static void +mark_notify_add_notifier(struct mark_notify *notify) { + pthread_mutex_lock(¬ify->lock); + notify->notifiers++; + pthread_mutex_unlock(¬ify->lock); +} + +static void +mark_notify_remove_notifier(struct mark_notify *notify) { + pthread_mutex_lock(¬ify->lock); + notify->notifiers--; + if (notify->notifiers == 0) + pthread_cond_signal(¬ify->cond); + pthread_mutex_unlock(¬ify->lock); +} + +enum mark_notify_status { + MARK_NOTIFY_DONE, + MARK_NOTIFY_WOKE +}; +static enum mark_notify_status +mark_notify_wait(struct mark_notify *notify) { + enum mark_notify_status res; + + pthread_mutex_lock(¬ify->lock); + + if (notify->pending) { + res = MARK_NOTIFY_WOKE; + notify->pending = 0; + goto done; + } + + if (notify->notifiers == 0) { + res = MARK_NOTIFY_DONE; + goto done; + } + + // Spurious wakeup is OK. + pthread_cond_wait(¬ify->cond, ¬ify->lock); + res = MARK_NOTIFY_WOKE; + notify->pending = 0; + +done: + pthread_mutex_unlock(¬ify->lock); + return res; +} + +static void +mark_notify_wake(struct mark_notify *notify) { + pthread_mutex_lock(¬ify->lock); + notify->pending = 1; + pthread_cond_signal(¬ify->cond); + pthread_mutex_unlock(¬ify->lock); +} + +// A mostly lock-free multi-producer, single consumer queue, largely +// inspired by Rust's std::sync::channel. +// +// https://www.1024cores.net/home/lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue + +struct mark_channel_message { + struct mark_channel_message * _Atomic next; + // Payload will be zero only for free messages, and for the sentinel + // message. 
+ atomic_uintptr_t payload; +}; + +#define MARK_CHANNEL_WRITER_MESSAGE_COUNT ((size_t)1024) + +struct mark_channel { + union { + struct mark_channel_message* _Atomic head; + char head_padding[64]; + }; + union { + atomic_size_t length; + char length_padding[64]; + }; + struct mark_channel_message* tail; + struct mark_channel_message sentinel; + + struct mark_notify notify; +}; + +struct mark_channel_writer { + struct mark_channel_message messages[MARK_CHANNEL_WRITER_MESSAGE_COUNT]; + size_t next_message; + + struct mark_channel *channel; +}; + +static void +mark_channel_init(struct mark_channel *ch) { + memset(ch, 0, sizeof(*ch)); + atomic_init(&ch->head, &ch->sentinel); + atomic_init(&ch->length, 0); + mark_notify_init(&ch->notify); + ch->tail = &ch->sentinel; +} + +static void +mark_channel_destroy(struct mark_channel *ch) { + mark_notify_destroy(&ch->notify); +} + +static void +mark_channel_push(struct mark_channel *ch, struct mark_channel_message *msg) { + ASSERT(msg->payload); + atomic_store_explicit(&msg->next, NULL, memory_order_relaxed); + + struct mark_channel_message *prev = + atomic_exchange_explicit(&ch->head, msg, memory_order_acq_rel); + + atomic_store_explicit(&prev->next, msg, memory_order_release); + + size_t old_length = + atomic_fetch_add_explicit(&ch->length, 1, memory_order_relaxed); + if (old_length == 0) + mark_notify_wake(&ch->notify); +} + +static uintptr_t +mark_channel_try_pop(struct mark_channel *ch) { + struct mark_channel_message *tail = ch->tail; + struct mark_channel_message *next = + atomic_load_explicit(&tail->next, memory_order_acquire); + + if (next) { + ch->tail = next; + uintptr_t payload = + atomic_load_explicit(&next->payload, memory_order_acquire); + ASSERT(payload != 0); + // Indicate to the writer that the old tail node can now be re-used. + // Note though that the new tail node is floating garbage; its + // payload has been popped but the node itself is still part of the + // queue. Care has to be taken to ensure that any remaining queue + // entries are popped before the associated channel writer's + // messages are deallocated. 
+ atomic_store_explicit(&tail->payload, 0, memory_order_release); + atomic_fetch_sub_explicit(&ch->length, 1, memory_order_relaxed); + return payload; + } + + // if (atomic_load_explicit(&ch->head) == tail) return EMPTY else INCONSISTENT + return 0; +} + +static uintptr_t +mark_channel_pop(struct mark_channel *ch) { + while (1) { + uintptr_t ret = mark_channel_try_pop(ch); + if (ret) + return ret; + + if (atomic_load_explicit(&ch->length, memory_order_relaxed) == 0) { + if (mark_notify_wait(&ch->notify) == MARK_NOTIFY_DONE) + return 0; + } + } +} + +static void +mark_channel_writer_init(struct mark_channel *ch, + struct mark_channel_writer *writer) { + memset(writer, 0, sizeof(*writer)); + writer->channel = ch; +} + +static void +mark_channel_write(struct mark_channel_writer *writer, uintptr_t payload) { + ASSERT(payload); + struct mark_channel_message *msg = &writer->messages[writer->next_message]; + while (atomic_load_explicit(&msg->payload, memory_order_acquire) != 0) + sched_yield(); + writer->next_message++; + if (writer->next_message == MARK_CHANNEL_WRITER_MESSAGE_COUNT) + writer->next_message = 0; + atomic_store_explicit(&msg->payload, payload, memory_order_release); + mark_channel_push(writer->channel, msg); +} + +static void +mark_channel_writer_activate(struct mark_channel_writer *writer) { + mark_notify_add_notifier(&writer->channel->notify); +} +static void +mark_channel_writer_deactivate(struct mark_channel_writer *writer) { + mark_notify_remove_notifier(&writer->channel->notify); +} + +enum mark_worker_state { + MARK_WORKER_STOPPED, + MARK_WORKER_IDLE, + MARK_WORKER_MARKING, + MARK_WORKER_STOPPING, + MARK_WORKER_DEAD +}; + +struct mark_worker { + struct context *cx; + pthread_t thread; + enum mark_worker_state state; + pthread_mutex_t lock; + pthread_cond_t cond; + struct mark_channel_writer writer; +}; + +#define MARK_WORKERS_MAX_COUNT 8 + struct marker { struct mark_deque deque; + struct mark_channel overflow; + size_t worker_count; + struct mark_worker workers[MARK_WORKERS_MAX_COUNT]; }; struct local_marker { - struct local_mark_queue local; + struct mark_worker *worker; struct mark_deque *deque; struct context *cx; + struct local_mark_queue local; }; struct context; static inline struct marker* context_marker(struct context *cx); +static size_t number_of_current_processors(void) { return 1; } + +static void +mark_worker_init(struct mark_worker *worker, struct context *cx, + struct marker *marker) { + worker->cx = cx; + worker->thread = 0; + worker->state = MARK_WORKER_STOPPED; + pthread_mutex_init(&worker->lock, NULL); + pthread_cond_init(&worker->cond, NULL); + mark_channel_writer_init(&marker->overflow, &worker->writer); +} + +static void mark_worker_mark(struct mark_worker *worker); + +static void* +mark_worker_thread(void *data) { + struct mark_worker *worker = data; + + pthread_mutex_lock(&worker->lock); + while (1) { + switch (worker->state) { + case MARK_WORKER_IDLE: + pthread_cond_wait(&worker->cond, &worker->lock); + break; + case MARK_WORKER_MARKING: + mark_worker_mark(worker); + worker->state = MARK_WORKER_IDLE; + break; + case MARK_WORKER_STOPPING: + worker->state = MARK_WORKER_DEAD; + pthread_mutex_unlock(&worker->lock); + return NULL; + default: + abort(); + } + } +} + +static int +mark_worker_spawn(struct mark_worker *worker) { + pthread_mutex_lock(&worker->lock); + ASSERT(worker->state == MARK_WORKER_STOPPED); + worker->state = MARK_WORKER_IDLE; + pthread_mutex_unlock(&worker->lock); + + if (pthread_create(&worker->thread, NULL, mark_worker_thread, 
worker)) { + perror("spawning marker thread failed"); + worker->state = MARK_WORKER_STOPPED; + return 0; + } + + return 1; +} + +static void +mark_worker_request_mark(struct mark_worker *worker) { + pthread_mutex_lock(&worker->lock); + ASSERT(worker->state == MARK_WORKER_IDLE); + mark_channel_writer_activate(&worker->writer); + worker->state = MARK_WORKER_MARKING; + pthread_cond_signal(&worker->cond); + pthread_mutex_unlock(&worker->lock); +} + +static void +mark_worker_finished_marking(struct mark_worker *worker) { + // Signal controller that we are done with marking. + mark_channel_writer_deactivate(&worker->writer); +} + +static void +mark_worker_request_stop(struct mark_worker *worker) { + pthread_mutex_lock(&worker->lock); + ASSERT(worker->state == MARK_WORKER_IDLE); + worker->state = MARK_WORKER_STOPPING; + pthread_cond_signal(&worker->cond); + pthread_mutex_unlock(&worker->lock); +} + static int marker_init(struct context *cx) { - return mark_deque_init(&context_marker(cx)->deque); + struct marker *marker = context_marker(cx); + if (!mark_deque_init(&marker->deque)) + return 0; + mark_channel_init(&marker->overflow); + size_t desired_worker_count = 0; + if (getenv("GC_MARKERS")) + desired_worker_count = atoi(getenv("GC_MARKERS")); + if (desired_worker_count == 0) + desired_worker_count = number_of_current_processors(); + if (desired_worker_count > MARK_WORKERS_MAX_COUNT) + desired_worker_count = MARK_WORKERS_MAX_COUNT; + for (size_t i = 0; i < desired_worker_count; i++) { + mark_worker_init(&marker->workers[i], cx, marker); + if (mark_worker_spawn(&marker->workers[i])) + marker->worker_count++; + else + break; + } + return marker->worker_count > 0; } + static void marker_prepare(struct context *cx) {} static void marker_release(struct context *cx) { mark_deque_release(&context_marker(cx)->deque); @@ -277,9 +611,7 @@ static void marker_release(struct context *cx) { struct gcobj; static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; -static inline void marker_trace(struct context *cx, - void (*)(struct gcobj *, void *)) - ALWAYS_INLINE; +static inline void trace_one(struct gcobj *obj, void *mark_data) ALWAYS_INLINE; static inline int mark_object(struct context *cx, struct gcobj *obj) ALWAYS_INLINE; @@ -290,26 +622,22 @@ marker_visit(void **loc, void *mark_data) { if (obj && mark_object(mark->cx, obj)) { if (!local_mark_queue_full(&mark->local)) local_mark_queue_push(&mark->local, (uintptr_t)obj); - else - mark_deque_push(mark->deque, (uintptr_t)obj); + else { + mark_channel_write(&mark->worker->writer, (uintptr_t)obj); + } } } -static inline void -marker_visit_root(void **loc, struct context *cx) { - struct local_marker mark; - local_mark_queue_poison(&mark.local); - mark.deque = &context_marker(cx)->deque; - mark.cx = cx; - marker_visit(loc, &mark); -} -static inline void -marker_trace(struct context *cx, - void (*trace_one)(struct gcobj *, void *)) { - struct local_marker mark; - local_mark_queue_init(&mark.local); - mark.deque = &context_marker(cx)->deque; - mark.cx = cx; +static void +mark_worker_mark(struct mark_worker *worker) { + struct local_marker mark; + mark.worker = worker; + mark.deque = &context_marker(worker->cx)->deque; + mark.cx = worker->cx; + local_mark_queue_init(&mark.local); + + size_t n = 0; + DEBUG("marker %p: running mark loop\n", worker); while (1) { uintptr_t addr; if (!local_mark_queue_empty(&mark.local)) { @@ -322,7 +650,56 @@ marker_trace(struct context *cx, continue; } trace_one((struct gcobj*)addr, &mark); + n++; } + DEBUG("marker 
%p: done marking, %zu objects traced\n", worker, n); + + mark_worker_finished_marking(worker); +} + +static inline void +marker_visit_root(void **loc, struct context *cx) { + struct gcobj *obj = *loc; + if (obj && mark_object(cx, obj)) + mark_deque_push(&context_marker(cx)->deque, (uintptr_t)obj); +} + +static inline void +marker_trace(struct context *cx) { + struct marker *marker = context_marker(cx); + + DEBUG("starting trace; %zu workers\n", marker->worker_count); + while (1) { + DEBUG("waking workers\n"); + for (size_t i = 0; i < marker->worker_count; i++) + mark_worker_request_mark(&marker->workers[i]); + + DEBUG("running controller loop\n"); + size_t n = 0; + while (1) { + uintptr_t addr = mark_channel_pop(&marker->overflow); + if (!addr) + break; + mark_deque_push(&marker->deque, addr); + n++; + } + DEBUG("controller loop done, %zu objects sent for rebalancing\n", n); + + // As in the ISMM'16 paper, it's possible that a worker decides to + // stop because the deque is empty, but actually there was an + // in-flight object in the mark channel that we hadn't been able to + // push yet. Loop in that case. + { + uintptr_t addr = mark_deque_try_pop(&marker->deque); + if (addr == mark_deque_empty) + break; + DEBUG("--> controller looping again due to slop\n"); + mark_deque_push(&marker->deque, addr); + } + } + ASSERT(atomic_load(&marker->overflow.length) == 0); + ASSERT(atomic_load(&marker->overflow.head) == marker->overflow.tail); + DEBUG("trace finished\n"); } #endif // SERIAL_MARK_H diff --git a/serial-marker.h b/serial-marker.h index 1c2e305a7..719ba1c51 100644 --- a/serial-marker.h +++ b/serial-marker.h @@ -126,9 +126,7 @@ static void marker_release(struct context *cx) { struct gcobj; static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; -static inline void marker_trace(struct context *cx, - void (*)(struct gcobj *, void *)) - ALWAYS_INLINE; +static inline void trace_one(struct gcobj *obj, void *mark_data) ALWAYS_INLINE; static inline int mark_object(struct context *cx, struct gcobj *obj) ALWAYS_INLINE; @@ -144,8 +142,7 @@ marker_visit_root(void **loc, struct context *cx) { marker_visit(loc, cx); } static inline void -marker_trace(struct context *cx, - void (*trace_one)(struct gcobj *, void *)) { +marker_trace(struct context *cx) { struct gcobj *obj; while ((obj = mark_queue_pop(&context_marker(cx)->queue))) trace_one(obj, cx); From 4d7041bfa9820a90f4fe234eea8ad183e1e8b2f2 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 13 Mar 2022 13:54:58 +0100 Subject: [PATCH 027/403] Another attempt at parallel marking, avoiding the channel Not great though! 
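The approach in this patch drops the MPSC overflow channel: each marker keeps a small private ring of grey objects and, once it crosses a share threshold, publishes a batch directly onto the single shared deque, taking a writer lock only for that batch and preferring a trylock so the common case stays cheap; the lock is forced only when the private ring has no room left. What follows is a minimal stand-alone sketch of that sharing policy, not the patch's own code: the ring, maybe_share, and shared_deque_push names are illustrative stand-ins, and the constants mirror the 64-entry local queue with the 48/32 share threshold used below.

#include <pthread.h>
#include <stdint.h>
#include <stddef.h>

#define RING_SIZE 64
#define RING_MASK (RING_SIZE - 1)
#define SHARE_THRESHOLD 48
#define SHARE_AMOUNT 32

struct ring { size_t read, write; uintptr_t data[RING_SIZE]; };

static size_t ring_size(struct ring *r) { return r->write - r->read; }
static int ring_full(struct ring *r) { return ring_size(r) == RING_SIZE; }
static void ring_push(struct ring *r, uintptr_t v) { r->data[r->write++ & RING_MASK] = v; }
static uintptr_t ring_pop(struct ring *r) { return r->data[r->read++ & RING_MASK]; }

static pthread_mutex_t writer_lock = PTHREAD_MUTEX_INITIALIZER;
static void shared_deque_push(uintptr_t v) { (void)v; }   /* stand-in for mark_deque_push */

static void maybe_share(struct ring *r) {
  if (ring_size(r) < SHARE_THRESHOLD)
    return;
  if (pthread_mutex_trylock(&writer_lock) != 0) {
    if (!ring_full(r))
      return;                           /* contended: keep marking locally */
    pthread_mutex_lock(&writer_lock);   /* out of room: wait for the lock */
  }
  for (size_t i = 0; i < SHARE_AMOUNT; i++)
    shared_deque_push(ring_pop(r));
  pthread_mutex_unlock(&writer_lock);
}

int main(void) {
  struct ring r = { 0, 0, { 0 } };
  for (uintptr_t i = 1; i <= RING_SIZE; i++) {
    ring_push(&r, i);
    maybe_share(&r);
  }
  return 0;
}

The "not great" part is visible in maybe_share: every batch publish contends on one lock and one deque, which is what the next patch replaces with per-worker deques.
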
--- parallel-marker.h | 104 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 94 insertions(+), 10 deletions(-) diff --git a/parallel-marker.h b/parallel-marker.h index f935bc155..808a588ba 100644 --- a/parallel-marker.h +++ b/parallel-marker.h @@ -223,6 +223,8 @@ mark_deque_steal(struct mark_deque *q) { #define LOCAL_MARK_QUEUE_SIZE 64 #define LOCAL_MARK_QUEUE_MASK 63 +#define LOCAL_MARK_QUEUE_SHARE_THRESHOLD 48 +#define LOCAL_MARK_QUEUE_SHARE_AMOUNT 32 struct local_mark_queue { size_t read; size_t write; @@ -237,13 +239,21 @@ static inline void local_mark_queue_poison(struct local_mark_queue *q) { q->read = 0; q->write = LOCAL_MARK_QUEUE_SIZE; } +static inline size_t +local_mark_queue_size(struct local_mark_queue *q) { + return q->write - q->read; +} static inline int local_mark_queue_empty(struct local_mark_queue *q) { - return q->read == q->write; + return local_mark_queue_size(q) == 0; +} +static inline int +local_mark_queue_should_share(struct local_mark_queue *q) { + return local_mark_queue_size(q) >= LOCAL_MARK_QUEUE_SHARE_THRESHOLD; } static inline int local_mark_queue_full(struct local_mark_queue *q) { - return q->read + LOCAL_MARK_QUEUE_SIZE == q->write; + return local_mark_queue_size(q) >= LOCAL_MARK_QUEUE_SIZE; } static inline void local_mark_queue_push(struct local_mark_queue *q, uintptr_t v) { @@ -313,7 +323,9 @@ mark_notify_wait(struct mark_notify *notify) { } // Spurious wakeup is OK. + DEBUG("-- marker waiting\n"); pthread_cond_wait(¬ify->cond, ¬ify->lock); + DEBUG("-- marker woke\n"); res = MARK_NOTIFY_WOKE; notify->pending = 0; @@ -324,10 +336,12 @@ done: static void mark_notify_wake(struct mark_notify *notify) { + DEBUG("== notifying pending wake!\n"); pthread_mutex_lock(¬ify->lock); notify->pending = 1; pthread_cond_signal(¬ify->cond); pthread_mutex_unlock(¬ify->lock); + DEBUG("== notifying pending wake done\n"); } // A mostly lock-free multi-producer, single consumer queue, largely @@ -486,7 +500,9 @@ struct mark_worker { struct marker { struct mark_deque deque; + pthread_mutex_t deque_writer_lock; struct mark_channel overflow; + atomic_size_t active_markers; size_t worker_count; struct mark_worker workers[MARK_WORKERS_MAX_COUNT]; }; @@ -586,6 +602,7 @@ marker_init(struct context *cx) { struct marker *marker = context_marker(cx); if (!mark_deque_init(&marker->deque)) return 0; + pthread_mutex_init(&marker->deque_writer_lock, NULL); mark_channel_init(&marker->overflow); size_t desired_worker_count = 0; if (getenv("GC_MARKERS")) @@ -615,16 +632,80 @@ static inline void trace_one(struct gcobj *obj, void *mark_data) ALWAYS_INLINE; static inline int mark_object(struct context *cx, struct gcobj *obj) ALWAYS_INLINE; +static inline void +marker_share(struct local_marker *mark) { + struct marker *marker = context_marker(mark->cx); + DEBUG("marker %p: trying to share\n", mark->worker); + if (pthread_mutex_trylock(&marker->deque_writer_lock) != 0) { + DEBUG("marker %p: trylock failed\n", mark->worker); + if (local_mark_queue_full(&mark->local)) { + DEBUG("marker %p: forcing lock acquisition\n", mark->worker); + pthread_mutex_lock(&marker->deque_writer_lock); + } else + return; + } + + DEBUG("marker %p: sharing\n", mark->worker); + for (size_t i = 0; i < LOCAL_MARK_QUEUE_SHARE_AMOUNT; i++) + mark_deque_push(&marker->deque, local_mark_queue_pop(&mark->local)); + + pthread_mutex_unlock(&marker->deque_writer_lock); +} + static inline void marker_visit(void **loc, void *mark_data) { struct local_marker *mark = mark_data; struct gcobj *obj = *loc; if (obj && 
mark_object(mark->cx, obj)) { - if (!local_mark_queue_full(&mark->local)) - local_mark_queue_push(&mark->local, (uintptr_t)obj); - else { - mark_channel_write(&mark->worker->writer, (uintptr_t)obj); + if (local_mark_queue_should_share(&mark->local)) + marker_share(mark); + local_mark_queue_push(&mark->local, (uintptr_t)obj); + } +} + +static uintptr_t +mark_worker_steal(struct local_marker *mark) { + DEBUG("marker %p: trying to steal\n", mark->worker); + while (1) { + uintptr_t addr = mark_deque_steal(mark->deque); + if (addr == mark_deque_empty) { + struct marker *marker = context_marker(mark->cx); + if (atomic_fetch_sub_explicit(&marker->active_markers, 1, + memory_order_relaxed) == 1) { + DEBUG(" ->> marker %p: DONE (no spinning) <<-\n", mark->worker); + return 0; + } + size_t spin_count = 0; + while (1) { + addr = mark_deque_steal(mark->deque); + if (addr != mark_deque_empty) { + DEBUG("marker %p: spinning got 0x%zx\n", mark->worker, addr); + atomic_fetch_add_explicit(&marker->active_markers, 1, + memory_order_relaxed); + break; + } + if (atomic_load_explicit(&marker->active_markers, + memory_order_relaxed) == 0) { + DEBUG(" ->> marker %p: DONE <<-\n", mark->worker); + return 0; + } + // spin + DEBUG("marker %p: spinning #%zu\n", mark->worker, spin_count); + if (spin_count < 10) + __builtin_ia32_pause(); + else if (spin_count < 20) + sched_yield(); + else if (spin_count < 40) + usleep(0); + else + usleep(1); + spin_count++; + } } + DEBUG("marker %p: stealing got 0x%zx\n", mark->worker, addr); + if (addr == mark_deque_abort) + continue; + return addr; } } @@ -643,11 +724,9 @@ mark_worker_mark(struct mark_worker *worker) { if (!local_mark_queue_empty(&mark.local)) { addr = local_mark_queue_pop(&mark.local); } else { - addr = mark_deque_steal(mark.deque); - if (addr == mark_deque_empty) + addr = mark_worker_steal(&mark); + if (!addr) break; - if (addr == mark_deque_abort) - continue; } trace_one((struct gcobj*)addr, &mark); n++; @@ -671,16 +750,21 @@ marker_trace(struct context *cx) { DEBUG("starting trace; %zu workers\n", marker->worker_count); while (1) { DEBUG("waking workers\n"); + atomic_store_explicit(&marker->active_markers, marker->worker_count, + memory_order_release); for (size_t i = 0; i < marker->worker_count; i++) mark_worker_request_mark(&marker->workers[i]); DEBUG("running controller loop\n"); size_t n = 0; while (1) { + DEBUG("controller: popping\n"); uintptr_t addr = mark_channel_pop(&marker->overflow); + DEBUG("controller: popped 0x%zx\n", addr); if (!addr) break; mark_deque_push(&marker->deque, addr); + DEBUG("controller: pushed to deque\n"); n++; } DEBUG("controller loop done, %zu objects sent for rebalancing\n", n); From fddd4d9416069a2201b48bc7899332c8d4ccc43a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 13 Mar 2022 21:38:59 +0100 Subject: [PATCH 028/403] Hey parallel marking is finally an improvement?? 
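This version gives every marker its own Chase-Lev deque: overflow from the local queue goes to the worker's own deque, idle workers steal from their peers round-robin, and the controller simply waits on a condition variable until the workers agree they are done. The interesting piece is the termination protocol: a worker that finds nothing to steal decrements a shared count of active markers, keeps polling the other deques with a growing backoff, rejoins the count if work reappears, and declares the trace finished once the count reaches zero. Below is a condensed sketch of that loop under simplified assumptions: any_deque_nonempty stands in for mark_worker_can_steal_from_any, and the backoff ladder is reduced from the patch's pause/yield/sleep sequence.

#include <stdatomic.h>
#include <stddef.h>
#include <sched.h>
#include <unistd.h>

static atomic_size_t active_markers;

static int any_deque_nonempty(void) { return 0; }   /* stand-in: does any peer deque have work? */

/* Returns 1 if marking is globally done, 0 if the caller should resume
   stealing because new work appeared while it was idle. */
static int check_termination(void) {
  if (atomic_fetch_sub(&active_markers, 1) == 1)
    return 1;                             /* we were the last active marker */
  for (size_t spin = 0;; spin++) {
    if (any_deque_nonempty()) {
      atomic_fetch_add(&active_markers, 1);   /* work reappeared: rejoin */
      return 0;
    }
    if (atomic_load(&active_markers) == 0)
      return 1;                           /* every marker has quiesced */
    if (spin < 20)
      sched_yield();                      /* cheap backoff first */
    else
      usleep(1);                          /* then sleep briefly */
  }
}

int main(void) {
  atomic_init(&active_markers, 1);
  return check_termination() ? 0 : 1;
}

The rejoin step is what keeps the protocol safe: a worker only counts itself out while it is actively re-checking the other deques, so the count can only reach zero when no deque holds work anywhere.
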
--- parallel-marker.h | 512 ++++++++++++++++------------------------------ 1 file changed, 176 insertions(+), 336 deletions(-) diff --git a/parallel-marker.h b/parallel-marker.h index 808a588ba..1bfbbbd93 100644 --- a/parallel-marker.h +++ b/parallel-marker.h @@ -1,5 +1,5 @@ -#ifndef SERIAL_TRACE_H -#define SERIAL_TRACE_H +#ifndef PARALLEL_MARKER_H +#define PARALLEL_MARKER_H #include #include @@ -215,16 +215,23 @@ mark_deque_steal(struct mark_deque *q) { return x; } +static int +mark_deque_can_steal(struct mark_deque *q) { + size_t t = LOAD_ACQUIRE(&q->top); + atomic_thread_fence(memory_order_seq_cst); + size_t b = LOAD_ACQUIRE(&q->bottom); + return t < b; +} + #undef LOAD_RELAXED #undef STORE_RELAXED #undef LOAD_ACQUIRE #undef STORE_RELEASE #undef LOAD_CONSUME -#define LOCAL_MARK_QUEUE_SIZE 64 -#define LOCAL_MARK_QUEUE_MASK 63 -#define LOCAL_MARK_QUEUE_SHARE_THRESHOLD 48 -#define LOCAL_MARK_QUEUE_SHARE_AMOUNT 32 +#define LOCAL_MARK_QUEUE_SIZE 1024 +#define LOCAL_MARK_QUEUE_MASK (LOCAL_MARK_QUEUE_SIZE - 1) +#define LOCAL_MARK_QUEUE_SHARE_AMOUNT (LOCAL_MARK_QUEUE_SIZE * 3 / 4) struct local_mark_queue { size_t read; size_t write; @@ -248,10 +255,6 @@ local_mark_queue_empty(struct local_mark_queue *q) { return local_mark_queue_size(q) == 0; } static inline int -local_mark_queue_should_share(struct local_mark_queue *q) { - return local_mark_queue_size(q) >= LOCAL_MARK_QUEUE_SHARE_THRESHOLD; -} -static inline int local_mark_queue_full(struct local_mark_queue *q) { return local_mark_queue_size(q) >= LOCAL_MARK_QUEUE_SIZE; } @@ -264,221 +267,6 @@ local_mark_queue_pop(struct local_mark_queue *q) { return q->data[q->read++ & LOCAL_MARK_QUEUE_MASK]; } -struct mark_notify { - size_t notifiers; - int pending; - pthread_mutex_t lock; - pthread_cond_t cond; -}; - -static void -mark_notify_init(struct mark_notify *notify) { - notify->notifiers = 0; - notify->pending = 0; - pthread_mutex_init(¬ify->lock, NULL); - pthread_cond_init(¬ify->cond, NULL); -} - -static void -mark_notify_destroy(struct mark_notify *notify) { - pthread_mutex_destroy(¬ify->lock); - pthread_cond_destroy(¬ify->cond); -} - -static void -mark_notify_add_notifier(struct mark_notify *notify) { - pthread_mutex_lock(¬ify->lock); - notify->notifiers++; - pthread_mutex_unlock(¬ify->lock); -} - -static void -mark_notify_remove_notifier(struct mark_notify *notify) { - pthread_mutex_lock(¬ify->lock); - notify->notifiers--; - if (notify->notifiers == 0) - pthread_cond_signal(¬ify->cond); - pthread_mutex_unlock(¬ify->lock); -} - -enum mark_notify_status { - MARK_NOTIFY_DONE, - MARK_NOTIFY_WOKE -}; -static enum mark_notify_status -mark_notify_wait(struct mark_notify *notify) { - enum mark_notify_status res; - - pthread_mutex_lock(¬ify->lock); - - if (notify->pending) { - res = MARK_NOTIFY_WOKE; - notify->pending = 0; - goto done; - } - - if (notify->notifiers == 0) { - res = MARK_NOTIFY_DONE; - goto done; - } - - // Spurious wakeup is OK. - DEBUG("-- marker waiting\n"); - pthread_cond_wait(¬ify->cond, ¬ify->lock); - DEBUG("-- marker woke\n"); - res = MARK_NOTIFY_WOKE; - notify->pending = 0; - -done: - pthread_mutex_unlock(¬ify->lock); - return res; -} - -static void -mark_notify_wake(struct mark_notify *notify) { - DEBUG("== notifying pending wake!\n"); - pthread_mutex_lock(¬ify->lock); - notify->pending = 1; - pthread_cond_signal(¬ify->cond); - pthread_mutex_unlock(¬ify->lock); - DEBUG("== notifying pending wake done\n"); -} - -// A mostly lock-free multi-producer, single consumer queue, largely -// inspired by Rust's std::sync::channel. 
-// -// https://www.1024cores.net/home/lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue - -struct mark_channel_message { - struct mark_channel_message * _Atomic next; - // Payload will be zero only for free messages, and for the sentinel - // message. - atomic_uintptr_t payload; -}; - -#define MARK_CHANNEL_WRITER_MESSAGE_COUNT ((size_t)1024) - -struct mark_channel { - union { - struct mark_channel_message* _Atomic head; - char head_padding[64]; - }; - union { - atomic_size_t length; - char length_padding[64]; - }; - struct mark_channel_message* tail; - struct mark_channel_message sentinel; - - struct mark_notify notify; -}; - -struct mark_channel_writer { - struct mark_channel_message messages[MARK_CHANNEL_WRITER_MESSAGE_COUNT]; - size_t next_message; - - struct mark_channel *channel; -}; - -static void -mark_channel_init(struct mark_channel *ch) { - memset(ch, 0, sizeof(*ch)); - atomic_init(&ch->head, &ch->sentinel); - atomic_init(&ch->length, 0); - mark_notify_init(&ch->notify); - ch->tail = &ch->sentinel; -} - -static void -mark_channel_destroy(struct mark_channel *ch) { - mark_notify_destroy(&ch->notify); -} - -static void -mark_channel_push(struct mark_channel *ch, struct mark_channel_message *msg) { - ASSERT(msg->payload); - atomic_store_explicit(&msg->next, NULL, memory_order_relaxed); - - struct mark_channel_message *prev = - atomic_exchange_explicit(&ch->head, msg, memory_order_acq_rel); - - atomic_store_explicit(&prev->next, msg, memory_order_release); - - size_t old_length = - atomic_fetch_add_explicit(&ch->length, 1, memory_order_relaxed); - if (old_length == 0) - mark_notify_wake(&ch->notify); -} - -static uintptr_t -mark_channel_try_pop(struct mark_channel *ch) { - struct mark_channel_message *tail = ch->tail; - struct mark_channel_message *next = - atomic_load_explicit(&tail->next, memory_order_acquire); - - if (next) { - ch->tail = next; - uintptr_t payload = - atomic_load_explicit(&next->payload, memory_order_acquire); - ASSERT(payload != 0); - // Indicate to the writer that the old tail node can now be re-used. - // Note though that the new tail node is floating garbage; its - // payload has been popped but the node itself is still part of the - // queue. Care has to be taken to ensure that any remaining queue - // entries are popped before the associated channel writer's - // messages are deallocated. 
- atomic_store_explicit(&tail->payload, 0, memory_order_release); - atomic_fetch_sub_explicit(&ch->length, 1, memory_order_relaxed); - return payload; - } - - // if (atomic_load_explicit(&ch->head) == tail) return EMPTY else INCONSISTENT - return 0; -} - -static uintptr_t -mark_channel_pop(struct mark_channel *ch) { - while (1) { - uintptr_t ret = mark_channel_try_pop(ch); - if (ret) - return ret; - - if (atomic_load_explicit(&ch->length, memory_order_relaxed) == 0) { - if (mark_notify_wait(&ch->notify) == MARK_NOTIFY_DONE) - return 0; - } - } -} - -static void -mark_channel_writer_init(struct mark_channel *ch, - struct mark_channel_writer *writer) { - memset(writer, 0, sizeof(*writer)); - writer->channel = ch; -} - -static void -mark_channel_write(struct mark_channel_writer *writer, uintptr_t payload) { - ASSERT(payload); - struct mark_channel_message *msg = &writer->messages[writer->next_message]; - while (atomic_load_explicit(&msg->payload, memory_order_acquire) != 0) - sched_yield(); - writer->next_message++; - if (writer->next_message == MARK_CHANNEL_WRITER_MESSAGE_COUNT) - writer->next_message = 0; - atomic_store_explicit(&msg->payload, payload, memory_order_release); - mark_channel_push(writer->channel, msg); -} - -static void -mark_channel_writer_activate(struct mark_channel_writer *writer) { - mark_notify_add_notifier(&writer->channel->notify); -} -static void -mark_channel_writer_deactivate(struct mark_channel_writer *writer) { - mark_notify_remove_notifier(&writer->channel->notify); -} - enum mark_worker_state { MARK_WORKER_STOPPED, MARK_WORKER_IDLE, @@ -489,27 +277,30 @@ enum mark_worker_state { struct mark_worker { struct context *cx; + size_t id; + size_t steal_id; pthread_t thread; enum mark_worker_state state; pthread_mutex_t lock; pthread_cond_t cond; - struct mark_channel_writer writer; + struct mark_deque deque; }; #define MARK_WORKERS_MAX_COUNT 8 struct marker { - struct mark_deque deque; - pthread_mutex_t deque_writer_lock; - struct mark_channel overflow; atomic_size_t active_markers; size_t worker_count; + atomic_size_t running_markers; + long count; + pthread_mutex_t lock; + pthread_cond_t cond; struct mark_worker workers[MARK_WORKERS_MAX_COUNT]; }; struct local_marker { struct mark_worker *worker; - struct mark_deque *deque; + struct mark_deque *share_deque; struct context *cx; struct local_mark_queue local; }; @@ -519,15 +310,17 @@ static inline struct marker* context_marker(struct context *cx); static size_t number_of_current_processors(void) { return 1; } -static void +static int mark_worker_init(struct mark_worker *worker, struct context *cx, - struct marker *marker) { + struct marker *marker, size_t id) { worker->cx = cx; + worker->id = id; + worker->steal_id = 0; worker->thread = 0; worker->state = MARK_WORKER_STOPPED; pthread_mutex_init(&worker->lock, NULL); pthread_cond_init(&worker->cond, NULL); - mark_channel_writer_init(&marker->overflow, &worker->writer); + return mark_deque_init(&worker->deque); } static void mark_worker_mark(struct mark_worker *worker); @@ -574,9 +367,10 @@ mark_worker_spawn(struct mark_worker *worker) { static void mark_worker_request_mark(struct mark_worker *worker) { + struct marker *marker = context_marker(worker->cx); + pthread_mutex_lock(&worker->lock); ASSERT(worker->state == MARK_WORKER_IDLE); - mark_channel_writer_activate(&worker->writer); worker->state = MARK_WORKER_MARKING; pthread_cond_signal(&worker->cond); pthread_mutex_unlock(&worker->lock); @@ -585,7 +379,14 @@ mark_worker_request_mark(struct mark_worker *worker) { 
static void mark_worker_finished_marking(struct mark_worker *worker) { // Signal controller that we are done with marking. - mark_channel_writer_deactivate(&worker->writer); + struct marker *marker = context_marker(worker->cx); + + if (atomic_fetch_sub(&marker->running_markers, 1) == 1) { + pthread_mutex_lock(&marker->lock); + marker->count++; + pthread_cond_signal(&marker->cond); + pthread_mutex_unlock(&marker->lock); + } } static void @@ -600,10 +401,11 @@ mark_worker_request_stop(struct mark_worker *worker) { static int marker_init(struct context *cx) { struct marker *marker = context_marker(cx); - if (!mark_deque_init(&marker->deque)) - return 0; - pthread_mutex_init(&marker->deque_writer_lock, NULL); - mark_channel_init(&marker->overflow); + atomic_init(&marker->active_markers, 0); + atomic_init(&marker->running_markers, 0); + marker->count = 0; + pthread_mutex_init(&marker->lock, NULL); + pthread_cond_init(&marker->cond, NULL); size_t desired_worker_count = 0; if (getenv("GC_MARKERS")) desired_worker_count = atoi(getenv("GC_MARKERS")); @@ -612,7 +414,8 @@ marker_init(struct context *cx) { if (desired_worker_count > MARK_WORKERS_MAX_COUNT) desired_worker_count = MARK_WORKERS_MAX_COUNT; for (size_t i = 0; i < desired_worker_count; i++) { - mark_worker_init(&marker->workers[i], cx, marker); + if (!mark_worker_init(&marker->workers[i], cx, marker, i)) + break; if (mark_worker_spawn(&marker->workers[i])) marker->worker_count++; else @@ -621,9 +424,15 @@ marker_init(struct context *cx) { return marker->worker_count > 0; } -static void marker_prepare(struct context *cx) {} +static void marker_prepare(struct context *cx) { + struct marker *marker = context_marker(cx); + for (size_t i = 0; i < marker->worker_count; i++) + marker->workers[i].steal_id = 0; +} static void marker_release(struct context *cx) { - mark_deque_release(&context_marker(cx)->deque); + struct marker *marker = context_marker(cx); + for (size_t i = 0; i < marker->worker_count; i++) + mark_deque_release(&marker->workers[i].deque); } struct gcobj; @@ -634,22 +443,9 @@ static inline int mark_object(struct context *cx, static inline void marker_share(struct local_marker *mark) { - struct marker *marker = context_marker(mark->cx); - DEBUG("marker %p: trying to share\n", mark->worker); - if (pthread_mutex_trylock(&marker->deque_writer_lock) != 0) { - DEBUG("marker %p: trylock failed\n", mark->worker); - if (local_mark_queue_full(&mark->local)) { - DEBUG("marker %p: forcing lock acquisition\n", mark->worker); - pthread_mutex_lock(&marker->deque_writer_lock); - } else - return; - } - - DEBUG("marker %p: sharing\n", mark->worker); + DEBUG("marker #%zu: sharing\n", mark->worker->id); for (size_t i = 0; i < LOCAL_MARK_QUEUE_SHARE_AMOUNT; i++) - mark_deque_push(&marker->deque, local_mark_queue_pop(&mark->local)); - - pthread_mutex_unlock(&marker->deque_writer_lock); + mark_deque_push(mark->share_deque, local_mark_queue_pop(&mark->local)); } static inline void @@ -657,55 +453,114 @@ marker_visit(void **loc, void *mark_data) { struct local_marker *mark = mark_data; struct gcobj *obj = *loc; if (obj && mark_object(mark->cx, obj)) { - if (local_mark_queue_should_share(&mark->local)) + if (local_mark_queue_full(&mark->local)) marker_share(mark); local_mark_queue_push(&mark->local, (uintptr_t)obj); } } static uintptr_t -mark_worker_steal(struct local_marker *mark) { - DEBUG("marker %p: trying to steal\n", mark->worker); +marker_steal_from_worker(struct marker *marker, size_t id) { + ASSERT(id < marker->worker_count); while (1) { - uintptr_t 
addr = mark_deque_steal(mark->deque); - if (addr == mark_deque_empty) { - struct marker *marker = context_marker(mark->cx); - if (atomic_fetch_sub_explicit(&marker->active_markers, 1, - memory_order_relaxed) == 1) { - DEBUG(" ->> marker %p: DONE (no spinning) <<-\n", mark->worker); - return 0; - } - size_t spin_count = 0; - while (1) { - addr = mark_deque_steal(mark->deque); - if (addr != mark_deque_empty) { - DEBUG("marker %p: spinning got 0x%zx\n", mark->worker, addr); - atomic_fetch_add_explicit(&marker->active_markers, 1, - memory_order_relaxed); - break; - } - if (atomic_load_explicit(&marker->active_markers, - memory_order_relaxed) == 0) { - DEBUG(" ->> marker %p: DONE <<-\n", mark->worker); - return 0; - } - // spin - DEBUG("marker %p: spinning #%zu\n", mark->worker, spin_count); - if (spin_count < 10) - __builtin_ia32_pause(); - else if (spin_count < 20) - sched_yield(); - else if (spin_count < 40) - usleep(0); - else - usleep(1); - spin_count++; - } - } - DEBUG("marker %p: stealing got 0x%zx\n", mark->worker, addr); - if (addr == mark_deque_abort) + uintptr_t res = mark_deque_steal(&marker->workers[id].deque); + if (res == mark_deque_empty) + return 0; + if (res == mark_deque_abort) continue; - return addr; + return res; + } +} + +static uintptr_t +marker_can_steal_from_worker(struct marker *marker, size_t id) { + ASSERT(id < marker->worker_count); + return mark_deque_can_steal(&marker->workers[id].deque); +} + +static uintptr_t +mark_worker_steal_from_any(struct mark_worker *worker, struct marker *marker) { + size_t steal_id = worker->steal_id; + for (size_t i = 0; i < marker->worker_count; i++) { + steal_id = (steal_id + 1) % marker->worker_count; + DEBUG("marker #%zu: stealing from #%zu\n", worker->id, steal_id); + uintptr_t addr = marker_steal_from_worker(marker, steal_id); + if (addr) { + DEBUG("marker #%zu: stealing got 0x%zx\n", worker->id, addr); + worker->steal_id = steal_id; + return addr; + } + } + DEBUG("marker #%zu: failed to steal\n", worker->id); + return 0; +} + +static int +mark_worker_can_steal_from_any(struct mark_worker *worker, struct marker *marker) { + size_t steal_id = worker->steal_id; + DEBUG("marker #%zu: checking if any worker has tasks\n", worker->id); + for (size_t i = 0; i < marker->worker_count; i++) { + steal_id = (steal_id + 1) % marker->worker_count; + int res = marker_can_steal_from_worker(marker, steal_id); + if (res) { + DEBUG("marker #%zu: worker #%zu has tasks!\n", worker->id, steal_id); + worker->steal_id = steal_id; + return 1; + } + } + DEBUG("marker #%zu: nothing to steal\n", worker->id); + return 0; +} + +static int +mark_worker_check_termination(struct mark_worker *worker, + struct marker *marker) { + // We went around all workers and nothing. Enter termination phase. 
+ if (atomic_fetch_sub_explicit(&marker->active_markers, 1, + memory_order_relaxed) == 1) { + DEBUG(" ->> marker #%zu: DONE (no spinning) <<-\n", worker->id); + return 1; + } + + size_t spin_count = 0; + while (1) { + if (mark_worker_can_steal_from_any(worker, marker)) { + atomic_fetch_add_explicit(&marker->active_markers, 1, + memory_order_relaxed); + return 0; + } + if (atomic_load_explicit(&marker->active_markers, + memory_order_relaxed) == 0) { + DEBUG(" ->> marker #%zu: DONE <<-\n", worker->id); + return 1; + } + // spin + DEBUG("marker #%zu: spinning #%zu\n", worker->id, spin_count); + if (spin_count < 10) + __builtin_ia32_pause(); + else if (spin_count < 20) + sched_yield(); + else if (spin_count < 40) + usleep(0); + else + usleep(1); + spin_count++; + } +} + +static uintptr_t +mark_worker_steal(struct local_marker *mark) { + struct marker *marker = context_marker(mark->cx); + struct mark_worker *worker = mark->worker; + + while (1) { + DEBUG("marker #%zu: trying to steal\n", worker->id); + uintptr_t addr = mark_worker_steal_from_any(worker, marker); + if (addr) + return addr; + + if (mark_worker_check_termination(worker, marker)) + return 0; } } @@ -713,12 +568,12 @@ static void mark_worker_mark(struct mark_worker *worker) { struct local_marker mark; mark.worker = worker; - mark.deque = &context_marker(worker->cx)->deque; + mark.share_deque = &worker->deque; mark.cx = worker->cx; local_mark_queue_init(&mark.local); size_t n = 0; - DEBUG("marker %p: running mark loop\n", worker); + DEBUG("marker #%zu: running mark loop\n", worker->id); while (1) { uintptr_t addr; if (!local_mark_queue_empty(&mark.local)) { @@ -731,7 +586,7 @@ mark_worker_mark(struct mark_worker *worker) { trace_one((struct gcobj*)addr, &mark); n++; } - DEBUG("marker %p: done marking, %zu objects traced\n", worker, n); + DEBUG("marker #%zu: done marking, %zu objects traced\n", worker->id, n); mark_worker_finished_marking(worker); } @@ -739,51 +594,36 @@ mark_worker_mark(struct mark_worker *worker) { static inline void marker_visit_root(void **loc, struct context *cx) { struct gcobj *obj = *loc; + struct mark_deque *worker0_deque = &context_marker(cx)->workers[0].deque; if (obj && mark_object(cx, obj)) - mark_deque_push(&context_marker(cx)->deque, (uintptr_t)obj); + mark_deque_push(worker0_deque, (uintptr_t)obj); } static inline void marker_trace(struct context *cx) { struct marker *marker = context_marker(cx); + pthread_mutex_lock(&marker->lock); + long mark_count = marker->count; + pthread_mutex_unlock(&marker->lock); + DEBUG("starting trace; %zu workers\n", marker->worker_count); - while (1) { - DEBUG("waking workers\n"); - atomic_store_explicit(&marker->active_markers, marker->worker_count, - memory_order_release); - for (size_t i = 0; i < marker->worker_count; i++) - mark_worker_request_mark(&marker->workers[i]); + DEBUG("waking workers\n"); + atomic_store_explicit(&marker->active_markers, marker->worker_count, + memory_order_release); + atomic_store_explicit(&marker->running_markers, marker->worker_count, + memory_order_release); + for (size_t i = 0; i < marker->worker_count; i++) + mark_worker_request_mark(&marker->workers[i]); - DEBUG("running controller loop\n"); - size_t n = 0; - while (1) { - DEBUG("controller: popping\n"); - uintptr_t addr = mark_channel_pop(&marker->overflow); - DEBUG("controller: popped 0x%zx\n", addr); - if (!addr) - break; - mark_deque_push(&marker->deque, addr); - DEBUG("controller: pushed to deque\n"); - n++; - } - DEBUG("controller loop done, %zu objects sent for rebalancing\n", n); 
+ DEBUG("waiting on markers\n"); + + pthread_mutex_lock(&marker->lock); + while (marker->count <= mark_count) + pthread_cond_wait(&marker->cond, &marker->lock); + pthread_mutex_unlock(&marker->lock); - // As in the ISMM'16 paper, it's possible that a worker decides to - // stop because the deque is empty, but actually there was an - // in-flight object in the mark channel that we hadn't been able to - // push yet. Loop in that case. - { - uintptr_t addr = mark_deque_try_pop(&marker->deque); - if (addr == mark_deque_empty) - break; - DEBUG("--> controller looping again due to slop\n"); - mark_deque_push(&marker->deque, addr); - } - } - ASSERT(atomic_load(&marker->overflow.length) == 0); - ASSERT(atomic_load(&marker->overflow.head) == marker->overflow.tail); DEBUG("trace finished\n"); } -#endif // SERIAL_MARK_H +#endif // PARALLEL_MARKER_H From a693c4ea8a60edd7a35b13c2fe28b3f42bd22405 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 13 Mar 2022 21:45:20 +0100 Subject: [PATCH 029/403] Bugfix to mark-sweep Before this, the last sweep would cause premature gc --- mark-sweep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mark-sweep.h b/mark-sweep.h index 2c90f48eb..b5219c644 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -306,7 +306,7 @@ static int sweep(struct context *cx) { } cx->sweep = sweep; - return sweep < limit; + return to_reclaim < 128; } static void* allocate_large(struct context *cx, enum alloc_kind kind, From a1b4311cfc26e555e4a0b0974e75bc27d0e1834a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 13 Mar 2022 21:55:58 +0100 Subject: [PATCH 030/403] Update status --- README.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3323d1377..48f56181b 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,8 @@ Then there are currently three collectors: mark-sweep segregated-fits collector with lazy sweeping. - `semi.h`: Semispace copying collector. - `mark-sweep.h`: Stop-the-world mark-sweep segregated-fits collector - with lazy sweeping. + with lazy sweeping. Two different marking algorithms: + single-threaded and parallel. The two latter collectors reserve one word per object on the header, which makes them collect more frequently than `bdw` because the `Node` @@ -27,10 +28,11 @@ garbage collector. Guile currently uses BDW-GC. In Guile if we have an object reference we generally have to be able to know what kind of object it is, because there are few global invariants enforced by typing. Therefore it is reasonable to consider allowing the GC and the -application to share the first word of an object, for example to store a -mark bit, to allow the application to know what kind an object is, to -allow the GC to find references within the object, to allow the GC to -compute the object's size, and so on. +application to share the first word of an object, for example to maybe +store a mark bit (though an on-the-side mark byte seems to allow much +more efficient sweeping, for mark-sweep), to allow the application to +know what kind an object is, to allow the GC to find references within +the object, to allow the GC to compute the object's size, and so on. The GCBench benchmark is small but then again many Guile processes also are quite short-lived, so perhaps it is useful to ensure that small @@ -65,8 +67,9 @@ majority of use cases. ## To do - - [ ] Implement a parallel marker for the mark-sweep collector. - - [ ] Adapt GCBench for multiple mutator threads. 
+ - [X] Implement a parallel marker for the mark-sweep collector. + - [ ] Adapt all GC implementations to allow multiple mutator threads. + Update gcbench.c. - [ ] Implement precise non-moving Immix whole-heap collector. - [ ] Add evacuation to Immix whole-heap collector. - [ ] Add parallelism to Immix stop-the-world phase. @@ -79,7 +82,7 @@ majority of use cases. ## License -GCBench.c, MT_GCBench.c, and MT_GCBench2.c are from +gcbench.c, MT_GCBench.c, and MT_GCBench2.c are from https://hboehm.info/gc/gc_bench/ and have a somewhat unclear license. I have modified GCBench significantly so that I can slot in different GC implementations. The GC implementations themselves are available under From aac0faf4cf1fd587a346dc82ee9ae195944daedb Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 16 Mar 2022 09:05:31 +0100 Subject: [PATCH 031/403] Refactor type definitions --- Makefile | 8 ++++---- gcbench-types.h | 22 +--------------------- heap-objects.h | 26 ++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 25 deletions(-) create mode 100644 heap-objects.h diff --git a/Makefile b/Makefile index ecf35b3e3..0363edc06 100644 --- a/Makefile +++ b/Makefile @@ -8,16 +8,16 @@ ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) all: $(ALL_TESTS) -bdw-%: bdw.h conservative-roots.h %.c +bdw-%: bdw.h conservative-roots.h %-types.h %.c $(CC) $(CFLAGS) -DNDEBUG -lpthread `pkg-config --libs --cflags bdw-gc` -I. -DGC_BDW -o $@ $*.c -semi-%: semi.h precise-roots.h %.c +semi-%: semi.h precise-roots.h %-types.h heap-objects.h %.c $(CC) $(CFLAGS) -I. -DNDEBUG -DGC_SEMI -o $@ $*.c -mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %.c +mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %-types.h heap-objects.h %.c $(CC) $(CFLAGS) -I. -Wno-unused -DNDEBUG -DGC_MARK_SWEEP -o $@ $*.c -parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-marker.h assert.h debug.h %.c +parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-marker.h assert.h debug.h %-types.h heap-objects.h %.c $(CC) $(CFLAGS) -I. 
-Wno-unused -DNDEBUG -DGC_PARALLEL_MARK_SWEEP -lpthread -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/gcbench-types.h b/gcbench-types.h index 20cef8be4..a61b2b7d5 100644 --- a/gcbench-types.h +++ b/gcbench-types.h @@ -1,30 +1,10 @@ #ifndef GCBENCH_TYPES_H #define GCBENCH_TYPES_H -#include "inline.h" - #define FOR_EACH_HEAP_OBJECT_KIND(M) \ M(node, Node, NODE) \ M(double_array, DoubleArray, DOUBLE_ARRAY) -#define DECLARE_NODE_TYPE(name, Name, NAME) \ - struct Name; \ - typedef struct Name Name; -FOR_EACH_HEAP_OBJECT_KIND(DECLARE_NODE_TYPE) -#undef DECLARE_NODE_TYPE - -#define DEFINE_ENUM(name, Name, NAME) ALLOC_KIND_##NAME, -enum alloc_kind { - FOR_EACH_HEAP_OBJECT_KIND(DEFINE_ENUM) -}; -#undef DEFINE_ENUM - -#define DEFINE_METHODS(name, Name, NAME) \ - static inline size_t name##_size(Name *obj) ALWAYS_INLINE; \ - static inline void visit_##name##_fields(Name *obj,\ - void (*visit)(void **loc, void *visit_data), \ - void *visit_data) ALWAYS_INLINE; -FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) -#undef DEFINE_METHODS +#include "heap-objects.h" #endif // GCBENCH_TYPES_H diff --git a/heap-objects.h b/heap-objects.h new file mode 100644 index 000000000..db78e7b66 --- /dev/null +++ b/heap-objects.h @@ -0,0 +1,26 @@ +#ifndef HEAP_OBJECTS_H +#define HEAP_OBJECTS_H + +#include "inline.h" + +#define DECLARE_NODE_TYPE(name, Name, NAME) \ + struct Name; \ + typedef struct Name Name; +FOR_EACH_HEAP_OBJECT_KIND(DECLARE_NODE_TYPE) +#undef DECLARE_NODE_TYPE + +#define DEFINE_ENUM(name, Name, NAME) ALLOC_KIND_##NAME, +enum alloc_kind { + FOR_EACH_HEAP_OBJECT_KIND(DEFINE_ENUM) +}; +#undef DEFINE_ENUM + +#define DEFINE_METHODS(name, Name, NAME) \ + static inline size_t name##_size(Name *obj) ALWAYS_INLINE; \ + static inline void visit_##name##_fields(Name *obj,\ + void (*visit)(void **loc, void *visit_data), \ + void *visit_data) ALWAYS_INLINE; +FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) +#undef DEFINE_METHODS + +#endif // HEAP_OBJECTS_H From e7a3f83bcce0c0b5c066f0fb7bb1587fa5f1936e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 16 Mar 2022 14:13:43 +0100 Subject: [PATCH 032/403] Add quads benchmark Also expand GC interface with "allocate_pointerless". Limit lazy sweeping to the allocation size that is causing the sweep, without adding to fragmentation. --- Makefile | 2 +- bdw.h | 22 ++++--- gcbench.c | 7 +- mark-sweep.h | 26 ++++++-- quads-types.h | 9 +++ quads.c | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++ semi.h | 14 +++- 7 files changed, 234 insertions(+), 23 deletions(-) create mode 100644 quads-types.h create mode 100644 quads.c diff --git a/Makefile b/Makefile index 0363edc06..9a33422d5 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -TESTS=gcbench # MT_GCBench MT_GCBench2 +TESTS=gcbench quads # MT_GCBench MT_GCBench2 COLLECTORS=bdw semi mark-sweep parallel-mark-sweep CC=gcc diff --git a/bdw.h b/bdw.h index 51adea161..b32f9537f 100644 --- a/bdw.h +++ b/bdw.h @@ -1,3 +1,5 @@ +#include + #include "conservative-roots.h" // When pthreads are used, let `libgc' know about it and redirect @@ -20,15 +22,17 @@ struct context {}; static inline void* allocate(struct context *cx, enum alloc_kind kind, size_t size) { - switch (kind) { - case ALLOC_KIND_NODE: - // cleared to 0 by the collector. - return GC_malloc(size); - case ALLOC_KIND_DOUBLE_ARRAY: - // warning: not cleared! 
- return GC_malloc_atomic(size); - } - abort(); + return GC_malloc(size); +} + +static inline void* +allocate_pointerless(struct context *cx, enum alloc_kind kind, + size_t size) { + return GC_malloc_atomic(size); +} + +static inline void collect(struct context *cx) { + GC_gcollect(); } static inline void init_field(void **addr, void *val) { diff --git a/gcbench.c b/gcbench.c index 6d8acd48a..d3d9c6412 100644 --- a/gcbench.c +++ b/gcbench.c @@ -94,9 +94,10 @@ static Node* allocate_node(struct context *cx) { static struct DoubleArray* allocate_double_array(struct context *cx, size_t size) { - // note, we might allow the collector to leave this data uninitialized. - DoubleArray *ret = allocate(cx, ALLOC_KIND_DOUBLE_ARRAY, - sizeof(DoubleArray) + sizeof (double) * size); + // May be uninitialized. + DoubleArray *ret = + allocate_pointerless(cx, ALLOC_KIND_DOUBLE_ARRAY, + sizeof(DoubleArray) + sizeof (double) * size); ret->length = size; return ret; } diff --git a/mark-sweep.h b/mark-sweep.h index b5219c644..30e2e0b0b 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -14,6 +14,8 @@ #include "serial-marker.h" #endif +#define LAZY_SWEEP 1 + #define GRANULE_SIZE 8 #define GRANULE_SIZE_LOG_2 3 #define LARGE_OBJECT_THRESHOLD 256 @@ -277,17 +279,22 @@ static size_t next_mark(const uint8_t *mark, size_t limit) { // Sweep some heap to reclaim free space. Return 1 if there is more // heap to sweep, or 0 if we reached the end. -static int sweep(struct context *cx) { +static int sweep(struct context *cx, size_t for_granules) { // Sweep until we have reclaimed 128 granules (1024 kB), or we reach // the end of the heap. ssize_t to_reclaim = 128; uintptr_t sweep = cx->sweep; uintptr_t limit = cx->base + cx->size; + if (sweep == limit) + return 0; + while (to_reclaim > 0 && sweep < limit) { uint8_t* mark = mark_byte(cx, (struct gcobj*)sweep); - size_t free_granules = next_mark(mark, - (limit - sweep) >> GRANULE_SIZE_LOG_2); + size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; + if (limit_granules > for_granules) + limit_granules = for_granules; + size_t free_granules = next_mark(mark, limit_granules); if (free_granules) { size_t free_bytes = free_granules * GRANULE_SIZE; clear_memory(sweep + GRANULE_SIZE, free_bytes - GRANULE_SIZE); @@ -296,7 +303,7 @@ static int sweep(struct context *cx) { to_reclaim -= free_granules; mark += free_granules; - if (sweep == limit) + if (free_granules == limit_granules) break; } // Object survived collection; clear mark and continue sweeping. @@ -306,7 +313,7 @@ static int sweep(struct context *cx) { } cx->sweep = sweep; - return to_reclaim < 128; + return 1; } static void* allocate_large(struct context *cx, enum alloc_kind kind, @@ -328,7 +335,7 @@ static void* allocate_large(struct context *cx, enum alloc_kind kind, } } already_scanned = cx->large_objects; - } while (sweep (cx)); + } while (sweep(cx, granules)); // No large object, and we swept across the whole heap. Collect. 
if (swept_from_beginning) { @@ -370,7 +377,7 @@ static void fill_small(struct context *cx, enum small_object_size kind) { return; } - if (!sweep(cx)) { + if (!sweep(cx, LARGE_OBJECT_GRANULE_THRESHOLD)) { if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", cx->size); abort(); @@ -402,6 +409,11 @@ static inline void* allocate(struct context *cx, enum alloc_kind kind, return allocate_small(cx, kind, granules_to_small_object_size(granules)); return allocate_large(cx, kind, granules); } +static inline void* allocate_pointerless(struct context *cx, + enum alloc_kind kind, + size_t size) { + return allocate(cx, kind, size); +} static inline void init_field(void **addr, void *val) { *addr = val; diff --git a/quads-types.h b/quads-types.h new file mode 100644 index 000000000..16a1c62d0 --- /dev/null +++ b/quads-types.h @@ -0,0 +1,9 @@ +#ifndef QUADS_TYPES_H +#define QUADS_TYPES_H + +#define FOR_EACH_HEAP_OBJECT_KIND(M) \ + M(quad, Quad, QUAD) + +#include "heap-objects.h" + +#endif // QUADS_TYPES_H diff --git a/quads.c b/quads.c new file mode 100644 index 000000000..1f7ce0ae8 --- /dev/null +++ b/quads.c @@ -0,0 +1,177 @@ +#include +#include +#include + +#include "assert.h" +#include "quads-types.h" +#include "gc.h" + +typedef struct Quad { + GC_HEADER; + struct Quad *kids[4]; +} Quad; +static inline size_t quad_size(Quad *obj) { + return sizeof(Quad); +} +static inline void +visit_quad_fields(Quad *quad, + void (*visit)(void **loc, void *visit_data), + void *visit_data) { + for (size_t i = 0; i < 4; i++) + visit((void**)&quad->kids[i], visit_data); +} +typedef HANDLE_TO(Quad) QuadHandle; + +static Quad* allocate_quad(struct context *cx) { + // memset to 0 by the collector. + return allocate(cx, ALLOC_KIND_QUAD, sizeof (Quad)); +} + +/* Get the current time in microseconds */ +static unsigned long current_time(void) +{ + struct timeval t; + if (gettimeofday(&t, NULL) == -1) + return 0; + return t.tv_sec * 1000 * 1000 + t.tv_usec; +} + +// Build tree bottom-up +static Quad* make_tree(struct context *cx, int depth) { + if (depth<=0) { + return allocate_quad(cx); + } else { + QuadHandle kids[4] = { { NULL }, }; + for (size_t i = 0; i < 4; i++) { + HANDLE_SET(kids[i], make_tree(cx, depth-1)); + PUSH_HANDLE(cx, kids[i]); + } + + Quad *result = allocate_quad(cx); + for (size_t i = 0; i < 4; i++) + init_field((void**)&result->kids[i], HANDLE_REF(kids[i])); + + for (size_t i = 0; i < 4; i++) + POP_HANDLE(cx, kids[3 - i]); + + return result; + } +} + +static void validate_tree(Quad *tree, int depth) { + for (size_t i = 0; i < 4; i++) { + if (depth == 0) { + if (tree->kids[i]) + abort(); + } else { + if (!tree->kids[i]) + abort(); + validate_tree(tree->kids[i], depth - 1); + } + } +} + +static void print_elapsed(const char *what, unsigned long start) { + unsigned long end = current_time(); + unsigned long msec = (end - start) / 1000; + unsigned long usec = (end - start) % 1000; + printf("Completed %s in %lu.%.3lu msec\n", what, msec, usec); +} + +static size_t parse_size(char *arg, const char *what) { + long val = atol(arg); + if (val <= 0) { + fprintf(stderr, "Failed to parse %s '%s'\n", what, arg); + exit(1); + } + return val; +} + +static size_t tree_size(size_t depth) { + size_t nquads = 0; + size_t leaf_count = 1; + for (size_t i = 0; i <= depth; i++) { + if (nquads > ((size_t)-1) - leaf_count) { + fprintf(stderr, + "error: address space too small for quad tree of depth %zu\n", + depth); + exit(1); + } + nquads += leaf_count; + leaf_count *= 4; + } + return nquads; +} + + +int 
main(int argc, char *argv[]) { + if (argc != 3) { + fprintf(stderr, "usage: %s DEPTH MULTIPLIER\n", argv[0]); + return 1; + } + + size_t depth = parse_size(argv[1], "depth"); + double multiplier = atof(argv[2]); + + if (!(1.0 < multiplier && multiplier < 100)) { + fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]); + return 1; + } + + // Compute byte size not counting any header word, so as to compute the same + // heap size whether a header word is there or not. + size_t nquads = tree_size(depth); + size_t tree_bytes = nquads * 4 * sizeof(Quad*); + size_t heap_size = tree_bytes * multiplier; + + printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", + heap_size / 1e9, multiplier); + struct context _cx; + struct context *cx = &_cx; + initialize_gc(cx, heap_size); + + QuadHandle quad = { NULL }; + + PUSH_HANDLE(cx, quad); + + print_start_gc_stats(cx); + + printf("Making quad tree of depth %zu (%zu nodes). Total size %.3fGB.\n", + depth, nquads, (nquads * sizeof(Quad)) / 1e9); + unsigned long start = current_time(); + HANDLE_SET(quad, make_tree(cx, depth)); + print_elapsed("construction", start); + + validate_tree(HANDLE_REF(quad), depth); + + for (size_t i = 0; i < 10; i++) { + printf("Allocating 1 GB of garbage.\n"); + size_t garbage_depth = 3; + start = current_time(); + for (size_t i = 1e9/(tree_size(garbage_depth)*4*sizeof(Quad*)); i; i--) + make_tree(cx, garbage_depth); + print_elapsed("allocating garbage", start); + +#if 0 +#ifdef LAZY_SWEEP + start = current_time(); + do {} while (sweep(cx)); + print_elapsed("finishing lazy sweep", start); +#endif + + start = current_time(); + collect(cx); + print_elapsed("collection", start); +#endif + + start = current_time(); + validate_tree(HANDLE_REF(quad), depth); + print_elapsed("validate tree", start); + } + + print_end_gc_stats(cx); + + POP_HANDLE(cx, quad); + return 0; +} + diff --git a/semi.h b/semi.h index 5fdec4a07..fc72f0b4f 100644 --- a/semi.h +++ b/semi.h @@ -27,7 +27,8 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct context *cx, size_t bytes) NEVER_INLINE; +static void collect(struct context *cx) NEVER_INLINE; +static void collect_for_alloc(struct context *cx, size_t bytes) NEVER_INLINE; static void visit(void **loc, void *visit_data); @@ -96,7 +97,7 @@ static void visit(void **loc, void *visit_data) { if (obj != NULL) *loc = forward(cx, obj); } -static void collect(struct context *cx, size_t bytes) { +static void collect(struct context *cx) { // fprintf(stderr, "start collect #%ld:\n", cx->count); flip(cx); uintptr_t grey = cx->hp; @@ -107,6 +108,9 @@ static void collect(struct context *cx, size_t bytes) { grey = scan(cx, grey); // fprintf(stderr, "%zd bytes copied\n", (cx->size>>1)-(cx->limit-cx->hp)); +} +static void collect_for_alloc(struct context *cx, size_t bytes) { + collect(cx); if (cx->limit - cx->hp < bytes) { fprintf(stderr, "ran out of space, heap size %zu\n", cx->size); abort(); @@ -119,7 +123,7 @@ static inline void* allocate(struct context *cx, enum alloc_kind kind, uintptr_t addr = cx->hp; uintptr_t new_hp = align_up (addr + size, ALIGNMENT); if (cx->limit < new_hp) { - collect(cx, size); + collect_for_alloc(cx, size); continue; } cx->hp = new_hp; @@ -132,6 +136,10 @@ static inline void* allocate(struct context *cx, enum alloc_kind kind, return ret; } } +static inline void* allocate_pointerless(struct context *cx, + enum alloc_kind kind, size_t size) { + return allocate(cx, kind, size); +} static inline void 
init_field(void **addr, void *val) { *addr = val; From f04b0bbd45b7c45a45f79f3cab0ec5687bbe0460 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 16 Mar 2022 14:28:49 +0100 Subject: [PATCH 033/403] Simplify output of quads test --- quads.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/quads.c b/quads.c index 1f7ce0ae8..93b4dcbb2 100644 --- a/quads.c +++ b/quads.c @@ -124,6 +124,7 @@ int main(int argc, char *argv[]) { size_t tree_bytes = nquads * 4 * sizeof(Quad*); size_t heap_size = tree_bytes * multiplier; + unsigned long gc_start = current_time(); printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", heap_size / 1e9, multiplier); struct context _cx; @@ -144,30 +145,22 @@ int main(int argc, char *argv[]) { validate_tree(HANDLE_REF(quad), depth); - for (size_t i = 0; i < 10; i++) { - printf("Allocating 1 GB of garbage.\n"); + size_t garbage_step = heap_size / 7.5; + printf("Allocating %.3f GB of garbage, 20 times, validating live tree each time.\n", + garbage_step / 1e9); + unsigned long garbage_start = current_time(); + for (size_t i = 0; i < 20; i++) { size_t garbage_depth = 3; start = current_time(); - for (size_t i = 1e9/(tree_size(garbage_depth)*4*sizeof(Quad*)); i; i--) + for (size_t i = garbage_step/(tree_size(garbage_depth)*4*sizeof(Quad*)); i; i--) make_tree(cx, garbage_depth); print_elapsed("allocating garbage", start); -#if 0 -#ifdef LAZY_SWEEP - start = current_time(); - do {} while (sweep(cx)); - print_elapsed("finishing lazy sweep", start); -#endif - - start = current_time(); - collect(cx); - print_elapsed("collection", start); -#endif - start = current_time(); validate_tree(HANDLE_REF(quad), depth); - print_elapsed("validate tree", start); } + print_elapsed("allocation loop", garbage_start); + print_elapsed("quads test", gc_start); print_end_gc_stats(cx); From 32ddaa76242cdd612c32a28a260137848469597a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 16 Mar 2022 21:31:51 +0100 Subject: [PATCH 034/403] Allocate GC context in GC-managed heap --- bdw.h | 3 ++- gcbench.c | 9 ++++++--- mark-sweep.h | 47 +++++++++++++++++++++++++++++++++-------------- quads.c | 4 +--- semi.h | 32 +++++++++++++++++++------------- 5 files changed, 61 insertions(+), 34 deletions(-) diff --git a/bdw.h b/bdw.h index b32f9537f..56a1162bd 100644 --- a/bdw.h +++ b/bdw.h @@ -45,7 +45,7 @@ static inline void* get_field(void **addr) { return *addr; } -static inline void initialize_gc(struct context* cx, size_t heap_size) { +static struct context* initialize_gc(size_t heap_size) { // GC_full_freq = 30; // GC_free_space_divisor = 16; // GC_enable_incremental(); @@ -55,6 +55,7 @@ static inline void initialize_gc(struct context* cx, size_t heap_size) { GC_set_max_heap_size (heap_size); GC_expand_hp(heap_size - current_heap_size); } + return GC_malloc_atomic(1); } static inline void print_start_gc_stats(struct context *cx) { diff --git a/gcbench.c b/gcbench.c index d3d9c6412..90e85a5fa 100644 --- a/gcbench.c +++ b/gcbench.c @@ -233,9 +233,12 @@ int main() { return 1; } - struct context _cx; - struct context *cx = &_cx; - initialize_gc(cx, kHeapSize); + struct context *cx = initialize_gc(kHeapSize); + if (!cx) { + fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", + kHeapSize); + return 1; + } NodeHandle root = { NULL }; NodeHandle longLivedTree = { NULL }; diff --git a/mark-sweep.h b/mark-sweep.h index 30e2e0b0b..46e90de1d 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -107,9 +107,11 @@ struct context { uintptr_t base; uint8_t 
*mark_bytes; uintptr_t heap_base; - size_t size; + size_t heap_size; uintptr_t sweep; struct handle *roots; + void *mem; + size_t mem_size; long count; struct marker marker; }; @@ -133,8 +135,9 @@ static inline void clear_memory(uintptr_t addr, size_t size) { static void collect(struct context *cx) NEVER_INLINE; static inline uint8_t* mark_byte(struct context *cx, struct gcobj *obj) { + ASSERT(cx->heap_base <= (uintptr_t) obj); + ASSERT((uintptr_t) obj < cx->heap_base + cx->heap_size); uintptr_t granule = (((uintptr_t) obj) - cx->heap_base) / GRANULE_SIZE; - ASSERT(granule < (cx->heap_base - cx->base)); return &cx->mark_bytes[granule]; } @@ -284,7 +287,7 @@ static int sweep(struct context *cx, size_t for_granules) { // the end of the heap. ssize_t to_reclaim = 128; uintptr_t sweep = cx->sweep; - uintptr_t limit = cx->base + cx->size; + uintptr_t limit = cx->heap_base + cx->heap_size; if (sweep == limit) return 0; @@ -339,7 +342,7 @@ static void* allocate_large(struct context *cx, enum alloc_kind kind, // No large object, and we swept across the whole heap. Collect. if (swept_from_beginning) { - fprintf(stderr, "ran out of space, heap size %zu\n", cx->size); + fprintf(stderr, "ran out of space, heap size %zu\n", cx->heap_size); abort(); } else { collect(cx); @@ -379,7 +382,7 @@ static void fill_small(struct context *cx, enum small_object_size kind) { if (!sweep(cx, LARGE_OBJECT_GRANULE_THRESHOLD)) { if (swept_from_beginning) { - fprintf(stderr, "ran out of space, heap size %zu\n", cx->size); + fprintf(stderr, "ran out of space, heap size %zu\n", cx->heap_size); abort(); } else { collect(cx); @@ -425,7 +428,7 @@ static inline void* get_field(void **addr) { return *addr; } -static inline void initialize_gc(struct context *cx, size_t size) { +static struct context* initialize_gc(size_t size) { #define SMALL_OBJECT_GRANULE_SIZE(i) \ ASSERT_EQ(SMALL_OBJECT_##i, small_object_sizes_for_granules[i]); \ ASSERT_EQ(SMALL_OBJECT_##i + 1, small_object_sizes_for_granules[i+1]); @@ -443,18 +446,34 @@ static inline void initialize_gc(struct context *cx, size_t size) { perror("mmap failed"); abort(); } + + struct context *cx = mem; + cx->mem = mem; + cx->mem_size = size; + size_t overhead = sizeof(*cx); + // If there is 1 mark byte per granule, and SIZE bytes available for + // HEAP_SIZE + MARK_BYTES, then: + // + // size = (granule_size + 1) / granule_size * heap_size + // mark_bytes = 1/granule_size * heap_size + // mark_bytes = ceil(size / (granule_size + 1)) + cx->mark_bytes = ((uint8_t *)mem) + overhead; + size_t mark_bytes_size = (size - overhead + GRANULE_SIZE) / (GRANULE_SIZE + 1); + overhead += mark_bytes_size; + overhead = align_up(overhead, GRANULE_SIZE); + + cx->heap_base = ((uintptr_t) mem) + overhead; + cx->heap_size = size - overhead; + clear_freelists(cx); - cx->base = (uintptr_t) mem; - cx->mark_bytes = mem; - size_t heap_admin_size = align_up(size / GRANULE_SIZE, GRANULE_SIZE); - cx->heap_base = cx->base + heap_admin_size; - cx->size = size; - cx->sweep = cx->base + cx->size; + cx->sweep = cx->heap_base + cx->heap_size; cx->roots = NULL; cx->count = 0; if (!marker_init(cx)) abort(); - reclaim(cx, (void*)cx->heap_base, size_to_granules(size - heap_admin_size)); + reclaim(cx, (void*)cx->heap_base, size_to_granules(cx->heap_size)); + + return cx; } static inline void print_start_gc_stats(struct context *cx) { @@ -462,5 +481,5 @@ static inline void print_start_gc_stats(struct context *cx) { static inline void print_end_gc_stats(struct context *cx) { printf("Completed %ld collections\n", 
cx->count); - printf("Heap size is %zd\n", cx->size); + printf("Heap size with overhead is %zd\n", cx->mem_size); } diff --git a/quads.c b/quads.c index 93b4dcbb2..5b311b9fd 100644 --- a/quads.c +++ b/quads.c @@ -127,9 +127,7 @@ int main(int argc, char *argv[]) { unsigned long gc_start = current_time(); printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", heap_size / 1e9, multiplier); - struct context _cx; - struct context *cx = &_cx; - initialize_gc(cx, heap_size); + struct context *cx = initialize_gc(heap_size); QuadHandle quad = { NULL }; diff --git a/semi.h b/semi.h index fc72f0b4f..d2918d974 100644 --- a/semi.h +++ b/semi.h @@ -9,9 +9,11 @@ struct context { uintptr_t hp; uintptr_t limit; - uintptr_t base; - size_t size; + uintptr_t heap_base; + size_t heap_size; struct handle *roots; + void *mem; + size_t mem_size; long count; }; @@ -33,12 +35,12 @@ static void collect_for_alloc(struct context *cx, size_t bytes) NEVER_INLINE; static void visit(void **loc, void *visit_data); static void flip(struct context *cx) { - uintptr_t split = cx->base + (cx->size >> 1); + uintptr_t split = cx->heap_base + (cx->heap_size >> 1); if (cx->hp <= split) { cx->hp = split; - cx->limit = cx->base + cx->size; + cx->limit = cx->heap_base + cx->heap_size; } else { - cx->hp = cx->base; + cx->hp = cx->heap_base; cx->limit = split; } cx->count++; @@ -106,13 +108,13 @@ static void collect(struct context *cx) { // fprintf(stderr, "pushed %zd bytes in roots\n", cx->hp - grey); while(grey < cx->hp) grey = scan(cx, grey); - // fprintf(stderr, "%zd bytes copied\n", (cx->size>>1)-(cx->limit-cx->hp)); + // fprintf(stderr, "%zd bytes copied\n", (cx->heap_size>>1)-(cx->limit-cx->hp)); } static void collect_for_alloc(struct context *cx, size_t bytes) { collect(cx); if (cx->limit - cx->hp < bytes) { - fprintf(stderr, "ran out of space, heap size %zu\n", cx->size); + fprintf(stderr, "ran out of space, heap size %zu\n", cx->mem_size); abort(); } } @@ -151,20 +153,24 @@ static inline void* get_field(void **addr) { return *addr; } -static inline void initialize_gc(struct context *cx, size_t size) { - size = align_up(size, getpagesize()); - +static struct context* initialize_gc(size_t size) { void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { perror("mmap failed"); abort(); } - cx->hp = cx->base = (uintptr_t) mem; - cx->size = size; + struct context *cx = mem; + cx->mem = mem; + cx->mem_size = size; + // Round up to twice ALIGNMENT so that both spaces will be aligned. 
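  // Illustration (a sketch, not part of the patch): flip() splits the space
  // after the context at heap_base + (heap_size >> 1), so keeping the
  // overhead a multiple of 2 * ALIGNMENT leaves both semispaces
  // ALIGNMENT-aligned, provided the requested size is itself a multiple of
  // 2 * ALIGNMENT (e.g. a whole number of pages):
  //
  //   mem       heap_base           heap_base + heap_size/2       mem + size
  //    | context |    semispace 0    |          semispace 1          |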
+ size_t overhead = align_up(sizeof(*cx), ALIGNMENT * 2); + cx->hp = cx->heap_base = ((uintptr_t) mem) + overhead; + cx->heap_size = size - overhead; cx->count = -1; flip(cx); cx->roots = NULL; + return cx; } static inline void print_start_gc_stats(struct context *cx) { @@ -172,5 +178,5 @@ static inline void print_start_gc_stats(struct context *cx) { static inline void print_end_gc_stats(struct context *cx) { printf("Completed %ld collections\n", cx->count); - printf("Heap size is %zd\n", cx->size); + printf("Heap size is %zd\n", cx->mem_size); } From 7dda5b992dec1b1350db1747bd34c92423625c18 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 16 Mar 2022 21:36:21 +0100 Subject: [PATCH 035/403] Refactor pop_handle to not take the handle --- conservative-roots.h | 2 +- gcbench.c | 20 ++++++++++---------- precise-roots.h | 6 +++--- quads.c | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/conservative-roots.h b/conservative-roots.h index 7f2db0abd..f5b1a5708 100644 --- a/conservative-roots.h +++ b/conservative-roots.h @@ -4,4 +4,4 @@ struct handle { void *unused; }; #define HANDLE_REF(h) h.v #define HANDLE_SET(h,val) do { h.v = val; } while (0) #define PUSH_HANDLE(cx, h) do { (void) &h; } while (0) -#define POP_HANDLE(cx, h) do { (void) &h; } while (0) +#define POP_HANDLE(cx) do { } while (0) diff --git a/gcbench.c b/gcbench.c index 90e85a5fa..d57258ccc 100644 --- a/gcbench.c +++ b/gcbench.c @@ -145,9 +145,9 @@ static void Populate(struct context *cx, int iDepth, Node *node) { set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); Populate (cx, iDepth, HANDLE_REF(self)->left); Populate (cx, iDepth, HANDLE_REF(self)->right); - POP_HANDLE(cx, r); - POP_HANDLE(cx, l); - POP_HANDLE(cx, self); + POP_HANDLE(cx); + POP_HANDLE(cx); + POP_HANDLE(cx); } } @@ -162,8 +162,8 @@ static Node* MakeTree(struct context *cx, int iDepth) { PUSH_HANDLE(cx, right); Node *result = allocate_node(cx); init_Node(result, HANDLE_REF(left), HANDLE_REF(right)); - POP_HANDLE(cx, right); - POP_HANDLE(cx, left); + POP_HANDLE(cx); + POP_HANDLE(cx); return result; } } @@ -216,7 +216,7 @@ static void TimeConstruction(struct context *cx, int depth) { tFinish - tStart); } - POP_HANDLE(cx, tempTree); + POP_HANDLE(cx); } int main() { @@ -294,9 +294,9 @@ int main() { printf("Completed in %ld msec\n", tElapsed); print_end_gc_stats(cx); - POP_HANDLE(cx, array); - POP_HANDLE(cx, tempTree); - POP_HANDLE(cx, longLivedTree); - POP_HANDLE(cx, root); + POP_HANDLE(cx); + POP_HANDLE(cx); + POP_HANDLE(cx); + POP_HANDLE(cx); } diff --git a/precise-roots.h b/precise-roots.h index 919154b99..0465083b9 100644 --- a/precise-roots.h +++ b/precise-roots.h @@ -7,13 +7,13 @@ struct handle { #define HANDLE_REF(h) h.v #define HANDLE_SET(h,val) do { h.v = val; } while (0) #define PUSH_HANDLE(cx, h) push_handle(&cx->roots, &h.handle) -#define POP_HANDLE(cx, h) pop_handle(&cx->roots, &h.handle) +#define POP_HANDLE(cx) pop_handle(&cx->roots) static inline void push_handle(struct handle **roots, struct handle *handle) { handle->next = *roots; *roots = handle; } -static inline void pop_handle(struct handle **roots, struct handle *handle) { - *roots = handle->next; +static inline void pop_handle(struct handle **roots) { + *roots = (*roots)->next; } diff --git a/quads.c b/quads.c index 5b311b9fd..b6853d5ba 100644 --- a/quads.c +++ b/quads.c @@ -52,7 +52,7 @@ static Quad* make_tree(struct context *cx, int depth) { init_field((void**)&result->kids[i], HANDLE_REF(kids[i])); for (size_t i = 0; i < 4; i++) - POP_HANDLE(cx, kids[3 - i]); + 
POP_HANDLE(cx); return result; } @@ -162,7 +162,7 @@ int main(int argc, char *argv[]) { print_end_gc_stats(cx); - POP_HANDLE(cx, quad); + POP_HANDLE(cx); return 0; } From 887bdd54418243a08a746164c46e3ad0cfd11880 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 18 Mar 2022 09:42:20 +0100 Subject: [PATCH 036/403] Clean up gcbench naming, to be consistent --- gcbench.c | 225 ++++++++++++++++++++++++++---------------------------- 1 file changed, 109 insertions(+), 116 deletions(-) diff --git a/gcbench.c b/gcbench.c index d57258ccc..c42904a73 100644 --- a/gcbench.c +++ b/gcbench.c @@ -46,24 +46,24 @@ #include "gcbench-types.h" #include "gc.h" -static const int kStretchTreeDepth = 18; // about 16Mb -static const int kLongLivedTreeDepth = 16; // about 4Mb -static const int kArraySize = 500000; // about 4Mb -static const int kMinTreeDepth = 4; -static const int kMaxTreeDepth = 16; +static const int stretch_tree_depth = 18; // about 16Mb +static const int long_lived_tree_depth = 16; // about 4Mb +static const int array_size = 500000; // about 4Mb +static const int min_tree_depth = 4; +static const int max_tree_depth = 16; -typedef struct Node { +struct Node { GC_HEADER; struct Node * left; struct Node * right; int i, j; -} Node; +}; -typedef struct DoubleArray { +struct DoubleArray { GC_HEADER; size_t length; double values[0]; -} DoubleArray; +}; static inline size_t node_size(Node *obj) { return sizeof(Node); @@ -92,7 +92,7 @@ static Node* allocate_node(struct context *cx) { return allocate(cx, ALLOC_KIND_NODE, sizeof (Node)); } -static struct DoubleArray* allocate_double_array(struct context *cx, +static DoubleArray* allocate_double_array(struct context *cx, size_t size) { // May be uninitialized. DoubleArray *ret = @@ -102,73 +102,71 @@ static struct DoubleArray* allocate_double_array(struct context *cx, return ret; } -/* Get the current time in milliseconds */ -static unsigned currentTime(void) +static unsigned long current_time(void) { - struct timeval t; - struct timezone tz; - - if (gettimeofday( &t, &tz ) == -1) - return 0; - return (t.tv_sec * 1000 + t.tv_usec / 1000); + struct timeval t = { 0 }; + gettimeofday(&t, NULL); + return t.tv_sec * 1000 * 1000 + t.tv_usec; } -void init_Node(Node *me, Node *l, Node *r) { - init_field((void**)&me->left, l); - init_field((void**)&me->right, r); +static double elapsed_millis(unsigned long start) { + return (current_time() - start) * 1e-3; } // Nodes used by a tree of a given size -static int TreeSize(int i) { +static int tree_size(int i) { return ((1 << (i + 1)) - 1); } // Number of iterations to use for a given tree depth -static int NumIters(int i) { - return 2 * TreeSize(kStretchTreeDepth) / TreeSize(i); +static int compute_num_iters(int i) { + return 2 * tree_size(stretch_tree_depth) / tree_size(i); } // Build tree top down, assigning to older objects. 
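// (Reading aid, not part of the patch: PUSH_HANDLE/POP_HANDLE maintain the
// shadow stack that precise-roots.h threads through cx->roots, so locals
// that must survive an allocation stay visible to the collector; pops must
// mirror pushes in LIFO order, which is why the preceding patch could drop
// the handle argument from POP_HANDLE, and conservative-roots.h compiles
// both macros away.  In the code below, set_field is used for stores into
// an object the collector may already know about, while init_field is
// reserved for fields of a freshly allocated object.)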
-static void Populate(struct context *cx, int iDepth, Node *node) { - if (iDepth<=0) { +static void populate(struct context *cx, int depth, Node *node) { + if (depth <= 0) return; - } else { - iDepth--; - - NodeHandle self = { node }; - PUSH_HANDLE(cx, self); - NodeHandle l = { allocate_node(cx) }; - PUSH_HANDLE(cx, l); - NodeHandle r = { allocate_node(cx) }; - PUSH_HANDLE(cx, r); - set_field((void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); - set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); - Populate (cx, iDepth, HANDLE_REF(self)->left); - Populate (cx, iDepth, HANDLE_REF(self)->right); - POP_HANDLE(cx); - POP_HANDLE(cx); - POP_HANDLE(cx); - } + + NodeHandle self = { node }; + PUSH_HANDLE(cx, self); + NodeHandle l = { allocate_node(cx) }; + PUSH_HANDLE(cx, l); + NodeHandle r = { allocate_node(cx) }; + PUSH_HANDLE(cx, r); + + set_field((void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); + set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); + + populate(cx, depth-1, HANDLE_REF(self)->left); + populate(cx, depth-1, HANDLE_REF(self)->right); + + POP_HANDLE(cx); + POP_HANDLE(cx); + POP_HANDLE(cx); } // Build tree bottom-up -static Node* MakeTree(struct context *cx, int iDepth) { - if (iDepth<=0) { +static Node* make_tree(struct context *cx, int depth) { + if (depth <= 0) return allocate_node(cx); - } else { - NodeHandle left = { MakeTree(cx, iDepth-1) }; - PUSH_HANDLE(cx, left); - NodeHandle right = { MakeTree(cx, iDepth-1) }; - PUSH_HANDLE(cx, right); - Node *result = allocate_node(cx); - init_Node(result, HANDLE_REF(left), HANDLE_REF(right)); - POP_HANDLE(cx); - POP_HANDLE(cx); - return result; - } + + NodeHandle left = { make_tree(cx, depth-1) }; + PUSH_HANDLE(cx, left); + NodeHandle right = { make_tree(cx, depth-1) }; + PUSH_HANDLE(cx, right); + + Node *result = allocate_node(cx); + init_field((void**)&result->left, HANDLE_REF(left)); + init_field((void**)&result->right, HANDLE_REF(right)); + + POP_HANDLE(cx); + POP_HANDLE(cx); + + return result; } -static void ValidateTree(Node *tree, int depth) { +static void validate_tree(Node *tree, int depth) { #ifndef NDEBUG ASSERT_EQ(tree->i, 0); ASSERT_EQ(tree->j, 0); @@ -178,120 +176,115 @@ static void ValidateTree(Node *tree, int depth) { } else { ASSERT(tree->left); ASSERT(tree->right); - ValidateTree(tree->left, depth - 1); - ValidateTree(tree->right, depth - 1); + validate_tree(tree->left, depth - 1); + validate_tree(tree->right, depth - 1); } #endif } -static void TimeConstruction(struct context *cx, int depth) { - int iNumIters = NumIters(depth); - NodeHandle tempTree = { NULL }; - PUSH_HANDLE(cx, tempTree); +static void time_construction(struct context *cx, int depth) { + int num_iters = compute_num_iters(depth); + NodeHandle temp_tree = { NULL }; + PUSH_HANDLE(cx, temp_tree); - printf("Creating %d trees of depth %d\n", iNumIters, depth); + printf("Creating %d trees of depth %d\n", num_iters, depth); { - long tStart = currentTime(); - for (int i = 0; i < iNumIters; ++i) { - HANDLE_SET(tempTree, allocate_node(cx)); - Populate(cx, depth, HANDLE_REF(tempTree)); - ValidateTree(HANDLE_REF(tempTree), depth); - HANDLE_SET(tempTree, NULL); + unsigned long start = current_time(); + for (int i = 0; i < num_iters; ++i) { + HANDLE_SET(temp_tree, allocate_node(cx)); + populate(cx, depth, HANDLE_REF(temp_tree)); + validate_tree(HANDLE_REF(temp_tree), depth); + HANDLE_SET(temp_tree, NULL); } - long tFinish = currentTime(); - printf("\tTop down construction took %ld msec\n", - tFinish - tStart); + printf("\tTop down construction took %.3f 
msec\n", + elapsed_millis(start)); } { - long tStart = currentTime(); - for (int i = 0; i < iNumIters; ++i) { - HANDLE_SET(tempTree, MakeTree(cx, depth)); - ValidateTree(HANDLE_REF(tempTree), depth); - HANDLE_SET(tempTree, NULL); + long start = current_time(); + for (int i = 0; i < num_iters; ++i) { + HANDLE_SET(temp_tree, make_tree(cx, depth)); + validate_tree(HANDLE_REF(temp_tree), depth); + HANDLE_SET(temp_tree, NULL); } - long tFinish = currentTime(); - printf("\tBottom up construction took %ld msec\n", - tFinish - tStart); + printf("\tBottom up construction took %.3f msec\n", + elapsed_millis(start)); } POP_HANDLE(cx); } int main() { - size_t kHeapMaxLive = - 2 * sizeof(struct Node) * TreeSize(kLongLivedTreeDepth) + - sizeof(double) * kArraySize; - double kHeapMultiplier = 3; - size_t kHeapSize = kHeapMaxLive * kHeapMultiplier; + size_t heap_max_live = + 2 * sizeof(Node) * tree_size(long_lived_tree_depth) + + sizeof(double) * array_size; + double heap_multiplier = 3; + size_t heap_size = heap_max_live * heap_multiplier; if (getenv("HEAP_SIZE")) - kHeapSize = atol(getenv("HEAP_SIZE")); - if (!kHeapSize) { + heap_size = atol(getenv("HEAP_SIZE")); + if (!heap_size) { fprintf(stderr, "Failed to parse HEAP_SIZE='%s'\n", getenv("HEAP_SIZE")); return 1; } - struct context *cx = initialize_gc(kHeapSize); + struct context *cx = initialize_gc(heap_size); if (!cx) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", - kHeapSize); + heap_size); return 1; } NodeHandle root = { NULL }; - NodeHandle longLivedTree = { NULL }; - NodeHandle tempTree = { NULL }; + NodeHandle long_lived_tree = { NULL }; + NodeHandle temp_tree = { NULL }; DoubleArrayHandle array = { NULL }; PUSH_HANDLE(cx, root); - PUSH_HANDLE(cx, longLivedTree); - PUSH_HANDLE(cx, tempTree); + PUSH_HANDLE(cx, long_lived_tree); + PUSH_HANDLE(cx, temp_tree); PUSH_HANDLE(cx, array); printf("Garbage Collector Test\n"); - printf(" Live storage will peak at %zd bytes.\n\n", kHeapMaxLive); + printf(" Live storage will peak at %zd bytes.\n\n", heap_max_live); printf(" Stretching memory with a binary tree of depth %d\n", - kStretchTreeDepth); + stretch_tree_depth); print_start_gc_stats(cx); - long tStart = currentTime(); + unsigned long start = current_time(); // Stretch the memory space quickly - HANDLE_SET(tempTree, MakeTree(cx, kStretchTreeDepth)); - ValidateTree(HANDLE_REF(tempTree), kStretchTreeDepth); - HANDLE_SET(tempTree, NULL); + HANDLE_SET(temp_tree, make_tree(cx, stretch_tree_depth)); + validate_tree(HANDLE_REF(temp_tree), stretch_tree_depth); + HANDLE_SET(temp_tree, NULL); // Create a long lived object printf(" Creating a long-lived binary tree of depth %d\n", - kLongLivedTreeDepth); - HANDLE_SET(longLivedTree, allocate_node(cx)); - Populate(cx, kLongLivedTreeDepth, HANDLE_REF(longLivedTree)); + long_lived_tree_depth); + HANDLE_SET(long_lived_tree, allocate_node(cx)); + populate(cx, long_lived_tree_depth, HANDLE_REF(long_lived_tree)); // Create long-lived array, filling half of it - printf(" Creating a long-lived array of %d doubles\n", kArraySize); - HANDLE_SET(array, allocate_double_array(cx, kArraySize)); - for (int i = 0; i < kArraySize/2; ++i) { + printf(" Creating a long-lived array of %d doubles\n", array_size); + HANDLE_SET(array, allocate_double_array(cx, array_size)); + for (int i = 0; i < array_size/2; ++i) { HANDLE_REF(array)->values[i] = 1.0/i; } - for (int d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { - TimeConstruction(cx, d); + for (int d = min_tree_depth; d <= max_tree_depth; d += 2) { + 
time_construction(cx, d); } - ValidateTree(HANDLE_REF(longLivedTree), kLongLivedTreeDepth); + validate_tree(HANDLE_REF(long_lived_tree), long_lived_tree_depth); - if (HANDLE_REF(longLivedTree) == 0 + // Fake reference to LongLivedTree and array to keep them from being optimized + // away. + if (HANDLE_REF(long_lived_tree) == 0 || HANDLE_REF(array)->values[1000] != 1.0/1000) fprintf(stderr, "Failed\n"); - // fake reference to LongLivedTree - // and array - // to keep them from being optimized away - long tFinish = currentTime(); - long tElapsed = tFinish - tStart; - printf("Completed in %ld msec\n", tElapsed); + printf("Completed in %.3f msec\n", elapsed_millis(start)); print_end_gc_stats(cx); POP_HANDLE(cx); From 4b7fb84ba05aa70a08cacf9b7996b4b351d2d86e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 18 Mar 2022 14:29:59 +0100 Subject: [PATCH 037/403] gcbench takes heap multiplier on command line --- gcbench.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/gcbench.c b/gcbench.c index c42904a73..1affc78b3 100644 --- a/gcbench.c +++ b/gcbench.c @@ -215,20 +215,27 @@ static void time_construction(struct context *cx, int depth) { POP_HANDLE(cx); } -int main() { +int main(int argc, char *argv[]) { + // Define size of Node without any GC header. + size_t sizeof_node = 2 * sizeof(Node*) + 2 * sizeof(int); + size_t sizeof_double_array = sizeof(size_t); size_t heap_max_live = - 2 * sizeof(Node) * tree_size(long_lived_tree_depth) + - sizeof(double) * array_size; - double heap_multiplier = 3; - size_t heap_size = heap_max_live * heap_multiplier; - - if (getenv("HEAP_SIZE")) - heap_size = atol(getenv("HEAP_SIZE")); - if (!heap_size) { - fprintf(stderr, "Failed to parse HEAP_SIZE='%s'\n", getenv("HEAP_SIZE")); + tree_size(long_lived_tree_depth) * sizeof_node + + tree_size(max_tree_depth) * sizeof_node + + sizeof_double_array + sizeof(double) * array_size; + if (argc != 2) { + fprintf(stderr, "usage: %s MULTIPLIER\n", argv[0]); return 1; } + double multiplier = atof(argv[1]); + + if (!(1.0 < multiplier && multiplier < 100)) { + fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]); + return 1; + } + + size_t heap_size = heap_max_live * multiplier; struct context *cx = initialize_gc(heap_size); if (!cx) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", From e703568857bca39805f2b3531a122bd819af38db Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 18 Mar 2022 14:36:48 +0100 Subject: [PATCH 038/403] Remove heap-stretching phase We should separate evaluation of the heap stretching heuristics from the evaluation of the GC itself, otherwise our analysis of the GC itself will be too sensitive to the details of the final heap size. Anyway this doesn't affect results as we already specified the heap size precisely. 
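For reference, a rough sketch of where that fixed heap size comes from,
using the sizing introduced in the previous patch (byte counts assume a
64-bit target and ignore any GC header word):

    size_t sizeof_node = 2 * sizeof(Node*) + 2 * sizeof(int);    // 24 bytes
    size_t heap_max_live =
      tree_size(long_lived_tree_depth) * sizeof_node             // ~3.1 MB
      + tree_size(max_tree_depth) * sizeof_node                  // ~3.1 MB
      + sizeof(size_t) + sizeof(double) * array_size;            // ~4.0 MB
    size_t heap_size = heap_max_live * multiplier;               // ~31 MB at 3x

With the heap pinned to heap_max_live * multiplier, the stretch tree no
longer influences the final heap size, so dropping it loses nothing.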
--- gcbench.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/gcbench.c b/gcbench.c index 1affc78b3..0b530669c 100644 --- a/gcbench.c +++ b/gcbench.c @@ -46,7 +46,6 @@ #include "gcbench-types.h" #include "gc.h" -static const int stretch_tree_depth = 18; // about 16Mb static const int long_lived_tree_depth = 16; // about 4Mb static const int array_size = 500000; // about 4Mb static const int min_tree_depth = 4; @@ -120,7 +119,7 @@ static int tree_size(int i) { // Number of iterations to use for a given tree depth static int compute_num_iters(int i) { - return 2 * tree_size(stretch_tree_depth) / tree_size(i); + return 2 * tree_size(max_tree_depth + 2) / tree_size(i); } // Build tree top down, assigning to older objects. @@ -255,17 +254,10 @@ int main(int argc, char *argv[]) { printf("Garbage Collector Test\n"); printf(" Live storage will peak at %zd bytes.\n\n", heap_max_live); - printf(" Stretching memory with a binary tree of depth %d\n", - stretch_tree_depth); print_start_gc_stats(cx); - + unsigned long start = current_time(); - // Stretch the memory space quickly - HANDLE_SET(temp_tree, make_tree(cx, stretch_tree_depth)); - validate_tree(HANDLE_REF(temp_tree), stretch_tree_depth); - HANDLE_SET(temp_tree, NULL); - // Create a long lived object printf(" Creating a long-lived binary tree of depth %d\n", long_lived_tree_depth); From d63288048c3ff9cc0c597542b3882b1db0c9f52d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 18 Mar 2022 16:19:42 +0100 Subject: [PATCH 039/403] Add mt-gcbench --- Makefile | 2 +- bdw.h | 12 +- mt-gcbench-types.h | 10 ++ mt-gcbench.c | 359 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 381 insertions(+), 2 deletions(-) create mode 100644 mt-gcbench-types.h create mode 100644 mt-gcbench.c diff --git a/Makefile b/Makefile index 9a33422d5..ca644681c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -TESTS=gcbench quads # MT_GCBench MT_GCBench2 +TESTS=gcbench quads mt-gcbench # MT_GCBench MT_GCBench2 COLLECTORS=bdw semi mark-sweep parallel-mark-sweep CC=gcc diff --git a/bdw.h b/bdw.h index 56a1162bd..968dd22b2 100644 --- a/bdw.h +++ b/bdw.h @@ -55,12 +55,22 @@ static struct context* initialize_gc(size_t heap_size) { GC_set_max_heap_size (heap_size); GC_expand_hp(heap_size - current_heap_size); } + GC_allow_register_threads(); return GC_malloc_atomic(1); } +static struct context* initialize_gc_for_thread(uintptr_t *stack_base, + struct context *parent) { + struct GC_stack_base base = { stack_base }; + GC_register_my_thread(&base); + return GC_malloc_atomic(1); +} +static void finish_gc_for_thread(struct context *cx) { + GC_unregister_my_thread(); +} + static inline void print_start_gc_stats(struct context *cx) { } - static inline void print_end_gc_stats(struct context *cx) { printf("Completed %ld collections\n", (long)GC_get_gc_no()); printf("Heap size is %ld\n", (long)GC_get_heap_size()); diff --git a/mt-gcbench-types.h b/mt-gcbench-types.h new file mode 100644 index 000000000..a61b2b7d5 --- /dev/null +++ b/mt-gcbench-types.h @@ -0,0 +1,10 @@ +#ifndef GCBENCH_TYPES_H +#define GCBENCH_TYPES_H + +#define FOR_EACH_HEAP_OBJECT_KIND(M) \ + M(node, Node, NODE) \ + M(double_array, DoubleArray, DOUBLE_ARRAY) + +#include "heap-objects.h" + +#endif // GCBENCH_TYPES_H diff --git a/mt-gcbench.c b/mt-gcbench.c new file mode 100644 index 000000000..269df9afe --- /dev/null +++ b/mt-gcbench.c @@ -0,0 +1,359 @@ +// This is adapted from a benchmark written by John Ellis and Pete Kovac +// of Post Communications. 
+// It was modified by Hans Boehm of Silicon Graphics. +// Translated to C++ 30 May 1997 by William D Clinger of Northeastern Univ. +// Translated to C 15 March 2000 by Hans Boehm, now at HP Labs. +// +// This is no substitute for real applications. No actual application +// is likely to behave in exactly this way. However, this benchmark was +// designed to be more representative of real applications than other +// Java GC benchmarks of which we are aware. +// It attempts to model those properties of allocation requests that +// are important to current GC techniques. +// It is designed to be used either to obtain a single overall performance +// number, or to give a more detailed estimate of how collector +// performance varies with object lifetimes. It prints the time +// required to allocate and collect balanced binary trees of various +// sizes. Smaller trees result in shorter object lifetimes. Each cycle +// allocates roughly the same amount of memory. +// Two data structures are kept around during the entire process, so +// that the measured performance is representative of applications +// that maintain some live in-memory data. One of these is a tree +// containing many pointers. The other is a large array containing +// double precision floating point numbers. Both should be of comparable +// size. +// +// The results are only really meaningful together with a specification +// of how much memory was used. It is possible to trade memory for +// better time performance. This benchmark should be run in a 32 MB +// heap, though we don't currently know how to enforce that uniformly. +// +// Unlike the original Ellis and Kovac benchmark, we do not attempt +// measure pause times. This facility should eventually be added back +// in. There are several reasons for omitting it for now. The original +// implementation depended on assumptions about the thread scheduler +// that don't hold uniformly. The results really measure both the +// scheduler and GC. Pause time measurements tend to not fit well with +// current benchmark suites. As far as we know, none of the current +// commercial Java implementations seriously attempt to minimize GC pause +// times. + +#include +#include +#include +#include +#include + +#include "assert.h" +#include "gcbench-types.h" +#include "gc.h" +#include "inline.h" + +#define MAX_THREAD_COUNT 256 + +static const int long_lived_tree_depth = 16; // about 4Mb +static const int array_size = 500000; // about 4Mb +static const int min_tree_depth = 4; +static const int max_tree_depth = 16; + +struct Node { + GC_HEADER; + struct Node * left; + struct Node * right; + int i, j; +}; + +struct DoubleArray { + GC_HEADER; + size_t length; + double values[0]; +}; + +static inline size_t node_size(Node *obj) { + return sizeof(Node); +} +static inline size_t double_array_size(DoubleArray *array) { + return sizeof(*array) + array->length * sizeof(double); +} +static inline void +visit_node_fields(Node *node, + void (*visit)(void **loc, void *visit_data), + void *visit_data) { + visit((void**)&node->left, visit_data); + visit((void**)&node->right, visit_data); +} +static inline void +visit_double_array_fields(DoubleArray *obj, + void (*visit)(void **loc, void *visit_data), + void *visit_data) { +} + +typedef HANDLE_TO(Node) NodeHandle; +typedef HANDLE_TO(DoubleArray) DoubleArrayHandle; + +static Node* allocate_node(struct context *cx) { + // memset to 0 by the collector. 
+ return allocate(cx, ALLOC_KIND_NODE, sizeof (Node)); +} + +static DoubleArray* allocate_double_array(struct context *cx, + size_t size) { + // May be uninitialized. + DoubleArray *ret = + allocate_pointerless(cx, ALLOC_KIND_DOUBLE_ARRAY, + sizeof(DoubleArray) + sizeof (double) * size); + ret->length = size; + return ret; +} + +static unsigned long current_time(void) +{ + struct timeval t = { 0 }; + gettimeofday(&t, NULL); + return t.tv_sec * 1000 * 1000 + t.tv_usec; +} + +static double elapsed_millis(unsigned long start) { + return (current_time() - start) * 1e-3; +} + +// Nodes used by a tree of a given size +static int tree_size(int i) { + return ((1 << (i + 1)) - 1); +} + +// Number of iterations to use for a given tree depth +static int compute_num_iters(int i) { + return 2 * tree_size(max_tree_depth + 2) / tree_size(i); +} + +// Build tree top down, assigning to older objects. +static void populate(struct context *cx, int depth, Node *node) { + if (depth <= 0) + return; + + NodeHandle self = { node }; + PUSH_HANDLE(cx, self); + NodeHandle l = { allocate_node(cx) }; + PUSH_HANDLE(cx, l); + NodeHandle r = { allocate_node(cx) }; + PUSH_HANDLE(cx, r); + + set_field((void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); + set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); + + populate(cx, depth-1, HANDLE_REF(self)->left); + populate(cx, depth-1, HANDLE_REF(self)->right); + + POP_HANDLE(cx); + POP_HANDLE(cx); + POP_HANDLE(cx); +} + +// Build tree bottom-up +static Node* make_tree(struct context *cx, int depth) { + if (depth <= 0) + return allocate_node(cx); + + NodeHandle left = { make_tree(cx, depth-1) }; + PUSH_HANDLE(cx, left); + NodeHandle right = { make_tree(cx, depth-1) }; + PUSH_HANDLE(cx, right); + + Node *result = allocate_node(cx); + init_field((void**)&result->left, HANDLE_REF(left)); + init_field((void**)&result->right, HANDLE_REF(right)); + + POP_HANDLE(cx); + POP_HANDLE(cx); + + return result; +} + +static void validate_tree(Node *tree, int depth) { +#ifndef NDEBUG + ASSERT_EQ(tree->i, 0); + ASSERT_EQ(tree->j, 0); + if (depth == 0) { + ASSERT(!tree->left); + ASSERT(!tree->right); + } else { + ASSERT(tree->left); + ASSERT(tree->right); + validate_tree(tree->left, depth - 1); + validate_tree(tree->right, depth - 1); + } +#endif +} + +static void time_construction(struct context *cx, int depth) { + int num_iters = compute_num_iters(depth); + NodeHandle temp_tree = { NULL }; + PUSH_HANDLE(cx, temp_tree); + + printf("Creating %d trees of depth %d\n", num_iters, depth); + + { + unsigned long start = current_time(); + for (int i = 0; i < num_iters; ++i) { + HANDLE_SET(temp_tree, allocate_node(cx)); + populate(cx, depth, HANDLE_REF(temp_tree)); + validate_tree(HANDLE_REF(temp_tree), depth); + HANDLE_SET(temp_tree, NULL); + } + printf("\tTop down construction took %.3f msec\n", + elapsed_millis(start)); + } + + { + long start = current_time(); + for (int i = 0; i < num_iters; ++i) { + HANDLE_SET(temp_tree, make_tree(cx, depth)); + validate_tree(HANDLE_REF(temp_tree), depth); + HANDLE_SET(temp_tree, NULL); + } + printf("\tBottom up construction took %.3f msec\n", + elapsed_millis(start)); + } + + POP_HANDLE(cx); +} + +static void* call_with_stack_base(void* (*)(uintptr_t*, void*), void*) NEVER_INLINE; +static void* call_with_stack_base_inner(void* (*)(uintptr_t*, void*), uintptr_t*, void*) NEVER_INLINE; +static void* call_with_stack_base_inner(void* (*f)(uintptr_t *stack_base, void *arg), + uintptr_t *stack_base, void *arg) { + return f(stack_base, arg); +} +static void* 
call_with_stack_base(void* (*f)(uintptr_t *stack_base, void *arg), + void *arg) { + uintptr_t x; + return call_with_stack_base_inner(f, &x, arg); +} + +struct call_with_gc_data { + void* (*f)(struct context *); + struct context *parent; +}; +static void* call_with_gc_inner(uintptr_t *stack_base, void *arg) { + struct call_with_gc_data *data = arg; + struct context *cx = initialize_gc_for_thread(stack_base, data->parent); + void *ret = data->f(cx); + finish_gc_for_thread(cx); + return ret; +} +static void* call_with_gc(void* (*f)(struct context *), + struct context *parent) { + struct call_with_gc_data data = { f, parent }; + return call_with_stack_base(call_with_gc_inner, &data); +} + +static void* run_one_test(struct context *cx) { + NodeHandle long_lived_tree = { NULL }; + NodeHandle temp_tree = { NULL }; + DoubleArrayHandle array = { NULL }; + + PUSH_HANDLE(cx, long_lived_tree); + PUSH_HANDLE(cx, temp_tree); + PUSH_HANDLE(cx, array); + + // Create a long lived object + printf(" Creating a long-lived binary tree of depth %d\n", + long_lived_tree_depth); + HANDLE_SET(long_lived_tree, allocate_node(cx)); + populate(cx, long_lived_tree_depth, HANDLE_REF(long_lived_tree)); + + // Create long-lived array, filling half of it + printf(" Creating a long-lived array of %d doubles\n", array_size); + HANDLE_SET(array, allocate_double_array(cx, array_size)); + for (int i = 0; i < array_size/2; ++i) { + HANDLE_REF(array)->values[i] = 1.0/i; + } + + for (int d = min_tree_depth; d <= max_tree_depth; d += 2) { + time_construction(cx, d); + } + + validate_tree(HANDLE_REF(long_lived_tree), long_lived_tree_depth); + + // Fake reference to LongLivedTree and array to keep them from being optimized + // away. + if (HANDLE_REF(long_lived_tree) == 0 + || HANDLE_REF(array)->values[1000] != 1.0/1000) + fprintf(stderr, "Failed\n"); + + POP_HANDLE(cx); + POP_HANDLE(cx); + POP_HANDLE(cx); + return NULL; +} + +static void* run_one_test_in_thread(void *arg) { + struct context *parent_cx = arg; + return call_with_gc(run_one_test, parent_cx); +} + +int main(int argc, char *argv[]) { + // Define size of Node without any GC header. + size_t sizeof_node = 2 * sizeof(Node*) + 2 * sizeof(int); + size_t sizeof_double_array = sizeof(size_t); + size_t heap_max_live = + tree_size(long_lived_tree_depth) * sizeof_node + + tree_size(max_tree_depth) * sizeof_node + + sizeof_double_array + sizeof(double) * array_size; + if (argc != 3) { + fprintf(stderr, "usage: %s MULTIPLIER NTHREADS\n", argv[0]); + return 1; + } + + double multiplier = atof(argv[1]); + size_t nthreads = atol(argv[2]); + + if (!(1.0 < multiplier && multiplier < 100)) { + fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]); + return 1; + } + if (nthreads < 1 || nthreads > MAX_THREAD_COUNT) { + fprintf(stderr, "Expected integer between 1 and %d for thread count, got '%s'\n", + (int)MAX_THREAD_COUNT, argv[2]); + return 1; + } + + size_t heap_size = heap_max_live * multiplier * nthreads; + struct context *cx = initialize_gc(heap_size); + if (!cx) { + fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", + heap_size); + return 1; + } + + printf("Garbage Collector Test\n"); + printf(" Live storage will peak at %zd bytes.\n\n", heap_max_live); + print_start_gc_stats(cx); + + unsigned long start = current_time(); + + pthread_t threads[MAX_THREAD_COUNT]; + // Run one of the threads in the main thread. 
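  // (That is: thread 0 is the main thread, which already has its GC context
  // from initialize_gc and calls run_one_test directly below, while threads
  // 1..nthreads-1 go through call_with_gc, which registers each of them with
  // the collector via initialize_gc_for_thread using that thread's own stack
  // base.  Hence the loop starting at i = 1.)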
+ for (size_t i = 1; i < nthreads; i++) { + int status = pthread_create(&threads[i], NULL, run_one_test_in_thread, cx); + if (status) { + errno = status; + perror("Failed to create thread"); + return 1; + } + } + run_one_test(cx); + for (size_t i = 1; i < nthreads; i++) { + int status = pthread_join(threads[i], NULL); + if (status) { + errno = status; + perror("Failed to join thread"); + return 1; + } + } + + printf("Completed in %.3f msec\n", elapsed_millis(start)); + print_end_gc_stats(cx); +} + From a654a790b9fb49ae00b325817fa71ffa8d9f4c5d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 18 Mar 2022 22:57:41 +0100 Subject: [PATCH 040/403] Add inline allocation for small objects for bdw-gc --- bdw.h | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 8 deletions(-) diff --git a/bdw.h b/bdw.h index 968dd22b2..ae7518f04 100644 --- a/bdw.h +++ b/bdw.h @@ -15,20 +15,86 @@ #define GC_NO_THREAD_REDIRECTS 1 #include +#include /* GC_generic_malloc_many */ -struct context {}; +#define GC_INLINE_GRANULE_WORDS 2 +#define GC_INLINE_GRANULE_BYTES (sizeof(void *) * GC_INLINE_GRANULE_WORDS) + +/* A freelist set contains GC_INLINE_FREELIST_COUNT pointers to singly + linked lists of objects of different sizes, the ith one containing + objects i + 1 granules in size. This setting of + GC_INLINE_FREELIST_COUNT will hold freelists for allocations of + up to 256 bytes. */ +#define GC_INLINE_FREELIST_COUNT (256U / GC_INLINE_GRANULE_BYTES) + +struct context { + void *freelists[GC_INLINE_FREELIST_COUNT]; + void *pointerless_freelists[GC_INLINE_FREELIST_COUNT]; +}; + +static inline size_t gc_inline_bytes_to_freelist_index(size_t bytes) { + return (bytes - 1U) / GC_INLINE_GRANULE_BYTES; +} +static inline size_t gc_inline_freelist_object_size(size_t idx) { + return (idx + 1U) * GC_INLINE_GRANULE_BYTES; +} + +// The values of these must match the internal POINTERLESS and NORMAL +// definitions in libgc, for which unfortunately there are no external +// definitions. Alack. 
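// Worked example of the freelist sizing above (illustrative; assumes a
// 64-bit target, where GC_INLINE_GRANULE_BYTES == 16): a 24-byte request
// maps to index (24 - 1) / 16 == 1, whose list holds (1 + 1) * 16 == 32-byte
// objects; GC_INLINE_FREELIST_COUNT == 256 / 16 == 16, so requests larger
// than 256 bytes skip the freelists and fall through to GC_malloc or
// GC_malloc_atomic in the allocate functions below.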
+enum gc_inline_kind { + GC_INLINE_KIND_POINTERLESS, + GC_INLINE_KIND_NORMAL +}; + +static void* allocate_small_slow(void **freelist, size_t idx, + enum gc_inline_kind kind) NEVER_INLINE; +static void* allocate_small_slow(void **freelist, size_t idx, + enum gc_inline_kind kind) { + size_t bytes = gc_inline_freelist_object_size(idx); + GC_generic_malloc_many(bytes, kind, freelist); + void *head = *freelist; + if (UNLIKELY (!head)) { + fprintf(stderr, "ran out of space, heap size %zu\n", + GC_get_heap_size()); + abort(); + } + *freelist = *(void **)(head); + return head; +} + +static inline void * +allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) { + void *head = *freelist; + + if (UNLIKELY (!head)) + return allocate_small_slow(freelist, idx, kind); + + *freelist = *(void **)(head); + return head; +} #define GC_HEADER /**/ static inline void* allocate(struct context *cx, enum alloc_kind kind, size_t size) { - return GC_malloc(size); + size_t idx = gc_inline_bytes_to_freelist_index(size); + + if (UNLIKELY(idx >= GC_INLINE_FREELIST_COUNT)) + return GC_malloc(size); + + return allocate_small(&cx->freelists[idx], idx, GC_INLINE_KIND_NORMAL); } -static inline void* -allocate_pointerless(struct context *cx, enum alloc_kind kind, - size_t size) { - return GC_malloc_atomic(size); +static inline void* allocate_pointerless(struct context *cx, + enum alloc_kind kind, size_t size) { + size_t idx = gc_inline_bytes_to_freelist_index(size); + + if (UNLIKELY (idx >= GC_INLINE_FREELIST_COUNT)) + return GC_malloc_atomic(size); + + return allocate_small(&cx->pointerless_freelists[idx], idx, + GC_INLINE_KIND_POINTERLESS); } static inline void collect(struct context *cx) { @@ -56,14 +122,14 @@ static struct context* initialize_gc(size_t heap_size) { GC_expand_hp(heap_size - current_heap_size); } GC_allow_register_threads(); - return GC_malloc_atomic(1); + return GC_malloc(sizeof(struct context)); } static struct context* initialize_gc_for_thread(uintptr_t *stack_base, struct context *parent) { struct GC_stack_base base = { stack_base }; GC_register_my_thread(&base); - return GC_malloc_atomic(1); + return GC_malloc(sizeof(struct context)); } static void finish_gc_for_thread(struct context *cx) { GC_unregister_my_thread(); From 883a761775071bf931f6f314f31d9c21db95c1dd Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 20 Mar 2022 21:03:26 +0100 Subject: [PATCH 041/403] Stub out support for multiple mutator threads on semi, mark-sweep For semi probably we never implement support for multiple mutator threads. We will do local freelists for mark-sweep though. --- Makefile | 13 ++++++++----- mark-sweep.h | 9 +++++++++ semi.h | 9 +++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index ca644681c..6237ac936 100644 --- a/Makefile +++ b/Makefile @@ -2,23 +2,26 @@ TESTS=gcbench quads mt-gcbench # MT_GCBench MT_GCBench2 COLLECTORS=bdw semi mark-sweep parallel-mark-sweep CC=gcc -CFLAGS=-Wall -O2 -g -fno-strict-aliasing +CFLAGS=-Wall -O2 -g -fno-strict-aliasing -Wno-unused -DNDEBUG +INCLUDES=-I. +LDFLAGS=-lpthread +COMPILE=$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) all: $(ALL_TESTS) bdw-%: bdw.h conservative-roots.h %-types.h %.c - $(CC) $(CFLAGS) -DNDEBUG -lpthread `pkg-config --libs --cflags bdw-gc` -I. -DGC_BDW -o $@ $*.c + $(COMPILE) `pkg-config --libs --cflags bdw-gc` -DGC_BDW -o $@ $*.c semi-%: semi.h precise-roots.h %-types.h heap-objects.h %.c - $(CC) $(CFLAGS) -I. 
-DNDEBUG -DGC_SEMI -o $@ $*.c + $(COMPILE) -DGC_SEMI -o $@ $*.c mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %-types.h heap-objects.h %.c - $(CC) $(CFLAGS) -I. -Wno-unused -DNDEBUG -DGC_MARK_SWEEP -o $@ $*.c + $(COMPILE) -DGC_MARK_SWEEP -o $@ $*.c parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-marker.h assert.h debug.h %-types.h heap-objects.h %.c - $(CC) $(CFLAGS) -I. -Wno-unused -DNDEBUG -DGC_PARALLEL_MARK_SWEEP -lpthread -o $@ $*.c + $(COMPILE) -DGC_PARALLEL_MARK_SWEEP -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/mark-sweep.h b/mark-sweep.h index 46e90de1d..d82f4d0c2 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -476,6 +476,15 @@ static struct context* initialize_gc(size_t size) { return cx; } +static struct context* initialize_gc_for_thread(uintptr_t *stack_base, + struct context *parent) { + fprintf(stderr, + "Multiple mutator threads not yet implemented.\n"); + exit(1); +} +static void finish_gc_for_thread(struct context *cx) { +} + static inline void print_start_gc_stats(struct context *cx) { } diff --git a/semi.h b/semi.h index d2918d974..72f8d0e19 100644 --- a/semi.h +++ b/semi.h @@ -173,6 +173,15 @@ static struct context* initialize_gc(size_t size) { return cx; } +static struct context* initialize_gc_for_thread(uintptr_t *stack_base, + struct context *parent) { + fprintf(stderr, + "Semispace copying collector not appropriate for multithreaded use.\n"); + exit(1); +} +static void finish_gc_for_thread(struct context *cx) { +} + static inline void print_start_gc_stats(struct context *cx) { } From 06a213d1edb2afc1493be9135c1224b63ecc95e0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 28 Mar 2022 20:49:24 +0200 Subject: [PATCH 042/403] Adapt GC API to have separate heap and mutator structs Only BDW is adapted, so far. --- bdw.h | 56 ++++++++++++++++++------- mt-gcbench.c | 116 +++++++++++++++++++++++++-------------------------- 2 files changed, 99 insertions(+), 73 deletions(-) diff --git a/bdw.h b/bdw.h index ae7518f04..d0d24f7e2 100644 --- a/bdw.h +++ b/bdw.h @@ -27,9 +27,15 @@ up to 256 bytes. 
*/ #define GC_INLINE_FREELIST_COUNT (256U / GC_INLINE_GRANULE_BYTES) -struct context { +struct heap { + pthread_mutex_t lock; + int multithreaded; +}; + +struct mutator { void *freelists[GC_INLINE_FREELIST_COUNT]; void *pointerless_freelists[GC_INLINE_FREELIST_COUNT]; + struct heap *heap; }; static inline size_t gc_inline_bytes_to_freelist_index(size_t bytes) { @@ -76,28 +82,28 @@ allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) { #define GC_HEADER /**/ -static inline void* allocate(struct context *cx, enum alloc_kind kind, +static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t size) { size_t idx = gc_inline_bytes_to_freelist_index(size); if (UNLIKELY(idx >= GC_INLINE_FREELIST_COUNT)) return GC_malloc(size); - return allocate_small(&cx->freelists[idx], idx, GC_INLINE_KIND_NORMAL); + return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL); } -static inline void* allocate_pointerless(struct context *cx, +static inline void* allocate_pointerless(struct mutator *mut, enum alloc_kind kind, size_t size) { size_t idx = gc_inline_bytes_to_freelist_index(size); if (UNLIKELY (idx >= GC_INLINE_FREELIST_COUNT)) return GC_malloc_atomic(size); - return allocate_small(&cx->pointerless_freelists[idx], idx, + return allocate_small(&mut->pointerless_freelists[idx], idx, GC_INLINE_KIND_POINTERLESS); } -static inline void collect(struct context *cx) { +static inline void collect(struct mutator *mut) { GC_gcollect(); } @@ -111,7 +117,18 @@ static inline void* get_field(void **addr) { return *addr; } -static struct context* initialize_gc(size_t heap_size) { +static inline struct mutator *add_mutator(struct heap *heap) { + struct mutator *ret = GC_malloc(sizeof(struct mutator)); + ret->heap = heap; + return ret; +} + +static inline struct heap *mutator_heap(struct mutator *mutator) { + return mutator->heap; +} + +static int initialize_gc(size_t heap_size, struct heap **heap, + struct mutator **mutator) { // GC_full_freq = 30; // GC_free_space_divisor = 16; // GC_enable_incremental(); @@ -121,23 +138,32 @@ static struct context* initialize_gc(size_t heap_size) { GC_set_max_heap_size (heap_size); GC_expand_hp(heap_size - current_heap_size); } - GC_allow_register_threads(); - return GC_malloc(sizeof(struct context)); + *heap = GC_malloc(sizeof(struct heap)); + pthread_mutex_init(&(*heap)->lock, NULL); + *mutator = add_mutator(*heap); + return 1; } -static struct context* initialize_gc_for_thread(uintptr_t *stack_base, - struct context *parent) { +static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, + struct heap *heap) { + pthread_mutex_lock(&heap->lock); + if (!heap->multithreaded) { + GC_allow_register_threads(); + heap->multithreaded = 1; + } + pthread_mutex_unlock(&heap->lock); + struct GC_stack_base base = { stack_base }; GC_register_my_thread(&base); - return GC_malloc(sizeof(struct context)); + return add_mutator(heap); } -static void finish_gc_for_thread(struct context *cx) { +static void finish_gc_for_thread(struct mutator *mut) { GC_unregister_my_thread(); } -static inline void print_start_gc_stats(struct context *cx) { +static inline void print_start_gc_stats(struct heap *heap) { } -static inline void print_end_gc_stats(struct context *cx) { +static inline void print_end_gc_stats(struct heap *heap) { printf("Completed %ld collections\n", (long)GC_get_gc_no()); printf("Heap size is %ld\n", (long)GC_get_heap_size()); } diff --git a/mt-gcbench.c b/mt-gcbench.c index 269df9afe..8f6d007e5 100644 --- a/mt-gcbench.c +++ 
b/mt-gcbench.c @@ -91,16 +91,16 @@ visit_double_array_fields(DoubleArray *obj, typedef HANDLE_TO(Node) NodeHandle; typedef HANDLE_TO(DoubleArray) DoubleArrayHandle; -static Node* allocate_node(struct context *cx) { +static Node* allocate_node(struct mutator *mut) { // memset to 0 by the collector. - return allocate(cx, ALLOC_KIND_NODE, sizeof (Node)); + return allocate(mut, ALLOC_KIND_NODE, sizeof (Node)); } -static DoubleArray* allocate_double_array(struct context *cx, +static DoubleArray* allocate_double_array(struct mutator *mut, size_t size) { // May be uninitialized. DoubleArray *ret = - allocate_pointerless(cx, ALLOC_KIND_DOUBLE_ARRAY, + allocate_pointerless(mut, ALLOC_KIND_DOUBLE_ARRAY, sizeof(DoubleArray) + sizeof (double) * size); ret->length = size; return ret; @@ -128,44 +128,44 @@ static int compute_num_iters(int i) { } // Build tree top down, assigning to older objects. -static void populate(struct context *cx, int depth, Node *node) { +static void populate(struct mutator *mut, int depth, Node *node) { if (depth <= 0) return; NodeHandle self = { node }; - PUSH_HANDLE(cx, self); - NodeHandle l = { allocate_node(cx) }; - PUSH_HANDLE(cx, l); - NodeHandle r = { allocate_node(cx) }; - PUSH_HANDLE(cx, r); + PUSH_HANDLE(mut, self); + NodeHandle l = { allocate_node(mut) }; + PUSH_HANDLE(mut, l); + NodeHandle r = { allocate_node(mut) }; + PUSH_HANDLE(mut, r); set_field((void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); - populate(cx, depth-1, HANDLE_REF(self)->left); - populate(cx, depth-1, HANDLE_REF(self)->right); + populate(mut, depth-1, HANDLE_REF(self)->left); + populate(mut, depth-1, HANDLE_REF(self)->right); - POP_HANDLE(cx); - POP_HANDLE(cx); - POP_HANDLE(cx); + POP_HANDLE(mut); + POP_HANDLE(mut); + POP_HANDLE(mut); } // Build tree bottom-up -static Node* make_tree(struct context *cx, int depth) { +static Node* make_tree(struct mutator *mut, int depth) { if (depth <= 0) - return allocate_node(cx); + return allocate_node(mut); - NodeHandle left = { make_tree(cx, depth-1) }; - PUSH_HANDLE(cx, left); - NodeHandle right = { make_tree(cx, depth-1) }; - PUSH_HANDLE(cx, right); + NodeHandle left = { make_tree(mut, depth-1) }; + PUSH_HANDLE(mut, left); + NodeHandle right = { make_tree(mut, depth-1) }; + PUSH_HANDLE(mut, right); - Node *result = allocate_node(cx); + Node *result = allocate_node(mut); init_field((void**)&result->left, HANDLE_REF(left)); init_field((void**)&result->right, HANDLE_REF(right)); - POP_HANDLE(cx); - POP_HANDLE(cx); + POP_HANDLE(mut); + POP_HANDLE(mut); return result; } @@ -186,18 +186,18 @@ static void validate_tree(Node *tree, int depth) { #endif } -static void time_construction(struct context *cx, int depth) { +static void time_construction(struct mutator *mut, int depth) { int num_iters = compute_num_iters(depth); NodeHandle temp_tree = { NULL }; - PUSH_HANDLE(cx, temp_tree); + PUSH_HANDLE(mut, temp_tree); printf("Creating %d trees of depth %d\n", num_iters, depth); { unsigned long start = current_time(); for (int i = 0; i < num_iters; ++i) { - HANDLE_SET(temp_tree, allocate_node(cx)); - populate(cx, depth, HANDLE_REF(temp_tree)); + HANDLE_SET(temp_tree, allocate_node(mut)); + populate(mut, depth, HANDLE_REF(temp_tree)); validate_tree(HANDLE_REF(temp_tree), depth); HANDLE_SET(temp_tree, NULL); } @@ -208,7 +208,7 @@ static void time_construction(struct context *cx, int depth) { { long start = current_time(); for (int i = 0; i < num_iters; ++i) { - HANDLE_SET(temp_tree, make_tree(cx, depth)); + 
HANDLE_SET(temp_tree, make_tree(mut, depth)); validate_tree(HANDLE_REF(temp_tree), depth); HANDLE_SET(temp_tree, NULL); } @@ -216,7 +216,7 @@ static void time_construction(struct context *cx, int depth) { elapsed_millis(start)); } - POP_HANDLE(cx); + POP_HANDLE(mut); } static void* call_with_stack_base(void* (*)(uintptr_t*, void*), void*) NEVER_INLINE; @@ -232,46 +232,46 @@ static void* call_with_stack_base(void* (*f)(uintptr_t *stack_base, void *arg), } struct call_with_gc_data { - void* (*f)(struct context *); - struct context *parent; + void* (*f)(struct mutator *); + struct heap *heap; }; static void* call_with_gc_inner(uintptr_t *stack_base, void *arg) { struct call_with_gc_data *data = arg; - struct context *cx = initialize_gc_for_thread(stack_base, data->parent); - void *ret = data->f(cx); - finish_gc_for_thread(cx); + struct mutator *mut = initialize_gc_for_thread(stack_base, data->heap); + void *ret = data->f(mut); + finish_gc_for_thread(mut); return ret; } -static void* call_with_gc(void* (*f)(struct context *), - struct context *parent) { - struct call_with_gc_data data = { f, parent }; +static void* call_with_gc(void* (*f)(struct mutator *), + struct heap *heap) { + struct call_with_gc_data data = { f, heap }; return call_with_stack_base(call_with_gc_inner, &data); } -static void* run_one_test(struct context *cx) { +static void* run_one_test(struct mutator *mut) { NodeHandle long_lived_tree = { NULL }; NodeHandle temp_tree = { NULL }; DoubleArrayHandle array = { NULL }; - PUSH_HANDLE(cx, long_lived_tree); - PUSH_HANDLE(cx, temp_tree); - PUSH_HANDLE(cx, array); + PUSH_HANDLE(mut, long_lived_tree); + PUSH_HANDLE(mut, temp_tree); + PUSH_HANDLE(mut, array); // Create a long lived object printf(" Creating a long-lived binary tree of depth %d\n", long_lived_tree_depth); - HANDLE_SET(long_lived_tree, allocate_node(cx)); - populate(cx, long_lived_tree_depth, HANDLE_REF(long_lived_tree)); + HANDLE_SET(long_lived_tree, allocate_node(mut)); + populate(mut, long_lived_tree_depth, HANDLE_REF(long_lived_tree)); // Create long-lived array, filling half of it printf(" Creating a long-lived array of %d doubles\n", array_size); - HANDLE_SET(array, allocate_double_array(cx, array_size)); + HANDLE_SET(array, allocate_double_array(mut, array_size)); for (int i = 0; i < array_size/2; ++i) { HANDLE_REF(array)->values[i] = 1.0/i; } for (int d = min_tree_depth; d <= max_tree_depth; d += 2) { - time_construction(cx, d); + time_construction(mut, d); } validate_tree(HANDLE_REF(long_lived_tree), long_lived_tree_depth); @@ -282,15 +282,15 @@ static void* run_one_test(struct context *cx) { || HANDLE_REF(array)->values[1000] != 1.0/1000) fprintf(stderr, "Failed\n"); - POP_HANDLE(cx); - POP_HANDLE(cx); - POP_HANDLE(cx); + POP_HANDLE(mut); + POP_HANDLE(mut); + POP_HANDLE(mut); return NULL; } static void* run_one_test_in_thread(void *arg) { - struct context *parent_cx = arg; - return call_with_gc(run_one_test, parent_cx); + struct heap *heap = arg; + return call_with_gc(run_one_test, heap); } int main(int argc, char *argv[]) { @@ -320,8 +320,9 @@ int main(int argc, char *argv[]) { } size_t heap_size = heap_max_live * multiplier * nthreads; - struct context *cx = initialize_gc(heap_size); - if (!cx) { + struct heap *heap; + struct mutator *mut; + if (!initialize_gc(heap_size, &heap, &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); return 1; @@ -329,21 +330,21 @@ int main(int argc, char *argv[]) { printf("Garbage Collector Test\n"); printf(" Live storage will peak at %zd 
bytes.\n\n", heap_max_live); - print_start_gc_stats(cx); + print_start_gc_stats(heap); unsigned long start = current_time(); pthread_t threads[MAX_THREAD_COUNT]; // Run one of the threads in the main thread. for (size_t i = 1; i < nthreads; i++) { - int status = pthread_create(&threads[i], NULL, run_one_test_in_thread, cx); + int status = pthread_create(&threads[i], NULL, run_one_test_in_thread, heap); if (status) { errno = status; perror("Failed to create thread"); return 1; } } - run_one_test(cx); + run_one_test(mut); for (size_t i = 1; i < nthreads; i++) { int status = pthread_join(threads[i], NULL); if (status) { @@ -354,6 +355,5 @@ int main(int argc, char *argv[]) { } printf("Completed in %.3f msec\n", elapsed_millis(start)); - print_end_gc_stats(cx); + print_end_gc_stats(heap); } - From 81037fd6d29b79d48792b81e764b91d1d53c83d3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 28 Mar 2022 22:13:52 +0200 Subject: [PATCH 043/403] Convert semi-space collector to new API --- semi.h | 159 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 91 insertions(+), 68 deletions(-) diff --git a/semi.h b/semi.h index 72f8d0e19..af58f052d 100644 --- a/semi.h +++ b/semi.h @@ -1,3 +1,4 @@ +#include #include #include #include @@ -6,16 +7,31 @@ #include "precise-roots.h" -struct context { +struct semi_space { uintptr_t hp; uintptr_t limit; - uintptr_t heap_base; - size_t heap_size; - struct handle *roots; - void *mem; - size_t mem_size; + uintptr_t base; + size_t size; long count; }; +struct heap { + struct semi_space semi_space; +}; +// One mutator per space, can just store the heap in the mutator. +struct mutator { + struct heap heap; + struct handle *roots; +}; + +static inline struct heap* mutator_heap(struct mutator *mut) { + return &mut->heap; +} +static inline struct semi_space* heap_semi_space(struct heap *heap) { + return &heap->semi_space; +} +static inline struct semi_space* mutator_semi_space(struct mutator *mut) { + return heap_semi_space(mutator_heap(mut)); +} static const uintptr_t ALIGNMENT = 8; @@ -29,24 +45,24 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct context *cx) NEVER_INLINE; -static void collect_for_alloc(struct context *cx, size_t bytes) NEVER_INLINE; +static void collect(struct mutator *mut) NEVER_INLINE; +static void collect_for_alloc(struct mutator *mut, size_t bytes) NEVER_INLINE; static void visit(void **loc, void *visit_data); -static void flip(struct context *cx) { - uintptr_t split = cx->heap_base + (cx->heap_size >> 1); - if (cx->hp <= split) { - cx->hp = split; - cx->limit = cx->heap_base + cx->heap_size; +static void flip(struct semi_space *space) { + uintptr_t split = space->base + (space->size >> 1); + if (space->hp <= split) { + space->hp = split; + space->limit = space->base + space->size; } else { - cx->hp = cx->heap_base; - cx->limit = split; + space->hp = space->base; + space->limit = split; } - cx->count++; + space->count++; } -static void* copy(struct context *cx, uintptr_t kind, void *obj) { +static void* copy(struct semi_space *space, uintptr_t kind, void *obj) { size_t size; switch (kind) { #define COMPUTE_SIZE(name, Name, NAME) \ @@ -58,20 +74,20 @@ static void* copy(struct context *cx, uintptr_t kind, void *obj) { default: abort (); } - void *new_obj = (void*)cx->hp; + void *new_obj = (void*)space->hp; memcpy(new_obj, obj, size); - *(uintptr_t*) obj = cx->hp; - cx->hp += align_up (size, ALIGNMENT); + *(uintptr_t*) obj = space->hp; + space->hp += 
align_up (size, ALIGNMENT); return new_obj; } -static uintptr_t scan(struct context *cx, uintptr_t grey) { +static uintptr_t scan(struct semi_space *space, uintptr_t grey) { void *obj = (void*)grey; uintptr_t kind = *(uintptr_t*) obj; switch (kind) { #define SCAN_OBJECT(name, Name, NAME) \ case ALLOC_KIND_##NAME: \ - visit_##name##_fields((Name*)obj, visit, cx); \ + visit_##name##_fields((Name*)obj, visit, space); \ return grey + align_up(name##_size((Name*)obj), ALIGNMENT); FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) #undef SCAN_OBJECT @@ -80,55 +96,58 @@ static uintptr_t scan(struct context *cx, uintptr_t grey) { } } -static void* forward(struct context *cx, void *obj) { +static void* forward(struct semi_space *space, void *obj) { uintptr_t header_word = *(uintptr_t*)obj; switch (header_word) { #define CASE_ALLOC_KIND(name, Name, NAME) \ case ALLOC_KIND_##NAME: FOR_EACH_HEAP_OBJECT_KIND(CASE_ALLOC_KIND) #undef CASE_ALLOC_KIND - return copy(cx, header_word, obj); + return copy(space, header_word, obj); default: return (void*)header_word; } } static void visit(void **loc, void *visit_data) { - struct context *cx = visit_data; + struct semi_space *space = visit_data; void *obj = *loc; if (obj != NULL) - *loc = forward(cx, obj); + *loc = forward(space, obj); } -static void collect(struct context *cx) { - // fprintf(stderr, "start collect #%ld:\n", cx->count); - flip(cx); - uintptr_t grey = cx->hp; - for (struct handle *h = cx->roots; h; h = h->next) - visit(&h->v, cx); - // fprintf(stderr, "pushed %zd bytes in roots\n", cx->hp - grey); - while(grey < cx->hp) - grey = scan(cx, grey); - // fprintf(stderr, "%zd bytes copied\n", (cx->heap_size>>1)-(cx->limit-cx->hp)); +static void collect(struct mutator *mut) { + struct semi_space *space = mutator_semi_space(mut); + // fprintf(stderr, "start collect #%ld:\n", space->count); + flip(space); + uintptr_t grey = space->hp; + for (struct handle *h = mut->roots; h; h = h->next) + visit(&h->v, space); + // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); + while(grey < space->hp) + grey = scan(space, grey); + // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); } -static void collect_for_alloc(struct context *cx, size_t bytes) { - collect(cx); - if (cx->limit - cx->hp < bytes) { - fprintf(stderr, "ran out of space, heap size %zu\n", cx->mem_size); +static void collect_for_alloc(struct mutator *mut, size_t bytes) { + collect(mut); + struct semi_space *space = mutator_semi_space(mut); + if (space->limit - space->hp < bytes) { + fprintf(stderr, "ran out of space, heap size %zu\n", space->size); abort(); } } -static inline void* allocate(struct context *cx, enum alloc_kind kind, +static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t size) { + struct semi_space *space = mutator_semi_space(mut); while (1) { - uintptr_t addr = cx->hp; + uintptr_t addr = space->hp; uintptr_t new_hp = align_up (addr + size, ALIGNMENT); - if (cx->limit < new_hp) { - collect_for_alloc(cx, size); + if (space->limit < new_hp) { + collect_for_alloc(mut, size); continue; } - cx->hp = new_hp; + space->hp = new_hp; void *ret = (void *)addr; uintptr_t *header_word = ret; *header_word = kind; @@ -138,9 +157,9 @@ static inline void* allocate(struct context *cx, enum alloc_kind kind, return ret; } } -static inline void* allocate_pointerless(struct context *cx, +static inline void* allocate_pointerless(struct mutator *mut, enum alloc_kind kind, size_t size) { - return allocate(cx, kind, size); + return allocate(mut, kind, 
size); } static inline void init_field(void **addr, void *val) { @@ -153,39 +172,43 @@ static inline void* get_field(void **addr) { return *addr; } -static struct context* initialize_gc(size_t size) { - void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, +static int initialize_gc(size_t heap_size, struct heap **heap, + struct mutator **mut) { + void *mem = mmap(NULL, heap_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { perror("mmap failed"); - abort(); + return 0; } - struct context *cx = mem; - cx->mem = mem; - cx->mem_size = size; - // Round up to twice ALIGNMENT so that both spaces will be aligned. - size_t overhead = align_up(sizeof(*cx), ALIGNMENT * 2); - cx->hp = cx->heap_base = ((uintptr_t) mem) + overhead; - cx->heap_size = size - overhead; - cx->count = -1; - flip(cx); - cx->roots = NULL; - return cx; + + *mut = calloc(1, sizeof(struct mutator)); + if (!*mut) abort(); + *heap = mutator_heap(*mut); + struct semi_space *space = mutator_semi_space(*mut); + + space->hp = space->base = (uintptr_t) mem; + space->size = heap_size; + space->count = -1; + flip(space); + (*mut)->roots = NULL; + + return 1; } -static struct context* initialize_gc_for_thread(uintptr_t *stack_base, - struct context *parent) { +static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, + struct heap *heap) { fprintf(stderr, "Semispace copying collector not appropriate for multithreaded use.\n"); exit(1); } -static void finish_gc_for_thread(struct context *cx) { +static void finish_gc_for_thread(struct mutator *space) { } -static inline void print_start_gc_stats(struct context *cx) { +static inline void print_start_gc_stats(struct heap *heap) { } -static inline void print_end_gc_stats(struct context *cx) { - printf("Completed %ld collections\n", cx->count); - printf("Heap size is %zd\n", cx->mem_size); +static inline void print_end_gc_stats(struct heap *heap) { + struct semi_space *space = heap_semi_space(heap); + printf("Completed %ld collections\n", space->count); + printf("Heap size is %zd\n", space->size); } From 5a92b43e9452cd079e8f018259c5e0954f19afba Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 28 Mar 2022 22:48:04 +0200 Subject: [PATCH 044/403] Change serial marker to deal in struct gcobj* instead of uintptr "struct gcobj*" is how we denote live objects, and the marker will only see live objects. 
--- serial-marker.h | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/serial-marker.h b/serial-marker.h index 719ba1c51..8cc65a328 100644 --- a/serial-marker.h +++ b/serial-marker.h @@ -7,20 +7,22 @@ #include "assert.h" #include "debug.h" +struct gcobj; + struct mark_queue { size_t size; size_t read; size_t write; - uintptr_t *buf; + struct gcobj **buf; }; static const size_t mark_queue_max_size = - (1ULL << (sizeof(uintptr_t) * 8 - 1)) / sizeof(uintptr_t); + (1ULL << (sizeof(struct gcobj *) * 8 - 1)) / sizeof(struct gcobj *); static const size_t mark_queue_release_byte_threshold = 1 * 1024 * 1024; -static void* +static struct gcobj ** mark_queue_alloc(size_t size) { - void *mem = mmap(NULL, size * sizeof(uintptr_t), PROT_READ|PROT_WRITE, + void *mem = mmap(NULL, size * sizeof(struct gcobj *), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { perror("Failed to grow mark queue"); @@ -32,20 +34,20 @@ mark_queue_alloc(size_t size) { static int mark_queue_init(struct mark_queue *q) { - q->size = getpagesize() / sizeof(uintptr_t); + q->size = getpagesize() / sizeof(struct gcobj *); q->read = 0; q->write = 0; q->buf = mark_queue_alloc(q->size); return !!q->buf; } -static inline uintptr_t +static inline struct gcobj * mark_queue_get(struct mark_queue *q, size_t idx) { return q->buf[idx & (q->size - 1)]; } static inline void -mark_queue_put(struct mark_queue *q, size_t idx, uintptr_t x) { +mark_queue_put(struct mark_queue *q, size_t idx, struct gcobj *x) { q->buf[idx & (q->size - 1)] = x; } @@ -54,14 +56,14 @@ static int mark_queue_grow(struct mark_queue *q) NEVER_INLINE; static int mark_queue_grow(struct mark_queue *q) { size_t old_size = q->size; - uintptr_t *old_buf = q->buf; + struct gcobj **old_buf = q->buf; if (old_size >= mark_queue_max_size) { DEBUG("mark queue already at max size of %zu bytes", old_size); return 0; } size_t new_size = old_size * 2; - uintptr_t *new_buf = mark_queue_alloc(new_size); + struct gcobj **new_buf = mark_queue_alloc(new_size); if (!new_buf) return 0; @@ -71,7 +73,7 @@ mark_queue_grow(struct mark_queue *q) { for (size_t i = q->read; i < q->write; i++) new_buf[i & new_mask] = old_buf[i & old_mask]; - munmap(old_buf, old_size * sizeof(uintptr_t)); + munmap(old_buf, old_size * sizeof(struct gcobj *)); q->size = new_size; q->buf = new_buf; @@ -79,24 +81,34 @@ mark_queue_grow(struct mark_queue *q) { } static inline void -mark_queue_push(struct mark_queue *q, void *p) { +mark_queue_push(struct mark_queue *q, struct gcobj *p) { if (UNLIKELY(q->write - q->read == q->size)) { if (!mark_queue_grow(q)) abort(); } - mark_queue_put(q, q->write++, (uintptr_t)p); + mark_queue_put(q, q->write++, p); } -static inline void* +static inline void +mark_queue_push_many(struct mark_queue *q, struct gcobj **pv, size_t count) { + while (q->size - (q->write - q->read) < count) { + if (!mark_queue_grow(q)) + abort(); + } + for (size_t i = 0; i < count; i++) + mark_queue_put(q, q->write++, pv[i]); +} + +static inline struct gcobj* mark_queue_pop(struct mark_queue *q) { if (UNLIKELY(q->read == q->write)) return NULL; - return (void*)mark_queue_get(q, q->read++); + return mark_queue_get(q, q->read++); } static void mark_queue_release(struct mark_queue *q) { - size_t byte_size = q->size * sizeof(uintptr_t); + size_t byte_size = q->size * sizeof(struct gcobj *); if (byte_size >= mark_queue_release_byte_threshold) madvise(q->buf, byte_size, MADV_DONTNEED); q->read = q->write = 0; @@ -104,7 +116,7 @@ 
mark_queue_release(struct mark_queue *q) { static void mark_queue_destroy(struct mark_queue *q) { - size_t byte_size = q->size * sizeof(uintptr_t); + size_t byte_size = q->size * sizeof(struct gcobj *); munmap(q->buf, byte_size); } From edd46d8fe28450f946da233580a7561ddaca649d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 28 Mar 2022 23:03:37 +0200 Subject: [PATCH 045/403] Start to adapt mark-sweep collector for separate heap/mutator The current hack is that the mutator contains the heap. We'll relax later on. --- mark-sweep.h | 72 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index d82f4d0c2..ec9b0ffca 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -14,8 +14,6 @@ #include "serial-marker.h" #endif -#define LAZY_SWEEP 1 - #define GRANULE_SIZE 8 #define GRANULE_SIZE_LOG_2 3 #define LARGE_OBJECT_THRESHOLD 256 @@ -109,13 +107,35 @@ struct context { uintptr_t heap_base; size_t heap_size; uintptr_t sweep; - struct handle *roots; void *mem; size_t mem_size; long count; struct marker marker; }; +struct mark_space { struct context cx; }; +struct heap { struct mark_space mark_space; }; +struct mutator { + struct heap heap; + struct handle *roots; +}; + +static inline struct heap* mutator_heap(struct mutator *mut) { + return &mut->heap; +} +static inline struct mark_space* heap_mark_space(struct heap *heap) { + return &heap->mark_space; +} +static inline struct context* mark_space_context(struct mark_space *space) { + return &space->cx; +} +static inline struct mark_space* mutator_mark_space(struct mutator *mut) { + return heap_mark_space(mutator_heap(mut)); +} +static inline struct context* mutator_context(struct mutator *mut) { + return mark_space_context(mutator_mark_space(mut)); +} + static inline struct marker* context_marker(struct context *cx) { return &cx->marker; } @@ -171,7 +191,9 @@ static void clear_freelists(struct context *cx) { static void collect(struct context *cx) { DEBUG("start collect #%ld:\n", cx->count); marker_prepare(cx); - for (struct handle *h = cx->roots; h; h = h->next) + // HACK!!! 
+ struct mutator *mut = (struct mutator *)cx; + for (struct handle *h = mut->roots; h; h = h->next) marker_visit_root(&h->v, cx); marker_trace(cx); marker_release(cx); @@ -405,17 +427,18 @@ static inline void* allocate_small(struct context *cx, return obj; } -static inline void* allocate(struct context *cx, enum alloc_kind kind, +static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t size) { + struct context *cx = mutator_context(mut); size_t granules = size_to_granules(size); if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) return allocate_small(cx, kind, granules_to_small_object_size(granules)); return allocate_large(cx, kind, granules); } -static inline void* allocate_pointerless(struct context *cx, +static inline void* allocate_pointerless(struct mutator *mut, enum alloc_kind kind, size_t size) { - return allocate(cx, kind, size); + return allocate(mut, kind, size); } static inline void init_field(void **addr, void *val) { @@ -428,7 +451,8 @@ static inline void* get_field(void **addr) { return *addr; } -static struct context* initialize_gc(size_t size) { +static int initialize_gc(size_t size, struct heap **heap, + struct mutator **mut) { #define SMALL_OBJECT_GRANULE_SIZE(i) \ ASSERT_EQ(SMALL_OBJECT_##i, small_object_sizes_for_granules[i]); \ ASSERT_EQ(SMALL_OBJECT_##i + 1, small_object_sizes_for_granules[i+1]); @@ -444,51 +468,55 @@ static struct context* initialize_gc(size_t size) { MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { perror("mmap failed"); - abort(); + return 0; } - struct context *cx = mem; + *mut = calloc(1, sizeof(struct mutator)); + if (!*mut) abort(); + (*mut)->roots = NULL; + *heap = mutator_heap(*mut); + struct mark_space *space = mutator_mark_space(*mut); + struct context *cx = &space->cx; + cx->mem = mem; cx->mem_size = size; - size_t overhead = sizeof(*cx); // If there is 1 mark byte per granule, and SIZE bytes available for // HEAP_SIZE + MARK_BYTES, then: // // size = (granule_size + 1) / granule_size * heap_size // mark_bytes = 1/granule_size * heap_size // mark_bytes = ceil(size / (granule_size + 1)) - cx->mark_bytes = ((uint8_t *)mem) + overhead; - size_t mark_bytes_size = (size - overhead + GRANULE_SIZE) / (GRANULE_SIZE + 1); - overhead += mark_bytes_size; - overhead = align_up(overhead, GRANULE_SIZE); + cx->mark_bytes = (uint8_t *)mem; + size_t mark_bytes_size = (size + GRANULE_SIZE) / (GRANULE_SIZE + 1); + size_t overhead = align_up(mark_bytes_size, GRANULE_SIZE); cx->heap_base = ((uintptr_t) mem) + overhead; cx->heap_size = size - overhead; clear_freelists(cx); cx->sweep = cx->heap_base + cx->heap_size; - cx->roots = NULL; cx->count = 0; if (!marker_init(cx)) abort(); reclaim(cx, (void*)cx->heap_base, size_to_granules(cx->heap_size)); - return cx; + return 1; } -static struct context* initialize_gc_for_thread(uintptr_t *stack_base, - struct context *parent) { +static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, + struct heap *parent) { fprintf(stderr, "Multiple mutator threads not yet implemented.\n"); exit(1); } -static void finish_gc_for_thread(struct context *cx) { +static void finish_gc_for_thread(struct mutator *heap) { } -static inline void print_start_gc_stats(struct context *cx) { +static inline void print_start_gc_stats(struct heap *heap) { } -static inline void print_end_gc_stats(struct context *cx) { +static inline void print_end_gc_stats(struct heap *heap) { + struct context *cx = mark_space_context(heap_mark_space(heap)); printf("Completed %ld collections\n", cx->count); printf("Heap 
size with overhead is %zd\n", cx->mem_size); } From 61d38e4205878be197742d15a4bdbacedf405fda Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 08:34:19 +0200 Subject: [PATCH 046/403] Refactor mark-sweep to send mutator to collect() This will let the mutator hold a pointer to the heap. --- mark-sweep.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index ec9b0ffca..f8993bb20 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -152,7 +152,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct context *cx) NEVER_INLINE; +static void collect(struct mutator *mut) NEVER_INLINE; static inline uint8_t* mark_byte(struct context *cx, struct gcobj *obj) { ASSERT(cx->heap_base <= (uintptr_t) obj); @@ -188,11 +188,10 @@ static void clear_freelists(struct context *cx) { cx->large_objects = NULL; } -static void collect(struct context *cx) { +static void collect(struct mutator *mut) { + struct context *cx = mutator_context(mut); DEBUG("start collect #%ld:\n", cx->count); marker_prepare(cx); - // HACK!!! - struct mutator *mut = (struct mutator *)cx; for (struct handle *h = mut->roots; h; h = h->next) marker_visit_root(&h->v, cx); marker_trace(cx); @@ -341,8 +340,9 @@ static int sweep(struct context *cx, size_t for_granules) { return 1; } -static void* allocate_large(struct context *cx, enum alloc_kind kind, +static void* allocate_large(struct mutator *mut, enum alloc_kind kind, size_t granules) { + struct context *cx = mutator_context(mut); int swept_from_beginning = 0; struct gcobj_free_large *already_scanned = NULL; while (1) { @@ -367,13 +367,14 @@ static void* allocate_large(struct context *cx, enum alloc_kind kind, fprintf(stderr, "ran out of space, heap size %zu\n", cx->heap_size); abort(); } else { - collect(cx); + collect(mut); swept_from_beginning = 1; } } } -static void fill_small(struct context *cx, enum small_object_size kind) { +static void fill_small(struct mutator *mut, enum small_object_size kind) { + struct context *cx = mutator_context(mut); int swept_from_beginning = 0; while (1) { // First see if there are small objects already on the freelists @@ -407,19 +408,20 @@ static void fill_small(struct context *cx, enum small_object_size kind) { fprintf(stderr, "ran out of space, heap size %zu\n", cx->heap_size); abort(); } else { - collect(cx); + collect(mut); swept_from_beginning = 1; } } } } -static inline void* allocate_small(struct context *cx, +static inline void* allocate_small(struct mutator *mut, enum alloc_kind alloc_kind, enum small_object_size small_kind) { + struct context *cx = mutator_context(mut); struct gcobj_free **loc = get_small_object_freelist(cx, small_kind); if (!*loc) - fill_small(cx, small_kind); + fill_small(mut, small_kind); struct gcobj_free *ret = *loc; *loc = ret->next; struct gcobj *obj = (struct gcobj *)ret; @@ -429,11 +431,10 @@ static inline void* allocate_small(struct context *cx, static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t size) { - struct context *cx = mutator_context(mut); size_t granules = size_to_granules(size); if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) - return allocate_small(cx, kind, granules_to_small_object_size(granules)); - return allocate_large(cx, kind, granules); + return allocate_small(mut, kind, granules_to_small_object_size(granules)); + return allocate_large(mut, kind, granules); } static inline void* allocate_pointerless(struct mutator *mut, 
enum alloc_kind kind, From 2401732e318d553bac69dffe30ac04a58e371d9e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 08:36:24 +0200 Subject: [PATCH 047/403] mark-sweep: mutator data structure separate from heap This will allow thread-local allocation buffers. --- mark-sweep.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index f8993bb20..034f59813 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -116,12 +116,12 @@ struct context { struct mark_space { struct context cx; }; struct heap { struct mark_space mark_space; }; struct mutator { - struct heap heap; + struct heap *heap; struct handle *roots; }; static inline struct heap* mutator_heap(struct mutator *mut) { - return &mut->heap; + return mut->heap; } static inline struct mark_space* heap_mark_space(struct heap *heap) { return &heap->mark_space; @@ -472,11 +472,9 @@ static int initialize_gc(size_t size, struct heap **heap, return 0; } - *mut = calloc(1, sizeof(struct mutator)); - if (!*mut) abort(); - (*mut)->roots = NULL; - *heap = mutator_heap(*mut); - struct mark_space *space = mutator_mark_space(*mut); + *heap = calloc(1, sizeof(struct heap)); + if (!*heap) abort(); + struct mark_space *space = heap_mark_space(*heap); struct context *cx = &space->cx; cx->mem = mem; @@ -501,6 +499,11 @@ static int initialize_gc(size_t size, struct heap **heap, abort(); reclaim(cx, (void*)cx->heap_base, size_to_granules(cx->heap_size)); + *mut = calloc(1, sizeof(struct mutator)); + if (!*mut) abort(); + (*mut)->heap = *heap; + (*mut)->roots = NULL; + return 1; } From 9b0bc6e975b793c30ee9e6143f4198311f0ec9cb Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 08:59:27 +0200 Subject: [PATCH 048/403] mark-sweep: Update markers to deal in heap and spaces This will let us get rid of "struct context". 
--- mark-sweep.h | 36 ++++++++++++++++++++--------------- parallel-marker.h | 48 +++++++++++++++++++++++------------------------ serial-marker.h | 41 ++++++++++++++++++++++------------------ 3 files changed, 67 insertions(+), 58 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 034f59813..7feb78d50 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -136,8 +136,8 @@ static inline struct context* mutator_context(struct mutator *mut) { return mark_space_context(mutator_mark_space(mut)); } -static inline struct marker* context_marker(struct context *cx) { - return &cx->marker; +static inline struct marker* mark_space_marker(struct mark_space *space) { + return &mark_space_context(space)->marker; } static inline struct gcobj_free** @@ -154,15 +154,16 @@ static inline void clear_memory(uintptr_t addr, size_t size) { static void collect(struct mutator *mut) NEVER_INLINE; -static inline uint8_t* mark_byte(struct context *cx, struct gcobj *obj) { +static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { + struct context *cx = mark_space_context(space); ASSERT(cx->heap_base <= (uintptr_t) obj); ASSERT((uintptr_t) obj < cx->heap_base + cx->heap_size); uintptr_t granule = (((uintptr_t) obj) - cx->heap_base) / GRANULE_SIZE; return &cx->mark_bytes[granule]; } -static inline int mark_object(struct context *cx, struct gcobj *obj) { - uint8_t *byte = mark_byte(cx, obj); +static inline int mark_object(struct mark_space *space, struct gcobj *obj) { + uint8_t *byte = mark_byte(space, obj); if (*byte) return 0; *byte = 1; @@ -189,13 +190,17 @@ static void clear_freelists(struct context *cx) { } static void collect(struct mutator *mut) { + struct mark_space *space = mutator_mark_space(mut); struct context *cx = mutator_context(mut); DEBUG("start collect #%ld:\n", cx->count); - marker_prepare(cx); - for (struct handle *h = mut->roots; h; h = h->next) - marker_visit_root(&h->v, cx); - marker_trace(cx); - marker_release(cx); + marker_prepare(space); + for (struct handle *h = mut->roots; h; h = h->next) { + struct gcobj *root = h->v; + if (root && mark_object(space, root)) + marker_enqueue_root(mark_space_marker(space), root); + } + marker_trace(space); + marker_release(space); DEBUG("done marking\n"); cx->sweep = cx->heap_base; clear_freelists(cx); @@ -303,9 +308,10 @@ static size_t next_mark(const uint8_t *mark, size_t limit) { // Sweep some heap to reclaim free space. Return 1 if there is more // heap to sweep, or 0 if we reached the end. -static int sweep(struct context *cx, size_t for_granules) { +static int sweep(struct mark_space *space, size_t for_granules) { // Sweep until we have reclaimed 128 granules (1024 kB), or we reach // the end of the heap. + struct context *cx = mark_space_context(space); ssize_t to_reclaim = 128; uintptr_t sweep = cx->sweep; uintptr_t limit = cx->heap_base + cx->heap_size; @@ -314,7 +320,7 @@ static int sweep(struct context *cx, size_t for_granules) { return 0; while (to_reclaim > 0 && sweep < limit) { - uint8_t* mark = mark_byte(cx, (struct gcobj*)sweep); + uint8_t* mark = mark_byte(space, (struct gcobj*)sweep); size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; if (limit_granules > for_granules) limit_granules = for_granules; @@ -360,7 +366,7 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, } } already_scanned = cx->large_objects; - } while (sweep(cx, granules)); + } while (sweep(mutator_mark_space(mut), granules)); // No large object, and we swept across the whole heap. Collect. 
if (swept_from_beginning) { @@ -403,7 +409,7 @@ static void fill_small(struct mutator *mut, enum small_object_size kind) { return; } - if (!sweep(cx, LARGE_OBJECT_GRANULE_THRESHOLD)) { + if (!sweep(mutator_mark_space(mut), LARGE_OBJECT_GRANULE_THRESHOLD)) { if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", cx->heap_size); abort(); @@ -495,7 +501,7 @@ static int initialize_gc(size_t size, struct heap **heap, clear_freelists(cx); cx->sweep = cx->heap_base + cx->heap_size; cx->count = 0; - if (!marker_init(cx)) + if (!marker_init(space)) abort(); reclaim(cx, (void*)cx->heap_base, size_to_granules(cx->heap_size)); diff --git a/parallel-marker.h b/parallel-marker.h index 1bfbbbd93..6240210aa 100644 --- a/parallel-marker.h +++ b/parallel-marker.h @@ -276,7 +276,7 @@ enum mark_worker_state { }; struct mark_worker { - struct context *cx; + struct mark_space *space; size_t id; size_t steal_id; pthread_t thread; @@ -301,19 +301,19 @@ struct marker { struct local_marker { struct mark_worker *worker; struct mark_deque *share_deque; - struct context *cx; + struct mark_space *space; struct local_mark_queue local; }; struct context; -static inline struct marker* context_marker(struct context *cx); +static inline struct marker* mark_space_marker(struct mark_space *space); static size_t number_of_current_processors(void) { return 1; } static int -mark_worker_init(struct mark_worker *worker, struct context *cx, +mark_worker_init(struct mark_worker *worker, struct mark_space *space, struct marker *marker, size_t id) { - worker->cx = cx; + worker->space = space; worker->id = id; worker->steal_id = 0; worker->thread = 0; @@ -367,7 +367,7 @@ mark_worker_spawn(struct mark_worker *worker) { static void mark_worker_request_mark(struct mark_worker *worker) { - struct marker *marker = context_marker(worker->cx); + struct marker *marker = mark_space_marker(worker->space); pthread_mutex_lock(&worker->lock); ASSERT(worker->state == MARK_WORKER_IDLE); @@ -379,7 +379,7 @@ mark_worker_request_mark(struct mark_worker *worker) { static void mark_worker_finished_marking(struct mark_worker *worker) { // Signal controller that we are done with marking. 
- struct marker *marker = context_marker(worker->cx); + struct marker *marker = mark_space_marker(worker->space); if (atomic_fetch_sub(&marker->running_markers, 1) == 1) { pthread_mutex_lock(&marker->lock); @@ -399,8 +399,8 @@ mark_worker_request_stop(struct mark_worker *worker) { } static int -marker_init(struct context *cx) { - struct marker *marker = context_marker(cx); +marker_init(struct mark_space *space) { + struct marker *marker = mark_space_marker(space); atomic_init(&marker->active_markers, 0); atomic_init(&marker->running_markers, 0); marker->count = 0; @@ -414,7 +414,7 @@ marker_init(struct context *cx) { if (desired_worker_count > MARK_WORKERS_MAX_COUNT) desired_worker_count = MARK_WORKERS_MAX_COUNT; for (size_t i = 0; i < desired_worker_count; i++) { - if (!mark_worker_init(&marker->workers[i], cx, marker, i)) + if (!mark_worker_init(&marker->workers[i], space, marker, i)) break; if (mark_worker_spawn(&marker->workers[i])) marker->worker_count++; @@ -424,13 +424,13 @@ marker_init(struct context *cx) { return marker->worker_count > 0; } -static void marker_prepare(struct context *cx) { - struct marker *marker = context_marker(cx); +static void marker_prepare(struct mark_space *space) { + struct marker *marker = mark_space_marker(space); for (size_t i = 0; i < marker->worker_count; i++) marker->workers[i].steal_id = 0; } -static void marker_release(struct context *cx) { - struct marker *marker = context_marker(cx); +static void marker_release(struct mark_space *space) { + struct marker *marker = mark_space_marker(space); for (size_t i = 0; i < marker->worker_count; i++) mark_deque_release(&marker->workers[i].deque); } @@ -438,7 +438,7 @@ static void marker_release(struct context *cx) { struct gcobj; static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; static inline void trace_one(struct gcobj *obj, void *mark_data) ALWAYS_INLINE; -static inline int mark_object(struct context *cx, +static inline int mark_object(struct mark_space *space, struct gcobj *obj) ALWAYS_INLINE; static inline void @@ -452,7 +452,7 @@ static inline void marker_visit(void **loc, void *mark_data) { struct local_marker *mark = mark_data; struct gcobj *obj = *loc; - if (obj && mark_object(mark->cx, obj)) { + if (obj && mark_object(mark->space, obj)) { if (local_mark_queue_full(&mark->local)) marker_share(mark); local_mark_queue_push(&mark->local, (uintptr_t)obj); @@ -550,7 +550,7 @@ mark_worker_check_termination(struct mark_worker *worker, static uintptr_t mark_worker_steal(struct local_marker *mark) { - struct marker *marker = context_marker(mark->cx); + struct marker *marker = mark_space_marker(mark->space); struct mark_worker *worker = mark->worker; while (1) { @@ -569,7 +569,7 @@ mark_worker_mark(struct mark_worker *worker) { struct local_marker mark; mark.worker = worker; mark.share_deque = &worker->deque; - mark.cx = worker->cx; + mark.space = worker->space; local_mark_queue_init(&mark.local); size_t n = 0; @@ -592,16 +592,14 @@ mark_worker_mark(struct mark_worker *worker) { } static inline void -marker_visit_root(void **loc, struct context *cx) { - struct gcobj *obj = *loc; - struct mark_deque *worker0_deque = &context_marker(cx)->workers[0].deque; - if (obj && mark_object(cx, obj)) - mark_deque_push(worker0_deque, (uintptr_t)obj); +marker_enqueue_root(struct marker *marker, struct gcobj *obj) { + struct mark_deque *worker0_deque = &marker->workers[0].deque; + mark_deque_push(worker0_deque, (uintptr_t)obj); } static inline void -marker_trace(struct context *cx) { - struct marker 
*marker = context_marker(cx); +marker_trace(struct mark_space *space) { + struct marker *marker = mark_space_marker(space); pthread_mutex_lock(&marker->lock); long mark_count = marker->count; diff --git a/serial-marker.h b/serial-marker.h index 8cc65a328..5f5330b6d 100644 --- a/serial-marker.h +++ b/serial-marker.h @@ -124,40 +124,45 @@ struct marker { struct mark_queue queue; }; -struct context; -static inline struct marker* context_marker(struct context *cx); +struct mark_space; +static inline struct marker* mark_space_marker(struct mark_space *space); static int -marker_init(struct context *cx) { - return mark_queue_init(&context_marker(cx)->queue); +marker_init(struct mark_space *space) { + return mark_queue_init(&mark_space_marker(space)->queue); } -static void marker_prepare(struct context *cx) {} -static void marker_release(struct context *cx) { - mark_queue_release(&context_marker(cx)->queue); +static void marker_prepare(struct mark_space *space) {} +static void marker_release(struct mark_space *space) { + mark_queue_release(&mark_space_marker(space)->queue); } struct gcobj; static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; static inline void trace_one(struct gcobj *obj, void *mark_data) ALWAYS_INLINE; -static inline int mark_object(struct context *cx, +static inline int mark_object(struct mark_space *space, struct gcobj *obj) ALWAYS_INLINE; +static inline void +marker_enqueue_root(struct marker *marker, struct gcobj *obj) { + mark_queue_push(&marker->queue, obj); +} +static inline void +marker_enqueue_roots(struct marker *marker, struct gcobj **objs, + size_t count) { + mark_queue_push_many(&marker->queue, objs, count); +} static inline void marker_visit(void **loc, void *mark_data) { - struct context *cx = mark_data; + struct mark_space *space = mark_data; struct gcobj *obj = *loc; - if (obj && mark_object(cx, obj)) - mark_queue_push(&context_marker(cx)->queue, obj); + if (obj && mark_object(space, obj)) + marker_enqueue_root(mark_space_marker(space), obj); } static inline void -marker_visit_root(void **loc, struct context *cx) { - marker_visit(loc, cx); -} -static inline void -marker_trace(struct context *cx) { +marker_trace(struct mark_space *space) { struct gcobj *obj; - while ((obj = mark_queue_pop(&context_marker(cx)->queue))) - trace_one(obj, cx); + while ((obj = mark_queue_pop(&mark_space_marker(space)->queue))) + trace_one(obj, space); } #endif // SERIAL_MARK_H From be90f7ba49ce13a60fda8c2d75f544fd51f755ea Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 10:03:50 +0200 Subject: [PATCH 049/403] mark-sweep: Remove context, use mark space instead This is the end of a series of refactors before adding thread-local allocation. --- mark-sweep.h | 125 +++++++++++++++++++++++---------------------------- 1 file changed, 57 insertions(+), 68 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 7feb78d50..26f5362ba 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -97,7 +97,7 @@ struct gcobj { }; }; -struct context { +struct mark_space { // Segregated freelists of small objects. struct gcobj_free *small_objects[SMALL_OBJECT_SIZES]; // Unordered list of large objects. 
@@ -113,7 +113,6 @@ struct context { struct marker marker; }; -struct mark_space { struct context cx; }; struct heap { struct mark_space mark_space; }; struct mutator { struct heap *heap; @@ -126,24 +125,18 @@ static inline struct heap* mutator_heap(struct mutator *mut) { static inline struct mark_space* heap_mark_space(struct heap *heap) { return &heap->mark_space; } -static inline struct context* mark_space_context(struct mark_space *space) { - return &space->cx; -} static inline struct mark_space* mutator_mark_space(struct mutator *mut) { return heap_mark_space(mutator_heap(mut)); } -static inline struct context* mutator_context(struct mutator *mut) { - return mark_space_context(mutator_mark_space(mut)); -} static inline struct marker* mark_space_marker(struct mark_space *space) { - return &mark_space_context(space)->marker; + return &space->marker; } static inline struct gcobj_free** -get_small_object_freelist(struct context *cx, enum small_object_size kind) { +get_small_object_freelist(struct mark_space *space, enum small_object_size kind) { ASSERT(kind < SMALL_OBJECT_SIZES); - return &cx->small_objects[kind]; + return &space->small_objects[kind]; } #define GC_HEADER uintptr_t _gc_header @@ -155,11 +148,10 @@ static inline void clear_memory(uintptr_t addr, size_t size) { static void collect(struct mutator *mut) NEVER_INLINE; static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { - struct context *cx = mark_space_context(space); - ASSERT(cx->heap_base <= (uintptr_t) obj); - ASSERT((uintptr_t) obj < cx->heap_base + cx->heap_size); - uintptr_t granule = (((uintptr_t) obj) - cx->heap_base) / GRANULE_SIZE; - return &cx->mark_bytes[granule]; + ASSERT(space->heap_base <= (uintptr_t) obj); + ASSERT((uintptr_t) obj < space->heap_base + space->heap_size); + uintptr_t granule = (((uintptr_t) obj) - space->heap_base) / GRANULE_SIZE; + return &space->mark_bytes[granule]; } static inline int mark_object(struct mark_space *space, struct gcobj *obj) { @@ -183,16 +175,15 @@ static inline void trace_one(struct gcobj *obj, void *mark_data) { } } -static void clear_freelists(struct context *cx) { +static void clear_freelists(struct mark_space *space) { for (int i = 0; i < SMALL_OBJECT_SIZES; i++) - cx->small_objects[i] = NULL; - cx->large_objects = NULL; + space->small_objects[i] = NULL; + space->large_objects = NULL; } static void collect(struct mutator *mut) { struct mark_space *space = mutator_mark_space(mut); - struct context *cx = mutator_context(mut); - DEBUG("start collect #%ld:\n", cx->count); + DEBUG("start collect #%ld:\n", space->count); marker_prepare(space); for (struct handle *h = mut->roots; h; h = h->next) { struct gcobj *root = h->v; @@ -202,9 +193,9 @@ static void collect(struct mutator *mut) { marker_trace(space); marker_release(space); DEBUG("done marking\n"); - cx->sweep = cx->heap_base; - clear_freelists(cx); - cx->count++; + space->sweep = space->heap_base; + clear_freelists(space); + space->count++; } static void push_free(struct gcobj_free **loc, struct gcobj_free *obj) { @@ -212,12 +203,12 @@ static void push_free(struct gcobj_free **loc, struct gcobj_free *obj) { *loc = obj; } -static void push_small(struct context *cx, void *region, +static void push_small(struct mark_space *space, void *region, enum small_object_size kind, size_t region_granules) { uintptr_t addr = (uintptr_t) region; while (region_granules) { size_t granules = small_object_granule_sizes[kind]; - struct gcobj_free **loc = get_small_object_freelist(cx, kind); + struct gcobj_free **loc 
= get_small_object_freelist(space, kind); while (granules <= region_granules) { push_free(loc, (struct gcobj_free*) addr); region_granules -= granules; @@ -228,21 +219,21 @@ static void push_small(struct context *cx, void *region, } } -static void push_large(struct context *cx, void *region, size_t granules) { +static void push_large(struct mark_space *space, void *region, size_t granules) { struct gcobj_free_large *large = region; - large->next = cx->large_objects; + large->next = space->large_objects; large->granules = granules; - cx->large_objects = large; + space->large_objects = large; } -static void reclaim(struct context *cx, void *obj, size_t granules) { +static void reclaim(struct mark_space *space, void *obj, size_t granules) { if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) - push_small(cx, obj, SMALL_OBJECT_SIZES - 1, granules); + push_small(space, obj, SMALL_OBJECT_SIZES - 1, granules); else - push_large(cx, obj, granules); + push_large(space, obj, granules); } -static void split_large_object(struct context *cx, +static void split_large_object(struct mark_space *space, struct gcobj_free_large *large, size_t granules) { size_t large_granules = large->granules; @@ -258,7 +249,7 @@ static void split_large_object(struct context *cx, return; char *tail = ((char*)large) + granules * GRANULE_SIZE; - reclaim(cx, tail, large_granules - granules); + reclaim(space, tail, large_granules - granules); } static void unlink_large_object(struct gcobj_free_large **prev, @@ -311,10 +302,9 @@ static size_t next_mark(const uint8_t *mark, size_t limit) { static int sweep(struct mark_space *space, size_t for_granules) { // Sweep until we have reclaimed 128 granules (1024 kB), or we reach // the end of the heap. - struct context *cx = mark_space_context(space); ssize_t to_reclaim = 128; - uintptr_t sweep = cx->sweep; - uintptr_t limit = cx->heap_base + cx->heap_size; + uintptr_t sweep = space->sweep; + uintptr_t limit = space->heap_base + space->heap_size; if (sweep == limit) return 0; @@ -328,7 +318,7 @@ static int sweep(struct mark_space *space, size_t for_granules) { if (free_granules) { size_t free_bytes = free_granules * GRANULE_SIZE; clear_memory(sweep + GRANULE_SIZE, free_bytes - GRANULE_SIZE); - reclaim(cx, (void*)sweep, free_granules); + reclaim(space, (void*)sweep, free_granules); sweep += free_bytes; to_reclaim -= free_granules; @@ -342,35 +332,35 @@ static int sweep(struct mark_space *space, size_t for_granules) { sweep += live_object_granules((struct gcobj *)sweep) * GRANULE_SIZE; } - cx->sweep = sweep; + space->sweep = sweep; return 1; } static void* allocate_large(struct mutator *mut, enum alloc_kind kind, size_t granules) { - struct context *cx = mutator_context(mut); + struct mark_space *space = mutator_mark_space(mut); int swept_from_beginning = 0; struct gcobj_free_large *already_scanned = NULL; while (1) { do { - struct gcobj_free_large **prev = &cx->large_objects; - for (struct gcobj_free_large *large = cx->large_objects; + struct gcobj_free_large **prev = &space->large_objects; + for (struct gcobj_free_large *large = space->large_objects; large != already_scanned; prev = &large->next, large = large->next) { if (large->granules >= granules) { unlink_large_object(prev, large); - split_large_object(cx, large, granules); + split_large_object(space, large, granules); struct gcobj *obj = (struct gcobj *)large; obj->tag = tag_live(kind); return large; } } - already_scanned = cx->large_objects; + already_scanned = space->large_objects; } while (sweep(mutator_mark_space(mut), granules)); 
// No large object, and we swept across the whole heap. Collect. if (swept_from_beginning) { - fprintf(stderr, "ran out of space, heap size %zu\n", cx->heap_size); + fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { collect(mut); @@ -380,7 +370,7 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, } static void fill_small(struct mutator *mut, enum small_object_size kind) { - struct context *cx = mutator_context(mut); + struct mark_space *space = mutator_mark_space(mut); int swept_from_beginning = 0; while (1) { // First see if there are small objects already on the freelists @@ -388,12 +378,12 @@ static void fill_small(struct mutator *mut, enum small_object_size kind) { for (enum small_object_size next_kind = kind; next_kind < SMALL_OBJECT_SIZES; next_kind++) { - struct gcobj_free **loc = get_small_object_freelist(cx, next_kind); + struct gcobj_free **loc = get_small_object_freelist(space, next_kind); if (*loc) { if (kind != next_kind) { struct gcobj_free *ret = *loc; *loc = ret->next; - push_small(cx, ret, kind, + push_small(space, ret, kind, small_object_granule_sizes[next_kind]); } return; @@ -401,17 +391,17 @@ static void fill_small(struct mutator *mut, enum small_object_size kind) { } // Otherwise if there is a large object, take and split it. - struct gcobj_free_large *large = cx->large_objects; + struct gcobj_free_large *large = space->large_objects; if (large) { - unlink_large_object(&cx->large_objects, large); - split_large_object(cx, large, LARGE_OBJECT_GRANULE_THRESHOLD); - push_small(cx, large, kind, LARGE_OBJECT_GRANULE_THRESHOLD); + unlink_large_object(&space->large_objects, large); + split_large_object(space, large, LARGE_OBJECT_GRANULE_THRESHOLD); + push_small(space, large, kind, LARGE_OBJECT_GRANULE_THRESHOLD); return; } if (!sweep(mutator_mark_space(mut), LARGE_OBJECT_GRANULE_THRESHOLD)) { if (swept_from_beginning) { - fprintf(stderr, "ran out of space, heap size %zu\n", cx->heap_size); + fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { collect(mut); @@ -424,8 +414,8 @@ static void fill_small(struct mutator *mut, enum small_object_size kind) { static inline void* allocate_small(struct mutator *mut, enum alloc_kind alloc_kind, enum small_object_size small_kind) { - struct context *cx = mutator_context(mut); - struct gcobj_free **loc = get_small_object_freelist(cx, small_kind); + struct mark_space *space = mutator_mark_space(mut); + struct gcobj_free **loc = get_small_object_freelist(space, small_kind); if (!*loc) fill_small(mut, small_kind); struct gcobj_free *ret = *loc; @@ -481,29 +471,28 @@ static int initialize_gc(size_t size, struct heap **heap, *heap = calloc(1, sizeof(struct heap)); if (!*heap) abort(); struct mark_space *space = heap_mark_space(*heap); - struct context *cx = &space->cx; - cx->mem = mem; - cx->mem_size = size; + space->mem = mem; + space->mem_size = size; // If there is 1 mark byte per granule, and SIZE bytes available for // HEAP_SIZE + MARK_BYTES, then: // // size = (granule_size + 1) / granule_size * heap_size // mark_bytes = 1/granule_size * heap_size // mark_bytes = ceil(size / (granule_size + 1)) - cx->mark_bytes = (uint8_t *)mem; + space->mark_bytes = (uint8_t *)mem; size_t mark_bytes_size = (size + GRANULE_SIZE) / (GRANULE_SIZE + 1); size_t overhead = align_up(mark_bytes_size, GRANULE_SIZE); - cx->heap_base = ((uintptr_t) mem) + overhead; - cx->heap_size = size - overhead; + space->heap_base = ((uintptr_t) mem) + overhead; + 
space->heap_size = size - overhead; - clear_freelists(cx); - cx->sweep = cx->heap_base + cx->heap_size; - cx->count = 0; + clear_freelists(space); + space->sweep = space->heap_base + space->heap_size; + space->count = 0; if (!marker_init(space)) abort(); - reclaim(cx, (void*)cx->heap_base, size_to_granules(cx->heap_size)); + reclaim(space, (void*)space->heap_base, size_to_granules(space->heap_size)); *mut = calloc(1, sizeof(struct mutator)); if (!*mut) abort(); @@ -526,7 +515,7 @@ static inline void print_start_gc_stats(struct heap *heap) { } static inline void print_end_gc_stats(struct heap *heap) { - struct context *cx = mark_space_context(heap_mark_space(heap)); - printf("Completed %ld collections\n", cx->count); - printf("Heap size with overhead is %zd\n", cx->mem_size); + struct mark_space *space = heap_mark_space(heap); + printf("Completed %ld collections\n", space->count); + printf("Heap size with overhead is %zd\n", space->mem_size); } From 2d1e76eccc0699e685fdaac28b0840e55aec4ba0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 11:22:47 +0200 Subject: [PATCH 050/403] mark-sweep: remote markers can send roots via mark buffers When you have multiple mutators -- perhaps many more than marker threads -- they can mark their roots in parallel but they can't enqueue them on the same mark queue concurrently -- mark queues are single-producer, multiple-consumer queues. Therefore, mutator threads will collect grey roots from their own root sets, and then send them to the mutator that is controlling GC, for it to add to the mark queue (somehow). --- mark-sweep.h | 293 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 213 insertions(+), 80 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 26f5362ba..c06f5a3f3 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -81,6 +81,10 @@ struct gcobj_free { struct gcobj_free *next; }; +struct gcobj_freelists { + struct gcobj_free *by_size[SMALL_OBJECT_SIZES]; +}; + // Objects larger than LARGE_OBJECT_GRANULE_THRESHOLD. struct gcobj_free_large { struct gcobj_free_large *next; @@ -98,8 +102,6 @@ struct gcobj { }; struct mark_space { - // Segregated freelists of small objects. - struct gcobj_free *small_objects[SMALL_OBJECT_SIZES]; // Unordered list of large objects. struct gcobj_free_large *large_objects; uintptr_t base; @@ -107,36 +109,50 @@ struct mark_space { uintptr_t heap_base; size_t heap_size; uintptr_t sweep; + struct mutator_mark_buf *mutator_roots; void *mem; size_t mem_size; long count; struct marker marker; }; -struct heap { struct mark_space mark_space; }; -struct mutator { - struct heap *heap; - struct handle *roots; +struct heap { + struct mark_space mark_space; }; -static inline struct heap* mutator_heap(struct mutator *mut) { - return mut->heap; -} -static inline struct mark_space* heap_mark_space(struct heap *heap) { - return &heap->mark_space; -} -static inline struct mark_space* mutator_mark_space(struct mutator *mut) { - return heap_mark_space(mutator_heap(mut)); -} +struct mutator_mark_buf { + struct mutator_mark_buf *next; + size_t size; + size_t capacity; + struct gcobj **objects; +}; + +struct mutator { + // Segregated freelists of small objects. 
+ struct gcobj_freelists small_objects; + struct heap *heap; + struct handle *roots; + struct mutator_mark_buf mark_buf; +}; static inline struct marker* mark_space_marker(struct mark_space *space) { return &space->marker; } +static inline struct mark_space* heap_mark_space(struct heap *heap) { + return &heap->mark_space; +} +static inline struct heap* mutator_heap(struct mutator *mutator) { + return mutator->heap; +} +static inline struct mark_space* mutator_mark_space(struct mutator *mutator) { + return heap_mark_space(mutator_heap(mutator)); +} static inline struct gcobj_free** -get_small_object_freelist(struct mark_space *space, enum small_object_size kind) { +get_small_object_freelist(struct gcobj_freelists *freelists, + enum small_object_size kind) { ASSERT(kind < SMALL_OBJECT_SIZES); - return &space->small_objects[kind]; + return &freelists->by_size[kind]; } #define GC_HEADER uintptr_t _gc_header @@ -145,7 +161,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct mutator *mut) NEVER_INLINE; +static void collect(struct mark_space *space, struct mutator *mut) NEVER_INLINE; static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { ASSERT(space->heap_base <= (uintptr_t) obj); @@ -175,27 +191,109 @@ static inline void trace_one(struct gcobj *obj, void *mark_data) { } } -static void clear_freelists(struct mark_space *space) { +static void clear_small_freelists(struct gcobj_freelists *small) { for (int i = 0; i < SMALL_OBJECT_SIZES; i++) - space->small_objects[i] = NULL; + small->by_size[i] = NULL; +} +static void clear_mutator_freelists(struct mutator *mut) { + clear_small_freelists(&mut->small_objects); +} +static void clear_global_freelists(struct mark_space *space) { space->large_objects = NULL; } -static void collect(struct mutator *mut) { +static void add_mutator(struct heap *heap, struct mutator *mut) { + mut->heap = heap; +} + +static void remove_mutator(struct heap *heap, struct mutator *mut) { + mut->heap = NULL; +} + +static void mutator_mark_buf_grow(struct mutator_mark_buf *buf) { + size_t old_capacity = buf->capacity; + size_t old_bytes = old_capacity * sizeof(struct gcobj*); + + size_t new_bytes = old_bytes ? 
old_bytes * 2 : getpagesize(); + size_t new_capacity = new_bytes / sizeof(struct gcobj*); + + void *mem = mmap(NULL, new_bytes, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("allocating mutator mark buffer failed"); + abort(); + } + if (old_bytes) { + memcpy(mem, buf->objects, old_bytes); + munmap(buf->objects, old_bytes); + } + buf->objects = mem; + buf->capacity = new_capacity; +} + +static void mutator_mark_buf_push(struct mutator_mark_buf *buf, + struct gcobj *val) { + if (UNLIKELY(buf->size == buf->capacity)) + mutator_mark_buf_grow(buf); + buf->objects[buf->size++] = val; +} + +static void mutator_mark_buf_release(struct mutator_mark_buf *buf) { + size_t bytes = buf->size * sizeof(struct gcobj*); + if (bytes >= getpagesize()) + madvise(buf->objects, align_up(bytes, getpagesize()), MADV_DONTNEED); + buf->size = 0; +} + +static void mutator_mark_buf_destroy(struct mutator_mark_buf *buf) { + size_t bytes = buf->capacity * sizeof(struct gcobj*); + if (bytes) + munmap(buf->objects, bytes); +} + +static void mark_mutator_roots(struct mutator *mut) { struct mark_space *space = mutator_mark_space(mut); - DEBUG("start collect #%ld:\n", space->count); - marker_prepare(space); + struct mutator_mark_buf *local_roots = &mut->mark_buf; for (struct handle *h = mut->roots; h; h = h->next) { struct gcobj *root = h->v; if (root && mark_object(space, root)) - marker_enqueue_root(mark_space_marker(space), root); + mutator_mark_buf_push(local_roots, root); } + + // Post to global linked-list of thread roots. + struct mutator_mark_buf *next = space->mutator_roots; + local_roots->next = next; + space->mutator_roots = local_roots; +} + +static void release_mutator_roots(struct mutator *mut) { + mutator_mark_buf_release(&mut->mark_buf); +} + +static void mark_global_roots(struct mark_space *space) { + struct mutator_mark_buf *roots = space->mutator_roots; + for (; roots; roots = roots->next) + marker_enqueue_roots(&space->marker, roots->objects, roots->size); + space->mutator_roots = NULL; +} + +static void reset_sweeper(struct mark_space *space) { + space->sweep = space->heap_base; +} + +static void collect(struct mark_space *space, struct mutator *mut) { + DEBUG("start collect #%ld:\n", space->count); + marker_prepare(space); + mark_mutator_roots(mut); + mark_global_roots(space); marker_trace(space); marker_release(space); - DEBUG("done marking\n"); - space->sweep = space->heap_base; - clear_freelists(space); + clear_global_freelists(space); + reset_sweeper(space); space->count++; + release_mutator_roots(mut); + clear_mutator_freelists(mut); + DEBUG("collect done\n"); } static void push_free(struct gcobj_free **loc, struct gcobj_free *obj) { @@ -203,12 +301,12 @@ static void push_free(struct gcobj_free **loc, struct gcobj_free *obj) { *loc = obj; } -static void push_small(struct mark_space *space, void *region, +static void push_small(struct gcobj_freelists *small_objects, void *region, enum small_object_size kind, size_t region_granules) { uintptr_t addr = (uintptr_t) region; while (region_granules) { size_t granules = small_object_granule_sizes[kind]; - struct gcobj_free **loc = get_small_object_freelist(space, kind); + struct gcobj_free **loc = get_small_object_freelist(small_objects, kind); while (granules <= region_granules) { push_free(loc, (struct gcobj_free*) addr); region_granules -= granules; @@ -226,16 +324,19 @@ static void push_large(struct mark_space *space, void *region, size_t granules) space->large_objects = large; } -static void 
reclaim(struct mark_space *space, void *obj, size_t granules) { +static void reclaim(struct mark_space *space, + struct gcobj_freelists *small_objects, + void *obj, size_t granules) { if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) - push_small(space, obj, SMALL_OBJECT_SIZES - 1, granules); + push_small(small_objects, obj, SMALL_OBJECT_SIZES - 1, granules); else push_large(space, obj, granules); } static void split_large_object(struct mark_space *space, - struct gcobj_free_large *large, - size_t granules) { + struct gcobj_freelists *small_objects, + struct gcobj_free_large *large, + size_t granules) { size_t large_granules = large->granules; ASSERT(large_granules >= granules); ASSERT(granules >= LARGE_OBJECT_GRANULE_THRESHOLD); @@ -249,7 +350,7 @@ static void split_large_object(struct mark_space *space, return; char *tail = ((char*)large) + granules * GRANULE_SIZE; - reclaim(space, tail, large_granules - granules); + reclaim(space, small_objects, tail, large_granules - granules); } static void unlink_large_object(struct gcobj_free_large **prev, @@ -299,7 +400,8 @@ static size_t next_mark(const uint8_t *mark, size_t limit) { // Sweep some heap to reclaim free space. Return 1 if there is more // heap to sweep, or 0 if we reached the end. -static int sweep(struct mark_space *space, size_t for_granules) { +static int sweep(struct mark_space *space, + struct gcobj_freelists *small_objects, size_t for_granules) { // Sweep until we have reclaimed 128 granules (1024 kB), or we reach // the end of the heap. ssize_t to_reclaim = 128; @@ -318,7 +420,7 @@ static int sweep(struct mark_space *space, size_t for_granules) { if (free_granules) { size_t free_bytes = free_granules * GRANULE_SIZE; clear_memory(sweep + GRANULE_SIZE, free_bytes - GRANULE_SIZE); - reclaim(space, (void*)sweep, free_granules); + reclaim(space, small_objects, (void*)sweep, free_granules); sweep += free_bytes; to_reclaim -= free_granules; @@ -339,9 +441,11 @@ static int sweep(struct mark_space *space, size_t for_granules) { static void* allocate_large(struct mutator *mut, enum alloc_kind kind, size_t granules) { struct mark_space *space = mutator_mark_space(mut); + struct gcobj_freelists *small_objects = &mut->small_objects; + int swept_from_beginning = 0; - struct gcobj_free_large *already_scanned = NULL; while (1) { + struct gcobj_free_large *already_scanned = NULL; do { struct gcobj_free_large **prev = &space->large_objects; for (struct gcobj_free_large *large = space->large_objects; @@ -349,73 +453,107 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, prev = &large->next, large = large->next) { if (large->granules >= granules) { unlink_large_object(prev, large); - split_large_object(space, large, granules); + split_large_object(space, small_objects, large, granules); struct gcobj *obj = (struct gcobj *)large; obj->tag = tag_live(kind); return large; } } already_scanned = space->large_objects; - } while (sweep(mutator_mark_space(mut), granules)); + } while (sweep(space, small_objects, granules)); // No large object, and we swept across the whole heap. Collect. if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { - collect(mut); + collect(space, mut); swept_from_beginning = 1; } } } -static void fill_small(struct mutator *mut, enum small_object_size kind) { +static int fill_small_from_local(struct gcobj_freelists *small_objects, + enum small_object_size kind) { + // Precondition: the freelist for KIND is already empty. 
+ ASSERT(!*get_small_object_freelist(small_objects, kind)); + // See if there are small objects already on the freelists + // that can be split. + for (enum small_object_size next_kind = kind + 1; + next_kind < SMALL_OBJECT_SIZES; + next_kind++) { + struct gcobj_free **loc = get_small_object_freelist(small_objects, + next_kind); + if (*loc) { + struct gcobj_free *ret = *loc; + *loc = ret->next; + push_small(small_objects, ret, kind, + small_object_granule_sizes[next_kind]); + return 1; + } + } + return 0; +} + +static int fill_small_from_large(struct mark_space *space, + struct gcobj_freelists *small_objects, + enum small_object_size kind) { + // If there is a large object, take and split it. + struct gcobj_free_large *large = space->large_objects; + if (!large) + return 0; + + unlink_large_object(&space->large_objects, large); + ASSERT(large->granules >= LARGE_OBJECT_GRANULE_THRESHOLD); + split_large_object(space, small_objects, large, + LARGE_OBJECT_GRANULE_THRESHOLD); + push_small(small_objects, large, kind, LARGE_OBJECT_GRANULE_THRESHOLD); + return 1; +} + +static void fill_small_from_global(struct mutator *mut, + enum small_object_size kind) NEVER_INLINE; +static void fill_small_from_global(struct mutator *mut, + enum small_object_size kind) { + struct gcobj_freelists *small_objects = &mut->small_objects; struct mark_space *space = mutator_mark_space(mut); + int swept_from_beginning = 0; while (1) { - // First see if there are small objects already on the freelists - // that can be split. - for (enum small_object_size next_kind = kind; - next_kind < SMALL_OBJECT_SIZES; - next_kind++) { - struct gcobj_free **loc = get_small_object_freelist(space, next_kind); - if (*loc) { - if (kind != next_kind) { - struct gcobj_free *ret = *loc; - *loc = ret->next; - push_small(space, ret, kind, - small_object_granule_sizes[next_kind]); - } - return; - } - } + if (fill_small_from_large(space, small_objects, kind)) + break; - // Otherwise if there is a large object, take and split it. - struct gcobj_free_large *large = space->large_objects; - if (large) { - unlink_large_object(&space->large_objects, large); - split_large_object(space, large, LARGE_OBJECT_GRANULE_THRESHOLD); - push_small(space, large, kind, LARGE_OBJECT_GRANULE_THRESHOLD); - return; - } - - if (!sweep(mutator_mark_space(mut), LARGE_OBJECT_GRANULE_THRESHOLD)) { + if (!sweep(space, small_objects, LARGE_OBJECT_GRANULE_THRESHOLD)) { if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { - collect(mut); + collect(space, mut); swept_from_beginning = 1; } } + + if (*get_small_object_freelist(small_objects, kind)) + break; + if (fill_small_from_local(small_objects, kind)) + break; } } +static void fill_small(struct mutator *mut, enum small_object_size kind) { + // See if there are small objects already on the local freelists that + // can be split. 
+ if (fill_small_from_local(&mut->small_objects, kind)) + return; + + fill_small_from_global(mut, kind); +} + static inline void* allocate_small(struct mutator *mut, enum alloc_kind alloc_kind, enum small_object_size small_kind) { - struct mark_space *space = mutator_mark_space(mut); - struct gcobj_free **loc = get_small_object_freelist(space, small_kind); + struct gcobj_free **loc = + get_small_object_freelist(&mut->small_objects, small_kind); if (!*loc) fill_small(mut, small_kind); struct gcobj_free *ret = *loc; @@ -471,7 +609,6 @@ static int initialize_gc(size_t size, struct heap **heap, *heap = calloc(1, sizeof(struct heap)); if (!*heap) abort(); struct mark_space *space = heap_mark_space(*heap); - space->mem = mem; space->mem_size = size; // If there is 1 mark byte per granule, and SIZE bytes available for @@ -479,26 +616,22 @@ static int initialize_gc(size_t size, struct heap **heap, // // size = (granule_size + 1) / granule_size * heap_size // mark_bytes = 1/granule_size * heap_size - // mark_bytes = ceil(size / (granule_size + 1)) - space->mark_bytes = (uint8_t *)mem; + // mark_bytes = ceil(heap_size / (granule_size + 1)) + space->mark_bytes = (uint8_t *) mem; size_t mark_bytes_size = (size + GRANULE_SIZE) / (GRANULE_SIZE + 1); size_t overhead = align_up(mark_bytes_size, GRANULE_SIZE); space->heap_base = ((uintptr_t) mem) + overhead; space->heap_size = size - overhead; - - clear_freelists(space); space->sweep = space->heap_base + space->heap_size; - space->count = 0; if (!marker_init(space)) abort(); - reclaim(space, (void*)space->heap_base, size_to_granules(space->heap_size)); + reclaim(space, NULL, (void*)space->heap_base, + size_to_granules(space->heap_size)); *mut = calloc(1, sizeof(struct mutator)); if (!*mut) abort(); - (*mut)->heap = *heap; - (*mut)->roots = NULL; - + add_mutator(*heap, *mut); return 1; } From 14529f11e96cb4201c7aac71513e64e9c4f9482a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 10:07:24 +0200 Subject: [PATCH 051/403] mark-sweep: add global small object freelist This will be useful to collect data when sweeping, if a mutator doesn't need those objects. --- mark-sweep.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index c06f5a3f3..a54826138 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -102,6 +102,7 @@ struct gcobj { }; struct mark_space { + struct gcobj_freelists small_objects; // Unordered list of large objects. struct gcobj_free_large *large_objects; uintptr_t base; @@ -199,6 +200,7 @@ static void clear_mutator_freelists(struct mutator *mut) { clear_small_freelists(&mut->small_objects); } static void clear_global_freelists(struct mark_space *space) { + clear_small_freelists(&space->small_objects); space->large_objects = NULL; } @@ -511,6 +513,24 @@ static int fill_small_from_large(struct mark_space *space, return 1; } +static int fill_small_from_global_small(struct mark_space *space, + struct gcobj_freelists *small_objects, + enum small_object_size kind) { + struct gcobj_freelists *global_small = &space->small_objects; + if (*get_small_object_freelist(global_small, kind) + || fill_small_from_local(global_small, kind)) { + struct gcobj_free **src = get_small_object_freelist(global_small, kind); + ASSERT(*src); + struct gcobj_free **dst = get_small_object_freelist(small_objects, kind); + ASSERT(!*dst); + // FIXME: just take a few? 
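One way to act on that FIXME would be to splice off only a prefix of the global list instead of the whole thing, so that other mutators still find something there. A hedged sketch of such a helper, reusing the struct gcobj_free links the freelists in this file are already built from (this is not part of the patch, which currently transfers the entire list):

/* Move at most max_count nodes from the global freelist *src to the
   empty local freelist *dst; return how many were moved.  Sketch only. */
static size_t splice_some(struct gcobj_free **dst, struct gcobj_free **src,
                          size_t max_count) {
  struct gcobj_free *head = *src;
  if (!head || !max_count)
    return 0;
  struct gcobj_free *tail = head;
  size_t moved = 1;
  while (moved < max_count && tail->next) {
    tail = tail->next;
    moved++;
  }
  *src = tail->next;   /* the global list keeps whatever follows the prefix */
  tail->next = *dst;   /* the local list receives the spliced prefix */
  *dst = head;
  return moved;
}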
+ *dst = *src; + *src = NULL; + return 1; + } + return 0; +} + static void fill_small_from_global(struct mutator *mut, enum small_object_size kind) NEVER_INLINE; static void fill_small_from_global(struct mutator *mut, @@ -520,6 +540,9 @@ static void fill_small_from_global(struct mutator *mut, int swept_from_beginning = 0; while (1) { + if (fill_small_from_global_small(space, small_objects, kind)) + break; + if (fill_small_from_large(space, small_objects, kind)) break; @@ -648,7 +671,6 @@ static inline void print_start_gc_stats(struct heap *heap) { } static inline void print_end_gc_stats(struct heap *heap) { - struct mark_space *space = heap_mark_space(heap); - printf("Completed %ld collections\n", space->count); - printf("Heap size with overhead is %zd\n", space->mem_size); + printf("Completed %ld collections\n", heap_mark_space(heap)->count); + printf("Heap size with overhead is %zd\n", heap_mark_space(heap)->mem_size); } From ded3b3c7a3458bca023695f2381cfecebd047f2a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 14:25:28 +0200 Subject: [PATCH 052/403] Update parallel marker API to use struct gcobj --- parallel-marker.h | 119 ++++++++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 52 deletions(-) diff --git a/parallel-marker.h b/parallel-marker.h index 6240210aa..e39704852 100644 --- a/parallel-marker.h +++ b/parallel-marker.h @@ -17,10 +17,12 @@ // for Weak Memory Models" (Lê et al, PPoPP'13) // (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf). +struct gcobj; + struct mark_buf { unsigned log_size; size_t size; - atomic_uintptr_t *data; + struct gcobj **data; }; // Min size: 8 kB on 64-bit systems, 4 kB on 32-bit. @@ -32,7 +34,7 @@ static int mark_buf_init(struct mark_buf *buf, unsigned log_size) { ASSERT(log_size >= mark_buf_min_log_size); ASSERT(log_size <= mark_buf_max_log_size); - size_t size = (1 << log_size) * sizeof(uintptr_t); + size_t size = (1 << log_size) * sizeof(struct gcobj *); void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { @@ -53,7 +55,7 @@ mark_buf_size(struct mark_buf *buf) { static inline size_t mark_buf_byte_size(struct mark_buf *buf) { - return mark_buf_size(buf) * sizeof(uintptr_t); + return mark_buf_size(buf) * sizeof(struct gcobj *); } static void @@ -72,14 +74,14 @@ mark_buf_destroy(struct mark_buf *buf) { } } -static inline uintptr_t +static inline struct gcobj * mark_buf_get(struct mark_buf *buf, size_t i) { return atomic_load_explicit(&buf->data[i & (buf->size - 1)], memory_order_relaxed); } static inline void -mark_buf_put(struct mark_buf *buf, size_t i, uintptr_t o) { +mark_buf_put(struct mark_buf *buf, size_t i, struct gcobj * o) { return atomic_store_explicit(&buf->data[i & (buf->size - 1)], o, memory_order_relaxed); @@ -97,9 +99,6 @@ mark_buf_grow(struct mark_buf *from, struct mark_buf *to, return 1; } -static const uintptr_t mark_deque_empty = 0; -static const uintptr_t mark_deque_abort = 1; - // Chase-Lev work-stealing deque. One thread pushes data into the deque // at the bottom, and many threads compete to steal data from the top. 
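The discipline is that only the owning thread moves bottom, pushing and popping at that end, while any number of thieves advance top with a compare-and-swap to take the oldest entry; both indices grow without bound and are masked into a power-of-two ring buffer. A deliberately single-threaded sketch of just that index arithmetic, leaving out every memory-ordering detail that the papers cited above are actually about:

#include <stddef.h>
#include <stdio.h>

#define SKETCH_LOG_SIZE 3
#define SKETCH_SIZE (1 << SKETCH_LOG_SIZE)

struct sketch_deque {
  size_t bottom, top;        /* bottom belongs to the owner; top races among thieves */
  void *data[SKETCH_SIZE];   /* ring buffer; indices are masked on access */
};

/* Owner only: push at the bottom. */
static void sketch_push(struct sketch_deque *q, void *x) {
  q->data[q->bottom++ & (SKETCH_SIZE - 1)] = x;
}
/* Owner only: pop the most recently pushed entry (LIFO end). */
static void *sketch_pop(struct sketch_deque *q) {
  if (q->bottom == q->top)
    return NULL;
  return q->data[--q->bottom & (SKETCH_SIZE - 1)];
}
/* Thieves: take the oldest entry (FIFO end); in the real deque this is a CAS on top. */
static void *sketch_steal(struct sketch_deque *q) {
  if (q->top == q->bottom)
    return NULL;
  return q->data[q->top++ & (SKETCH_SIZE - 1)];
}

int main(void) {
  struct sketch_deque q = { 0, 0, { NULL } };
  int a = 1, b = 2, c = 3;
  sketch_push(&q, &a); sketch_push(&q, &b); sketch_push(&q, &c);
  printf("stolen: %d\n", *(int *) sketch_steal(&q));  /* oldest: 1 */
  printf("popped: %d\n", *(int *) sketch_pop(&q));    /* newest: 3 */
  return 0;
}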
struct mark_deque { @@ -158,7 +157,7 @@ mark_deque_grow(struct mark_deque *q, int cur, size_t b, size_t t) { } static void -mark_deque_push(struct mark_deque *q, uintptr_t x) { +mark_deque_push(struct mark_deque *q, struct gcobj * x) { size_t b = LOAD_RELAXED(&q->bottom); size_t t = LOAD_ACQUIRE(&q->top); int active = LOAD_RELAXED(&q->active); @@ -171,7 +170,22 @@ mark_deque_push(struct mark_deque *q, uintptr_t x) { STORE_RELAXED(&q->bottom, b + 1); } -static uintptr_t +static void +mark_deque_push_many(struct mark_deque *q, struct gcobj **objv, size_t count) { + size_t b = LOAD_RELAXED(&q->bottom); + size_t t = LOAD_ACQUIRE(&q->top); + int active = LOAD_RELAXED(&q->active); + + while (b - t > mark_buf_size(&q->bufs[active]) - count) /* Full queue. */ + active = mark_deque_grow(q, active, b, t); + + for (size_t i = 0; i < count; i++) + mark_buf_put(&q->bufs[active], b + i, objv[i]); + atomic_thread_fence(memory_order_release); + STORE_RELAXED(&q->bottom, b + count); +} + +static struct gcobj * mark_deque_try_pop(struct mark_deque *q) { size_t b = LOAD_RELAXED(&q->bottom); b = b - 1; @@ -179,7 +193,7 @@ mark_deque_try_pop(struct mark_deque *q) { STORE_RELAXED(&q->bottom, b); atomic_thread_fence(memory_order_seq_cst); size_t t = LOAD_RELAXED(&q->top); - uintptr_t x; + struct gcobj * x; if (t <= b) { // Non-empty queue. x = mark_buf_get(&q->bufs[active], b); if (t == b) { // Single last element in queue. @@ -187,32 +201,33 @@ mark_deque_try_pop(struct mark_deque *q) { memory_order_seq_cst, memory_order_relaxed)) // Failed race. - x = mark_deque_empty; + x = NULL; STORE_RELAXED(&q->bottom, b + 1); } } else { // Empty queue. - x = mark_deque_empty; + x = NULL; STORE_RELAXED(&q->bottom, b + 1); } return x; } -static uintptr_t +static struct gcobj * mark_deque_steal(struct mark_deque *q) { - size_t t = LOAD_ACQUIRE(&q->top); - atomic_thread_fence(memory_order_seq_cst); - size_t b = LOAD_ACQUIRE(&q->bottom); - uintptr_t x = mark_deque_empty; - if (t < b) { // Non-empty queue. + while (1) { + size_t t = LOAD_ACQUIRE(&q->top); + atomic_thread_fence(memory_order_seq_cst); + size_t b = LOAD_ACQUIRE(&q->bottom); + if (t >= b) + return NULL; int active = LOAD_CONSUME(&q->active); - x = mark_buf_get(&q->bufs[active], t); + struct gcobj *x = x = mark_buf_get(&q->bufs[active], t); if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, memory_order_seq_cst, memory_order_relaxed)) // Failed race. 
- return mark_deque_abort; + continue; + return x; } - return x; } static int @@ -235,7 +250,7 @@ mark_deque_can_steal(struct mark_deque *q) { struct local_mark_queue { size_t read; size_t write; - uintptr_t data[LOCAL_MARK_QUEUE_SIZE]; + struct gcobj * data[LOCAL_MARK_QUEUE_SIZE]; }; static inline void @@ -259,10 +274,10 @@ local_mark_queue_full(struct local_mark_queue *q) { return local_mark_queue_size(q) >= LOCAL_MARK_QUEUE_SIZE; } static inline void -local_mark_queue_push(struct local_mark_queue *q, uintptr_t v) { +local_mark_queue_push(struct local_mark_queue *q, struct gcobj * v) { q->data[q->write++ & LOCAL_MARK_QUEUE_MASK] = v; } -static inline uintptr_t +static inline struct gcobj * local_mark_queue_pop(struct local_mark_queue *q) { return q->data[q->read++ & LOCAL_MARK_QUEUE_MASK]; } @@ -455,40 +470,33 @@ marker_visit(void **loc, void *mark_data) { if (obj && mark_object(mark->space, obj)) { if (local_mark_queue_full(&mark->local)) marker_share(mark); - local_mark_queue_push(&mark->local, (uintptr_t)obj); + local_mark_queue_push(&mark->local, obj); } } -static uintptr_t +static struct gcobj * marker_steal_from_worker(struct marker *marker, size_t id) { ASSERT(id < marker->worker_count); - while (1) { - uintptr_t res = mark_deque_steal(&marker->workers[id].deque); - if (res == mark_deque_empty) - return 0; - if (res == mark_deque_abort) - continue; - return res; - } + return mark_deque_steal(&marker->workers[id].deque); } -static uintptr_t +static int marker_can_steal_from_worker(struct marker *marker, size_t id) { ASSERT(id < marker->worker_count); return mark_deque_can_steal(&marker->workers[id].deque); } -static uintptr_t +static struct gcobj * mark_worker_steal_from_any(struct mark_worker *worker, struct marker *marker) { size_t steal_id = worker->steal_id; for (size_t i = 0; i < marker->worker_count; i++) { steal_id = (steal_id + 1) % marker->worker_count; DEBUG("marker #%zu: stealing from #%zu\n", worker->id, steal_id); - uintptr_t addr = marker_steal_from_worker(marker, steal_id); - if (addr) { - DEBUG("marker #%zu: stealing got 0x%zx\n", worker->id, addr); + struct gcobj * obj = marker_steal_from_worker(marker, steal_id); + if (obj) { + DEBUG("marker #%zu: stealing got %p\n", worker->id, obj); worker->steal_id = steal_id; - return addr; + return obj; } } DEBUG("marker #%zu: failed to steal\n", worker->id); @@ -548,19 +556,19 @@ mark_worker_check_termination(struct mark_worker *worker, } } -static uintptr_t +static struct gcobj * mark_worker_steal(struct local_marker *mark) { struct marker *marker = mark_space_marker(mark->space); struct mark_worker *worker = mark->worker; while (1) { DEBUG("marker #%zu: trying to steal\n", worker->id); - uintptr_t addr = mark_worker_steal_from_any(worker, marker); - if (addr) - return addr; + struct gcobj *obj = mark_worker_steal_from_any(worker, marker); + if (obj) + return obj; if (mark_worker_check_termination(worker, marker)) - return 0; + return NULL; } } @@ -575,15 +583,15 @@ mark_worker_mark(struct mark_worker *worker) { size_t n = 0; DEBUG("marker #%zu: running mark loop\n", worker->id); while (1) { - uintptr_t addr; + struct gcobj * obj; if (!local_mark_queue_empty(&mark.local)) { - addr = local_mark_queue_pop(&mark.local); + obj = local_mark_queue_pop(&mark.local); } else { - addr = mark_worker_steal(&mark); - if (!addr) + obj = mark_worker_steal(&mark); + if (!obj) break; } - trace_one((struct gcobj*)addr, &mark); + trace_one(obj, &mark); n++; } DEBUG("marker #%zu: done marking, %zu objects traced\n", worker->id, n); @@ -594,7 
+602,14 @@ mark_worker_mark(struct mark_worker *worker) { static inline void marker_enqueue_root(struct marker *marker, struct gcobj *obj) { struct mark_deque *worker0_deque = &marker->workers[0].deque; - mark_deque_push(worker0_deque, (uintptr_t)obj); + mark_deque_push(worker0_deque, obj); +} + +static inline void +marker_enqueue_roots(struct marker *marker, struct gcobj **objv, + size_t count) { + struct mark_deque *worker0_deque = &marker->workers[0].deque; + mark_deque_push_many(worker0_deque, objv, count); } static inline void From e837d51f530fdbe83da7cc8f5b48ba8358f75587 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 14:26:18 +0200 Subject: [PATCH 053/403] mark-sweep collector allows parallel mutators --- mark-sweep.h | 188 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 171 insertions(+), 17 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index a54826138..a59a4e67f 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -1,3 +1,4 @@ +#include #include #include #include @@ -102,6 +103,13 @@ struct gcobj { }; struct mark_space { + pthread_mutex_t lock; + pthread_cond_t collector_cond; + pthread_cond_t mutator_cond; + int collecting; + int multithreaded; + size_t active_mutator_count; + size_t mutator_count; struct gcobj_freelists small_objects; // Unordered list of large objects. struct gcobj_free_large *large_objects; @@ -110,6 +118,7 @@ struct mark_space { uintptr_t heap_base; size_t heap_size; uintptr_t sweep; + struct handle *global_roots; struct mutator_mark_buf *mutator_roots; void *mem; size_t mem_size; @@ -204,12 +213,61 @@ static void clear_global_freelists(struct mark_space *space) { space->large_objects = NULL; } +static int space_has_multiple_mutators(struct mark_space *space) { + return atomic_load_explicit(&space->multithreaded, memory_order_relaxed); +} + +static int mutators_are_stopping(struct mark_space *space) { + return atomic_load_explicit(&space->collecting, memory_order_relaxed); +} + +static inline void mark_space_lock(struct mark_space *space) { + pthread_mutex_lock(&space->lock); +} +static inline void mark_space_unlock(struct mark_space *space) { + pthread_mutex_unlock(&space->lock); +} + static void add_mutator(struct heap *heap, struct mutator *mut) { mut->heap = heap; + struct mark_space *space = heap_mark_space(heap); + mark_space_lock(space); + // We have no roots. If there is a GC currently in progress, we have + // nothing to add. Just wait until it's done. + while (mutators_are_stopping(space)) + pthread_cond_wait(&space->mutator_cond, &space->lock); + if (space->mutator_count == 1) + space->multithreaded = 1; + space->active_mutator_count++; + space->mutator_count++; + mark_space_unlock(space); } static void remove_mutator(struct heap *heap, struct mutator *mut) { mut->heap = NULL; + struct mark_space *space = heap_mark_space(heap); + mark_space_lock(space); + space->active_mutator_count--; + space->mutator_count--; + // We have no roots. If there is a GC stop currently in progress, + // maybe tell the controller it can continue. 
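The signal mentioned above is one half of a handshake this patch uses in several places: the collector raises a collecting flag and waits on collector_cond until active_mutator_count drops to zero, while each mutator that notices the flag decrements the count, signals the collector if it was the last one running, and then sleeps on mutator_cond until that particular collection finishes. Stripped of roots and freelists, the protocol might be modelled like this (a sketch for orientation, not the collector's code):

#include <pthread.h>
#include <stddef.h>

struct rendezvous {
  pthread_mutex_t lock;
  pthread_cond_t collector_cond;  /* signalled when the last mutator parks */
  pthread_cond_t mutator_cond;    /* broadcast when a collection finishes */
  int collecting;
  size_t active;                  /* mutators currently running */
  long epoch;                     /* number of completed collections */
};

/* Collector, with lock held, after setting r->collecting = 1. */
static void wait_for_quiescence(struct rendezvous *r) {
  while (r->active)
    pthread_cond_wait(&r->collector_cond, &r->lock);
}

/* Mutator that observed r->collecting, with lock held. */
static void park_until_resumed(struct rendezvous *r) {
  if (--r->active == 0)
    pthread_cond_signal(&r->collector_cond);
  long epoch = r->epoch;   /* distinguishes this collection from the next one */
  do
    pthread_cond_wait(&r->mutator_cond, &r->lock);
  while (r->collecting && r->epoch == epoch);
  r->active++;
}

/* Collector, with lock held, once the heap is ready for mutators again. */
static void resume_mutators(struct rendezvous *r) {
  r->epoch++;
  r->collecting = 0;
  pthread_cond_broadcast(&r->mutator_cond);
}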
+ if (mutators_are_stopping(space) && space->active_mutator_count == 0) + pthread_cond_signal(&space->collector_cond); + mark_space_unlock(space); +} + +static void request_mutators_to_stop(struct mark_space *space) { + ASSERT(!mutators_are_stopping(space)); + atomic_store_explicit(&space->collecting, 1, memory_order_relaxed); +} + +static void allow_mutators_to_continue(struct mark_space *space) { + ASSERT(mutators_are_stopping(space)); + ASSERT(space->active_mutator_count == 0); + space->active_mutator_count++; + atomic_store_explicit(&space->collecting, 0, memory_order_relaxed); + ASSERT(!mutators_are_stopping(space)); + pthread_cond_broadcast(&space->mutator_cond); } static void mutator_mark_buf_grow(struct mutator_mark_buf *buf) { @@ -253,7 +311,9 @@ static void mutator_mark_buf_destroy(struct mutator_mark_buf *buf) { munmap(buf->objects, bytes); } -static void mark_mutator_roots(struct mutator *mut) { +// Mark the roots of a mutator that is stopping for GC. We can't +// enqueue them directly, so we send them to the controller in a buffer. +static void mark_stopping_mutator_roots(struct mutator *mut) { struct mark_space *space = mutator_mark_space(mut); struct mutator_mark_buf *local_roots = &mut->mark_buf; for (struct handle *h = mut->roots; h; h = h->next) { @@ -263,20 +323,78 @@ static void mark_mutator_roots(struct mutator *mut) { } // Post to global linked-list of thread roots. - struct mutator_mark_buf *next = space->mutator_roots; - local_roots->next = next; - space->mutator_roots = local_roots; + struct mutator_mark_buf *next = + atomic_load_explicit(&space->mutator_roots, memory_order_acquire); + do { + local_roots->next = next; + } while (!atomic_compare_exchange_weak(&space->mutator_roots, + &next, local_roots)); } -static void release_mutator_roots(struct mutator *mut) { +// Mark the roots of the mutator that causes GC. +static void mark_controlling_mutator_roots(struct mutator *mut) { + struct mark_space *space = mutator_mark_space(mut); + for (struct handle *h = mut->roots; h; h = h->next) { + struct gcobj *root = h->v; + if (root && mark_object(space, root)) + marker_enqueue_root(&space->marker, root); + } +} + +static void release_stopping_mutator_roots(struct mutator *mut) { mutator_mark_buf_release(&mut->mark_buf); } +static void wait_for_mutators_to_stop(struct mark_space *space) { + space->active_mutator_count--; + while (space->active_mutator_count) + pthread_cond_wait(&space->collector_cond, &space->lock); +} + static void mark_global_roots(struct mark_space *space) { - struct mutator_mark_buf *roots = space->mutator_roots; + for (struct handle *h = space->global_roots; h; h = h->next) { + struct gcobj *obj = h->v; + if (obj && mark_object(space, obj)) + marker_enqueue_root(&space->marker, obj); + } + + struct mutator_mark_buf *roots = atomic_load(&space->mutator_roots); for (; roots; roots = roots->next) marker_enqueue_roots(&space->marker, roots->objects, roots->size); - space->mutator_roots = NULL; + atomic_store(&space->mutator_roots, NULL); +} + +static void pause_mutator_for_collection(struct mutator *mut) NEVER_INLINE; +static void pause_mutator_for_collection(struct mutator *mut) { + struct mark_space *space = mutator_mark_space(mut); + ASSERT(mutators_are_stopping(space)); + mark_stopping_mutator_roots(mut); + mark_space_lock(space); + ASSERT(space->active_mutator_count); + space->active_mutator_count--; + if (space->active_mutator_count == 0) + pthread_cond_signal(&space->collector_cond); + + // Go to sleep and wake up when the collector is done. 
Note, + // however, that it may be that some other mutator manages to + // trigger collection before we wake up. In that case we need to + // mark roots, not just sleep again. To detect a wakeup on this + // collection vs a future collection, we use the global GC count. + // This is safe because the count is protected by the space lock, + // which we hold. + long epoch = space->count; + do + pthread_cond_wait(&space->mutator_cond, &space->lock); + while (mutators_are_stopping(space) && space->count == epoch); + + space->active_mutator_count++; + mark_space_unlock(space); + release_stopping_mutator_roots(mut); +} + +static inline void maybe_pause_mutator_for_collection(struct mutator *mut) { + while (mutators_are_stopping(mutator_mark_space(mut))) + pause_mutator_for_collection(mut); } static void reset_sweeper(struct mark_space *space) { @@ -286,14 +404,16 @@ static void reset_sweeper(struct mark_space *space) { static void collect(struct mark_space *space, struct mutator *mut) { DEBUG("start collect #%ld:\n", space->count); marker_prepare(space); - mark_mutator_roots(mut); + request_mutators_to_stop(space); + mark_controlling_mutator_roots(mut); + wait_for_mutators_to_stop(space); mark_global_roots(space); marker_trace(space); marker_release(space); clear_global_freelists(space); reset_sweeper(space); space->count++; - release_mutator_roots(mut); + allow_mutators_to_continue(space); clear_mutator_freelists(mut); DEBUG("collect done\n"); } @@ -443,8 +563,12 @@ static int sweep(struct mark_space *space, static void* allocate_large(struct mutator *mut, enum alloc_kind kind, size_t granules) { struct mark_space *space = mutator_mark_space(mut); - struct gcobj_freelists *small_objects = &mut->small_objects; + struct gcobj_freelists *small_objects = space_has_multiple_mutators(space) ? 
+ &space->small_objects : &mut->small_objects; + maybe_pause_mutator_for_collection(mut); + + mark_space_lock(space); int swept_from_beginning = 0; while (1) { struct gcobj_free_large *already_scanned = NULL; @@ -456,6 +580,7 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, if (large->granules >= granules) { unlink_large_object(prev, large); split_large_object(space, small_objects, large, granules); + mark_space_unlock(space); struct gcobj *obj = (struct gcobj *)large; obj->tag = tag_live(kind); return large; @@ -469,7 +594,13 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { - collect(space, mut); + if (mutators_are_stopping(space)) { + mark_space_unlock(space); + pause_mutator_for_collection(mut); + mark_space_lock(space); + } else { + collect(space, mut); + } swept_from_beginning = 1; } } @@ -497,6 +628,7 @@ static int fill_small_from_local(struct gcobj_freelists *small_objects, return 0; } +// with space lock static int fill_small_from_large(struct mark_space *space, struct gcobj_freelists *small_objects, enum small_object_size kind) { @@ -516,6 +648,8 @@ static int fill_small_from_large(struct mark_space *space, static int fill_small_from_global_small(struct mark_space *space, struct gcobj_freelists *small_objects, enum small_object_size kind) { + if (!space_has_multiple_mutators(space)) + return 0; struct gcobj_freelists *global_small = &space->small_objects; if (*get_small_object_freelist(global_small, kind) || fill_small_from_local(global_small, kind)) { @@ -538,6 +672,9 @@ static void fill_small_from_global(struct mutator *mut, struct gcobj_freelists *small_objects = &mut->small_objects; struct mark_space *space = mutator_mark_space(mut); + maybe_pause_mutator_for_collection(mut); + + mark_space_lock(space); int swept_from_beginning = 0; while (1) { if (fill_small_from_global_small(space, small_objects, kind)) @@ -551,7 +688,13 @@ static void fill_small_from_global(struct mutator *mut, fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { - collect(space, mut); + if (mutators_are_stopping(space)) { + mark_space_unlock(space); + pause_mutator_for_collection(mut); + mark_space_lock(space); + } else { + collect(space, mut); + } swept_from_beginning = 1; } } @@ -561,6 +704,7 @@ static void fill_small_from_global(struct mutator *mut, if (fill_small_from_local(small_objects, kind)) break; } + mark_space_unlock(space); } static void fill_small(struct mutator *mut, enum small_object_size kind) { @@ -644,6 +788,10 @@ static int initialize_gc(size_t size, struct heap **heap, size_t mark_bytes_size = (size + GRANULE_SIZE) / (GRANULE_SIZE + 1); size_t overhead = align_up(mark_bytes_size, GRANULE_SIZE); + pthread_mutex_init(&space->lock, NULL); + pthread_cond_init(&space->mutator_cond, NULL); + pthread_cond_init(&space->collector_cond, NULL); + space->heap_base = ((uintptr_t) mem) + overhead; space->heap_size = size - overhead; space->sweep = space->heap_base + space->heap_size; @@ -659,12 +807,18 @@ static int initialize_gc(size_t size, struct heap **heap, } static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, - struct heap *parent) { - fprintf(stderr, - "Multiple mutator threads not yet implemented.\n"); - exit(1); + struct heap *heap) { + struct mutator *ret = calloc(1, sizeof(struct mutator)); + if (!ret) + abort(); + add_mutator(heap, ret); + return ret; } -static void finish_gc_for_thread(struct 
mutator *heap) { + +static void finish_gc_for_thread(struct mutator *mut) { + remove_mutator(mutator_heap(mut), mut); + mutator_mark_buf_destroy(&mut->mark_buf); + free(mut); } static inline void print_start_gc_stats(struct heap *heap) { From ac57e01e3101017289bcb193fcc1f7820704ef9c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 14:51:20 +0200 Subject: [PATCH 054/403] BDW doesn't have mutator-local freelists for pointerless objects --- bdw.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/bdw.h b/bdw.h index d0d24f7e2..e60802887 100644 --- a/bdw.h +++ b/bdw.h @@ -34,7 +34,6 @@ struct heap { struct mutator { void *freelists[GC_INLINE_FREELIST_COUNT]; - void *pointerless_freelists[GC_INLINE_FREELIST_COUNT]; struct heap *heap; }; @@ -94,13 +93,10 @@ static inline void* allocate(struct mutator *mut, enum alloc_kind kind, static inline void* allocate_pointerless(struct mutator *mut, enum alloc_kind kind, size_t size) { - size_t idx = gc_inline_bytes_to_freelist_index(size); - - if (UNLIKELY (idx >= GC_INLINE_FREELIST_COUNT)) - return GC_malloc_atomic(size); - - return allocate_small(&mut->pointerless_freelists[idx], idx, - GC_INLINE_KIND_POINTERLESS); + // Because the BDW API requires us to implement a custom marker so + // that the pointerless freelist gets traced, even though it's in a + // pointerless region, we punt on thread-local pointerless freelists. + return GC_malloc_atomic(size); } static inline void collect(struct mutator *mut) { @@ -138,6 +134,7 @@ static int initialize_gc(size_t heap_size, struct heap **heap, GC_set_max_heap_size (heap_size); GC_expand_hp(heap_size - current_heap_size); } + GC_allow_register_threads(); *heap = GC_malloc(sizeof(struct heap)); pthread_mutex_init(&(*heap)->lock, NULL); *mutator = add_mutator(*heap); From 5522d827e3c5137c5f15fc17c237390b25b793ca Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 14:51:36 +0200 Subject: [PATCH 055/403] mt-gcbench: write the "j" field in the binary tree nodes. --- mt-gcbench.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mt-gcbench.c b/mt-gcbench.c index 8f6d007e5..28dc92267 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -141,6 +141,8 @@ static void populate(struct mutator *mut, int depth, Node *node) { set_field((void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); + // i is 0 because the memory is zeroed. + HANDLE_REF(self)->j = depth; populate(mut, depth-1, HANDLE_REF(self)->left); populate(mut, depth-1, HANDLE_REF(self)->right); @@ -163,6 +165,8 @@ static Node* make_tree(struct mutator *mut, int depth) { Node *result = allocate_node(mut); init_field((void**)&result->left, HANDLE_REF(left)); init_field((void**)&result->right, HANDLE_REF(right)); + // i is 0 because the memory is zeroed. + result->j = depth; POP_HANDLE(mut); POP_HANDLE(mut); @@ -173,7 +177,7 @@ static Node* make_tree(struct mutator *mut, int depth) { static void validate_tree(Node *tree, int depth) { #ifndef NDEBUG ASSERT_EQ(tree->i, 0); - ASSERT_EQ(tree->j, 0); + ASSERT_EQ(tree->j, depth); if (depth == 0) { ASSERT(!tree->left); ASSERT(!tree->right); @@ -278,7 +282,7 @@ static void* run_one_test(struct mutator *mut) { // Fake reference to LongLivedTree and array to keep them from being optimized // away. 
- if (HANDLE_REF(long_lived_tree) == 0 + if (HANDLE_REF(long_lived_tree)->i != 0 || HANDLE_REF(array)->values[1000] != 1.0/1000) fprintf(stderr, "Failed\n"); From d879a01913697bd697b63d7feac2dd3cc43b4d0a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 14:58:54 +0200 Subject: [PATCH 056/403] Remove gcbench in favor of mt-gcbench. Update quads --- Makefile | 2 +- gcbench.c | 294 --------------------------------------------------- mark-sweep.h | 1 + quads.c | 36 ++++--- 4 files changed, 23 insertions(+), 310 deletions(-) delete mode 100644 gcbench.c diff --git a/Makefile b/Makefile index 6237ac936..9ef8b8f85 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -TESTS=gcbench quads mt-gcbench # MT_GCBench MT_GCBench2 +TESTS=quads mt-gcbench # MT_GCBench MT_GCBench2 COLLECTORS=bdw semi mark-sweep parallel-mark-sweep CC=gcc diff --git a/gcbench.c b/gcbench.c deleted file mode 100644 index 0b530669c..000000000 --- a/gcbench.c +++ /dev/null @@ -1,294 +0,0 @@ -// This is adapted from a benchmark written by John Ellis and Pete Kovac -// of Post Communications. -// It was modified by Hans Boehm of Silicon Graphics. -// Translated to C++ 30 May 1997 by William D Clinger of Northeastern Univ. -// Translated to C 15 March 2000 by Hans Boehm, now at HP Labs. -// -// This is no substitute for real applications. No actual application -// is likely to behave in exactly this way. However, this benchmark was -// designed to be more representative of real applications than other -// Java GC benchmarks of which we are aware. -// It attempts to model those properties of allocation requests that -// are important to current GC techniques. -// It is designed to be used either to obtain a single overall performance -// number, or to give a more detailed estimate of how collector -// performance varies with object lifetimes. It prints the time -// required to allocate and collect balanced binary trees of various -// sizes. Smaller trees result in shorter object lifetimes. Each cycle -// allocates roughly the same amount of memory. -// Two data structures are kept around during the entire process, so -// that the measured performance is representative of applications -// that maintain some live in-memory data. One of these is a tree -// containing many pointers. The other is a large array containing -// double precision floating point numbers. Both should be of comparable -// size. -// -// The results are only really meaningful together with a specification -// of how much memory was used. It is possible to trade memory for -// better time performance. This benchmark should be run in a 32 MB -// heap, though we don't currently know how to enforce that uniformly. -// -// Unlike the original Ellis and Kovac benchmark, we do not attempt -// measure pause times. This facility should eventually be added back -// in. There are several reasons for omitting it for now. The original -// implementation depended on assumptions about the thread scheduler -// that don't hold uniformly. The results really measure both the -// scheduler and GC. Pause time measurements tend to not fit well with -// current benchmark suites. As far as we know, none of the current -// commercial Java implementations seriously attempt to minimize GC pause -// times. 
- -#include -#include -#include - -#include "assert.h" -#include "gcbench-types.h" -#include "gc.h" - -static const int long_lived_tree_depth = 16; // about 4Mb -static const int array_size = 500000; // about 4Mb -static const int min_tree_depth = 4; -static const int max_tree_depth = 16; - -struct Node { - GC_HEADER; - struct Node * left; - struct Node * right; - int i, j; -}; - -struct DoubleArray { - GC_HEADER; - size_t length; - double values[0]; -}; - -static inline size_t node_size(Node *obj) { - return sizeof(Node); -} -static inline size_t double_array_size(DoubleArray *array) { - return sizeof(*array) + array->length * sizeof(double); -} -static inline void -visit_node_fields(Node *node, - void (*visit)(void **loc, void *visit_data), - void *visit_data) { - visit((void**)&node->left, visit_data); - visit((void**)&node->right, visit_data); -} -static inline void -visit_double_array_fields(DoubleArray *obj, - void (*visit)(void **loc, void *visit_data), - void *visit_data) { -} - -typedef HANDLE_TO(Node) NodeHandle; -typedef HANDLE_TO(DoubleArray) DoubleArrayHandle; - -static Node* allocate_node(struct context *cx) { - // memset to 0 by the collector. - return allocate(cx, ALLOC_KIND_NODE, sizeof (Node)); -} - -static DoubleArray* allocate_double_array(struct context *cx, - size_t size) { - // May be uninitialized. - DoubleArray *ret = - allocate_pointerless(cx, ALLOC_KIND_DOUBLE_ARRAY, - sizeof(DoubleArray) + sizeof (double) * size); - ret->length = size; - return ret; -} - -static unsigned long current_time(void) -{ - struct timeval t = { 0 }; - gettimeofday(&t, NULL); - return t.tv_sec * 1000 * 1000 + t.tv_usec; -} - -static double elapsed_millis(unsigned long start) { - return (current_time() - start) * 1e-3; -} - -// Nodes used by a tree of a given size -static int tree_size(int i) { - return ((1 << (i + 1)) - 1); -} - -// Number of iterations to use for a given tree depth -static int compute_num_iters(int i) { - return 2 * tree_size(max_tree_depth + 2) / tree_size(i); -} - -// Build tree top down, assigning to older objects. 
-static void populate(struct context *cx, int depth, Node *node) { - if (depth <= 0) - return; - - NodeHandle self = { node }; - PUSH_HANDLE(cx, self); - NodeHandle l = { allocate_node(cx) }; - PUSH_HANDLE(cx, l); - NodeHandle r = { allocate_node(cx) }; - PUSH_HANDLE(cx, r); - - set_field((void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); - set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); - - populate(cx, depth-1, HANDLE_REF(self)->left); - populate(cx, depth-1, HANDLE_REF(self)->right); - - POP_HANDLE(cx); - POP_HANDLE(cx); - POP_HANDLE(cx); -} - -// Build tree bottom-up -static Node* make_tree(struct context *cx, int depth) { - if (depth <= 0) - return allocate_node(cx); - - NodeHandle left = { make_tree(cx, depth-1) }; - PUSH_HANDLE(cx, left); - NodeHandle right = { make_tree(cx, depth-1) }; - PUSH_HANDLE(cx, right); - - Node *result = allocate_node(cx); - init_field((void**)&result->left, HANDLE_REF(left)); - init_field((void**)&result->right, HANDLE_REF(right)); - - POP_HANDLE(cx); - POP_HANDLE(cx); - - return result; -} - -static void validate_tree(Node *tree, int depth) { -#ifndef NDEBUG - ASSERT_EQ(tree->i, 0); - ASSERT_EQ(tree->j, 0); - if (depth == 0) { - ASSERT(!tree->left); - ASSERT(!tree->right); - } else { - ASSERT(tree->left); - ASSERT(tree->right); - validate_tree(tree->left, depth - 1); - validate_tree(tree->right, depth - 1); - } -#endif -} - -static void time_construction(struct context *cx, int depth) { - int num_iters = compute_num_iters(depth); - NodeHandle temp_tree = { NULL }; - PUSH_HANDLE(cx, temp_tree); - - printf("Creating %d trees of depth %d\n", num_iters, depth); - - { - unsigned long start = current_time(); - for (int i = 0; i < num_iters; ++i) { - HANDLE_SET(temp_tree, allocate_node(cx)); - populate(cx, depth, HANDLE_REF(temp_tree)); - validate_tree(HANDLE_REF(temp_tree), depth); - HANDLE_SET(temp_tree, NULL); - } - printf("\tTop down construction took %.3f msec\n", - elapsed_millis(start)); - } - - { - long start = current_time(); - for (int i = 0; i < num_iters; ++i) { - HANDLE_SET(temp_tree, make_tree(cx, depth)); - validate_tree(HANDLE_REF(temp_tree), depth); - HANDLE_SET(temp_tree, NULL); - } - printf("\tBottom up construction took %.3f msec\n", - elapsed_millis(start)); - } - - POP_HANDLE(cx); -} - -int main(int argc, char *argv[]) { - // Define size of Node without any GC header. 
- size_t sizeof_node = 2 * sizeof(Node*) + 2 * sizeof(int); - size_t sizeof_double_array = sizeof(size_t); - size_t heap_max_live = - tree_size(long_lived_tree_depth) * sizeof_node + - tree_size(max_tree_depth) * sizeof_node + - sizeof_double_array + sizeof(double) * array_size; - if (argc != 2) { - fprintf(stderr, "usage: %s MULTIPLIER\n", argv[0]); - return 1; - } - - double multiplier = atof(argv[1]); - - if (!(1.0 < multiplier && multiplier < 100)) { - fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]); - return 1; - } - - size_t heap_size = heap_max_live * multiplier; - struct context *cx = initialize_gc(heap_size); - if (!cx) { - fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", - heap_size); - return 1; - } - - NodeHandle root = { NULL }; - NodeHandle long_lived_tree = { NULL }; - NodeHandle temp_tree = { NULL }; - DoubleArrayHandle array = { NULL }; - - PUSH_HANDLE(cx, root); - PUSH_HANDLE(cx, long_lived_tree); - PUSH_HANDLE(cx, temp_tree); - PUSH_HANDLE(cx, array); - - printf("Garbage Collector Test\n"); - printf(" Live storage will peak at %zd bytes.\n\n", heap_max_live); - print_start_gc_stats(cx); - - unsigned long start = current_time(); - - // Create a long lived object - printf(" Creating a long-lived binary tree of depth %d\n", - long_lived_tree_depth); - HANDLE_SET(long_lived_tree, allocate_node(cx)); - populate(cx, long_lived_tree_depth, HANDLE_REF(long_lived_tree)); - - // Create long-lived array, filling half of it - printf(" Creating a long-lived array of %d doubles\n", array_size); - HANDLE_SET(array, allocate_double_array(cx, array_size)); - for (int i = 0; i < array_size/2; ++i) { - HANDLE_REF(array)->values[i] = 1.0/i; - } - - for (int d = min_tree_depth; d <= max_tree_depth; d += 2) { - time_construction(cx, d); - } - - validate_tree(HANDLE_REF(long_lived_tree), long_lived_tree_depth); - - // Fake reference to LongLivedTree and array to keep them from being optimized - // away. - if (HANDLE_REF(long_lived_tree) == 0 - || HANDLE_REF(array)->values[1000] != 1.0/1000) - fprintf(stderr, "Failed\n"); - - printf("Completed in %.3f msec\n", elapsed_millis(start)); - print_end_gc_stats(cx); - - POP_HANDLE(cx); - POP_HANDLE(cx); - POP_HANDLE(cx); - POP_HANDLE(cx); -} - diff --git a/mark-sweep.h b/mark-sweep.h index a59a4e67f..b4f270854 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -1,3 +1,4 @@ +#include #include #include #include diff --git a/quads.c b/quads.c index b6853d5ba..0f7e01857 100644 --- a/quads.c +++ b/quads.c @@ -22,9 +22,9 @@ visit_quad_fields(Quad *quad, } typedef HANDLE_TO(Quad) QuadHandle; -static Quad* allocate_quad(struct context *cx) { +static Quad* allocate_quad(struct mutator *mut) { // memset to 0 by the collector. 
- return allocate(cx, ALLOC_KIND_QUAD, sizeof (Quad)); + return allocate(mut, ALLOC_KIND_QUAD, sizeof (Quad)); } /* Get the current time in microseconds */ @@ -37,22 +37,22 @@ static unsigned long current_time(void) } // Build tree bottom-up -static Quad* make_tree(struct context *cx, int depth) { +static Quad* make_tree(struct mutator *mut, int depth) { if (depth<=0) { - return allocate_quad(cx); + return allocate_quad(mut); } else { QuadHandle kids[4] = { { NULL }, }; for (size_t i = 0; i < 4; i++) { - HANDLE_SET(kids[i], make_tree(cx, depth-1)); - PUSH_HANDLE(cx, kids[i]); + HANDLE_SET(kids[i], make_tree(mut, depth-1)); + PUSH_HANDLE(mut, kids[i]); } - Quad *result = allocate_quad(cx); + Quad *result = allocate_quad(mut); for (size_t i = 0; i < 4; i++) init_field((void**)&result->kids[i], HANDLE_REF(kids[i])); for (size_t i = 0; i < 4; i++) - POP_HANDLE(cx); + POP_HANDLE(mut); return result; } @@ -127,18 +127,24 @@ int main(int argc, char *argv[]) { unsigned long gc_start = current_time(); printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", heap_size / 1e9, multiplier); - struct context *cx = initialize_gc(heap_size); + struct heap *heap; + struct mutator *mut; + if (!initialize_gc(heap_size, &heap, &mut)) { + fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", + heap_size); + return 1; + } QuadHandle quad = { NULL }; - PUSH_HANDLE(cx, quad); + PUSH_HANDLE(mut, quad); - print_start_gc_stats(cx); + print_start_gc_stats(heap); printf("Making quad tree of depth %zu (%zu nodes). Total size %.3fGB.\n", depth, nquads, (nquads * sizeof(Quad)) / 1e9); unsigned long start = current_time(); - HANDLE_SET(quad, make_tree(cx, depth)); + HANDLE_SET(quad, make_tree(mut, depth)); print_elapsed("construction", start); validate_tree(HANDLE_REF(quad), depth); @@ -151,7 +157,7 @@ int main(int argc, char *argv[]) { size_t garbage_depth = 3; start = current_time(); for (size_t i = garbage_step/(tree_size(garbage_depth)*4*sizeof(Quad*)); i; i--) - make_tree(cx, garbage_depth); + make_tree(mut, garbage_depth); print_elapsed("allocating garbage", start); start = current_time(); @@ -160,9 +166,9 @@ int main(int argc, char *argv[]) { print_elapsed("allocation loop", garbage_start); print_elapsed("quads test", gc_start); - print_end_gc_stats(cx); + print_end_gc_stats(heap); - POP_HANDLE(cx); + POP_HANDLE(mut); return 0; } From 680032fa897bd18d34d562b9683bb4dc57218c54 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 15:47:08 +0200 Subject: [PATCH 057/403] Minor stop-the-world optimizations. There are still bugs Probably should switch to using a semaphore; no need to reacquire the lock on wakeup. 
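For what it is worth, the semaphore variant hinted at above would have each parked mutator block on a counting semaphore, so the collector can release all of them with a burst of posts and none of them needs to reacquire the space lock just to notice that the collection is over. A rough sketch with POSIX semaphores, purely as a possible shape and not what this patch implements:

#include <pthread.h>
#include <semaphore.h>
#include <stddef.h>

struct stop_signal {
  sem_t resume;          /* initialized once with sem_init(&resume, 0, 0) */
  size_t parked;         /* number of parked mutators; guarded by the space lock */
};

/* Mutator side: called with the space lock held; drops it before blocking. */
static void park_on_semaphore(struct stop_signal *s, pthread_mutex_t *lock) {
  s->parked++;
  pthread_mutex_unlock(lock);
  sem_wait(&s->resume);  /* the wakeup path takes no lock at all */
}

/* Collector side: called with the space lock held, when GC is finished. */
static void release_parked(struct stop_signal *s) {
  while (s->parked) {
    s->parked--;
    sem_post(&s->resume);
  }
}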
--- mark-sweep.h | 52 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index b4f270854..c20ec4b51 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -365,12 +365,9 @@ static void mark_global_roots(struct mark_space *space) { atomic_store(&space->mutator_roots, NULL); } -static void pause_mutator_for_collection(struct mutator *mut) NEVER_INLINE; -static void pause_mutator_for_collection(struct mutator *mut) { - struct mark_space *space = mutator_mark_space(mut); +static void pause_mutator_for_collection(struct mark_space *space) NEVER_INLINE; +static void pause_mutator_for_collection(struct mark_space *space) { ASSERT(mutators_are_stopping(space)); - mark_stopping_mutator_roots(mut); - mark_space_lock(space); ASSERT(space->active_mutator_count); space->active_mutator_count--; if (space->active_mutator_count == 0) @@ -389,13 +386,32 @@ static void pause_mutator_for_collection(struct mutator *mut) { while (mutators_are_stopping(space) && space->count == epoch); space->active_mutator_count++; +} + +static void pause_mutator_for_collection_with_lock(struct mutator *mut) NEVER_INLINE; +static void pause_mutator_for_collection_with_lock(struct mutator *mut) { + struct mark_space *space = mutator_mark_space(mut); + ASSERT(mutators_are_stopping(space)); + mark_controlling_mutator_roots(mut); + pause_mutator_for_collection(space); + clear_mutator_freelists(mut); +} + +static void pause_mutator_for_collection_without_lock(struct mutator *mut) NEVER_INLINE; +static void pause_mutator_for_collection_without_lock(struct mutator *mut) { + struct mark_space *space = mutator_mark_space(mut); + ASSERT(mutators_are_stopping(space)); + mark_stopping_mutator_roots(mut); + mark_space_lock(space); + pause_mutator_for_collection(space); mark_space_unlock(space); release_stopping_mutator_roots(mut); + clear_mutator_freelists(mut); } static inline void maybe_pause_mutator_for_collection(struct mutator *mut) { while (mutators_are_stopping(mutator_mark_space(mut))) - pause_mutator_for_collection(mut); + pause_mutator_for_collection_without_lock(mut); } static void reset_sweeper(struct mark_space *space) { @@ -570,6 +586,10 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, maybe_pause_mutator_for_collection(mut); mark_space_lock(space); + + while (mutators_are_stopping(space)) + pause_mutator_for_collection_with_lock(mut); + int swept_from_beginning = 0; while (1) { struct gcobj_free_large *already_scanned = NULL; @@ -595,13 +615,7 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { - if (mutators_are_stopping(space)) { - mark_space_unlock(space); - pause_mutator_for_collection(mut); - mark_space_lock(space); - } else { - collect(space, mut); - } + collect(space, mut); swept_from_beginning = 1; } } @@ -676,6 +690,10 @@ static void fill_small_from_global(struct mutator *mut, maybe_pause_mutator_for_collection(mut); mark_space_lock(space); + + while (mutators_are_stopping(space)) + pause_mutator_for_collection_with_lock(mut); + int swept_from_beginning = 0; while (1) { if (fill_small_from_global_small(space, small_objects, kind)) @@ -689,13 +707,7 @@ static void fill_small_from_global(struct mutator *mut, fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { - if (mutators_are_stopping(space)) { - mark_space_unlock(space); - pause_mutator_for_collection(mut); - 
mark_space_lock(space); - } else { - collect(space, mut); - } + collect(space, mut); swept_from_beginning = 1; } } From 63002037385c9f9995fc7b7f55891ba5004e4952 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 29 Mar 2022 21:58:52 +0200 Subject: [PATCH 058/403] Add call_without_gc API This lets us call pthread_join safely --- bdw.h | 8 ++++++++ mark-sweep.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ mt-gcbench.c | 15 ++++++++++++--- semi.h | 6 ++++++ 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/bdw.h b/bdw.h index e60802887..0034b0561 100644 --- a/bdw.h +++ b/bdw.h @@ -158,6 +158,14 @@ static void finish_gc_for_thread(struct mutator *mut) { GC_unregister_my_thread(); } +static void* call_without_gc(struct mutator *mut, void* (*f)(void*), + void *data) NEVER_INLINE; +static void* call_without_gc(struct mutator *mut, + void* (*f)(void*), + void *data) { + return GC_do_blocking(f, data); +} + static inline void print_start_gc_stats(struct heap *heap) { } static inline void print_end_gc_stats(struct heap *heap) { diff --git a/mark-sweep.h b/mark-sweep.h index c20ec4b51..a69bd544e 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -125,6 +125,7 @@ struct mark_space { size_t mem_size; long count; struct marker marker; + struct mutator *deactivated_mutators; }; struct heap { @@ -144,6 +145,7 @@ struct mutator { struct heap *heap; struct handle *roots; struct mutator_mark_buf mark_buf; + struct mutator *next; }; static inline struct marker* mark_space_marker(struct mark_space *space) { @@ -352,6 +354,11 @@ static void wait_for_mutators_to_stop(struct mark_space *space) { pthread_cond_wait(&space->collector_cond, &space->lock); } +static void mark_inactive_mutators(struct mark_space *space) { + for (struct mutator *mut = space->deactivated_mutators; mut; mut = mut->next) + mark_controlling_mutator_roots(mut); +} + static void mark_global_roots(struct mark_space *space) { for (struct handle *h = space->global_roots; h; h = h->next) { struct gcobj *obj = h->v; @@ -424,6 +431,7 @@ static void collect(struct mark_space *space, struct mutator *mut) { request_mutators_to_stop(space); mark_controlling_mutator_roots(mut); wait_for_mutators_to_stop(space); + mark_inactive_mutators(space); mark_global_roots(space); marker_trace(space); marker_release(space); @@ -834,6 +842,42 @@ static void finish_gc_for_thread(struct mutator *mut) { free(mut); } +static void deactivate_mutator(struct mark_space *space, struct mutator *mut) { + ASSERT(mut->next == NULL); + mark_space_lock(space); + mut->next = space->deactivated_mutators; + space->deactivated_mutators = mut; + space->active_mutator_count--; + if (!space->active_mutator_count && mutators_are_stopping(space)) + pthread_cond_signal(&space->collector_cond); + mark_space_unlock(space); +} + +static void reactivate_mutator(struct mark_space *space, struct mutator *mut) { + mark_space_lock(space); + while (mutators_are_stopping(space)) + pthread_cond_wait(&space->mutator_cond, &space->lock); + struct mutator **prev = &space->deactivated_mutators; + while (*prev != mut) + prev = &(*prev)->next; + *prev = mut->next; + mut->next = NULL; + space->active_mutator_count++; + mark_space_unlock(space); +} + +static void* call_without_gc(struct mutator *mut, void* (*f)(void*), + void *data) NEVER_INLINE; +static void* call_without_gc(struct mutator *mut, + void* (*f)(void*), + void *data) { + struct mark_space *space = mutator_mark_space(mut); + deactivate_mutator(space, mut); + void *ret = f(data); + reactivate_mutator(space, mut); + return ret; +} 
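The reason for the wrapper is that a call which may block indefinitely, pthread_join in the benchmark or a read on a quiet socket, must not keep its thread counted as an active mutator, or the next collection would wait on it forever. Usage always has the same shape; as an illustration (not from the benchmark), a blocking read could be wrapped like this, provided the callback touches no GC-managed objects:

#include <unistd.h>

struct read_request { int fd; void *buf; size_t count; ssize_t result; };

/* Runs while the thread is deactivated; must not allocate from the GC
   or dereference GC-managed objects. */
static void *do_blocking_read(void *data) {
  struct read_request *req = data;
  req->result = read(req->fd, req->buf, req->count);
  return NULL;
}

static ssize_t read_without_stalling_gc(struct mutator *mut, int fd,
                                        void *buf, size_t count) {
  struct read_request req = { fd, buf, count, -1 };
  call_without_gc(mut, do_blocking_read, &req);
  return req.result;
}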
+ static inline void print_start_gc_stats(struct heap *heap) { } diff --git a/mt-gcbench.c b/mt-gcbench.c index 28dc92267..b80a0c83c 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -297,6 +297,14 @@ static void* run_one_test_in_thread(void *arg) { return call_with_gc(run_one_test, heap); } +struct join_data { int status; pthread_t thread; }; +static void *join_thread(void *data) { + struct join_data *join_data = data; + void *ret; + join_data->status = pthread_join(join_data->thread, &ret); + return ret; +} + int main(int argc, char *argv[]) { // Define size of Node without any GC header. size_t sizeof_node = 2 * sizeof(Node*) + 2 * sizeof(int); @@ -350,9 +358,10 @@ int main(int argc, char *argv[]) { } run_one_test(mut); for (size_t i = 1; i < nthreads; i++) { - int status = pthread_join(threads[i], NULL); - if (status) { - errno = status; + struct join_data data = { 0, threads[i] }; + call_without_gc(mut, join_thread, &data); + if (data.status) { + errno = data.status; perror("Failed to join thread"); return 1; } diff --git a/semi.h b/semi.h index af58f052d..096c57591 100644 --- a/semi.h +++ b/semi.h @@ -204,6 +204,12 @@ static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, static void finish_gc_for_thread(struct mutator *space) { } +static void* call_without_gc(struct mutator *mut, void* (*f)(void*), + void *data) { + // Can't be threads, then there won't be collection. + return f(data); +} + static inline void print_start_gc_stats(struct heap *heap) { } From a1dbbfd6ae4502cf30edb6e269c8d3baa8e80fee Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 30 Mar 2022 23:01:24 +0200 Subject: [PATCH 059/403] Speed up sweeping for small objects When sweeping for small objects of a known size, instead of fitting swept regions into the largest available bucket size, eagerly break the regions into the requested size. Throw away any fragmented space; the next collection will get it. When allocating small objects, just look in the size-segmented freelist; don't grovel in other sizes on the global freelist. The thought is that we only add to the global freelists when allocating large objects, and in that case some fragmentation is OK. Perhaps this is the wrong dynamic. Reclaim 32 kB at a time instead of 1 kB. This helps remove scalability bottlenecks. --- mark-sweep.h | 72 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index a69bd544e..b3184e50d 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -471,13 +471,32 @@ static void push_large(struct mark_space *space, void *region, size_t granules) space->large_objects = large; } +static void reclaim_small(struct gcobj_freelists *small_objects, + enum small_object_size kind, + void *region, size_t region_granules) { + ASSERT(kind != NOT_SMALL_OBJECT); + struct gcobj_free **loc = get_small_object_freelist(small_objects, kind); + uintptr_t addr = (uintptr_t) region; + size_t object_granules = small_object_granule_sizes[kind]; + while (region_granules >= object_granules) { + push_free(loc, (struct gcobj_free*) addr); + region_granules -= object_granules; + addr += object_granules * GRANULE_SIZE; + } + // Any leftover granules are wasted! 
+} + static void reclaim(struct mark_space *space, struct gcobj_freelists *small_objects, - void *obj, size_t granules) { - if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) - push_small(small_objects, obj, SMALL_OBJECT_SIZES - 1, granules); + enum small_object_size kind, + void *region, + size_t region_granules) { + if (kind != NOT_SMALL_OBJECT) + reclaim_small(small_objects, kind, region, region_granules); + else if (region_granules <= LARGE_OBJECT_GRANULE_THRESHOLD) + push_small(small_objects, region, SMALL_OBJECT_SIZES - 1, region_granules); else - push_large(space, obj, granules); + push_large(space, region, region_granules); } static void split_large_object(struct mark_space *space, @@ -497,7 +516,8 @@ static void split_large_object(struct mark_space *space, return; char *tail = ((char*)large) + granules * GRANULE_SIZE; - reclaim(space, small_objects, tail, large_granules - granules); + reclaim(space, small_objects, NOT_SMALL_OBJECT, tail, + large_granules - granules); } static void unlink_large_object(struct gcobj_free_large **prev, @@ -548,10 +568,12 @@ static size_t next_mark(const uint8_t *mark, size_t limit) { // Sweep some heap to reclaim free space. Return 1 if there is more // heap to sweep, or 0 if we reached the end. static int sweep(struct mark_space *space, - struct gcobj_freelists *small_objects, size_t for_granules) { - // Sweep until we have reclaimed 128 granules (1024 kB), or we reach - // the end of the heap. - ssize_t to_reclaim = 128; + struct gcobj_freelists *small_objects, + enum small_object_size kind, + size_t large_object_granules) { + // Sweep until we have reclaimed 32 kB of free memory, or we reach the + // end of the heap. + ssize_t to_reclaim = 32 * 1024 / GRANULE_SIZE; uintptr_t sweep = space->sweep; uintptr_t limit = space->heap_base + space->heap_size; @@ -561,13 +583,19 @@ static int sweep(struct mark_space *space, while (to_reclaim > 0 && sweep < limit) { uint8_t* mark = mark_byte(space, (struct gcobj*)sweep); size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; - if (limit_granules > for_granules) - limit_granules = for_granules; + if (limit_granules > to_reclaim) { + if (kind == NOT_SMALL_OBJECT) { + if (large_object_granules < limit_granules) + limit_granules = large_object_granules; + } else { + limit_granules = to_reclaim; + } + } size_t free_granules = next_mark(mark, limit_granules); if (free_granules) { size_t free_bytes = free_granules * GRANULE_SIZE; clear_memory(sweep + GRANULE_SIZE, free_bytes - GRANULE_SIZE); - reclaim(space, small_objects, (void*)sweep, free_granules); + reclaim(space, small_objects, kind, (void*)sweep, free_granules); sweep += free_bytes; to_reclaim -= free_granules; @@ -616,7 +644,7 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, } } already_scanned = space->large_objects; - } while (sweep(space, small_objects, granules)); + } while (sweep(space, small_objects, NOT_SMALL_OBJECT, granules)); // No large object, and we swept across the whole heap. Collect. 
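Both allocation slow paths in this file follow the same discipline, which the branch below implements: keep alternating between the freelists and the sweeper, and only when a full pass over the heap yields nothing, collect; if a second full pass after that collection still finds nothing, the heap is genuinely exhausted. Schematically, with stand-in helpers in place of the real freelist and sweep routines:

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins, only to make the shape of the loop concrete. */
static int pretend_sweep_budget = 4;
static void *try_freelists(size_t granules) { (void) granules; return NULL; }
static int sweep_some(size_t granules) {     /* 0 once the end of the heap is reached */
  (void) granules;
  return pretend_sweep_budget-- > 0;
}
static void collect_now(void) { pretend_sweep_budget = 4; }  /* also resets the sweep pointer */

static void *slow_allocate(size_t granules) {
  int swept_from_beginning = 0;
  while (1) {
    do {
      void *obj = try_freelists(granules);
      if (obj)
        return obj;
    } while (sweep_some(granules));
    /* Swept to the end of the heap and still found nothing. */
    if (swept_from_beginning) {
      fprintf(stderr, "ran out of space\n");
      abort();                  /* second full sweep since the collection: out of memory */
    }
    collect_now();
    swept_from_beginning = 1;
  }
}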
if (swept_from_beginning) { @@ -671,16 +699,11 @@ static int fill_small_from_large(struct mark_space *space, static int fill_small_from_global_small(struct mark_space *space, struct gcobj_freelists *small_objects, enum small_object_size kind) { - if (!space_has_multiple_mutators(space)) - return 0; - struct gcobj_freelists *global_small = &space->small_objects; - if (*get_small_object_freelist(global_small, kind) - || fill_small_from_local(global_small, kind)) { - struct gcobj_free **src = get_small_object_freelist(global_small, kind); - ASSERT(*src); + struct gcobj_free **src = + get_small_object_freelist(&space->small_objects, kind); + if (*src) { struct gcobj_free **dst = get_small_object_freelist(small_objects, kind); ASSERT(!*dst); - // FIXME: just take a few? *dst = *src; *src = NULL; return 1; @@ -710,7 +733,8 @@ static void fill_small_from_global(struct mutator *mut, if (fill_small_from_large(space, small_objects, kind)) break; - if (!sweep(space, small_objects, LARGE_OBJECT_GRANULE_THRESHOLD)) { + // By default, pull in 16 kB of data at a time. + if (!sweep(space, small_objects, kind, 0)) { if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); @@ -722,8 +746,6 @@ static void fill_small_from_global(struct mutator *mut, if (*get_small_object_freelist(small_objects, kind)) break; - if (fill_small_from_local(small_objects, kind)) - break; } mark_space_unlock(space); } @@ -818,7 +840,7 @@ static int initialize_gc(size_t size, struct heap **heap, space->sweep = space->heap_base + space->heap_size; if (!marker_init(space)) abort(); - reclaim(space, NULL, (void*)space->heap_base, + reclaim(space, NULL, NOT_SMALL_OBJECT, (void*)space->heap_base, size_to_granules(space->heap_size)); *mut = calloc(1, sizeof(struct mutator)); From 54ce801c724b874eaae22b3de066d0082b177f14 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 30 Mar 2022 23:21:45 +0200 Subject: [PATCH 060/403] Update README now that we have parallel mutators --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 48f56181b..c8b104856 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ majority of use cases. ## To do - [X] Implement a parallel marker for the mark-sweep collector. - - [ ] Adapt all GC implementations to allow multiple mutator threads. + - [X] Adapt all GC implementations to allow multiple mutator threads. Update gcbench.c. - [ ] Implement precise non-moving Immix whole-heap collector. - [ ] Add evacuation to Immix whole-heap collector. From b0b4c4d89393022623ce9603ccb3587997362771 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 31 Mar 2022 09:24:54 +0200 Subject: [PATCH 061/403] Remove unneeded files --- MT_GCBench.c | 341 ----------------------------------------- MT_GCBench2.c | 398 ------------------------------------------------ gcbench-types.h | 10 -- 3 files changed, 749 deletions(-) delete mode 100644 MT_GCBench.c delete mode 100644 MT_GCBench2.c delete mode 100644 gcbench-types.h diff --git a/MT_GCBench.c b/MT_GCBench.c deleted file mode 100644 index ba3a594f9..000000000 --- a/MT_GCBench.c +++ /dev/null @@ -1,341 +0,0 @@ -// This is adapted from a benchmark written by John Ellis and Pete Kovac -// of Post Communications. -// It was modified by Hans Boehm of Silicon Graphics. -// Translated to C++ 30 May 1997 by William D Clinger of Northeastern Univ. -// Translated to C 15 March 2000 by Hans Boehm, now at HP Labs. -// Adapted to run NTHREADS client threads concurrently. 
Each -// thread executes the original benchmark. 12 June 2000 by Hans Boehm. -// -// This is no substitute for real applications. No actual application -// is likely to behave in exactly this way. However, this benchmark was -// designed to be more representative of real applications than other -// Java GC benchmarks of which we are aware. -// It attempts to model those properties of allocation requests that -// are important to current GC techniques. -// It is designed to be used either to obtain a single overall performance -// number, or to give a more detailed estimate of how collector -// performance varies with object lifetimes. It prints the time -// required to allocate and collect balanced binary trees of various -// sizes. Smaller trees result in shorter object lifetimes. Each cycle -// allocates roughly the same amount of memory. -// Two data structures are kept around during the entire process, so -// that the measured performance is representative of applications -// that maintain some live in-memory data. One of these is a tree -// containing many pointers. The other is a large array containing -// double precision floating point numbers. Both should be of comparable -// size. -// -// The results are only really meaningful together with a specification -// of how much memory was used. It is possible to trade memory for -// better time performance. This benchmark should be run in a 32 MB -// heap, though we don't currently know how to enforce that uniformly. -// -// Unlike the original Ellis and Kovac benchmark, we do not attempt -// measure pause times. This facility should eventually be added back -// in. There are several reasons for omitting it for now. The original -// implementation depended on assumptions about the thread scheduler -// that don't hold uniformly. The results really measure both the -// scheduler and GC. Pause time measurements tend to not fit well with -// current benchmark suites. As far as we know, none of the current -// commercial Java implementations seriously attempt to minimize GC pause -// times. - -#include -#include -#include -#include - -#ifdef GC -# ifndef LINUX_THREADS -# define LINUX_THREADS -# endif -# ifndef _REENTRANT -# define _REENTRANT -# endif -# ifdef LOCAL -# define GC_REDIRECT_TO_LOCAL -# include "gc_local_alloc.h" -# endif -# include "gc.h" -#endif - - -#ifndef NTHREADS -# define NTHREADS 1 -#endif - -#ifdef PROFIL - extern void init_profiling(); - extern dump_profile(); -#endif - -// These macros were a quick hack for the Macintosh. 
-// -// #define currentTime() clock() -// #define elapsedTime(x) ((1000*(x))/CLOCKS_PER_SEC) - -#define currentTime() stats_rtclock() -#define elapsedTime(x) (x) - -/* Get the current time in milliseconds */ - -unsigned -stats_rtclock( void ) -{ - struct timeval t; - struct timezone tz; - - if (gettimeofday( &t, &tz ) == -1) - return 0; - return (t.tv_sec * 1000 + t.tv_usec / 1000); -} - -static const int kStretchTreeDepth = 18; // about 16Mb -static const int kLongLivedTreeDepth = 16; // about 4Mb -static const int kArraySize = 500000; // about 4Mb -static const int kMinTreeDepth = 4; -static const int kMaxTreeDepth = 16; - -typedef struct Node0_struct { - struct Node0_struct * left; - struct Node0_struct * right; - int i, j; -} Node0; - -#ifdef HOLES -# define HOLE() GC_NEW(Node0); -#else -# define HOLE() -#endif - -typedef Node0 *Node; - -void init_Node(Node me, Node l, Node r) { - me -> left = l; - me -> right = r; -} - -#ifndef GC - void destroy_Node(Node me) { - if (me -> left) { - destroy_Node(me -> left); - } - if (me -> right) { - destroy_Node(me -> right); - } - free(me); - } -#endif - -// Nodes used by a tree of a given size -static int TreeSize(int i) { - return ((1 << (i + 1)) - 1); -} - -// Number of iterations to use for a given tree depth -static int NumIters(int i) { - return 2 * TreeSize(kStretchTreeDepth) / TreeSize(i); -} - -// Build tree top down, assigning to older objects. -static void Populate(int iDepth, Node thisNode) { - if (iDepth<=0) { - return; - } else { - iDepth--; -# ifdef GC - thisNode->left = GC_NEW(Node0); HOLE(); - thisNode->right = GC_NEW(Node0); HOLE(); -# else - thisNode->left = calloc(1, sizeof(Node0)); - thisNode->right = calloc(1, sizeof(Node0)); -# endif - Populate (iDepth, thisNode->left); - Populate (iDepth, thisNode->right); - } -} - -// Build tree bottom-up -static Node MakeTree(int iDepth) { - Node result; - if (iDepth<=0) { -# ifndef GC - result = calloc(1, sizeof(Node0)); -# else - result = GC_NEW(Node0); HOLE(); -# endif - /* result is implicitly initialized in both cases. 
*/ - return result; - } else { - Node left = MakeTree(iDepth-1); - Node right = MakeTree(iDepth-1); -# ifndef GC - result = malloc(sizeof(Node0)); -# else - result = GC_NEW(Node0); HOLE(); -# endif - init_Node(result, left, right); - return result; - } -} - -static void PrintDiagnostics() { -#if 0 - long lFreeMemory = Runtime.getRuntime().freeMemory(); - long lTotalMemory = Runtime.getRuntime().totalMemory(); - - System.out.print(" Total memory available=" - + lTotalMemory + " bytes"); - System.out.println(" Free memory=" + lFreeMemory + " bytes"); -#endif -} - -static void TimeConstruction(int depth) { - long tStart, tFinish; - int iNumIters = NumIters(depth); - Node tempTree; - int i; - - printf("0x%x: Creating %d trees of depth %d\n", pthread_self(), iNumIters, depth); - - tStart = currentTime(); - for (i = 0; i < iNumIters; ++i) { -# ifndef GC - tempTree = calloc(1, sizeof(Node0)); -# else - tempTree = GC_NEW(Node0); -# endif - Populate(depth, tempTree); -# ifndef GC - destroy_Node(tempTree); -# endif - tempTree = 0; - } - tFinish = currentTime(); - printf("\t0x%x: Top down construction took %d msec\n", - pthread_self(), elapsedTime(tFinish - tStart)); - - tStart = currentTime(); - for (i = 0; i < iNumIters; ++i) { - tempTree = MakeTree(depth); -# ifndef GC - destroy_Node(tempTree); -# endif - tempTree = 0; - } - tFinish = currentTime(); - printf("\t0x%x: Bottom up construction took %d msec\n", - pthread_self(), elapsedTime(tFinish - tStart)); - -} - -void * run_one_test(void * arg) { - int d; - for (d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { - TimeConstruction(d); - } -} - -int main() { - Node root; - Node longLivedTree; - Node tempTree; - long tStart, tFinish; - long tElapsed; - int i; - double *array; - -#ifdef GC - // GC_full_freq = 30; - // GC_free_space_divisor = 16; - // GC_enable_incremental(); -#endif -# if defined(GC) && defined(LOCAL) - GC_thr_init(); -# endif - printf("Garbage Collector Test\n"); - printf(" Live storage will peak at %d bytes.\n\n", - 2 * sizeof(Node0) * TreeSize(kLongLivedTreeDepth) + - sizeof(double) * kArraySize); - printf(" Stretching memory with a binary tree of depth %d\n", - kStretchTreeDepth); - PrintDiagnostics(); -# ifdef PROFIL - init_profiling(); -# endif - - tStart = currentTime(); - - // Stretch the memory space quickly - tempTree = MakeTree(kStretchTreeDepth); -# ifndef GC - destroy_Node(tempTree); -# endif - tempTree = 0; - - // Create a long lived object - printf(" Creating a long-lived binary tree of depth %d\n", - kLongLivedTreeDepth); -# ifndef GC - longLivedTree = calloc(1, sizeof(Node0)); -# else - longLivedTree = GC_NEW(Node0); -# endif - Populate(kLongLivedTreeDepth, longLivedTree); - - // Create long-lived array, filling half of it - printf(" Creating a long-lived array of %d doubles\n", kArraySize); -# ifndef GC - array = malloc(kArraySize * sizeof(double)); -# else -# ifndef NO_PTRFREE - array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); -# else - array = GC_MALLOC(sizeof(double) * kArraySize); -# endif -# endif - for (i = 0; i < kArraySize/2; ++i) { - array[i] = 1.0/i; - } - - { - pthread_t thread[NTHREADS]; - for (i = 1; i < NTHREADS; ++i) { - int code; - - if ((code = pthread_create(thread+i, 0, run_one_test, 0)) != 0) { - fprintf(stderr, "Thread creation failed %u\n", code); - exit(1); - } - } - /* We use the main thread to run one test. This allows */ - /* profiling to work, for example. 
*/ - run_one_test(0); - for (i = 1; i < NTHREADS; ++i) { - int code; - if ((code = pthread_join(thread[i], 0)) != 0) { - fprintf(stderr, "Thread join failed %u\n", code); - } - } - } - PrintDiagnostics(); - - if (longLivedTree == 0 || array[1000] != 1.0/1000) - fprintf(stderr, "Failed\n"); - // fake reference to LongLivedTree - // and array - // to keep them from being optimized away - - tFinish = currentTime(); - tElapsed = elapsedTime(tFinish-tStart); - PrintDiagnostics(); - printf("Completed in %d msec\n", tElapsed); -# ifdef GC - printf("Completed %d collections\n", GC_gc_no); - printf("Heap size is %d\n", GC_get_heap_size()); -# endif -# ifdef PROFIL - dump_profile(); -# endif -} - diff --git a/MT_GCBench2.c b/MT_GCBench2.c deleted file mode 100644 index 07fe7e3a5..000000000 --- a/MT_GCBench2.c +++ /dev/null @@ -1,398 +0,0 @@ -// This is version 2 of the multithreaded GC Bench. -// Heap expansion is handled differently from version 1, in an attempt -// to make scalability measurements more meaningful. The version with -// N threads now immediately expands the heap to N*32MB. -// -// To run this with BDWGC versions 6 and later with thread local allocation, -// define GC and LOCAL. Without thread-local allocation, define just GC. -// To run it with the University of Tokyo scalable GC, -// define SGC. To run it with malloc and explicit deallocation, define -// none of these. (This should also work for Hoard.) -// -// Note that defining GC or SGC removes the explicit deallocation passes, -// which seems fair. -// -// This is adapted from a benchmark written by John Ellis and Pete Kovac -// of Post Communications. -// It was modified by Hans Boehm of Silicon Graphics. -// Translated to C++ 30 May 1997 by William D Clinger of Northeastern Univ. -// Translated to C 15 March 2000 by Hans Boehm, now at HP Labs. -// Adapted to run NTHREADS client threads concurrently. Each -// thread executes the original benchmark. 12 June 2000 by Hans Boehm. -// Changed heap expansion rule, and made the number of threads run-time -// configurable. 25 Oct 2000 by Hans Boehm. -// -// This is no substitute for real applications. No actual application -// is likely to behave in exactly this way. However, this benchmark was -// designed to be more representative of real applications than other -// Java GC benchmarks of which we were aware at the time. -// It still doesn't seem too bad for something this small. -// It attempts to model those properties of allocation requests that -// are important to current GC techniques. -// It is designed to be used either to obtain a single overall performance -// number, or to give a more detailed estimate of how collector -// performance varies with object lifetimes. It prints the time -// required to allocate and collect balanced binary trees of various -// sizes. Smaller trees result in shorter object lifetimes. Each cycle -// allocates roughly the same amount of memory. -// Two data structures are kept around during the entire process, so -// that the measured performance is representative of applications -// that maintain some live in-memory data. One of these is a tree -// containing many pointers. The other is a large array containing -// double precision floating point numbers. Both should be of comparable -// size. -// -// The results are only really meaningful together with a specification -// of how much memory was used. This versions of the benchmark tries -// to preallocate a sufficiently large heap that expansion should not be -// needed. 
-// -// Unlike the original Ellis and Kovac benchmark, we do not attempt -// measure pause times. This facility should eventually be added back -// in. There are several reasons for omitting it for now. The original -// implementation depended on assumptions about the thread scheduler -// that don't hold uniformly. The results really measure both the -// scheduler and GC. Pause time measurements tend to not fit well with -// current benchmark suites. As far as we know, none of the current -// commercial Java implementations seriously attempt to minimize GC pause -// times. -// -// Since this benchmark has recently been more widely used, some -// anomalous behavious has been uncovered. The user should be aware -// of this: -// 1) Nearly all objects are of the same size. This benchmark is -// not useful for analyzing fragmentation behavior. It is unclear -// whether this is an issue for well-designed allocators. -// 2) Unless HOLES is defined, it tends to drop consecutively allocated -// memory at the same time. Many real applications do exhibit this -// phenomenon, but probably not to this extent. (Defining HOLES tends -// to move the benchmark to the opposite extreme.) -// 3) It appears harder to predict object lifetimes than for most real -// Java programs (see T. Harris, "Dynamic adptive pre-tenuring", -// ISMM '00). - -#include -#include -#include -#include - -#ifdef GC -# ifndef LINUX_THREADS -# define LINUX_THREADS -# endif -# ifndef _REENTRANT -# define _REENTRANT -# endif -# ifdef LOCAL -# define GC_REDIRECT_TO_LOCAL -# include "gc_local_alloc.h" -# endif -# include "gc.h" -#endif -#ifdef SGC -# include "sgc.h" -# define GC -# define pthread_create GC_pthread_create -# define pthread_join GC_pthread_join -#endif - -#define MAX_NTHREADS 1024 - -int nthreads = 0; - -#ifdef PROFIL - extern void init_profiling(); - extern dump_profile(); -#endif - -// These macros were a quick hack for the Macintosh. -// -// #define currentTime() clock() -// #define elapsedTime(x) ((1000*(x))/CLOCKS_PER_SEC) - -#define currentTime() stats_rtclock() -#define elapsedTime(x) (x) - -/* Get the current time in milliseconds */ - -unsigned -stats_rtclock( void ) -{ - struct timeval t; - struct timezone tz; - - if (gettimeofday( &t, &tz ) == -1) - return 0; - return (t.tv_sec * 1000 + t.tv_usec / 1000); -} - -static const int kStretchTreeDepth = 18; // about 16Mb -static const int kLongLivedTreeDepth = 16; // about 4Mb -static const int kArraySize = 500000; // about 4Mb -static const int kMinTreeDepth = 4; -static const int kMaxTreeDepth = 16; - -typedef struct Node0_struct { - struct Node0_struct * left; - struct Node0_struct * right; - int i, j; -} Node0; - -#ifdef HOLES -# define HOLE() GC_NEW(Node0); -#else -# define HOLE() -#endif - -typedef Node0 *Node; - -void init_Node(Node me, Node l, Node r) { - me -> left = l; - me -> right = r; -} - -#ifndef GC - void destroy_Node(Node me) { - if (me -> left) { - destroy_Node(me -> left); - } - if (me -> right) { - destroy_Node(me -> right); - } - free(me); - } -#endif - -// Nodes used by a tree of a given size -static int TreeSize(int i) { - return ((1 << (i + 1)) - 1); -} - -// Number of iterations to use for a given tree depth -static int NumIters(int i) { - return 2 * TreeSize(kStretchTreeDepth) / TreeSize(i); -} - -// Build tree top down, assigning to older objects. 
-static void Populate(int iDepth, Node thisNode) { - if (iDepth<=0) { - return; - } else { - iDepth--; -# ifdef GC - thisNode->left = GC_NEW(Node0); HOLE(); - thisNode->right = GC_NEW(Node0); HOLE(); -# else - thisNode->left = calloc(1, sizeof(Node0)); - thisNode->right = calloc(1, sizeof(Node0)); -# endif - Populate (iDepth, thisNode->left); - Populate (iDepth, thisNode->right); - } -} - -// Build tree bottom-up -static Node MakeTree(int iDepth) { - Node result; - if (iDepth<=0) { -# ifndef GC - result = calloc(1, sizeof(Node0)); -# else - result = GC_NEW(Node0); HOLE(); -# endif - /* result is implicitly initialized in both cases. */ - return result; - } else { - Node left = MakeTree(iDepth-1); - Node right = MakeTree(iDepth-1); -# ifndef GC - result = malloc(sizeof(Node0)); -# else - result = GC_NEW(Node0); HOLE(); -# endif - init_Node(result, left, right); - return result; - } -} - -static void PrintDiagnostics() { -#if 0 - long lFreeMemory = Runtime.getRuntime().freeMemory(); - long lTotalMemory = Runtime.getRuntime().totalMemory(); - - System.out.print(" Total memory available=" - + lTotalMemory + " bytes"); - System.out.println(" Free memory=" + lFreeMemory + " bytes"); -#endif -} - -static void TimeConstruction(int depth) { - long tStart, tFinish; - int iNumIters = NumIters(depth); - Node tempTree; - int i; - - printf("0x%x: Creating %d trees of depth %d\n", pthread_self(), iNumIters, depth); - - tStart = currentTime(); - for (i = 0; i < iNumIters; ++i) { -# ifndef GC - tempTree = calloc(1, sizeof(Node0)); -# else - tempTree = GC_NEW(Node0); -# endif - Populate(depth, tempTree); -# ifndef GC - destroy_Node(tempTree); -# endif - tempTree = 0; - } - tFinish = currentTime(); - printf("\t0x%x: Top down construction took %d msec\n", - pthread_self(), elapsedTime(tFinish - tStart)); - - tStart = currentTime(); - for (i = 0; i < iNumIters; ++i) { - tempTree = MakeTree(depth); -# ifndef GC - destroy_Node(tempTree); -# endif - tempTree = 0; - } - tFinish = currentTime(); - printf("\t0x%x: Bottom up construction took %d msec\n", - pthread_self(), elapsedTime(tFinish - tStart)); - -} - -void * run_one_test(void * arg) { - int d, i; - Node longLivedTree; - double *array; - /* size_t initial_bytes = GC_get_total_bytes(); */ - - // Create a long lived object - printf(" Creating a long-lived binary tree of depth %d\n", - kLongLivedTreeDepth); -# ifndef GC - longLivedTree = calloc(1, sizeof(Node0)); -# else - longLivedTree = GC_NEW(Node0); -# endif - Populate(kLongLivedTreeDepth, longLivedTree); - - // Create long-lived array, filling half of it - printf(" Creating a long-lived array of %d doubles\n", kArraySize); -# ifndef GC - array = malloc(kArraySize * sizeof(double)); -# else -# ifndef NO_PTRFREE - array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); -# else - array = GC_MALLOC(sizeof(double) * kArraySize); -# endif -# endif - for (i = 0; i < kArraySize/2; ++i) { - array[i] = 1.0/i; - } - - for (d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { - TimeConstruction(d); - } - /* printf("Allocated %ld bytes before start, %ld after\n", - initial_bytes, GC_get_total_bytes() - initial_bytes); */ - if (longLivedTree->left -> right == 0 || array[1000] != 1.0/1000) - fprintf(stderr, "Failed\n"); - // fake reference to LongLivedTree - // and array - // to keep them from being optimized away - -} - -int main(int argc, char **argv) { - Node root; - Node tempTree[MAX_NTHREADS]; - long tStart, tFinish; - long tElapsed; - int i; -# ifdef SGC - SGC_attr_t attr; -# endif - - if (1 == argc) { - nthreads = 1; - 
} else if (2 == argc) { - nthreads = atoi(argv[1]); - if (nthreads < 1 || nthreads > MAX_NTHREADS) { - fprintf(stderr, "Invalid # of threads argument\n"); - exit(1); - } - } else { - fprintf(stderr, "Usage: %s [# of threads]\n"); - exit(1); - } -# if defined(SGC) - /* The University of Tokyo collector needs explicit */ - /* initialization. */ - SGC_attr_init(&attr); - SGC_init(nthreads, &attr); -# endif -#ifdef GC - // GC_full_freq = 30; - // GC_free_space_divisor = 16; - // GC_enable_incremental(); -#endif - printf("Garbage Collector Test\n"); - printf(" Live storage will peak at %d bytes or less .\n\n", - 2 * sizeof(Node0) * nthreads - * (TreeSize(kLongLivedTreeDepth) + TreeSize(kMaxTreeDepth)) - + sizeof(double) * kArraySize); - PrintDiagnostics(); - -# ifdef GC - /* GC_expand_hp fails with empty heap */ - GC_malloc(1); - GC_expand_hp(32*1024*1024*nthreads); -# endif - -# ifdef PROFIL - init_profiling(); -# endif - - tStart = currentTime(); - { - pthread_t thread[MAX_NTHREADS]; - for (i = 1; i < nthreads; ++i) { - int code; - - if ((code = pthread_create(thread+i, 0, run_one_test, 0)) != 0) { - fprintf(stderr, "Thread creation failed %u\n", code); - exit(1); - } - } - /* We use the main thread to run one test. This allows */ - /* profiling to work, for example. */ - run_one_test(0); - for (i = 1; i < nthreads; ++i) { - int code; - if ((code = pthread_join(thread[i], 0)) != 0) { - fprintf(stderr, "Thread join failed %u\n", code); - } - } - } - PrintDiagnostics(); - - tFinish = currentTime(); - tElapsed = elapsedTime(tFinish-tStart); - PrintDiagnostics(); - printf("Completed in %d msec\n", tElapsed); -# ifdef GC - printf("Completed %d collections\n", GC_gc_no); - printf("Heap size is %d\n", GC_get_heap_size()); -# endif -# ifdef PROFIL - dump_profile(); -# endif - return 0; -} - diff --git a/gcbench-types.h b/gcbench-types.h deleted file mode 100644 index a61b2b7d5..000000000 --- a/gcbench-types.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef GCBENCH_TYPES_H -#define GCBENCH_TYPES_H - -#define FOR_EACH_HEAP_OBJECT_KIND(M) \ - M(node, Node, NODE) \ - M(double_array, DoubleArray, DOUBLE_ARRAY) - -#include "heap-objects.h" - -#endif // GCBENCH_TYPES_H From d425620d3700671243ce41b0795f3494561fbaa9 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 12 Apr 2022 21:41:26 +0200 Subject: [PATCH 062/403] Add address map and set --- address-hash.h | 45 ++++++++++ address-map.h | 210 +++++++++++++++++++++++++++++++++++++++++++++ address-set.h | 193 +++++++++++++++++++++++++++++++++++++++++ test-address-map.c | 109 +++++++++++++++++++++++ test-address-set.c | 98 +++++++++++++++++++++ 5 files changed, 655 insertions(+) create mode 100644 address-hash.h create mode 100644 address-map.h create mode 100644 address-set.h create mode 100644 test-address-map.c create mode 100644 test-address-set.c diff --git a/address-hash.h b/address-hash.h new file mode 100644 index 000000000..49c33be97 --- /dev/null +++ b/address-hash.h @@ -0,0 +1,45 @@ +#ifndef ADDRESS_HASH_H +#define ADDRESS_HASH_H + +#include + +static uintptr_t hash_address(uintptr_t x) { + if (sizeof (x) < 8) { + // Chris Wellon's lowbias32, from https://nullprogram.com/blog/2018/07/31/. + x ^= x >> 16; + x *= 0x7feb352dU; + x ^= x >> 15; + x *= 0x846ca68bU; + x ^= x >> 16; + return x; + } else { + // Sebastiano Vigna's splitmix64 integer mixer, from + // https://prng.di.unimi.it/splitmix64.c. 
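+    // Like lowbias32 above, this mixer is a bijection on 64-bit values,
+    // which is what lets unhash_address below recover the original address.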
+ x ^= x >> 30; + x *= 0xbf58476d1ce4e5b9U; + x ^= x >> 27; + x *= 0x94d049bb133111ebU; + x ^= x >> 31; + return x; + } +} +// Inverse of hash_address from https://nullprogram.com/blog/2018/07/31/. +static uintptr_t unhash_address(uintptr_t x) { + if (sizeof (x) < 8) { + x ^= x >> 16; + x *= 0x43021123U; + x ^= x >> 15 ^ x >> 30; + x *= 0x1d69e2a5U; + x ^= x >> 16; + return x; + } else { + x ^= x >> 31 ^ x >> 62; + x *= 0x319642b2d24d8ec3U; + x ^= x >> 27 ^ x >> 54; + x *= 0x96de1b173f119089U; + x ^= x >> 30 ^ x >> 60; + return x; + } +} + +#endif // ADDRESS_HASH_H diff --git a/address-map.h b/address-map.h new file mode 100644 index 000000000..4b6b0c47f --- /dev/null +++ b/address-map.h @@ -0,0 +1,210 @@ +#ifndef ADDRESS_MAP_H +#define ADDRESS_MAP_H + +#include +#include +#include + +#include "address-hash.h" + +struct hash_map_entry { + uintptr_t k; + uintptr_t v; +}; + +struct hash_map { + struct hash_map_entry *data; + size_t size; // total number of slots + size_t n_items; // number of items in set + uint8_t *bits; // bitvector indicating set slots +}; + +static void hash_map_clear(struct hash_map *map) { + memset(map->bits, 0, map->size / 8); + map->n_items = 0; +} + +// Size must be a power of 2. +static void hash_map_init(struct hash_map *map, size_t size) { + map->size = size; + map->data = malloc(sizeof(struct hash_map_entry) * size); + map->bits = malloc(size / 8); + hash_map_clear(map); +} +static void hash_map_destroy(struct hash_map *map) { + free(map->data); + free(map->bits); +} + +static size_t hash_map_slot_index(struct hash_map *map, size_t idx) { + return idx & (map->size - 1); +} +static struct hash_map_entry* hash_map_slot_entry(struct hash_map *map, + size_t idx) { + return &map->data[hash_map_slot_index(map, idx)]; +} +static int hash_map_slot_is_empty(struct hash_map *map, size_t idx) { + idx = hash_map_slot_index(map, idx); + return (map->bits[idx / 8] & (1 << (idx % 8))) == 0; +} +static void hash_map_slot_acquire(struct hash_map *map, size_t idx) { + idx = hash_map_slot_index(map, idx); + map->bits[idx / 8] |= (1 << (idx % 8)); + map->n_items++; +} +static void hash_map_slot_release(struct hash_map *map, size_t idx) { + idx = hash_map_slot_index(map, idx); + map->bits[idx / 8] &= ~(1 << (idx % 8)); + map->n_items--; +} +static size_t hash_map_slot_distance(struct hash_map *map, size_t idx) { + return hash_map_slot_index(map, idx - hash_map_slot_entry(map, idx)->k); +} +static int hash_map_should_shrink(struct hash_map *map) { + return map->size > 8 && map->n_items <= (map->size >> 3); +} +static int hash_map_should_grow(struct hash_map *map) { + return map->n_items >= map->size - (map->size >> 3); +} + +static void hash_map_do_insert(struct hash_map *map, uintptr_t k, uintptr_t v) { + size_t displacement = 0; + while (!hash_map_slot_is_empty(map, k + displacement) + && displacement < hash_map_slot_distance(map, k + displacement)) + displacement++; + while (!hash_map_slot_is_empty(map, k + displacement) + && displacement == hash_map_slot_distance(map, k + displacement)) { + if (hash_map_slot_entry(map, k + displacement)->k == k) { + hash_map_slot_entry(map, k + displacement)->v = v; + return; + } + displacement++; + } + size_t idx = k + displacement; + size_t slots_to_move = 0; + while (!hash_map_slot_is_empty(map, idx + slots_to_move)) + slots_to_move++; + hash_map_slot_acquire(map, idx + slots_to_move); + while (slots_to_move--) + *hash_map_slot_entry(map, idx + slots_to_move + 1) = + *hash_map_slot_entry(map, idx + slots_to_move); + *hash_map_slot_entry(map, 
idx) = (struct hash_map_entry){ k, v }; +} + +static void hash_map_populate(struct hash_map *dst, struct hash_map *src) { + for (size_t i = 0; i < src->size; i++) + if (!hash_map_slot_is_empty(src, i)) + hash_map_do_insert(dst, hash_map_slot_entry(src, i)->k, + hash_map_slot_entry(src, i)->v); +} +static void hash_map_grow(struct hash_map *map) { + struct hash_map fresh; + hash_map_init(&fresh, map->size << 1); + hash_map_populate(&fresh, map); + hash_map_destroy(map); + memcpy(map, &fresh, sizeof(fresh)); +} +static void hash_map_shrink(struct hash_map *map) { + struct hash_map fresh; + hash_map_init(&fresh, map->size >> 1); + hash_map_populate(&fresh, map); + hash_map_destroy(map); + memcpy(map, &fresh, sizeof(fresh)); +} + +static void hash_map_insert(struct hash_map *map, uintptr_t k, uintptr_t v) { + if (hash_map_should_grow(map)) + hash_map_grow(map); + hash_map_do_insert(map, k, v); +} +static void hash_map_remove(struct hash_map *map, uintptr_t k) { + size_t slot = k; + while (!hash_map_slot_is_empty(map, slot) && hash_map_slot_entry(map, slot)->k != k) + slot++; + if (hash_map_slot_is_empty(map, slot)) + __builtin_trap(); + while (!hash_map_slot_is_empty(map, slot + 1) + && hash_map_slot_distance(map, slot + 1)) { + *hash_map_slot_entry(map, slot) = *hash_map_slot_entry(map, slot + 1); + slot++; + } + hash_map_slot_release(map, slot); + if (hash_map_should_shrink(map)) + hash_map_shrink(map); +} +static int hash_map_contains(struct hash_map *map, uintptr_t k) { + for (size_t slot = k; !hash_map_slot_is_empty(map, slot); slot++) { + if (hash_map_slot_entry(map, slot)->k == k) + return 1; + if (hash_map_slot_distance(map, slot) < (slot - k)) + return 0; + } + return 0; +} +static uintptr_t hash_map_lookup(struct hash_map *map, uintptr_t k, uintptr_t default_) { + for (size_t slot = k; !hash_map_slot_is_empty(map, slot); slot++) { + if (hash_map_slot_entry(map, slot)->k == k) + return hash_map_slot_entry(map, slot)->v; + if (hash_map_slot_distance(map, slot) < (slot - k)) + break; + } + return default_; +} +static inline void hash_map_for_each (struct hash_map *map, + void (*f)(uintptr_t, uintptr_t, void*), + void *data) __attribute__((always_inline)); +static inline void hash_map_for_each(struct hash_map *map, + void (*f)(uintptr_t, uintptr_t, void*), + void *data) { + for (size_t i = 0; i < map->size; i++) + if (!hash_map_slot_is_empty(map, i)) + f(hash_map_slot_entry(map, i)->k, hash_map_slot_entry(map, i)->v, data); +} + +struct address_map { + struct hash_map hash_map; +}; + +static void address_map_init(struct address_map *map) { + hash_map_init(&map->hash_map, 8); +} +static void address_map_destroy(struct address_map *map) { + hash_map_destroy(&map->hash_map); +} +static void address_map_clear(struct address_map *map) { + hash_map_clear(&map->hash_map); +} + +static void address_map_add(struct address_map *map, uintptr_t addr, uintptr_t v) { + hash_map_insert(&map->hash_map, hash_address(addr), v); +} +static void address_map_remove(struct address_map *map, uintptr_t addr) { + hash_map_remove(&map->hash_map, hash_address(addr)); +} +static int address_map_contains(struct address_map *map, uintptr_t addr) { + return hash_map_contains(&map->hash_map, hash_address(addr)); +} +static uintptr_t address_map_lookup(struct address_map *map, uintptr_t addr, + uintptr_t default_) { + return hash_map_lookup(&map->hash_map, hash_address(addr), default_); +} + +struct address_map_for_each_data { + void (*f)(uintptr_t, uintptr_t, void *); + void *data; +}; +static void 
address_map_do_for_each(uintptr_t k, uintptr_t v, void *data) { + struct address_map_for_each_data *for_each_data = data; + for_each_data->f(unhash_address(k), v, for_each_data->data); +} +static inline void address_map_for_each (struct address_map *map, + void (*f)(uintptr_t, uintptr_t, void*), + void *data) __attribute__((always_inline)); +static inline void address_map_for_each (struct address_map *map, + void (*f)(uintptr_t, uintptr_t, void*), + void *data) { + struct address_map_for_each_data for_each_data = { f, data }; + hash_map_for_each(&map->hash_map, address_map_do_for_each, &for_each_data); +} + +#endif // ADDRESS_MAP_H diff --git a/address-set.h b/address-set.h new file mode 100644 index 000000000..5bbcb6168 --- /dev/null +++ b/address-set.h @@ -0,0 +1,193 @@ +#ifndef ADDRESS_SET_H +#define ADDRESS_SET_H + +#include +#include +#include + +#include "address-hash.h" + +struct hash_set { + uintptr_t *data; + size_t size; // total number of slots + size_t n_items; // number of items in set + uint8_t *bits; // bitvector indicating set slots +}; + +static void hash_set_clear(struct hash_set *set) { + memset(set->bits, 0, set->size / 8); + set->n_items = 0; +} + +// Size must be a power of 2. +static void hash_set_init(struct hash_set *set, size_t size) { + set->size = size; + set->data = malloc(sizeof(uintptr_t) * size); + set->bits = malloc(size / 8); + hash_set_clear(set); +} +static void hash_set_destroy(struct hash_set *set) { + free(set->data); + free(set->bits); +} + +static size_t hash_set_slot_index(struct hash_set *set, size_t idx) { + return idx & (set->size - 1); +} +static int hash_set_slot_is_empty(struct hash_set *set, size_t idx) { + idx = hash_set_slot_index(set, idx); + return (set->bits[idx / 8] & (1 << (idx % 8))) == 0; +} +static uintptr_t hash_set_slot_ref(struct hash_set *set, size_t idx) { + return set->data[hash_set_slot_index(set, idx)]; +} +static void hash_set_slot_set(struct hash_set *set, size_t idx, uintptr_t v) { + set->data[hash_set_slot_index(set, idx)] = v; +} +static void hash_set_slot_acquire(struct hash_set *set, size_t idx) { + idx = hash_set_slot_index(set, idx); + set->bits[idx / 8] |= (1 << (idx % 8)); + set->n_items++; +} +static void hash_set_slot_release(struct hash_set *set, size_t idx) { + idx = hash_set_slot_index(set, idx); + set->bits[idx / 8] &= ~(1 << (idx % 8)); + set->n_items--; +} +static size_t hash_set_slot_distance(struct hash_set *set, size_t idx) { + return hash_set_slot_index(set, idx - hash_set_slot_ref(set, idx)); +} +static int hash_set_should_shrink(struct hash_set *set) { + return set->size > 8 && set->n_items <= (set->size >> 3); +} +static int hash_set_should_grow(struct hash_set *set) { + return set->n_items >= set->size - (set->size >> 3); +} + +static void hash_set_do_insert(struct hash_set *set, uintptr_t v) { + size_t displacement = 0; + while (!hash_set_slot_is_empty(set, v + displacement) + && displacement < hash_set_slot_distance(set, v + displacement)) + displacement++; + while (!hash_set_slot_is_empty(set, v + displacement) + && displacement == hash_set_slot_distance(set, v + displacement)) { + if (hash_set_slot_ref(set, v + displacement) == v) + return; + displacement++; + } + size_t idx = v + displacement; + size_t slots_to_move = 0; + while (!hash_set_slot_is_empty(set, idx + slots_to_move)) + slots_to_move++; + hash_set_slot_acquire(set, idx + slots_to_move); + while (slots_to_move--) + hash_set_slot_set(set, idx + slots_to_move + 1, + hash_set_slot_ref(set, idx + slots_to_move)); + 
hash_set_slot_set(set, idx, v); +} + +static void hash_set_populate(struct hash_set *dst, struct hash_set *src) { + for (size_t i = 0; i < src->size; i++) + if (!hash_set_slot_is_empty(src, i)) + hash_set_do_insert(dst, hash_set_slot_ref(src, i)); +} +static void hash_set_grow(struct hash_set *set) { + struct hash_set fresh; + hash_set_init(&fresh, set->size << 1); + hash_set_populate(&fresh, set); + hash_set_destroy(set); + memcpy(set, &fresh, sizeof(fresh)); +} +static void hash_set_shrink(struct hash_set *set) { + struct hash_set fresh; + hash_set_init(&fresh, set->size >> 1); + hash_set_populate(&fresh, set); + hash_set_destroy(set); + memcpy(set, &fresh, sizeof(fresh)); +} + +static void hash_set_insert(struct hash_set *set, uintptr_t v) { + if (hash_set_should_grow(set)) + hash_set_grow(set); + hash_set_do_insert(set, v); +} + +static void hash_set_remove(struct hash_set *set, uintptr_t v) { + size_t slot = v; + while (!hash_set_slot_is_empty(set, slot) && hash_set_slot_ref(set, slot) != v) + slot++; + if (hash_set_slot_is_empty(set, slot)) + __builtin_trap(); + while (!hash_set_slot_is_empty(set, slot + 1) + && hash_set_slot_distance(set, slot + 1)) { + hash_set_slot_set(set, slot, hash_set_slot_ref(set, slot + 1)); + slot++; + } + hash_set_slot_release(set, slot); + if (hash_set_should_shrink(set)) + hash_set_shrink(set); +} +static int hash_set_contains(struct hash_set *set, uintptr_t v) { + for (size_t slot = v; !hash_set_slot_is_empty(set, slot); slot++) { + if (hash_set_slot_ref(set, slot) == v) + return 1; + if (hash_set_slot_distance(set, slot) < (slot - v)) + return 0; + } + return 0; +} +static inline void hash_set_for_each (struct hash_set *set, + void (*f)(uintptr_t, void*), void *data) __attribute__((always_inline)); +static inline void hash_set_for_each(struct hash_set *set, + void (*f)(uintptr_t, void*), void *data) { + for (size_t i = 0; i < set->size; i++) + if (!hash_set_slot_is_empty(set, i)) + f(hash_set_slot_ref(set, i), data); +} + +struct address_set { + struct hash_set hash_set; +}; + +static void address_set_init(struct address_set *set) { + hash_set_init(&set->hash_set, 8); +} +static void address_set_destroy(struct address_set *set) { + hash_set_destroy(&set->hash_set); +} +static void address_set_clear(struct address_set *set) { + hash_set_clear(&set->hash_set); +} + +static void address_set_add(struct address_set *set, uintptr_t addr) { + hash_set_insert(&set->hash_set, hash_address(addr)); +} +static void address_set_remove(struct address_set *set, uintptr_t addr) { + hash_set_remove(&set->hash_set, hash_address(addr)); +} +static int address_set_contains(struct address_set *set, uintptr_t addr) { + return hash_set_contains(&set->hash_set, hash_address(addr)); +} +static void address_set_union(struct address_set *set, struct address_set *other) { + while (set->hash_set.size < other->hash_set.size) + hash_set_grow(&set->hash_set); + hash_set_populate(&set->hash_set, &other->hash_set); +} + +struct address_set_for_each_data { + void (*f)(uintptr_t, void *); + void *data; +}; +static void address_set_do_for_each(uintptr_t v, void *data) { + struct address_set_for_each_data *for_each_data = data; + for_each_data->f(unhash_address(v), for_each_data->data); +} +static inline void address_set_for_each (struct address_set *set, + void (*f)(uintptr_t, void*), void *data) __attribute__((always_inline)); +static inline void address_set_for_each (struct address_set *set, + void (*f)(uintptr_t, void*), void *data) { + struct address_set_for_each_data for_each_data = 
{ f, data }; + hash_set_for_each(&set->hash_set, address_set_do_for_each, &for_each_data); +} + +#endif // ADDRESS_SET_H diff --git a/test-address-map.c b/test-address-map.c new file mode 100644 index 000000000..abe11c4b0 --- /dev/null +++ b/test-address-map.c @@ -0,0 +1,109 @@ +#include + +#include "address-map.h" + +#define COUNT (1000 * 1000) + +static void add_to_other(uintptr_t addr, uintptr_t val, void *data) { + struct address_map *other = data; + if (addr >= COUNT) + fprintf(stdout, "unexpected address: %zu\n", addr); + if (address_map_contains(other, addr)) + fprintf(stdout, "missing: %zu\n", addr); + address_map_add(other, addr, val); +} + +int main(int argc, char *arv[]) { + struct address_map set; + address_map_init(&set); + for (size_t i = 0; i < COUNT; i++) + address_map_add(&set, i, -i); + fprintf(stdout, "after initial add, %zu/%zu\n", set.hash_map.n_items, + set.hash_map.size); + for (size_t i = 0; i < COUNT; i++) { + if (!address_map_contains(&set, i)) { + fprintf(stdout, "missing: %zu\n", i); + return 1; + } + if (address_map_lookup(&set, i, -1) != -i) { + fprintf(stdout, "missing: %zu\n", i); + return 1; + } + } + for (size_t i = COUNT; i < COUNT * 2; i++) { + if (address_map_contains(&set, i)) { + fprintf(stdout, "unexpectedly present: %zu\n", i); + return 1; + } + } + address_map_clear(&set); + fprintf(stdout, "after clear, %zu/%zu\n", set.hash_map.n_items, + set.hash_map.size); + for (size_t i = 0; i < COUNT; i++) + address_map_add(&set, i, 0); + // Now update. + fprintf(stdout, "after re-add, %zu/%zu\n", set.hash_map.n_items, + set.hash_map.size); + for (size_t i = 0; i < COUNT; i++) + address_map_add(&set, i, i + 1); + fprintf(stdout, "after idempotent re-add, %zu/%zu\n", set.hash_map.n_items, + set.hash_map.size); + for (size_t i = 0; i < COUNT; i++) { + if (!address_map_contains(&set, i)) { + fprintf(stdout, "missing: %zu\n", i); + return 1; + } + if (address_map_lookup(&set, i, -1) != i + 1) { + fprintf(stdout, "missing: %zu\n", i); + return 1; + } + } + for (size_t i = 0; i < COUNT; i++) + address_map_remove(&set, i); + fprintf(stdout, "after one-by-one removal, %zu/%zu\n", set.hash_map.n_items, + set.hash_map.size); + for (size_t i = COUNT; i < 2 * COUNT; i++) { + if (address_map_contains(&set, i)) { + fprintf(stdout, "unexpectedly present: %zu\n", i); + return 1; + } + } + for (size_t i = 0; i < COUNT; i++) + address_map_add(&set, i, i + 2); + struct address_map set2; + address_map_init(&set2); + address_map_for_each(&set, add_to_other, &set2); + fprintf(stdout, "after for-each set, %zu/%zu\n", set2.hash_map.n_items, + set2.hash_map.size); + for (size_t i = 0; i < COUNT; i++) { + if (address_map_lookup(&set2, i, -1) != i + 2) { + fprintf(stdout, "missing: %zu\n", i); + return 1; + } + } + address_map_destroy(&set2); + + size_t burnin = 1000 * 1000 * 1000 / COUNT; + fprintf(stdout, "beginning clear then add %zu items, %zu times\n", + (size_t)COUNT, burnin); + for (size_t j = 0; j < burnin; j++) { + address_map_clear(&set); + for (size_t i = 0; i < COUNT; i++) + address_map_add(&set, i, i + 3); + } + fprintf(stdout, "after burnin, %zu/%zu\n", set.hash_map.n_items, + set.hash_map.size); + fprintf(stdout, "beginning lookup %zu items, %zu times\n", + (size_t)COUNT, burnin); + for (size_t j = 0; j < burnin; j++) { + for (size_t i = 0; i < COUNT; i++) { + if (address_map_lookup(&set, i, -1) != i + 3) { + fprintf(stdout, "missing: %zu\n", i); + return 1; + } + } + } + fprintf(stdout, "after burnin, %zu/%zu\n", set.hash_map.n_items, + set.hash_map.size); + 
address_map_destroy(&set); +} diff --git a/test-address-set.c b/test-address-set.c new file mode 100644 index 000000000..ecd14b674 --- /dev/null +++ b/test-address-set.c @@ -0,0 +1,98 @@ +#include + +#include "address-set.h" + +#define COUNT (1000 * 1000) + +static void remove_from_other(uintptr_t addr, void *data) { + struct address_set *other = data; + if (addr >= COUNT) + fprintf(stdout, "unexpected address: %zu\n", addr); + if (!address_set_contains(other, addr)) + fprintf(stdout, "missing: %zu\n", addr); + address_set_remove(other, addr); +} + +int main(int argc, char *arv[]) { + struct address_set set; + address_set_init(&set); + for (size_t i = 0; i < COUNT; i++) + address_set_add(&set, i); + fprintf(stdout, "after initial add, %zu/%zu\n", set.hash_set.n_items, + set.hash_set.size); + for (size_t i = 0; i < COUNT; i++) { + if (!address_set_contains(&set, i)) { + fprintf(stdout, "missing: %zu\n", i); + return 1; + } + } + for (size_t i = COUNT; i < COUNT * 2; i++) { + if (address_set_contains(&set, i)) { + fprintf(stdout, "unexpectedly present: %zu\n", i); + return 1; + } + } + address_set_clear(&set); + fprintf(stdout, "after clear, %zu/%zu\n", set.hash_set.n_items, + set.hash_set.size); + for (size_t i = 0; i < COUNT; i++) + address_set_add(&set, i); + // Do it twice. + fprintf(stdout, "after re-add, %zu/%zu\n", set.hash_set.n_items, + set.hash_set.size); + for (size_t i = 0; i < COUNT; i++) + address_set_add(&set, i); + fprintf(stdout, "after idempotent re-add, %zu/%zu\n", set.hash_set.n_items, + set.hash_set.size); + for (size_t i = 0; i < COUNT; i++) { + if (!address_set_contains(&set, i)) { + fprintf(stdout, "missing: %zu\n", i); + return 1; + } + } + for (size_t i = 0; i < COUNT; i++) + address_set_remove(&set, i); + fprintf(stdout, "after one-by-one removal, %zu/%zu\n", set.hash_set.n_items, + set.hash_set.size); + for (size_t i = COUNT; i < 2 * COUNT; i++) { + if (address_set_contains(&set, i)) { + fprintf(stdout, "unexpectedly present: %zu\n", i); + return 1; + } + } + for (size_t i = 0; i < COUNT; i++) + address_set_add(&set, i); + struct address_set set2; + address_set_init(&set2); + address_set_union(&set2, &set); + fprintf(stdout, "populated set2, %zu/%zu\n", set2.hash_set.n_items, + set2.hash_set.size); + address_set_for_each(&set, remove_from_other, &set2); + fprintf(stdout, "after for-each removal, %zu/%zu\n", set2.hash_set.n_items, + set2.hash_set.size); + address_set_destroy(&set2); + + size_t burnin = 1000 * 1000 * 1000 / COUNT; + fprintf(stdout, "beginning clear then add %zu items, %zu times\n", + (size_t)COUNT, burnin); + for (size_t j = 0; j < burnin; j++) { + address_set_clear(&set); + for (size_t i = 0; i < COUNT; i++) + address_set_add(&set, i); + } + fprintf(stdout, "after burnin, %zu/%zu\n", set.hash_set.n_items, + set.hash_set.size); + fprintf(stdout, "beginning lookup %zu items, %zu times\n", + (size_t)COUNT, burnin); + for (size_t j = 0; j < burnin; j++) { + for (size_t i = 0; i < COUNT; i++) { + if (!address_set_contains(&set, i)) { + fprintf(stdout, "missing: %zu\n", i); + return 1; + } + } + } + fprintf(stdout, "after burnin, %zu/%zu\n", set.hash_set.n_items, + set.hash_set.size); + address_set_destroy(&set); +} From 619a49ba410e373d2b689123de3a01df6a3a9d7e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 13 Apr 2022 21:43:18 +0200 Subject: [PATCH 063/403] Add large object space Not wired up yet. 
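How a collector is expected to drive this space once it is wired up, as a
sketch (trace_large and push_grey here are stand-in names for the caller's
tracing hook, not part of this patch):

  #include "large-object-space.h"

  static void trace_large(struct large_object_space *space, struct gcobj *obj,
                          void (*push_grey)(struct gcobj *obj, void *data),
                          void *data) {
    // copy() doesn't copy anything: it retains the object by moving its
    // address from from_space to to_space, and returns nonzero only the
    // first time, so each live large object gets its fields visited once.
    if (large_object_space_copy(space, (uintptr_t)obj))
      push_grey(obj, data);
  }

Anything that never gets copied stays in from_space; finish_gc then releases
its pages with madvise(MADV_DONTNEED) and merges the region into the free set.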
--- address-set.h | 40 ++++++--- large-object-space.h | 188 +++++++++++++++++++++++++++++++++++++++++++ mt-gcbench.c | 2 +- semi.h | 31 +++++-- 4 files changed, 241 insertions(+), 20 deletions(-) create mode 100644 large-object-space.h diff --git a/address-set.h b/address-set.h index 5bbcb6168..74bc08888 100644 --- a/address-set.h +++ b/address-set.h @@ -136,13 +136,14 @@ static int hash_set_contains(struct hash_set *set, uintptr_t v) { } return 0; } -static inline void hash_set_for_each (struct hash_set *set, - void (*f)(uintptr_t, void*), void *data) __attribute__((always_inline)); -static inline void hash_set_for_each(struct hash_set *set, - void (*f)(uintptr_t, void*), void *data) { +static inline void hash_set_find(struct hash_set *set, + int (*f)(uintptr_t, void*), void *data) __attribute__((always_inline)); +static inline void hash_set_find(struct hash_set *set, + int (*f)(uintptr_t, void*), void *data) { for (size_t i = 0; i < set->size; i++) if (!hash_set_slot_is_empty(set, i)) - f(hash_set_slot_ref(set, i), data); + if (f(hash_set_slot_ref(set, i), data)) + return; } struct address_set { @@ -178,16 +179,33 @@ struct address_set_for_each_data { void (*f)(uintptr_t, void *); void *data; }; -static void address_set_do_for_each(uintptr_t v, void *data) { +static int address_set_do_for_each(uintptr_t v, void *data) { struct address_set_for_each_data *for_each_data = data; for_each_data->f(unhash_address(v), for_each_data->data); + return 0; } -static inline void address_set_for_each (struct address_set *set, - void (*f)(uintptr_t, void*), void *data) __attribute__((always_inline)); -static inline void address_set_for_each (struct address_set *set, - void (*f)(uintptr_t, void*), void *data) { +static inline void address_set_for_each(struct address_set *set, + void (*f)(uintptr_t, void*), void *data) __attribute__((always_inline)); +static inline void address_set_for_each(struct address_set *set, + void (*f)(uintptr_t, void*), void *data) { struct address_set_for_each_data for_each_data = { f, data }; - hash_set_for_each(&set->hash_set, address_set_do_for_each, &for_each_data); + hash_set_find(&set->hash_set, address_set_do_for_each, &for_each_data); +} + +struct address_set_find_data { + int (*f)(uintptr_t, void *); + void *data; +}; +static int address_set_do_find(uintptr_t v, void *data) { + struct address_set_find_data *find_data = data; + return find_data->f(unhash_address(v), find_data->data); +} +static inline void address_set_find(struct address_set *set, + int (*f)(uintptr_t, void*), void *data) __attribute__((always_inline)); +static inline void address_set_find(struct address_set *set, + int (*f)(uintptr_t, void*), void *data) { + struct address_set_find_data find_data = { f, data }; + hash_set_find(&set->hash_set, address_set_do_find, &find_data); } #endif // ADDRESS_SET_H diff --git a/large-object-space.h b/large-object-space.h new file mode 100644 index 000000000..bcd36e9e9 --- /dev/null +++ b/large-object-space.h @@ -0,0 +1,188 @@ +#ifndef LARGE_OBJECT_SPACE_H +#define LARGE_OBJECT_SPACE_H + +#include +#include +#include +#include +#include +#include +#include + +#include "address-map.h" +#include "address-set.h" + +// Logically the large object space is a treadmill space -- somewhat like a +// copying collector, in that we allocate into tospace, and collection flips +// tospace to fromspace, except that we just keep a record on the side of which +// objects are in which space. 
That way we slot into the abstraction of a +// copying collector while not actually copying data. + +struct heap; +struct gcobj; + +struct large_object_space { + struct heap *heap; + + pthread_mutex_t lock; + + size_t page_size; + size_t page_size_log2; + size_t total_pages; + size_t free_pages; + size_t live_pages_at_last_collection; + + struct address_set from_space; + struct address_set to_space; + struct address_set free_space; + struct address_map object_pages; // for each object: size in pages. + struct address_map predecessors; // subsequent addr -> object addr +}; + +static int large_object_space_init(struct large_object_space *space, + struct heap *heap) { + pthread_mutex_init(&space->lock, NULL); + space->page_size = getpagesize(); + space->page_size_log2 = __builtin_clz(space->page_size); + address_set_init(&space->from_space); + address_set_init(&space->to_space); + address_set_init(&space->free_space); + address_map_init(&space->object_pages); + address_map_init(&space->predecessors); + return 1; +} + +static void large_object_space_start_gc(struct large_object_space *space) { + // Flip. Note that when we flip, fromspace is empty, but it might have + // allocated storage, so we do need to do a proper swap. + struct address_set tmp; + memcpy(&tmp, &space->from_space, sizeof(tmp)); + memcpy(&space->from_space, &space->to_space, sizeof(tmp)); + memcpy(&space->to_space, &tmp, sizeof(tmp)); + space->live_pages_at_last_collection = 0; +} + +static int large_object_space_copy(struct large_object_space *space, + uintptr_t addr) { + int copied = 0; + pthread_mutex_lock(&space->lock); + if (!address_set_contains(&space->from_space, addr)) + // Already copied; object is grey or white. + goto done; + space->live_pages_at_last_collection += + address_map_lookup(&space->object_pages, addr, 0); + address_set_remove(&space->from_space, addr); + address_set_add(&space->to_space, addr); + // Object should be placed on mark stack for visiting its fields. (While on + // mark stack it's actually grey, not black.) + copied = 1; +done: + pthread_mutex_unlock(&space->lock); + return copied; +} + +static void large_object_space_reclaim_one(uintptr_t addr, void *data) { + struct large_object_space *space = data; + size_t npages = address_map_lookup(&space->object_pages, addr, 0); + // Release the pages to the OS, and cause them to be zero on next use. + madvise((void*) addr, npages * space->page_size, MADV_DONTNEED); + size_t did_merge = 0; + uintptr_t pred = address_map_lookup(&space->predecessors, addr, 0); + uintptr_t succ = addr + npages * space->page_size; + if (pred && address_set_contains(&space->free_space, pred)) { + // Merge with free predecessor. + address_map_remove(&space->predecessors, addr); + address_map_remove(&space->object_pages, addr); + addr = pred; + npages += address_map_lookup(&space->object_pages, addr, 0); + did_merge = 1; + } else { + // Otherwise this is a new free object. + address_set_add(&space->free_space, addr); + } + if (address_set_contains(&space->free_space, succ)) { + // Merge with free successor. + size_t succ_npages = address_map_lookup(&space->object_pages, succ, 0); + address_map_remove(&space->predecessors, succ); + address_map_remove(&space->object_pages, succ); + address_set_remove(&space->free_space, succ); + npages += succ_npages; + succ += succ_npages * space->page_size; + did_merge = 1; + } + if (did_merge) { + // Update extents. 
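+    // addr, npages and succ now describe the coalesced free region: record
+    // its new extent and point its successor's back-pointer at it.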
+ address_map_add(&space->object_pages, addr, npages); + address_map_add(&space->predecessors, succ, addr); + } +} + +static void large_object_space_finish_gc(struct large_object_space *space) { + pthread_mutex_lock(&space->lock); + address_set_for_each(&space->from_space, large_object_space_reclaim_one, + space); + address_set_clear(&space->from_space); + space->free_pages = space->total_pages - space->live_pages_at_last_collection; + pthread_mutex_unlock(&space->lock); +} + +static inline int large_object_space_contains(struct large_object_space *space, + struct gcobj *ptr) { + int ret; + pthread_mutex_lock(&space->lock); + // ptr might be in fromspace or tospace. Just check the object_pages table, which + // contains both, as well as object_pages for free blocks. + ret = address_map_contains(&space->object_pages, (uintptr_t)ptr); + pthread_mutex_unlock(&space->lock); + return ret; +} + +struct large_object_space_candidate { + struct large_object_space *space; + size_t min_npages; + uintptr_t addr; + size_t npages; +}; + +static int large_object_space_best_fit(uintptr_t addr, void *data) { + struct large_object_space_candidate *found = data; + size_t npages = address_map_lookup(&found->space->object_pages, addr, 0); + if (npages < found->min_npages) + return 0; + if (npages >= found->npages) + return 0; + found->addr = addr; + found->npages = npages; + return found->min_npages == npages; +} + +static inline void* large_object_space_alloc(struct large_object_space *space, + size_t size) { + void *ret; + pthread_mutex_lock(&space->lock); + ret = NULL; + size_t npages = (size + space->page_size - 1) >> space->page_size_log2; + struct large_object_space_candidate found = { space, npages, 0, -1 }; + address_set_find(&space->free_space, large_object_space_best_fit, &found); + if (found.addr) { + uintptr_t addr = found.addr; + ret = (void*)addr; + address_set_remove(&space->free_space, addr); + address_set_add(&space->to_space, addr); + + if (found.npages > npages) { + uintptr_t succ = addr + npages * space->page_size; + uintptr_t succ_succ = succ + (found.npages - npages) * space->page_size; + address_map_add(&space->object_pages, addr, npages); + address_map_add(&space->object_pages, succ, found.npages - npages); + address_set_add(&space->free_space, succ); + address_map_add(&space->predecessors, succ, addr); + address_map_add(&space->predecessors, succ_succ, succ); + } + space->free_pages -= npages; + } + pthread_mutex_unlock(&space->lock); + return ret; +} + +#endif // LARGE_OBJECT_SPACE_H diff --git a/mt-gcbench.c b/mt-gcbench.c index b80a0c83c..aa9177b98 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -45,7 +45,7 @@ #include #include "assert.h" -#include "gcbench-types.h" +#include "mt-gcbench-types.h" #include "gc.h" #include "inline.h" diff --git a/semi.h b/semi.h index 096c57591..3661c7ace 100644 --- a/semi.h +++ b/semi.h @@ -5,6 +5,7 @@ #include #include +#include "large-object-space.h" #include "precise-roots.h" struct semi_space { @@ -16,6 +17,7 @@ struct semi_space { }; struct heap { struct semi_space semi_space; + struct large_object_space large_object_space; }; // One mutator per space, can just store the heap in the mutator. 
struct mutator { @@ -29,6 +31,9 @@ static inline struct heap* mutator_heap(struct mutator *mut) { static inline struct semi_space* heap_semi_space(struct heap *heap) { return &heap->semi_space; } +static inline struct large_object_space* heap_large_object_space(struct heap *heap) { + return &heap->large_object_space; +} static inline struct semi_space* mutator_semi_space(struct mutator *mut) { return heap_semi_space(mutator_heap(mut)); } @@ -172,24 +177,34 @@ static inline void* get_field(void **addr) { return *addr; } -static int initialize_gc(size_t heap_size, struct heap **heap, - struct mutator **mut) { - void *mem = mmap(NULL, heap_size, PROT_READ|PROT_WRITE, +static int initialize_semi_space(struct semi_space *space, size_t size) { + void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { perror("mmap failed"); return 0; } + space->hp = space->base = (uintptr_t) mem; + space->size = size; + space->count = -1; + flip(space); + + return 1; +} + +static int initialize_gc(size_t heap_size, struct heap **heap, + struct mutator **mut) { *mut = calloc(1, sizeof(struct mutator)); if (!*mut) abort(); *heap = mutator_heap(*mut); - struct semi_space *space = mutator_semi_space(*mut); - space->hp = space->base = (uintptr_t) mem; - space->size = heap_size; - space->count = -1; - flip(space); + struct semi_space *space = mutator_semi_space(*mut); + if (!initialize_semi_space(space, heap_size)) + return 0; + if (!large_object_space_init(heap_large_object_space(*heap), *heap)) + return 0; + (*mut)->roots = NULL; return 1; From 3315fc7477068377458fc61c772e903487e38620 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 14 Apr 2022 22:20:27 +0200 Subject: [PATCH 064/403] Add large object space to semi-space collector --- large-object-space.h | 32 ++++++++++-- semi.h | 113 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 123 insertions(+), 22 deletions(-) diff --git a/large-object-space.h b/large-object-space.h index bcd36e9e9..29d733a9e 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -43,7 +43,7 @@ static int large_object_space_init(struct large_object_space *space, struct heap *heap) { pthread_mutex_init(&space->lock, NULL); space->page_size = getpagesize(); - space->page_size_log2 = __builtin_clz(space->page_size); + space->page_size_log2 = __builtin_ctz(space->page_size); address_set_init(&space->from_space); address_set_init(&space->to_space); address_set_init(&space->free_space); @@ -52,6 +52,11 @@ static int large_object_space_init(struct large_object_space *space, return 1; } +static size_t large_object_space_npages(struct large_object_space *space, + size_t bytes) { + return (bytes + space->page_size - 1) >> space->page_size_log2; +} + static void large_object_space_start_gc(struct large_object_space *space) { // Flip. Note that when we flip, fromspace is empty, but it might have // allocated storage, so we do need to do a proper swap. 
@@ -156,12 +161,11 @@ static int large_object_space_best_fit(uintptr_t addr, void *data) { return found->min_npages == npages; } -static inline void* large_object_space_alloc(struct large_object_space *space, - size_t size) { +static void* large_object_space_alloc(struct large_object_space *space, + size_t npages) { void *ret; pthread_mutex_lock(&space->lock); ret = NULL; - size_t npages = (size + space->page_size - 1) >> space->page_size_log2; struct large_object_space_candidate found = { space, npages, 0, -1 }; address_set_find(&space->free_space, large_object_space_best_fit, &found); if (found.addr) { @@ -185,4 +189,24 @@ static inline void* large_object_space_alloc(struct large_object_space *space, return ret; } +static void* +large_object_space_obtain_and_alloc(struct large_object_space *space, + size_t npages) { + size_t bytes = npages * space->page_size; + void *ret = mmap(NULL, bytes, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (ret == MAP_FAILED) + return NULL; + + uintptr_t addr = (uintptr_t)ret; + pthread_mutex_lock(&space->lock); + address_map_add(&space->object_pages, addr, npages); + address_map_add(&space->predecessors, addr + bytes, addr); + address_set_add(&space->to_space, addr); + space->total_pages += npages; + pthread_mutex_unlock(&space->lock); + + return ret; +} + #endif // LARGE_OBJECT_SPACE_H diff --git a/semi.h b/semi.h index 3661c7ace..0757ee6b0 100644 --- a/semi.h +++ b/semi.h @@ -55,6 +55,23 @@ static void collect_for_alloc(struct mutator *mut, size_t bytes) NEVER_INLINE; static void visit(void **loc, void *visit_data); +static int semi_space_steal_pages(struct semi_space *space, size_t npages) { + size_t page_size = getpagesize(); + + if (npages & 1) npages++; + size_t half_size = (npages * page_size) >> 1; + if (space->limit - space->hp < half_size) + return 0; + + space->limit -= half_size; + size_t tospace_offset = space->limit - space->base; + madvise((void*)(space->base + tospace_offset), half_size, MADV_DONTNEED); + size_t fromspace_offset = + (tospace_offset + (space->size >> 1)) & (space->size - 1); + madvise((void*)(space->base + fromspace_offset), half_size, MADV_DONTNEED); + return 1; +} + static void flip(struct semi_space *space) { uintptr_t split = space->base + (space->size >> 1); if (space->hp <= split) { @@ -86,13 +103,13 @@ static void* copy(struct semi_space *space, uintptr_t kind, void *obj) { return new_obj; } -static uintptr_t scan(struct semi_space *space, uintptr_t grey) { +static uintptr_t scan(struct heap *heap, uintptr_t grey) { void *obj = (void*)grey; uintptr_t kind = *(uintptr_t*) obj; switch (kind) { #define SCAN_OBJECT(name, Name, NAME) \ case ALLOC_KIND_##NAME: \ - visit_##name##_fields((Name*)obj, visit, space); \ + visit_##name##_fields((Name*)obj, visit, heap); \ return grey + align_up(name##_size((Name*)obj), ALIGNMENT); FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) #undef SCAN_OBJECT @@ -114,25 +131,53 @@ static void* forward(struct semi_space *space, void *obj) { } } -static void visit(void **loc, void *visit_data) { - struct semi_space *space = visit_data; - void *obj = *loc; - if (obj != NULL) - *loc = forward(space, obj); +static void visit_semi_space(struct heap *heap, struct semi_space *space, + void **loc, void *obj) { + *loc = forward(space, obj); } -static void collect(struct mutator *mut) { - struct semi_space *space = mutator_semi_space(mut); - // fprintf(stderr, "start collect #%ld:\n", space->count); - flip(space); - uintptr_t grey = space->hp; - for (struct handle *h = mut->roots; h; h = h->next) 
- visit(&h->v, space); - // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); - while(grey < space->hp) - grey = scan(space, grey); - // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); +static void visit_large_object_space(struct heap *heap, + struct large_object_space *space, + void **loc, void *obj) { + if (large_object_space_copy(space, (uintptr_t)obj)) + scan(heap, (uintptr_t)obj); } + +static int semi_space_contains(struct semi_space *space, void *obj) { + return (((uintptr_t)obj) - space->base) < space->size; +} + +static void visit(void **loc, void *visit_data) { + struct heap *heap = visit_data; + void *obj = *loc; + if (obj == NULL) + return; + if (semi_space_contains(heap_semi_space(heap), obj)) + visit_semi_space(heap, heap_semi_space(heap), loc, obj); + else if (large_object_space_contains(heap_large_object_space(heap), obj)) + visit_large_object_space(heap, heap_large_object_space(heap), loc, obj); + else + abort(); +} + +static void collect(struct mutator *mut) { + struct heap *heap = mutator_heap(mut); + struct semi_space *semi = heap_semi_space(heap); + struct large_object_space *large = heap_large_object_space(heap); + // fprintf(stderr, "start collect #%ld:\n", space->count); + large_object_space_start_gc(large); + flip(semi); + uintptr_t grey = semi->hp; + for (struct handle *h = mut->roots; h; h = h->next) + visit(&h->v, heap); + // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); + while(grey < semi->hp) + grey = scan(heap, grey); + large_object_space_finish_gc(large); + semi_space_steal_pages(semi, large->live_pages_at_last_collection); + // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); +} + static void collect_for_alloc(struct mutator *mut, size_t bytes) { collect(mut); struct semi_space *space = mutator_semi_space(mut); @@ -142,8 +187,40 @@ static void collect_for_alloc(struct mutator *mut, size_t bytes) { } } +static const size_t LARGE_OBJECT_THRESHOLD = 8192; +static void* allocate_large(struct mutator *mut, enum alloc_kind kind, + size_t size) { + struct heap *heap = mutator_heap(mut); + struct large_object_space *space = heap_large_object_space(heap); + struct semi_space *semi_space = heap_semi_space(heap); + + size_t npages = large_object_space_npages(space, size); + if (!semi_space_steal_pages(semi_space, npages)) { + collect(mut); + if (!semi_space_steal_pages(semi_space, npages)) { + fprintf(stderr, "ran out of space, heap size %zu\n", semi_space->size); + abort(); + } + } + + void *ret = large_object_space_alloc(space, npages); + if (!ret) + ret = large_object_space_obtain_and_alloc(space, npages); + + if (!ret) { + perror("weird: we have the space but mmap didn't work"); + abort(); + } + + *(uintptr_t*)ret = kind; + return ret; +} + static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t size) { + if (size >= LARGE_OBJECT_THRESHOLD) + return allocate_large(mut, kind, size); + struct semi_space *space = mutator_semi_space(mut); while (1) { uintptr_t addr = space->hp; From 3f54fb3dbf5e134e7e38541302cbb16b96b1d5db Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 17 Apr 2022 21:51:20 +0200 Subject: [PATCH 065/403] Fix semispace page stealing Ensure number of stolen pages is even. Avoid madvising on every collection. Cache the page size. 
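A worked sketch of the limit arithmetic this patch introduces (illustrative
only: align_up_sketch and half_size_after_steal are invented names, and the
real code operates on the cached fields of struct semi_space). Stolen pages
are rounded up to an even count so that both halves of the semi-space shrink
by the same amount, and only the newly unused tail of each half is
madvise(MADV_DONTNEED)'d.

#include <stddef.h>

// align is assumed to be a power of two.
static size_t align_up_sketch(size_t x, size_t align) {
  return (x + align - 1) & ~(align - 1);
}

// New size of each semi-space half once stolen_pages have been taken for
// large objects. E.g. size = 2 MiB, page_size = 4096, stolen_pages = 3:
// align_up(3, 2) = 4, so each half shrinks from 1048576 to 1040384 bytes,
// i.e. by two pages per half.
static size_t half_size_after_steal(size_t size, size_t page_size,
                                    size_t stolen_pages) {
  return (size - align_up_sketch(stolen_pages, 2) * page_size) / 2;
}
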
--- semi.h | 61 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/semi.h b/semi.h index 0757ee6b0..3b58376e5 100644 --- a/semi.h +++ b/semi.h @@ -11,6 +11,10 @@ struct semi_space { uintptr_t hp; uintptr_t limit; + uintptr_t from_space; + uintptr_t to_space; + size_t page_size; + size_t stolen_pages; uintptr_t base; size_t size; long count; @@ -56,31 +60,38 @@ static void collect_for_alloc(struct mutator *mut, size_t bytes) NEVER_INLINE; static void visit(void **loc, void *visit_data); static int semi_space_steal_pages(struct semi_space *space, size_t npages) { - size_t page_size = getpagesize(); + size_t stolen_pages = space->stolen_pages + npages; + size_t old_limit_size = space->limit - space->to_space; + size_t new_limit_size = + (space->size - align_up(stolen_pages, 2) * space->page_size) / 2; - if (npages & 1) npages++; - size_t half_size = (npages * page_size) >> 1; - if (space->limit - space->hp < half_size) + if (space->to_space + new_limit_size < space->hp) return 0; - space->limit -= half_size; - size_t tospace_offset = space->limit - space->base; - madvise((void*)(space->base + tospace_offset), half_size, MADV_DONTNEED); - size_t fromspace_offset = - (tospace_offset + (space->size >> 1)) & (space->size - 1); - madvise((void*)(space->base + fromspace_offset), half_size, MADV_DONTNEED); + space->limit = space->to_space + new_limit_size; + space->stolen_pages = stolen_pages; + + madvise((void*)(space->to_space + new_limit_size), + old_limit_size - new_limit_size, + MADV_DONTNEED); + madvise((void*)(space->from_space + new_limit_size), + old_limit_size - new_limit_size, + MADV_DONTNEED); return 1; } +static void semi_space_set_stolen_pages(struct semi_space *space, size_t npages) { + space->stolen_pages = npages; + size_t limit_size = + (space->size - align_up(npages, 2) * space->page_size) / 2; + space->limit = space->to_space + limit_size; +} + static void flip(struct semi_space *space) { - uintptr_t split = space->base + (space->size >> 1); - if (space->hp <= split) { - space->hp = split; - space->limit = space->base + space->size; - } else { - space->hp = space->base; - space->limit = split; - } + space->hp = space->from_space; + space->from_space = space->to_space; + space->to_space = space->hp; + space->limit = space->hp + space->size / 2; space->count++; } @@ -174,7 +185,7 @@ static void collect(struct mutator *mut) { while(grey < semi->hp) grey = scan(heap, grey); large_object_space_finish_gc(large); - semi_space_steal_pages(semi, large->live_pages_at_last_collection); + semi_space_set_stolen_pages(semi, large->live_pages_at_last_collection); // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); } @@ -255,6 +266,10 @@ static inline void* get_field(void **addr) { } static int initialize_semi_space(struct semi_space *space, size_t size) { + // Allocate even numbers of pages. 
+ size_t page_size = getpagesize(); + size = align_up(size, page_size * 2); + void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { @@ -262,10 +277,12 @@ static int initialize_semi_space(struct semi_space *space, size_t size) { return 0; } - space->hp = space->base = (uintptr_t) mem; + space->to_space = space->hp = space->base = (uintptr_t) mem; + space->from_space = space->base + size / 2; + space->page_size = page_size; + space->stolen_pages = 0; space->size = size; - space->count = -1; - flip(space); + space->count = 0; return 1; } From 19f7f72b6866e758e10f967ab7ef244a2e4c9006 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Apr 2022 10:00:44 +0200 Subject: [PATCH 066/403] Rename mark-sweep "large" objects to "medium" --- mark-sweep.h | 130 +++++++++++++++++++++++++-------------------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index b3184e50d..48d268e01 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -18,12 +18,12 @@ #define GRANULE_SIZE 8 #define GRANULE_SIZE_LOG_2 3 -#define LARGE_OBJECT_THRESHOLD 256 -#define LARGE_OBJECT_GRANULE_THRESHOLD 32 +#define MEDIUM_OBJECT_THRESHOLD 256 +#define MEDIUM_OBJECT_GRANULE_THRESHOLD 32 STATIC_ASSERT_EQ(GRANULE_SIZE, 1 << GRANULE_SIZE_LOG_2); -STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, - LARGE_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); +STATIC_ASSERT_EQ(MEDIUM_OBJECT_THRESHOLD, + MEDIUM_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); // There are small object pages for allocations of these sizes. #define FOR_EACH_SMALL_OBJECT_GRANULES(M) \ @@ -44,7 +44,7 @@ static const uint8_t small_object_granule_sizes[] = #undef SMALL_OBJECT_GRANULE_SIZE }; -static const enum small_object_size small_object_sizes_for_granules[LARGE_OBJECT_GRANULE_THRESHOLD + 2] = { +static const enum small_object_size small_object_sizes_for_granules[MEDIUM_OBJECT_GRANULE_THRESHOLD + 2] = { SMALL_OBJECT_1, SMALL_OBJECT_1, SMALL_OBJECT_2, SMALL_OBJECT_3, SMALL_OBJECT_4, SMALL_OBJECT_5, SMALL_OBJECT_6, SMALL_OBJECT_8, SMALL_OBJECT_8, SMALL_OBJECT_10, SMALL_OBJECT_10, SMALL_OBJECT_16, @@ -57,7 +57,7 @@ static const enum small_object_size small_object_sizes_for_granules[LARGE_OBJECT }; static enum small_object_size granules_to_small_object_size(unsigned granules) { - ASSERT(granules <= LARGE_OBJECT_GRANULE_THRESHOLD); + ASSERT(granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD); return small_object_sizes_for_granules[granules]; } @@ -87,9 +87,9 @@ struct gcobj_freelists { struct gcobj_free *by_size[SMALL_OBJECT_SIZES]; }; -// Objects larger than LARGE_OBJECT_GRANULE_THRESHOLD. -struct gcobj_free_large { - struct gcobj_free_large *next; +// Objects larger than MEDIUM_OBJECT_GRANULE_THRESHOLD. +struct gcobj_free_medium { + struct gcobj_free_medium *next; size_t granules; }; @@ -97,7 +97,7 @@ struct gcobj { union { uintptr_t tag; struct gcobj_free free; - struct gcobj_free_large free_large; + struct gcobj_free_medium free_medium; uintptr_t words[0]; void *pointers[0]; }; @@ -112,8 +112,8 @@ struct mark_space { size_t active_mutator_count; size_t mutator_count; struct gcobj_freelists small_objects; - // Unordered list of large objects. - struct gcobj_free_large *large_objects; + // Unordered list of medium objects. 
+ struct gcobj_free_medium *medium_objects; uintptr_t base; uint8_t *mark_bytes; uintptr_t heap_base; @@ -213,7 +213,7 @@ static void clear_mutator_freelists(struct mutator *mut) { } static void clear_global_freelists(struct mark_space *space) { clear_small_freelists(&space->small_objects); - space->large_objects = NULL; + space->medium_objects = NULL; } static int space_has_multiple_mutators(struct mark_space *space) { @@ -464,11 +464,11 @@ static void push_small(struct gcobj_freelists *small_objects, void *region, } } -static void push_large(struct mark_space *space, void *region, size_t granules) { - struct gcobj_free_large *large = region; - large->next = space->large_objects; - large->granules = granules; - space->large_objects = large; +static void push_medium(struct mark_space *space, void *region, size_t granules) { + struct gcobj_free_medium *medium = region; + medium->next = space->medium_objects; + medium->granules = granules; + space->medium_objects = medium; } static void reclaim_small(struct gcobj_freelists *small_objects, @@ -493,36 +493,36 @@ static void reclaim(struct mark_space *space, size_t region_granules) { if (kind != NOT_SMALL_OBJECT) reclaim_small(small_objects, kind, region, region_granules); - else if (region_granules <= LARGE_OBJECT_GRANULE_THRESHOLD) + else if (region_granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) push_small(small_objects, region, SMALL_OBJECT_SIZES - 1, region_granules); else - push_large(space, region, region_granules); + push_medium(space, region, region_granules); } -static void split_large_object(struct mark_space *space, +static void split_medium_object(struct mark_space *space, struct gcobj_freelists *small_objects, - struct gcobj_free_large *large, + struct gcobj_free_medium *medium, size_t granules) { - size_t large_granules = large->granules; - ASSERT(large_granules >= granules); - ASSERT(granules >= LARGE_OBJECT_GRANULE_THRESHOLD); - // Invariant: all words in LARGE are 0 except the two header words. - // LARGE is off the freelist. We return a block of cleared memory, so + size_t medium_granules = medium->granules; + ASSERT(medium_granules >= granules); + ASSERT(granules >= MEDIUM_OBJECT_GRANULE_THRESHOLD); + // Invariant: all words in MEDIUM are 0 except the two header words. + // MEDIUM is off the freelist. We return a block of cleared memory, so // clear those fields now. 
- large->next = NULL; - large->granules = 0; + medium->next = NULL; + medium->granules = 0; - if (large_granules == granules) + if (medium_granules == granules) return; - char *tail = ((char*)large) + granules * GRANULE_SIZE; + char *tail = ((char*)medium) + granules * GRANULE_SIZE; reclaim(space, small_objects, NOT_SMALL_OBJECT, tail, - large_granules - granules); + medium_granules - granules); } -static void unlink_large_object(struct gcobj_free_large **prev, - struct gcobj_free_large *large) { - *prev = large->next; +static void unlink_medium_object(struct gcobj_free_medium **prev, + struct gcobj_free_medium *medium) { + *prev = medium->next; } static size_t live_object_granules(struct gcobj *obj) { @@ -538,7 +538,7 @@ static size_t live_object_granules(struct gcobj *obj) { abort (); } size_t granules = size_to_granules(bytes); - if (granules > LARGE_OBJECT_GRANULE_THRESHOLD) + if (granules > MEDIUM_OBJECT_GRANULE_THRESHOLD) return granules; return small_object_granule_sizes[granules_to_small_object_size(granules)]; } @@ -570,7 +570,7 @@ static size_t next_mark(const uint8_t *mark, size_t limit) { static int sweep(struct mark_space *space, struct gcobj_freelists *small_objects, enum small_object_size kind, - size_t large_object_granules) { + size_t medium_object_granules) { // Sweep until we have reclaimed 32 kB of free memory, or we reach the // end of the heap. ssize_t to_reclaim = 32 * 1024 / GRANULE_SIZE; @@ -585,8 +585,8 @@ static int sweep(struct mark_space *space, size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; if (limit_granules > to_reclaim) { if (kind == NOT_SMALL_OBJECT) { - if (large_object_granules < limit_granules) - limit_granules = large_object_granules; + if (medium_object_granules < limit_granules) + limit_granules = medium_object_granules; } else { limit_granules = to_reclaim; } @@ -613,7 +613,7 @@ static int sweep(struct mark_space *space, return 1; } -static void* allocate_large(struct mutator *mut, enum alloc_kind kind, +static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, size_t granules) { struct mark_space *space = mutator_mark_space(mut); struct gcobj_freelists *small_objects = space_has_multiple_mutators(space) ? @@ -628,25 +628,25 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, int swept_from_beginning = 0; while (1) { - struct gcobj_free_large *already_scanned = NULL; + struct gcobj_free_medium *already_scanned = NULL; do { - struct gcobj_free_large **prev = &space->large_objects; - for (struct gcobj_free_large *large = space->large_objects; - large != already_scanned; - prev = &large->next, large = large->next) { - if (large->granules >= granules) { - unlink_large_object(prev, large); - split_large_object(space, small_objects, large, granules); + struct gcobj_free_medium **prev = &space->medium_objects; + for (struct gcobj_free_medium *medium = space->medium_objects; + medium != already_scanned; + prev = &medium->next, medium = medium->next) { + if (medium->granules >= granules) { + unlink_medium_object(prev, medium); + split_medium_object(space, small_objects, medium, granules); mark_space_unlock(space); - struct gcobj *obj = (struct gcobj *)large; + struct gcobj *obj = (struct gcobj *)medium; obj->tag = tag_live(kind); - return large; + return medium; } } - already_scanned = space->large_objects; + already_scanned = space->medium_objects; } while (sweep(space, small_objects, NOT_SMALL_OBJECT, granules)); - // No large object, and we swept across the whole heap. Collect. 
+ // No medium object, and we swept across the whole heap. Collect. if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); @@ -680,19 +680,19 @@ static int fill_small_from_local(struct gcobj_freelists *small_objects, } // with space lock -static int fill_small_from_large(struct mark_space *space, +static int fill_small_from_medium(struct mark_space *space, struct gcobj_freelists *small_objects, enum small_object_size kind) { - // If there is a large object, take and split it. - struct gcobj_free_large *large = space->large_objects; - if (!large) + // If there is a medium object, take and split it. + struct gcobj_free_medium *medium = space->medium_objects; + if (!medium) return 0; - unlink_large_object(&space->large_objects, large); - ASSERT(large->granules >= LARGE_OBJECT_GRANULE_THRESHOLD); - split_large_object(space, small_objects, large, - LARGE_OBJECT_GRANULE_THRESHOLD); - push_small(small_objects, large, kind, LARGE_OBJECT_GRANULE_THRESHOLD); + unlink_medium_object(&space->medium_objects, medium); + ASSERT(medium->granules >= MEDIUM_OBJECT_GRANULE_THRESHOLD); + split_medium_object(space, small_objects, medium, + MEDIUM_OBJECT_GRANULE_THRESHOLD); + push_small(small_objects, medium, kind, MEDIUM_OBJECT_GRANULE_THRESHOLD); return 1; } @@ -730,7 +730,7 @@ static void fill_small_from_global(struct mutator *mut, if (fill_small_from_global_small(space, small_objects, kind)) break; - if (fill_small_from_large(space, small_objects, kind)) + if (fill_small_from_medium(space, small_objects, kind)) break; // By default, pull in 16 kB of data at a time. @@ -776,9 +776,9 @@ static inline void* allocate_small(struct mutator *mut, static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t size) { size_t granules = size_to_granules(size); - if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) + if (granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) return allocate_small(mut, kind, granules_to_small_object_size(granules)); - return allocate_large(mut, kind, granules); + return allocate_medium(mut, kind, granules); } static inline void* allocate_pointerless(struct mutator *mut, enum alloc_kind kind, @@ -805,7 +805,7 @@ static int initialize_gc(size_t size, struct heap **heap, #undef SMALL_OBJECT_GRANULE_SIZE ASSERT_EQ(SMALL_OBJECT_SIZES - 1, - small_object_sizes_for_granules[LARGE_OBJECT_GRANULE_THRESHOLD]); + small_object_sizes_for_granules[MEDIUM_OBJECT_GRANULE_THRESHOLD]); size = align_up(size, getpagesize()); From 119e273fa4a0896ed01203eb6a7f9c670dc6ae48 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Apr 2022 15:19:55 +0200 Subject: [PATCH 067/403] Rename mark-sweep "markers" to "tracers" There could be other reasons than marking to trace the heap. 
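The renamed interface is easiest to see in miniature. The sketch below is a
deliberately single-threaded toy, not the Chase-Lev-based parallel-tracer.h
added by this patch; the toy_obj type, the fixed-size queue and main() are
invented for illustration. The division of labour is the same, though: the
collector supplies trace_object (the first-visit test, i.e. marking) and
trace_one (scan an object's fields through the tracer's visitor), while the
tracer owns the work queue and the trace loop.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

// Toy object graph: two edges per object, one mark bit.
struct toy_obj { struct toy_obj *left, *right; int marked; };

// Collector-supplied: returns nonzero the first time an object is traced.
static int trace_object(struct toy_obj *obj) {
  if (obj->marked) return 0;
  obj->marked = 1;
  return 1;
}

// Tracer-owned work queue (fixed-size here for brevity).
static struct toy_obj *queue[64];
static size_t queue_len;

// Tracer-supplied visitor: test-and-mark, then enqueue for scanning.
static void tracer_visit(struct toy_obj **loc) {
  struct toy_obj *obj = *loc;
  if (obj && trace_object(obj)) {
    assert(queue_len < sizeof(queue) / sizeof(queue[0]));
    queue[queue_len++] = obj;
  }
}

// Collector-supplied: visit each field of an object via the tracer.
static void trace_one(struct toy_obj *obj) {
  tracer_visit(&obj->left);
  tracer_visit(&obj->right);
}

// Tracer-supplied: drain the queue until the transitive closure is marked.
static void tracer_trace(void) {
  while (queue_len)
    trace_one(queue[--queue_len]);
}

int main(void) {
  struct toy_obj c = { NULL, NULL, 0 };
  struct toy_obj b = { &c, NULL, 0 };
  struct toy_obj a = { &b, &c, 0 };
  struct toy_obj *root = &a;
  tracer_visit(&root);   // cf. tracer_enqueue_root
  tracer_trace();
  printf("marked: a=%d b=%d c=%d\n", a.marked, b.marked, c.marked); // 1 1 1
  return 0;
}

In the real code the queue is a per-worker Chase-Lev deque and the visitor is
passed around as a void (*)(void **loc, void *trace_data) callback, but the
protocol is the one above.
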
--- Makefile | 4 +- mark-sweep.h | 57 ++-- parallel-marker.h | 642 --------------------------------------------- parallel-tracer.h | 643 ++++++++++++++++++++++++++++++++++++++++++++++ serial-marker.h | 168 ------------ serial-tracer.h | 168 ++++++++++++ 6 files changed, 843 insertions(+), 839 deletions(-) delete mode 100644 parallel-marker.h create mode 100644 parallel-tracer.h delete mode 100644 serial-marker.h create mode 100644 serial-tracer.h diff --git a/Makefile b/Makefile index 9ef8b8f85..e6e568b71 100644 --- a/Makefile +++ b/Makefile @@ -17,10 +17,10 @@ bdw-%: bdw.h conservative-roots.h %-types.h %.c semi-%: semi.h precise-roots.h %-types.h heap-objects.h %.c $(COMPILE) -DGC_SEMI -o $@ $*.c -mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %-types.h heap-objects.h %.c +mark-sweep-%: mark-sweep.h precise-roots.h serial-tracer.h assert.h debug.h %-types.h heap-objects.h %.c $(COMPILE) -DGC_MARK_SWEEP -o $@ $*.c -parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-marker.h assert.h debug.h %-types.h heap-objects.h %.c +parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL_MARK_SWEEP -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/mark-sweep.h b/mark-sweep.h index 48d268e01..0dc20e550 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -11,9 +11,9 @@ #include "inline.h" #include "precise-roots.h" #ifdef GC_PARALLEL_MARK -#include "parallel-marker.h" +#include "parallel-tracer.h" #else -#include "serial-marker.h" +#include "serial-tracer.h" #endif #define GRANULE_SIZE 8 @@ -124,12 +124,12 @@ struct mark_space { void *mem; size_t mem_size; long count; - struct marker marker; struct mutator *deactivated_mutators; }; struct heap { struct mark_space mark_space; + struct tracer tracer; }; struct mutator_mark_buf { @@ -148,8 +148,8 @@ struct mutator { struct mutator *next; }; -static inline struct marker* mark_space_marker(struct mark_space *space) { - return &space->marker; +static inline struct tracer* heap_tracer(struct heap *heap) { + return &heap->tracer; } static inline struct mark_space* heap_mark_space(struct heap *heap) { return &heap->mark_space; @@ -174,7 +174,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct mark_space *space, struct mutator *mut) NEVER_INLINE; +static void collect(struct heap *heap, struct mutator *mut) NEVER_INLINE; static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { ASSERT(space->heap_base <= (uintptr_t) obj); @@ -183,8 +183,8 @@ static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { return &space->mark_bytes[granule]; } -static inline int mark_object(struct mark_space *space, struct gcobj *obj) { - uint8_t *byte = mark_byte(space, obj); +static inline int trace_object(struct heap *heap, struct gcobj *obj) { + uint8_t *byte = mark_byte(heap_mark_space(heap), obj); if (*byte) return 0; *byte = 1; @@ -195,7 +195,7 @@ static inline void trace_one(struct gcobj *obj, void *mark_data) { switch (tag_live_alloc_kind(obj->tag)) { #define SCAN_OBJECT(name, Name, NAME) \ case ALLOC_KIND_##NAME: \ - visit_##name##_fields((Name*)obj, marker_visit, mark_data); \ + visit_##name##_fields((Name*)obj, tracer_visit, mark_data); \ break; FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) #undef SCAN_OBJECT @@ -317,11 +317,12 @@ static void mutator_mark_buf_destroy(struct mutator_mark_buf *buf) { // Mark the roots of a mutator 
that is stopping for GC. We can't // enqueue them directly, so we send them to the controller in a buffer. static void mark_stopping_mutator_roots(struct mutator *mut) { - struct mark_space *space = mutator_mark_space(mut); + struct heap *heap = mutator_heap(mut); + struct mark_space *space = heap_mark_space(heap); struct mutator_mark_buf *local_roots = &mut->mark_buf; for (struct handle *h = mut->roots; h; h = h->next) { struct gcobj *root = h->v; - if (root && mark_object(space, root)) + if (root && trace_object(heap, root)) mutator_mark_buf_push(local_roots, root); } @@ -336,11 +337,11 @@ static void mark_stopping_mutator_roots(struct mutator *mut) { // Mark the roots of the mutator that causes GC. static void mark_controlling_mutator_roots(struct mutator *mut) { - struct mark_space *space = mutator_mark_space(mut); + struct heap *heap = mutator_heap(mut); for (struct handle *h = mut->roots; h; h = h->next) { struct gcobj *root = h->v; - if (root && mark_object(space, root)) - marker_enqueue_root(&space->marker, root); + if (root && trace_object(heap, root)) + tracer_enqueue_root(&heap->tracer, root); } } @@ -359,16 +360,17 @@ static void mark_inactive_mutators(struct mark_space *space) { mark_controlling_mutator_roots(mut); } -static void mark_global_roots(struct mark_space *space) { +static void mark_global_roots(struct heap *heap) { + struct mark_space *space = heap_mark_space(heap); for (struct handle *h = space->global_roots; h; h = h->next) { struct gcobj *obj = h->v; - if (obj && mark_object(space, obj)) - marker_enqueue_root(&space->marker, obj); + if (obj && trace_object(heap, obj)) + tracer_enqueue_root(&heap->tracer, obj); } struct mutator_mark_buf *roots = atomic_load(&space->mutator_roots); for (; roots; roots = roots->next) - marker_enqueue_roots(&space->marker, roots->objects, roots->size); + tracer_enqueue_roots(&heap->tracer, roots->objects, roots->size); atomic_store(&space->mutator_roots, NULL); } @@ -425,16 +427,17 @@ static void reset_sweeper(struct mark_space *space) { space->sweep = space->heap_base; } -static void collect(struct mark_space *space, struct mutator *mut) { +static void collect(struct heap *heap, struct mutator *mut) { + struct mark_space *space = heap_mark_space(heap); DEBUG("start collect #%ld:\n", space->count); - marker_prepare(space); + tracer_prepare(heap); request_mutators_to_stop(space); mark_controlling_mutator_roots(mut); wait_for_mutators_to_stop(space); mark_inactive_mutators(space); - mark_global_roots(space); - marker_trace(space); - marker_release(space); + mark_global_roots(heap); + tracer_trace(heap); + tracer_release(heap); clear_global_freelists(space); reset_sweeper(space); space->count++; @@ -614,7 +617,7 @@ static int sweep(struct mark_space *space, } static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, - size_t granules) { + size_t granules) { struct mark_space *space = mutator_mark_space(mut); struct gcobj_freelists *small_objects = space_has_multiple_mutators(space) ? 
&space->small_objects : &mut->small_objects; @@ -651,7 +654,7 @@ static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { - collect(space, mut); + collect(mutator_heap(mut), mut); swept_from_beginning = 1; } } @@ -739,7 +742,7 @@ static void fill_small_from_global(struct mutator *mut, fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); abort(); } else { - collect(space, mut); + collect(mutator_heap(mut), mut); swept_from_beginning = 1; } } @@ -838,7 +841,7 @@ static int initialize_gc(size_t size, struct heap **heap, space->heap_base = ((uintptr_t) mem) + overhead; space->heap_size = size - overhead; space->sweep = space->heap_base + space->heap_size; - if (!marker_init(space)) + if (!tracer_init(*heap)) abort(); reclaim(space, NULL, NOT_SMALL_OBJECT, (void*)space->heap_base, size_to_granules(space->heap_size)); diff --git a/parallel-marker.h b/parallel-marker.h deleted file mode 100644 index e39704852..000000000 --- a/parallel-marker.h +++ /dev/null @@ -1,642 +0,0 @@ -#ifndef PARALLEL_MARKER_H -#define PARALLEL_MARKER_H - -#include -#include -#include -#include - -#include "assert.h" -#include "debug.h" -#include "inline.h" - -// The Chase-Lev work-stealing deque, as initially described in "Dynamic -// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05) -// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf) -// and improved with C11 atomics in "Correct and Efficient Work-Stealing -// for Weak Memory Models" (Lê et al, PPoPP'13) -// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf). - -struct gcobj; - -struct mark_buf { - unsigned log_size; - size_t size; - struct gcobj **data; -}; - -// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit. -#define mark_buf_min_log_size ((unsigned) 10) -// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit. 
-#define mark_buf_max_log_size ((unsigned) 28) - -static int -mark_buf_init(struct mark_buf *buf, unsigned log_size) { - ASSERT(log_size >= mark_buf_min_log_size); - ASSERT(log_size <= mark_buf_max_log_size); - size_t size = (1 << log_size) * sizeof(struct gcobj *); - void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("Failed to grow work-stealing dequeue"); - DEBUG("Failed to allocate %zu bytes", size); - return 0; - } - buf->log_size = log_size; - buf->size = 1 << log_size; - buf->data = mem; - return 1; -} - -static inline size_t -mark_buf_size(struct mark_buf *buf) { - return buf->size; -} - -static inline size_t -mark_buf_byte_size(struct mark_buf *buf) { - return mark_buf_size(buf) * sizeof(struct gcobj *); -} - -static void -mark_buf_release(struct mark_buf *buf) { - if (buf->data) - madvise(buf->data, mark_buf_byte_size(buf), MADV_DONTNEED); -} - -static void -mark_buf_destroy(struct mark_buf *buf) { - if (buf->data) { - munmap(buf->data, mark_buf_byte_size(buf)); - buf->data = NULL; - buf->log_size = 0; - buf->size = 0; - } -} - -static inline struct gcobj * -mark_buf_get(struct mark_buf *buf, size_t i) { - return atomic_load_explicit(&buf->data[i & (buf->size - 1)], - memory_order_relaxed); -} - -static inline void -mark_buf_put(struct mark_buf *buf, size_t i, struct gcobj * o) { - return atomic_store_explicit(&buf->data[i & (buf->size - 1)], - o, - memory_order_relaxed); -} - -static inline int -mark_buf_grow(struct mark_buf *from, struct mark_buf *to, - size_t b, size_t t) { - if (from->log_size == mark_buf_max_log_size) - return 0; - if (!mark_buf_init (to, from->log_size + 1)) - return 0; - for (size_t i=t; ibufs[0], mark_buf_min_log_size); - // Note, this fence isn't in the paper, I added it out of caution. - atomic_thread_fence(memory_order_release); - return ret; -} - -static void -mark_deque_release(struct mark_deque *q) { - for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) - mark_buf_release(&q->bufs[i]); -} - -static void -mark_deque_destroy(struct mark_deque *q) { - for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) - mark_buf_destroy(&q->bufs[i]); -} - -static int -mark_deque_grow(struct mark_deque *q, int cur, size_t b, size_t t) { - if (!mark_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) { - fprintf(stderr, "failed to grow deque!!\n"); - abort(); - } - - cur++; - STORE_RELAXED(&q->active, cur); - return cur; -} - -static void -mark_deque_push(struct mark_deque *q, struct gcobj * x) { - size_t b = LOAD_RELAXED(&q->bottom); - size_t t = LOAD_ACQUIRE(&q->top); - int active = LOAD_RELAXED(&q->active); - - if (b - t > mark_buf_size(&q->bufs[active]) - 1) /* Full queue. */ - active = mark_deque_grow(q, active, b, t); - - mark_buf_put(&q->bufs[active], b, x); - atomic_thread_fence(memory_order_release); - STORE_RELAXED(&q->bottom, b + 1); -} - -static void -mark_deque_push_many(struct mark_deque *q, struct gcobj **objv, size_t count) { - size_t b = LOAD_RELAXED(&q->bottom); - size_t t = LOAD_ACQUIRE(&q->top); - int active = LOAD_RELAXED(&q->active); - - while (b - t > mark_buf_size(&q->bufs[active]) - count) /* Full queue. 
*/ - active = mark_deque_grow(q, active, b, t); - - for (size_t i = 0; i < count; i++) - mark_buf_put(&q->bufs[active], b + i, objv[i]); - atomic_thread_fence(memory_order_release); - STORE_RELAXED(&q->bottom, b + count); -} - -static struct gcobj * -mark_deque_try_pop(struct mark_deque *q) { - size_t b = LOAD_RELAXED(&q->bottom); - b = b - 1; - int active = LOAD_RELAXED(&q->active); - STORE_RELAXED(&q->bottom, b); - atomic_thread_fence(memory_order_seq_cst); - size_t t = LOAD_RELAXED(&q->top); - struct gcobj * x; - if (t <= b) { // Non-empty queue. - x = mark_buf_get(&q->bufs[active], b); - if (t == b) { // Single last element in queue. - if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, - memory_order_seq_cst, - memory_order_relaxed)) - // Failed race. - x = NULL; - STORE_RELAXED(&q->bottom, b + 1); - } - } else { // Empty queue. - x = NULL; - STORE_RELAXED(&q->bottom, b + 1); - } - return x; -} - -static struct gcobj * -mark_deque_steal(struct mark_deque *q) { - while (1) { - size_t t = LOAD_ACQUIRE(&q->top); - atomic_thread_fence(memory_order_seq_cst); - size_t b = LOAD_ACQUIRE(&q->bottom); - if (t >= b) - return NULL; - int active = LOAD_CONSUME(&q->active); - struct gcobj *x = x = mark_buf_get(&q->bufs[active], t); - if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, - memory_order_seq_cst, - memory_order_relaxed)) - // Failed race. - continue; - return x; - } -} - -static int -mark_deque_can_steal(struct mark_deque *q) { - size_t t = LOAD_ACQUIRE(&q->top); - atomic_thread_fence(memory_order_seq_cst); - size_t b = LOAD_ACQUIRE(&q->bottom); - return t < b; -} - -#undef LOAD_RELAXED -#undef STORE_RELAXED -#undef LOAD_ACQUIRE -#undef STORE_RELEASE -#undef LOAD_CONSUME - -#define LOCAL_MARK_QUEUE_SIZE 1024 -#define LOCAL_MARK_QUEUE_MASK (LOCAL_MARK_QUEUE_SIZE - 1) -#define LOCAL_MARK_QUEUE_SHARE_AMOUNT (LOCAL_MARK_QUEUE_SIZE * 3 / 4) -struct local_mark_queue { - size_t read; - size_t write; - struct gcobj * data[LOCAL_MARK_QUEUE_SIZE]; -}; - -static inline void -local_mark_queue_init(struct local_mark_queue *q) { - q->read = q->write = 0; -} -static inline void -local_mark_queue_poison(struct local_mark_queue *q) { - q->read = 0; q->write = LOCAL_MARK_QUEUE_SIZE; -} -static inline size_t -local_mark_queue_size(struct local_mark_queue *q) { - return q->write - q->read; -} -static inline int -local_mark_queue_empty(struct local_mark_queue *q) { - return local_mark_queue_size(q) == 0; -} -static inline int -local_mark_queue_full(struct local_mark_queue *q) { - return local_mark_queue_size(q) >= LOCAL_MARK_QUEUE_SIZE; -} -static inline void -local_mark_queue_push(struct local_mark_queue *q, struct gcobj * v) { - q->data[q->write++ & LOCAL_MARK_QUEUE_MASK] = v; -} -static inline struct gcobj * -local_mark_queue_pop(struct local_mark_queue *q) { - return q->data[q->read++ & LOCAL_MARK_QUEUE_MASK]; -} - -enum mark_worker_state { - MARK_WORKER_STOPPED, - MARK_WORKER_IDLE, - MARK_WORKER_MARKING, - MARK_WORKER_STOPPING, - MARK_WORKER_DEAD -}; - -struct mark_worker { - struct mark_space *space; - size_t id; - size_t steal_id; - pthread_t thread; - enum mark_worker_state state; - pthread_mutex_t lock; - pthread_cond_t cond; - struct mark_deque deque; -}; - -#define MARK_WORKERS_MAX_COUNT 8 - -struct marker { - atomic_size_t active_markers; - size_t worker_count; - atomic_size_t running_markers; - long count; - pthread_mutex_t lock; - pthread_cond_t cond; - struct mark_worker workers[MARK_WORKERS_MAX_COUNT]; -}; - -struct local_marker { - struct mark_worker *worker; - 
struct mark_deque *share_deque; - struct mark_space *space; - struct local_mark_queue local; -}; - -struct context; -static inline struct marker* mark_space_marker(struct mark_space *space); - -static size_t number_of_current_processors(void) { return 1; } - -static int -mark_worker_init(struct mark_worker *worker, struct mark_space *space, - struct marker *marker, size_t id) { - worker->space = space; - worker->id = id; - worker->steal_id = 0; - worker->thread = 0; - worker->state = MARK_WORKER_STOPPED; - pthread_mutex_init(&worker->lock, NULL); - pthread_cond_init(&worker->cond, NULL); - return mark_deque_init(&worker->deque); -} - -static void mark_worker_mark(struct mark_worker *worker); - -static void* -mark_worker_thread(void *data) { - struct mark_worker *worker = data; - - pthread_mutex_lock(&worker->lock); - while (1) { - switch (worker->state) { - case MARK_WORKER_IDLE: - pthread_cond_wait(&worker->cond, &worker->lock); - break; - case MARK_WORKER_MARKING: - mark_worker_mark(worker); - worker->state = MARK_WORKER_IDLE; - break; - case MARK_WORKER_STOPPING: - worker->state = MARK_WORKER_DEAD; - pthread_mutex_unlock(&worker->lock); - return NULL; - default: - abort(); - } - } -} - -static int -mark_worker_spawn(struct mark_worker *worker) { - pthread_mutex_lock(&worker->lock); - ASSERT(worker->state == MARK_WORKER_STOPPED); - worker->state = MARK_WORKER_IDLE; - pthread_mutex_unlock(&worker->lock); - - if (pthread_create(&worker->thread, NULL, mark_worker_thread, worker)) { - perror("spawning marker thread failed"); - worker->state = MARK_WORKER_STOPPED; - return 0; - } - - return 1; -} - -static void -mark_worker_request_mark(struct mark_worker *worker) { - struct marker *marker = mark_space_marker(worker->space); - - pthread_mutex_lock(&worker->lock); - ASSERT(worker->state == MARK_WORKER_IDLE); - worker->state = MARK_WORKER_MARKING; - pthread_cond_signal(&worker->cond); - pthread_mutex_unlock(&worker->lock); -} - -static void -mark_worker_finished_marking(struct mark_worker *worker) { - // Signal controller that we are done with marking. 
- struct marker *marker = mark_space_marker(worker->space); - - if (atomic_fetch_sub(&marker->running_markers, 1) == 1) { - pthread_mutex_lock(&marker->lock); - marker->count++; - pthread_cond_signal(&marker->cond); - pthread_mutex_unlock(&marker->lock); - } -} - -static void -mark_worker_request_stop(struct mark_worker *worker) { - pthread_mutex_lock(&worker->lock); - ASSERT(worker->state == MARK_WORKER_IDLE); - worker->state = MARK_WORKER_STOPPING; - pthread_cond_signal(&worker->cond); - pthread_mutex_unlock(&worker->lock); -} - -static int -marker_init(struct mark_space *space) { - struct marker *marker = mark_space_marker(space); - atomic_init(&marker->active_markers, 0); - atomic_init(&marker->running_markers, 0); - marker->count = 0; - pthread_mutex_init(&marker->lock, NULL); - pthread_cond_init(&marker->cond, NULL); - size_t desired_worker_count = 0; - if (getenv("GC_MARKERS")) - desired_worker_count = atoi(getenv("GC_MARKERS")); - if (desired_worker_count == 0) - desired_worker_count = number_of_current_processors(); - if (desired_worker_count > MARK_WORKERS_MAX_COUNT) - desired_worker_count = MARK_WORKERS_MAX_COUNT; - for (size_t i = 0; i < desired_worker_count; i++) { - if (!mark_worker_init(&marker->workers[i], space, marker, i)) - break; - if (mark_worker_spawn(&marker->workers[i])) - marker->worker_count++; - else - break; - } - return marker->worker_count > 0; -} - -static void marker_prepare(struct mark_space *space) { - struct marker *marker = mark_space_marker(space); - for (size_t i = 0; i < marker->worker_count; i++) - marker->workers[i].steal_id = 0; -} -static void marker_release(struct mark_space *space) { - struct marker *marker = mark_space_marker(space); - for (size_t i = 0; i < marker->worker_count; i++) - mark_deque_release(&marker->workers[i].deque); -} - -struct gcobj; -static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; -static inline void trace_one(struct gcobj *obj, void *mark_data) ALWAYS_INLINE; -static inline int mark_object(struct mark_space *space, - struct gcobj *obj) ALWAYS_INLINE; - -static inline void -marker_share(struct local_marker *mark) { - DEBUG("marker #%zu: sharing\n", mark->worker->id); - for (size_t i = 0; i < LOCAL_MARK_QUEUE_SHARE_AMOUNT; i++) - mark_deque_push(mark->share_deque, local_mark_queue_pop(&mark->local)); -} - -static inline void -marker_visit(void **loc, void *mark_data) { - struct local_marker *mark = mark_data; - struct gcobj *obj = *loc; - if (obj && mark_object(mark->space, obj)) { - if (local_mark_queue_full(&mark->local)) - marker_share(mark); - local_mark_queue_push(&mark->local, obj); - } -} - -static struct gcobj * -marker_steal_from_worker(struct marker *marker, size_t id) { - ASSERT(id < marker->worker_count); - return mark_deque_steal(&marker->workers[id].deque); -} - -static int -marker_can_steal_from_worker(struct marker *marker, size_t id) { - ASSERT(id < marker->worker_count); - return mark_deque_can_steal(&marker->workers[id].deque); -} - -static struct gcobj * -mark_worker_steal_from_any(struct mark_worker *worker, struct marker *marker) { - size_t steal_id = worker->steal_id; - for (size_t i = 0; i < marker->worker_count; i++) { - steal_id = (steal_id + 1) % marker->worker_count; - DEBUG("marker #%zu: stealing from #%zu\n", worker->id, steal_id); - struct gcobj * obj = marker_steal_from_worker(marker, steal_id); - if (obj) { - DEBUG("marker #%zu: stealing got %p\n", worker->id, obj); - worker->steal_id = steal_id; - return obj; - } - } - DEBUG("marker #%zu: failed to steal\n", 
worker->id); - return 0; -} - -static int -mark_worker_can_steal_from_any(struct mark_worker *worker, struct marker *marker) { - size_t steal_id = worker->steal_id; - DEBUG("marker #%zu: checking if any worker has tasks\n", worker->id); - for (size_t i = 0; i < marker->worker_count; i++) { - steal_id = (steal_id + 1) % marker->worker_count; - int res = marker_can_steal_from_worker(marker, steal_id); - if (res) { - DEBUG("marker #%zu: worker #%zu has tasks!\n", worker->id, steal_id); - worker->steal_id = steal_id; - return 1; - } - } - DEBUG("marker #%zu: nothing to steal\n", worker->id); - return 0; -} - -static int -mark_worker_check_termination(struct mark_worker *worker, - struct marker *marker) { - // We went around all workers and nothing. Enter termination phase. - if (atomic_fetch_sub_explicit(&marker->active_markers, 1, - memory_order_relaxed) == 1) { - DEBUG(" ->> marker #%zu: DONE (no spinning) <<-\n", worker->id); - return 1; - } - - size_t spin_count = 0; - while (1) { - if (mark_worker_can_steal_from_any(worker, marker)) { - atomic_fetch_add_explicit(&marker->active_markers, 1, - memory_order_relaxed); - return 0; - } - if (atomic_load_explicit(&marker->active_markers, - memory_order_relaxed) == 0) { - DEBUG(" ->> marker #%zu: DONE <<-\n", worker->id); - return 1; - } - // spin - DEBUG("marker #%zu: spinning #%zu\n", worker->id, spin_count); - if (spin_count < 10) - __builtin_ia32_pause(); - else if (spin_count < 20) - sched_yield(); - else if (spin_count < 40) - usleep(0); - else - usleep(1); - spin_count++; - } -} - -static struct gcobj * -mark_worker_steal(struct local_marker *mark) { - struct marker *marker = mark_space_marker(mark->space); - struct mark_worker *worker = mark->worker; - - while (1) { - DEBUG("marker #%zu: trying to steal\n", worker->id); - struct gcobj *obj = mark_worker_steal_from_any(worker, marker); - if (obj) - return obj; - - if (mark_worker_check_termination(worker, marker)) - return NULL; - } -} - -static void -mark_worker_mark(struct mark_worker *worker) { - struct local_marker mark; - mark.worker = worker; - mark.share_deque = &worker->deque; - mark.space = worker->space; - local_mark_queue_init(&mark.local); - - size_t n = 0; - DEBUG("marker #%zu: running mark loop\n", worker->id); - while (1) { - struct gcobj * obj; - if (!local_mark_queue_empty(&mark.local)) { - obj = local_mark_queue_pop(&mark.local); - } else { - obj = mark_worker_steal(&mark); - if (!obj) - break; - } - trace_one(obj, &mark); - n++; - } - DEBUG("marker #%zu: done marking, %zu objects traced\n", worker->id, n); - - mark_worker_finished_marking(worker); -} - -static inline void -marker_enqueue_root(struct marker *marker, struct gcobj *obj) { - struct mark_deque *worker0_deque = &marker->workers[0].deque; - mark_deque_push(worker0_deque, obj); -} - -static inline void -marker_enqueue_roots(struct marker *marker, struct gcobj **objv, - size_t count) { - struct mark_deque *worker0_deque = &marker->workers[0].deque; - mark_deque_push_many(worker0_deque, objv, count); -} - -static inline void -marker_trace(struct mark_space *space) { - struct marker *marker = mark_space_marker(space); - - pthread_mutex_lock(&marker->lock); - long mark_count = marker->count; - pthread_mutex_unlock(&marker->lock); - - DEBUG("starting trace; %zu workers\n", marker->worker_count); - DEBUG("waking workers\n"); - atomic_store_explicit(&marker->active_markers, marker->worker_count, - memory_order_release); - atomic_store_explicit(&marker->running_markers, marker->worker_count, - memory_order_release); - 
for (size_t i = 0; i < marker->worker_count; i++) - mark_worker_request_mark(&marker->workers[i]); - - DEBUG("waiting on markers\n"); - - pthread_mutex_lock(&marker->lock); - while (marker->count <= mark_count) - pthread_cond_wait(&marker->cond, &marker->lock); - pthread_mutex_unlock(&marker->lock); - - DEBUG("trace finished\n"); -} - -#endif // PARALLEL_MARKER_H diff --git a/parallel-tracer.h b/parallel-tracer.h new file mode 100644 index 000000000..f96e93754 --- /dev/null +++ b/parallel-tracer.h @@ -0,0 +1,643 @@ +#ifndef PARALLEL_TRACER_H +#define PARALLEL_TRACER_H + +#include +#include +#include +#include + +#include "assert.h" +#include "debug.h" +#include "inline.h" + +// The Chase-Lev work-stealing deque, as initially described in "Dynamic +// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05) +// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf) +// and improved with C11 atomics in "Correct and Efficient Work-Stealing +// for Weak Memory Models" (Lê et al, PPoPP'13) +// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf). + +struct gcobj; + +struct trace_buf { + unsigned log_size; + size_t size; + struct gcobj **data; +}; + +// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit. +#define trace_buf_min_log_size ((unsigned) 10) +// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit. +#define trace_buf_max_log_size ((unsigned) 28) + +static int +trace_buf_init(struct trace_buf *buf, unsigned log_size) { + ASSERT(log_size >= trace_buf_min_log_size); + ASSERT(log_size <= trace_buf_max_log_size); + size_t size = (1 << log_size) * sizeof(struct gcobj *); + void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("Failed to grow work-stealing dequeue"); + DEBUG("Failed to allocate %zu bytes", size); + return 0; + } + buf->log_size = log_size; + buf->size = 1 << log_size; + buf->data = mem; + return 1; +} + +static inline size_t +trace_buf_size(struct trace_buf *buf) { + return buf->size; +} + +static inline size_t +trace_buf_byte_size(struct trace_buf *buf) { + return trace_buf_size(buf) * sizeof(struct gcobj *); +} + +static void +trace_buf_release(struct trace_buf *buf) { + if (buf->data) + madvise(buf->data, trace_buf_byte_size(buf), MADV_DONTNEED); +} + +static void +trace_buf_destroy(struct trace_buf *buf) { + if (buf->data) { + munmap(buf->data, trace_buf_byte_size(buf)); + buf->data = NULL; + buf->log_size = 0; + buf->size = 0; + } +} + +static inline struct gcobj * +trace_buf_get(struct trace_buf *buf, size_t i) { + return atomic_load_explicit(&buf->data[i & (buf->size - 1)], + memory_order_relaxed); +} + +static inline void +trace_buf_put(struct trace_buf *buf, size_t i, struct gcobj * o) { + return atomic_store_explicit(&buf->data[i & (buf->size - 1)], + o, + memory_order_relaxed); +} + +static inline int +trace_buf_grow(struct trace_buf *from, struct trace_buf *to, + size_t b, size_t t) { + if (from->log_size == trace_buf_max_log_size) + return 0; + if (!trace_buf_init (to, from->log_size + 1)) + return 0; + for (size_t i=t; ibufs[0], trace_buf_min_log_size); + // Note, this fence isn't in the paper, I added it out of caution. 
+ atomic_thread_fence(memory_order_release); + return ret; +} + +static void +trace_deque_release(struct trace_deque *q) { + for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) + trace_buf_release(&q->bufs[i]); +} + +static void +trace_deque_destroy(struct trace_deque *q) { + for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) + trace_buf_destroy(&q->bufs[i]); +} + +static int +trace_deque_grow(struct trace_deque *q, int cur, size_t b, size_t t) { + if (!trace_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) { + fprintf(stderr, "failed to grow deque!!\n"); + abort(); + } + + cur++; + STORE_RELAXED(&q->active, cur); + return cur; +} + +static void +trace_deque_push(struct trace_deque *q, struct gcobj * x) { + size_t b = LOAD_RELAXED(&q->bottom); + size_t t = LOAD_ACQUIRE(&q->top); + int active = LOAD_RELAXED(&q->active); + + if (b - t > trace_buf_size(&q->bufs[active]) - 1) /* Full queue. */ + active = trace_deque_grow(q, active, b, t); + + trace_buf_put(&q->bufs[active], b, x); + atomic_thread_fence(memory_order_release); + STORE_RELAXED(&q->bottom, b + 1); +} + +static void +trace_deque_push_many(struct trace_deque *q, struct gcobj **objv, size_t count) { + size_t b = LOAD_RELAXED(&q->bottom); + size_t t = LOAD_ACQUIRE(&q->top); + int active = LOAD_RELAXED(&q->active); + + while (b - t > trace_buf_size(&q->bufs[active]) - count) /* Full queue. */ + active = trace_deque_grow(q, active, b, t); + + for (size_t i = 0; i < count; i++) + trace_buf_put(&q->bufs[active], b + i, objv[i]); + atomic_thread_fence(memory_order_release); + STORE_RELAXED(&q->bottom, b + count); +} + +static struct gcobj * +trace_deque_try_pop(struct trace_deque *q) { + size_t b = LOAD_RELAXED(&q->bottom); + b = b - 1; + int active = LOAD_RELAXED(&q->active); + STORE_RELAXED(&q->bottom, b); + atomic_thread_fence(memory_order_seq_cst); + size_t t = LOAD_RELAXED(&q->top); + struct gcobj * x; + if (t <= b) { // Non-empty queue. + x = trace_buf_get(&q->bufs[active], b); + if (t == b) { // Single last element in queue. + if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, + memory_order_seq_cst, + memory_order_relaxed)) + // Failed race. + x = NULL; + STORE_RELAXED(&q->bottom, b + 1); + } + } else { // Empty queue. + x = NULL; + STORE_RELAXED(&q->bottom, b + 1); + } + return x; +} + +static struct gcobj * +trace_deque_steal(struct trace_deque *q) { + while (1) { + size_t t = LOAD_ACQUIRE(&q->top); + atomic_thread_fence(memory_order_seq_cst); + size_t b = LOAD_ACQUIRE(&q->bottom); + if (t >= b) + return NULL; + int active = LOAD_CONSUME(&q->active); + struct gcobj *x = x = trace_buf_get(&q->bufs[active], t); + if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, + memory_order_seq_cst, + memory_order_relaxed)) + // Failed race. 
+ continue; + return x; + } +} + +static int +trace_deque_can_steal(struct trace_deque *q) { + size_t t = LOAD_ACQUIRE(&q->top); + atomic_thread_fence(memory_order_seq_cst); + size_t b = LOAD_ACQUIRE(&q->bottom); + return t < b; +} + +#undef LOAD_RELAXED +#undef STORE_RELAXED +#undef LOAD_ACQUIRE +#undef STORE_RELEASE +#undef LOAD_CONSUME + +#define LOCAL_TRACE_QUEUE_SIZE 1024 +#define LOCAL_TRACE_QUEUE_MASK (LOCAL_TRACE_QUEUE_SIZE - 1) +#define LOCAL_TRACE_QUEUE_SHARE_AMOUNT (LOCAL_TRACE_QUEUE_SIZE * 3 / 4) +struct local_trace_queue { + size_t read; + size_t write; + struct gcobj * data[LOCAL_TRACE_QUEUE_SIZE]; +}; + +static inline void +local_trace_queue_init(struct local_trace_queue *q) { + q->read = q->write = 0; +} +static inline void +local_trace_queue_poison(struct local_trace_queue *q) { + q->read = 0; q->write = LOCAL_TRACE_QUEUE_SIZE; +} +static inline size_t +local_trace_queue_size(struct local_trace_queue *q) { + return q->write - q->read; +} +static inline int +local_trace_queue_empty(struct local_trace_queue *q) { + return local_trace_queue_size(q) == 0; +} +static inline int +local_trace_queue_full(struct local_trace_queue *q) { + return local_trace_queue_size(q) >= LOCAL_TRACE_QUEUE_SIZE; +} +static inline void +local_trace_queue_push(struct local_trace_queue *q, struct gcobj * v) { + q->data[q->write++ & LOCAL_TRACE_QUEUE_MASK] = v; +} +static inline struct gcobj * +local_trace_queue_pop(struct local_trace_queue *q) { + return q->data[q->read++ & LOCAL_TRACE_QUEUE_MASK]; +} + +enum trace_worker_state { + TRACE_WORKER_STOPPED, + TRACE_WORKER_IDLE, + TRACE_WORKER_TRACING, + TRACE_WORKER_STOPPING, + TRACE_WORKER_DEAD +}; + +struct heap; +struct trace_worker { + struct heap *heap; + size_t id; + size_t steal_id; + pthread_t thread; + enum trace_worker_state state; + pthread_mutex_t lock; + pthread_cond_t cond; + struct trace_deque deque; +}; + +#define TRACE_WORKERS_MAX_COUNT 8 + +struct tracer { + atomic_size_t active_tracers; + size_t worker_count; + atomic_size_t running_tracers; + long count; + pthread_mutex_t lock; + pthread_cond_t cond; + struct trace_worker workers[TRACE_WORKERS_MAX_COUNT]; +}; + +struct local_tracer { + struct trace_worker *worker; + struct trace_deque *share_deque; + struct heap *heap; + struct local_trace_queue local; +}; + +struct context; +static inline struct tracer* heap_tracer(struct heap *heap); + +static size_t number_of_current_processors(void) { return 1; } + +static int +trace_worker_init(struct trace_worker *worker, struct heap *heap, + struct tracer *tracer, size_t id) { + worker->heap = heap; + worker->id = id; + worker->steal_id = 0; + worker->thread = 0; + worker->state = TRACE_WORKER_STOPPED; + pthread_mutex_init(&worker->lock, NULL); + pthread_cond_init(&worker->cond, NULL); + return trace_deque_init(&worker->deque); +} + +static void trace_worker_trace(struct trace_worker *worker); + +static void* +trace_worker_thread(void *data) { + struct trace_worker *worker = data; + + pthread_mutex_lock(&worker->lock); + while (1) { + switch (worker->state) { + case TRACE_WORKER_IDLE: + pthread_cond_wait(&worker->cond, &worker->lock); + break; + case TRACE_WORKER_TRACING: + trace_worker_trace(worker); + worker->state = TRACE_WORKER_IDLE; + break; + case TRACE_WORKER_STOPPING: + worker->state = TRACE_WORKER_DEAD; + pthread_mutex_unlock(&worker->lock); + return NULL; + default: + abort(); + } + } +} + +static int +trace_worker_spawn(struct trace_worker *worker) { + pthread_mutex_lock(&worker->lock); + ASSERT(worker->state == 
TRACE_WORKER_STOPPED); + worker->state = TRACE_WORKER_IDLE; + pthread_mutex_unlock(&worker->lock); + + if (pthread_create(&worker->thread, NULL, trace_worker_thread, worker)) { + perror("spawning tracer thread failed"); + worker->state = TRACE_WORKER_STOPPED; + return 0; + } + + return 1; +} + +static void +trace_worker_request_trace(struct trace_worker *worker) { + struct tracer *tracer = heap_tracer(worker->heap); + + pthread_mutex_lock(&worker->lock); + ASSERT(worker->state == TRACE_WORKER_IDLE); + worker->state = TRACE_WORKER_TRACING; + pthread_cond_signal(&worker->cond); + pthread_mutex_unlock(&worker->lock); +} + +static void +trace_worker_finished_tracing(struct trace_worker *worker) { + // Signal controller that we are done with tracing. + struct tracer *tracer = heap_tracer(worker->heap); + + if (atomic_fetch_sub(&tracer->running_tracers, 1) == 1) { + pthread_mutex_lock(&tracer->lock); + tracer->count++; + pthread_cond_signal(&tracer->cond); + pthread_mutex_unlock(&tracer->lock); + } +} + +static void +trace_worker_request_stop(struct trace_worker *worker) { + pthread_mutex_lock(&worker->lock); + ASSERT(worker->state == TRACE_WORKER_IDLE); + worker->state = TRACE_WORKER_STOPPING; + pthread_cond_signal(&worker->cond); + pthread_mutex_unlock(&worker->lock); +} + +static int +tracer_init(struct heap *heap) { + struct tracer *tracer = heap_tracer(heap); + atomic_init(&tracer->active_tracers, 0); + atomic_init(&tracer->running_tracers, 0); + tracer->count = 0; + pthread_mutex_init(&tracer->lock, NULL); + pthread_cond_init(&tracer->cond, NULL); + size_t desired_worker_count = 0; + if (getenv("GC_TRACERS")) + desired_worker_count = atoi(getenv("GC_TRACERS")); + if (desired_worker_count == 0) + desired_worker_count = number_of_current_processors(); + if (desired_worker_count > TRACE_WORKERS_MAX_COUNT) + desired_worker_count = TRACE_WORKERS_MAX_COUNT; + for (size_t i = 0; i < desired_worker_count; i++) { + if (!trace_worker_init(&tracer->workers[i], heap, tracer, i)) + break; + if (trace_worker_spawn(&tracer->workers[i])) + tracer->worker_count++; + else + break; + } + return tracer->worker_count > 0; +} + +static void tracer_prepare(struct heap *heap) { + struct tracer *tracer = heap_tracer(heap); + for (size_t i = 0; i < tracer->worker_count; i++) + tracer->workers[i].steal_id = 0; +} +static void tracer_release(struct heap *heap) { + struct tracer *tracer = heap_tracer(heap); + for (size_t i = 0; i < tracer->worker_count; i++) + trace_deque_release(&tracer->workers[i].deque); +} + +struct gcobj; +static inline void tracer_visit(void **loc, void *trace_data) ALWAYS_INLINE; +static inline void trace_one(struct gcobj *obj, void *trace_data) ALWAYS_INLINE; +static inline int trace_object(struct heap *heap, + struct gcobj *obj) ALWAYS_INLINE; + +static inline void +tracer_share(struct local_tracer *trace) { + DEBUG("tracer #%zu: sharing\n", trace->worker->id); + for (size_t i = 0; i < LOCAL_TRACE_QUEUE_SHARE_AMOUNT; i++) + trace_deque_push(trace->share_deque, local_trace_queue_pop(&trace->local)); +} + +static inline void +tracer_visit(void **loc, void *trace_data) { + struct local_tracer *trace = trace_data; + struct gcobj *obj = *loc; + if (obj && trace_object(trace->heap, obj)) { + if (local_trace_queue_full(&trace->local)) + tracer_share(trace); + local_trace_queue_push(&trace->local, obj); + } +} + +static struct gcobj * +tracer_steal_from_worker(struct tracer *tracer, size_t id) { + ASSERT(id < tracer->worker_count); + return trace_deque_steal(&tracer->workers[id].deque); +} + +static 
int +tracer_can_steal_from_worker(struct tracer *tracer, size_t id) { + ASSERT(id < tracer->worker_count); + return trace_deque_can_steal(&tracer->workers[id].deque); +} + +static struct gcobj * +trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer) { + size_t steal_id = worker->steal_id; + for (size_t i = 0; i < tracer->worker_count; i++) { + steal_id = (steal_id + 1) % tracer->worker_count; + DEBUG("tracer #%zu: stealing from #%zu\n", worker->id, steal_id); + struct gcobj * obj = tracer_steal_from_worker(tracer, steal_id); + if (obj) { + DEBUG("tracer #%zu: stealing got %p\n", worker->id, obj); + worker->steal_id = steal_id; + return obj; + } + } + DEBUG("tracer #%zu: failed to steal\n", worker->id); + return 0; +} + +static int +trace_worker_can_steal_from_any(struct trace_worker *worker, struct tracer *tracer) { + size_t steal_id = worker->steal_id; + DEBUG("tracer #%zu: checking if any worker has tasks\n", worker->id); + for (size_t i = 0; i < tracer->worker_count; i++) { + steal_id = (steal_id + 1) % tracer->worker_count; + int res = tracer_can_steal_from_worker(tracer, steal_id); + if (res) { + DEBUG("tracer #%zu: worker #%zu has tasks!\n", worker->id, steal_id); + worker->steal_id = steal_id; + return 1; + } + } + DEBUG("tracer #%zu: nothing to steal\n", worker->id); + return 0; +} + +static int +trace_worker_check_termination(struct trace_worker *worker, + struct tracer *tracer) { + // We went around all workers and nothing. Enter termination phase. + if (atomic_fetch_sub_explicit(&tracer->active_tracers, 1, + memory_order_relaxed) == 1) { + DEBUG(" ->> tracer #%zu: DONE (no spinning) <<-\n", worker->id); + return 1; + } + + size_t spin_count = 0; + while (1) { + if (trace_worker_can_steal_from_any(worker, tracer)) { + atomic_fetch_add_explicit(&tracer->active_tracers, 1, + memory_order_relaxed); + return 0; + } + if (atomic_load_explicit(&tracer->active_tracers, + memory_order_relaxed) == 0) { + DEBUG(" ->> tracer #%zu: DONE <<-\n", worker->id); + return 1; + } + // spin + DEBUG("tracer #%zu: spinning #%zu\n", worker->id, spin_count); + if (spin_count < 10) + __builtin_ia32_pause(); + else if (spin_count < 20) + sched_yield(); + else if (spin_count < 40) + usleep(0); + else + usleep(1); + spin_count++; + } +} + +static struct gcobj * +trace_worker_steal(struct local_tracer *trace) { + struct tracer *tracer = heap_tracer(trace->heap); + struct trace_worker *worker = trace->worker; + + while (1) { + DEBUG("tracer #%zu: trying to steal\n", worker->id); + struct gcobj *obj = trace_worker_steal_from_any(worker, tracer); + if (obj) + return obj; + + if (trace_worker_check_termination(worker, tracer)) + return NULL; + } +} + +static void +trace_worker_trace(struct trace_worker *worker) { + struct local_tracer trace; + trace.worker = worker; + trace.share_deque = &worker->deque; + trace.heap = worker->heap; + local_trace_queue_init(&trace.local); + + size_t n = 0; + DEBUG("tracer #%zu: running trace loop\n", worker->id); + while (1) { + struct gcobj * obj; + if (!local_trace_queue_empty(&trace.local)) { + obj = local_trace_queue_pop(&trace.local); + } else { + obj = trace_worker_steal(&trace); + if (!obj) + break; + } + trace_one(obj, &trace); + n++; + } + DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); + + trace_worker_finished_tracing(worker); +} + +static inline void +tracer_enqueue_root(struct tracer *tracer, struct gcobj *obj) { + struct trace_deque *worker0_deque = &tracer->workers[0].deque; + trace_deque_push(worker0_deque, obj); 
+} + +static inline void +tracer_enqueue_roots(struct tracer *tracer, struct gcobj **objv, + size_t count) { + struct trace_deque *worker0_deque = &tracer->workers[0].deque; + trace_deque_push_many(worker0_deque, objv, count); +} + +static inline void +tracer_trace(struct heap *heap) { + struct tracer *tracer = heap_tracer(heap); + + pthread_mutex_lock(&tracer->lock); + long trace_count = tracer->count; + pthread_mutex_unlock(&tracer->lock); + + DEBUG("starting trace; %zu workers\n", tracer->worker_count); + DEBUG("waking workers\n"); + atomic_store_explicit(&tracer->active_tracers, tracer->worker_count, + memory_order_release); + atomic_store_explicit(&tracer->running_tracers, tracer->worker_count, + memory_order_release); + for (size_t i = 0; i < tracer->worker_count; i++) + trace_worker_request_trace(&tracer->workers[i]); + + DEBUG("waiting on tracers\n"); + + pthread_mutex_lock(&tracer->lock); + while (tracer->count <= trace_count) + pthread_cond_wait(&tracer->cond, &tracer->lock); + pthread_mutex_unlock(&tracer->lock); + + DEBUG("trace finished\n"); +} + +#endif // PARALLEL_TRACER_H diff --git a/serial-marker.h b/serial-marker.h deleted file mode 100644 index 5f5330b6d..000000000 --- a/serial-marker.h +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef SERIAL_TRACE_H -#define SERIAL_TRACE_H - -#include -#include - -#include "assert.h" -#include "debug.h" - -struct gcobj; - -struct mark_queue { - size_t size; - size_t read; - size_t write; - struct gcobj **buf; -}; - -static const size_t mark_queue_max_size = - (1ULL << (sizeof(struct gcobj *) * 8 - 1)) / sizeof(struct gcobj *); -static const size_t mark_queue_release_byte_threshold = 1 * 1024 * 1024; - -static struct gcobj ** -mark_queue_alloc(size_t size) { - void *mem = mmap(NULL, size * sizeof(struct gcobj *), PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("Failed to grow mark queue"); - DEBUG("Failed to allocate %zu bytes", size); - return NULL; - } - return mem; -} - -static int -mark_queue_init(struct mark_queue *q) { - q->size = getpagesize() / sizeof(struct gcobj *); - q->read = 0; - q->write = 0; - q->buf = mark_queue_alloc(q->size); - return !!q->buf; -} - -static inline struct gcobj * -mark_queue_get(struct mark_queue *q, size_t idx) { - return q->buf[idx & (q->size - 1)]; -} - -static inline void -mark_queue_put(struct mark_queue *q, size_t idx, struct gcobj *x) { - q->buf[idx & (q->size - 1)] = x; -} - -static int mark_queue_grow(struct mark_queue *q) NEVER_INLINE; - -static int -mark_queue_grow(struct mark_queue *q) { - size_t old_size = q->size; - struct gcobj **old_buf = q->buf; - if (old_size >= mark_queue_max_size) { - DEBUG("mark queue already at max size of %zu bytes", old_size); - return 0; - } - - size_t new_size = old_size * 2; - struct gcobj **new_buf = mark_queue_alloc(new_size); - if (!new_buf) - return 0; - - size_t old_mask = old_size - 1; - size_t new_mask = new_size - 1; - - for (size_t i = q->read; i < q->write; i++) - new_buf[i & new_mask] = old_buf[i & old_mask]; - - munmap(old_buf, old_size * sizeof(struct gcobj *)); - - q->size = new_size; - q->buf = new_buf; - return 1; -} - -static inline void -mark_queue_push(struct mark_queue *q, struct gcobj *p) { - if (UNLIKELY(q->write - q->read == q->size)) { - if (!mark_queue_grow(q)) - abort(); - } - mark_queue_put(q, q->write++, p); -} - -static inline void -mark_queue_push_many(struct mark_queue *q, struct gcobj **pv, size_t count) { - while (q->size - (q->write - q->read) < count) { - if (!mark_queue_grow(q)) - 
abort(); - } - for (size_t i = 0; i < count; i++) - mark_queue_put(q, q->write++, pv[i]); -} - -static inline struct gcobj* -mark_queue_pop(struct mark_queue *q) { - if (UNLIKELY(q->read == q->write)) - return NULL; - return mark_queue_get(q, q->read++); -} - -static void -mark_queue_release(struct mark_queue *q) { - size_t byte_size = q->size * sizeof(struct gcobj *); - if (byte_size >= mark_queue_release_byte_threshold) - madvise(q->buf, byte_size, MADV_DONTNEED); - q->read = q->write = 0; -} - -static void -mark_queue_destroy(struct mark_queue *q) { - size_t byte_size = q->size * sizeof(struct gcobj *); - munmap(q->buf, byte_size); -} - -struct marker { - struct mark_queue queue; -}; - -struct mark_space; -static inline struct marker* mark_space_marker(struct mark_space *space); - -static int -marker_init(struct mark_space *space) { - return mark_queue_init(&mark_space_marker(space)->queue); -} -static void marker_prepare(struct mark_space *space) {} -static void marker_release(struct mark_space *space) { - mark_queue_release(&mark_space_marker(space)->queue); -} - -struct gcobj; -static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE; -static inline void trace_one(struct gcobj *obj, void *mark_data) ALWAYS_INLINE; -static inline int mark_object(struct mark_space *space, - struct gcobj *obj) ALWAYS_INLINE; - -static inline void -marker_enqueue_root(struct marker *marker, struct gcobj *obj) { - mark_queue_push(&marker->queue, obj); -} -static inline void -marker_enqueue_roots(struct marker *marker, struct gcobj **objs, - size_t count) { - mark_queue_push_many(&marker->queue, objs, count); -} -static inline void -marker_visit(void **loc, void *mark_data) { - struct mark_space *space = mark_data; - struct gcobj *obj = *loc; - if (obj && mark_object(space, obj)) - marker_enqueue_root(mark_space_marker(space), obj); -} -static inline void -marker_trace(struct mark_space *space) { - struct gcobj *obj; - while ((obj = mark_queue_pop(&mark_space_marker(space)->queue))) - trace_one(obj, space); -} - -#endif // SERIAL_MARK_H diff --git a/serial-tracer.h b/serial-tracer.h new file mode 100644 index 000000000..7bea9e63e --- /dev/null +++ b/serial-tracer.h @@ -0,0 +1,168 @@ +#ifndef SERIAL_TRACER_H +#define SERIAL_TRACER_H + +#include +#include + +#include "assert.h" +#include "debug.h" + +struct gcobj; + +struct trace_queue { + size_t size; + size_t read; + size_t write; + struct gcobj **buf; +}; + +static const size_t trace_queue_max_size = + (1ULL << (sizeof(struct gcobj *) * 8 - 1)) / sizeof(struct gcobj *); +static const size_t trace_queue_release_byte_threshold = 1 * 1024 * 1024; + +static struct gcobj ** +trace_queue_alloc(size_t size) { + void *mem = mmap(NULL, size * sizeof(struct gcobj *), PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("Failed to grow trace queue"); + DEBUG("Failed to allocate %zu bytes", size); + return NULL; + } + return mem; +} + +static int +trace_queue_init(struct trace_queue *q) { + q->size = getpagesize() / sizeof(struct gcobj *); + q->read = 0; + q->write = 0; + q->buf = trace_queue_alloc(q->size); + return !!q->buf; +} + +static inline struct gcobj * +trace_queue_get(struct trace_queue *q, size_t idx) { + return q->buf[idx & (q->size - 1)]; +} + +static inline void +trace_queue_put(struct trace_queue *q, size_t idx, struct gcobj *x) { + q->buf[idx & (q->size - 1)] = x; +} + +static int trace_queue_grow(struct trace_queue *q) NEVER_INLINE; + +static int +trace_queue_grow(struct trace_queue *q) { + 
size_t old_size = q->size; + struct gcobj **old_buf = q->buf; + if (old_size >= trace_queue_max_size) { + DEBUG("trace queue already at max size of %zu bytes", old_size); + return 0; + } + + size_t new_size = old_size * 2; + struct gcobj **new_buf = trace_queue_alloc(new_size); + if (!new_buf) + return 0; + + size_t old_mask = old_size - 1; + size_t new_mask = new_size - 1; + + for (size_t i = q->read; i < q->write; i++) + new_buf[i & new_mask] = old_buf[i & old_mask]; + + munmap(old_buf, old_size * sizeof(struct gcobj *)); + + q->size = new_size; + q->buf = new_buf; + return 1; +} + +static inline void +trace_queue_push(struct trace_queue *q, struct gcobj *p) { + if (UNLIKELY(q->write - q->read == q->size)) { + if (!trace_queue_grow(q)) + abort(); + } + trace_queue_put(q, q->write++, p); +} + +static inline void +trace_queue_push_many(struct trace_queue *q, struct gcobj **pv, size_t count) { + while (q->size - (q->write - q->read) < count) { + if (!trace_queue_grow(q)) + abort(); + } + for (size_t i = 0; i < count; i++) + trace_queue_put(q, q->write++, pv[i]); +} + +static inline struct gcobj* +trace_queue_pop(struct trace_queue *q) { + if (UNLIKELY(q->read == q->write)) + return NULL; + return trace_queue_get(q, q->read++); +} + +static void +trace_queue_release(struct trace_queue *q) { + size_t byte_size = q->size * sizeof(struct gcobj *); + if (byte_size >= trace_queue_release_byte_threshold) + madvise(q->buf, byte_size, MADV_DONTNEED); + q->read = q->write = 0; +} + +static void +trace_queue_destroy(struct trace_queue *q) { + size_t byte_size = q->size * sizeof(struct gcobj *); + munmap(q->buf, byte_size); +} + +struct tracer { + struct trace_queue queue; +}; + +struct heap; +static inline struct tracer* heap_tracer(struct heap *heap); + +static int +tracer_init(struct heap *heap) { + return trace_queue_init(&heap_tracer(heap)->queue); +} +static void tracer_prepare(struct heap *heap) {} +static void tracer_release(struct heap *heap) { + trace_queue_release(&heap_tracer(heap)->queue); +} + +struct gcobj; +static inline void tracer_visit(void **loc, void *trace_data) ALWAYS_INLINE; +static inline void trace_one(struct gcobj *obj, void *trace_data) ALWAYS_INLINE; +static inline int trace_object(struct heap *heap, + struct gcobj *obj) ALWAYS_INLINE; + +static inline void +tracer_enqueue_root(struct tracer *tracer, struct gcobj *obj) { + trace_queue_push(&tracer->queue, obj); +} +static inline void +tracer_enqueue_roots(struct tracer *tracer, struct gcobj **objs, + size_t count) { + trace_queue_push_many(&tracer->queue, objs, count); +} +static inline void +tracer_visit(void **loc, void *trace_data) { + struct heap *heap = trace_data; + struct gcobj *obj = *loc; + if (obj && trace_object(heap, obj)) + tracer_enqueue_root(heap_tracer(heap), obj); +} +static inline void +tracer_trace(struct heap *heap) { + struct gcobj *obj; + while ((obj = trace_queue_pop(&heap_tracer(heap)->queue))) + trace_one(obj, heap); +} + +#endif // SERIAL_TRACER_H From 3ee2009de92fd56654cf253af01e5b460958ae80 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Apr 2022 20:56:48 +0200 Subject: [PATCH 068/403] Move a lot of mark_space state to heap --- mark-sweep.h | 290 ++++++++++++++++++++++++++------------------------- 1 file changed, 149 insertions(+), 141 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 0dc20e550..4e10ba858 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -104,13 +104,6 @@ struct gcobj { }; struct mark_space { - pthread_mutex_t lock; - pthread_cond_t collector_cond; - 
pthread_cond_t mutator_cond; - int collecting; - int multithreaded; - size_t active_mutator_count; - size_t mutator_count; struct gcobj_freelists small_objects; // Unordered list of medium objects. struct gcobj_free_medium *medium_objects; @@ -119,16 +112,24 @@ struct mark_space { uintptr_t heap_base; size_t heap_size; uintptr_t sweep; - struct handle *global_roots; - struct mutator_mark_buf *mutator_roots; void *mem; size_t mem_size; - long count; - struct mutator *deactivated_mutators; }; struct heap { struct mark_space mark_space; + pthread_mutex_t lock; + pthread_cond_t collector_cond; + pthread_cond_t mutator_cond; + size_t size; + int collecting; + int multithreaded; + size_t active_mutator_count; + size_t mutator_count; + struct handle *global_roots; + struct mutator_mark_buf *mutator_roots; + long count; + struct mutator *deactivated_mutators; struct tracer tracer; }; @@ -157,9 +158,6 @@ static inline struct mark_space* heap_mark_space(struct heap *heap) { static inline struct heap* mutator_heap(struct mutator *mutator) { return mutator->heap; } -static inline struct mark_space* mutator_mark_space(struct mutator *mutator) { - return heap_mark_space(mutator_heap(mutator)); -} static inline struct gcobj_free** get_small_object_freelist(struct gcobj_freelists *freelists, @@ -216,61 +214,59 @@ static void clear_global_freelists(struct mark_space *space) { space->medium_objects = NULL; } -static int space_has_multiple_mutators(struct mark_space *space) { - return atomic_load_explicit(&space->multithreaded, memory_order_relaxed); +static int heap_has_multiple_mutators(struct heap *heap) { + return atomic_load_explicit(&heap->multithreaded, memory_order_relaxed); } -static int mutators_are_stopping(struct mark_space *space) { - return atomic_load_explicit(&space->collecting, memory_order_relaxed); +static int mutators_are_stopping(struct heap *heap) { + return atomic_load_explicit(&heap->collecting, memory_order_relaxed); } -static inline void mark_space_lock(struct mark_space *space) { - pthread_mutex_lock(&space->lock); +static inline void heap_lock(struct heap *heap) { + pthread_mutex_lock(&heap->lock); } -static inline void mark_space_unlock(struct mark_space *space) { - pthread_mutex_unlock(&space->lock); +static inline void heap_unlock(struct heap *heap) { + pthread_mutex_unlock(&heap->lock); } static void add_mutator(struct heap *heap, struct mutator *mut) { mut->heap = heap; - struct mark_space *space = heap_mark_space(heap); - mark_space_lock(space); + heap_lock(heap); // We have no roots. If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. - while (mutators_are_stopping(space)) - pthread_cond_wait(&space->mutator_cond, &space->lock); - if (space->mutator_count == 1) - space->multithreaded = 1; - space->active_mutator_count++; - space->mutator_count++; - mark_space_unlock(space); + while (mutators_are_stopping(heap)) + pthread_cond_wait(&heap->mutator_cond, &heap->lock); + if (heap->mutator_count == 1) + heap->multithreaded = 1; + heap->active_mutator_count++; + heap->mutator_count++; + heap_unlock(heap); } static void remove_mutator(struct heap *heap, struct mutator *mut) { mut->heap = NULL; - struct mark_space *space = heap_mark_space(heap); - mark_space_lock(space); - space->active_mutator_count--; - space->mutator_count--; + heap_lock(heap); + heap->active_mutator_count--; + heap->mutator_count--; // We have no roots. If there is a GC stop currently in progress, // maybe tell the controller it can continue. 
- if (mutators_are_stopping(space) && space->active_mutator_count == 0) - pthread_cond_signal(&space->collector_cond); - mark_space_unlock(space); + if (mutators_are_stopping(heap) && heap->active_mutator_count == 0) + pthread_cond_signal(&heap->collector_cond); + heap_unlock(heap); } -static void request_mutators_to_stop(struct mark_space *space) { - ASSERT(!mutators_are_stopping(space)); - atomic_store_explicit(&space->collecting, 1, memory_order_relaxed); +static void request_mutators_to_stop(struct heap *heap) { + ASSERT(!mutators_are_stopping(heap)); + atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); } -static void allow_mutators_to_continue(struct mark_space *space) { - ASSERT(mutators_are_stopping(space)); - ASSERT(space->active_mutator_count == 0); - space->active_mutator_count++; - atomic_store_explicit(&space->collecting, 0, memory_order_relaxed); - ASSERT(!mutators_are_stopping(space)); - pthread_cond_broadcast(&space->mutator_cond); +static void allow_mutators_to_continue(struct heap *heap) { + ASSERT(mutators_are_stopping(heap)); + ASSERT(heap->active_mutator_count == 0); + heap->active_mutator_count++; + atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); + ASSERT(!mutators_are_stopping(heap)); + pthread_cond_broadcast(&heap->mutator_cond); } static void mutator_mark_buf_grow(struct mutator_mark_buf *buf) { @@ -318,7 +314,6 @@ static void mutator_mark_buf_destroy(struct mutator_mark_buf *buf) { // enqueue them directly, so we send them to the controller in a buffer. static void mark_stopping_mutator_roots(struct mutator *mut) { struct heap *heap = mutator_heap(mut); - struct mark_space *space = heap_mark_space(heap); struct mutator_mark_buf *local_roots = &mut->mark_buf; for (struct handle *h = mut->roots; h; h = h->next) { struct gcobj *root = h->v; @@ -328,10 +323,10 @@ static void mark_stopping_mutator_roots(struct mutator *mut) { // Post to global linked-list of thread roots. 
struct mutator_mark_buf *next = - atomic_load_explicit(&space->mutator_roots, memory_order_acquire); + atomic_load_explicit(&heap->mutator_roots, memory_order_acquire); do { local_roots->next = next; - } while (!atomic_compare_exchange_weak(&space->mutator_roots, + } while (!atomic_compare_exchange_weak(&heap->mutator_roots, &next, local_roots)); } @@ -349,77 +344,76 @@ static void release_stopping_mutator_roots(struct mutator *mut) { mutator_mark_buf_release(&mut->mark_buf); } -static void wait_for_mutators_to_stop(struct mark_space *space) { - space->active_mutator_count--; - while (space->active_mutator_count) - pthread_cond_wait(&space->collector_cond, &space->lock); +static void wait_for_mutators_to_stop(struct heap *heap) { + heap->active_mutator_count--; + while (heap->active_mutator_count) + pthread_cond_wait(&heap->collector_cond, &heap->lock); } -static void mark_inactive_mutators(struct mark_space *space) { - for (struct mutator *mut = space->deactivated_mutators; mut; mut = mut->next) +static void mark_inactive_mutators(struct heap *heap) { + for (struct mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) mark_controlling_mutator_roots(mut); } static void mark_global_roots(struct heap *heap) { - struct mark_space *space = heap_mark_space(heap); - for (struct handle *h = space->global_roots; h; h = h->next) { + for (struct handle *h = heap->global_roots; h; h = h->next) { struct gcobj *obj = h->v; if (obj && trace_object(heap, obj)) tracer_enqueue_root(&heap->tracer, obj); } - struct mutator_mark_buf *roots = atomic_load(&space->mutator_roots); + struct mutator_mark_buf *roots = atomic_load(&heap->mutator_roots); for (; roots; roots = roots->next) tracer_enqueue_roots(&heap->tracer, roots->objects, roots->size); - atomic_store(&space->mutator_roots, NULL); + atomic_store(&heap->mutator_roots, NULL); } -static void pause_mutator_for_collection(struct mark_space *space) NEVER_INLINE; -static void pause_mutator_for_collection(struct mark_space *space) { - ASSERT(mutators_are_stopping(space)); - ASSERT(space->active_mutator_count); - space->active_mutator_count--; - if (space->active_mutator_count == 0) - pthread_cond_signal(&space->collector_cond); +static void pause_mutator_for_collection(struct heap *heap) NEVER_INLINE; +static void pause_mutator_for_collection(struct heap *heap) { + ASSERT(mutators_are_stopping(heap)); + ASSERT(heap->active_mutator_count); + heap->active_mutator_count--; + if (heap->active_mutator_count == 0) + pthread_cond_signal(&heap->collector_cond); // Go to sleep and wake up when the collector is done. Note, // however, that it may be that some other mutator manages to // trigger collection before we wake up. In that case we need to // mark roots, not just sleep again. To detect a wakeup on this // collection vs a future collection, we use the global GC count. - // This is safe because the count is protected by the space lock, + // This is safe because the count is protected by the heap lock, // which we hold. 
- long epoch = space->count; + long epoch = heap->count; do - pthread_cond_wait(&space->mutator_cond, &space->lock); - while (mutators_are_stopping(space) && space->count == epoch); + pthread_cond_wait(&heap->mutator_cond, &heap->lock); + while (mutators_are_stopping(heap) && heap->count == epoch); - space->active_mutator_count++; + heap->active_mutator_count++; } static void pause_mutator_for_collection_with_lock(struct mutator *mut) NEVER_INLINE; static void pause_mutator_for_collection_with_lock(struct mutator *mut) { - struct mark_space *space = mutator_mark_space(mut); - ASSERT(mutators_are_stopping(space)); + struct heap *heap = mutator_heap(mut); + ASSERT(mutators_are_stopping(heap)); mark_controlling_mutator_roots(mut); - pause_mutator_for_collection(space); + pause_mutator_for_collection(heap); clear_mutator_freelists(mut); } static void pause_mutator_for_collection_without_lock(struct mutator *mut) NEVER_INLINE; static void pause_mutator_for_collection_without_lock(struct mutator *mut) { - struct mark_space *space = mutator_mark_space(mut); - ASSERT(mutators_are_stopping(space)); + struct heap *heap = mutator_heap(mut); + ASSERT(mutators_are_stopping(heap)); mark_stopping_mutator_roots(mut); - mark_space_lock(space); - pause_mutator_for_collection(space); - mark_space_unlock(space); + heap_lock(heap); + pause_mutator_for_collection(heap); + heap_unlock(heap); release_stopping_mutator_roots(mut); clear_mutator_freelists(mut); } static inline void maybe_pause_mutator_for_collection(struct mutator *mut) { - while (mutators_are_stopping(mutator_mark_space(mut))) + while (mutators_are_stopping(mutator_heap(mut))) pause_mutator_for_collection_without_lock(mut); } @@ -429,19 +423,19 @@ static void reset_sweeper(struct mark_space *space) { static void collect(struct heap *heap, struct mutator *mut) { struct mark_space *space = heap_mark_space(heap); - DEBUG("start collect #%ld:\n", space->count); + DEBUG("start collect #%ld:\n", heap->count); tracer_prepare(heap); - request_mutators_to_stop(space); + request_mutators_to_stop(heap); mark_controlling_mutator_roots(mut); - wait_for_mutators_to_stop(space); - mark_inactive_mutators(space); + wait_for_mutators_to_stop(heap); + mark_inactive_mutators(heap); mark_global_roots(heap); tracer_trace(heap); tracer_release(heap); clear_global_freelists(space); reset_sweeper(space); - space->count++; - allow_mutators_to_continue(space); + heap->count++; + allow_mutators_to_continue(heap); clear_mutator_freelists(mut); DEBUG("collect done\n"); } @@ -618,15 +612,16 @@ static int sweep(struct mark_space *space, static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, size_t granules) { - struct mark_space *space = mutator_mark_space(mut); - struct gcobj_freelists *small_objects = space_has_multiple_mutators(space) ? + struct heap *heap = mutator_heap(mut); + struct mark_space *space = heap_mark_space(heap); + struct gcobj_freelists *small_objects = heap_has_multiple_mutators(heap) ? 
&space->small_objects : &mut->small_objects; maybe_pause_mutator_for_collection(mut); - mark_space_lock(space); + heap_lock(heap); - while (mutators_are_stopping(space)) + while (mutators_are_stopping(heap)) pause_mutator_for_collection_with_lock(mut); int swept_from_beginning = 0; @@ -640,7 +635,7 @@ static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, if (medium->granules >= granules) { unlink_medium_object(prev, medium); split_medium_object(space, small_objects, medium, granules); - mark_space_unlock(space); + heap_unlock(heap); struct gcobj *obj = (struct gcobj *)medium; obj->tag = tag_live(kind); return medium; @@ -651,7 +646,7 @@ static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, // No medium object, and we swept across the whole heap. Collect. if (swept_from_beginning) { - fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); + fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); abort(); } else { collect(mutator_heap(mut), mut); @@ -682,7 +677,7 @@ static int fill_small_from_local(struct gcobj_freelists *small_objects, return 0; } -// with space lock +// with heap lock static int fill_small_from_medium(struct mark_space *space, struct gcobj_freelists *small_objects, enum small_object_size kind) { @@ -719,13 +714,14 @@ static void fill_small_from_global(struct mutator *mut, static void fill_small_from_global(struct mutator *mut, enum small_object_size kind) { struct gcobj_freelists *small_objects = &mut->small_objects; - struct mark_space *space = mutator_mark_space(mut); + struct heap *heap = mutator_heap(mut); + struct mark_space *space = heap_mark_space(heap); maybe_pause_mutator_for_collection(mut); - mark_space_lock(space); + heap_lock(heap); - while (mutators_are_stopping(space)) + while (mutators_are_stopping(heap)) pause_mutator_for_collection_with_lock(mut); int swept_from_beginning = 0; @@ -739,7 +735,7 @@ static void fill_small_from_global(struct mutator *mut, // By default, pull in 16 kB of data at a time. 
if (!sweep(space, small_objects, kind, 0)) { if (swept_from_beginning) { - fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size); + fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); abort(); } else { collect(mutator_heap(mut), mut); @@ -750,7 +746,7 @@ static void fill_small_from_global(struct mutator *mut, if (*get_small_object_freelist(small_objects, kind)) break; } - mark_space_unlock(space); + heap_unlock(heap); } static void fill_small(struct mutator *mut, enum small_object_size kind) { @@ -799,18 +795,8 @@ static inline void* get_field(void **addr) { return *addr; } -static int initialize_gc(size_t size, struct heap **heap, - struct mutator **mut) { -#define SMALL_OBJECT_GRANULE_SIZE(i) \ - ASSERT_EQ(SMALL_OBJECT_##i, small_object_sizes_for_granules[i]); \ - ASSERT_EQ(SMALL_OBJECT_##i + 1, small_object_sizes_for_granules[i+1]); - FOR_EACH_SMALL_OBJECT_GRANULES(SMALL_OBJECT_GRANULE_SIZE); -#undef SMALL_OBJECT_GRANULE_SIZE - - ASSERT_EQ(SMALL_OBJECT_SIZES - 1, - small_object_sizes_for_granules[MEDIUM_OBJECT_GRANULE_THRESHOLD]); - - size = align_up(size, getpagesize()); +static int mark_space_init(struct mark_space *space, struct heap *heap) { + size_t size = align_up(heap->size, getpagesize()); void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); @@ -819,9 +805,6 @@ static int initialize_gc(size_t size, struct heap **heap, return 0; } - *heap = calloc(1, sizeof(struct heap)); - if (!*heap) abort(); - struct mark_space *space = heap_mark_space(*heap); space->mem = mem; space->mem_size = size; // If there is 1 mark byte per granule, and SIZE bytes available for @@ -834,18 +817,43 @@ static int initialize_gc(size_t size, struct heap **heap, size_t mark_bytes_size = (size + GRANULE_SIZE) / (GRANULE_SIZE + 1); size_t overhead = align_up(mark_bytes_size, GRANULE_SIZE); - pthread_mutex_init(&space->lock, NULL); - pthread_cond_init(&space->mutator_cond, NULL); - pthread_cond_init(&space->collector_cond, NULL); - space->heap_base = ((uintptr_t) mem) + overhead; space->heap_size = size - overhead; space->sweep = space->heap_base + space->heap_size; - if (!tracer_init(*heap)) - abort(); reclaim(space, NULL, NOT_SMALL_OBJECT, (void*)space->heap_base, size_to_granules(space->heap_size)); + return 1; +} +static int initialize_gc(size_t size, struct heap **heap, + struct mutator **mut) { +#define SMALL_OBJECT_GRANULE_SIZE(i) \ + ASSERT_EQ(SMALL_OBJECT_##i, small_object_sizes_for_granules[i]); \ + ASSERT_EQ(SMALL_OBJECT_##i + 1, small_object_sizes_for_granules[i+1]); + FOR_EACH_SMALL_OBJECT_GRANULES(SMALL_OBJECT_GRANULE_SIZE); +#undef SMALL_OBJECT_GRANULE_SIZE + + ASSERT_EQ(SMALL_OBJECT_SIZES - 1, + small_object_sizes_for_granules[MEDIUM_OBJECT_GRANULE_THRESHOLD]); + + *heap = calloc(1, sizeof(struct heap)); + if (!*heap) abort(); + + pthread_mutex_init(&(*heap)->lock, NULL); + pthread_cond_init(&(*heap)->mutator_cond, NULL); + pthread_cond_init(&(*heap)->collector_cond, NULL); + (*heap)->size = size; + + if (!tracer_init(*heap)) + abort(); + + struct mark_space *space = heap_mark_space(*heap); + if (!mark_space_init(space, *heap)) { + free(*heap); + *heap = NULL; + return 0; + } + *mut = calloc(1, sizeof(struct mutator)); if (!*mut) abort(); add_mutator(*heap, *mut); @@ -867,28 +875,28 @@ static void finish_gc_for_thread(struct mutator *mut) { free(mut); } -static void deactivate_mutator(struct mark_space *space, struct mutator *mut) { +static void deactivate_mutator(struct heap *heap, struct mutator *mut) { ASSERT(mut->next == NULL); 
- mark_space_lock(space); - mut->next = space->deactivated_mutators; - space->deactivated_mutators = mut; - space->active_mutator_count--; - if (!space->active_mutator_count && mutators_are_stopping(space)) - pthread_cond_signal(&space->collector_cond); - mark_space_unlock(space); + heap_lock(heap); + mut->next = heap->deactivated_mutators; + heap->deactivated_mutators = mut; + heap->active_mutator_count--; + if (!heap->active_mutator_count && mutators_are_stopping(heap)) + pthread_cond_signal(&heap->collector_cond); + heap_unlock(heap); } -static void reactivate_mutator(struct mark_space *space, struct mutator *mut) { - mark_space_lock(space); - while (mutators_are_stopping(space)) - pthread_cond_wait(&space->mutator_cond, &space->lock); - struct mutator **prev = &space->deactivated_mutators; +static void reactivate_mutator(struct heap *heap, struct mutator *mut) { + heap_lock(heap); + while (mutators_are_stopping(heap)) + pthread_cond_wait(&heap->mutator_cond, &heap->lock); + struct mutator **prev = &heap->deactivated_mutators; while (*prev != mut) prev = &(*prev)->next; *prev = mut->next; mut->next = NULL; - space->active_mutator_count++; - mark_space_unlock(space); + heap->active_mutator_count++; + heap_unlock(heap); } static void* call_without_gc(struct mutator *mut, void* (*f)(void*), @@ -896,10 +904,10 @@ static void* call_without_gc(struct mutator *mut, void* (*f)(void*), static void* call_without_gc(struct mutator *mut, void* (*f)(void*), void *data) { - struct mark_space *space = mutator_mark_space(mut); - deactivate_mutator(space, mut); + struct heap *heap = mutator_heap(mut); + deactivate_mutator(heap, mut); void *ret = f(data); - reactivate_mutator(space, mut); + reactivate_mutator(heap, mut); return ret; } @@ -907,6 +915,6 @@ static inline void print_start_gc_stats(struct heap *heap) { } static inline void print_end_gc_stats(struct heap *heap) { - printf("Completed %ld collections\n", heap_mark_space(heap)->count); - printf("Heap size with overhead is %zd\n", heap_mark_space(heap)->mem_size); + printf("Completed %ld collections\n", heap->count); + printf("Heap size with overhead is %zd\n", heap->size); } From adc4a7a26915586c2d121c3559ee7715fa66c940 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Apr 2022 21:20:00 +0200 Subject: [PATCH 069/403] Add large object space to mark-sweep collector This will let us partition the mark space into chunks of 32 or 64 kB, as we won't need to allocate chunk-spanning objects. This will improve sweeping parallelism and is a step on the way to immix. 
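For reference, the split this patch introduces is purely size-based on the
allocation path: up to MEDIUM_OBJECT_GRANULE_THRESHOLD granules a request is
served from the small freelists, up to LARGE_OBJECT_GRANULE_THRESHOLD from the
medium freelists, and anything bigger goes to the new large object space;
tracing then dispatches on the object's address via mark_space_contains.  The
following stand-alone sketch only reproduces that decision with the thresholds
from the mark-sweep.h diff below (8-byte granules at this point in the series),
assuming size_to_granules rounds a byte count up to whole granules; it is an
illustration, not collector code.

  #include <stdio.h>
  #include <stddef.h>

  /* Constants as defined in the mark-sweep.h diff below (patch 069). */
  #define GRANULE_SIZE 8
  #define MEDIUM_OBJECT_GRANULE_THRESHOLD 32
  #define LARGE_OBJECT_GRANULE_THRESHOLD 1024

  static size_t size_to_granules(size_t size) {
    return (size + GRANULE_SIZE - 1) / GRANULE_SIZE;  /* round up */
  }

  static const char *chosen_space(size_t size) {
    size_t granules = size_to_granules(size);
    if (granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD)
      return "mark space (small freelists)";
    if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD)
      return "mark space (medium freelists)";
    return "large object space";
  }

  int main(void) {
    size_t sizes[] = { 16, 256, 257, 8192, 8193, 1 << 20 };
    for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
      printf("%8zu bytes -> %s\n", sizes[i], chosen_space(sizes[i]));
    return 0;
  }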
--- Makefile | 6 +-- assert.h | 1 + large-object-space.h | 2 - mark-sweep.h | 91 ++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 88 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index e6e568b71..4e0946319 100644 --- a/Makefile +++ b/Makefile @@ -14,13 +14,13 @@ all: $(ALL_TESTS) bdw-%: bdw.h conservative-roots.h %-types.h %.c $(COMPILE) `pkg-config --libs --cflags bdw-gc` -DGC_BDW -o $@ $*.c -semi-%: semi.h precise-roots.h %-types.h heap-objects.h %.c +semi-%: semi.h precise-roots.h large-object-space.h %-types.h heap-objects.h %.c $(COMPILE) -DGC_SEMI -o $@ $*.c -mark-sweep-%: mark-sweep.h precise-roots.h serial-tracer.h assert.h debug.h %-types.h heap-objects.h %.c +mark-sweep-%: mark-sweep.h precise-roots.h large-object-space.h serial-tracer.h assert.h debug.h %-types.h heap-objects.h %.c $(COMPILE) -DGC_MARK_SWEEP -o $@ $*.c -parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c +parallel-mark-sweep-%: mark-sweep.h precise-roots.h large-object-space.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL_MARK_SWEEP -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/assert.h b/assert.h index 3133f105c..0c6db2f89 100644 --- a/assert.h +++ b/assert.h @@ -4,6 +4,7 @@ #define STATIC_ASSERT_EQ(a, b) _Static_assert((a) == (b), "eq") #define UNLIKELY(e) __builtin_expect(e, 0) +#define LIKELY(e) __builtin_expect(e, 1) #ifndef NDEBUG #define ASSERT(x) do { if (UNLIKELY(!(x))) __builtin_trap(); } while (0) diff --git a/large-object-space.h b/large-object-space.h index 29d733a9e..cf5be0b29 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -22,8 +22,6 @@ struct heap; struct gcobj; struct large_object_space { - struct heap *heap; - pthread_mutex_t lock; size_t page_size; diff --git a/mark-sweep.h b/mark-sweep.h index 4e10ba858..fd7d55838 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -9,6 +9,7 @@ #include "assert.h" #include "debug.h" #include "inline.h" +#include "large-object-space.h" #include "precise-roots.h" #ifdef GC_PARALLEL_MARK #include "parallel-tracer.h" @@ -20,10 +21,14 @@ #define GRANULE_SIZE_LOG_2 3 #define MEDIUM_OBJECT_THRESHOLD 256 #define MEDIUM_OBJECT_GRANULE_THRESHOLD 32 +#define LARGE_OBJECT_THRESHOLD 8192 +#define LARGE_OBJECT_GRANULE_THRESHOLD 1024 STATIC_ASSERT_EQ(GRANULE_SIZE, 1 << GRANULE_SIZE_LOG_2); STATIC_ASSERT_EQ(MEDIUM_OBJECT_THRESHOLD, MEDIUM_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); +STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, + LARGE_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); // There are small object pages for allocations of these sizes. 
#define FOR_EACH_SMALL_OBJECT_GRANULES(M) \ @@ -118,6 +123,7 @@ struct mark_space { struct heap { struct mark_space mark_space; + struct large_object_space large_object_space; pthread_mutex_t lock; pthread_cond_t collector_cond; pthread_cond_t mutator_cond; @@ -155,6 +161,9 @@ static inline struct tracer* heap_tracer(struct heap *heap) { static inline struct mark_space* heap_mark_space(struct heap *heap) { return &heap->mark_space; } +static inline struct large_object_space* heap_large_object_space(struct heap *heap) { + return &heap->large_object_space; +} static inline struct heap* mutator_heap(struct mutator *mutator) { return mutator->heap; } @@ -172,7 +181,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct heap *heap, struct mutator *mut) NEVER_INLINE; +static void collect(struct mutator *mut) NEVER_INLINE; static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { ASSERT(space->heap_base <= (uintptr_t) obj); @@ -181,14 +190,35 @@ static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { return &space->mark_bytes[granule]; } -static inline int trace_object(struct heap *heap, struct gcobj *obj) { - uint8_t *byte = mark_byte(heap_mark_space(heap), obj); +static inline int mark_space_trace_object(struct mark_space *space, + struct gcobj *obj) { + uint8_t *byte = mark_byte(space, obj); if (*byte) return 0; *byte = 1; return 1; } +static inline int mark_space_contains(struct mark_space *space, + struct gcobj *obj) { + uintptr_t addr = (uintptr_t)obj; + return addr - space->heap_base < space->heap_size; +} + +static inline int large_object_space_trace_object(struct large_object_space *space, + struct gcobj *obj) { + return large_object_space_copy(space, (uintptr_t)obj); +} + +static inline int trace_object(struct heap *heap, struct gcobj *obj) { + if (LIKELY(mark_space_contains(heap_mark_space(heap), obj))) + return mark_space_trace_object(heap_mark_space(heap), obj); + else if (large_object_space_contains(heap_large_object_space(heap), obj)) + return large_object_space_trace_object(heap_large_object_space(heap), obj); + else + abort(); +} + static inline void trace_one(struct gcobj *obj, void *mark_data) { switch (tag_live_alloc_kind(obj->tag)) { #define SCAN_OBJECT(name, Name, NAME) \ @@ -269,6 +299,15 @@ static void allow_mutators_to_continue(struct heap *heap) { pthread_cond_broadcast(&heap->mutator_cond); } +static int heap_steal_pages(struct heap *heap, size_t npages) { + // FIXME: When we have a block-structured mark space, actually return + // pages to the OS, and limit to the current heap size. + return 1; +} +static void heap_reset_stolen_pages(struct heap *heap, size_t npages) { + // FIXME: Possibly reclaim blocks from the reclaimed set. 
+} + static void mutator_mark_buf_grow(struct mutator_mark_buf *buf) { size_t old_capacity = buf->capacity; size_t old_bytes = old_capacity * sizeof(struct gcobj*); @@ -421,9 +460,12 @@ static void reset_sweeper(struct mark_space *space) { space->sweep = space->heap_base; } -static void collect(struct heap *heap, struct mutator *mut) { +static void collect(struct mutator *mut) { + struct heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); + struct large_object_space *lospace = heap_large_object_space(heap); DEBUG("start collect #%ld:\n", heap->count); + large_object_space_start_gc(lospace); tracer_prepare(heap); request_mutators_to_stop(heap); mark_controlling_mutator_roots(mut); @@ -435,6 +477,8 @@ static void collect(struct heap *heap, struct mutator *mut) { clear_global_freelists(space); reset_sweeper(space); heap->count++; + large_object_space_finish_gc(lospace); + heap_reset_stolen_pages(heap, lospace->live_pages_at_last_collection); allow_mutators_to_continue(heap); clear_mutator_freelists(mut); DEBUG("collect done\n"); @@ -610,6 +654,34 @@ static int sweep(struct mark_space *space, return 1; } +static void* allocate_large(struct mutator *mut, enum alloc_kind kind, + size_t granules) { + struct heap *heap = mutator_heap(mut); + struct large_object_space *space = heap_large_object_space(heap); + + size_t size = granules * GRANULE_SIZE; + size_t npages = large_object_space_npages(space, size); + if (!heap_steal_pages(heap, npages)) { + collect(mut); + if (!heap_steal_pages(heap, npages)) { + fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); + abort(); + } + } + + void *ret = large_object_space_alloc(space, npages); + if (!ret) + ret = large_object_space_obtain_and_alloc(space, npages); + + if (!ret) { + perror("weird: we have the space but mmap didn't work"); + abort(); + } + + *(uintptr_t*)ret = kind; + return ret; +} + static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, size_t granules) { struct heap *heap = mutator_heap(mut); @@ -649,7 +721,7 @@ static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); abort(); } else { - collect(mutator_heap(mut), mut); + collect(mut); swept_from_beginning = 1; } } @@ -738,7 +810,7 @@ static void fill_small_from_global(struct mutator *mut, fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); abort(); } else { - collect(mutator_heap(mut), mut); + collect(mut); swept_from_beginning = 1; } } @@ -777,7 +849,9 @@ static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t granules = size_to_granules(size); if (granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) return allocate_small(mut, kind, granules_to_small_object_size(granules)); - return allocate_medium(mut, kind, granules); + if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) + return allocate_medium(mut, kind, granules); + return allocate_large(mut, kind, granules); } static inline void* allocate_pointerless(struct mutator *mut, enum alloc_kind kind, @@ -854,6 +928,9 @@ static int initialize_gc(size_t size, struct heap **heap, return 0; } + if (!large_object_space_init(heap_large_object_space(*heap), *heap)) + abort(); + *mut = calloc(1, sizeof(struct mutator)); if (!*mut) abort(); add_mutator(*heap, *mut); From bea9ce883dac5aa07626c29dff9ceaac70053939 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 20 Apr 2022 10:54:19 +0200 Subject: [PATCH 070/403] mark-sweep collector uses 16 byte granules, packed small freelists Probably 
the collector should use 8 byte granules on 32-bit but for now we're working on 64-bit sizes. Since we don't (and never did) pack pages with same-sized small objects, no need to make sure that small object sizes fit evenly into the medium object threshold; just keep packed freelists. This is a simplification that lets us reclaim the tail of a region in constant time rather than looping through the size classes. --- mark-sweep.h | 189 +++++++++++++++++---------------------------------- 1 file changed, 62 insertions(+), 127 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index fd7d55838..47cf6c8aa 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -17,12 +17,12 @@ #include "serial-tracer.h" #endif -#define GRANULE_SIZE 8 -#define GRANULE_SIZE_LOG_2 3 +#define GRANULE_SIZE 16 +#define GRANULE_SIZE_LOG_2 4 #define MEDIUM_OBJECT_THRESHOLD 256 -#define MEDIUM_OBJECT_GRANULE_THRESHOLD 32 +#define MEDIUM_OBJECT_GRANULE_THRESHOLD 16 #define LARGE_OBJECT_THRESHOLD 8192 -#define LARGE_OBJECT_GRANULE_THRESHOLD 1024 +#define LARGE_OBJECT_GRANULE_THRESHOLD 512 STATIC_ASSERT_EQ(GRANULE_SIZE, 1 << GRANULE_SIZE_LOG_2); STATIC_ASSERT_EQ(MEDIUM_OBJECT_THRESHOLD, @@ -30,42 +30,6 @@ STATIC_ASSERT_EQ(MEDIUM_OBJECT_THRESHOLD, STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, LARGE_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); -// There are small object pages for allocations of these sizes. -#define FOR_EACH_SMALL_OBJECT_GRANULES(M) \ - M(1) M(2) M(3) M(4) M(5) M(6) M(8) M(10) M(16) M(32) - -enum small_object_size { -#define SMALL_OBJECT_GRANULE_SIZE(i) SMALL_OBJECT_##i, - FOR_EACH_SMALL_OBJECT_GRANULES(SMALL_OBJECT_GRANULE_SIZE) -#undef SMALL_OBJECT_GRANULE_SIZE - SMALL_OBJECT_SIZES, - NOT_SMALL_OBJECT = SMALL_OBJECT_SIZES -}; - -static const uint8_t small_object_granule_sizes[] = -{ -#define SMALL_OBJECT_GRANULE_SIZE(i) i, - FOR_EACH_SMALL_OBJECT_GRANULES(SMALL_OBJECT_GRANULE_SIZE) -#undef SMALL_OBJECT_GRANULE_SIZE -}; - -static const enum small_object_size small_object_sizes_for_granules[MEDIUM_OBJECT_GRANULE_THRESHOLD + 2] = { - SMALL_OBJECT_1, SMALL_OBJECT_1, SMALL_OBJECT_2, SMALL_OBJECT_3, - SMALL_OBJECT_4, SMALL_OBJECT_5, SMALL_OBJECT_6, SMALL_OBJECT_8, - SMALL_OBJECT_8, SMALL_OBJECT_10, SMALL_OBJECT_10, SMALL_OBJECT_16, - SMALL_OBJECT_16, SMALL_OBJECT_16, SMALL_OBJECT_16, SMALL_OBJECT_16, - SMALL_OBJECT_16, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, - SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, - SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, - SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, SMALL_OBJECT_32, - SMALL_OBJECT_32, NOT_SMALL_OBJECT -}; - -static enum small_object_size granules_to_small_object_size(unsigned granules) { - ASSERT(granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD); - return small_object_sizes_for_granules[granules]; -} - static uintptr_t align_up(uintptr_t addr, size_t align) { return (addr + align - 1) & ~(align-1); } @@ -89,7 +53,7 @@ struct gcobj_free { }; struct gcobj_freelists { - struct gcobj_free *by_size[SMALL_OBJECT_SIZES]; + struct gcobj_free *by_size[MEDIUM_OBJECT_GRANULE_THRESHOLD]; }; // Objects larger than MEDIUM_OBJECT_GRANULE_THRESHOLD. 
@@ -170,9 +134,9 @@ static inline struct heap* mutator_heap(struct mutator *mutator) { static inline struct gcobj_free** get_small_object_freelist(struct gcobj_freelists *freelists, - enum small_object_size kind) { - ASSERT(kind < SMALL_OBJECT_SIZES); - return &freelists->by_size[kind]; + size_t granules) { + ASSERT(granules > 0 && granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD); + return &freelists->by_size[granules - 1]; } #define GC_HEADER uintptr_t _gc_header @@ -233,7 +197,7 @@ static inline void trace_one(struct gcobj *obj, void *mark_data) { } static void clear_small_freelists(struct gcobj_freelists *small) { - for (int i = 0; i < SMALL_OBJECT_SIZES; i++) + for (int i = 0; i < MEDIUM_OBJECT_GRANULE_THRESHOLD; i++) small->by_size[i] = NULL; } static void clear_mutator_freelists(struct mutator *mut) { @@ -490,19 +454,18 @@ static void push_free(struct gcobj_free **loc, struct gcobj_free *obj) { } static void push_small(struct gcobj_freelists *small_objects, void *region, - enum small_object_size kind, size_t region_granules) { + size_t granules, size_t region_granules) { uintptr_t addr = (uintptr_t) region; - while (region_granules) { - size_t granules = small_object_granule_sizes[kind]; - struct gcobj_free **loc = get_small_object_freelist(small_objects, kind); - while (granules <= region_granules) { - push_free(loc, (struct gcobj_free*) addr); - region_granules -= granules; - addr += granules * GRANULE_SIZE; - } - // Fit any remaining granules into smaller freelists. - kind--; + struct gcobj_free **loc = get_small_object_freelist(small_objects, granules); + while (granules <= region_granules) { + push_free(loc, (struct gcobj_free*) addr); + region_granules -= granules; + addr += granules * GRANULE_SIZE; } + // Fit any remaining granules into smaller freelist. + if (region_granules) + push_free(get_small_object_freelist(small_objects, region_granules), + (struct gcobj_free*) addr); } static void push_medium(struct mark_space *space, void *region, size_t granules) { @@ -512,30 +475,15 @@ static void push_medium(struct mark_space *space, void *region, size_t granules) space->medium_objects = medium; } -static void reclaim_small(struct gcobj_freelists *small_objects, - enum small_object_size kind, - void *region, size_t region_granules) { - ASSERT(kind != NOT_SMALL_OBJECT); - struct gcobj_free **loc = get_small_object_freelist(small_objects, kind); - uintptr_t addr = (uintptr_t) region; - size_t object_granules = small_object_granule_sizes[kind]; - while (region_granules >= object_granules) { - push_free(loc, (struct gcobj_free*) addr); - region_granules -= object_granules; - addr += object_granules * GRANULE_SIZE; - } - // Any leftover granules are wasted! 
-} - static void reclaim(struct mark_space *space, struct gcobj_freelists *small_objects, - enum small_object_size kind, + size_t small_object_granules, void *region, size_t region_granules) { - if (kind != NOT_SMALL_OBJECT) - reclaim_small(small_objects, kind, region, region_granules); - else if (region_granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) - push_small(small_objects, region, SMALL_OBJECT_SIZES - 1, region_granules); + if (small_object_granules == 0) + small_object_granules = region_granules; + if (small_object_granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) + push_small(small_objects, region, small_object_granules, region_granules); else push_medium(space, region, region_granules); } @@ -557,8 +505,7 @@ static void split_medium_object(struct mark_space *space, return; char *tail = ((char*)medium) + granules * GRANULE_SIZE; - reclaim(space, small_objects, NOT_SMALL_OBJECT, tail, - medium_granules - granules); + reclaim(space, small_objects, 0, tail, medium_granules - granules); } static void unlink_medium_object(struct gcobj_free_medium **prev, @@ -578,10 +525,7 @@ static size_t live_object_granules(struct gcobj *obj) { default: abort (); } - size_t granules = size_to_granules(bytes); - if (granules > MEDIUM_OBJECT_GRANULE_THRESHOLD) - return granules; - return small_object_granule_sizes[granules_to_small_object_size(granules)]; + return size_to_granules(bytes); } static size_t next_mark(const uint8_t *mark, size_t limit) { @@ -610,7 +554,7 @@ static size_t next_mark(const uint8_t *mark, size_t limit) { // heap to sweep, or 0 if we reached the end. static int sweep(struct mark_space *space, struct gcobj_freelists *small_objects, - enum small_object_size kind, + size_t small_object_granules, size_t medium_object_granules) { // Sweep until we have reclaimed 32 kB of free memory, or we reach the // end of the heap. @@ -625,7 +569,7 @@ static int sweep(struct mark_space *space, uint8_t* mark = mark_byte(space, (struct gcobj*)sweep); size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; if (limit_granules > to_reclaim) { - if (kind == NOT_SMALL_OBJECT) { + if (small_object_granules == 0) { if (medium_object_granules < limit_granules) limit_granules = medium_object_granules; } else { @@ -636,7 +580,8 @@ static int sweep(struct mark_space *space, if (free_granules) { size_t free_bytes = free_granules * GRANULE_SIZE; clear_memory(sweep + GRANULE_SIZE, free_bytes - GRANULE_SIZE); - reclaim(space, small_objects, kind, (void*)sweep, free_granules); + reclaim(space, small_objects, small_object_granules, (void*)sweep, + free_granules); sweep += free_bytes; to_reclaim -= free_granules; @@ -714,7 +659,7 @@ static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, } } already_scanned = space->medium_objects; - } while (sweep(space, small_objects, NOT_SMALL_OBJECT, granules)); + } while (sweep(space, small_objects, 0, granules)); // No medium object, and we swept across the whole heap. Collect. if (swept_from_beginning) { @@ -728,21 +673,20 @@ static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, } static int fill_small_from_local(struct gcobj_freelists *small_objects, - enum small_object_size kind) { + size_t granules) { // Precondition: the freelist for KIND is already empty. - ASSERT(!*get_small_object_freelist(small_objects, kind)); + ASSERT(!*get_small_object_freelist(small_objects, granules)); // See if there are small objects already on the freelists // that can be split. 
- for (enum small_object_size next_kind = kind + 1; - next_kind < SMALL_OBJECT_SIZES; - next_kind++) { + for (size_t next_size = granules + 1; + next_size <= MEDIUM_OBJECT_GRANULE_THRESHOLD; + next_size++) { struct gcobj_free **loc = get_small_object_freelist(small_objects, - next_kind); + next_size); if (*loc) { struct gcobj_free *ret = *loc; *loc = ret->next; - push_small(small_objects, ret, kind, - small_object_granule_sizes[next_kind]); + push_small(small_objects, ret, granules, next_size); return 1; } } @@ -751,8 +695,8 @@ static int fill_small_from_local(struct gcobj_freelists *small_objects, // with heap lock static int fill_small_from_medium(struct mark_space *space, - struct gcobj_freelists *small_objects, - enum small_object_size kind) { + struct gcobj_freelists *small_objects, + size_t granules) { // If there is a medium object, take and split it. struct gcobj_free_medium *medium = space->medium_objects; if (!medium) @@ -761,18 +705,18 @@ static int fill_small_from_medium(struct mark_space *space, unlink_medium_object(&space->medium_objects, medium); ASSERT(medium->granules >= MEDIUM_OBJECT_GRANULE_THRESHOLD); split_medium_object(space, small_objects, medium, - MEDIUM_OBJECT_GRANULE_THRESHOLD); - push_small(small_objects, medium, kind, MEDIUM_OBJECT_GRANULE_THRESHOLD); + MEDIUM_OBJECT_GRANULE_THRESHOLD); + push_small(small_objects, medium, granules, MEDIUM_OBJECT_GRANULE_THRESHOLD); return 1; } static int fill_small_from_global_small(struct mark_space *space, struct gcobj_freelists *small_objects, - enum small_object_size kind) { + size_t granules) { struct gcobj_free **src = - get_small_object_freelist(&space->small_objects, kind); + get_small_object_freelist(&space->small_objects, granules); if (*src) { - struct gcobj_free **dst = get_small_object_freelist(small_objects, kind); + struct gcobj_free **dst = get_small_object_freelist(small_objects, granules); ASSERT(!*dst); *dst = *src; *src = NULL; @@ -782,9 +726,9 @@ static int fill_small_from_global_small(struct mark_space *space, } static void fill_small_from_global(struct mutator *mut, - enum small_object_size kind) NEVER_INLINE; + size_t granules) NEVER_INLINE; static void fill_small_from_global(struct mutator *mut, - enum small_object_size kind) { + size_t granules) { struct gcobj_freelists *small_objects = &mut->small_objects; struct heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); @@ -798,14 +742,14 @@ static void fill_small_from_global(struct mutator *mut, int swept_from_beginning = 0; while (1) { - if (fill_small_from_global_small(space, small_objects, kind)) + if (fill_small_from_global_small(space, small_objects, granules)) break; - if (fill_small_from_medium(space, small_objects, kind)) + if (fill_small_from_medium(space, small_objects, granules)) break; // By default, pull in 16 kB of data at a time. - if (!sweep(space, small_objects, kind, 0)) { + if (!sweep(space, small_objects, granules, 0)) { if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); abort(); @@ -815,32 +759,32 @@ static void fill_small_from_global(struct mutator *mut, } } - if (*get_small_object_freelist(small_objects, kind)) + if (*get_small_object_freelist(small_objects, granules)) break; } heap_unlock(heap); } -static void fill_small(struct mutator *mut, enum small_object_size kind) { +static void fill_small(struct mutator *mut, size_t granules) { // See if there are small objects already on the local freelists that // can be split. 
- if (fill_small_from_local(&mut->small_objects, kind)) + if (fill_small_from_local(&mut->small_objects, granules)) return; - fill_small_from_global(mut, kind); + fill_small_from_global(mut, granules); } -static inline void* allocate_small(struct mutator *mut, - enum alloc_kind alloc_kind, - enum small_object_size small_kind) { +static inline void* allocate_small(struct mutator *mut, enum alloc_kind kind, + size_t granules) { + ASSERT(granules > 0); // allocating 0 granules would be silly struct gcobj_free **loc = - get_small_object_freelist(&mut->small_objects, small_kind); + get_small_object_freelist(&mut->small_objects, granules); if (!*loc) - fill_small(mut, small_kind); + fill_small(mut, granules); struct gcobj_free *ret = *loc; *loc = ret->next; struct gcobj *obj = (struct gcobj *)ret; - obj->tag = tag_live(alloc_kind); + obj->tag = tag_live(kind); return obj; } @@ -848,7 +792,7 @@ static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t size) { size_t granules = size_to_granules(size); if (granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) - return allocate_small(mut, kind, granules_to_small_object_size(granules)); + return allocate_small(mut, kind, granules); if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) return allocate_medium(mut, kind, granules); return allocate_large(mut, kind, granules); @@ -894,22 +838,13 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { space->heap_base = ((uintptr_t) mem) + overhead; space->heap_size = size - overhead; space->sweep = space->heap_base + space->heap_size; - reclaim(space, NULL, NOT_SMALL_OBJECT, (void*)space->heap_base, + reclaim(space, NULL, 0, (void*)space->heap_base, size_to_granules(space->heap_size)); return 1; } static int initialize_gc(size_t size, struct heap **heap, struct mutator **mut) { -#define SMALL_OBJECT_GRANULE_SIZE(i) \ - ASSERT_EQ(SMALL_OBJECT_##i, small_object_sizes_for_granules[i]); \ - ASSERT_EQ(SMALL_OBJECT_##i + 1, small_object_sizes_for_granules[i+1]); - FOR_EACH_SMALL_OBJECT_GRANULES(SMALL_OBJECT_GRANULE_SIZE); -#undef SMALL_OBJECT_GRANULE_SIZE - - ASSERT_EQ(SMALL_OBJECT_SIZES - 1, - small_object_sizes_for_granules[MEDIUM_OBJECT_GRANULE_THRESHOLD]); - *heap = calloc(1, sizeof(struct heap)); if (!*heap) abort(); From 7fc2fdbbf7b787cc51f49d7a13c1714f43ae53ec Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 27 Apr 2022 22:31:09 +0200 Subject: [PATCH 071/403] Use block-structured heap for mark-sweep There are 4 MB aligned slabs, divided into 64 KB pages. (On 32-bit this will be 2 MB ad 32 kB). Then you can get a mark byte per granule by slab plus granule offset. The unused slack that would correspond to mark bytes for the blocks used *by* the mark bytes is used for other purposes: remembered sets (not yet used), block summaries (not used), and a slab header (likewise). 
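To make the "unused slack" accounting concrete, here is the layout arithmetic
spelled out for the 64-bit constants in the diff below (4 MB slab, 64 kB block,
16-byte granule).  This is only a checking sketch with local variable names;
the collector expresses the same computation with the SLAB_SIZE / BLOCK_SIZE
macro family and verifies it with STATIC_ASSERT_EQ.

  #include <assert.h>
  #include <stdio.h>

  int main(void) {
    unsigned slab = 4 * 1024 * 1024, block = 64 * 1024, granule = 16;
    unsigned blocks_per_slab = slab / block;                        /* 64 */
    unsigned mark_bytes_per_block = block / granule;                /* 4096 */
    unsigned meta_blocks =
      mark_bytes_per_block * blocks_per_slab / block;               /* 4 */
    unsigned data_blocks = blocks_per_slab - meta_blocks;           /* 60 */
    /* Slack: mark bytes that would describe the 4 metadata blocks. */
    unsigned slack = meta_blocks * mark_bytes_per_block;            /* 16384 */
    unsigned remset_bytes_per_block = slack / blocks_per_slab;      /* 256 */
    unsigned remset_slack = remset_bytes_per_block * meta_blocks;   /* 1024 */
    unsigned summary_bytes_per_block = remset_slack / blocks_per_slab; /* 16 */
    unsigned header_bytes = summary_bytes_per_block * meta_blocks;  /* 64 */
    unsigned total = header_bytes
      + summary_bytes_per_block * data_blocks   /* block summaries */
      + remset_bytes_per_block * data_blocks    /* remembered sets */
      + mark_bytes_per_block * data_blocks      /* mark bytes */
      + data_blocks * block;                    /* the blocks themselves */
    assert(total == slab);  /* metadata tiles the 4 MB slab exactly */
    printf("%u data blocks, %u metadata blocks, %u-byte slab header\n",
           data_blocks, meta_blocks, header_bytes);
    return 0;
  }

With this layout the mark byte for an object is found as object_metadata_byte
does in the diff: take the slab base (the address with its low 22 bits
cleared) and add the object's granule index within the slab.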
--- mark-sweep.h | 167 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 133 insertions(+), 34 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 47cf6c8aa..c26c2049c 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -30,6 +30,92 @@ STATIC_ASSERT_EQ(MEDIUM_OBJECT_THRESHOLD, STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, LARGE_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); +#define SLAB_SIZE (4 * 1024 * 1024) +#define BLOCK_SIZE (64 * 1024) +#define METADATA_BYTES_PER_BLOCK (BLOCK_SIZE / GRANULE_SIZE) +#define BLOCKS_PER_SLAB (SLAB_SIZE / BLOCK_SIZE) +#define META_BLOCKS_PER_SLAB (METADATA_BYTES_PER_BLOCK * BLOCKS_PER_SLAB / BLOCK_SIZE) +#define NONMETA_BLOCKS_PER_SLAB (BLOCKS_PER_SLAB - META_BLOCKS_PER_SLAB) +#define METADATA_BYTES_PER_SLAB (NONMETA_BLOCKS_PER_SLAB * METADATA_BYTES_PER_BLOCK) +#define SLACK_METADATA_BYTES_PER_SLAB (META_BLOCKS_PER_SLAB * METADATA_BYTES_PER_BLOCK) +#define REMSET_BYTES_PER_BLOCK (SLACK_METADATA_BYTES_PER_SLAB / BLOCKS_PER_SLAB) +#define REMSET_BYTES_PER_SLAB (REMSET_BYTES_PER_BLOCK * NONMETA_BLOCKS_PER_SLAB) +#define SLACK_REMSET_BYTES_PER_SLAB (REMSET_BYTES_PER_BLOCK * META_BLOCKS_PER_SLAB) +#define SUMMARY_BYTES_PER_BLOCK (SLACK_REMSET_BYTES_PER_SLAB / BLOCKS_PER_SLAB) +#define SUMMARY_BYTES_PER_SLAB (SUMMARY_BYTES_PER_BLOCK * NONMETA_BLOCKS_PER_SLAB) +#define SLACK_SUMMARY_BYTES_PER_SLAB (SUMMARY_BYTES_PER_BLOCK * META_BLOCKS_PER_SLAB) +#define HEADER_BYTES_PER_SLAB SLACK_SUMMARY_BYTES_PER_SLAB + +struct slab; + +struct slab_header { + union { + struct { + struct slab *next; + struct slab *prev; + }; + uint8_t padding[HEADER_BYTES_PER_SLAB]; + }; +}; +STATIC_ASSERT_EQ(sizeof(struct slab_header), HEADER_BYTES_PER_SLAB); + +struct block_summary { + union { + struct { + uint16_t wasted_granules; + uint16_t wasted_spans; + uint8_t out_for_thread; + uint8_t has_pin; + uint8_t paged_out; + }; + uint8_t padding[SUMMARY_BYTES_PER_BLOCK]; + }; +}; +STATIC_ASSERT_EQ(sizeof(struct block_summary), SUMMARY_BYTES_PER_BLOCK); + +struct block { + char data[BLOCK_SIZE]; +}; + +struct slab { + struct slab_header header; + struct block_summary summaries[NONMETA_BLOCKS_PER_SLAB]; + uint8_t remsets[REMSET_BYTES_PER_SLAB]; + uint8_t metadata[METADATA_BYTES_PER_SLAB]; + struct block blocks[NONMETA_BLOCKS_PER_SLAB]; +}; +STATIC_ASSERT_EQ(sizeof(struct slab), SLAB_SIZE); + +static struct slab *object_slab(void *obj) { + uintptr_t addr = (uintptr_t) obj; + uintptr_t base = addr & ~(SLAB_SIZE - 1); + return (struct slab*) base; +} + +static uint8_t *object_metadata_byte(void *obj) { + uintptr_t addr = (uintptr_t) obj; + uintptr_t base = addr & ~(SLAB_SIZE - 1); + uintptr_t granule = (addr & (SLAB_SIZE - 1)) >> GRANULE_SIZE_LOG_2; + return (uint8_t*) (base + granule); +} + +#define GRANULES_PER_BLOCK (BLOCK_SIZE / GRANULE_SIZE) +#define GRANULES_PER_REMSET_BYTE (GRANULES_PER_BLOCK / REMSET_BYTES_PER_BLOCK) +static uint8_t *object_remset_byte(void *obj) { + uintptr_t addr = (uintptr_t) obj; + uintptr_t base = addr & ~(SLAB_SIZE - 1); + uintptr_t granule = (addr & (SLAB_SIZE - 1)) >> GRANULE_SIZE_LOG_2; + uintptr_t remset_byte = granule / GRANULES_PER_REMSET_BYTE; + return (uint8_t*) (base + remset_byte); +} + +static struct block_summary* object_block_summary(void *obj) { + uintptr_t addr = (uintptr_t) obj; + uintptr_t base = addr & ~(SLAB_SIZE - 1); + uintptr_t block = (addr & (SLAB_SIZE - 1)) / BLOCK_SIZE; + return (struct block_summary*) (base + block * sizeof(struct block_summary)); +} + static uintptr_t align_up(uintptr_t addr, size_t align) { return (addr + align 
- 1) & ~(align-1); } @@ -76,13 +162,12 @@ struct mark_space { struct gcobj_freelists small_objects; // Unordered list of medium objects. struct gcobj_free_medium *medium_objects; - uintptr_t base; - uint8_t *mark_bytes; - uintptr_t heap_base; + uintptr_t low_addr; + size_t extent; size_t heap_size; uintptr_t sweep; - void *mem; - size_t mem_size; + struct slab *slabs; + size_t nslabs; }; struct heap { @@ -148,10 +233,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) { static void collect(struct mutator *mut) NEVER_INLINE; static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { - ASSERT(space->heap_base <= (uintptr_t) obj); - ASSERT((uintptr_t) obj < space->heap_base + space->heap_size); - uintptr_t granule = (((uintptr_t) obj) - space->heap_base) / GRANULE_SIZE; - return &space->mark_bytes[granule]; + return object_metadata_byte(obj); } static inline int mark_space_trace_object(struct mark_space *space, @@ -166,7 +248,7 @@ static inline int mark_space_trace_object(struct mark_space *space, static inline int mark_space_contains(struct mark_space *space, struct gcobj *obj) { uintptr_t addr = (uintptr_t)obj; - return addr - space->heap_base < space->heap_size; + return addr - space->low_addr < space->extent; } static inline int large_object_space_trace_object(struct large_object_space *space, @@ -421,7 +503,7 @@ static inline void maybe_pause_mutator_for_collection(struct mutator *mut) { } static void reset_sweeper(struct mark_space *space) { - space->sweep = space->heap_base; + space->sweep = (uintptr_t) &space->slabs[0].blocks; } static void collect(struct mutator *mut) { @@ -560,10 +642,15 @@ static int sweep(struct mark_space *space, // end of the heap. ssize_t to_reclaim = 32 * 1024 / GRANULE_SIZE; uintptr_t sweep = space->sweep; - uintptr_t limit = space->heap_base + space->heap_size; + uintptr_t limit = align_up(sweep, SLAB_SIZE); - if (sweep == limit) - return 0; + if (sweep == limit) { + if (sweep == space->low_addr + space->extent) + return 0; + // Assumes contiguous slabs. To relax later. 
+ sweep += META_BLOCKS_PER_SLAB * BLOCK_SIZE; + limit += SLAB_SIZE; + } while (to_reclaim > 0 && sweep < limit) { uint8_t* mark = mark_byte(space, (struct gcobj*)sweep); @@ -813,33 +900,45 @@ static inline void* get_field(void **addr) { return *addr; } -static int mark_space_init(struct mark_space *space, struct heap *heap) { - size_t size = align_up(heap->size, getpagesize()); +static struct slab* allocate_slabs(size_t nslabs) { + size_t size = nslabs * SLAB_SIZE; + size_t extent = size + SLAB_SIZE; - void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, + char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { perror("mmap failed"); - return 0; + return NULL; } - space->mem = mem; - space->mem_size = size; - // If there is 1 mark byte per granule, and SIZE bytes available for - // HEAP_SIZE + MARK_BYTES, then: - // - // size = (granule_size + 1) / granule_size * heap_size - // mark_bytes = 1/granule_size * heap_size - // mark_bytes = ceil(heap_size / (granule_size + 1)) - space->mark_bytes = (uint8_t *) mem; - size_t mark_bytes_size = (size + GRANULE_SIZE) / (GRANULE_SIZE + 1); - size_t overhead = align_up(mark_bytes_size, GRANULE_SIZE); + uintptr_t base = (uintptr_t) mem; + uintptr_t end = base + extent; + uintptr_t aligned_base = align_up(base, SLAB_SIZE); + uintptr_t aligned_end = aligned_base + size; - space->heap_base = ((uintptr_t) mem) + overhead; - space->heap_size = size - overhead; - space->sweep = space->heap_base + space->heap_size; - reclaim(space, NULL, 0, (void*)space->heap_base, - size_to_granules(space->heap_size)); + if (aligned_base - base) + munmap((void*)base, aligned_base - base); + if (end - aligned_end) + munmap((void*)aligned_end, end - aligned_end); + + return (struct slab*) aligned_base; +} + +static int mark_space_init(struct mark_space *space, struct heap *heap) { + size_t size = align_up(heap->size, SLAB_SIZE); + size_t nslabs = size / SLAB_SIZE; + struct slab *slabs = allocate_slabs(nslabs); + if (!slabs) + return 0; + + space->slabs = slabs; + space->nslabs = nslabs; + space->low_addr = (uintptr_t) slabs; + space->extent = size; + space->sweep = space->low_addr + space->extent; + for (size_t i = 0; i < nslabs; i++) + reclaim(space, NULL, 0, &slabs[i].blocks, + NONMETA_BLOCKS_PER_SLAB * GRANULES_PER_BLOCK); return 1; } From 83bf1d8cf3acd43b4ade361d097d7e88a11a36fd Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 1 May 2022 14:45:25 +0200 Subject: [PATCH 072/403] Fix bug ensuring zeroed memory If the granule size is bigger than a pointer, we were leaving the first granule uncleared. --- mark-sweep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mark-sweep.h b/mark-sweep.h index c26c2049c..883eb5c84 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -666,7 +666,7 @@ static int sweep(struct mark_space *space, size_t free_granules = next_mark(mark, limit_granules); if (free_granules) { size_t free_bytes = free_granules * GRANULE_SIZE; - clear_memory(sweep + GRANULE_SIZE, free_bytes - GRANULE_SIZE); + clear_memory(sweep + sizeof(uintptr_t), free_bytes - sizeof(uintptr_t)); reclaim(space, small_objects, small_object_granules, (void*)sweep, free_granules); sweep += free_bytes; From f97906421eef769113be67a6c728f21cb4347de2 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 1 May 2022 14:46:36 +0200 Subject: [PATCH 073/403] Sweep by block, not by slab This lets mutators run in parallel. 
There is a bug currently however with a race between stopping mutators marking their roots and other mutators still sweeping. Will fix in a followup. --- mark-sweep.h | 262 ++++++++++++++++++++++++--------------------------- 1 file changed, 124 insertions(+), 138 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 883eb5c84..7079b5aeb 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -138,10 +138,6 @@ struct gcobj_free { struct gcobj_free *next; }; -struct gcobj_freelists { - struct gcobj_free *by_size[MEDIUM_OBJECT_GRANULE_THRESHOLD]; -}; - // Objects larger than MEDIUM_OBJECT_GRANULE_THRESHOLD. struct gcobj_free_medium { struct gcobj_free_medium *next; @@ -159,13 +155,10 @@ struct gcobj { }; struct mark_space { - struct gcobj_freelists small_objects; - // Unordered list of medium objects. - struct gcobj_free_medium *medium_objects; uintptr_t low_addr; size_t extent; size_t heap_size; - uintptr_t sweep; + uintptr_t next_block; struct slab *slabs; size_t nslabs; }; @@ -197,7 +190,10 @@ struct mutator_mark_buf { struct mutator { // Segregated freelists of small objects. - struct gcobj_freelists small_objects; + struct gcobj_free *small_objects[MEDIUM_OBJECT_GRANULE_THRESHOLD]; + // Unordered list of medium objects. + struct gcobj_free_medium *medium_objects; + uintptr_t sweep; struct heap *heap; struct handle *roots; struct mutator_mark_buf mark_buf; @@ -218,10 +214,9 @@ static inline struct heap* mutator_heap(struct mutator *mutator) { } static inline struct gcobj_free** -get_small_object_freelist(struct gcobj_freelists *freelists, - size_t granules) { +get_small_object_freelist(struct mutator *mut, size_t granules) { ASSERT(granules > 0 && granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD); - return &freelists->by_size[granules - 1]; + return &mut->small_objects[granules - 1]; } #define GC_HEADER uintptr_t _gc_header @@ -278,16 +273,10 @@ static inline void trace_one(struct gcobj *obj, void *mark_data) { } } -static void clear_small_freelists(struct gcobj_freelists *small) { - for (int i = 0; i < MEDIUM_OBJECT_GRANULE_THRESHOLD; i++) - small->by_size[i] = NULL; -} static void clear_mutator_freelists(struct mutator *mut) { - clear_small_freelists(&mut->small_objects); -} -static void clear_global_freelists(struct mark_space *space) { - clear_small_freelists(&space->small_objects); - space->medium_objects = NULL; + for (int i = 0; i < MEDIUM_OBJECT_GRANULE_THRESHOLD; i++) + mut->small_objects[i] = NULL; + mut->medium_objects = NULL; } static int heap_has_multiple_mutators(struct heap *heap) { @@ -435,9 +424,13 @@ static void wait_for_mutators_to_stop(struct heap *heap) { pthread_cond_wait(&heap->collector_cond, &heap->lock); } +static void finish_sweeping(struct mutator *mut); + static void mark_inactive_mutators(struct heap *heap) { - for (struct mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) + for (struct mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) { + finish_sweeping(mut); mark_controlling_mutator_roots(mut); + } } static void mark_global_roots(struct heap *heap) { @@ -480,6 +473,7 @@ static void pause_mutator_for_collection_with_lock(struct mutator *mut) NEVER_IN static void pause_mutator_for_collection_with_lock(struct mutator *mut) { struct heap *heap = mutator_heap(mut); ASSERT(mutators_are_stopping(heap)); + finish_sweeping(mut); mark_controlling_mutator_roots(mut); pause_mutator_for_collection(heap); clear_mutator_freelists(mut); @@ -489,6 +483,7 @@ static void pause_mutator_for_collection_without_lock(struct mutator *mut) NEVER static void 
pause_mutator_for_collection_without_lock(struct mutator *mut) { struct heap *heap = mutator_heap(mut); ASSERT(mutators_are_stopping(heap)); + finish_sweeping(mut); mark_stopping_mutator_roots(mut); heap_lock(heap); pause_mutator_for_collection(heap); @@ -503,7 +498,7 @@ static inline void maybe_pause_mutator_for_collection(struct mutator *mut) { } static void reset_sweeper(struct mark_space *space) { - space->sweep = (uintptr_t) &space->slabs[0].blocks; + space->next_block = (uintptr_t) &space->slabs[0].blocks; } static void collect(struct mutator *mut) { @@ -520,7 +515,6 @@ static void collect(struct mutator *mut) { mark_global_roots(heap); tracer_trace(heap); tracer_release(heap); - clear_global_freelists(space); reset_sweeper(space); heap->count++; large_object_space_finish_gc(lospace); @@ -535,10 +529,10 @@ static void push_free(struct gcobj_free **loc, struct gcobj_free *obj) { *loc = obj; } -static void push_small(struct gcobj_freelists *small_objects, void *region, +static void push_small(struct mutator *mut, void *region, size_t granules, size_t region_granules) { uintptr_t addr = (uintptr_t) region; - struct gcobj_free **loc = get_small_object_freelist(small_objects, granules); + struct gcobj_free **loc = get_small_object_freelist(mut, granules); while (granules <= region_granules) { push_free(loc, (struct gcobj_free*) addr); region_granules -= granules; @@ -546,34 +540,32 @@ static void push_small(struct gcobj_freelists *small_objects, void *region, } // Fit any remaining granules into smaller freelist. if (region_granules) - push_free(get_small_object_freelist(small_objects, region_granules), + push_free(get_small_object_freelist(mut, region_granules), (struct gcobj_free*) addr); } -static void push_medium(struct mark_space *space, void *region, size_t granules) { +static void push_medium(struct mutator *mut, void *region, size_t granules) { struct gcobj_free_medium *medium = region; - medium->next = space->medium_objects; + medium->next = mut->medium_objects; medium->granules = granules; - space->medium_objects = medium; + mut->medium_objects = medium; } -static void reclaim(struct mark_space *space, - struct gcobj_freelists *small_objects, +static void reclaim(struct mutator *mut, size_t small_object_granules, void *region, size_t region_granules) { if (small_object_granules == 0) small_object_granules = region_granules; if (small_object_granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) - push_small(small_objects, region, small_object_granules, region_granules); + push_small(mut, region, small_object_granules, region_granules); else - push_medium(space, region, region_granules); + push_medium(mut, region, region_granules); } -static void split_medium_object(struct mark_space *space, - struct gcobj_freelists *small_objects, - struct gcobj_free_medium *medium, - size_t granules) { +static void split_medium_object(struct mutator *mut, + struct gcobj_free_medium *medium, + size_t granules) { size_t medium_granules = medium->granules; ASSERT(medium_granules >= granules); ASSERT(granules >= MEDIUM_OBJECT_GRANULE_THRESHOLD); @@ -587,11 +579,11 @@ static void split_medium_object(struct mark_space *space, return; char *tail = ((char*)medium) + granules * GRANULE_SIZE; - reclaim(space, small_objects, 0, tail, medium_granules - granules); + reclaim(mut, 0, tail, medium_granules - granules); } static void unlink_medium_object(struct gcobj_free_medium **prev, - struct gcobj_free_medium *medium) { + struct gcobj_free_medium *medium) { *prev = medium->next; } @@ -632,28 +624,51 @@ static size_t 
next_mark(const uint8_t *mark, size_t limit) { return limit; } +static uintptr_t mark_space_next_block(struct mark_space *space) { + uintptr_t block = atomic_load_explicit(&space->next_block, + memory_order_acquire); + uintptr_t next_block; + do { + if (block == 0) + return 0; + + next_block = block + BLOCK_SIZE; + if (next_block % SLAB_SIZE == 0) { + uintptr_t hi_addr = space->low_addr + space->extent; + if (next_block == hi_addr) + next_block = 0; + else + next_block += META_BLOCKS_PER_SLAB * BLOCK_SIZE; + } + } while (!atomic_compare_exchange_weak(&space->next_block, &block, + next_block)); + return block; +} + // Sweep some heap to reclaim free space. Return 1 if there is more // heap to sweep, or 0 if we reached the end. -static int sweep(struct mark_space *space, - struct gcobj_freelists *small_objects, +static int sweep(struct mutator *mut, size_t small_object_granules, size_t medium_object_granules) { - // Sweep until we have reclaimed 32 kB of free memory, or we reach the - // end of the heap. - ssize_t to_reclaim = 32 * 1024 / GRANULE_SIZE; - uintptr_t sweep = space->sweep; - uintptr_t limit = align_up(sweep, SLAB_SIZE); + // Sweep until we have reclaimed memory corresponding to twice the + // size of the smallest medium object, or we reach the end of the + // block. + ssize_t to_reclaim = 2 * MEDIUM_OBJECT_GRANULE_THRESHOLD; + uintptr_t sweep = mut->sweep; + uintptr_t limit = align_up(sweep, BLOCK_SIZE); if (sweep == limit) { - if (sweep == space->low_addr + space->extent) + sweep = mark_space_next_block(heap_mark_space(mutator_heap(mut))); + if (sweep == 0) { + mut->sweep = 0; return 0; - // Assumes contiguous slabs. To relax later. - sweep += META_BLOCKS_PER_SLAB * BLOCK_SIZE; - limit += SLAB_SIZE; + } + limit = sweep + BLOCK_SIZE; } while (to_reclaim > 0 && sweep < limit) { - uint8_t* mark = mark_byte(space, (struct gcobj*)sweep); + ASSERT((sweep & (GRANULE_SIZE - 1)) == 0); + uint8_t* mark = object_metadata_byte((struct gcobj*)sweep); size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; if (limit_granules > to_reclaim) { if (small_object_granules == 0) { @@ -665,10 +680,10 @@ static int sweep(struct mark_space *space, } size_t free_granules = next_mark(mark, limit_granules); if (free_granules) { + ASSERT(free_granules <= limit_granules); size_t free_bytes = free_granules * GRANULE_SIZE; clear_memory(sweep + sizeof(uintptr_t), free_bytes - sizeof(uintptr_t)); - reclaim(space, small_objects, small_object_granules, (void*)sweep, - free_granules); + reclaim(mut, small_object_granules, (void*)sweep, free_granules); sweep += free_bytes; to_reclaim -= free_granules; @@ -682,10 +697,23 @@ static int sweep(struct mark_space *space, sweep += live_object_granules((struct gcobj *)sweep) * GRANULE_SIZE; } - space->sweep = sweep; + mut->sweep = sweep; return 1; } +// Another thread is triggering GC. Before we stop, finish clearing the +// mark bytes for the mutator's block, and release the block. 
+static void finish_sweeping(struct mutator *mut) { + uintptr_t sweep = mut->sweep; + if (sweep) { + uintptr_t limit = align_up(sweep, BLOCK_SIZE); + uint8_t* mark = object_metadata_byte((struct gcobj*)sweep); + size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; + memset(mark, 0, limit_granules); + mut->sweep = 0; + } +} + static void* allocate_large(struct mutator *mut, enum alloc_kind kind, size_t granules) { struct heap *heap = mutator_heap(mut); @@ -693,6 +721,9 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, size_t size = granules * GRANULE_SIZE; size_t npages = large_object_space_npages(space, size); + + heap_lock(heap); + if (!heap_steal_pages(heap, npages)) { collect(mut); if (!heap_steal_pages(heap, npages)) { @@ -705,6 +736,8 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, if (!ret) ret = large_object_space_obtain_and_alloc(space, npages); + heap_unlock(heap); + if (!ret) { perror("weird: we have the space but mmap didn't work"); abort(); @@ -716,156 +749,112 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, size_t granules) { - struct heap *heap = mutator_heap(mut); - struct mark_space *space = heap_mark_space(heap); - struct gcobj_freelists *small_objects = heap_has_multiple_mutators(heap) ? - &space->small_objects : &mut->small_objects; - maybe_pause_mutator_for_collection(mut); - heap_lock(heap); - - while (mutators_are_stopping(heap)) - pause_mutator_for_collection_with_lock(mut); - int swept_from_beginning = 0; while (1) { struct gcobj_free_medium *already_scanned = NULL; do { - struct gcobj_free_medium **prev = &space->medium_objects; - for (struct gcobj_free_medium *medium = space->medium_objects; + struct gcobj_free_medium **prev = &mut->medium_objects; + for (struct gcobj_free_medium *medium = mut->medium_objects; medium != already_scanned; prev = &medium->next, medium = medium->next) { if (medium->granules >= granules) { unlink_medium_object(prev, medium); - split_medium_object(space, small_objects, medium, granules); - heap_unlock(heap); + split_medium_object(mut, medium, granules); struct gcobj *obj = (struct gcobj *)medium; obj->tag = tag_live(kind); return medium; } } - already_scanned = space->medium_objects; - } while (sweep(space, small_objects, 0, granules)); + already_scanned = mut->medium_objects; + } while (sweep(mut, 0, granules)); - // No medium object, and we swept across the whole heap. Collect. + struct heap *heap = mutator_heap(mut); if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); abort(); } else { - collect(mut); + heap_lock(heap); + if (mutators_are_stopping(heap)) + pause_mutator_for_collection_with_lock(mut); + else + collect(mut); + heap_unlock(heap); swept_from_beginning = 1; } } } -static int fill_small_from_local(struct gcobj_freelists *small_objects, - size_t granules) { +static int fill_small_from_small(struct mutator *mut, size_t granules) { // Precondition: the freelist for KIND is already empty. - ASSERT(!*get_small_object_freelist(small_objects, granules)); + ASSERT(!*get_small_object_freelist(mut, granules)); // See if there are small objects already on the freelists // that can be split. 
for (size_t next_size = granules + 1; next_size <= MEDIUM_OBJECT_GRANULE_THRESHOLD; next_size++) { - struct gcobj_free **loc = get_small_object_freelist(small_objects, - next_size); + struct gcobj_free **loc = get_small_object_freelist(mut, next_size); if (*loc) { struct gcobj_free *ret = *loc; *loc = ret->next; - push_small(small_objects, ret, granules, next_size); + push_small(mut, ret, granules, next_size); return 1; } } return 0; } -// with heap lock -static int fill_small_from_medium(struct mark_space *space, - struct gcobj_freelists *small_objects, - size_t granules) { +static int fill_small_from_medium(struct mutator *mut, size_t granules) { // If there is a medium object, take and split it. - struct gcobj_free_medium *medium = space->medium_objects; + struct gcobj_free_medium *medium = mut->medium_objects; if (!medium) return 0; - unlink_medium_object(&space->medium_objects, medium); + unlink_medium_object(&mut->medium_objects, medium); ASSERT(medium->granules >= MEDIUM_OBJECT_GRANULE_THRESHOLD); - split_medium_object(space, small_objects, medium, - MEDIUM_OBJECT_GRANULE_THRESHOLD); - push_small(small_objects, medium, granules, MEDIUM_OBJECT_GRANULE_THRESHOLD); + split_medium_object(mut, medium, MEDIUM_OBJECT_GRANULE_THRESHOLD); + push_small(mut, medium, granules, MEDIUM_OBJECT_GRANULE_THRESHOLD); return 1; } -static int fill_small_from_global_small(struct mark_space *space, - struct gcobj_freelists *small_objects, - size_t granules) { - struct gcobj_free **src = - get_small_object_freelist(&space->small_objects, granules); - if (*src) { - struct gcobj_free **dst = get_small_object_freelist(small_objects, granules); - ASSERT(!*dst); - *dst = *src; - *src = NULL; - return 1; - } - return 0; -} - -static void fill_small_from_global(struct mutator *mut, - size_t granules) NEVER_INLINE; -static void fill_small_from_global(struct mutator *mut, - size_t granules) { - struct gcobj_freelists *small_objects = &mut->small_objects; - struct heap *heap = mutator_heap(mut); - struct mark_space *space = heap_mark_space(heap); - +static void fill_small(struct mutator *mut, size_t granules) NEVER_INLINE; +static void fill_small(struct mutator *mut, size_t granules) { maybe_pause_mutator_for_collection(mut); - heap_lock(heap); - - while (mutators_are_stopping(heap)) - pause_mutator_for_collection_with_lock(mut); - int swept_from_beginning = 0; while (1) { - if (fill_small_from_global_small(space, small_objects, granules)) + if (fill_small_from_small(mut, granules)) break; - if (fill_small_from_medium(space, small_objects, granules)) + if (fill_small_from_medium(mut, granules)) break; - // By default, pull in 16 kB of data at a time. - if (!sweep(space, small_objects, granules, 0)) { + if (!sweep(mut, granules, 0)) { + struct heap *heap = mutator_heap(mut); if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); abort(); } else { - collect(mut); + heap_lock(heap); + if (mutators_are_stopping(heap)) + pause_mutator_for_collection_with_lock(mut); + else + collect(mut); + heap_unlock(heap); swept_from_beginning = 1; } } - if (*get_small_object_freelist(small_objects, granules)) + if (*get_small_object_freelist(mut, granules)) break; } - heap_unlock(heap); -} - -static void fill_small(struct mutator *mut, size_t granules) { - // See if there are small objects already on the local freelists that - // can be split. 
- if (fill_small_from_local(&mut->small_objects, granules)) - return; - - fill_small_from_global(mut, granules); } static inline void* allocate_small(struct mutator *mut, enum alloc_kind kind, size_t granules) { ASSERT(granules > 0); // allocating 0 granules would be silly - struct gcobj_free **loc = - get_small_object_freelist(&mut->small_objects, granules); + struct gcobj_free **loc = get_small_object_freelist(mut, granules); if (!*loc) fill_small(mut, granules); struct gcobj_free *ret = *loc; @@ -935,10 +924,7 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { space->nslabs = nslabs; space->low_addr = (uintptr_t) slabs; space->extent = size; - space->sweep = space->low_addr + space->extent; - for (size_t i = 0; i < nslabs; i++) - reclaim(space, NULL, 0, &slabs[i].blocks, - NONMETA_BLOCKS_PER_SLAB * GRANULES_PER_BLOCK); + reset_sweeper(space); return 1; } From 3a04078044c613bc07f7bab72b8afdce5edcb0e7 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 1 May 2022 15:04:21 +0200 Subject: [PATCH 074/403] mark-sweep uses all the metadata bits Don't require that mark bytes be cleared; instead we have rotating colors. Beginnings of support for concurrent marking, pinning, conservative roots, and generational collection. --- mark-sweep.h | 138 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 109 insertions(+), 29 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 7079b5aeb..7814f0382 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -30,6 +30,54 @@ STATIC_ASSERT_EQ(MEDIUM_OBJECT_THRESHOLD, STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, LARGE_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); +// Each granule has one metadata byte stored in a side table, used for +// mark bits but also for other per-object metadata. Already we were +// using a byte instead of a bit to facilitate parallel marking. +// (Parallel markers are allowed to race.) Turns out we can put a +// pinned bit there too, for objects that can't be moved. Actually +// there are two pinned bits: one that's managed by the collector, which +// pins referents of conservative roots, and one for pins managed +// externally (maybe because the mutator requested a pin.) Then there's +// a "remembered" bit, indicating that the object should be scanned for +// references to the nursery. If the remembered bit is set, the +// corresponding remset byte should also be set in the slab (see below). +// +// Getting back to mark bits -- because we want to allow for +// conservative roots, we need to know whether an address indicates an +// object or not. That means that when an object is allocated, it has +// to set a bit, somewhere. In our case we use the metadata byte, and +// set the "young" bit. In future we could use this for generational +// GC, with the sticky mark bit strategy. +// +// When an object becomes dead after a GC, it will still have a bit set +// -- maybe the young bit, or maybe a survivor bit. The sweeper has to +// clear these bits before the next collection. But, for concurrent +// marking, we will also be marking "live" objects, updating their mark +// bits. So there are four object states concurrently observable: +// young, dead, survivor, and marked. (If we didn't have concurrent +// marking we would still need the "marked" state, because marking +// mutator roots before stopping is also a form of concurrent marking.) +// Even though these states are mutually exclusive, we use separate bits +// for them because we have the space. 
After each collection, the dead, +// survivor, and marked states rotate by one bit. +enum metadata_byte { + METADATA_BYTE_NONE = 0, + METADATA_BYTE_YOUNG = 1, + METADATA_BYTE_MARK_0 = 2, + METADATA_BYTE_MARK_1 = 4, + METADATA_BYTE_MARK_2 = 8, + METADATA_BYTE_END = 16, + METADATA_BYTE_PINNED = 32, + METADATA_BYTE_PERMAPINNED = 64, + METADATA_BYTE_REMEMBERED = 128 +}; + +static uint8_t rotate_dead_survivor_marked(uint8_t mask) { + uint8_t all = + METADATA_BYTE_MARK_0 | METADATA_BYTE_MARK_1 | METADATA_BYTE_MARK_2; + return ((mask << 1) | (mask >> 2)) & all; +} + #define SLAB_SIZE (4 * 1024 * 1024) #define BLOCK_SIZE (64 * 1024) #define METADATA_BYTES_PER_BLOCK (BLOCK_SIZE / GRANULE_SIZE) @@ -155,6 +203,8 @@ struct gcobj { }; struct mark_space { + uint8_t sweep_live_mask; + uint8_t marked_mask; uintptr_t low_addr; size_t extent; size_t heap_size; @@ -233,10 +283,13 @@ static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { static inline int mark_space_trace_object(struct mark_space *space, struct gcobj *obj) { - uint8_t *byte = mark_byte(space, obj); - if (*byte) + uint8_t *loc = object_metadata_byte(obj); + uint8_t byte = *loc; + if (byte & space->marked_mask) return 0; - *byte = 1; + uint8_t mask = METADATA_BYTE_YOUNG | METADATA_BYTE_MARK_0 + | METADATA_BYTE_MARK_1 | METADATA_BYTE_MARK_2; + *loc = (byte & ~mask) | space->marked_mask; return 1; } @@ -501,6 +554,11 @@ static void reset_sweeper(struct mark_space *space) { space->next_block = (uintptr_t) &space->slabs[0].blocks; } +static void rotate_mark_bytes(struct mark_space *space) { + space->sweep_live_mask = rotate_dead_survivor_marked(space->sweep_live_mask); + space->marked_mask = rotate_dead_survivor_marked(space->marked_mask); +} + static void collect(struct mutator *mut) { struct heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); @@ -516,6 +574,7 @@ static void collect(struct mutator *mut) { tracer_trace(heap); tracer_release(heap); reset_sweeper(space); + rotate_mark_bytes(space); heap->count++; large_object_space_finish_gc(lospace); heap_reset_stolen_pages(heap, lospace->live_pages_at_last_collection); @@ -602,24 +661,23 @@ static size_t live_object_granules(struct gcobj *obj) { return size_to_granules(bytes); } -static size_t next_mark(const uint8_t *mark, size_t limit) { - size_t n = 0; - for (; (((uintptr_t)mark) & 7) && n < limit; n++) - if (mark[n]) - return n; - uintptr_t *word_mark = (uintptr_t *)(mark + n); - for (; - n + sizeof(uintptr_t) * 4 <= limit; - n += sizeof(uintptr_t) * 4, word_mark += 4) - if (word_mark[0] | word_mark[1] | word_mark[2] | word_mark[3]) - break; - for (; - n + sizeof(uintptr_t) <= limit; - n += sizeof(uintptr_t), word_mark += 1) - if (word_mark[0]) - break; - for (; n < limit; n++) - if (mark[n]) +static size_t sweep_and_check_live(uint8_t *loc, uint8_t live_mask) { + uint8_t metadata = *loc; + // If the metadata byte is nonzero, that means either a young, dead, + // survived, or marked object. If it's live (young, survived, or + // marked), we found the next mark. Otherwise it's dead and we clear + // the byte. 
+ if (metadata) { + if (metadata & live_mask) + return 1; + *loc = 0; + } + return 0; +} + +static size_t next_mark(uint8_t *mark, size_t limit, uint8_t live_mask) { + for (size_t n = 0; n < limit; n++) + if (sweep_and_check_live(&mark[n], live_mask)) return n; return limit; } @@ -656,6 +714,7 @@ static int sweep(struct mutator *mut, ssize_t to_reclaim = 2 * MEDIUM_OBJECT_GRANULE_THRESHOLD; uintptr_t sweep = mut->sweep; uintptr_t limit = align_up(sweep, BLOCK_SIZE); + uint8_t live_mask = heap_mark_space(mutator_heap(mut))->sweep_live_mask; if (sweep == limit) { sweep = mark_space_next_block(heap_mark_space(mutator_heap(mut))); @@ -678,7 +737,7 @@ static int sweep(struct mutator *mut, limit_granules = to_reclaim; } } - size_t free_granules = next_mark(mark, limit_granules); + size_t free_granules = next_mark(mark, limit_granules, live_mask); if (free_granules) { ASSERT(free_granules <= limit_granules); size_t free_bytes = free_granules * GRANULE_SIZE; @@ -691,9 +750,8 @@ static int sweep(struct mutator *mut, if (free_granules == limit_granules) break; } - // Object survived collection; clear mark and continue sweeping. - ASSERT(*mark == 1); - *mark = 0; + // Object survived collection; skip over it and continue sweeping. + ASSERT((*mark) & live_mask); sweep += live_object_granules((struct gcobj *)sweep) * GRANULE_SIZE; } @@ -702,15 +760,32 @@ static int sweep(struct mutator *mut, } // Another thread is triggering GC. Before we stop, finish clearing the -// mark bytes for the mutator's block, and release the block. +// dead mark bytes for the mutator's block, and release the block. static void finish_sweeping(struct mutator *mut) { uintptr_t sweep = mut->sweep; + uintptr_t limit = align_up(sweep, BLOCK_SIZE); + uint8_t live_mask = heap_mark_space(mutator_heap(mut))->sweep_live_mask; if (sweep) { - uintptr_t limit = align_up(sweep, BLOCK_SIZE); uint8_t* mark = object_metadata_byte((struct gcobj*)sweep); size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; - memset(mark, 0, limit_granules); - mut->sweep = 0; + while (limit_granules) { + size_t free_granules = next_mark(mark, limit_granules, live_mask); + if (free_granules) { + ASSERT(free_granules <= limit_granules); + size_t free_bytes = free_granules * GRANULE_SIZE; + sweep += free_bytes; + mark += free_granules; + limit_granules -= free_granules; + if (limit_granules == 0) + break; + } + // Object survived collection; skip over it and continue sweeping. + ASSERT((*mark) & live_mask); + size_t live_granules = live_object_granules((struct gcobj *)sweep); + sweep += live_granules * GRANULE_SIZE; + limit_granules -= live_granules; + mark += live_granules; + } } } @@ -920,6 +995,11 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { if (!slabs) return 0; + uint8_t dead = METADATA_BYTE_MARK_0; + uint8_t survived = METADATA_BYTE_MARK_1; + uint8_t marked = METADATA_BYTE_MARK_2; + space->marked_mask = marked; + space->sweep_live_mask = METADATA_BYTE_YOUNG | survived | marked; space->slabs = slabs; space->nslabs = nslabs; space->low_addr = (uintptr_t) slabs; From ce69e9ed4cd43d8cd0dae1ac459f09b86a9e6bd7 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 1 May 2022 15:19:13 +0200 Subject: [PATCH 075/403] Record object sizes in metadata byte array This will let us avoid paging in objects when sweeping. 
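As a hedged illustration of the encoding this commit adopts (the bit values are copied from the metadata_byte enum above; record_object and object_granules are illustrative names): the first granule's byte carries the young/mark bits and the last granule's byte carries the END bit, so an object's extent can be measured from the side table alone.

#include <stddef.h>
#include <stdint.h>

enum { METADATA_BYTE_YOUNG = 1, METADATA_BYTE_END = 16 };

/* Record a freshly allocated object of `granules` granules in the side
   table: young bit on the first granule, END bit on the last.  A
   one-granule object carries both bits in the same byte. */
void record_object(uint8_t *metadata, size_t granules) {
  if (granules == 1) {
    metadata[0] = METADATA_BYTE_YOUNG | METADATA_BYTE_END;
  } else {
    metadata[0] = METADATA_BYTE_YOUNG;
    metadata[granules - 1] = METADATA_BYTE_END;
  }
}

/* Recover the size by walking metadata bytes until the END bit; the
   object's own memory is never touched, so sweeping a dead region does
   not fault its pages back in. */
size_t object_granules(const uint8_t *metadata) {
  size_t n = 0;
  while ((metadata[n] & METADATA_BYTE_END) == 0)
    n++;
  return n + 1;
}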
--- mark-sweep.h | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 7814f0382..93caf2178 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -646,19 +646,11 @@ static void unlink_medium_object(struct gcobj_free_medium **prev, *prev = medium->next; } -static size_t live_object_granules(struct gcobj *obj) { - size_t bytes; - switch (tag_live_alloc_kind (obj->tag)) { -#define COMPUTE_SIZE(name, Name, NAME) \ - case ALLOC_KIND_##NAME: \ - bytes = name##_size((Name*)obj); \ - break; - FOR_EACH_HEAP_OBJECT_KIND(COMPUTE_SIZE) -#undef COMPUTE_SIZE - default: - abort (); - } - return size_to_granules(bytes); +static size_t mark_space_live_object_granules(uint8_t *metadata) { + size_t n = 0; + while ((metadata[n] & METADATA_BYTE_END) == 0) + n++; + return n + 1; } static size_t sweep_and_check_live(uint8_t *loc, uint8_t live_mask) { @@ -752,7 +744,7 @@ static int sweep(struct mutator *mut, } // Object survived collection; skip over it and continue sweeping. ASSERT((*mark) & live_mask); - sweep += live_object_granules((struct gcobj *)sweep) * GRANULE_SIZE; + sweep += mark_space_live_object_granules(mark) * GRANULE_SIZE; } mut->sweep = sweep; @@ -772,8 +764,6 @@ static void finish_sweeping(struct mutator *mut) { size_t free_granules = next_mark(mark, limit_granules, live_mask); if (free_granules) { ASSERT(free_granules <= limit_granules); - size_t free_bytes = free_granules * GRANULE_SIZE; - sweep += free_bytes; mark += free_granules; limit_granules -= free_granules; if (limit_granules == 0) @@ -781,8 +771,7 @@ static void finish_sweeping(struct mutator *mut) { } // Object survived collection; skip over it and continue sweeping. ASSERT((*mark) & live_mask); - size_t live_granules = live_object_granules((struct gcobj *)sweep); - sweep += live_granules * GRANULE_SIZE; + size_t live_granules = mark_space_live_object_granules(mark); limit_granules -= live_granules; mark += live_granules; } @@ -839,6 +828,9 @@ static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, split_medium_object(mut, medium, granules); struct gcobj *obj = (struct gcobj *)medium; obj->tag = tag_live(kind); + uint8_t *metadata = object_metadata_byte(obj); + metadata[0] = METADATA_BYTE_YOUNG; + metadata[granules - 1] = METADATA_BYTE_END; return medium; } } @@ -933,6 +925,13 @@ static inline void* allocate_small(struct mutator *mut, enum alloc_kind kind, if (!*loc) fill_small(mut, granules); struct gcobj_free *ret = *loc; + uint8_t *metadata = object_metadata_byte(ret); + if (granules == 1) { + metadata[0] = METADATA_BYTE_YOUNG | METADATA_BYTE_END; + } else { + metadata[0] = METADATA_BYTE_YOUNG; + metadata[granules - 1] = METADATA_BYTE_END; + } *loc = ret->next; struct gcobj *obj = (struct gcobj *)ret; obj->tag = tag_live(kind); From 2a68dadf22af9f4eb3dd51ea6336b1a437e5d2c7 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 1 May 2022 16:09:20 +0200 Subject: [PATCH 076/403] Accelerate sweeping Read a word at a time from the mark byte array. If the mark word doesn't correspond to live data there will be no contention and we can clear it with one write. 
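A minimal standalone sketch of the idea, not the patch's exact code: broadcast the live mask (the current young/survivor/marked bits) into every byte of a 64-bit word, test eight metadata bytes at once, and clear wholly dead words with a single store. Little-endian byte order and the helper names are assumptions here.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Replicate a one-byte mask into all eight bytes of a 64-bit word. */
static uint64_t broadcast_byte(uint8_t byte) {
  return (uint64_t) byte * 0x0101010101010101ULL;
}

/* Return the index of the first of `limit` metadata bytes sharing a
   bit with `live_mask`, or `limit` if none does, clearing wholly dead
   words in bulk along the way.  Assumes little-endian order so the
   lowest set bit maps to the lowest-addressed byte. */
size_t scan_for_live(uint8_t *metadata, size_t limit, uint8_t live_mask) {
  uint64_t sweep_mask = broadcast_byte(live_mask);
  size_t n = 0;
  for (; n + 8 <= limit; n += 8) {
    uint64_t word;
    memcpy(&word, metadata + n, 8);          /* unaligned-safe bulk load */
    uint64_t live = word & sweep_mask;
    if (live)
      return n + __builtin_ctzll(live) / 8;  /* first byte with a live bit */
    if (word)
      memset(metadata + n, 0, 8);            /* only dead/stale bits: clear */
  }
  for (; n < limit; n++) {                   /* byte-at-a-time tail */
    if (metadata[n] & live_mask)
      return n;
    metadata[n] = 0;
  }
  return limit;
}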
--- mark-sweep.h | 58 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 93caf2178..b700aa905 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -203,7 +203,8 @@ struct gcobj { }; struct mark_space { - uint8_t sweep_live_mask; + uintptr_t sweep_mask; + uint8_t live_mask; uint8_t marked_mask; uintptr_t low_addr; size_t extent; @@ -554,9 +555,15 @@ static void reset_sweeper(struct mark_space *space) { space->next_block = (uintptr_t) &space->slabs[0].blocks; } +static uintptr_t broadcast_byte(uint8_t byte) { + uintptr_t result = byte; + return result * 0x0101010101010101ULL; +} + static void rotate_mark_bytes(struct mark_space *space) { - space->sweep_live_mask = rotate_dead_survivor_marked(space->sweep_live_mask); + space->live_mask = rotate_dead_survivor_marked(space->live_mask); space->marked_mask = rotate_dead_survivor_marked(space->marked_mask); + space->sweep_mask = broadcast_byte(space->live_mask); } static void collect(struct mutator *mut) { @@ -653,23 +660,45 @@ static size_t mark_space_live_object_granules(uint8_t *metadata) { return n + 1; } -static size_t sweep_and_check_live(uint8_t *loc, uint8_t live_mask) { +// FIXME: use atomics +static int sweep_byte(uint8_t *loc, uintptr_t sweep_mask) { uint8_t metadata = *loc; // If the metadata byte is nonzero, that means either a young, dead, // survived, or marked object. If it's live (young, survived, or // marked), we found the next mark. Otherwise it's dead and we clear // the byte. if (metadata) { - if (metadata & live_mask) + if (metadata & sweep_mask) return 1; *loc = 0; } return 0; } -static size_t next_mark(uint8_t *mark, size_t limit, uint8_t live_mask) { - for (size_t n = 0; n < limit; n++) - if (sweep_and_check_live(&mark[n], live_mask)) +static int sweep_word(uintptr_t *loc, uintptr_t sweep_mask) { + uintptr_t metadata = *loc; + if (metadata) { + if (metadata & sweep_mask) + return 1; + *loc = 0; + } + return 0; +} + +static size_t next_mark(uint8_t *mark, size_t limit, uintptr_t sweep_mask) { + size_t n = 0; + // FIXME: may_alias + for (; (((uintptr_t)mark) & (sizeof(uintptr_t)-1)) && n < limit; n++) + if (sweep_byte(&mark[n], sweep_mask)) + return n; + + uintptr_t *mark_word = (uintptr_t*)&mark[n]; + for (; n + sizeof(uintptr_t) <= limit; n += sizeof(uintptr_t), mark_word++) + if (sweep_word(mark_word, sweep_mask)) + break; + + for (; n < limit; n++) + if (sweep_byte(&mark[n], sweep_mask)) return n; return limit; } @@ -706,7 +735,7 @@ static int sweep(struct mutator *mut, ssize_t to_reclaim = 2 * MEDIUM_OBJECT_GRANULE_THRESHOLD; uintptr_t sweep = mut->sweep; uintptr_t limit = align_up(sweep, BLOCK_SIZE); - uint8_t live_mask = heap_mark_space(mutator_heap(mut))->sweep_live_mask; + uintptr_t sweep_mask = heap_mark_space(mutator_heap(mut))->sweep_mask; if (sweep == limit) { sweep = mark_space_next_block(heap_mark_space(mutator_heap(mut))); @@ -729,7 +758,7 @@ static int sweep(struct mutator *mut, limit_granules = to_reclaim; } } - size_t free_granules = next_mark(mark, limit_granules, live_mask); + size_t free_granules = next_mark(mark, limit_granules, sweep_mask); if (free_granules) { ASSERT(free_granules <= limit_granules); size_t free_bytes = free_granules * GRANULE_SIZE; @@ -743,7 +772,7 @@ static int sweep(struct mutator *mut, break; } // Object survived collection; skip over it and continue sweeping. 
- ASSERT((*mark) & live_mask); + ASSERT((*mark) & sweep_mask); sweep += mark_space_live_object_granules(mark) * GRANULE_SIZE; } @@ -756,12 +785,12 @@ static int sweep(struct mutator *mut, static void finish_sweeping(struct mutator *mut) { uintptr_t sweep = mut->sweep; uintptr_t limit = align_up(sweep, BLOCK_SIZE); - uint8_t live_mask = heap_mark_space(mutator_heap(mut))->sweep_live_mask; + uint8_t sweep_mask = heap_mark_space(mutator_heap(mut))->sweep_mask; if (sweep) { uint8_t* mark = object_metadata_byte((struct gcobj*)sweep); size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; while (limit_granules) { - size_t free_granules = next_mark(mark, limit_granules, live_mask); + size_t free_granules = next_mark(mark, limit_granules, sweep_mask); if (free_granules) { ASSERT(free_granules <= limit_granules); mark += free_granules; @@ -770,7 +799,7 @@ static void finish_sweeping(struct mutator *mut) { break; } // Object survived collection; skip over it and continue sweeping. - ASSERT((*mark) & live_mask); + ASSERT((*mark) & sweep_mask); size_t live_granules = mark_space_live_object_granules(mark); limit_granules -= live_granules; mark += live_granules; @@ -998,7 +1027,8 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { uint8_t survived = METADATA_BYTE_MARK_1; uint8_t marked = METADATA_BYTE_MARK_2; space->marked_mask = marked; - space->sweep_live_mask = METADATA_BYTE_YOUNG | survived | marked; + space->live_mask = METADATA_BYTE_YOUNG | survived | marked; + rotate_mark_bytes(space); space->slabs = slabs; space->nslabs = nslabs; space->low_addr = (uintptr_t) slabs; From f51e9697309c5679b4786a3d9bd8eb9d29e760d3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 1 May 2022 16:23:10 +0200 Subject: [PATCH 077/403] Use atomics when sweeping Otherwise, there is a race with concurrent marking, though possibly just during the ragged stop. --- mark-sweep.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index b700aa905..8137e3632 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -660,27 +660,27 @@ static size_t mark_space_live_object_granules(uint8_t *metadata) { return n + 1; } -// FIXME: use atomics static int sweep_byte(uint8_t *loc, uintptr_t sweep_mask) { - uint8_t metadata = *loc; + uint8_t metadata = atomic_load_explicit(loc, memory_order_relaxed); // If the metadata byte is nonzero, that means either a young, dead, // survived, or marked object. If it's live (young, survived, or // marked), we found the next mark. Otherwise it's dead and we clear - // the byte. + // the byte. If we see an END, that means an end of a dead object; + // clear it. if (metadata) { if (metadata & sweep_mask) return 1; - *loc = 0; + atomic_store_explicit(loc, 0, memory_order_relaxed); } return 0; } static int sweep_word(uintptr_t *loc, uintptr_t sweep_mask) { - uintptr_t metadata = *loc; + uintptr_t metadata = atomic_load_explicit(loc, memory_order_relaxed); if (metadata) { if (metadata & sweep_mask) return 1; - *loc = 0; + atomic_store_explicit(loc, 0, memory_order_relaxed); } return 0; } From 0d0d684952a780aed3c7e42bd54a51ea2e82e110 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 1 May 2022 17:07:30 +0200 Subject: [PATCH 078/403] Mark-sweep does bump-pointer allocation into holes Instead of freelists, have mark-sweep use the metadata byte array to identify holes, and bump-pointer allocate into those holes. 
--- mark-sweep.h | 293 ++++++++------------------------------------------- 1 file changed, 46 insertions(+), 247 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index 8137e3632..eb4d2e353 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -182,21 +182,9 @@ static inline uintptr_t tag_live(uint8_t alloc_kind) { return ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift); } -struct gcobj_free { - struct gcobj_free *next; -}; - -// Objects larger than MEDIUM_OBJECT_GRANULE_THRESHOLD. -struct gcobj_free_medium { - struct gcobj_free_medium *next; - size_t granules; -}; - struct gcobj { union { uintptr_t tag; - struct gcobj_free free; - struct gcobj_free_medium free_medium; uintptr_t words[0]; void *pointers[0]; }; @@ -240,10 +228,8 @@ struct mutator_mark_buf { }; struct mutator { - // Segregated freelists of small objects. - struct gcobj_free *small_objects[MEDIUM_OBJECT_GRANULE_THRESHOLD]; - // Unordered list of medium objects. - struct gcobj_free_medium *medium_objects; + // Bump-pointer allocation into holes. + uintptr_t alloc; uintptr_t sweep; struct heap *heap; struct handle *roots; @@ -264,12 +250,6 @@ static inline struct heap* mutator_heap(struct mutator *mutator) { return mutator->heap; } -static inline struct gcobj_free** -get_small_object_freelist(struct mutator *mut, size_t granules) { - ASSERT(granules > 0 && granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD); - return &mut->small_objects[granules - 1]; -} - #define GC_HEADER uintptr_t _gc_header static inline void clear_memory(uintptr_t addr, size_t size) { @@ -327,12 +307,6 @@ static inline void trace_one(struct gcobj *obj, void *mark_data) { } } -static void clear_mutator_freelists(struct mutator *mut) { - for (int i = 0; i < MEDIUM_OBJECT_GRANULE_THRESHOLD; i++) - mut->small_objects[i] = NULL; - mut->medium_objects = NULL; -} - static int heap_has_multiple_mutators(struct heap *heap) { return atomic_load_explicit(&heap->multithreaded, memory_order_relaxed); } @@ -530,7 +504,6 @@ static void pause_mutator_for_collection_with_lock(struct mutator *mut) { finish_sweeping(mut); mark_controlling_mutator_roots(mut); pause_mutator_for_collection(heap); - clear_mutator_freelists(mut); } static void pause_mutator_for_collection_without_lock(struct mutator *mut) NEVER_INLINE; @@ -543,7 +516,6 @@ static void pause_mutator_for_collection_without_lock(struct mutator *mut) { pause_mutator_for_collection(heap); heap_unlock(heap); release_stopping_mutator_roots(mut); - clear_mutator_freelists(mut); } static inline void maybe_pause_mutator_for_collection(struct mutator *mut) { @@ -586,73 +558,9 @@ static void collect(struct mutator *mut) { large_object_space_finish_gc(lospace); heap_reset_stolen_pages(heap, lospace->live_pages_at_last_collection); allow_mutators_to_continue(heap); - clear_mutator_freelists(mut); DEBUG("collect done\n"); } -static void push_free(struct gcobj_free **loc, struct gcobj_free *obj) { - obj->next = *loc; - *loc = obj; -} - -static void push_small(struct mutator *mut, void *region, - size_t granules, size_t region_granules) { - uintptr_t addr = (uintptr_t) region; - struct gcobj_free **loc = get_small_object_freelist(mut, granules); - while (granules <= region_granules) { - push_free(loc, (struct gcobj_free*) addr); - region_granules -= granules; - addr += granules * GRANULE_SIZE; - } - // Fit any remaining granules into smaller freelist. 
- if (region_granules) - push_free(get_small_object_freelist(mut, region_granules), - (struct gcobj_free*) addr); -} - -static void push_medium(struct mutator *mut, void *region, size_t granules) { - struct gcobj_free_medium *medium = region; - medium->next = mut->medium_objects; - medium->granules = granules; - mut->medium_objects = medium; -} - -static void reclaim(struct mutator *mut, - size_t small_object_granules, - void *region, - size_t region_granules) { - if (small_object_granules == 0) - small_object_granules = region_granules; - if (small_object_granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) - push_small(mut, region, small_object_granules, region_granules); - else - push_medium(mut, region, region_granules); -} - -static void split_medium_object(struct mutator *mut, - struct gcobj_free_medium *medium, - size_t granules) { - size_t medium_granules = medium->granules; - ASSERT(medium_granules >= granules); - ASSERT(granules >= MEDIUM_OBJECT_GRANULE_THRESHOLD); - // Invariant: all words in MEDIUM are 0 except the two header words. - // MEDIUM is off the freelist. We return a block of cleared memory, so - // clear those fields now. - medium->next = NULL; - medium->granules = 0; - - if (medium_granules == granules) - return; - - char *tail = ((char*)medium) + granules * GRANULE_SIZE; - reclaim(mut, 0, tail, medium_granules - granules); -} - -static void unlink_medium_object(struct gcobj_free_medium **prev, - struct gcobj_free_medium *medium) { - *prev = medium->next; -} - static size_t mark_space_live_object_granules(uint8_t *metadata) { size_t n = 0; while ((metadata[n] & METADATA_BYTE_END) == 0) @@ -724,87 +632,46 @@ static uintptr_t mark_space_next_block(struct mark_space *space) { return block; } -// Sweep some heap to reclaim free space. Return 1 if there is more -// heap to sweep, or 0 if we reached the end. -static int sweep(struct mutator *mut, - size_t small_object_granules, - size_t medium_object_granules) { - // Sweep until we have reclaimed memory corresponding to twice the - // size of the smallest medium object, or we reach the end of the - // block. - ssize_t to_reclaim = 2 * MEDIUM_OBJECT_GRANULE_THRESHOLD; +// Sweep some heap to reclaim free space, resetting mut->alloc and +// mut->sweep. Return the size of the hole in granules. 
+static size_t next_hole(struct mutator *mut, size_t clear_size) { uintptr_t sweep = mut->sweep; uintptr_t limit = align_up(sweep, BLOCK_SIZE); uintptr_t sweep_mask = heap_mark_space(mutator_heap(mut))->sweep_mask; - if (sweep == limit) { - sweep = mark_space_next_block(heap_mark_space(mutator_heap(mut))); - if (sweep == 0) { - mut->sweep = 0; - return 0; + while (1) { + if (sweep == limit) { + sweep = mark_space_next_block(heap_mark_space(mutator_heap(mut))); + if (sweep == 0) { + mut->alloc = mut->sweep = 0; + return 0; + } + limit = sweep + BLOCK_SIZE; } - limit = sweep + BLOCK_SIZE; - } - while (to_reclaim > 0 && sweep < limit) { ASSERT((sweep & (GRANULE_SIZE - 1)) == 0); uint8_t* mark = object_metadata_byte((struct gcobj*)sweep); size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; - if (limit_granules > to_reclaim) { - if (small_object_granules == 0) { - if (medium_object_granules < limit_granules) - limit_granules = medium_object_granules; - } else { - limit_granules = to_reclaim; - } - } size_t free_granules = next_mark(mark, limit_granules, sweep_mask); if (free_granules) { ASSERT(free_granules <= limit_granules); size_t free_bytes = free_granules * GRANULE_SIZE; - clear_memory(sweep + sizeof(uintptr_t), free_bytes - sizeof(uintptr_t)); - reclaim(mut, small_object_granules, (void*)sweep, free_granules); - sweep += free_bytes; - to_reclaim -= free_granules; - - mark += free_granules; - if (free_granules == limit_granules) - break; + if (free_granules >= clear_size) + clear_memory(sweep, free_bytes); + mut->alloc = sweep; + mut->sweep = sweep + free_bytes; + return free_granules; } // Object survived collection; skip over it and continue sweeping. ASSERT((*mark) & sweep_mask); sweep += mark_space_live_object_granules(mark) * GRANULE_SIZE; } - - mut->sweep = sweep; - return 1; } // Another thread is triggering GC. Before we stop, finish clearing the // dead mark bytes for the mutator's block, and release the block. static void finish_sweeping(struct mutator *mut) { - uintptr_t sweep = mut->sweep; - uintptr_t limit = align_up(sweep, BLOCK_SIZE); - uint8_t sweep_mask = heap_mark_space(mutator_heap(mut))->sweep_mask; - if (sweep) { - uint8_t* mark = object_metadata_byte((struct gcobj*)sweep); - size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; - while (limit_granules) { - size_t free_granules = next_mark(mark, limit_granules, sweep_mask); - if (free_granules) { - ASSERT(free_granules <= limit_granules); - mark += free_granules; - limit_granules -= free_granules; - if (limit_granules == 0) - break; - } - // Object survived collection; skip over it and continue sweeping. 
- ASSERT((*mark) & sweep_mask); - size_t live_granules = mark_space_live_object_granules(mark); - limit_granules -= live_granules; - mark += live_granules; - } - } + while (next_hole(mut, -1)) {} } static void* allocate_large(struct mutator *mut, enum alloc_kind kind, @@ -840,93 +707,16 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, return ret; } -static void* allocate_medium(struct mutator *mut, enum alloc_kind kind, - size_t granules) { - maybe_pause_mutator_for_collection(mut); - +static void* allocate_small_slow(struct mutator *mut, enum alloc_kind kind, + size_t granules) NEVER_INLINE; +static void* allocate_small_slow(struct mutator *mut, enum alloc_kind kind, + size_t granules) { int swept_from_beginning = 0; while (1) { - struct gcobj_free_medium *already_scanned = NULL; - do { - struct gcobj_free_medium **prev = &mut->medium_objects; - for (struct gcobj_free_medium *medium = mut->medium_objects; - medium != already_scanned; - prev = &medium->next, medium = medium->next) { - if (medium->granules >= granules) { - unlink_medium_object(prev, medium); - split_medium_object(mut, medium, granules); - struct gcobj *obj = (struct gcobj *)medium; - obj->tag = tag_live(kind); - uint8_t *metadata = object_metadata_byte(obj); - metadata[0] = METADATA_BYTE_YOUNG; - metadata[granules - 1] = METADATA_BYTE_END; - return medium; - } - } - already_scanned = mut->medium_objects; - } while (sweep(mut, 0, granules)); - - struct heap *heap = mutator_heap(mut); - if (swept_from_beginning) { - fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); - abort(); - } else { - heap_lock(heap); - if (mutators_are_stopping(heap)) - pause_mutator_for_collection_with_lock(mut); - else - collect(mut); - heap_unlock(heap); - swept_from_beginning = 1; - } - } -} - -static int fill_small_from_small(struct mutator *mut, size_t granules) { - // Precondition: the freelist for KIND is already empty. - ASSERT(!*get_small_object_freelist(mut, granules)); - // See if there are small objects already on the freelists - // that can be split. - for (size_t next_size = granules + 1; - next_size <= MEDIUM_OBJECT_GRANULE_THRESHOLD; - next_size++) { - struct gcobj_free **loc = get_small_object_freelist(mut, next_size); - if (*loc) { - struct gcobj_free *ret = *loc; - *loc = ret->next; - push_small(mut, ret, granules, next_size); - return 1; - } - } - return 0; -} - -static int fill_small_from_medium(struct mutator *mut, size_t granules) { - // If there is a medium object, take and split it. 
- struct gcobj_free_medium *medium = mut->medium_objects; - if (!medium) - return 0; - - unlink_medium_object(&mut->medium_objects, medium); - ASSERT(medium->granules >= MEDIUM_OBJECT_GRANULE_THRESHOLD); - split_medium_object(mut, medium, MEDIUM_OBJECT_GRANULE_THRESHOLD); - push_small(mut, medium, granules, MEDIUM_OBJECT_GRANULE_THRESHOLD); - return 1; -} - -static void fill_small(struct mutator *mut, size_t granules) NEVER_INLINE; -static void fill_small(struct mutator *mut, size_t granules) { - maybe_pause_mutator_for_collection(mut); - - int swept_from_beginning = 0; - while (1) { - if (fill_small_from_small(mut, granules)) + size_t hole = next_hole(mut, granules); + if (hole >= granules) break; - - if (fill_small_from_medium(mut, granules)) - break; - - if (!sweep(mut, granules, 0)) { + if (!hole) { struct heap *heap = mutator_heap(mut); if (swept_from_beginning) { fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); @@ -941,32 +731,41 @@ static void fill_small(struct mutator *mut, size_t granules) { swept_from_beginning = 1; } } - - if (*get_small_object_freelist(mut, granules)) - break; } + struct gcobj* ret = (struct gcobj*)mut->alloc; + mut->alloc += granules * GRANULE_SIZE; + return ret; } static inline void* allocate_small(struct mutator *mut, enum alloc_kind kind, size_t granules) { ASSERT(granules > 0); // allocating 0 granules would be silly - struct gcobj_free **loc = get_small_object_freelist(mut, granules); - if (!*loc) - fill_small(mut, granules); - struct gcobj_free *ret = *loc; - uint8_t *metadata = object_metadata_byte(ret); + uintptr_t alloc = mut->alloc; + uintptr_t sweep = mut->sweep; + uintptr_t new_alloc = alloc + granules * GRANULE_SIZE; + struct gcobj *obj; + if (new_alloc <= sweep) { + mut->alloc = new_alloc; + obj = (struct gcobj *)alloc; + } else { + obj = allocate_small_slow(mut, kind, granules); + } + obj->tag = tag_live(kind); + uint8_t *metadata = object_metadata_byte(obj); if (granules == 1) { metadata[0] = METADATA_BYTE_YOUNG | METADATA_BYTE_END; } else { metadata[0] = METADATA_BYTE_YOUNG; metadata[granules - 1] = METADATA_BYTE_END; } - *loc = ret->next; - struct gcobj *obj = (struct gcobj *)ret; - obj->tag = tag_live(kind); return obj; } +static inline void* allocate_medium(struct mutator *mut, enum alloc_kind kind, + size_t granules) { + return allocate_small(mut, kind, granules); +} + static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t size) { size_t granules = size_to_granules(size); From 815f206e2863aa6bf1f4373b19d1c942a075a5bb Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 6 May 2022 12:54:28 +0200 Subject: [PATCH 079/403] Optimize sweeping Use uint64 instead of uintptr when bulk-reading metadata bytes. Assume that live objects come in plugs rather than each object being separated by a hole. Always bulk-load metadata bytes when measuring holes, and be less branchy. Lazily clear hole bytes as we allocate. Add a place to record lost space due to fragmentation. --- mark-sweep.h | 187 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 135 insertions(+), 52 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index eb4d2e353..d19528d2d 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -191,7 +191,7 @@ struct gcobj { }; struct mark_space { - uintptr_t sweep_mask; + uint64_t sweep_mask; uint8_t live_mask; uint8_t marked_mask; uintptr_t low_addr; @@ -231,6 +231,7 @@ struct mutator { // Bump-pointer allocation into holes. 
uintptr_t alloc; uintptr_t sweep; + uintptr_t block; struct heap *heap; struct handle *roots; struct mutator_mark_buf mark_buf; @@ -453,10 +454,11 @@ static void wait_for_mutators_to_stop(struct heap *heap) { } static void finish_sweeping(struct mutator *mut); +static void finish_sweeping_in_block(struct mutator *mut); static void mark_inactive_mutators(struct heap *heap) { for (struct mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) { - finish_sweeping(mut); + finish_sweeping_in_block(mut); mark_controlling_mutator_roots(mut); } } @@ -501,7 +503,7 @@ static void pause_mutator_for_collection_with_lock(struct mutator *mut) NEVER_IN static void pause_mutator_for_collection_with_lock(struct mutator *mut) { struct heap *heap = mutator_heap(mut); ASSERT(mutators_are_stopping(heap)); - finish_sweeping(mut); + finish_sweeping_in_block(mut); mark_controlling_mutator_roots(mut); pause_mutator_for_collection(heap); } @@ -527,8 +529,8 @@ static void reset_sweeper(struct mark_space *space) { space->next_block = (uintptr_t) &space->slabs[0].blocks; } -static uintptr_t broadcast_byte(uint8_t byte) { - uintptr_t result = byte; +static uint64_t broadcast_byte(uint8_t byte) { + uint64_t result = byte; return result * 0x0101010101010101ULL; } @@ -547,6 +549,7 @@ static void collect(struct mutator *mut) { tracer_prepare(heap); request_mutators_to_stop(heap); mark_controlling_mutator_roots(mut); + finish_sweeping(mut); wait_for_mutators_to_stop(heap); mark_inactive_mutators(heap); mark_global_roots(heap); @@ -593,21 +596,42 @@ static int sweep_word(uintptr_t *loc, uintptr_t sweep_mask) { return 0; } -static size_t next_mark(uint8_t *mark, size_t limit, uintptr_t sweep_mask) { +static inline uint64_t load_mark_bytes(uint8_t *mark) { + ASSERT(((uintptr_t)mark & 7) == 0); + uint8_t * __attribute__((aligned(8))) aligned_mark = mark; + uint64_t word; + memcpy(&word, aligned_mark, 8); +#ifdef WORDS_BIGENDIAN + word = __builtin_bswap64(word); +#endif + return word; +} + +static inline size_t count_zero_bytes(uint64_t bytes) { + return bytes ? (__builtin_ctz(bytes) / 8) : sizeof(bytes); +} + +static size_t next_mark(uint8_t *mark, size_t limit, uint64_t sweep_mask) { size_t n = 0; - // FIXME: may_alias - for (; (((uintptr_t)mark) & (sizeof(uintptr_t)-1)) && n < limit; n++) - if (sweep_byte(&mark[n], sweep_mask)) - return n; + // If we have a hole, it is likely to be more that 8 granules long. + // Assuming that it's better to make aligned loads, first we align the + // sweep pointer, then we load aligned mark words. 
+ size_t unaligned = ((uintptr_t) mark) & 7; + if (unaligned) { + uint64_t bytes = load_mark_bytes(mark - unaligned) >> (unaligned * 8); + bytes &= sweep_mask; + if (bytes) + return count_zero_bytes(bytes); + n += 8 - unaligned; + } - uintptr_t *mark_word = (uintptr_t*)&mark[n]; - for (; n + sizeof(uintptr_t) <= limit; n += sizeof(uintptr_t), mark_word++) - if (sweep_word(mark_word, sweep_mask)) - break; + for(; n < limit; n += 8) { + uint64_t bytes = load_mark_bytes(mark + n); + bytes &= sweep_mask; + if (bytes) + return n + count_zero_bytes(bytes); + } - for (; n < limit; n++) - if (sweep_byte(&mark[n], sweep_mask)) - return n; return limit; } @@ -632,46 +656,103 @@ static uintptr_t mark_space_next_block(struct mark_space *space) { return block; } +static void finish_block(struct mutator *mut) { + mut->block = mut->alloc = mut->sweep = 0; +} + +static int next_block(struct mutator *mut) { + ASSERT(mut->sweep == 0); + uintptr_t block = mark_space_next_block(heap_mark_space(mutator_heap(mut))); + if (block == 0) + return 0; + + mut->alloc = mut->sweep = mut->block = block; + return 1; +} + // Sweep some heap to reclaim free space, resetting mut->alloc and // mut->sweep. Return the size of the hole in granules. -static size_t next_hole(struct mutator *mut, size_t clear_size) { +static size_t next_hole_in_block(struct mutator *mut) { uintptr_t sweep = mut->sweep; - uintptr_t limit = align_up(sweep, BLOCK_SIZE); + if (sweep == 0) + return 0; + uintptr_t limit = mut->block + BLOCK_SIZE; uintptr_t sweep_mask = heap_mark_space(mutator_heap(mut))->sweep_mask; - while (1) { - if (sweep == limit) { - sweep = mark_space_next_block(heap_mark_space(mutator_heap(mut))); - if (sweep == 0) { - mut->alloc = mut->sweep = 0; - return 0; + while (sweep != limit) { + ASSERT((sweep & (GRANULE_SIZE - 1)) == 0); + uint8_t* metadata = object_metadata_byte((struct gcobj*)sweep); + size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; + + // Except for when we first get a block, mut->sweep is positioned + // right after a hole, which can point to either the end of the + // block or to a live object. Assume that a live object is more + // common. + { + size_t live_granules = 0; + while (limit_granules && (metadata[0] & sweep_mask)) { + // Object survived collection; skip over it and continue sweeping. + size_t object_granules = mark_space_live_object_granules(metadata); + live_granules += object_granules; + limit_granules -= object_granules; + metadata += object_granules; } - limit = sweep + BLOCK_SIZE; + if (!limit_granules) + break; + sweep += live_granules * GRANULE_SIZE; } - ASSERT((sweep & (GRANULE_SIZE - 1)) == 0); - uint8_t* mark = object_metadata_byte((struct gcobj*)sweep); - size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; - size_t free_granules = next_mark(mark, limit_granules, sweep_mask); - if (free_granules) { - ASSERT(free_granules <= limit_granules); - size_t free_bytes = free_granules * GRANULE_SIZE; - if (free_granules >= clear_size) - clear_memory(sweep, free_bytes); - mut->alloc = sweep; - mut->sweep = sweep + free_bytes; - return free_granules; - } - // Object survived collection; skip over it and continue sweeping. 
- ASSERT((*mark) & sweep_mask); - sweep += mark_space_live_object_granules(mark) * GRANULE_SIZE; + size_t free_granules = next_mark(metadata, limit_granules, sweep_mask); + ASSERT(free_granules); + ASSERT(free_granules <= limit_granules); + size_t free_bytes = free_granules * GRANULE_SIZE; + mut->alloc = sweep; + mut->sweep = sweep + free_bytes; + return free_granules; } + + finish_block(mut); + return 0; +} + +static void finish_hole(struct mutator *mut) { + size_t granules = (mut->sweep - mut->alloc) / GRANULE_SIZE; + if (granules) { + uint8_t *metadata = object_metadata_byte((void*)mut->alloc); + memset(metadata, 0, granules); + mut->alloc = mut->sweep; + } + // FIXME: add to fragmentation +} + +static size_t next_hole(struct mutator *mut) { + finish_hole(mut); + while (1) { + size_t granules = next_hole_in_block(mut); + if (granules) + return granules; + if (!next_block(mut)) + return 0; + } +} + +static void finish_sweeping_in_block(struct mutator *mut) { + while (next_hole_in_block(mut)) + finish_hole(mut); } // Another thread is triggering GC. Before we stop, finish clearing the // dead mark bytes for the mutator's block, and release the block. static void finish_sweeping(struct mutator *mut) { - while (next_hole(mut, -1)) {} + while (next_hole(mut)) + finish_hole(mut); +} + +static void out_of_memory(struct mutator *mut) { + struct heap *heap = mutator_heap(mut); + fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", + heap->size, heap_mark_space(heap)->nslabs); + abort(); } static void* allocate_large(struct mutator *mut, enum alloc_kind kind, @@ -686,10 +767,8 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, if (!heap_steal_pages(heap, npages)) { collect(mut); - if (!heap_steal_pages(heap, npages)) { - fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); - abort(); - } + if (!heap_steal_pages(heap, npages)) + out_of_memory(mut); } void *ret = large_object_space_alloc(space, npages); @@ -713,14 +792,15 @@ static void* allocate_small_slow(struct mutator *mut, enum alloc_kind kind, size_t granules) { int swept_from_beginning = 0; while (1) { - size_t hole = next_hole(mut, granules); - if (hole >= granules) + size_t hole = next_hole(mut); + if (hole >= granules) { + clear_memory(mut->alloc, hole * GRANULE_SIZE); break; + } if (!hole) { struct heap *heap = mutator_heap(mut); if (swept_from_beginning) { - fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); - abort(); + out_of_memory(mut); } else { heap_lock(heap); if (mutators_are_stopping(heap)) @@ -756,6 +836,8 @@ static inline void* allocate_small(struct mutator *mut, enum alloc_kind kind, metadata[0] = METADATA_BYTE_YOUNG | METADATA_BYTE_END; } else { metadata[0] = METADATA_BYTE_YOUNG; + if (granules > 2) + memset(metadata + 1, 0, granules - 2); metadata[granules - 1] = METADATA_BYTE_END; } return obj; @@ -921,5 +1003,6 @@ static inline void print_start_gc_stats(struct heap *heap) { static inline void print_end_gc_stats(struct heap *heap) { printf("Completed %ld collections\n", heap->count); - printf("Heap size with overhead is %zd\n", heap->size); + printf("Heap size with overhead is %zd (%zu slabs)\n", + heap->size, heap_mark_space(heap)->nslabs); } From 7461b2d5c3ecd2580e964b6cff060c81dc546b67 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 6 May 2022 15:08:24 +0200 Subject: [PATCH 080/403] Be more permissive with heap multiplier Also if there's an error, print the right argument --- mt-gcbench.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/mt-gcbench.c b/mt-gcbench.c index aa9177b98..80c802988 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -321,8 +321,8 @@ int main(int argc, char *argv[]) { double multiplier = atof(argv[1]); size_t nthreads = atol(argv[2]); - if (!(1.0 < multiplier && multiplier < 100)) { - fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]); + if (!(0.1 < multiplier && multiplier < 100)) { + fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[1]); return 1; } if (nthreads < 1 || nthreads > MAX_THREAD_COUNT) { From 3bc81b1654fe925b55cb81f1605f57ba336dbdc9 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 9 May 2022 21:46:27 +0200 Subject: [PATCH 081/403] Collect per-block statistics This will let us compute fragmentation. --- mark-sweep.h | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index d19528d2d..aa9acedc5 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -110,11 +110,19 @@ STATIC_ASSERT_EQ(sizeof(struct slab_header), HEADER_BYTES_PER_SLAB); struct block_summary { union { struct { - uint16_t wasted_granules; - uint16_t wasted_spans; + // Counters related to previous collection: how many holes there + // were, and how much space they had. + uint16_t hole_count; + uint16_t free_granules; + // Counters related to allocation since previous collection: + // wasted space due to fragmentation. + uint16_t holes_with_fragmentation; + uint16_t fragmentation_granules; + // Status bytes. uint8_t out_for_thread; uint8_t has_pin; uint8_t paged_out; + uint8_t recycled; }; uint8_t padding[SUMMARY_BYTES_PER_BLOCK]; }; @@ -157,8 +165,7 @@ static uint8_t *object_remset_byte(void *obj) { return (uint8_t*) (base + remset_byte); } -static struct block_summary* object_block_summary(void *obj) { - uintptr_t addr = (uintptr_t) obj; +static struct block_summary* block_summary_for_addr(uintptr_t addr) { uintptr_t base = addr & ~(SLAB_SIZE - 1); uintptr_t block = (addr & (SLAB_SIZE - 1)) / BLOCK_SIZE; return (struct block_summary*) (base + block * sizeof(struct block_summary)); @@ -661,12 +668,18 @@ static void finish_block(struct mutator *mut) { } static int next_block(struct mutator *mut) { - ASSERT(mut->sweep == 0); + ASSERT(mut->block == 0); uintptr_t block = mark_space_next_block(heap_mark_space(mutator_heap(mut))); if (block == 0) return 0; - mut->alloc = mut->sweep = mut->block = block; + struct block_summary *summary = block_summary_for_addr(block); + summary->hole_count = 0; + summary->free_granules = 0; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + + mut->block = block; return 1; } @@ -705,6 +718,11 @@ static size_t next_hole_in_block(struct mutator *mut) { size_t free_granules = next_mark(metadata, limit_granules, sweep_mask); ASSERT(free_granules); ASSERT(free_granules <= limit_granules); + + struct block_summary *summary = block_summary_for_addr(sweep); + summary->hole_count++; + summary->free_granules += free_granules; + size_t free_bytes = free_granules * GRANULE_SIZE; mut->alloc = sweep; mut->sweep = sweep + free_bytes; @@ -718,6 +736,9 @@ static size_t next_hole_in_block(struct mutator *mut) { static void finish_hole(struct mutator *mut) { size_t granules = (mut->sweep - mut->alloc) / GRANULE_SIZE; if (granules) { + struct block_summary *summary = block_summary_for_addr(mut->block); + summary->holes_with_fragmentation++; + summary->fragmentation_granules += granules; uint8_t *metadata = object_metadata_byte((void*)mut->alloc); memset(metadata, 0, 
granules); mut->alloc = mut->sweep; @@ -733,6 +754,16 @@ static size_t next_hole(struct mutator *mut) { return granules; if (!next_block(mut)) return 0; + struct block_summary *summary = block_summary_for_addr(mut->block); + if (!summary->recycled) { + summary->hole_count++; + summary->free_granules = GRANULES_PER_BLOCK; + mut->alloc = mut->block; + mut->sweep = mut->block + BLOCK_SIZE; + summary->recycled = 1; + return GRANULES_PER_BLOCK; + } + mut->alloc = mut->sweep = mut->block; } } From fa3b7bd1b3e270061fa33f589cb3a55894818093 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 9 May 2022 22:03:50 +0200 Subject: [PATCH 082/403] Add global yield and fragmentation computation --- mark-sweep.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/mark-sweep.h b/mark-sweep.h index aa9acedc5..e8735e6d4 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -207,6 +207,8 @@ struct mark_space { uintptr_t next_block; struct slab *slabs; size_t nslabs; + uintptr_t granules_freed_by_last_collection; + uintptr_t fragmentation_granules_since_last_collection; }; struct heap { @@ -547,6 +549,11 @@ static void rotate_mark_bytes(struct mark_space *space) { space->sweep_mask = broadcast_byte(space->live_mask); } +static void reset_statistics(struct mark_space *space) { + space->granules_freed_by_last_collection = 0; + space->fragmentation_granules_since_last_collection = 0; +} + static void collect(struct mutator *mut) { struct heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); @@ -558,6 +565,11 @@ static void collect(struct mutator *mut) { mark_controlling_mutator_roots(mut); finish_sweeping(mut); wait_for_mutators_to_stop(heap); + double yield = space->granules_freed_by_last_collection * GRANULE_SIZE; + double fragmentation = space->fragmentation_granules_since_last_collection * GRANULE_SIZE; + yield /= SLAB_SIZE * space->nslabs; + fragmentation /= SLAB_SIZE * space->nslabs; + fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); mark_inactive_mutators(heap); mark_global_roots(heap); tracer_trace(heap); @@ -565,6 +577,7 @@ static void collect(struct mutator *mut) { reset_sweeper(space); rotate_mark_bytes(space); heap->count++; + reset_statistics(space); large_object_space_finish_gc(lospace); heap_reset_stolen_pages(heap, lospace->live_pages_at_last_collection); allow_mutators_to_continue(heap); @@ -664,6 +677,14 @@ static uintptr_t mark_space_next_block(struct mark_space *space) { } static void finish_block(struct mutator *mut) { + ASSERT(mut->block); + struct block_summary *block = block_summary_for_addr(mut->block); + struct mark_space *space = heap_mark_space(mutator_heap(mut)); + atomic_fetch_add(&space->granules_freed_by_last_collection, + block->free_granules); + atomic_fetch_add(&space->fragmentation_granules_since_last_collection, + block->fragmentation_granules); + mut->block = mut->alloc = mut->sweep = 0; } From 7ac0b5bb4b621f598b412f8fcf0f5fd0c4d4255b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 11 May 2022 21:19:26 +0200 Subject: [PATCH 083/403] More precise heap size control No longer clamped to 4 MB boundaries. Not important in production but very important for comparing against other collectors. 
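In other words, the mark space is still backed by whole 4 MB slabs, but the trailing blocks of the last slab are withheld so that the usable heap tracks the requested size at block granularity. A minimal sketch of that arithmetic, editorial rather than part of the patch, with an illustrative helper name and the slab/block sizes passed in as parameters:

    #include <stddef.h>

    // Sketch: how many trailing blocks of the last slab to withhold so
    // that the usable heap matches the requested size to block
    // granularity instead of slab granularity.
    static size_t excess_blocks_to_withhold(size_t requested_bytes,
                                            size_t nslabs,
                                            size_t usable_bytes_per_slab,
                                            size_t block_size) {
      size_t usable = nslabs * usable_bytes_per_slab;
      if (usable <= requested_bytes)
        return 0;
      return (usable - requested_bytes) / block_size;
    }

The diff below expresses the same idea by walking the last slab's block summaries backwards and marking the excess blocks unavailable.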
--- mark-sweep.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/mark-sweep.h b/mark-sweep.h index e8735e6d4..98a3b30e1 100644 --- a/mark-sweep.h +++ b/mark-sweep.h @@ -122,7 +122,8 @@ struct block_summary { uint8_t out_for_thread; uint8_t has_pin; uint8_t paged_out; - uint8_t recycled; + uint8_t needs_sweep; + uint8_t unavailable; }; uint8_t padding[SUMMARY_BYTES_PER_BLOCK]; }; @@ -773,15 +774,18 @@ static size_t next_hole(struct mutator *mut) { size_t granules = next_hole_in_block(mut); if (granules) return granules; - if (!next_block(mut)) - return 0; - struct block_summary *summary = block_summary_for_addr(mut->block); - if (!summary->recycled) { + struct block_summary *summary; + do { + if (!next_block(mut)) + return 0; + summary = block_summary_for_addr(mut->block); + } while (summary->unavailable); + if (!summary->needs_sweep) { summary->hole_count++; summary->free_granules = GRANULES_PER_BLOCK; mut->alloc = mut->block; mut->sweep = mut->block + BLOCK_SIZE; - summary->recycled = 1; + summary->needs_sweep = 1; return GRANULES_PER_BLOCK; } mut->alloc = mut->sweep = mut->block; @@ -967,6 +971,14 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { space->low_addr = (uintptr_t) slabs; space->extent = size; reset_sweeper(space); + for (size_t block = BLOCKS_PER_SLAB - 1; + block >= META_BLOCKS_PER_SLAB; + block--) { + if (size < heap->size) + break; + space->slabs[nslabs-1].summaries[block].unavailable = 1; + size -= BLOCK_SIZE; + } return 1; } From c39e26159dd1933ca1b4459f716966c0aef9b6df Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 11 May 2022 22:25:09 +0200 Subject: [PATCH 084/403] Some README updates --- README.md | 106 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 93 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index c8b104856..e787cd462 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,90 @@ -# GC workbench +# Whippet Garbage Collector -This repository is a workbench for implementing different GCs. It's a -scratch space. +This repository is for development of Whippet, a new garbage collector +implementation, eventually for use in [Guile +Scheme](https://gnu.org/s/guile). + +## Design + +Whippet is a mark-region collector, like +[Immix](http://users.cecs.anu.edu.au/~steveb/pubs/papers/immix-pldi-2008.pdf). +See also the lovely detailed [Rust +implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf). + +To a first approximation, Whippet is a whole-heap Immix collector. See +the Immix paper for full details, but basically Immix divides the heap +into 32kB blocks, and then divides those blocks into 128B lines. An +Immix allocation never spans blocks; allocations larger than 8kB go into +a separate large object space. Mutators request blocks from the global +store and allocate into those blocks using bump-pointer allocation. +When all blocks are consumed, Immix stops the world and traces the +object graph, marking objects but also the lines that objects are on. +After marking, blocks contain some lines with live objects and others +that are completely free. Spans of free lines are called holes. When a +mutator gets a recycled block from the global block store, it allocates +into those holes. Also, sometimes Immix can choose to evacuate rather +than mark. Bump-pointer-into-holes allocation is quite compatible with +conservative roots, so it's an interesting option for Guile, which has a +lot of legacy C API users. 
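Bump-pointer-into-holes allocation is simple enough to sketch in a few lines. This fragment is editorial, not part of the README diff; the names are illustrative, and the refill callback stands in for whatever sweeping or collection logic locates the next hole:

    #include <stddef.h>
    #include <stdint.h>

    // Sketch: allocate by bumping a pointer within the current hole.
    // `alloc` is the next free byte and `sweep` is the end of the hole.
    struct hole { uintptr_t alloc, sweep; };

    static void* bump_allocate(struct hole *h, size_t bytes,
                               void* (*refill)(struct hole*, size_t)) {
      uintptr_t new_alloc = h->alloc + bytes;
      if (new_alloc <= h->sweep) {
        void *ret = (void*)h->alloc;
        h->alloc = new_alloc;
        return ret;
      }
      return refill(h, bytes);  // slow path: find the next hole, or collect
    }

This mirrors the fast path of `allocate_small` in this repository's collector, where `mut->alloc` and `mut->sweep` play the two roles.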
+ +The essential difference of Whippet from Immix stems from a simple +observation: Immix needs a side table of line mark bytes and also a mark +bit or bits in each object (or in a side table). But if instead you +choose to store mark bytes instead of bits (for concurrency reasons) in +a side table, with one mark byte per granule (unit of allocation, +perhaps 16 bytes), then you effectively have a line mark table where the +granule size is the line size. You can bump-pointer allocate into holes +in the mark byte table. + +You might think this is a bad tradeoff, and perhaps it is: I don't know +yet. If your granule size is two pointers, then one mark byte per +granule is 6.25% overhead on 64-bit, or 12.5% on 32-bit. Especially on +32-bit, it's a lot! On the other hand, you don't need GC bits in the +object itself, and you get a number of other benefits from the mark byte +table -- you can also stuff other per-object data there, such as pinning +bits, nursery and remset bits, multiple mark colors for concurrent +marking, and you can also use the mark byte (which is now a metadata +byte) to record the object end, so that finding holes in a block can +just read the mark table and can avoid looking at object memory. + +Other ideas in whippet: + + * Minimize stop-the-world phase via parallel marking and punting all + sweeping to mutators + + * Enable mutator parallelism via lock-free block acquisition and lazy + statistics collation + + * Allocate block space using aligned 4 MB slabs, with embedded metadata + to allow metadata bytes, slab headers, and block metadata to be + located via address arithmetic + + * Facilitate conservative collection via mark byte array, oracle for + "does this address start an object" + + * Enable in-place generational collection via nursery bit in metadata + byte for new allocations, remset bit for objects that should be + traced for nursery roots, and a card table with one entry per 256B or + so; but write barrier and generational trace not yet implemented + + * Enable concurrent marking by having three mark bit states (dead, + survivor, marked) that rotate at each collection, and sweeping a + block clears metadata for dead objects; but concurrent marking and + associated SATB barrier not yet implemented ## What's there -There's just the (modified) GCBench, which is an old but standard -benchmark that allocates different sizes of binary trees. It takes a -heap of 25 MB or so, not very large, and causes somewhere between 20 and -50 collections, running in 100 to 500 milliseconds on 2022 machines. - -Then there are currently three collectors: +There are currently three collectors: - `bdw.h`: The external BDW-GC conservative parallel stop-the-world mark-sweep segregated-fits collector with lazy sweeping. - `semi.h`: Semispace copying collector. - - `mark-sweep.h`: Stop-the-world mark-sweep segregated-fits collector - with lazy sweeping. Two different marking algorithms: + - `mark-sweep.h`: The whippet collector. Two different marking algorithms: single-threaded and parallel. The two latter collectors reserve one word per object on the header, -which makes them collect more frequently than `bdw` because the `Node` -data type takes 32 bytes instead of 24 bytes. +which might make them collect more frequently than `bdw` because the +`Node` data type takes 32 bytes instead of 24 bytes. These collectors are sketches and exercises for improving Guile's garbage collector. Guile currently uses BDW-GC. 
In Guile if we have an @@ -34,6 +97,16 @@ more efficient sweeping, for mark-sweep), to allow the application to know what kind an object is, to allow the GC to find references within the object, to allow the GC to compute the object's size, and so on. +There's just the (modified) GCBench, which is an old but standard +benchmark that allocates different sizes of binary trees. As parameters +it takes a heap multiplier and a number of mutator threads. We +analytically compute the peak amount of live data and then size the GC +heap as a multiplier of that size. It has a peak heap consumption of 10 +MB or so per mutator thread: not very large. At a 2x heap multiplier, +it causes about 30 collections for the whippet collector, and runs +somewhere around 200-400 milliseconds in single-threaded mode, on the +machines I have in 2022. + The GCBench benchmark is small but then again many Guile processes also are quite short-lived, so perhaps it is useful to ensure that small heaps remain lightweight. @@ -47,6 +120,7 @@ chooses to do so. We assume that object references within a heap object can be precisely identified. (The current BDW-GC scans for references conservatively even on the heap.) +A generationa A likely good solution for Guile would be an [Immix collector](https://www.cs.utexas.edu/users/speedway/DaCapo/papers/immix-pldi-2008.pdf) with conservative roots, and a parallel stop-the-world mark/evacuate @@ -80,6 +154,12 @@ majority of use cases. - [ ] Implement generational GC with semispace nursery and Immix old generation. +## About the name + +It sounds better than WIP (work-in-progress) garbage collector, doesn't +it? Also apparently a whippet is a kind of dog that is fast for its +size. It would be nice if whippet-gc turns out to have this property. + ## License gcbench.c, MT_GCBench.c, and MT_GCBench2.c are from From 69d7ff83ddc729179d24541def278d7b1075e5a1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 11 May 2022 22:29:37 +0200 Subject: [PATCH 085/403] More wording --- README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e787cd462..c00cba9bd 100644 --- a/README.md +++ b/README.md @@ -39,13 +39,16 @@ in the mark byte table. You might think this is a bad tradeoff, and perhaps it is: I don't know yet. If your granule size is two pointers, then one mark byte per granule is 6.25% overhead on 64-bit, or 12.5% on 32-bit. Especially on -32-bit, it's a lot! On the other hand, you don't need GC bits in the -object itself, and you get a number of other benefits from the mark byte -table -- you can also stuff other per-object data there, such as pinning -bits, nursery and remset bits, multiple mark colors for concurrent -marking, and you can also use the mark byte (which is now a metadata -byte) to record the object end, so that finding holes in a block can -just read the mark table and can avoid looking at object memory. +32-bit, it's a lot! On the other hand, instead of the worst case of one +survivor object wasting a line (or two, in the case of conservative line +marking), granule-size-is-line-size instead wastes nothing. 
Also, you +don't need GC bits in the object itself, and you get a number of other +benefits from the mark byte table -- you can also stuff other per-object +data there, such as pinning bits, nursery and remset bits, multiple mark +colors for concurrent marking, and you can also use the mark byte (which +is now a metadata byte) to record the object end, so that finding holes +in a block can just read the mark table and can avoid looking at object +memory. Other ideas in whippet: From 061d92d125d89edbcf5d3a1342c236eab25fa3c3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 15 May 2022 22:06:41 +0200 Subject: [PATCH 086/403] Update README --- README.md | 173 +++++++++++++++++++++++++++--------------------------- 1 file changed, 88 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index c00cba9bd..21fd96c7a 100644 --- a/README.md +++ b/README.md @@ -6,26 +6,26 @@ Scheme](https://gnu.org/s/guile). ## Design -Whippet is a mark-region collector, like +Whippet is mainly a mark-region collector, like [Immix](http://users.cecs.anu.edu.au/~steveb/pubs/papers/immix-pldi-2008.pdf). See also the lovely detailed [Rust implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf). -To a first approximation, Whippet is a whole-heap Immix collector. See -the Immix paper for full details, but basically Immix divides the heap -into 32kB blocks, and then divides those blocks into 128B lines. An -Immix allocation never spans blocks; allocations larger than 8kB go into -a separate large object space. Mutators request blocks from the global -store and allocate into those blocks using bump-pointer allocation. -When all blocks are consumed, Immix stops the world and traces the -object graph, marking objects but also the lines that objects are on. -After marking, blocks contain some lines with live objects and others -that are completely free. Spans of free lines are called holes. When a -mutator gets a recycled block from the global block store, it allocates -into those holes. Also, sometimes Immix can choose to evacuate rather -than mark. Bump-pointer-into-holes allocation is quite compatible with -conservative roots, so it's an interesting option for Guile, which has a -lot of legacy C API users. +To a first approximation, Whippet is a whole-heap Immix collector with a +large object space on the side. See the Immix paper for full details, +but basically Immix divides the heap into 32kB blocks, and then divides +those blocks into 128B lines. An Immix allocation never spans blocks; +allocations larger than 8kB go into a separate large object space. +Mutators request blocks from the global store and allocate into those +blocks using bump-pointer allocation. When all blocks are consumed, +Immix stops the world and traces the object graph, marking objects but +also the lines that objects are on. After marking, blocks contain some +lines with live objects and others that are completely free. Spans of +free lines are called holes. When a mutator gets a recycled block from +the global block store, it allocates into those holes. Also, sometimes +Immix can choose to evacuate rather than mark. Bump-pointer-into-holes +allocation is quite compatible with conservative roots, so it's an +interesting option for Guile, which has a lot of legacy C API users. 
The essential difference of Whippet from Immix stems from a simple observation: Immix needs a side table of line mark bytes and also a mark @@ -50,7 +50,7 @@ is now a metadata byte) to record the object end, so that finding holes in a block can just read the mark table and can avoid looking at object memory. -Other ideas in whippet: +Other ideas in Whippet: * Minimize stop-the-world phase via parallel marking and punting all sweeping to mutators @@ -77,85 +77,88 @@ Other ideas in whippet: ## What's there -There are currently three collectors: +This repository is a workspace for Whippet implementation. As such, it +has files implementing Whippet itself. It also has some benchmarks to +use in optimizing Whippet: + + - [`mt-gcbench.c`](./mt-gcbench.c): The multi-threaded [GCBench + benchmark](https://hboehm.info/gc/gc_bench.html). An old but + standard benchmark that allocates different sizes of binary trees. + As parameters it takes a heap multiplier and a number of mutator + threads. We analytically compute the peak amount of live data and + then size the GC heap as a multiplier of that size. It has a peak + heap consumption of 10 MB or so per mutator thread: not very large. + At a 2x heap multiplier, it causes about 30 collections for the + whippet collector, and runs somewhere around 200-400 milliseconds in + single-threaded mode, on the machines I have in 2022. For low thread + counts, the GCBench benchmark is small; but then again many Guile + processes also are quite short-lived, so perhaps it is useful to + ensure that small heaps remain lightweight. + + - [`quads.c`](./quads.c): A synthetic benchmark that allocates quad + trees. The mutator begins by allocating one long-lived tree of depth + N, and then allocates 13% of the heap in depth-3 trees, 20 times, + simulating a fixed working set and otherwise an allocation-heavy + workload. By observing the times to allocate 13% of the heap in + garbage we can infer mutator overheads, and also note the variance + for the cycles in which GC hits. + +The repository has two other collector implementations, to appropriately +situate Whippet's performance in context: - `bdw.h`: The external BDW-GC conservative parallel stop-the-world mark-sweep segregated-fits collector with lazy sweeping. - `semi.h`: Semispace copying collector. - - `mark-sweep.h`: The whippet collector. Two different marking algorithms: - single-threaded and parallel. + - `mark-sweep.h`: The whippet collector. Two different marking + implementations: single-threaded and parallel. -The two latter collectors reserve one word per object on the header, -which might make them collect more frequently than `bdw` because the -`Node` data type takes 32 bytes instead of 24 bytes. +## Guile -These collectors are sketches and exercises for improving Guile's -garbage collector. Guile currently uses BDW-GC. In Guile if we have an -object reference we generally have to be able to know what kind of -object it is, because there are few global invariants enforced by -typing. Therefore it is reasonable to consider allowing the GC and the -application to share the first word of an object, for example to maybe -store a mark bit (though an on-the-side mark byte seems to allow much -more efficient sweeping, for mark-sweep), to allow the application to -know what kind an object is, to allow the GC to find references within -the object, to allow the GC to compute the object's size, and so on. +If the Whippet collector works out, it could replace Guile's garbage +collector. Guile currently uses BDW-GC. 
Guile has a widely used C API +and implements part of its run-time in C. For this reason it may be +infeasible to require precise enumeration of GC roots -- we may need to +allow GC roots to be conservatively identified from data sections and +from stacks. Such conservative roots would be pinned, but other objects +can be moved by the collector if it chooses to do so. We assume that +object references within a heap object can be precisely identified. +(However, Guile currently uses BDW-GC in its default configuration, +which scans for references conservatively even on the heap.) -There's just the (modified) GCBench, which is an old but standard -benchmark that allocates different sizes of binary trees. As parameters -it takes a heap multiplier and a number of mutator threads. We -analytically compute the peak amount of live data and then size the GC -heap as a multiplier of that size. It has a peak heap consumption of 10 -MB or so per mutator thread: not very large. At a 2x heap multiplier, -it causes about 30 collections for the whippet collector, and runs -somewhere around 200-400 milliseconds in single-threaded mode, on the -machines I have in 2022. +The existing C API allows direct access to mutable object fields, +without the mediation of read or write barriers. Therefore it may be +impossible to switch to collector strategies that need barriers, such as +generational or concurrent collectors. However, we shouldn't write off +this possibility entirely; an ideal replacement for Guile's GC will +offer the possibility of migration to other GC designs without imposing +new requirements on C API users in the initial phase. -The GCBench benchmark is small but then again many Guile processes also -are quite short-lived, so perhaps it is useful to ensure that small -heaps remain lightweight. - -Guile has a widely used C API and implements part of its run-time in C. -For this reason it may be infeasible to require precise enumeration of -GC roots -- we may need to allow GC roots to be conservatively -identified from data sections and from stacks. Such conservative roots -would be pinned, but other objects can be moved by the collector if it -chooses to do so. We assume that object references within a heap object -can be precisely identified. (The current BDW-GC scans for references -conservatively even on the heap.) - -A generationa -A likely good solution for Guile would be an [Immix -collector](https://www.cs.utexas.edu/users/speedway/DaCapo/papers/immix-pldi-2008.pdf) -with conservative roots, and a parallel stop-the-world mark/evacuate -phase. We would probably follow the [Rust -implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf), -more or less, with support for per-line pinning. In an ideal world we -would work out some kind of generational solution as well, either via a -semispace nursery or via sticky mark bits, but this requires Guile to -use a write barrier -- something that's possible to do within Guile -itself but it's unclear if we can extend this obligation to users of -Guile's C API. - -In any case, these experiments also have the goal of identifying a -smallish GC abstraction in Guile, so that we might consider evolving GC -implementation in the future without too much pain. If we switch away -from BDW-GC, we should be able to evaluate that it's a win for a large -majority of use cases. 
+In this regard, the Whippet experiment also has the goal of identifying +a smallish GC abstraction in Guile, so that we might consider evolving +GC implementation in the future without too much pain. If we switch +away from BDW-GC, we should be able to evaluate that it's a win for a +large majority of use cases. ## To do - - [X] Implement a parallel marker for the mark-sweep collector. - - [X] Adapt all GC implementations to allow multiple mutator threads. - Update gcbench.c. - - [ ] Implement precise non-moving Immix whole-heap collector. - - [ ] Add evacuation to Immix whole-heap collector. - - [ ] Add parallelism to Immix stop-the-world phase. - - [ ] Implement conservative root-finding for the mark-sweep collector. - - [ ] Implement conservative root-finding and pinning for Immix. - - [ ] Implement generational GC with semispace nursery and mark-sweep - old generation. - - [ ] Implement generational GC with semispace nursery and Immix - old generation. +### Missing features before Guile can use Whippet + + - [ ] Pinning + - [ ] Conservative stacks + - [ ] Conservative data segments + - [ ] Heap growth/shrinking + - [ ] Debugging/tracing + - [ ] Finalizers + - [ ] Weak references / weak maps + +### Features that would improve Whippet performance + + - [ ] Immix-style opportunistic evacuation + - [ ] Overflow allocation + - [ ] Lazy identification of empty blocks + - [ ] Generational GC via sticky mark bits + - [ ] Generational GC with semi-space nursery + - [ ] Concurrent marking with SATB barrier ## About the name From 7d80d45c79a04d488673c8bdab91c796fe79bbb1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 May 2022 21:47:13 +0200 Subject: [PATCH 087/403] Rename mark-sweep.h to whippet.h --- Makefile | 10 +++++----- README.md | 2 +- gc.h | 10 +++++----- mark-sweep.h => whippet.h | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) rename mark-sweep.h => whippet.h (99%) diff --git a/Makefile b/Makefile index 4e0946319..e01421e2d 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ TESTS=quads mt-gcbench # MT_GCBench MT_GCBench2 -COLLECTORS=bdw semi mark-sweep parallel-mark-sweep +COLLECTORS=bdw semi whippet parallel-whippet CC=gcc CFLAGS=-Wall -O2 -g -fno-strict-aliasing -Wno-unused -DNDEBUG @@ -17,11 +17,11 @@ bdw-%: bdw.h conservative-roots.h %-types.h %.c semi-%: semi.h precise-roots.h large-object-space.h %-types.h heap-objects.h %.c $(COMPILE) -DGC_SEMI -o $@ $*.c -mark-sweep-%: mark-sweep.h precise-roots.h large-object-space.h serial-tracer.h assert.h debug.h %-types.h heap-objects.h %.c - $(COMPILE) -DGC_MARK_SWEEP -o $@ $*.c +whippet-%: whippet.h precise-roots.h large-object-space.h serial-tracer.h assert.h debug.h %-types.h heap-objects.h %.c + $(COMPILE) -DGC_WHIPPET -o $@ $*.c -parallel-mark-sweep-%: mark-sweep.h precise-roots.h large-object-space.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL_MARK_SWEEP -o $@ $*.c +parallel-whippet-%: whippet.h precise-roots.h large-object-space.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c + $(COMPILE) -DGC_PARALLEL_WHIPPET -o $@ $*.c check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/README.md b/README.md index 21fd96c7a..2732566be 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ situate Whippet's performance in context: - `bdw.h`: The external BDW-GC conservative parallel stop-the-world mark-sweep segregated-fits collector with lazy sweeping. - `semi.h`: Semispace copying collector. - - `mark-sweep.h`: The whippet collector. 
Two different marking + - `whippet.h`: The whippet collector. Two different marking implementations: single-threaded and parallel. ## Guile diff --git a/gc.h b/gc.h index 2c0c59de0..5d6268d9f 100644 --- a/gc.h +++ b/gc.h @@ -5,11 +5,11 @@ #include "bdw.h" #elif defined(GC_SEMI) #include "semi.h" -#elif defined(GC_MARK_SWEEP) -#include "mark-sweep.h" -#elif defined(GC_PARALLEL_MARK_SWEEP) -#define GC_PARALLEL_MARK 1 -#include "mark-sweep.h" +#elif defined(GC_WHIPPET) +#include "whippet.h" +#elif defined(GC_PARALLEL_WHIPPET) +#define GC_PARALLEL_TRACE 1 +#include "whippet.h" #else #error unknown gc #endif diff --git a/mark-sweep.h b/whippet.h similarity index 99% rename from mark-sweep.h rename to whippet.h index 98a3b30e1..48b1a53ee 100644 --- a/mark-sweep.h +++ b/whippet.h @@ -11,7 +11,7 @@ #include "inline.h" #include "large-object-space.h" #include "precise-roots.h" -#ifdef GC_PARALLEL_MARK +#ifdef GC_PARALLEL_TRACE #include "parallel-tracer.h" #else #include "serial-tracer.h" From 8f06b914b0231d947afc733adf458bb31c6a18ee Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 19 May 2022 22:05:09 +0200 Subject: [PATCH 088/403] Refactor to allow "next" pointer embedded in block summary --- whippet.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/whippet.h b/whippet.h index 48b1a53ee..61e0b1f79 100644 --- a/whippet.h +++ b/whippet.h @@ -107,9 +107,34 @@ struct slab_header { }; STATIC_ASSERT_EQ(sizeof(struct slab_header), HEADER_BYTES_PER_SLAB); +// Sometimes we want to put a block on a singly-linked list. For that +// there's a pointer reserved in the block summary. But because the +// pointer is aligned (32kB on 32-bit, 64kB on 64-bit), we can portably +// hide up to 15 flags in the low bits. These flags can be accessed +// non-atomically by the mutator when it owns a block; otherwise they +// need to be accessed atomically. +enum block_summary_flag { + BLOCK_OUT_FOR_THREAD = 0x1, + BLOCK_HAS_PIN = 0x2, + BLOCK_PAGED_OUT = 0x4, + BLOCK_NEEDS_SWEEP = 0x8, + BLOCK_UNAVAILABLE = 0x10, + BLOCK_FLAG_UNUSED_5 = 0x20, + BLOCK_FLAG_UNUSED_6 = 0x40, + BLOCK_FLAG_UNUSED_7 = 0x80, + BLOCK_FLAG_UNUSED_8 = 0x100, + BLOCK_FLAG_UNUSED_9 = 0x200, + BLOCK_FLAG_UNUSED_10 = 0x400, + BLOCK_FLAG_UNUSED_11 = 0x800, + BLOCK_FLAG_UNUSED_12 = 0x1000, + BLOCK_FLAG_UNUSED_13 = 0x2000, + BLOCK_FLAG_UNUSED_14 = 0x4000, +}; + struct block_summary { union { struct { + //struct block *next; // Counters related to previous collection: how many holes there // were, and how much space they had. uint16_t hole_count; @@ -118,12 +143,12 @@ struct block_summary { // wasted space due to fragmentation. uint16_t holes_with_fragmentation; uint16_t fragmentation_granules; - // Status bytes. - uint8_t out_for_thread; - uint8_t has_pin; - uint8_t paged_out; - uint8_t needs_sweep; - uint8_t unavailable; + // After a block is swept, if it's empty it goes on the empties + // list. Otherwise if it's not immediately used by a mutator (as + // is usually the case), it goes on the swept list. Both of these + // lists use this field. But as the next element in the field is + // block-aligned, we stash flags in the low bits. 
+ uintptr_t next_and_flags; }; uint8_t padding[SUMMARY_BYTES_PER_BLOCK]; }; @@ -172,6 +197,22 @@ static struct block_summary* block_summary_for_addr(uintptr_t addr) { return (struct block_summary*) (base + block * sizeof(struct block_summary)); } +static uintptr_t block_summary_has_flag(struct block_summary *summary, + enum block_summary_flag flag) { + return summary->next_and_flags & flag; +} +static void block_summary_set_flag(struct block_summary *summary, + enum block_summary_flag flag) { + summary->next_and_flags |= flag; +} +static void block_summary_clear_flag(struct block_summary *summary, + enum block_summary_flag flag) { + summary->next_and_flags &= ~(uintptr_t)flag; +} +static struct block* block_summary_next(struct block_summary *summary) { + return (struct block*) (summary->next_and_flags & ~(BLOCK_SIZE - 1)); +} + static uintptr_t align_up(uintptr_t addr, size_t align) { return (addr + align - 1) & ~(align-1); } @@ -779,13 +820,13 @@ static size_t next_hole(struct mutator *mut) { if (!next_block(mut)) return 0; summary = block_summary_for_addr(mut->block); - } while (summary->unavailable); - if (!summary->needs_sweep) { + } while (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); + if (!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) { summary->hole_count++; summary->free_granules = GRANULES_PER_BLOCK; mut->alloc = mut->block; mut->sweep = mut->block + BLOCK_SIZE; - summary->needs_sweep = 1; + block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); return GRANULES_PER_BLOCK; } mut->alloc = mut->sweep = mut->block; @@ -976,7 +1017,8 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { block--) { if (size < heap->size) break; - space->slabs[nslabs-1].summaries[block].unavailable = 1; + block_summary_set_flag(&space->slabs[nslabs-1].summaries[block], + BLOCK_UNAVAILABLE); size -= BLOCK_SIZE; } return 1; From 71b656bca4cdba83dc1872e9c8fa1b5a6253023f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 20 May 2022 22:09:20 +0200 Subject: [PATCH 089/403] When sweeping, return empty blocks to global freelist This will facilitate management of overhead for defragmentation as well as blocks to unmap, for compensating large object allocations. 
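For compensating large-object allocations, "unmap" here concretely means keeping the block's mapping but discarding its pages, so the address range stays reserved while the memory goes back to the OS. A minimal sketch of that call, editorial and with an illustrative function name; the next patch in the series issues the same madvise when it pushes a block onto its unavailable list:

    #include <stddef.h>
    #include <sys/mman.h>

    // Sketch: an "unavailable" block keeps its address-space reservation,
    // but its pages are returned to the OS.  For anonymous mappings,
    // MADV_DONTNEED discards the pages; they fault back in as zero-filled
    // memory if the block is reused later.
    static void discard_block_pages(void *block, size_t block_size) {
      madvise(block, block_size, MADV_DONTNEED);
    }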
--- whippet.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/whippet.h b/whippet.h index 61e0b1f79..42d4a62ef 100644 --- a/whippet.h +++ b/whippet.h @@ -209,8 +209,36 @@ static void block_summary_clear_flag(struct block_summary *summary, enum block_summary_flag flag) { summary->next_and_flags &= ~(uintptr_t)flag; } -static struct block* block_summary_next(struct block_summary *summary) { - return (struct block*) (summary->next_and_flags & ~(BLOCK_SIZE - 1)); +static uintptr_t block_summary_next(struct block_summary *summary) { + return summary->next_and_flags & ~(BLOCK_SIZE - 1); +} +static void block_summary_set_next(struct block_summary *summary, + uintptr_t next) { + ASSERT((next & ~(BLOCK_SIZE - 1)) == 0); + summary->next_and_flags = + (summary->next_and_flags & (BLOCK_SIZE - 1)) | next; +} + +static void push_block(uintptr_t *loc, uintptr_t block) { + struct block_summary *summary = block_summary_for_addr(block); + uintptr_t next = atomic_load_explicit(loc, memory_order_acquire); + do { + block_summary_set_next(summary, next); + } while (!atomic_compare_exchange_weak(loc, &next, block)); +} + +static uintptr_t pop_block(uintptr_t *loc) { + uintptr_t head = atomic_load_explicit(loc, memory_order_acquire); + struct block_summary *summary; + uintptr_t next; + do { + if (!head) + return 0; + summary = block_summary_for_addr(head); + next = block_summary_next(summary); + } while (!atomic_compare_exchange_weak(loc, &head, next)); + block_summary_set_next(summary, 0); + return head; } static uintptr_t align_up(uintptr_t addr, size_t align) { @@ -246,11 +274,12 @@ struct mark_space { uintptr_t low_addr; size_t extent; size_t heap_size; - uintptr_t next_block; + uintptr_t next_block; // atomically + uintptr_t empty_blocks; // atomically struct slab *slabs; size_t nslabs; - uintptr_t granules_freed_by_last_collection; - uintptr_t fragmentation_granules_since_last_collection; + uintptr_t granules_freed_by_last_collection; // atomically + uintptr_t fragmentation_granules_since_last_collection; // atomically }; struct heap { @@ -703,7 +732,7 @@ static uintptr_t mark_space_next_block(struct mark_space *space) { uintptr_t next_block; do { if (block == 0) - return 0; + return pop_block(&space->empty_blocks); next_block = block + BLOCK_SIZE; if (next_block % SLAB_SIZE == 0) { @@ -809,12 +838,30 @@ static void finish_hole(struct mutator *mut) { // FIXME: add to fragmentation } +static void return_empty_block(struct mutator *mut) { + ASSERT(mut->block); + struct mark_space *space = heap_mark_space(mutator_heap(mut)); + uintptr_t block = mut->block; + struct block_summary *summary = block_summary_for_addr(block); + block_summary_clear_flag(summary, BLOCK_NEEDS_SWEEP); + push_block(&space->empty_blocks, block); + mut->alloc = mut->sweep = mut->block = 0; +} + static size_t next_hole(struct mutator *mut) { finish_hole(mut); + // As we sweep if we find that a block is empty, we return it to the + // empties list. Empties are precious. But if we return 10 blocks in + // a row, and still find an 11th empty, go ahead and use it. 
+ size_t empties_countdown = 10; while (1) { size_t granules = next_hole_in_block(mut); - if (granules) - return granules; + if (granules) { + if (granules == GRANULES_PER_BLOCK && empties_countdown--) + return_empty_block(mut); + else + return granules; + } struct block_summary *summary; do { if (!next_block(mut)) @@ -822,11 +869,11 @@ static size_t next_hole(struct mutator *mut) { summary = block_summary_for_addr(mut->block); } while (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); if (!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) { + block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); summary->hole_count++; summary->free_granules = GRANULES_PER_BLOCK; mut->alloc = mut->block; mut->sweep = mut->block + BLOCK_SIZE; - block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); return GRANULES_PER_BLOCK; } mut->alloc = mut->sweep = mut->block; From 33a3af2c73a66f79c6bb071c49224b7a0367ea86 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 21 May 2022 21:29:21 +0200 Subject: [PATCH 090/403] Large object space properly acquires blocks from mark space If the mutator finds completely empty blocks, it puts them on the side. The large object space acquires empty blocks, sweeping if needed, and causes them to be unmapped, possibly causing GC. --- whippet.h | 259 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 194 insertions(+), 65 deletions(-) diff --git a/whippet.h b/whippet.h index 42d4a62ef..f365bd60f 100644 --- a/whippet.h +++ b/whippet.h @@ -214,20 +214,21 @@ static uintptr_t block_summary_next(struct block_summary *summary) { } static void block_summary_set_next(struct block_summary *summary, uintptr_t next) { - ASSERT((next & ~(BLOCK_SIZE - 1)) == 0); + ASSERT((next & (BLOCK_SIZE - 1)) == 0); summary->next_and_flags = (summary->next_and_flags & (BLOCK_SIZE - 1)) | next; } -static void push_block(uintptr_t *loc, uintptr_t block) { +static void push_block(uintptr_t *loc, size_t *count, uintptr_t block) { struct block_summary *summary = block_summary_for_addr(block); uintptr_t next = atomic_load_explicit(loc, memory_order_acquire); do { block_summary_set_next(summary, next); } while (!atomic_compare_exchange_weak(loc, &next, block)); + atomic_fetch_add_explicit(count, 1, memory_order_acq_rel); } -static uintptr_t pop_block(uintptr_t *loc) { +static uintptr_t pop_block(uintptr_t *loc, size_t *count) { uintptr_t head = atomic_load_explicit(loc, memory_order_acquire); struct block_summary *summary; uintptr_t next; @@ -238,6 +239,7 @@ static uintptr_t pop_block(uintptr_t *loc) { next = block_summary_next(summary); } while (!atomic_compare_exchange_weak(loc, &head, next)); block_summary_set_next(summary, 0); + atomic_fetch_sub_explicit(count, 1, memory_order_acq_rel); return head; } @@ -276,6 +278,10 @@ struct mark_space { size_t heap_size; uintptr_t next_block; // atomically uintptr_t empty_blocks; // atomically + size_t empty_blocks_count; // atomically + uintptr_t unavailable_blocks; // atomically + size_t unavailable_blocks_count; // atomically + ssize_t pending_unavailable_bytes; // atomically struct slab *slabs; size_t nslabs; uintptr_t granules_freed_by_last_collection; // atomically @@ -285,6 +291,7 @@ struct mark_space { struct heap { struct mark_space mark_space; struct large_object_space large_object_space; + size_t large_object_pages; pthread_mutex_t lock; pthread_cond_t collector_cond; pthread_cond_t mutator_cond; @@ -443,13 +450,90 @@ static void allow_mutators_to_continue(struct heap *heap) { pthread_cond_broadcast(&heap->mutator_cond); } -static int 
heap_steal_pages(struct heap *heap, size_t npages) { - // FIXME: When we have a block-structured mark space, actually return - // pages to the OS, and limit to the current heap size. - return 1; +static void push_unavailable_block(struct mark_space *space, uintptr_t block) { + struct block_summary *summary = block_summary_for_addr(block); + ASSERT(!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)); + ASSERT(!block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); + block_summary_set_flag(summary, BLOCK_UNAVAILABLE); + madvise((void*)block, BLOCK_SIZE, MADV_DONTNEED); + push_block(&space->unavailable_blocks, &space->unavailable_blocks_count, + block); } -static void heap_reset_stolen_pages(struct heap *heap, size_t npages) { - // FIXME: Possibly reclaim blocks from the reclaimed set. + +static uintptr_t pop_unavailable_block(struct mark_space *space) { + uintptr_t block = pop_block(&space->unavailable_blocks, + &space->unavailable_blocks_count); + if (!block) + return 0; + struct block_summary *summary = block_summary_for_addr(block); + ASSERT(block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); + block_summary_clear_flag(summary, BLOCK_UNAVAILABLE); + return block; +} + +static uintptr_t pop_empty_block(struct mark_space *space) { + return pop_block(&space->empty_blocks, &space->empty_blocks_count); +} + +static void push_empty_block(struct mark_space *space, uintptr_t block) { + ASSERT(!block_summary_has_flag(block_summary_for_addr(block), + BLOCK_NEEDS_SWEEP)); + push_block(&space->empty_blocks, &space->empty_blocks_count, block); +} + +static ssize_t mark_space_request_release_memory(struct mark_space *space, + size_t bytes) { + return atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes; +} + +static void mark_space_reacquire_memory(struct mark_space *space, + size_t bytes) { + ssize_t pending = + atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; + while (pending + BLOCK_SIZE <= 0) { + uintptr_t block = pop_unavailable_block(space); + ASSERT(block); + push_empty_block(space, block); + pending += BLOCK_SIZE; + } +} + +static size_t next_hole(struct mutator *mut); + +static int sweep_until_memory_released(struct mutator *mut) { + struct mark_space *space = heap_mark_space(mutator_heap(mut)); + ssize_t pending = atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire); + // First try to unmap previously-identified empty blocks. If pending + // > 0 and other mutators happen to identify empty blocks, they will + // be unmapped directly and moved to the unavailable list. + while (pending > 0) { + uintptr_t block = pop_empty_block(space); + if (!block) + break; + push_unavailable_block(space, block); + pending = atomic_fetch_sub(&space->pending_unavailable_bytes, BLOCK_SIZE); + pending -= BLOCK_SIZE; + } + // Otherwise, sweep, transitioning any empty blocks to unavailable and + // throwing away any non-empty block. A bit wasteful but hastening + // the next collection is a reasonable thing to do here. 
+ while (pending > 0) { + if (!next_hole(mut)) + return 0; + pending = atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire); + } + return pending <= 0; +} + +static void heap_reset_large_object_pages(struct heap *heap, size_t npages) { + size_t previous = heap->large_object_pages; + heap->large_object_pages = npages; + ASSERT(npages <= previous); + size_t bytes = (previous - npages) << + heap_large_object_space(heap)->page_size_log2; + mark_space_reacquire_memory(heap_mark_space(heap), bytes); } static void mutator_mark_buf_grow(struct mutator_mark_buf *buf) { @@ -650,7 +734,7 @@ static void collect(struct mutator *mut) { heap->count++; reset_statistics(space); large_object_space_finish_gc(lospace); - heap_reset_stolen_pages(heap, lospace->live_pages_at_last_collection); + heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); allow_mutators_to_continue(heap); DEBUG("collect done\n"); } @@ -726,13 +810,13 @@ static size_t next_mark(uint8_t *mark, size_t limit, uint64_t sweep_mask) { return limit; } -static uintptr_t mark_space_next_block(struct mark_space *space) { +static uintptr_t mark_space_next_block_to_sweep(struct mark_space *space) { uintptr_t block = atomic_load_explicit(&space->next_block, memory_order_acquire); uintptr_t next_block; do { if (block == 0) - return pop_block(&space->empty_blocks); + return 0; next_block = block + BLOCK_SIZE; if (next_block % SLAB_SIZE == 0) { @@ -759,22 +843,6 @@ static void finish_block(struct mutator *mut) { mut->block = mut->alloc = mut->sweep = 0; } -static int next_block(struct mutator *mut) { - ASSERT(mut->block == 0); - uintptr_t block = mark_space_next_block(heap_mark_space(mutator_heap(mut))); - if (block == 0) - return 0; - - struct block_summary *summary = block_summary_for_addr(block); - summary->hole_count = 0; - summary->free_granules = 0; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; - - mut->block = block; - return 1; -} - // Sweep some heap to reclaim free space, resetting mut->alloc and // mut->sweep. Return the size of the hole in granules. static size_t next_hole_in_block(struct mutator *mut) { @@ -838,14 +906,19 @@ static void finish_hole(struct mutator *mut) { // FIXME: add to fragmentation } -static void return_empty_block(struct mutator *mut) { +static int maybe_release_swept_empty_block(struct mutator *mut) { ASSERT(mut->block); struct mark_space *space = heap_mark_space(mutator_heap(mut)); uintptr_t block = mut->block; - struct block_summary *summary = block_summary_for_addr(block); - block_summary_clear_flag(summary, BLOCK_NEEDS_SWEEP); - push_block(&space->empty_blocks, block); + if (atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire) <= 0) + return 0; + + block_summary_clear_flag(block_summary_for_addr(block), BLOCK_NEEDS_SWEEP); + push_unavailable_block(space, block); + atomic_fetch_sub(&space->pending_unavailable_bytes, BLOCK_SIZE); mut->alloc = mut->sweep = mut->block = 0; + return 1; } static size_t next_hole(struct mutator *mut) { @@ -854,29 +927,79 @@ static size_t next_hole(struct mutator *mut) { // empties list. Empties are precious. But if we return 10 blocks in // a row, and still find an 11th empty, go ahead and use it. size_t empties_countdown = 10; + struct mark_space *space = heap_mark_space(mutator_heap(mut)); while (1) { + // Sweep current block for a hole. 
size_t granules = next_hole_in_block(mut); if (granules) { - if (granules == GRANULES_PER_BLOCK && empties_countdown--) - return_empty_block(mut); - else + // If the hole spans only part of a block, give it to the mutator. + if (granules <= GRANULES_PER_BLOCK) return granules; + // Sweeping found a completely empty block. If we have pending + // pages to release to the OS, we should unmap this block. + if (maybe_release_swept_empty_block(mut)) + continue; + // Otherwise if we've already returned lots of empty blocks to the + // freelist, give this block to the mutator. + if (!empties_countdown) + return granules; + // Otherwise we push to the empty blocks list. + struct block_summary *summary = block_summary_for_addr(mut->block); + block_summary_clear_flag(summary, BLOCK_NEEDS_SWEEP); + push_empty_block(space, mut->block); + mut->alloc = mut->sweep = mut->block = 0; + empties_countdown--; } - struct block_summary *summary; - do { - if (!next_block(mut)) - return 0; - summary = block_summary_for_addr(mut->block); - } while (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); - if (!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) { - block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); - summary->hole_count++; - summary->free_granules = GRANULES_PER_BLOCK; - mut->alloc = mut->block; - mut->sweep = mut->block + BLOCK_SIZE; - return GRANULES_PER_BLOCK; + ASSERT(mut->block == 0); + while (1) { + uintptr_t block = mark_space_next_block_to_sweep(space); + if (block) { + // Sweeping found a block. We might take it for allocation, or + // we might send it back. + struct block_summary *summary = block_summary_for_addr(block); + // If it's marked unavailable, it's already on a list of + // unavailable blocks, so skip and get the next block. + if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) + continue; + if (block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) { + // This block was marked in the last GC and needs sweeping. + // As we sweep we'll want to record how many bytes were live + // at the last collection. As we allocate we'll record how + // many granules were wasted because of fragmentation. + summary->hole_count = 0; + summary->free_granules = 0; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + // Prepare to sweep the block for holes. + mut->alloc = mut->sweep = mut->block = block; + break; + } else { + // Otherwise this block is completely empty and is on the + // empties list. We take from the empties list only after all + // the NEEDS_SWEEP blocks are processed. + continue; + } + } else { + // We are done sweeping for blocks. Now take from the empties + // list. + block = pop_empty_block(space); + // No empty block? Return 0 to cause collection. + if (!block) + return 0; + + // Otherwise return the block to the mutator. 
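/* Illustrative sketch (not part of the patch): the order in which the
 * reworked next_hole looks for memory.  The booleans stand in for the real
 * checks on the shared sweep cursor and the empties list. */
enum sketch_source { SKETCH_HOLE_IN_BLOCK, SKETCH_BLOCK_TO_SWEEP,
                     SKETCH_EMPTY_BLOCK, SKETCH_MUST_COLLECT };

static enum sketch_source sketch_where_next(int hole_granules,
                                            int have_block_to_sweep,
                                            int have_empty_block) {
  if (hole_granules)         /* space found while sweeping the current block */
    return SKETCH_HOLE_IN_BLOCK;
  if (have_block_to_sweep)   /* advance the shared sweep cursor */
    return SKETCH_BLOCK_TO_SWEEP;
  if (have_empty_block)      /* empties are tapped only once sweeping is done */
    return SKETCH_EMPTY_BLOCK;
  return SKETCH_MUST_COLLECT; /* nothing left: the caller must collect */
}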
+ struct block_summary *summary = block_summary_for_addr(block); + block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); + summary->hole_count = 1; + summary->free_granules = GRANULES_PER_BLOCK; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + mut->block = block; + mut->alloc = block; + mut->sweep = block + BLOCK_SIZE; + return GRANULES_PER_BLOCK; + } } - mut->alloc = mut->sweep = mut->block; } } @@ -907,20 +1030,24 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, size_t size = granules * GRANULE_SIZE; size_t npages = large_object_space_npages(space, size); - heap_lock(heap); - - if (!heap_steal_pages(heap, npages)) { - collect(mut); - if (!heap_steal_pages(heap, npages)) + mark_space_request_release_memory(heap_mark_space(heap), + npages << space->page_size_log2); + if (!sweep_until_memory_released(mut)) { + heap_lock(heap); + if (mutators_are_stopping(heap)) + pause_mutator_for_collection_with_lock(mut); + else + collect(mut); + heap_unlock(heap); + if (!sweep_until_memory_released(mut)) out_of_memory(mut); } + atomic_fetch_add(&heap->large_object_pages, npages); void *ret = large_object_space_alloc(space, npages); if (!ret) ret = large_object_space_obtain_and_alloc(space, npages); - heap_unlock(heap); - if (!ret) { perror("weird: we have the space but mmap didn't work"); abort(); @@ -1058,15 +1185,17 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { space->nslabs = nslabs; space->low_addr = (uintptr_t) slabs; space->extent = size; - reset_sweeper(space); - for (size_t block = BLOCKS_PER_SLAB - 1; - block >= META_BLOCKS_PER_SLAB; - block--) { - if (size < heap->size) - break; - block_summary_set_flag(&space->slabs[nslabs-1].summaries[block], - BLOCK_UNAVAILABLE); - size -= BLOCK_SIZE; + space->next_block = 0; + for (size_t slab = 0; slab < nslabs; slab++) { + for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { + uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; + if (size > heap->size) { + push_unavailable_block(space, addr); + size -= BLOCK_SIZE; + } else { + push_empty_block(space, addr); + } + } } return 1; } From 157d40466b116edff151d5fb2bd3ccee70a3f7f5 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 25 May 2022 14:36:27 +0200 Subject: [PATCH 091/403] mark_space_reacquire_memory updates pending_unavailable_bytes --- whippet.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/whippet.h b/whippet.h index f365bd60f..f62e81c3b 100644 --- a/whippet.h +++ b/whippet.h @@ -494,7 +494,8 @@ static void mark_space_reacquire_memory(struct mark_space *space, uintptr_t block = pop_unavailable_block(space); ASSERT(block); push_empty_block(space, block); - pending += BLOCK_SIZE; + pending = atomic_fetch_add(&space->pending_unavailable_bytes, BLOCK_SIZE) + + BLOCK_SIZE; } } From bc73c5ad02a9b2506e1392982f8598d328d33852 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 26 May 2022 08:37:02 +0200 Subject: [PATCH 092/403] Whitespace fix --- whippet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whippet.h b/whippet.h index f62e81c3b..1f72430d6 100644 --- a/whippet.h +++ b/whippet.h @@ -987,7 +987,7 @@ static size_t next_hole(struct mutator *mut) { // No empty block? Return 0 to cause collection. if (!block) return 0; - + // Otherwise return the block to the mutator. 
struct block_summary *summary = block_summary_for_addr(block); block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); From 808d365f4bf2fa12383fa3a2044d0510f09d9e29 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 26 May 2022 08:49:46 +0200 Subject: [PATCH 093/403] We identify empty blocks lazily now --- README.md | 1 - whippet.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 2732566be..3bda79f9f 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,6 @@ large majority of use cases. - [ ] Immix-style opportunistic evacuation - [ ] Overflow allocation - - [ ] Lazy identification of empty blocks - [ ] Generational GC via sticky mark bits - [ ] Generational GC with semi-space nursery - [ ] Concurrent marking with SATB barrier diff --git a/whippet.h b/whippet.h index 1f72430d6..3c1fb070b 100644 --- a/whippet.h +++ b/whippet.h @@ -934,7 +934,7 @@ static size_t next_hole(struct mutator *mut) { size_t granules = next_hole_in_block(mut); if (granules) { // If the hole spans only part of a block, give it to the mutator. - if (granules <= GRANULES_PER_BLOCK) + if (granules < GRANULES_PER_BLOCK) return granules; // Sweeping found a completely empty block. If we have pending // pages to release to the OS, we should unmap this block. From 52166fe286fb365d0000e87cd79cc38916f7714a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 4 Jun 2022 21:54:49 +0200 Subject: [PATCH 094/403] Add gc_edge data structure Less casting in user programs, and it's a step on the way to evacuation in whippet. --- gc-types.h | 26 ++++++++++++++++++++++++++ gc.h | 2 ++ heap-objects.h | 3 ++- mt-gcbench.c | 8 ++++---- parallel-tracer.h | 13 ++++++------- quads.c | 4 ++-- semi.h | 20 ++++++++++---------- serial-tracer.h | 14 +++++++------- whippet.h | 39 ++++++++++++++++++++++----------------- 9 files changed, 81 insertions(+), 48 deletions(-) create mode 100644 gc-types.h diff --git a/gc-types.h b/gc-types.h new file mode 100644 index 000000000..4779cbd2c --- /dev/null +++ b/gc-types.h @@ -0,0 +1,26 @@ +#ifndef GC_TYPES_H_ +#define GC_TYPES_H_ + +struct gc_edge { + union { + void *addr; + void **loc; + }; +}; + +static inline struct gc_edge gc_edge(void* addr) { + struct gc_edge edge; + edge.addr = addr; + return edge; +} +static inline struct gc_edge object_field(void* addr) { + return gc_edge(addr); +} +static inline void* dereference_edge(struct gc_edge edge) { + return *edge.loc; +} +static inline void update_edge(struct gc_edge edge, void *value) { + *edge.loc = value; +} + +#endif // GC_TYPES_H_ diff --git a/gc.h b/gc.h index 5d6268d9f..2f6240122 100644 --- a/gc.h +++ b/gc.h @@ -1,6 +1,8 @@ #ifndef GC_H_ #define GC_H_ +#include "gc-types.h" + #if defined(GC_BDW) #include "bdw.h" #elif defined(GC_SEMI) diff --git a/heap-objects.h b/heap-objects.h index db78e7b66..44e282cc1 100644 --- a/heap-objects.h +++ b/heap-objects.h @@ -2,6 +2,7 @@ #define HEAP_OBJECTS_H #include "inline.h" +#include "gc-types.h" #define DECLARE_NODE_TYPE(name, Name, NAME) \ struct Name; \ @@ -18,7 +19,7 @@ enum alloc_kind { #define DEFINE_METHODS(name, Name, NAME) \ static inline size_t name##_size(Name *obj) ALWAYS_INLINE; \ static inline void visit_##name##_fields(Name *obj,\ - void (*visit)(void **loc, void *visit_data), \ + void (*visit)(struct gc_edge edge, void *visit_data), \ void *visit_data) ALWAYS_INLINE; FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) #undef DEFINE_METHODS diff --git a/mt-gcbench.c b/mt-gcbench.c index 80c802988..0dd9ef6f4 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -77,14 
+77,14 @@ static inline size_t double_array_size(DoubleArray *array) { } static inline void visit_node_fields(Node *node, - void (*visit)(void **loc, void *visit_data), + void (*visit)(struct gc_edge edge, void *visit_data), void *visit_data) { - visit((void**)&node->left, visit_data); - visit((void**)&node->right, visit_data); + visit(object_field(&node->left), visit_data); + visit(object_field(&node->right), visit_data); } static inline void visit_double_array_fields(DoubleArray *obj, - void (*visit)(void **loc, void *visit_data), + void (*visit)(struct gc_edge edge, void *visit_data), void *visit_data) { } diff --git a/parallel-tracer.h b/parallel-tracer.h index f96e93754..297e2dde8 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -452,10 +452,10 @@ static void tracer_release(struct heap *heap) { } struct gcobj; -static inline void tracer_visit(void **loc, void *trace_data) ALWAYS_INLINE; +static inline void tracer_visit(struct gc_edge edge, void *trace_data) ALWAYS_INLINE; static inline void trace_one(struct gcobj *obj, void *trace_data) ALWAYS_INLINE; -static inline int trace_object(struct heap *heap, - struct gcobj *obj) ALWAYS_INLINE; +static inline int trace_edge(struct heap *heap, + struct gc_edge edge) ALWAYS_INLINE; static inline void tracer_share(struct local_tracer *trace) { @@ -465,13 +465,12 @@ tracer_share(struct local_tracer *trace) { } static inline void -tracer_visit(void **loc, void *trace_data) { +tracer_visit(struct gc_edge edge, void *trace_data) { struct local_tracer *trace = trace_data; - struct gcobj *obj = *loc; - if (obj && trace_object(trace->heap, obj)) { + if (trace_edge(trace->heap, edge)) { if (local_trace_queue_full(&trace->local)) tracer_share(trace); - local_trace_queue_push(&trace->local, obj); + local_trace_queue_push(&trace->local, dereference_edge(edge)); } } diff --git a/quads.c b/quads.c index 0f7e01857..0ba9ea3f4 100644 --- a/quads.c +++ b/quads.c @@ -15,10 +15,10 @@ static inline size_t quad_size(Quad *obj) { } static inline void visit_quad_fields(Quad *quad, - void (*visit)(void **loc, void *visit_data), + void (*visit)(struct gc_edge edge, void *visit_data), void *visit_data) { for (size_t i = 0; i < 4; i++) - visit((void**)&quad->kids[i], visit_data); + visit(object_field(&quad->kids[i]), visit_data); } typedef HANDLE_TO(Quad) QuadHandle; diff --git a/semi.h b/semi.h index 3b58376e5..6ed67de8b 100644 --- a/semi.h +++ b/semi.h @@ -57,7 +57,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) { static void collect(struct mutator *mut) NEVER_INLINE; static void collect_for_alloc(struct mutator *mut, size_t bytes) NEVER_INLINE; -static void visit(void **loc, void *visit_data); +static void visit(struct gc_edge edge, void *visit_data); static int semi_space_steal_pages(struct semi_space *space, size_t npages) { size_t stolen_pages = space->stolen_pages + npages; @@ -143,13 +143,13 @@ static void* forward(struct semi_space *space, void *obj) { } static void visit_semi_space(struct heap *heap, struct semi_space *space, - void **loc, void *obj) { - *loc = forward(space, obj); + struct gc_edge edge, void *obj) { + update_edge(edge, forward(space, obj)); } static void visit_large_object_space(struct heap *heap, struct large_object_space *space, - void **loc, void *obj) { + void *obj) { if (large_object_space_copy(space, (uintptr_t)obj)) scan(heap, (uintptr_t)obj); } @@ -158,15 +158,15 @@ static int semi_space_contains(struct semi_space *space, void *obj) { return (((uintptr_t)obj) - space->base) < space->size; } -static void 
visit(void **loc, void *visit_data) { +static void visit(struct gc_edge edge, void *visit_data) { struct heap *heap = visit_data; - void *obj = *loc; + void *obj = dereference_edge(edge); if (obj == NULL) return; - if (semi_space_contains(heap_semi_space(heap), obj)) - visit_semi_space(heap, heap_semi_space(heap), loc, obj); + else if (semi_space_contains(heap_semi_space(heap), obj)) + visit_semi_space(heap, heap_semi_space(heap), edge, obj); else if (large_object_space_contains(heap_large_object_space(heap), obj)) - visit_large_object_space(heap, heap_large_object_space(heap), loc, obj); + visit_large_object_space(heap, heap_large_object_space(heap), obj); else abort(); } @@ -180,7 +180,7 @@ static void collect(struct mutator *mut) { flip(semi); uintptr_t grey = semi->hp; for (struct handle *h = mut->roots; h; h = h->next) - visit(&h->v, heap); + visit(gc_edge(&h->v), heap); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); while(grey < semi->hp) grey = scan(heap, grey); diff --git a/serial-tracer.h b/serial-tracer.h index 7bea9e63e..6b861a471 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -6,6 +6,7 @@ #include "assert.h" #include "debug.h" +#include "gc-types.h" struct gcobj; @@ -137,10 +138,10 @@ static void tracer_release(struct heap *heap) { } struct gcobj; -static inline void tracer_visit(void **loc, void *trace_data) ALWAYS_INLINE; +static inline void tracer_visit(struct gc_edge edge, void *trace_data) ALWAYS_INLINE; static inline void trace_one(struct gcobj *obj, void *trace_data) ALWAYS_INLINE; -static inline int trace_object(struct heap *heap, - struct gcobj *obj) ALWAYS_INLINE; +static inline int trace_edge(struct heap *heap, + struct gc_edge edge) ALWAYS_INLINE; static inline void tracer_enqueue_root(struct tracer *tracer, struct gcobj *obj) { @@ -152,11 +153,10 @@ tracer_enqueue_roots(struct tracer *tracer, struct gcobj **objs, trace_queue_push_many(&tracer->queue, objs, count); } static inline void -tracer_visit(void **loc, void *trace_data) { +tracer_visit(struct gc_edge edge, void *trace_data) { struct heap *heap = trace_data; - struct gcobj *obj = *loc; - if (obj && trace_object(heap, obj)) - tracer_enqueue_root(heap_tracer(heap), obj); + if (trace_edge(heap, edge)) + tracer_enqueue_root(heap_tracer(heap), dereference_edge(edge)); } static inline void tracer_trace(struct heap *heap) { diff --git a/whippet.h b/whippet.h index 3c1fb070b..f124437fc 100644 --- a/whippet.h +++ b/whippet.h @@ -350,8 +350,9 @@ static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { return object_metadata_byte(obj); } -static inline int mark_space_trace_object(struct mark_space *space, - struct gcobj *obj) { +static inline int mark_space_mark_object(struct mark_space *space, + struct gc_edge edge) { + struct gcobj *obj = dereference_edge(edge); uint8_t *loc = object_metadata_byte(obj); uint8_t byte = *loc; if (byte & space->marked_mask) @@ -368,16 +369,20 @@ static inline int mark_space_contains(struct mark_space *space, return addr - space->low_addr < space->extent; } -static inline int large_object_space_trace_object(struct large_object_space *space, - struct gcobj *obj) { +static inline int large_object_space_mark_object(struct large_object_space *space, + struct gcobj *obj) { return large_object_space_copy(space, (uintptr_t)obj); } -static inline int trace_object(struct heap *heap, struct gcobj *obj) { - if (LIKELY(mark_space_contains(heap_mark_space(heap), obj))) - return mark_space_trace_object(heap_mark_space(heap), obj); +static inline int 
trace_edge(struct heap *heap, struct gc_edge edge) { + struct gcobj *obj = dereference_edge(edge); + if (!obj) + return 0; + else if (LIKELY(mark_space_contains(heap_mark_space(heap), obj))) + return mark_space_mark_object(heap_mark_space(heap), edge); else if (large_object_space_contains(heap_large_object_space(heap), obj)) - return large_object_space_trace_object(heap_large_object_space(heap), obj); + return large_object_space_mark_object(heap_large_object_space(heap), + obj); else abort(); } @@ -584,9 +589,9 @@ static void mark_stopping_mutator_roots(struct mutator *mut) { struct heap *heap = mutator_heap(mut); struct mutator_mark_buf *local_roots = &mut->mark_buf; for (struct handle *h = mut->roots; h; h = h->next) { - struct gcobj *root = h->v; - if (root && trace_object(heap, root)) - mutator_mark_buf_push(local_roots, root); + struct gc_edge root = gc_edge(&h->v); + if (trace_edge(heap, root)) + mutator_mark_buf_push(local_roots, dereference_edge(root)); } // Post to global linked-list of thread roots. @@ -602,9 +607,9 @@ static void mark_stopping_mutator_roots(struct mutator *mut) { static void mark_controlling_mutator_roots(struct mutator *mut) { struct heap *heap = mutator_heap(mut); for (struct handle *h = mut->roots; h; h = h->next) { - struct gcobj *root = h->v; - if (root && trace_object(heap, root)) - tracer_enqueue_root(&heap->tracer, root); + struct gc_edge root = gc_edge(&h->v); + if (trace_edge(heap, root)) + tracer_enqueue_root(&heap->tracer, dereference_edge(root)); } } @@ -630,9 +635,9 @@ static void mark_inactive_mutators(struct heap *heap) { static void mark_global_roots(struct heap *heap) { for (struct handle *h = heap->global_roots; h; h = h->next) { - struct gcobj *obj = h->v; - if (obj && trace_object(heap, obj)) - tracer_enqueue_root(&heap->tracer, obj); + struct gc_edge edge = gc_edge(&h->v); + if (trace_edge(heap, edge)) + tracer_enqueue_root(&heap->tracer, dereference_edge(edge)); } struct mutator_mark_buf *roots = atomic_load(&heap->mutator_roots); From e4342f6c4586cda8639f49dd07cb02c1c7bae15e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 6 Jun 2022 16:57:22 +0200 Subject: [PATCH 095/403] Add helper for yielding in a spinlock --- parallel-tracer.h | 14 +++----------- spin.h | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 11 deletions(-) create mode 100644 spin.h diff --git a/parallel-tracer.h b/parallel-tracer.h index 297e2dde8..02f641352 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -9,6 +9,7 @@ #include "assert.h" #include "debug.h" #include "inline.h" +#include "spin.h" // The Chase-Lev work-stealing deque, as initially described in "Dynamic // Circular Work-Stealing Deque" (Chase and Lev, SPAA'05) @@ -530,8 +531,7 @@ trace_worker_check_termination(struct trace_worker *worker, return 1; } - size_t spin_count = 0; - while (1) { + for (size_t spin_count = 0;; spin_count++) { if (trace_worker_can_steal_from_any(worker, tracer)) { atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_relaxed); @@ -544,15 +544,7 @@ trace_worker_check_termination(struct trace_worker *worker, } // spin DEBUG("tracer #%zu: spinning #%zu\n", worker->id, spin_count); - if (spin_count < 10) - __builtin_ia32_pause(); - else if (spin_count < 20) - sched_yield(); - else if (spin_count < 40) - usleep(0); - else - usleep(1); - spin_count++; + yield_for_spin(spin_count); } } diff --git a/spin.h b/spin.h new file mode 100644 index 000000000..d650c3216 --- /dev/null +++ b/spin.h @@ -0,0 +1,18 @@ +#ifndef SPIN_H +#define SPIN_H + +#include 
+#include + +static inline void yield_for_spin(size_t spin_count) { + if (spin_count < 10) + __builtin_ia32_pause(); + else if (spin_count < 20) + sched_yield(); + else if (spin_count < 40) + usleep(0); + else + usleep(1); +} + +#endif // SPIN_H From 7af8bb6bd00b641765bc8b2cb711e837dd5b0cec Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 15 Jul 2022 20:57:12 +0200 Subject: [PATCH 096/403] Add machinery to disable ragged-stop marking We'll need to disable the optimization that mutators mark their own stacks once we support evacuation. --- whippet.h | 125 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 30 deletions(-) diff --git a/whippet.h b/whippet.h index f124437fc..562f62010 100644 --- a/whippet.h +++ b/whippet.h @@ -301,14 +301,13 @@ struct heap { size_t active_mutator_count; size_t mutator_count; struct handle *global_roots; - struct mutator_mark_buf *mutator_roots; + struct mutator *mutator_trace_list; long count; struct mutator *deactivated_mutators; struct tracer tracer; }; struct mutator_mark_buf { - struct mutator_mark_buf *next; size_t size; size_t capacity; struct gcobj **objects; @@ -322,6 +321,10 @@ struct mutator { struct heap *heap; struct handle *roots; struct mutator_mark_buf mark_buf; + // Three uses for this in-object linked-list pointer: + // - inactive (blocked in syscall) mutators + // - grey objects when stopping active mutators for mark-in-place + // - untraced mutators when stopping active mutators for evacuation struct mutator *next; }; @@ -344,7 +347,12 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct mutator *mut) NEVER_INLINE; +enum gc_reason { + GC_REASON_SMALL_ALLOCATION, + GC_REASON_LARGE_ALLOCATION +}; + +static void collect(struct mutator *mut, enum gc_reason reason) NEVER_INLINE; static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { return object_metadata_byte(obj); @@ -583,9 +591,35 @@ static void mutator_mark_buf_destroy(struct mutator_mark_buf *buf) { munmap(buf->objects, bytes); } +static void enqueue_mutator_for_tracing(struct mutator *mut) { + struct heap *heap = mutator_heap(mut); + ASSERT(mut->next == NULL); + struct mutator *next = + atomic_load_explicit(&heap->mutator_trace_list, memory_order_acquire); + do { + mut->next = next; + } while (!atomic_compare_exchange_weak(&heap->mutator_trace_list, + &next, mut)); +} + +static int heap_should_mark_while_stopping(struct heap *heap) { + return 1; +} + +static int mutator_should_mark_while_stopping(struct mutator *mut) { + // If we are marking in place, we allow mutators to mark their own + // stacks before pausing. This is a limited form of concurrent + // marking, as other mutators might be running, not having received + // the signal to stop yet. We can't do this for a compacting + // collection, however, as that would become concurrent evacuation, + // which is a different kettle of fish. + return heap_should_mark_while_stopping(mutator_heap(mut)); +} + // Mark the roots of a mutator that is stopping for GC. We can't // enqueue them directly, so we send them to the controller in a buffer. 
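/* Illustrative sketch (not part of the patch): the lock-free pattern behind
 * enqueue_mutator_for_tracing above -- a compare-and-swap push onto a shared
 * singly linked list head.  The types here are stand-ins. */
#include <stdatomic.h>
#include <stddef.h>

struct sketch_mutator { struct sketch_mutator *next; };

static _Atomic(struct sketch_mutator*) sketch_trace_list;

static void sketch_enqueue(struct sketch_mutator *m) {
  struct sketch_mutator *head =
    atomic_load_explicit(&sketch_trace_list, memory_order_acquire);
  do {
    m->next = head;  /* link to the current head; redone if the CAS fails */
  } while (!atomic_compare_exchange_weak(&sketch_trace_list, &head, m));
}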
static void mark_stopping_mutator_roots(struct mutator *mut) { + ASSERT(mutator_should_mark_while_stopping(mut)); struct heap *heap = mutator_heap(mut); struct mutator_mark_buf *local_roots = &mut->mark_buf; for (struct handle *h = mut->roots; h; h = h->next) { @@ -593,18 +627,10 @@ static void mark_stopping_mutator_roots(struct mutator *mut) { if (trace_edge(heap, root)) mutator_mark_buf_push(local_roots, dereference_edge(root)); } - - // Post to global linked-list of thread roots. - struct mutator_mark_buf *next = - atomic_load_explicit(&heap->mutator_roots, memory_order_acquire); - do { - local_roots->next = next; - } while (!atomic_compare_exchange_weak(&heap->mutator_roots, - &next, local_roots)); } -// Mark the roots of the mutator that causes GC. -static void mark_controlling_mutator_roots(struct mutator *mut) { +// Precondition: the caller holds the heap lock. +static void mark_mutator_roots_with_lock(struct mutator *mut) { struct heap *heap = mutator_heap(mut); for (struct handle *h = mut->roots; h; h = h->next) { struct gc_edge root = gc_edge(&h->v); @@ -613,6 +639,17 @@ static void mark_controlling_mutator_roots(struct mutator *mut) { } } +static void trace_mutator_roots_with_lock(struct mutator *mut) { + mark_mutator_roots_with_lock(mut); +} + +static void trace_mutator_roots_with_lock_before_stop(struct mutator *mut) { + if (mutator_should_mark_while_stopping(mut)) + mark_mutator_roots_with_lock(mut); + else + enqueue_mutator_for_tracing(mut); +} + static void release_stopping_mutator_roots(struct mutator *mut) { mutator_mark_buf_release(&mut->mark_buf); } @@ -626,24 +663,33 @@ static void wait_for_mutators_to_stop(struct heap *heap) { static void finish_sweeping(struct mutator *mut); static void finish_sweeping_in_block(struct mutator *mut); -static void mark_inactive_mutators(struct heap *heap) { +static void trace_mutator_roots_after_stop(struct heap *heap) { + struct mutator *mut = atomic_load(&heap->mutator_trace_list); + int active_mutators_already_marked = heap_should_mark_while_stopping(heap); + while (mut) { + if (active_mutators_already_marked) + tracer_enqueue_roots(&heap->tracer, + mut->mark_buf.objects, mut->mark_buf.size); + else + trace_mutator_roots_with_lock(mut); + struct mutator *next = mut->next; + mut->next = NULL; + mut = next; + } + atomic_store(&heap->mutator_trace_list, NULL); + for (struct mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) { finish_sweeping_in_block(mut); - mark_controlling_mutator_roots(mut); + trace_mutator_roots_with_lock(mut); } } -static void mark_global_roots(struct heap *heap) { +static void trace_global_roots(struct heap *heap) { for (struct handle *h = heap->global_roots; h; h = h->next) { struct gc_edge edge = gc_edge(&h->v); if (trace_edge(heap, edge)) tracer_enqueue_root(&heap->tracer, dereference_edge(edge)); } - - struct mutator_mark_buf *roots = atomic_load(&heap->mutator_roots); - for (; roots; roots = roots->next) - tracer_enqueue_roots(&heap->tracer, roots->objects, roots->size); - atomic_store(&heap->mutator_roots, NULL); } static void pause_mutator_for_collection(struct heap *heap) NEVER_INLINE; @@ -674,7 +720,11 @@ static void pause_mutator_for_collection_with_lock(struct mutator *mut) { struct heap *heap = mutator_heap(mut); ASSERT(mutators_are_stopping(heap)); finish_sweeping_in_block(mut); - mark_controlling_mutator_roots(mut); + if (mutator_should_mark_while_stopping(mut)) + // No need to collect results in mark buf; we can enqueue roots directly. 
+ mark_mutator_roots_with_lock(mut); + else + enqueue_mutator_for_tracing(mut); pause_mutator_for_collection(heap); } @@ -683,7 +733,9 @@ static void pause_mutator_for_collection_without_lock(struct mutator *mut) { struct heap *heap = mutator_heap(mut); ASSERT(mutators_are_stopping(heap)); finish_sweeping(mut); - mark_stopping_mutator_roots(mut); + if (mutator_should_mark_while_stopping(mut)) + mark_stopping_mutator_roots(mut); + enqueue_mutator_for_tracing(mut); heap_lock(heap); pause_mutator_for_collection(heap); heap_unlock(heap); @@ -715,15 +767,28 @@ static void reset_statistics(struct mark_space *space) { space->fragmentation_granules_since_last_collection = 0; } -static void collect(struct mutator *mut) { +static int maybe_grow_heap(struct heap *heap, enum gc_reason reason) { + return 0; +} + +static void determine_collection_kind(struct heap *heap, + enum gc_reason reason) { +} + +static void collect(struct mutator *mut, enum gc_reason reason) { struct heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); + if (maybe_grow_heap(heap, reason)) { + DEBUG("grew heap instead of collecting #%ld:\n", heap->count); + return; + } DEBUG("start collect #%ld:\n", heap->count); + determine_collection_kind(heap, reason); large_object_space_start_gc(lospace); tracer_prepare(heap); request_mutators_to_stop(heap); - mark_controlling_mutator_roots(mut); + trace_mutator_roots_with_lock_before_stop(mut); finish_sweeping(mut); wait_for_mutators_to_stop(heap); double yield = space->granules_freed_by_last_collection * GRANULE_SIZE; @@ -731,8 +796,8 @@ static void collect(struct mutator *mut) { yield /= SLAB_SIZE * space->nslabs; fragmentation /= SLAB_SIZE * space->nslabs; fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); - mark_inactive_mutators(heap); - mark_global_roots(heap); + trace_mutator_roots_after_stop(heap); + trace_global_roots(heap); tracer_trace(heap); tracer_release(heap); reset_sweeper(space); @@ -1043,7 +1108,7 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, if (mutators_are_stopping(heap)) pause_mutator_for_collection_with_lock(mut); else - collect(mut); + collect(mut, GC_REASON_LARGE_ALLOCATION); heap_unlock(heap); if (!sweep_until_memory_released(mut)) out_of_memory(mut); @@ -1059,7 +1124,7 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, abort(); } - *(uintptr_t*)ret = kind; + *(uintptr_t*)ret = tag_live(kind); return ret; } @@ -1083,7 +1148,7 @@ static void* allocate_small_slow(struct mutator *mut, enum alloc_kind kind, if (mutators_are_stopping(heap)) pause_mutator_for_collection_with_lock(mut); else - collect(mut); + collect(mut, GC_REASON_SMALL_ALLOCATION); heap_unlock(heap); swept_from_beginning = 1; } From c998f1cd5cb0de7bafc3bfc090298d70f683eca6 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 20 Jul 2022 11:11:50 +0200 Subject: [PATCH 097/403] Measure fragmentation as fraction of total heap size This allows a relatively more fragmented mark space if the majority of the heap is taken up by lospace. 
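As a rough standalone illustration of the new metric -- the numbers below are
made up, and the variables only mirror the quantities the patch reads:

    /* fragmentation = wasted mark-space granules / granules in the whole
       heap, where the heap now includes the large-object space too. */
    #include <stdio.h>

    int main(void) {
      double fragmentation_granules = 12000.;  /* made-up waste from last cycle */
      double mark_space_granules = 500000.;    /* non-unavailable mark-space blocks */
      double lospace_granules = 250000.;       /* lospace pages, in granule units */
      double heap_granules = mark_space_granules + lospace_granules;
      printf("fragmentation: %.2f%%\n",
             100. * fragmentation_granules / heap_granules);
      return 0;
    }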
--- whippet.h | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/whippet.h b/whippet.h index 562f62010..56eee50ed 100644 --- a/whippet.h +++ b/whippet.h @@ -305,6 +305,8 @@ struct heap { long count; struct mutator *deactivated_mutators; struct tracer tracer; + double fragmentation_low_threshold; + double fragmentation_high_threshold; }; struct mutator_mark_buf { @@ -771,6 +773,24 @@ static int maybe_grow_heap(struct heap *heap, enum gc_reason reason) { return 0; } +static double heap_fragmentation(struct heap *heap) { + struct mark_space *mark_space = heap_mark_space(heap); + size_t mark_space_blocks = mark_space->nslabs * NONMETA_BLOCKS_PER_SLAB; + mark_space_blocks -= atomic_load(&mark_space->unavailable_blocks_count); + size_t mark_space_granules = mark_space_blocks * GRANULES_PER_BLOCK; + size_t fragmentation_granules = + mark_space->fragmentation_granules_since_last_collection; + + struct large_object_space *lospace = heap_large_object_space(heap); + size_t lospace_pages = lospace->total_pages - lospace->free_pages; + size_t lospace_granules = + lospace_pages << (lospace->page_size_log2 - GRANULE_SIZE_LOG_2); + + size_t heap_granules = mark_space_granules + lospace_granules; + + return ((double)fragmentation_granules) / heap_granules; +} + static void determine_collection_kind(struct heap *heap, enum gc_reason reason) { } @@ -792,9 +812,8 @@ static void collect(struct mutator *mut, enum gc_reason reason) { finish_sweeping(mut); wait_for_mutators_to_stop(heap); double yield = space->granules_freed_by_last_collection * GRANULE_SIZE; - double fragmentation = space->fragmentation_granules_since_last_collection * GRANULE_SIZE; + double fragmentation = heap_fragmentation(heap); yield /= SLAB_SIZE * space->nslabs; - fragmentation /= SLAB_SIZE * space->nslabs; fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); trace_mutator_roots_after_stop(heap); trace_global_roots(heap); @@ -1239,6 +1258,23 @@ static struct slab* allocate_slabs(size_t nslabs) { return (struct slab*) aligned_base; } +static int heap_init(struct heap *heap, size_t size) { + // *heap is already initialized to 0. 
+ + pthread_mutex_init(&heap->lock, NULL); + pthread_cond_init(&heap->mutator_cond, NULL); + pthread_cond_init(&heap->collector_cond, NULL); + heap->size = size; + + if (!tracer_init(heap)) + abort(); + + heap->fragmentation_low_threshold = 0.05; + heap->fragmentation_high_threshold = 0.10; + + return 1; +} + static int mark_space_init(struct mark_space *space, struct heap *heap) { size_t size = align_up(heap->size, SLAB_SIZE); size_t nslabs = size / SLAB_SIZE; @@ -1276,12 +1312,7 @@ static int initialize_gc(size_t size, struct heap **heap, *heap = calloc(1, sizeof(struct heap)); if (!*heap) abort(); - pthread_mutex_init(&(*heap)->lock, NULL); - pthread_cond_init(&(*heap)->mutator_cond, NULL); - pthread_cond_init(&(*heap)->collector_cond, NULL); - (*heap)->size = size; - - if (!tracer_init(*heap)) + if (!heap_init(*heap, size)) abort(); struct mark_space *space = heap_mark_space(*heap); From 09d2df162645a1ec98d7ee9b3b66cebada5b7e88 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Jul 2022 14:22:23 +0200 Subject: [PATCH 098/403] Compute GC yield as fraction of total heap size --- large-object-space.h | 5 ++++- whippet.h | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/large-object-space.h b/large-object-space.h index cf5be0b29..c708641fb 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -29,6 +29,7 @@ struct large_object_space { size_t total_pages; size_t free_pages; size_t live_pages_at_last_collection; + size_t pages_freed_by_last_collection; struct address_set from_space; struct address_set to_space; @@ -125,7 +126,9 @@ static void large_object_space_finish_gc(struct large_object_space *space) { address_set_for_each(&space->from_space, large_object_space_reclaim_one, space); address_set_clear(&space->from_space); - space->free_pages = space->total_pages - space->live_pages_at_last_collection; + size_t free_pages = space->total_pages - space->live_pages_at_last_collection; + space->pages_freed_by_last_collection = free_pages - space->free_pages; + space->free_pages = free_pages; pthread_mutex_unlock(&space->lock); } diff --git a/whippet.h b/whippet.h index 56eee50ed..cce3f3090 100644 --- a/whippet.h +++ b/whippet.h @@ -773,6 +773,18 @@ static int maybe_grow_heap(struct heap *heap, enum gc_reason reason) { return 0; } +static double heap_last_gc_yield(struct heap *heap) { + struct mark_space *mark_space = heap_mark_space(heap); + size_t mark_space_yield = mark_space->granules_freed_by_last_collection; + mark_space_yield <<= GRANULE_SIZE_LOG_2; + struct large_object_space *lospace = heap_large_object_space(heap); + size_t lospace_yield = lospace->pages_freed_by_last_collection; + lospace_yield <<= lospace->page_size_log2; + + double yield = mark_space_yield + lospace_yield; + return yield / heap->size; +} + static double heap_fragmentation(struct heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); size_t mark_space_blocks = mark_space->nslabs * NONMETA_BLOCKS_PER_SLAB; @@ -811,9 +823,8 @@ static void collect(struct mutator *mut, enum gc_reason reason) { trace_mutator_roots_with_lock_before_stop(mut); finish_sweeping(mut); wait_for_mutators_to_stop(heap); - double yield = space->granules_freed_by_last_collection * GRANULE_SIZE; + double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); - yield /= SLAB_SIZE * space->nslabs; fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); trace_mutator_roots_after_stop(heap); trace_global_roots(heap); From 
69caead18272902c66f20a7a877849f449083da8 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 15 Jul 2022 20:57:36 +0200 Subject: [PATCH 099/403] Add heuristics to choose when to compact or mark in place We can choose to compact (evacuate) or mark in place. What we choose has some effects on how we mark. --- whippet.h | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/whippet.h b/whippet.h index cce3f3090..8e1f12a49 100644 --- a/whippet.h +++ b/whippet.h @@ -288,6 +288,11 @@ struct mark_space { uintptr_t fragmentation_granules_since_last_collection; // atomically }; +enum gc_kind { + GC_KIND_MARK_IN_PLACE, + GC_KIND_COMPACT +}; + struct heap { struct mark_space mark_space; struct large_object_space large_object_space; @@ -297,6 +302,7 @@ struct heap { pthread_cond_t mutator_cond; size_t size; int collecting; + enum gc_kind gc_kind; int multithreaded; size_t active_mutator_count; size_t mutator_count; @@ -605,7 +611,7 @@ static void enqueue_mutator_for_tracing(struct mutator *mut) { } static int heap_should_mark_while_stopping(struct heap *heap) { - return 1; + return atomic_load(&heap->gc_kind) == GC_KIND_MARK_IN_PLACE; } static int mutator_should_mark_while_stopping(struct mutator *mut) { @@ -805,6 +811,42 @@ static double heap_fragmentation(struct heap *heap) { static void determine_collection_kind(struct heap *heap, enum gc_reason reason) { + switch (reason) { + case GC_REASON_LARGE_ALLOCATION: + // We are collecting because a large allocation could not find + // enough free blocks, and we decided not to expand the heap. + // Let's evacuate to maximize the free block yield. + heap->gc_kind = GC_KIND_COMPACT; + break; + case GC_REASON_SMALL_ALLOCATION: { + // We are making a small allocation and ran out of blocks. + // Evacuate if the heap is "too fragmented", where fragmentation + // is measured as a percentage of granules that couldn't be used + // for allocations in the last cycle. + double fragmentation = heap_fragmentation(heap); + if (atomic_load(&heap->gc_kind) == GC_KIND_COMPACT) { + // For some reason, we already decided to compact in the past. + // Keep going until we measure that wasted space due to + // fragmentation is below a low-water-mark. + if (fragmentation < heap->fragmentation_low_threshold) { + DEBUG("returning to in-place collection, fragmentation %.2f%% < %.2f%%\n", + fragmentation * 100., + heap->fragmentation_low_threshold * 100.); + atomic_store(&heap->gc_kind, GC_KIND_MARK_IN_PLACE); + } + } else { + // Otherwise switch to evacuation mode if the heap is too + // fragmented. 
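/* Illustrative sketch (not part of the patch): the low/high water-mark
 * hysteresis this function implements, reduced to a pure function.  The mode
 * enum and thresholds are stand-ins for heap->gc_kind and the
 * fragmentation_*_threshold fields. */
enum sketch_mode { SKETCH_MARK_IN_PLACE, SKETCH_COMPACT };

static enum sketch_mode sketch_next_mode(enum sketch_mode mode,
                                         double fragmentation,
                                         double low, double high) {
  if (mode == SKETCH_COMPACT && fragmentation < low)
    return SKETCH_MARK_IN_PLACE;  /* fragmentation worked off: back off */
  if (mode == SKETCH_MARK_IN_PLACE && fragmentation > high)
    return SKETCH_COMPACT;        /* too much waste: start compacting */
  return mode;                    /* otherwise keep the previous decision */
}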
+ if (fragmentation > heap->fragmentation_high_threshold) { + DEBUG("triggering compaction due to fragmentation %.2f%% > %.2f%%\n", + fragmentation * 100., + heap->fragmentation_high_threshold * 100.); + atomic_store(&heap->gc_kind, GC_KIND_COMPACT); + } + } + break; + } + } } static void collect(struct mutator *mut, enum gc_reason reason) { From 8409383ee116c5ce39a4408fe2d50a68c737d64c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 20 Jul 2022 10:43:17 +0200 Subject: [PATCH 100/403] Refactor post-collection for mark space --- whippet.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/whippet.h b/whippet.h index 8e1f12a49..3314397d5 100644 --- a/whippet.h +++ b/whippet.h @@ -849,6 +849,12 @@ static void determine_collection_kind(struct heap *heap, } } +static void mark_space_finish_gc(struct mark_space *space) { + reset_sweeper(space); + rotate_mark_bytes(space); + reset_statistics(space); +} + static void collect(struct mutator *mut, enum gc_reason reason) { struct heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); @@ -872,11 +878,9 @@ static void collect(struct mutator *mut, enum gc_reason reason) { trace_global_roots(heap); tracer_trace(heap); tracer_release(heap); - reset_sweeper(space); - rotate_mark_bytes(space); - heap->count++; - reset_statistics(space); + mark_space_finish_gc(space); large_object_space_finish_gc(lospace); + heap->count++; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); allow_mutators_to_continue(heap); DEBUG("collect done\n"); From a8214af467142b45729355daf97528515f19ea77 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 6 Jun 2022 16:58:53 +0200 Subject: [PATCH 101/403] Whippet reserves a bit in object kind for forwarding Tags without the bit are forwarding addresses. --- whippet.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/whippet.h b/whippet.h index 3314397d5..ab358faee 100644 --- a/whippet.h +++ b/whippet.h @@ -251,14 +251,20 @@ static inline size_t size_to_granules(size_t size) { return (size + GRANULE_SIZE - 1) >> GRANULE_SIZE_LOG_2; } -// Alloc kind is in bits 0-7, for live objects. -static const uintptr_t gcobj_alloc_kind_mask = 0xff; -static const uintptr_t gcobj_alloc_kind_shift = 0; +// Alloc kind is in bits 1-7, for live objects. 
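/* Illustrative sketch (not part of the patch): how the low header bit
 * distinguishes live from forwarded objects.  A live header keeps bit 0 set
 * with the alloc kind above it; a forwarded header is just the new address,
 * whose low bit is 0 because objects are granule-aligned.  The constants
 * mirror the ones defined below. */
#include <assert.h>
#include <stdint.h>

#define SKETCH_NOT_FORWARDED_BIT 0x1
#define SKETCH_KIND_SHIFT 1
#define SKETCH_KIND_MASK 0x7f

static uintptr_t sketch_tag_live(uint8_t kind) {
  return ((uintptr_t)kind << SKETCH_KIND_SHIFT) | SKETCH_NOT_FORWARDED_BIT;
}

static int sketch_is_forwarded(uintptr_t header) {
  return (header & SKETCH_NOT_FORWARDED_BIT) == 0;
}

int main(void) {
  uintptr_t live = sketch_tag_live(42);
  assert(!sketch_is_forwarded(live));
  assert(((live >> SKETCH_KIND_SHIFT) & SKETCH_KIND_MASK) == 42);
  uintptr_t forwarded = (uintptr_t)0x100010;  /* a 16-byte-aligned address */
  assert(sketch_is_forwarded(forwarded));
  return 0;
}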
+static const uintptr_t gcobj_alloc_kind_mask = 0x7f; +static const uintptr_t gcobj_alloc_kind_shift = 1; +static const uintptr_t gcobj_forwarded_mask = 0x1; +static const uintptr_t gcobj_not_forwarded_bit = 0x1; static inline uint8_t tag_live_alloc_kind(uintptr_t tag) { return (tag >> gcobj_alloc_kind_shift) & gcobj_alloc_kind_mask; } static inline uintptr_t tag_live(uint8_t alloc_kind) { - return ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift); + return ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift) + | gcobj_not_forwarded_bit; +} +static inline uintptr_t tag_forwarded(struct gcobj *new_addr) { + return (uintptr_t)new_addr; } struct gcobj { From c7c8fa2d3255b8325d31d6d6c0ac7dd269a3ba7a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Jul 2022 09:42:51 +0200 Subject: [PATCH 102/403] Refactor to add "block_list" type --- whippet.h | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/whippet.h b/whippet.h index ab358faee..3c2da28f1 100644 --- a/whippet.h +++ b/whippet.h @@ -219,17 +219,23 @@ static void block_summary_set_next(struct block_summary *summary, (summary->next_and_flags & (BLOCK_SIZE - 1)) | next; } -static void push_block(uintptr_t *loc, size_t *count, uintptr_t block) { +// Lock-free block list. +struct block_list { + size_t count; + uintptr_t blocks; +}; + +static void push_block(struct block_list *list, uintptr_t block) { + atomic_fetch_add_explicit(&list->count, 1, memory_order_acq_rel); struct block_summary *summary = block_summary_for_addr(block); - uintptr_t next = atomic_load_explicit(loc, memory_order_acquire); + uintptr_t next = atomic_load_explicit(&list->blocks, memory_order_acquire); do { block_summary_set_next(summary, next); - } while (!atomic_compare_exchange_weak(loc, &next, block)); - atomic_fetch_add_explicit(count, 1, memory_order_acq_rel); + } while (!atomic_compare_exchange_weak(&list->blocks, &next, block)); } -static uintptr_t pop_block(uintptr_t *loc, size_t *count) { - uintptr_t head = atomic_load_explicit(loc, memory_order_acquire); +static uintptr_t pop_block(struct block_list *list) { + uintptr_t head = atomic_load_explicit(&list->blocks, memory_order_acquire); struct block_summary *summary; uintptr_t next; do { @@ -237,9 +243,9 @@ static uintptr_t pop_block(uintptr_t *loc, size_t *count) { return 0; summary = block_summary_for_addr(head); next = block_summary_next(summary); - } while (!atomic_compare_exchange_weak(loc, &head, next)); + } while (!atomic_compare_exchange_weak(&list->blocks, &head, next)); block_summary_set_next(summary, 0); - atomic_fetch_sub_explicit(count, 1, memory_order_acq_rel); + atomic_fetch_sub_explicit(&list->count, 1, memory_order_acq_rel); return head; } @@ -283,10 +289,8 @@ struct mark_space { size_t extent; size_t heap_size; uintptr_t next_block; // atomically - uintptr_t empty_blocks; // atomically - size_t empty_blocks_count; // atomically - uintptr_t unavailable_blocks; // atomically - size_t unavailable_blocks_count; // atomically + struct block_list empty; + struct block_list unavailable; ssize_t pending_unavailable_bytes; // atomically struct slab *slabs; size_t nslabs; @@ -483,13 +487,11 @@ static void push_unavailable_block(struct mark_space *space, uintptr_t block) { ASSERT(!block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); block_summary_set_flag(summary, BLOCK_UNAVAILABLE); madvise((void*)block, BLOCK_SIZE, MADV_DONTNEED); - push_block(&space->unavailable_blocks, &space->unavailable_blocks_count, - block); + push_block(&space->unavailable, 
block); } static uintptr_t pop_unavailable_block(struct mark_space *space) { - uintptr_t block = pop_block(&space->unavailable_blocks, - &space->unavailable_blocks_count); + uintptr_t block = pop_block(&space->unavailable); if (!block) return 0; struct block_summary *summary = block_summary_for_addr(block); @@ -499,13 +501,13 @@ static uintptr_t pop_unavailable_block(struct mark_space *space) { } static uintptr_t pop_empty_block(struct mark_space *space) { - return pop_block(&space->empty_blocks, &space->empty_blocks_count); + return pop_block(&space->empty); } static void push_empty_block(struct mark_space *space, uintptr_t block) { ASSERT(!block_summary_has_flag(block_summary_for_addr(block), BLOCK_NEEDS_SWEEP)); - push_block(&space->empty_blocks, &space->empty_blocks_count, block); + push_block(&space->empty, block); } static ssize_t mark_space_request_release_memory(struct mark_space *space, @@ -800,7 +802,7 @@ static double heap_last_gc_yield(struct heap *heap) { static double heap_fragmentation(struct heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); size_t mark_space_blocks = mark_space->nslabs * NONMETA_BLOCKS_PER_SLAB; - mark_space_blocks -= atomic_load(&mark_space->unavailable_blocks_count); + mark_space_blocks -= atomic_load(&mark_space->unavailable.count); size_t mark_space_granules = mark_space_blocks * GRANULES_PER_BLOCK; size_t fragmentation_granules = mark_space->fragmentation_granules_since_last_collection; From a16bb1833c8caa1c5c45509dd3a5de2804a21675 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Jul 2022 11:36:46 +0200 Subject: [PATCH 103/403] Add logic to compute evacuation candidate blocks --- whippet.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/whippet.h b/whippet.h index 3c2da28f1..57ca1f0fd 100644 --- a/whippet.h +++ b/whippet.h @@ -119,7 +119,7 @@ enum block_summary_flag { BLOCK_PAGED_OUT = 0x4, BLOCK_NEEDS_SWEEP = 0x8, BLOCK_UNAVAILABLE = 0x10, - BLOCK_FLAG_UNUSED_5 = 0x20, + BLOCK_EVACUATE = 0x20, BLOCK_FLAG_UNUSED_6 = 0x40, BLOCK_FLAG_UNUSED_7 = 0x80, BLOCK_FLAG_UNUSED_8 = 0x100, @@ -291,6 +291,7 @@ struct mark_space { uintptr_t next_block; // atomically struct block_list empty; struct block_list unavailable; + struct block_list evacuation_targets; ssize_t pending_unavailable_bytes; // atomically struct slab *slabs; size_t nslabs; @@ -857,6 +858,69 @@ static void determine_collection_kind(struct heap *heap, } } +static void compute_evacuation_candidates(struct heap *heap) { + if (heap->gc_kind == GC_KIND_MARK_IN_PLACE) + return; + + struct mark_space *space = heap_mark_space(heap); + size_t target_blocks = space->evacuation_targets.count; + size_t target_granules = target_blocks * GRANULES_PER_BLOCK; + // Compute histogram where domain is the number of granules in a block + // that survived the last collection, aggregated into 33 buckets, and + // range is number of blocks in that bucket. (Bucket 0 is for blocks + // that were found to be completely empty; such blocks may be on the + // evacuation target list.) 
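/* Illustrative sketch (not part of the patch): the greedy bucket selection
 * that follows, on its own.  Starting from the buckets with the fewest
 * survivors, whole buckets are taken while their survivors fit in the
 * evacuation reserve, and the first bucket that doesn't fit is taken pro
 * rata.  The sizes are stand-ins.  E.g. with bucket_size = 128 granules, a
 * histogram {4, 10, 6, ...} and a 1024-granule reserve, bucket 0 is free,
 * bucket 1 would cost 1280 granules, so only 8 of its 10 blocks are chosen. */
#include <stddef.h>

static void sketch_select_candidates(size_t *histogram, size_t bucket_count,
                                     size_t bucket_size,
                                     size_t target_granules) {
  for (size_t bucket = 0; bucket < bucket_count; bucket++) {
    size_t bucket_granules = bucket * bucket_size * histogram[bucket];
    if (bucket_granules <= target_granules) {
      target_granules -= bucket_granules;  /* take every block in the bucket */
    } else {
      histogram[bucket] = target_granules / (bucket_size * bucket);
      target_granules = 0;                 /* partial take; nothing further */
    }
  }
  /* On return, histogram[b] is the number of blocks to evacuate from b. */
}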
+ const size_t bucket_count = 33; + size_t histogram[33] = {0,}; + size_t bucket_size = GRANULES_PER_BLOCK / 32; + for (size_t slab = 0; slab < space->nslabs; slab++) { + for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { + struct block_summary *summary = &space->slabs[slab].summaries[block]; + if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) + continue; + size_t survivor_granules = GRANULES_PER_BLOCK - summary->free_granules; + size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; + histogram[bucket]++; + } + } + + // Evacuation targets must be in bucket 0. These blocks will later be + // marked also as evacuation candidates, but that's not a problem, + // because they contain no source objects. + ASSERT(histogram[0] >= target_blocks); + + // Now select a number of blocks that is likely to fill the space in + // the target blocks. Prefer candidate blocks with fewer survivors + // from the last GC, to increase expected free block yield. + for (size_t bucket = 0; bucket < bucket_count; bucket++) { + size_t bucket_granules = bucket * bucket_size * histogram[bucket]; + if (bucket_granules <= target_granules) { + target_granules -= bucket_granules; + } else { + histogram[bucket] = target_granules / (bucket_size * bucket); + target_granules = 0; + } + } + + // Having selected the number of blocks, now we set the evacuation + // candidate flag on all blocks. + for (size_t slab = 0; slab < space->nslabs; slab++) { + for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { + struct block_summary *summary = &space->slabs[slab].summaries[block]; + if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) + continue; + size_t survivor_granules = GRANULES_PER_BLOCK - summary->free_granules; + size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; + if (histogram[bucket]) { + block_summary_set_flag(summary, BLOCK_EVACUATE); + histogram[bucket]--; + } else { + block_summary_clear_flag(summary, BLOCK_EVACUATE); + } + } + } +} + static void mark_space_finish_gc(struct mark_space *space) { reset_sweeper(space); rotate_mark_bytes(space); @@ -882,6 +946,7 @@ static void collect(struct mutator *mut, enum gc_reason reason) { double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); + compute_evacuation_candidates(heap); trace_mutator_roots_after_stop(heap); trace_global_roots(heap); tracer_trace(heap); From 4a9908bc4d39ae65bd95f328c628014c734b32ba Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Jul 2022 14:37:30 +0200 Subject: [PATCH 104/403] Refactor evacuation vs pinning support Marking conservative roots in place effectively prohibits them from being moved, and we need to trace the roots anyway to discover conservative roots. No need therefore for a pin bit. --- whippet.h | 52 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/whippet.h b/whippet.h index 57ca1f0fd..fc50fb449 100644 --- a/whippet.h +++ b/whippet.h @@ -34,13 +34,14 @@ STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, // mark bits but also for other per-object metadata. Already we were // using a byte instead of a bit to facilitate parallel marking. // (Parallel markers are allowed to race.) Turns out we can put a -// pinned bit there too, for objects that can't be moved. 
Actually -// there are two pinned bits: one that's managed by the collector, which -// pins referents of conservative roots, and one for pins managed -// externally (maybe because the mutator requested a pin.) Then there's -// a "remembered" bit, indicating that the object should be scanned for -// references to the nursery. If the remembered bit is set, the -// corresponding remset byte should also be set in the slab (see below). +// pinned bit there too, for objects that can't be moved (perhaps +// because they have been passed to unmanaged C code). (Objects can +// also be temporarily pinned if they are referenced by a conservative +// root, but that doesn't need a separate bit; we can just use the mark +// bit.) Then there's a "remembered" bit, indicating that the object +// should be scanned for references to the nursery. If the remembered +// bit is set, the corresponding remset byte should also be set in the +// slab (see below). // // Getting back to mark bits -- because we want to allow for // conservative roots, we need to know whether an address indicates an @@ -68,8 +69,8 @@ enum metadata_byte { METADATA_BYTE_MARK_2 = 8, METADATA_BYTE_END = 16, METADATA_BYTE_PINNED = 32, - METADATA_BYTE_PERMAPINNED = 64, - METADATA_BYTE_REMEMBERED = 128 + METADATA_BYTE_REMEMBERED = 64, + METADATA_BYTE_UNUSED = 128 }; static uint8_t rotate_dead_survivor_marked(uint8_t mask) { @@ -285,6 +286,7 @@ struct mark_space { uint64_t sweep_mask; uint8_t live_mask; uint8_t marked_mask; + uint8_t evacuating; uintptr_t low_addr; size_t extent; size_t heap_size; @@ -858,11 +860,14 @@ static void determine_collection_kind(struct heap *heap, } } -static void compute_evacuation_candidates(struct heap *heap) { - if (heap->gc_kind == GC_KIND_MARK_IN_PLACE) - return; - +static void prepare_for_evacuation(struct heap *heap) { struct mark_space *space = heap_mark_space(heap); + + if (heap->gc_kind == GC_KIND_MARK_IN_PLACE) { + space->evacuating = 0; + return; + } + size_t target_blocks = space->evacuation_targets.count; size_t target_granules = target_blocks * GRANULES_PER_BLOCK; // Compute histogram where domain is the number of granules in a block @@ -919,9 +924,24 @@ static void compute_evacuation_candidates(struct heap *heap) { } } } + + // We are ready to evacuate! + space->evacuating = 1; +} + +static void trace_conservative_roots_after_stop(struct heap *heap) { + // FIXME: Visit conservative roots, if the collector is configured in + // that way. Mark them in place, preventing any subsequent + // evacuation. 
+} + +static void trace_precise_roots_after_stop(struct heap *heap) { + trace_mutator_roots_after_stop(heap); + trace_global_roots(heap); } static void mark_space_finish_gc(struct mark_space *space) { + space->evacuating = 0; reset_sweeper(space); rotate_mark_bytes(space); reset_statistics(space); @@ -946,9 +966,9 @@ static void collect(struct mutator *mut, enum gc_reason reason) { double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); - compute_evacuation_candidates(heap); - trace_mutator_roots_after_stop(heap); - trace_global_roots(heap); + trace_conservative_roots_after_stop(heap); + prepare_for_evacuation(heap); + trace_precise_roots_after_stop(heap); tracer_trace(heap); tracer_release(heap); mark_space_finish_gc(space); From 92b05a63107670d1c93373d0659a1899922925d0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Jul 2022 15:01:07 +0200 Subject: [PATCH 105/403] Add implementation of parallel evacuation --- whippet.h | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 100 insertions(+), 8 deletions(-) diff --git a/whippet.h b/whippet.h index fc50fb449..9c0042bff 100644 --- a/whippet.h +++ b/whippet.h @@ -16,6 +16,7 @@ #else #include "serial-tracer.h" #endif +#include "spin.h" #define GRANULE_SIZE 16 #define GRANULE_SIZE_LOG_2 4 @@ -379,6 +380,13 @@ static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { return object_metadata_byte(obj); } +static size_t mark_space_live_object_granules(uint8_t *metadata) { + size_t n = 0; + while ((metadata[n] & METADATA_BYTE_END) == 0) + n++; + return n + 1; +} + static inline int mark_space_mark_object(struct mark_space *space, struct gc_edge edge) { struct gcobj *obj = dereference_edge(edge); @@ -392,6 +400,94 @@ static inline int mark_space_mark_object(struct mark_space *space, return 1; } +static struct gcobj *evacuation_allocate(struct mark_space *space, + size_t granules) { + return NULL; +} + +static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, + struct gc_edge edge) { + struct gcobj *obj = dereference_edge(edge); + uint8_t *metadata = object_metadata_byte(obj); + uint8_t byte = *metadata; + if (byte & space->marked_mask) + return 0; + if (space->evacuating && + block_summary_has_flag(block_summary_for_addr((uintptr_t)obj), + BLOCK_EVACUATE) && + ((byte & METADATA_BYTE_PINNED) == 0)) { + // This is an evacuating collection, and we are attempting to + // evacuate this block, and this particular object isn't pinned. + // First, see if someone evacuated this object already. + uintptr_t header_word = atomic_load_explicit(&obj->tag, + memory_order_relaxed); + uintptr_t busy_header_word = 0; + if (header_word != busy_header_word && + (header_word & gcobj_not_forwarded_bit) == 0) { + // The object has been evacuated already. Update the edge; + // whoever forwarded the object will make sure it's eventually + // traced. + struct gcobj *forwarded = (struct gcobj*) header_word; + update_edge(edge, forwarded); + return 0; + } + // Otherwise try to claim it for evacuation. + if (header_word != busy_header_word && + atomic_compare_exchange_strong(&obj->tag, &header_word, + busy_header_word)) { + // We claimed the object successfully; evacuating is up to us. 
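/* Illustrative sketch (not part of the patch): the claim/forward handshake
 * used here, reduced to its essentials.  A claimant CASes the header to 0
 * ("busy"), then publishes the forwarding address; racing tracers spin until
 * a nonzero word reappears and check its low bit.  The object layout is a
 * stand-in. */
#include <stdatomic.h>
#include <stdint.h>

struct sketch_obj { _Atomic uintptr_t header; };

static int sketch_try_claim(struct sketch_obj *o, uintptr_t expected) {
  /* Nonzero if we won the race and are now responsible for copying. */
  return atomic_compare_exchange_strong(&o->header, &expected, 0);
}

static void sketch_publish_forward(struct sketch_obj *o, uintptr_t new_addr) {
  atomic_store_explicit(&o->header, new_addr, memory_order_release);
}

static uintptr_t sketch_wait_for_header(struct sketch_obj *o) {
  uintptr_t word;
  while ((word = atomic_load_explicit(&o->header, memory_order_acquire)) == 0)
    ;  /* the real code calls yield_for_spin() here */
  return word;
}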
+ size_t object_granules = mark_space_live_object_granules(metadata); + struct gcobj *new_obj = evacuation_allocate(space, object_granules); + if (new_obj) { + // We were able to reserve space in which to evacuate this object. + // Commit the evacuation by overwriting the tag. + uintptr_t new_header_word = tag_forwarded(new_obj); + atomic_store_explicit(&obj->tag, new_header_word, + memory_order_release); + // Now copy the object contents, update extent metadata, and + // indicate to the caller that the object's fields need to be + // traced. + new_obj->tag = header_word; + memcpy(&new_obj->words[1], &obj->words[1], + object_granules * GRANULE_SIZE - sizeof(header_word)); + uint8_t *new_metadata = object_metadata_byte(new_obj); + memcpy(new_metadata + 1, metadata + 1, object_granules - 1); + update_edge(edge, new_obj); + obj = new_obj; + metadata = new_metadata; + // Fall through to set mark bits. + } else { + // Well shucks; allocation failed, marking the end of + // opportunistic evacuation. No future evacuation of this + // object will succeed. Restore the original header word and + // mark instead. + atomic_store_explicit(&obj->tag, header_word, + memory_order_release); + } + } else { + // Someone else claimed this object first. Spin until new address + // known, or evacuation aborts. + for (size_t spin_count = 0;; spin_count++) { + header_word = atomic_load_explicit(&obj->tag, memory_order_acquire); + if (header_word) + break; + yield_for_spin(spin_count); + } + if ((header_word & gcobj_not_forwarded_bit) == 0) { + struct gcobj *forwarded = (struct gcobj*) header_word; + update_edge(edge, forwarded); + } + // Either way, the other party is responsible for adding the + // object to the mark queue. + return 0; + } + } + uint8_t mask = METADATA_BYTE_YOUNG | METADATA_BYTE_MARK_0 + | METADATA_BYTE_MARK_1 | METADATA_BYTE_MARK_2; + *metadata = (byte & ~mask) | space->marked_mask; + return 1; +} + static inline int mark_space_contains(struct mark_space *space, struct gcobj *obj) { uintptr_t addr = (uintptr_t)obj; @@ -407,8 +503,11 @@ static inline int trace_edge(struct heap *heap, struct gc_edge edge) { struct gcobj *obj = dereference_edge(edge); if (!obj) return 0; - else if (LIKELY(mark_space_contains(heap_mark_space(heap), obj))) + else if (LIKELY(mark_space_contains(heap_mark_space(heap), obj))) { + if (heap_mark_space(heap)->evacuating) + return mark_space_evacuate_or_mark_object(heap_mark_space(heap), edge); return mark_space_mark_object(heap_mark_space(heap), edge); + } else if (large_object_space_contains(heap_large_object_space(heap), obj)) return large_object_space_mark_object(heap_large_object_space(heap), obj); @@ -979,13 +1078,6 @@ static void collect(struct mutator *mut, enum gc_reason reason) { DEBUG("collect done\n"); } -static size_t mark_space_live_object_granules(uint8_t *metadata) { - size_t n = 0; - while ((metadata[n] & METADATA_BYTE_END) == 0) - n++; - return n + 1; -} - static int sweep_byte(uint8_t *loc, uintptr_t sweep_mask) { uint8_t metadata = atomic_load_explicit(loc, memory_order_relaxed); // If the metadata byte is nonzero, that means either a young, dead, From d106f3ca7124f8732d6400f8909e2d797b095252 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 18 Jul 2022 15:01:47 +0200 Subject: [PATCH 106/403] Mutator collects evacuation target blocks --- whippet.h | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 1 deletion(-) diff --git a/whippet.h b/whippet.h index 9c0042bff..ac9148788 100644 --- a/whippet.h +++ 
b/whippet.h @@ -283,6 +283,11 @@ struct gcobj { }; }; +struct evacuation_allocator { + size_t allocated; // atomically + uintptr_t block_cursor; // atomically +}; + struct mark_space { uint64_t sweep_mask; uint8_t live_mask; @@ -295,7 +300,9 @@ struct mark_space { struct block_list empty; struct block_list unavailable; struct block_list evacuation_targets; + double evacuation_reserve; ssize_t pending_unavailable_bytes; // atomically + struct evacuation_allocator evacuation_allocator; struct slab *slabs; size_t nslabs; uintptr_t granules_freed_by_last_collection; // atomically @@ -400,9 +407,97 @@ static inline int mark_space_mark_object(struct mark_space *space, return 1; } +static uintptr_t make_evacuation_allocator_cursor(uintptr_t block, + size_t allocated) { + ASSERT(allocated < (BLOCK_SIZE - 1) * (uint64_t) BLOCK_SIZE); + return (block & ~(BLOCK_SIZE - 1)) | (allocated / BLOCK_SIZE); +} + +static void prepare_evacuation_allocator(struct evacuation_allocator *alloc, + struct block_list *targets) { + uintptr_t first_block = targets->blocks; + atomic_store_explicit(&alloc->allocated, 0, memory_order_release); + atomic_store_explicit(&alloc->block_cursor, + make_evacuation_allocator_cursor(first_block, 0), + memory_order_release); +} + +static void finish_evacuation_allocator(struct evacuation_allocator *alloc, + struct block_list *targets, + struct block_list *empties) { + // Blocks that we used for evacuation get returned to the mutator as + // sweepable blocks. Blocks that we didn't get to use go to the + // empties. + while (alloc->allocated) { + uintptr_t block = pop_block(targets); + if (!block) + break; + block_summary_set_flag(block_summary_for_addr(block), + BLOCK_NEEDS_SWEEP); + if (alloc->allocated <= BLOCK_SIZE) + break; + alloc->allocated -= BLOCK_SIZE; + } + while (1) { + uintptr_t block = pop_block(targets); + if (!block) + break; + push_block(empties, block); + } +} + static struct gcobj *evacuation_allocate(struct mark_space *space, size_t granules) { - return NULL; + // All collector threads compete to allocate from what is logically a + // single bump-pointer arena, which is actually composed of a linked + // list of blocks. + struct evacuation_allocator *alloc = &space->evacuation_allocator; + uintptr_t cursor = atomic_load_explicit(&alloc->block_cursor, + memory_order_acquire); + if (cursor == -1) + // No more space. + return NULL; + size_t bytes = granules * GRANULE_SIZE; + size_t prev = alloc->allocated; + size_t block_mask = (BLOCK_SIZE - 1); + size_t next; + do { + next = prev + bytes; + if ((prev ^ next) & ~block_mask) + // Allocation straddles a block boundary; advance so it starts a + // fresh block. + next = (next & ~block_mask) + bytes; + } while (!atomic_compare_exchange_weak(&alloc->allocated, &prev, next)); + // OK, we've claimed our memory, starting at next - bytes. Now find + // the node in the linked list of evacuation targets that corresponds + // to this allocation pointer. + uintptr_t block = cursor & ~block_mask; + // This is the SEQ'th block to be allocated into. + uintptr_t seq = cursor & block_mask; + // Therefore this block handles allocations starting at SEQ*BLOCK_SIZE + // and continuing for BLOCK_SIZE bytes. + uintptr_t base = seq * BLOCK_SIZE; + + while ((base ^ next) & ~block_mask) { + ASSERT(base < next); + // Cursor lags; advance it. + block = block_summary_next(block_summary_for_addr(block)); + if (!block) { + // Ran out of blocks! 
+ atomic_store_explicit(&alloc->block_cursor, -1, memory_order_release); + return NULL; + } + base += BLOCK_SIZE; + // This store can race with other allocators, but that's OK as long + // as it never advances the cursor beyond the allocation pointer, + // which it won't because we updated the allocation pointer already. + atomic_store_explicit(&alloc->block_cursor, + make_evacuation_allocator_cursor(block, base), + memory_order_release); + } + + uintptr_t addr = block + (next & block_mask) - bytes; + return (struct gcobj*) addr; } static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, @@ -612,6 +707,24 @@ static void push_empty_block(struct mark_space *space, uintptr_t block) { push_block(&space->empty, block); } +static int maybe_push_evacuation_target(struct mark_space *space, + uintptr_t block) { + size_t targets = atomic_load_explicit(&space->evacuation_targets.count, + memory_order_acquire); + size_t total = space->nslabs * NONMETA_BLOCKS_PER_SLAB; + size_t unavailable = atomic_load_explicit(&space->unavailable.count, + memory_order_acquire); + if (targets >= (total - unavailable) * space->evacuation_reserve) + return 0; + + // We reached the end of the allocation cycle and just obtained a + // known-empty block from the empties list. If the last cycle was an + // evacuating collection, put this block back on the list of + // evacuation target blocks. + push_block(&space->evacuation_targets, block); + return 1; +} + static ssize_t mark_space_request_release_memory(struct mark_space *space, size_t bytes) { return atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes; @@ -643,6 +756,13 @@ static int sweep_until_memory_released(struct mutator *mut) { uintptr_t block = pop_empty_block(space); if (!block) break; + // Note that we may have competing uses; if we're evacuating, + // perhaps we should push this block to the evacuation target list. + // That would enable us to reach a fragmentation low water-mark in + // fewer cycles. But maybe evacuation started in order to obtain + // free blocks for large objects; in that case we should just reap + // the fruits of our labor. Probably this second use-case is more + // important. push_unavailable_block(space, block); pending = atomic_fetch_sub(&space->pending_unavailable_bytes, BLOCK_SIZE); pending -= BLOCK_SIZE; @@ -959,15 +1079,34 @@ static void determine_collection_kind(struct heap *heap, } } +static void release_evacuation_target_blocks(struct mark_space *space) { + // Move any collected evacuation target blocks back to empties. + finish_evacuation_allocator(&space->evacuation_allocator, + &space->evacuation_targets, &space->empty); +} + static void prepare_for_evacuation(struct heap *heap) { struct mark_space *space = heap_mark_space(heap); if (heap->gc_kind == GC_KIND_MARK_IN_PLACE) { space->evacuating = 0; + space->evacuation_reserve = 0.02; return; } + // Put the mutator into evacuation mode, collecting up to 50% of free space as + // evacuation blocks. 
+ space->evacuation_reserve = 0.5; + size_t target_blocks = space->evacuation_targets.count; + DEBUG("evacuation target block count: %zu\n", target_blocks); + + if (target_blocks == 0) { + DEBUG("no evacuation target blocks, disabling evacuation for this round\n"); + space->evacuating = 0; + return; + } + size_t target_granules = target_blocks * GRANULES_PER_BLOCK; // Compute histogram where domain is the number of granules in a block // that survived the last collection, aggregated into 33 buckets, and @@ -1025,6 +1164,8 @@ static void prepare_for_evacuation(struct heap *heap) { } // We are ready to evacuate! + prepare_evacuation_allocator(&space->evacuation_allocator, + &space->evacuation_targets); space->evacuating = 1; } @@ -1044,6 +1185,7 @@ static void mark_space_finish_gc(struct mark_space *space) { reset_sweeper(space); rotate_mark_bytes(space); reset_statistics(space); + release_evacuation_target_blocks(space); } static void collect(struct mutator *mut, enum gc_reason reason) { @@ -1319,6 +1461,10 @@ static size_t next_hole(struct mutator *mut) { if (!block) return 0; + // Maybe we should use this empty as a target for evacuation. + if (maybe_push_evacuation_target(space, block)) + continue; + // Otherwise return the block to the mutator. struct block_summary *summary = block_summary_for_addr(block); block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); @@ -1535,6 +1681,7 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { space->low_addr = (uintptr_t) slabs; space->extent = size; space->next_block = 0; + space->evacuation_reserve = 0.02; for (size_t slab = 0; slab < nslabs; slab++) { for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; From 279309b821d2636fd2dce3a838f9835c073c3bec Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 20 Jul 2022 10:44:06 +0200 Subject: [PATCH 107/403] mt-gcbench allocates garbage between live data This obviously invalidates previous benchmark results; perhaps we should make this optional. 
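The hole sizes come from the power-law table added below. For reference, a table of the same shape can be regenerated by sampling the distribution the patch comment describes (accept the current integer with probability 0.15, otherwise try the next one). This sketch is standalone and not part of the benchmark; the helper name and seed are arbitrary, so it will not reproduce the exact values committed here.

#include <stdio.h>
#include <stdlib.h>

// One sample: start at 0, accept with probability 0.15, else advance to
// the next integer -- a geometric distribution with p = 0.15.
static unsigned sample_power_law(void) {
  unsigned n = 0;
  while ((double)rand() / RAND_MAX >= 0.15)
    n++;
  return n;
}

int main(void) {
  srand(42);  // arbitrary fixed seed, purely so this sketch is reproducible
  for (int i = 0; i < 256; i++)
    printf("%u%s", sample_power_law(), (i % 16 == 15) ? ",\n" : ", ");
  return 0;
}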
--- mt-gcbench-types.h | 3 +- mt-gcbench.c | 93 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 82 insertions(+), 14 deletions(-) diff --git a/mt-gcbench-types.h b/mt-gcbench-types.h index a61b2b7d5..04bf6d258 100644 --- a/mt-gcbench-types.h +++ b/mt-gcbench-types.h @@ -3,7 +3,8 @@ #define FOR_EACH_HEAP_OBJECT_KIND(M) \ M(node, Node, NODE) \ - M(double_array, DoubleArray, DOUBLE_ARRAY) + M(double_array, DoubleArray, DOUBLE_ARRAY) \ + M(hole, Hole, HOLE) #include "heap-objects.h" diff --git a/mt-gcbench.c b/mt-gcbench.c index 0dd9ef6f4..2af4c04c8 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -69,12 +69,21 @@ struct DoubleArray { double values[0]; }; +struct Hole { + GC_HEADER; + size_t length; + uintptr_t values[0]; +}; + static inline size_t node_size(Node *obj) { return sizeof(Node); } static inline size_t double_array_size(DoubleArray *array) { return sizeof(*array) + array->length * sizeof(double); } +static inline size_t hole_size(Hole *hole) { + return sizeof(*hole) + hole->length * sizeof(uintptr_t); +} static inline void visit_node_fields(Node *node, void (*visit)(struct gc_edge edge, void *visit_data), @@ -87,6 +96,11 @@ visit_double_array_fields(DoubleArray *obj, void (*visit)(struct gc_edge edge, void *visit_data), void *visit_data) { } +static inline void +visit_hole_fields(Hole *obj, + void (*visit)(struct gc_edge edge, void *visit_data), + void *visit_data) { +} typedef HANDLE_TO(Node) NodeHandle; typedef HANDLE_TO(DoubleArray) DoubleArrayHandle; @@ -106,8 +120,14 @@ static DoubleArray* allocate_double_array(struct mutator *mut, return ret; } -static unsigned long current_time(void) -{ +static Hole* allocate_hole(struct mutator *mut, size_t size) { + Hole *ret = allocate(mut, ALLOC_KIND_HOLE, + sizeof(Hole) + sizeof (uintptr_t) * size); + ret->length = size; + return ret; +} + +static unsigned long current_time(void) { struct timeval t = { 0 }; gettimeofday(&t, NULL); return t.tv_sec * 1000 * 1000 + t.tv_usec; @@ -127,15 +147,58 @@ static int compute_num_iters(int i) { return 2 * tree_size(max_tree_depth + 2) / tree_size(i); } +// A power-law distribution. Each integer was selected by starting at 0, taking +// a random number in [0,1), and then accepting the integer if the random number +// was less than 0.15, or trying again with the next integer otherwise. Useful +// for modelling allocation sizes or number of garbage objects to allocate +// between live allocations. 
+static const uint8_t power_law_distribution[256] = { + 1, 15, 3, 12, 2, 8, 4, 0, 18, 7, 9, 8, 15, 2, 36, 5, + 1, 9, 6, 11, 9, 19, 2, 0, 0, 3, 9, 6, 3, 2, 1, 1, + 6, 1, 8, 4, 2, 0, 5, 3, 7, 0, 0, 3, 0, 4, 1, 7, + 1, 8, 2, 2, 2, 14, 0, 7, 8, 0, 2, 1, 4, 12, 7, 5, + 0, 3, 4, 13, 10, 2, 3, 7, 0, 8, 0, 23, 0, 16, 1, 1, + 6, 28, 1, 18, 0, 3, 6, 5, 8, 6, 14, 5, 2, 5, 0, 11, + 0, 18, 4, 16, 1, 4, 3, 13, 3, 23, 7, 4, 10, 5, 3, 13, + 0, 14, 5, 5, 2, 5, 0, 16, 2, 0, 1, 1, 0, 0, 4, 2, + 7, 7, 0, 5, 7, 2, 1, 24, 27, 3, 7, 1, 0, 8, 1, 4, + 0, 3, 0, 7, 7, 3, 9, 2, 9, 2, 5, 10, 1, 1, 12, 6, + 2, 9, 5, 0, 4, 6, 0, 7, 2, 1, 5, 4, 1, 0, 1, 15, + 4, 0, 15, 4, 0, 0, 32, 18, 2, 2, 1, 7, 8, 3, 11, 1, + 2, 7, 11, 1, 9, 1, 2, 6, 11, 17, 1, 2, 5, 1, 14, 3, + 6, 1, 1, 15, 3, 1, 0, 6, 10, 8, 1, 3, 2, 7, 0, 1, + 0, 11, 3, 3, 5, 8, 2, 0, 0, 7, 12, 2, 5, 20, 3, 7, + 4, 4, 5, 22, 1, 5, 2, 7, 15, 2, 4, 6, 11, 8, 12, 1 +}; + +static size_t power_law(size_t *counter) { + return power_law_distribution[(*counter)++ & 0xff]; +} + +struct thread { + struct mutator *mut; + size_t counter; +}; + +static void allocate_garbage(struct thread *t) { + size_t hole = power_law(&t->counter); + if (hole) { + allocate_hole(t->mut, hole); + } +} + // Build tree top down, assigning to older objects. -static void populate(struct mutator *mut, int depth, Node *node) { +static void populate(struct thread *t, int depth, Node *node) { + struct mutator *mut = t->mut; if (depth <= 0) return; NodeHandle self = { node }; PUSH_HANDLE(mut, self); + allocate_garbage(t); NodeHandle l = { allocate_node(mut) }; PUSH_HANDLE(mut, l); + allocate_garbage(t); NodeHandle r = { allocate_node(mut) }; PUSH_HANDLE(mut, r); @@ -144,8 +207,8 @@ static void populate(struct mutator *mut, int depth, Node *node) { // i is 0 because the memory is zeroed. 
HANDLE_REF(self)->j = depth; - populate(mut, depth-1, HANDLE_REF(self)->left); - populate(mut, depth-1, HANDLE_REF(self)->right); + populate(t, depth-1, HANDLE_REF(self)->left); + populate(t, depth-1, HANDLE_REF(self)->right); POP_HANDLE(mut); POP_HANDLE(mut); @@ -153,15 +216,17 @@ static void populate(struct mutator *mut, int depth, Node *node) { } // Build tree bottom-up -static Node* make_tree(struct mutator *mut, int depth) { +static Node* make_tree(struct thread *t, int depth) { + struct mutator *mut = t->mut; if (depth <= 0) return allocate_node(mut); - NodeHandle left = { make_tree(mut, depth-1) }; + NodeHandle left = { make_tree(t, depth-1) }; PUSH_HANDLE(mut, left); - NodeHandle right = { make_tree(mut, depth-1) }; + NodeHandle right = { make_tree(t, depth-1) }; PUSH_HANDLE(mut, right); + allocate_garbage(t); Node *result = allocate_node(mut); init_field((void**)&result->left, HANDLE_REF(left)); init_field((void**)&result->right, HANDLE_REF(right)); @@ -190,7 +255,8 @@ static void validate_tree(Node *tree, int depth) { #endif } -static void time_construction(struct mutator *mut, int depth) { +static void time_construction(struct thread *t, int depth) { + struct mutator *mut = t->mut; int num_iters = compute_num_iters(depth); NodeHandle temp_tree = { NULL }; PUSH_HANDLE(mut, temp_tree); @@ -201,7 +267,7 @@ static void time_construction(struct mutator *mut, int depth) { unsigned long start = current_time(); for (int i = 0; i < num_iters; ++i) { HANDLE_SET(temp_tree, allocate_node(mut)); - populate(mut, depth, HANDLE_REF(temp_tree)); + populate(t, depth, HANDLE_REF(temp_tree)); validate_tree(HANDLE_REF(temp_tree), depth); HANDLE_SET(temp_tree, NULL); } @@ -212,7 +278,7 @@ static void time_construction(struct mutator *mut, int depth) { { long start = current_time(); for (int i = 0; i < num_iters; ++i) { - HANDLE_SET(temp_tree, make_tree(mut, depth)); + HANDLE_SET(temp_tree, make_tree(t, depth)); validate_tree(HANDLE_REF(temp_tree), depth); HANDLE_SET(temp_tree, NULL); } @@ -256,6 +322,7 @@ static void* run_one_test(struct mutator *mut) { NodeHandle long_lived_tree = { NULL }; NodeHandle temp_tree = { NULL }; DoubleArrayHandle array = { NULL }; + struct thread t = { mut, 0 }; PUSH_HANDLE(mut, long_lived_tree); PUSH_HANDLE(mut, temp_tree); @@ -265,7 +332,7 @@ static void* run_one_test(struct mutator *mut) { printf(" Creating a long-lived binary tree of depth %d\n", long_lived_tree_depth); HANDLE_SET(long_lived_tree, allocate_node(mut)); - populate(mut, long_lived_tree_depth, HANDLE_REF(long_lived_tree)); + populate(&t, long_lived_tree_depth, HANDLE_REF(long_lived_tree)); // Create long-lived array, filling half of it printf(" Creating a long-lived array of %d doubles\n", array_size); @@ -275,7 +342,7 @@ static void* run_one_test(struct mutator *mut) { } for (int d = min_tree_depth; d <= max_tree_depth; d += 2) { - time_construction(mut, d); + time_construction(&t, d); } validate_tree(HANDLE_REF(long_lived_tree), long_lived_tree_depth); From 22a9cc87a073f2b98e4beda4e7576e71b2095f69 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 20 Jul 2022 11:39:06 +0200 Subject: [PATCH 108/403] Update TODO --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3bda79f9f..7bb0ccc24 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,7 @@ large majority of use cases. 
### Features that would improve Whippet performance - - [ ] Immix-style opportunistic evacuation + - [X] Immix-style opportunistic evacuation - [ ] Overflow allocation - [ ] Generational GC via sticky mark bits - [ ] Generational GC with semi-space nursery From 1781c5aed45a119e68b5cef40bd730ff53f758bd Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 2 Aug 2022 13:56:27 +0200 Subject: [PATCH 109/403] Fix evacuation allocator to clear any holes --- whippet.h | 71 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 15 deletions(-) diff --git a/whippet.h b/whippet.h index ac9148788..840738c71 100644 --- a/whippet.h +++ b/whippet.h @@ -285,6 +285,7 @@ struct gcobj { struct evacuation_allocator { size_t allocated; // atomically + size_t limit; uintptr_t block_cursor; // atomically }; @@ -417,26 +418,58 @@ static void prepare_evacuation_allocator(struct evacuation_allocator *alloc, struct block_list *targets) { uintptr_t first_block = targets->blocks; atomic_store_explicit(&alloc->allocated, 0, memory_order_release); + alloc->limit = + atomic_load_explicit(&targets->count, memory_order_acquire) * BLOCK_SIZE; atomic_store_explicit(&alloc->block_cursor, make_evacuation_allocator_cursor(first_block, 0), memory_order_release); } +static void clear_remaining_metadata_bytes_in_block(uintptr_t block, + uintptr_t allocated) { + ASSERT((allocated & (GRANULE_SIZE - 1)) == 0); + uintptr_t base = block + allocated; + uintptr_t limit = block + BLOCK_SIZE; + uintptr_t granules = (limit - base) >> GRANULE_SIZE_LOG_2; + ASSERT(granules <= GRANULES_PER_BLOCK); + memset(object_metadata_byte((void*)base), 0, granules); +} + +static void finish_evacuation_allocator_block(uintptr_t block, + uintptr_t allocated) { + ASSERT(allocated <= BLOCK_SIZE); + struct block_summary *summary = block_summary_for_addr(block); + block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); + size_t fragmentation = (BLOCK_SIZE - allocated) >> GRANULE_SIZE_LOG_2; + summary->hole_count = 1; + summary->free_granules = GRANULES_PER_BLOCK; + summary->holes_with_fragmentation = fragmentation ? 1 : 0; + summary->fragmentation_granules = fragmentation; + if (fragmentation) + clear_remaining_metadata_bytes_in_block(block, allocated); +} + static void finish_evacuation_allocator(struct evacuation_allocator *alloc, struct block_list *targets, struct block_list *empties) { // Blocks that we used for evacuation get returned to the mutator as // sweepable blocks. Blocks that we didn't get to use go to the // empties. - while (alloc->allocated) { + size_t allocated = atomic_load_explicit(&alloc->allocated, + memory_order_acquire); + atomic_store_explicit(&alloc->allocated, 0, memory_order_release); + if (allocated > alloc->limit) + allocated = alloc->limit; + while (allocated >= BLOCK_SIZE) { uintptr_t block = pop_block(targets); - if (!block) - break; - block_summary_set_flag(block_summary_for_addr(block), - BLOCK_NEEDS_SWEEP); - if (alloc->allocated <= BLOCK_SIZE) - break; - alloc->allocated -= BLOCK_SIZE; + ASSERT(block); + allocated -= BLOCK_SIZE; + } + if (allocated) { + // Finish off the last partially-filled block. 
+ uintptr_t block = pop_block(targets); + ASSERT(block); + finish_evacuation_allocator_block(block, allocated); } while (1) { uintptr_t block = pop_block(targets); @@ -454,14 +487,14 @@ static struct gcobj *evacuation_allocate(struct mark_space *space, struct evacuation_allocator *alloc = &space->evacuation_allocator; uintptr_t cursor = atomic_load_explicit(&alloc->block_cursor, memory_order_acquire); - if (cursor == -1) - // No more space. - return NULL; size_t bytes = granules * GRANULE_SIZE; - size_t prev = alloc->allocated; + size_t prev = atomic_load_explicit(&alloc->allocated, memory_order_acquire); size_t block_mask = (BLOCK_SIZE - 1); size_t next; do { + if (prev >= alloc->limit) + // No more space. + return NULL; next = prev + bytes; if ((prev ^ next) & ~block_mask) // Allocation straddles a block boundary; advance so it starts a @@ -480,14 +513,22 @@ static struct gcobj *evacuation_allocate(struct mark_space *space, while ((base ^ next) & ~block_mask) { ASSERT(base < next); + if (base + BLOCK_SIZE > prev) { + // The allocation straddles a block boundary, and the cursor has + // caught up so that we identify the block for the previous + // allocation pointer. Finish the previous block, probably + // leaving a small hole at the end. + finish_evacuation_allocator_block(block, prev - base); + } // Cursor lags; advance it. block = block_summary_next(block_summary_for_addr(block)); - if (!block) { + base += BLOCK_SIZE; + if (base >= alloc->limit) { // Ran out of blocks! - atomic_store_explicit(&alloc->block_cursor, -1, memory_order_release); + ASSERT(!block); return NULL; } - base += BLOCK_SIZE; + ASSERT(block); // This store can race with other allocators, but that's OK as long // as it never advances the cursor beyond the allocation pointer, // which it won't because we updated the allocation pointer already. From 7f405c929e65773105dbf9bca975df42c2a42316 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 2 Aug 2022 13:57:15 +0200 Subject: [PATCH 110/403] Initial live mask does not include young allocations After rotation, the young bit wasn't being included anyway. This just improves the first collection. --- whippet.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/whippet.h b/whippet.h index 840738c71..f2fd90985 100644 --- a/whippet.h +++ b/whippet.h @@ -1264,10 +1264,9 @@ static void collect(struct mutator *mut, enum gc_reason reason) { static int sweep_byte(uint8_t *loc, uintptr_t sweep_mask) { uint8_t metadata = atomic_load_explicit(loc, memory_order_relaxed); // If the metadata byte is nonzero, that means either a young, dead, - // survived, or marked object. If it's live (young, survived, or - // marked), we found the next mark. Otherwise it's dead and we clear - // the byte. If we see an END, that means an end of a dead object; - // clear it. + // survived, or marked object. If it's live (survived or marked), we + // found the next mark. Otherwise it's dead and we clear the byte. + // If we see an END, that means an end of a dead object; clear it. 
if (metadata) { if (metadata & sweep_mask) return 1; @@ -1715,7 +1714,7 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { uint8_t survived = METADATA_BYTE_MARK_1; uint8_t marked = METADATA_BYTE_MARK_2; space->marked_mask = marked; - space->live_mask = METADATA_BYTE_YOUNG | survived | marked; + space->live_mask = survived | marked; rotate_mark_bytes(space); space->slabs = slabs; space->nslabs = nslabs; From 13b3bb5b246b0a2325777deec9bee5ddc69251ac Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 31 Jul 2022 21:30:59 +0200 Subject: [PATCH 111/403] Update barrier functions to also have the object being written Also remove read barriers, as they were unused, and we have no plans to use them. --- bdw.h | 7 ++----- mt-gcbench.c | 8 ++++---- quads.c | 2 +- semi.h | 7 ++----- whippet.h | 7 ++----- 5 files changed, 11 insertions(+), 20 deletions(-) diff --git a/bdw.h b/bdw.h index 0034b0561..69a147b56 100644 --- a/bdw.h +++ b/bdw.h @@ -103,15 +103,12 @@ static inline void collect(struct mutator *mut) { GC_gcollect(); } -static inline void init_field(void **addr, void *val) { +static inline void init_field(void *obj, void **addr, void *val) { *addr = val; } -static inline void set_field(void **addr, void *val) { +static inline void set_field(void *obj, void **addr, void *val) { *addr = val; } -static inline void* get_field(void **addr) { - return *addr; -} static inline struct mutator *add_mutator(struct heap *heap) { struct mutator *ret = GC_malloc(sizeof(struct mutator)); diff --git a/mt-gcbench.c b/mt-gcbench.c index 2af4c04c8..1819655fb 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -202,8 +202,8 @@ static void populate(struct thread *t, int depth, Node *node) { NodeHandle r = { allocate_node(mut) }; PUSH_HANDLE(mut, r); - set_field((void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); - set_field((void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); + set_field(HANDLE_REF(self), (void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); + set_field(HANDLE_REF(self), (void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); // i is 0 because the memory is zeroed. HANDLE_REF(self)->j = depth; @@ -228,8 +228,8 @@ static Node* make_tree(struct thread *t, int depth) { allocate_garbage(t); Node *result = allocate_node(mut); - init_field((void**)&result->left, HANDLE_REF(left)); - init_field((void**)&result->right, HANDLE_REF(right)); + init_field(result, (void**)&result->left, HANDLE_REF(left)); + init_field(result, (void**)&result->right, HANDLE_REF(right)); // i is 0 because the memory is zeroed. 
result->j = depth; diff --git a/quads.c b/quads.c index 0ba9ea3f4..9743b88a8 100644 --- a/quads.c +++ b/quads.c @@ -49,7 +49,7 @@ static Quad* make_tree(struct mutator *mut, int depth) { Quad *result = allocate_quad(mut); for (size_t i = 0; i < 4; i++) - init_field((void**)&result->kids[i], HANDLE_REF(kids[i])); + init_field(result, (void**)&result->kids[i], HANDLE_REF(kids[i])); for (size_t i = 0; i < 4; i++) POP_HANDLE(mut); diff --git a/semi.h b/semi.h index 6ed67de8b..ce3d938f1 100644 --- a/semi.h +++ b/semi.h @@ -255,15 +255,12 @@ static inline void* allocate_pointerless(struct mutator *mut, return allocate(mut, kind, size); } -static inline void init_field(void **addr, void *val) { +static inline void init_field(void *obj, void **addr, void *val) { *addr = val; } -static inline void set_field(void **addr, void *val) { +static inline void set_field(void *obj, void **addr, void *val) { *addr = val; } -static inline void* get_field(void **addr) { - return *addr; -} static int initialize_semi_space(struct semi_space *space, size_t size) { // Allocate even numbers of pages. diff --git a/whippet.h b/whippet.h index f2fd90985..146321e8c 100644 --- a/whippet.h +++ b/whippet.h @@ -1652,15 +1652,12 @@ static inline void* allocate_pointerless(struct mutator *mut, return allocate(mut, kind, size); } -static inline void init_field(void **addr, void *val) { +static inline void init_field(void *obj, void **addr, void *val) { *addr = val; } -static inline void set_field(void **addr, void *val) { +static inline void set_field(void *obj, void **addr, void *val) { *addr = val; } -static inline void* get_field(void **addr) { - return *addr; -} static struct slab* allocate_slabs(size_t nslabs) { size_t size = nslabs * SLAB_SIZE; From a4e1f55f370a20541b04a952ce3ee4dacc0d8877 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 1 Aug 2022 17:16:33 +0200 Subject: [PATCH 112/403] Implement generational collection Not really battle-tested but it seems to work. Need to implement heuristics for when to do generational vs full-heap GC. 
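The mechanism added here is a card-marking remembered set: the write barrier dirties one byte covering the region that holds the object being written, and a minor collection scans only the dirty cards for old-to-young edges before tracing. The sketch below is a simplified, self-contained illustration of that idea under assumed constants; the names, the flat card table, and CARD_SIZE are stand-ins and do not match Whippet's slab layout or metadata bytes.

#include <stddef.h>
#include <stdint.h>

#define CARD_SIZE 256                 // bytes covered by one card byte (assumed)
#define HEAP_SIZE (1u << 20)          // assumed heap size for the sketch

static uint8_t card_table[HEAP_SIZE / CARD_SIZE];
static uintptr_t heap_base;           // set when the heap is mapped

static inline void write_barrier(void *obj, void **field, void *value) {
  *field = value;
  // Unconditionally dirty the card holding the object being written into.
  card_table[((uintptr_t)obj - heap_base) / CARD_SIZE] = 1;
}

static void scan_remembered_set(void (*scan_card)(uintptr_t card_base)) {
  for (size_t i = 0; i < sizeof card_table; i++) {
    if (card_table[i]) {
      card_table[i] = 0;
      // A minor collection traces any old-to-young edges found in this card.
      scan_card(heap_base + i * CARD_SIZE);
    }
  }
}

In the patch itself the card bytes live in each slab's remembered_set area, the barrier is only taken in set_field (init_field can skip it because a freshly allocated object is in the nursery), and non-minor collections simply clear all cards.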
--- Makefile | 8 +- gc.h | 11 ++ large-object-space.h | 28 +++-- semi.h | 4 +- whippet.h | 236 +++++++++++++++++++++++++++++++++++-------- 5 files changed, 233 insertions(+), 54 deletions(-) diff --git a/Makefile b/Makefile index e01421e2d..4dc47225d 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ TESTS=quads mt-gcbench # MT_GCBench MT_GCBench2 -COLLECTORS=bdw semi whippet parallel-whippet +COLLECTORS=bdw semi whippet parallel-whippet generational-whippet parallel-generational-whippet CC=gcc CFLAGS=-Wall -O2 -g -fno-strict-aliasing -Wno-unused -DNDEBUG @@ -23,6 +23,12 @@ whippet-%: whippet.h precise-roots.h large-object-space.h serial-tracer.h assert parallel-whippet-%: whippet.h precise-roots.h large-object-space.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL_WHIPPET -o $@ $*.c +generational-whippet-%: whippet.h precise-roots.h large-object-space.h serial-tracer.h assert.h debug.h %-types.h heap-objects.h %.c + $(COMPILE) -DGC_GENERATIONAL_WHIPPET -o $@ $*.c + +parallel-generational-whippet-%: whippet.h precise-roots.h large-object-space.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c + $(COMPILE) -DGC_PARALLEL_GENERATIONAL_WHIPPET -o $@ $*.c + check: $(addprefix test-$(TARGET),$(TARGETS)) test-%: $(ALL_TESTS) diff --git a/gc.h b/gc.h index 2f6240122..931d86636 100644 --- a/gc.h +++ b/gc.h @@ -8,9 +8,20 @@ #elif defined(GC_SEMI) #include "semi.h" #elif defined(GC_WHIPPET) +#define GC_PARALLEL_TRACE 0 +#define GC_GENERATIONAL 0 #include "whippet.h" #elif defined(GC_PARALLEL_WHIPPET) #define GC_PARALLEL_TRACE 1 +#define GC_GENERATIONAL 0 +#include "whippet.h" +#elif defined(GC_GENERATIONAL_WHIPPET) +#define GC_PARALLEL_TRACE 0 +#define GC_GENERATIONAL 1 +#include "whippet.h" +#elif defined(GC_PARALLEL_GENERATIONAL_WHIPPET) +#define GC_PARALLEL_TRACE 1 +#define GC_GENERATIONAL 1 #include "whippet.h" #else #error unknown gc diff --git a/large-object-space.h b/large-object-space.h index c708641fb..68f1cf1cf 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -56,7 +56,11 @@ static size_t large_object_space_npages(struct large_object_space *space, return (bytes + space->page_size - 1) >> space->page_size_log2; } -static void large_object_space_start_gc(struct large_object_space *space) { +static void large_object_space_start_gc(struct large_object_space *space, + int is_minor_gc) { + if (is_minor_gc) + return; + // Flip. Note that when we flip, fromspace is empty, but it might have // allocated storage, so we do need to do a proper swap. 
struct address_set tmp; @@ -121,14 +125,22 @@ static void large_object_space_reclaim_one(uintptr_t addr, void *data) { } } -static void large_object_space_finish_gc(struct large_object_space *space) { +static void large_object_space_finish_gc(struct large_object_space *space, + int is_minor_gc) { pthread_mutex_lock(&space->lock); - address_set_for_each(&space->from_space, large_object_space_reclaim_one, - space); - address_set_clear(&space->from_space); - size_t free_pages = space->total_pages - space->live_pages_at_last_collection; - space->pages_freed_by_last_collection = free_pages - space->free_pages; - space->free_pages = free_pages; + if (is_minor_gc) { + space->live_pages_at_last_collection = + space->total_pages - space->free_pages; + space->pages_freed_by_last_collection = 0; + } else { + address_set_for_each(&space->from_space, large_object_space_reclaim_one, + space); + address_set_clear(&space->from_space); + size_t free_pages = + space->total_pages - space->live_pages_at_last_collection; + space->pages_freed_by_last_collection = free_pages - space->free_pages; + space->free_pages = free_pages; + } pthread_mutex_unlock(&space->lock); } diff --git a/semi.h b/semi.h index ce3d938f1..e2769fe7f 100644 --- a/semi.h +++ b/semi.h @@ -176,7 +176,7 @@ static void collect(struct mutator *mut) { struct semi_space *semi = heap_semi_space(heap); struct large_object_space *large = heap_large_object_space(heap); // fprintf(stderr, "start collect #%ld:\n", space->count); - large_object_space_start_gc(large); + large_object_space_start_gc(large, 0); flip(semi); uintptr_t grey = semi->hp; for (struct handle *h = mut->roots; h; h = h->next) @@ -184,7 +184,7 @@ static void collect(struct mutator *mut) { // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); while(grey < semi->hp) grey = scan(heap, grey); - large_object_space_finish_gc(large); + large_object_space_finish_gc(large, 0); semi_space_set_stolen_pages(semi, large->live_pages_at_last_collection); // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); } diff --git a/whippet.h b/whippet.h index 146321e8c..644b4eeb2 100644 --- a/whippet.h +++ b/whippet.h @@ -1,3 +1,11 @@ +#ifndef GC_PARALLEL_TRACE +#error define GC_PARALLEL_TRACE to 1 or 0 +#endif + +#ifndef GC_GENERATIONAL +#error define GC_GENERATIONAL to 1 or 0 +#endif + #include #include #include @@ -11,7 +19,7 @@ #include "inline.h" #include "large-object-space.h" #include "precise-roots.h" -#ifdef GC_PARALLEL_TRACE +#if GC_PARALLEL_TRACE #include "parallel-tracer.h" #else #include "serial-tracer.h" @@ -39,10 +47,7 @@ STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, // because they have been passed to unmanaged C code). (Objects can // also be temporarily pinned if they are referenced by a conservative // root, but that doesn't need a separate bit; we can just use the mark -// bit.) Then there's a "remembered" bit, indicating that the object -// should be scanned for references to the nursery. If the remembered -// bit is set, the corresponding remset byte should also be set in the -// slab (see below). +// bit.) 
// // Getting back to mark bits -- because we want to allow for // conservative roots, we need to know whether an address indicates an @@ -70,8 +75,8 @@ enum metadata_byte { METADATA_BYTE_MARK_2 = 8, METADATA_BYTE_END = 16, METADATA_BYTE_PINNED = 32, - METADATA_BYTE_REMEMBERED = 64, - METADATA_BYTE_UNUSED = 128 + METADATA_BYTE_UNUSED_1 = 64, + METADATA_BYTE_UNUSED_2 = 128 }; static uint8_t rotate_dead_survivor_marked(uint8_t mask) { @@ -164,7 +169,7 @@ struct block { struct slab { struct slab_header header; struct block_summary summaries[NONMETA_BLOCKS_PER_SLAB]; - uint8_t remsets[REMSET_BYTES_PER_SLAB]; + uint8_t remembered_set[REMSET_BYTES_PER_SLAB]; uint8_t metadata[METADATA_BYTES_PER_SLAB]; struct block blocks[NONMETA_BLOCKS_PER_SLAB]; }; @@ -176,6 +181,8 @@ static struct slab *object_slab(void *obj) { return (struct slab*) base; } +static int heap_object_is_large(struct gcobj *obj); + static uint8_t *object_metadata_byte(void *obj) { uintptr_t addr = (uintptr_t) obj; uintptr_t base = addr & ~(SLAB_SIZE - 1); @@ -186,6 +193,7 @@ static uint8_t *object_metadata_byte(void *obj) { #define GRANULES_PER_BLOCK (BLOCK_SIZE / GRANULE_SIZE) #define GRANULES_PER_REMSET_BYTE (GRANULES_PER_BLOCK / REMSET_BYTES_PER_BLOCK) static uint8_t *object_remset_byte(void *obj) { + ASSERT(!heap_object_is_large(obj)); uintptr_t addr = (uintptr_t) obj; uintptr_t base = addr & ~(SLAB_SIZE - 1); uintptr_t granule = (addr & (SLAB_SIZE - 1)) >> GRANULE_SIZE_LOG_2; @@ -311,8 +319,12 @@ struct mark_space { }; enum gc_kind { - GC_KIND_MARK_IN_PLACE, - GC_KIND_COMPACT + GC_KIND_FLAG_MINOR = GC_GENERATIONAL, // 0 or 1 + GC_KIND_FLAG_EVACUATING = 0x2, + GC_KIND_MINOR_IN_PLACE = GC_KIND_FLAG_MINOR, + GC_KIND_MINOR_EVACUATING = GC_KIND_FLAG_MINOR | GC_KIND_FLAG_EVACUATING, + GC_KIND_MAJOR_IN_PLACE = 0, + GC_KIND_MAJOR_EVACUATING = GC_KIND_FLAG_EVACUATING, }; struct heap { @@ -326,15 +338,19 @@ struct heap { int collecting; enum gc_kind gc_kind; int multithreaded; + int allow_pinning; size_t active_mutator_count; size_t mutator_count; struct handle *global_roots; struct mutator *mutator_trace_list; long count; + long minor_count; struct mutator *deactivated_mutators; struct tracer tracer; double fragmentation_low_threshold; double fragmentation_high_threshold; + double minor_gc_yield_threshold; + double major_gc_yield_threshold; }; struct mutator_mark_buf { @@ -384,6 +400,18 @@ enum gc_reason { static void collect(struct mutator *mut, enum gc_reason reason) NEVER_INLINE; +static int heap_object_is_large(struct gcobj *obj) { + switch (tag_live_alloc_kind(obj->tag)) { +#define IS_LARGE(name, Name, NAME) \ + case ALLOC_KIND_##NAME: \ + return name##_size((Name*)obj) > LARGE_OBJECT_THRESHOLD; + break; + FOR_EACH_HEAP_OBJECT_KIND(IS_LARGE) +#undef IS_LARGE + } + abort(); +} + static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { return object_metadata_byte(obj); } @@ -882,16 +910,30 @@ static void enqueue_mutator_for_tracing(struct mutator *mut) { } static int heap_should_mark_while_stopping(struct heap *heap) { - return atomic_load(&heap->gc_kind) == GC_KIND_MARK_IN_PLACE; -} - -static int mutator_should_mark_while_stopping(struct mutator *mut) { + if (heap->allow_pinning) { + // The metadata byte is mostly used for marking and object extent. + // For marking, we allow updates to race, because the state + // transition space is limited. However during ragged stop there is + // the possibility of races between the marker and updates from the + // mutator to the pinned bit in the metadata byte. 
+ // + // Losing the pinned bit would be bad. Perhaps this means we should + // store the pinned bit elsewhere. Or, perhaps for this reason (and + // in all cases?) markers should use proper synchronization to + // update metadata mark bits instead of racing. But for now it is + // sufficient to simply avoid ragged stops if we allow pins. + return 0; + } // If we are marking in place, we allow mutators to mark their own // stacks before pausing. This is a limited form of concurrent // marking, as other mutators might be running, not having received // the signal to stop yet. We can't do this for a compacting // collection, however, as that would become concurrent evacuation, // which is a different kettle of fish. + return (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_EVACUATING) == 0; +} + +static int mutator_should_mark_while_stopping(struct mutator *mut) { return heap_should_mark_while_stopping(mutator_heap(mut)); } @@ -971,6 +1013,63 @@ static void trace_global_roots(struct heap *heap) { } } +static inline int +heap_object_is_young(struct heap *heap, struct gcobj *obj) { + if (UNLIKELY(!mark_space_contains(heap_mark_space(heap), obj))) { + // No lospace nursery, for the moment. + return 0; + } + ASSERT(!heap_object_is_large(obj)); + return (*object_metadata_byte(obj)) & METADATA_BYTE_YOUNG; +} + +static void mark_space_trace_generational_roots(struct mark_space *space, + struct heap *heap) { + uint8_t live_tenured_mask = space->live_mask; + for (size_t s = 0; s < space->nslabs; s++) { + struct slab *slab = &space->slabs[s]; + uint8_t *remset = slab->remembered_set; + // TODO: Load 8 bytes at a time instead. + for (size_t card = 0; card < REMSET_BYTES_PER_SLAB; card++) { + if (remset[card]) { + remset[card] = 0; + size_t base = card * GRANULES_PER_REMSET_BYTE; + size_t limit = base + GRANULES_PER_REMSET_BYTE; + // We could accelerate this but GRANULES_PER_REMSET_BYTE is 16 + // on 64-bit hosts, so maybe it's not so important. + for (size_t granule = base; granule < limit; granule++) { + if (slab->metadata[granule] & live_tenured_mask) { + struct block *block0 = &slab->blocks[0]; + uintptr_t addr = ((uintptr_t)block0->data) + granule * GRANULE_SIZE; + struct gcobj *obj = (struct gcobj*)addr; + ASSERT(object_metadata_byte(obj) == &slab->metadata[granule]); + tracer_enqueue_root(&heap->tracer, obj); + } + } + // Note that it's quite possible (and even likely) that this + // remset byte doesn't cause any roots, if all stores were to + // nursery objects. + } + } + } +} + +static void mark_space_clear_generational_roots(struct mark_space *space) { + if (!GC_GENERATIONAL) return; + for (size_t slab = 0; slab < space->nslabs; slab++) { + memset(space->slabs[slab].remembered_set, 0, REMSET_BYTES_PER_SLAB); + } +} + +static void trace_generational_roots(struct heap *heap) { + // TODO: Add lospace nursery. 
+ if (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_MINOR) { + mark_space_trace_generational_roots(heap_mark_space(heap), heap); + } else { + mark_space_clear_generational_roots(heap_mark_space(heap)); + } +} + static void pause_mutator_for_collection(struct heap *heap) NEVER_INLINE; static void pause_mutator_for_collection(struct heap *heap) { ASSERT(mutators_are_stopping(heap)); @@ -1080,14 +1179,17 @@ static double heap_fragmentation(struct heap *heap) { return ((double)fragmentation_granules) / heap_granules; } -static void determine_collection_kind(struct heap *heap, - enum gc_reason reason) { +static enum gc_kind determine_collection_kind(struct heap *heap, + enum gc_reason reason) { + enum gc_kind previous_gc_kind = atomic_load(&heap->gc_kind); + enum gc_kind gc_kind; switch (reason) { case GC_REASON_LARGE_ALLOCATION: // We are collecting because a large allocation could not find // enough free blocks, and we decided not to expand the heap. - // Let's evacuate to maximize the free block yield. - heap->gc_kind = GC_KIND_COMPACT; + // Let's do an evacuating major collection to maximize the free + // block yield. + gc_kind = GC_KIND_MAJOR_EVACUATING; break; case GC_REASON_SMALL_ALLOCATION: { // We are making a small allocation and ran out of blocks. @@ -1095,29 +1197,57 @@ static void determine_collection_kind(struct heap *heap, // is measured as a percentage of granules that couldn't be used // for allocations in the last cycle. double fragmentation = heap_fragmentation(heap); - if (atomic_load(&heap->gc_kind) == GC_KIND_COMPACT) { - // For some reason, we already decided to compact in the past. - // Keep going until we measure that wasted space due to - // fragmentation is below a low-water-mark. - if (fragmentation < heap->fragmentation_low_threshold) { - DEBUG("returning to in-place collection, fragmentation %.2f%% < %.2f%%\n", - fragmentation * 100., - heap->fragmentation_low_threshold * 100.); - atomic_store(&heap->gc_kind, GC_KIND_MARK_IN_PLACE); - } + if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING + && fragmentation >= heap->fragmentation_low_threshold) { + DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", + fragmentation * 100., + heap->fragmentation_low_threshold * 100.); + // For some reason, we already decided to compact in the past, + // and fragmentation hasn't yet fallen below a low-water-mark. + // Keep going. + gc_kind = GC_KIND_MAJOR_EVACUATING; + } else if (fragmentation > heap->fragmentation_high_threshold) { + // Switch to evacuation mode if the heap is too fragmented. + DEBUG("triggering compaction due to fragmentation %.2f%% > %.2f%%\n", + fragmentation * 100., + heap->fragmentation_high_threshold * 100.); + gc_kind = GC_KIND_MAJOR_EVACUATING; + } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING) { + // We were evacuating, but we're good now. Go back to minor + // collections. + DEBUG("returning to in-place collection, fragmentation %.2f%% < %.2f%%\n", + fragmentation * 100., + heap->fragmentation_low_threshold * 100.); + gc_kind = GC_KIND_MINOR_IN_PLACE; + } else if (previous_gc_kind != GC_KIND_MINOR_IN_PLACE) { + DEBUG("returning to minor collection after major collection\n"); + // Go back to minor collections. + gc_kind = GC_KIND_MINOR_IN_PLACE; + } else if (heap_last_gc_yield(heap) < heap->major_gc_yield_threshold) { + DEBUG("collection yield too low, triggering major collection\n"); + // Nursery is getting tight; trigger a major GC. 
+ gc_kind = GC_KIND_MAJOR_IN_PLACE; } else { - // Otherwise switch to evacuation mode if the heap is too - // fragmented. - if (fragmentation > heap->fragmentation_high_threshold) { - DEBUG("triggering compaction due to fragmentation %.2f%% > %.2f%%\n", - fragmentation * 100., - heap->fragmentation_high_threshold * 100.); - atomic_store(&heap->gc_kind, GC_KIND_COMPACT); - } + DEBUG("keeping on with minor GC\n"); + // Nursery has adequate space; keep trucking with minor GCs. + ASSERT(previous_gc_kind == GC_KIND_MINOR_IN_PLACE); + gc_kind = GC_KIND_MINOR_IN_PLACE; } break; } } + // If this is the first in a series of minor collections, reset the + // threshold at which we should do a major GC. + if ((gc_kind & GC_KIND_FLAG_MINOR) && + (previous_gc_kind & GC_KIND_FLAG_MINOR) != GC_KIND_FLAG_MINOR) { + double yield = heap_last_gc_yield(heap); + double threshold = yield * heap->minor_gc_yield_threshold; + heap->major_gc_yield_threshold = threshold; + DEBUG("first minor collection at yield %.2f%%, threshold %.2f%%\n", + yield * 100., threshold * 100.); + } + atomic_store(&heap->gc_kind, gc_kind); + return gc_kind; } static void release_evacuation_target_blocks(struct mark_space *space) { @@ -1129,7 +1259,7 @@ static void release_evacuation_target_blocks(struct mark_space *space) { static void prepare_for_evacuation(struct heap *heap) { struct mark_space *space = heap_mark_space(heap); - if (heap->gc_kind == GC_KIND_MARK_IN_PLACE) { + if ((heap->gc_kind & GC_KIND_FLAG_EVACUATING) == 0) { space->evacuating = 0; space->evacuation_reserve = 0.02; return; @@ -1219,12 +1349,15 @@ static void trace_conservative_roots_after_stop(struct heap *heap) { static void trace_precise_roots_after_stop(struct heap *heap) { trace_mutator_roots_after_stop(heap); trace_global_roots(heap); + trace_generational_roots(heap); } -static void mark_space_finish_gc(struct mark_space *space) { +static void mark_space_finish_gc(struct mark_space *space, + enum gc_kind gc_kind) { space->evacuating = 0; reset_sweeper(space); - rotate_mark_bytes(space); + if ((gc_kind & GC_KIND_FLAG_MINOR) == 0) + rotate_mark_bytes(space); reset_statistics(space); release_evacuation_target_blocks(space); } @@ -1238,8 +1371,8 @@ static void collect(struct mutator *mut, enum gc_reason reason) { return; } DEBUG("start collect #%ld:\n", heap->count); - determine_collection_kind(heap, reason); - large_object_space_start_gc(lospace); + enum gc_kind gc_kind = determine_collection_kind(heap, reason); + large_object_space_start_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); tracer_prepare(heap); request_mutators_to_stop(heap); trace_mutator_roots_with_lock_before_stop(mut); @@ -1253,9 +1386,11 @@ static void collect(struct mutator *mut, enum gc_reason reason) { trace_precise_roots_after_stop(heap); tracer_trace(heap); tracer_release(heap); - mark_space_finish_gc(space); - large_object_space_finish_gc(lospace); + mark_space_finish_gc(space, gc_kind); + large_object_space_finish_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); heap->count++; + if (gc_kind & GC_KIND_FLAG_MINOR) + heap->minor_count++; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); allow_mutators_to_continue(heap); DEBUG("collect done\n"); @@ -1652,10 +1787,23 @@ static inline void* allocate_pointerless(struct mutator *mut, return allocate(mut, kind, size); } +static inline void mark_space_write_barrier(void *obj) { + // Unconditionally mark the card the object is in. Precondition: obj + // is in the mark space (is not a large object). 
+ atomic_store_explicit(object_remset_byte(obj), 1, memory_order_relaxed); +} + +// init_field is an optimization for the case in which there is no +// intervening allocation or safepoint between allocating an object and +// setting the value of a field in the object. For the purposes of +// generational collection, we can omit the barrier in that case, +// because we know the source object is in the nursery. It is always +// correct to replace it with set_field. static inline void init_field(void *obj, void **addr, void *val) { *addr = val; } static inline void set_field(void *obj, void **addr, void *val) { + if (GC_GENERATIONAL) mark_space_write_barrier(obj); *addr = val; } @@ -1696,6 +1844,8 @@ static int heap_init(struct heap *heap, size_t size) { heap->fragmentation_low_threshold = 0.05; heap->fragmentation_high_threshold = 0.10; + heap->minor_gc_yield_threshold = 0.30; + heap->major_gc_yield_threshold = heap->minor_gc_yield_threshold; return 1; } From 1358d99abc2673ba865bbd9604712ddb3e79af51 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 2 Aug 2022 22:15:13 +0200 Subject: [PATCH 113/403] Fix yield calculation after evacuating collections --- whippet.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/whippet.h b/whippet.h index 644b4eeb2..cd2c18af0 100644 --- a/whippet.h +++ b/whippet.h @@ -1153,11 +1153,14 @@ static double heap_last_gc_yield(struct heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); size_t mark_space_yield = mark_space->granules_freed_by_last_collection; mark_space_yield <<= GRANULE_SIZE_LOG_2; + size_t evacuation_block_yield = + atomic_load_explicit(&mark_space->evacuation_targets.count, + memory_order_acquire) * BLOCK_SIZE; struct large_object_space *lospace = heap_large_object_space(heap); size_t lospace_yield = lospace->pages_freed_by_last_collection; lospace_yield <<= lospace->page_size_log2; - double yield = mark_space_yield + lospace_yield; + double yield = mark_space_yield + lospace_yield + evacuation_block_yield; return yield / heap->size; } From 0210a8caf0585804bb5c8ea28780b35d2dabc795 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 3 Aug 2022 10:10:33 +0200 Subject: [PATCH 114/403] Refactor out-of-memory detection Firstly, we add a priority evacuation reserve to prioritize having a few evacuation blocks on hand. Otherwise if we give them all to big allocations first and we have a fragmented heap, we won't be able to evacuate that fragmented heap to give more blocks to the large allocations. Secondly, we remove `enum gc_reason`. The issue is that with multiple mutator threads, the precise thread triggering GC does not provide much information. Instead we should make choices on how to collect based on the state of the heap. Finally, we move detection of out-of-memory inside the collector, instead of the allocator. Together, these changes let mt-gcbench (with fragmentation) operate in smaller heaps. 
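The new out-of-memory rule can be read as a small predicate over statistics the heap already tracks. The sketch below paraphrases detect_out_of_memory with illustrative parameters rather than reading struct heap and struct mark_space directly; it returns a flag where the real function aborts, and the real function also skips the check entirely on the very first collection. Treat it as a restatement of the heuristic, not a drop-in.

#include <stdbool.h>
#include <stddef.h>

static bool gc_is_out_of_memory(double last_yield,      // fraction of heap freed by last GC
                                double fragmentation,   // fraction of heap lost to holes
                                size_t heap_size,
                                size_t block_size,
                                size_t large_object_threshold,
                                bool have_evacuation_targets) {
  // Progress smaller than one block is treated as noise.
  double yield_epsilon = (double)block_size / heap_size;
  // Fragmentation below this cannot usefully be reclaimed by evacuation.
  double fragmentation_epsilon = (double)large_object_threshold / block_size;

  if (last_yield - fragmentation > yield_epsilon)
    return false;  // the last collection still freed real space
  if (fragmentation > fragmentation_epsilon && have_evacuation_targets)
    return false;  // an evacuating collection can still recover the holes
  return true;     // no yield and no prospect of defragmentation: give up
}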
--- whippet.h | 328 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 188 insertions(+), 140 deletions(-) diff --git a/whippet.h b/whippet.h index cd2c18af0..63ee0262e 100644 --- a/whippet.h +++ b/whippet.h @@ -309,6 +309,7 @@ struct mark_space { struct block_list empty; struct block_list unavailable; struct block_list evacuation_targets; + double evacuation_minimum_reserve; double evacuation_reserve; ssize_t pending_unavailable_bytes; // atomically struct evacuation_allocator evacuation_allocator; @@ -351,6 +352,7 @@ struct heap { double fragmentation_high_threshold; double minor_gc_yield_threshold; double major_gc_yield_threshold; + double minimum_major_gc_yield_threshold; }; struct mutator_mark_buf { @@ -393,12 +395,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -enum gc_reason { - GC_REASON_SMALL_ALLOCATION, - GC_REASON_LARGE_ALLOCATION -}; - -static void collect(struct mutator *mut, enum gc_reason reason) NEVER_INLINE; +static void collect(struct mutator *mut) NEVER_INLINE; static int heap_object_is_large(struct gcobj *obj) { switch (tag_live_alloc_kind(obj->tag)) { @@ -479,7 +476,8 @@ static void finish_evacuation_allocator_block(uintptr_t block, static void finish_evacuation_allocator(struct evacuation_allocator *alloc, struct block_list *targets, - struct block_list *empties) { + struct block_list *empties, + size_t reserve) { // Blocks that we used for evacuation get returned to the mutator as // sweepable blocks. Blocks that we didn't get to use go to the // empties. @@ -499,12 +497,9 @@ static void finish_evacuation_allocator(struct evacuation_allocator *alloc, ASSERT(block); finish_evacuation_allocator_block(block, allocated); } - while (1) { - uintptr_t block = pop_block(targets); - if (!block) - break; - push_block(empties, block); - } + size_t remaining = atomic_load_explicit(&targets->count, memory_order_acquire); + while (remaining-- > reserve) + push_block(empties, pop_block(targets)); } static struct gcobj *evacuation_allocate(struct mark_space *space, @@ -770,30 +765,40 @@ static uintptr_t pop_empty_block(struct mark_space *space) { return pop_block(&space->empty); } -static void push_empty_block(struct mark_space *space, uintptr_t block) { +static int maybe_push_evacuation_target(struct mark_space *space, + uintptr_t block, double reserve) { ASSERT(!block_summary_has_flag(block_summary_for_addr(block), BLOCK_NEEDS_SWEEP)); - push_block(&space->empty, block); -} - -static int maybe_push_evacuation_target(struct mark_space *space, - uintptr_t block) { size_t targets = atomic_load_explicit(&space->evacuation_targets.count, memory_order_acquire); size_t total = space->nslabs * NONMETA_BLOCKS_PER_SLAB; size_t unavailable = atomic_load_explicit(&space->unavailable.count, memory_order_acquire); - if (targets >= (total - unavailable) * space->evacuation_reserve) + if (targets >= (total - unavailable) * reserve) return 0; - // We reached the end of the allocation cycle and just obtained a - // known-empty block from the empties list. If the last cycle was an - // evacuating collection, put this block back on the list of - // evacuation target blocks. 
push_block(&space->evacuation_targets, block); return 1; } +static int push_evacuation_target_if_needed(struct mark_space *space, + uintptr_t block) { + return maybe_push_evacuation_target(space, block, + space->evacuation_minimum_reserve); +} + +static int push_evacuation_target_if_possible(struct mark_space *space, + uintptr_t block) { + return maybe_push_evacuation_target(space, block, + space->evacuation_reserve); +} + +static void push_empty_block(struct mark_space *space, uintptr_t block) { + ASSERT(!block_summary_has_flag(block_summary_for_addr(block), + BLOCK_NEEDS_SWEEP)); + push_block(&space->empty, block); +} + static ssize_t mark_space_request_release_memory(struct mark_space *space, size_t bytes) { return atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes; @@ -806,6 +811,8 @@ static void mark_space_reacquire_memory(struct mark_space *space, while (pending + BLOCK_SIZE <= 0) { uintptr_t block = pop_unavailable_block(space); ASSERT(block); + if (push_evacuation_target_if_needed(space, block)) + continue; push_empty_block(space, block); pending = atomic_fetch_add(&space->pending_unavailable_bytes, BLOCK_SIZE) + BLOCK_SIZE; @@ -1145,7 +1152,7 @@ static void reset_statistics(struct mark_space *space) { space->fragmentation_granules_since_last_collection = 0; } -static int maybe_grow_heap(struct heap *heap, enum gc_reason reason) { +static int maybe_grow_heap(struct heap *heap) { return 0; } @@ -1156,6 +1163,12 @@ static double heap_last_gc_yield(struct heap *heap) { size_t evacuation_block_yield = atomic_load_explicit(&mark_space->evacuation_targets.count, memory_order_acquire) * BLOCK_SIZE; + size_t minimum_evacuation_block_yield = + heap->size * mark_space->evacuation_minimum_reserve; + if (evacuation_block_yield < minimum_evacuation_block_yield) + evacuation_block_yield = 0; + else + evacuation_block_yield -= minimum_evacuation_block_yield; struct large_object_space *lospace = heap_large_object_space(heap); size_t lospace_yield = lospace->pages_freed_by_last_collection; lospace_yield <<= lospace->page_size_log2; @@ -1166,97 +1179,127 @@ static double heap_last_gc_yield(struct heap *heap) { static double heap_fragmentation(struct heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); - size_t mark_space_blocks = mark_space->nslabs * NONMETA_BLOCKS_PER_SLAB; - mark_space_blocks -= atomic_load(&mark_space->unavailable.count); - size_t mark_space_granules = mark_space_blocks * GRANULES_PER_BLOCK; size_t fragmentation_granules = mark_space->fragmentation_granules_since_last_collection; - - struct large_object_space *lospace = heap_large_object_space(heap); - size_t lospace_pages = lospace->total_pages - lospace->free_pages; - size_t lospace_granules = - lospace_pages << (lospace->page_size_log2 - GRANULE_SIZE_LOG_2); - - size_t heap_granules = mark_space_granules + lospace_granules; + size_t heap_granules = heap->size >> GRANULE_SIZE_LOG_2; return ((double)fragmentation_granules) / heap_granules; } -static enum gc_kind determine_collection_kind(struct heap *heap, - enum gc_reason reason) { +static void detect_out_of_memory(struct heap *heap) { + struct mark_space *mark_space = heap_mark_space(heap); + struct large_object_space *lospace = heap_large_object_space(heap); + + if (heap->count == 0) + return; + + double last_yield = heap_last_gc_yield(heap); + double fragmentation = heap_fragmentation(heap); + + double yield_epsilon = BLOCK_SIZE * 1.0 / heap->size; + double fragmentation_epsilon = LARGE_OBJECT_THRESHOLD * 1.0 / BLOCK_SIZE; + + if (last_yield - 
fragmentation > yield_epsilon) + return; + + if (fragmentation > fragmentation_epsilon + && atomic_load(&mark_space->evacuation_targets.count)) + return; + + // No yield in last gc and we do not expect defragmentation to + // be able to yield more space: out of memory. + fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", + heap->size, mark_space->nslabs); + abort(); +} + +static double clamp_major_gc_yield_threshold(struct heap *heap, + double threshold) { + if (threshold < heap->minimum_major_gc_yield_threshold) + threshold = heap->minimum_major_gc_yield_threshold; + double one_block = BLOCK_SIZE * 1.0 / heap->size; + if (threshold < one_block) + threshold = one_block; + return threshold; +} + +static enum gc_kind determine_collection_kind(struct heap *heap) { + struct mark_space *mark_space = heap_mark_space(heap); enum gc_kind previous_gc_kind = atomic_load(&heap->gc_kind); enum gc_kind gc_kind; - switch (reason) { - case GC_REASON_LARGE_ALLOCATION: - // We are collecting because a large allocation could not find - // enough free blocks, and we decided not to expand the heap. - // Let's do an evacuating major collection to maximize the free - // block yield. - gc_kind = GC_KIND_MAJOR_EVACUATING; - break; - case GC_REASON_SMALL_ALLOCATION: { - // We are making a small allocation and ran out of blocks. - // Evacuate if the heap is "too fragmented", where fragmentation - // is measured as a percentage of granules that couldn't be used - // for allocations in the last cycle. - double fragmentation = heap_fragmentation(heap); - if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING - && fragmentation >= heap->fragmentation_low_threshold) { - DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", - fragmentation * 100., - heap->fragmentation_low_threshold * 100.); - // For some reason, we already decided to compact in the past, - // and fragmentation hasn't yet fallen below a low-water-mark. - // Keep going. - gc_kind = GC_KIND_MAJOR_EVACUATING; - } else if (fragmentation > heap->fragmentation_high_threshold) { - // Switch to evacuation mode if the heap is too fragmented. - DEBUG("triggering compaction due to fragmentation %.2f%% > %.2f%%\n", - fragmentation * 100., - heap->fragmentation_high_threshold * 100.); - gc_kind = GC_KIND_MAJOR_EVACUATING; - } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING) { - // We were evacuating, but we're good now. Go back to minor - // collections. - DEBUG("returning to in-place collection, fragmentation %.2f%% < %.2f%%\n", - fragmentation * 100., - heap->fragmentation_low_threshold * 100.); - gc_kind = GC_KIND_MINOR_IN_PLACE; - } else if (previous_gc_kind != GC_KIND_MINOR_IN_PLACE) { - DEBUG("returning to minor collection after major collection\n"); - // Go back to minor collections. - gc_kind = GC_KIND_MINOR_IN_PLACE; - } else if (heap_last_gc_yield(heap) < heap->major_gc_yield_threshold) { - DEBUG("collection yield too low, triggering major collection\n"); - // Nursery is getting tight; trigger a major GC. - gc_kind = GC_KIND_MAJOR_IN_PLACE; - } else { - DEBUG("keeping on with minor GC\n"); - // Nursery has adequate space; keep trucking with minor GCs. 
- ASSERT(previous_gc_kind == GC_KIND_MINOR_IN_PLACE); - gc_kind = GC_KIND_MINOR_IN_PLACE; - } - break; - } + double yield = heap_last_gc_yield(heap); + double fragmentation = heap_fragmentation(heap); + + if (heap->count == 0) { + DEBUG("first collection is always major\n"); + gc_kind = GC_KIND_MAJOR_IN_PLACE; + } else if (atomic_load_explicit(&mark_space->pending_unavailable_bytes, + memory_order_acquire) > 0) { + // During the last cycle, a large allocation could not find enough + // free blocks, and we decided not to expand the heap. Let's do an + // evacuating major collection to maximize the free block yield. + gc_kind = GC_KIND_MAJOR_EVACUATING; + } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING + && fragmentation >= heap->fragmentation_low_threshold) { + DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", + fragmentation * 100., + heap->fragmentation_low_threshold * 100.); + // For some reason, we already decided to compact in the past, + // and fragmentation hasn't yet fallen below a low-water-mark. + // Keep going. + gc_kind = GC_KIND_MAJOR_EVACUATING; + } else if (fragmentation > heap->fragmentation_high_threshold) { + // Switch to evacuation mode if the heap is too fragmented. + DEBUG("triggering compaction due to fragmentation %.2f%% > %.2f%%\n", + fragmentation * 100., + heap->fragmentation_high_threshold * 100.); + gc_kind = GC_KIND_MAJOR_EVACUATING; + } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING) { + // We were evacuating, but we're good now. Go back to minor + // collections. + DEBUG("returning to in-place collection, fragmentation %.2f%% < %.2f%%\n", + fragmentation * 100., + heap->fragmentation_low_threshold * 100.); + gc_kind = GC_KIND_MINOR_IN_PLACE; + } else if (previous_gc_kind != GC_KIND_MINOR_IN_PLACE) { + DEBUG("returning to minor collection after major collection\n"); + // Go back to minor collections. + gc_kind = GC_KIND_MINOR_IN_PLACE; + } else if (yield < heap->major_gc_yield_threshold) { + DEBUG("collection yield too low, triggering major collection\n"); + // Nursery is getting tight; trigger a major GC. + gc_kind = GC_KIND_MAJOR_IN_PLACE; + } else { + DEBUG("keeping on with minor GC\n"); + // Nursery has adequate space; keep trucking with minor GCs. + ASSERT(previous_gc_kind == GC_KIND_MINOR_IN_PLACE); + gc_kind = GC_KIND_MINOR_IN_PLACE; } + // If this is the first in a series of minor collections, reset the // threshold at which we should do a major GC. if ((gc_kind & GC_KIND_FLAG_MINOR) && (previous_gc_kind & GC_KIND_FLAG_MINOR) != GC_KIND_FLAG_MINOR) { double yield = heap_last_gc_yield(heap); double threshold = yield * heap->minor_gc_yield_threshold; - heap->major_gc_yield_threshold = threshold; + double clamped = clamp_major_gc_yield_threshold(heap, threshold); + heap->major_gc_yield_threshold = clamped; DEBUG("first minor collection at yield %.2f%%, threshold %.2f%%\n", - yield * 100., threshold * 100.); + yield * 100., clamped * 100.); } atomic_store(&heap->gc_kind, gc_kind); return gc_kind; } static void release_evacuation_target_blocks(struct mark_space *space) { - // Move any collected evacuation target blocks back to empties. + // Move excess evacuation target blocks back to empties. 
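// The reserve computed just below is sized against the blocks actually in
// circulation: unavailable blocks (those returned to the OS) are subtracted
// from the slab total before the minimum reserve fraction is applied.  With
// the 2% minimum reserve that mark_space_init establishes, a space with,
// say, 1000 in-circulation blocks keeps roughly 20 of them aside as
// evacuation targets, and finish_evacuation_allocator() hands anything
// beyond that reserve back to the empties list.  (The block count here is
// illustrative, not taken from the patch.)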
+ size_t total = space->nslabs * NONMETA_BLOCKS_PER_SLAB; + size_t unavailable = atomic_load_explicit(&space->unavailable.count, + memory_order_acquire); + size_t reserve = space->evacuation_minimum_reserve * (total - unavailable); finish_evacuation_allocator(&space->evacuation_allocator, - &space->evacuation_targets, &space->empty); + &space->evacuation_targets, &space->empty, + reserve); } static void prepare_for_evacuation(struct heap *heap) { @@ -1264,7 +1307,7 @@ static void prepare_for_evacuation(struct heap *heap) { if ((heap->gc_kind & GC_KIND_FLAG_EVACUATING) == 0) { space->evacuating = 0; - space->evacuation_reserve = 0.02; + space->evacuation_reserve = space->evacuation_minimum_reserve; return; } @@ -1290,21 +1333,27 @@ static void prepare_for_evacuation(struct heap *heap) { const size_t bucket_count = 33; size_t histogram[33] = {0,}; size_t bucket_size = GRANULES_PER_BLOCK / 32; + size_t empties = 0; for (size_t slab = 0; slab < space->nslabs; slab++) { for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { struct block_summary *summary = &space->slabs[slab].summaries[block]; if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) continue; + if (!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) { + empties++; + continue; + } size_t survivor_granules = GRANULES_PER_BLOCK - summary->free_granules; size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; histogram[bucket]++; } } - // Evacuation targets must be in bucket 0. These blocks will later be - // marked also as evacuation candidates, but that's not a problem, - // because they contain no source objects. - ASSERT(histogram[0] >= target_blocks); + // Blocks which lack the NEEDS_SWEEP flag are empty, either because + // they have been removed from the pool and have the UNAVAILABLE flag + // set, or because they are on the empties or evacuation target + // lists. When evacuation starts, the empties list should be empty. + ASSERT(empties == target_blocks); // Now select a number of blocks that is likely to fill the space in // the target blocks. 
Prefer candidate blocks with fewer survivors @@ -1326,6 +1375,8 @@ static void prepare_for_evacuation(struct heap *heap) { struct block_summary *summary = &space->slabs[slab].summaries[block]; if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) continue; + if (!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) + continue; size_t survivor_granules = GRANULES_PER_BLOCK - summary->free_granules; size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; if (histogram[bucket]) { @@ -1365,16 +1416,16 @@ static void mark_space_finish_gc(struct mark_space *space, release_evacuation_target_blocks(space); } -static void collect(struct mutator *mut, enum gc_reason reason) { +static void collect(struct mutator *mut) { struct heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); - if (maybe_grow_heap(heap, reason)) { + if (maybe_grow_heap(heap)) { DEBUG("grew heap instead of collecting #%ld:\n", heap->count); return; } DEBUG("start collect #%ld:\n", heap->count); - enum gc_kind gc_kind = determine_collection_kind(heap, reason); + enum gc_kind gc_kind = determine_collection_kind(heap); large_object_space_start_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); tracer_prepare(heap); request_mutators_to_stop(heap); @@ -1384,6 +1435,7 @@ static void collect(struct mutator *mut, enum gc_reason reason) { double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); + detect_out_of_memory(heap); trace_conservative_roots_after_stop(heap); prepare_for_evacuation(heap); trace_precise_roots_after_stop(heap); @@ -1533,6 +1585,7 @@ static size_t next_hole_in_block(struct mutator *mut) { struct block_summary *summary = block_summary_for_addr(sweep); summary->hole_count++; + ASSERT(free_granules <= GRANULES_PER_BLOCK - summary->free_granules); summary->free_granules += free_granules; size_t free_bytes = free_granules * GRANULE_SIZE; @@ -1566,7 +1619,6 @@ static int maybe_release_swept_empty_block(struct mutator *mut) { memory_order_acquire) <= 0) return 0; - block_summary_clear_flag(block_summary_for_addr(block), BLOCK_NEEDS_SWEEP); push_unavailable_block(space, block); atomic_fetch_sub(&space->pending_unavailable_bytes, BLOCK_SIZE); mut->alloc = mut->sweep = mut->block = 0; @@ -1587,17 +1639,26 @@ static size_t next_hole(struct mutator *mut) { // If the hole spans only part of a block, give it to the mutator. if (granules < GRANULES_PER_BLOCK) return granules; - // Sweeping found a completely empty block. If we have pending - // pages to release to the OS, we should unmap this block. + struct block_summary *summary = block_summary_for_addr(mut->block); + block_summary_clear_flag(summary, BLOCK_NEEDS_SWEEP); + // Sweeping found a completely empty block. If we are below the + // minimum evacuation reserve, take the block. + if (push_evacuation_target_if_needed(space, mut->block)) { + mut->alloc = mut->sweep = mut->block = 0; + continue; + } + // If we have pending pages to release to the OS, we should unmap + // this block. if (maybe_release_swept_empty_block(mut)) continue; // Otherwise if we've already returned lots of empty blocks to the // freelist, give this block to the mutator. - if (!empties_countdown) + if (!empties_countdown) { + // After this block is allocated into, it will need to be swept. 
+ block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); return granules; + } // Otherwise we push to the empty blocks list. - struct block_summary *summary = block_summary_for_addr(mut->block); - block_summary_clear_flag(summary, BLOCK_NEEDS_SWEEP); push_empty_block(space, mut->block); mut->alloc = mut->sweep = mut->block = 0; empties_countdown--; @@ -1640,7 +1701,7 @@ static size_t next_hole(struct mutator *mut) { return 0; // Maybe we should use this empty as a target for evacuation. - if (maybe_push_evacuation_target(space, block)) + if (push_evacuation_target_if_possible(space, block)) continue; // Otherwise return the block to the mutator. @@ -1671,11 +1732,14 @@ static void finish_sweeping(struct mutator *mut) { finish_hole(mut); } -static void out_of_memory(struct mutator *mut) { +static void trigger_collection(struct mutator *mut) { struct heap *heap = mutator_heap(mut); - fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", - heap->size, heap_mark_space(heap)->nslabs); - abort(); + heap_lock(heap); + if (mutators_are_stopping(heap)) + pause_mutator_for_collection_with_lock(mut); + else + collect(mut); + heap_unlock(heap); } static void* allocate_large(struct mutator *mut, enum alloc_kind kind, @@ -1688,16 +1752,9 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, mark_space_request_release_memory(heap_mark_space(heap), npages << space->page_size_log2); - if (!sweep_until_memory_released(mut)) { - heap_lock(heap); - if (mutators_are_stopping(heap)) - pause_mutator_for_collection_with_lock(mut); - else - collect(mut, GC_REASON_LARGE_ALLOCATION); - heap_unlock(heap); - if (!sweep_until_memory_released(mut)) - out_of_memory(mut); - } + + while (!sweep_until_memory_released(mut)) + trigger_collection(mut); atomic_fetch_add(&heap->large_object_pages, npages); void *ret = large_object_space_alloc(space, npages); @@ -1717,27 +1774,14 @@ static void* allocate_small_slow(struct mutator *mut, enum alloc_kind kind, size_t granules) NEVER_INLINE; static void* allocate_small_slow(struct mutator *mut, enum alloc_kind kind, size_t granules) { - int swept_from_beginning = 0; while (1) { size_t hole = next_hole(mut); if (hole >= granules) { clear_memory(mut->alloc, hole * GRANULE_SIZE); break; } - if (!hole) { - struct heap *heap = mutator_heap(mut); - if (swept_from_beginning) { - out_of_memory(mut); - } else { - heap_lock(heap); - if (mutators_are_stopping(heap)) - pause_mutator_for_collection_with_lock(mut); - else - collect(mut, GC_REASON_SMALL_ALLOCATION); - heap_unlock(heap); - swept_from_beginning = 1; - } - } + if (!hole) + trigger_collection(mut); } struct gcobj* ret = (struct gcobj*)mut->alloc; mut->alloc += granules * GRANULE_SIZE; @@ -1848,7 +1892,9 @@ static int heap_init(struct heap *heap, size_t size) { heap->fragmentation_low_threshold = 0.05; heap->fragmentation_high_threshold = 0.10; heap->minor_gc_yield_threshold = 0.30; - heap->major_gc_yield_threshold = heap->minor_gc_yield_threshold; + heap->minimum_major_gc_yield_threshold = 0.05; + heap->major_gc_yield_threshold = + clamp_major_gc_yield_threshold(heap, heap->minor_gc_yield_threshold); return 1; } @@ -1871,7 +1917,8 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { space->low_addr = (uintptr_t) slabs; space->extent = size; space->next_block = 0; - space->evacuation_reserve = 0.02; + space->evacuation_minimum_reserve = 0.02; + space->evacuation_reserve = space->evacuation_minimum_reserve; for (size_t slab = 0; slab < nslabs; slab++) { for (size_t block = 0; block < 
NONMETA_BLOCKS_PER_SLAB; block++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; @@ -1879,7 +1926,8 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { push_unavailable_block(space, addr); size -= BLOCK_SIZE; } else { - push_empty_block(space, addr); + if (!push_evacuation_target_if_needed(space, addr)) + push_empty_block(space, addr); } } } From 96b68095b72a9c11c62dfeed1e031f9ae818df86 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 3 Aug 2022 12:06:19 +0200 Subject: [PATCH 115/403] Fix mark pattern updating for generational whippet After a minor collection, we were erroneously failing to sweep dead objects with the survivor tag. --- whippet.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/whippet.h b/whippet.h index 63ee0262e..b533093f5 100644 --- a/whippet.h +++ b/whippet.h @@ -1032,6 +1032,7 @@ heap_object_is_young(struct heap *heap, struct gcobj *obj) { static void mark_space_trace_generational_roots(struct mark_space *space, struct heap *heap) { + ASSERT(!space->evacuating); uint8_t live_tenured_mask = space->live_mask; for (size_t s = 0; s < space->nslabs; s++) { struct slab *slab = &space->slabs[s]; @@ -1045,7 +1046,7 @@ static void mark_space_trace_generational_roots(struct mark_space *space, // We could accelerate this but GRANULES_PER_REMSET_BYTE is 16 // on 64-bit hosts, so maybe it's not so important. for (size_t granule = base; granule < limit; granule++) { - if (slab->metadata[granule] & live_tenured_mask) { + if (slab->metadata[granule] & space->live_mask) { struct block *block0 = &slab->blocks[0]; uintptr_t addr = ((uintptr_t)block0->data) + granule * GRANULE_SIZE; struct gcobj *obj = (struct gcobj*)addr; @@ -1141,9 +1142,13 @@ static uint64_t broadcast_byte(uint8_t byte) { return result * 0x0101010101010101ULL; } -static void rotate_mark_bytes(struct mark_space *space) { - space->live_mask = rotate_dead_survivor_marked(space->live_mask); - space->marked_mask = rotate_dead_survivor_marked(space->marked_mask); +static void update_mark_patterns(struct mark_space *space, + int advance_mark_mask) { + uint8_t survivor_mask = space->marked_mask; + uint8_t next_marked_mask = rotate_dead_survivor_marked(survivor_mask); + if (advance_mark_mask) + space->marked_mask = next_marked_mask; + space->live_mask = survivor_mask | next_marked_mask; space->sweep_mask = broadcast_byte(space->live_mask); } @@ -1410,8 +1415,7 @@ static void mark_space_finish_gc(struct mark_space *space, enum gc_kind gc_kind) { space->evacuating = 0; reset_sweeper(space); - if ((gc_kind & GC_KIND_FLAG_MINOR) == 0) - rotate_mark_bytes(space); + update_mark_patterns(space, 0); reset_statistics(space); release_evacuation_target_blocks(space); } @@ -1426,6 +1430,7 @@ static void collect(struct mutator *mut) { } DEBUG("start collect #%ld:\n", heap->count); enum gc_kind gc_kind = determine_collection_kind(heap); + update_mark_patterns(space, !(gc_kind & GC_KIND_FLAG_MINOR)); large_object_space_start_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); tracer_prepare(heap); request_mutators_to_stop(heap); @@ -1906,12 +1911,8 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { if (!slabs) return 0; - uint8_t dead = METADATA_BYTE_MARK_0; - uint8_t survived = METADATA_BYTE_MARK_1; - uint8_t marked = METADATA_BYTE_MARK_2; - space->marked_mask = marked; - space->live_mask = survived | marked; - rotate_mark_bytes(space); + space->marked_mask = METADATA_BYTE_MARK_0; + update_mark_patterns(space, 0); space->slabs = slabs; 
space->nslabs = nslabs; space->low_addr = (uintptr_t) slabs; From 8f6a2692ab711907a5552ef67183ce1d077d556c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 3 Aug 2022 12:13:25 +0200 Subject: [PATCH 116/403] Update README --- README.md | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 7bb0ccc24..bab4d80a1 100644 --- a/README.md +++ b/README.md @@ -42,13 +42,9 @@ granule is 6.25% overhead on 64-bit, or 12.5% on 32-bit. Especially on 32-bit, it's a lot! On the other hand, instead of the worst case of one survivor object wasting a line (or two, in the case of conservative line marking), granule-size-is-line-size instead wastes nothing. Also, you -don't need GC bits in the object itself, and you get a number of other -benefits from the mark byte table -- you can also stuff other per-object -data there, such as pinning bits, nursery and remset bits, multiple mark -colors for concurrent marking, and you can also use the mark byte (which -is now a metadata byte) to record the object end, so that finding holes -in a block can just read the mark table and can avoid looking at object -memory. +don't need GC bits in the object itself, and you can use the mark byte +array to record the object end, so that finding holes in a block can +just read the mark table and can avoid looking at object memory. Other ideas in Whippet: @@ -65,10 +61,8 @@ Other ideas in Whippet: * Facilitate conservative collection via mark byte array, oracle for "does this address start an object" - * Enable in-place generational collection via nursery bit in metadata - byte for new allocations, remset bit for objects that should be - traced for nursery roots, and a card table with one entry per 256B or - so; but write barrier and generational trace not yet implemented + * Enable in-place generational collection via card table with one entry + per 256B or so * Enable concurrent marking by having three mark bit states (dead, survivor, marked) that rotate at each collection, and sweeping a @@ -95,6 +89,10 @@ use in optimizing Whippet: processes also are quite short-lived, so perhaps it is useful to ensure that small heaps remain lightweight. + To stress Whippet's handling of fragmentation, we modified this + benchmark to intersperse pseudorandomly-sized holes between tree + nodes. + - [`quads.c`](./quads.c): A synthetic benchmark that allocates quad trees. The mutator begins by allocating one long-lived tree of depth N, and then allocates 13% of the heap in depth-3 trees, 20 times, @@ -110,7 +108,8 @@ situate Whippet's performance in context: mark-sweep segregated-fits collector with lazy sweeping. - `semi.h`: Semispace copying collector. - `whippet.h`: The whippet collector. Two different marking - implementations: single-threaded and parallel. + implementations: single-threaded and parallel. Generational and + non-generational variants, also. ## Guile @@ -154,8 +153,8 @@ large majority of use cases. 
### Features that would improve Whippet performance - [X] Immix-style opportunistic evacuation - - [ ] Overflow allocation - - [ ] Generational GC via sticky mark bits + - ~~[ ] Overflow allocation~~ (should just evacuate instead) + - [X] Generational GC via sticky mark bits - [ ] Generational GC with semi-space nursery - [ ] Concurrent marking with SATB barrier From 47c07dd0eb4e5d1481c112df031d13b98180de5a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 3 Aug 2022 16:40:34 +0200 Subject: [PATCH 117/403] Fix embarassing ctz issue --- whippet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whippet.h b/whippet.h index b533093f5..49b0af9a7 100644 --- a/whippet.h +++ b/whippet.h @@ -1492,7 +1492,7 @@ static inline uint64_t load_mark_bytes(uint8_t *mark) { } static inline size_t count_zero_bytes(uint64_t bytes) { - return bytes ? (__builtin_ctz(bytes) / 8) : sizeof(bytes); + return bytes ? (__builtin_ctzll(bytes) / 8) : sizeof(bytes); } static size_t next_mark(uint8_t *mark, size_t limit, uint64_t sweep_mask) { From 0fe13e1cab84b6e3ab8d38c476b7813a5f5d799b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 3 Aug 2022 21:25:18 +0200 Subject: [PATCH 118/403] Accelerate scanning of remembered set --- whippet.h | 113 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 49 deletions(-) diff --git a/whippet.h b/whippet.h index 49b0af9a7..322848d62 100644 --- a/whippet.h +++ b/whippet.h @@ -1030,39 +1030,73 @@ heap_object_is_young(struct heap *heap, struct gcobj *obj) { return (*object_metadata_byte(obj)) & METADATA_BYTE_YOUNG; } -static void mark_space_trace_generational_roots(struct mark_space *space, - struct heap *heap) { +static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { + ASSERT(((uintptr_t)mark & 7) == 0); + uint8_t * __attribute__((aligned(8))) aligned_mark = mark; + uint64_t word; + memcpy(&word, aligned_mark, 8); +#ifdef WORDS_BIGENDIAN + word = __builtin_bswap64(word); +#endif + return word; +} + +static inline size_t count_zero_bytes(uint64_t bytes) { + return bytes ? (__builtin_ctzll(bytes) / 8) : sizeof(bytes); +} + +static uint64_t broadcast_byte(uint8_t byte) { + uint64_t result = byte; + return result * 0x0101010101010101ULL; +} + +// Note that it's quite possible (and even likely) that any given remset +// byte doesn't hold any roots, if all stores were to nursery objects. 
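// The card scan below works a word at a time: GRANULES_PER_REMSET_BYTE is a
// multiple of eight (asserted next), so a card's worth of metadata is read
// as aligned 8-byte words, each word is ANDed with sweep_mask to keep only
// bytes whose mark state counts as live, and count_zero_bytes() (a ctzll
// divided by eight) yields the index of the first live byte.  That byte is
// then cleared from the word before the next iteration, so tracing a card
// costs one word load per eight granules plus work proportional to the
// number of live objects the card actually holds.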
+STATIC_ASSERT_EQ(GRANULES_PER_REMSET_BYTE % 8, 0); +static void mark_space_trace_card(struct mark_space *space, + struct heap *heap, struct slab *slab, + size_t card) { + uintptr_t first_addr_in_slab = (uintptr_t) &slab->blocks[0]; + size_t granule_base = card * GRANULES_PER_REMSET_BYTE; + for (size_t granule_in_remset = 0; + granule_in_remset < GRANULES_PER_REMSET_BYTE; + granule_in_remset += 8, granule_base += 8) { + uint64_t mark_bytes = load_eight_aligned_bytes(slab->metadata + granule_base); + mark_bytes &= space->sweep_mask; + while (mark_bytes) { + size_t granule_offset = count_zero_bytes(mark_bytes); + mark_bytes &= ~(((uint64_t)0xff) << (granule_offset * 8)); + size_t granule = granule_base + granule_offset; + uintptr_t addr = first_addr_in_slab + granule * GRANULE_SIZE; + struct gcobj *obj = (struct gcobj*)addr; + ASSERT(object_metadata_byte(obj) == &slab->metadata[granule]); + tracer_enqueue_root(&heap->tracer, obj); + } + } +} + +static void mark_space_trace_remembered_set(struct mark_space *space, + struct heap *heap) { ASSERT(!space->evacuating); - uint8_t live_tenured_mask = space->live_mask; for (size_t s = 0; s < space->nslabs; s++) { struct slab *slab = &space->slabs[s]; uint8_t *remset = slab->remembered_set; - // TODO: Load 8 bytes at a time instead. - for (size_t card = 0; card < REMSET_BYTES_PER_SLAB; card++) { - if (remset[card]) { - remset[card] = 0; - size_t base = card * GRANULES_PER_REMSET_BYTE; - size_t limit = base + GRANULES_PER_REMSET_BYTE; - // We could accelerate this but GRANULES_PER_REMSET_BYTE is 16 - // on 64-bit hosts, so maybe it's not so important. - for (size_t granule = base; granule < limit; granule++) { - if (slab->metadata[granule] & space->live_mask) { - struct block *block0 = &slab->blocks[0]; - uintptr_t addr = ((uintptr_t)block0->data) + granule * GRANULE_SIZE; - struct gcobj *obj = (struct gcobj*)addr; - ASSERT(object_metadata_byte(obj) == &slab->metadata[granule]); - tracer_enqueue_root(&heap->tracer, obj); - } - } - // Note that it's quite possible (and even likely) that this - // remset byte doesn't cause any roots, if all stores were to - // nursery objects. + for (size_t card_base = 0; + card_base < REMSET_BYTES_PER_SLAB; + card_base += 8) { + uint64_t remset_bytes = load_eight_aligned_bytes(remset + card_base); + if (!remset_bytes) continue; + memset(remset + card_base, 0, 8); + while (remset_bytes) { + size_t card_offset = count_zero_bytes(remset_bytes); + remset_bytes &= ~(((uint64_t)0xff) << (card_offset * 8)); + mark_space_trace_card(space, heap, slab, card_base + card_offset); } } } } -static void mark_space_clear_generational_roots(struct mark_space *space) { +static void mark_space_clear_remembered_set(struct mark_space *space) { if (!GC_GENERATIONAL) return; for (size_t slab = 0; slab < space->nslabs; slab++) { memset(space->slabs[slab].remembered_set, 0, REMSET_BYTES_PER_SLAB); @@ -1072,9 +1106,9 @@ static void mark_space_clear_generational_roots(struct mark_space *space) { static void trace_generational_roots(struct heap *heap) { // TODO: Add lospace nursery. 
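// Minor collections use the remembered set as their source of old-to-new
// roots, so it is traced (and cleared eight cards at a time as it is
// consumed); major collections trace every live object anyway, so the
// remembered set is simply wiped.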
if (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_MINOR) { - mark_space_trace_generational_roots(heap_mark_space(heap), heap); + mark_space_trace_remembered_set(heap_mark_space(heap), heap); } else { - mark_space_clear_generational_roots(heap_mark_space(heap)); + mark_space_clear_remembered_set(heap_mark_space(heap)); } } @@ -1137,11 +1171,6 @@ static void reset_sweeper(struct mark_space *space) { space->next_block = (uintptr_t) &space->slabs[0].blocks; } -static uint64_t broadcast_byte(uint8_t byte) { - uint64_t result = byte; - return result * 0x0101010101010101ULL; -} - static void update_mark_patterns(struct mark_space *space, int advance_mark_mask) { uint8_t survivor_mask = space->marked_mask; @@ -1480,21 +1509,6 @@ static int sweep_word(uintptr_t *loc, uintptr_t sweep_mask) { return 0; } -static inline uint64_t load_mark_bytes(uint8_t *mark) { - ASSERT(((uintptr_t)mark & 7) == 0); - uint8_t * __attribute__((aligned(8))) aligned_mark = mark; - uint64_t word; - memcpy(&word, aligned_mark, 8); -#ifdef WORDS_BIGENDIAN - word = __builtin_bswap64(word); -#endif - return word; -} - -static inline size_t count_zero_bytes(uint64_t bytes) { - return bytes ? (__builtin_ctzll(bytes) / 8) : sizeof(bytes); -} - static size_t next_mark(uint8_t *mark, size_t limit, uint64_t sweep_mask) { size_t n = 0; // If we have a hole, it is likely to be more that 8 granules long. @@ -1502,7 +1516,7 @@ static size_t next_mark(uint8_t *mark, size_t limit, uint64_t sweep_mask) { // sweep pointer, then we load aligned mark words. size_t unaligned = ((uintptr_t) mark) & 7; if (unaligned) { - uint64_t bytes = load_mark_bytes(mark - unaligned) >> (unaligned * 8); + uint64_t bytes = load_eight_aligned_bytes(mark - unaligned) >> (unaligned * 8); bytes &= sweep_mask; if (bytes) return count_zero_bytes(bytes); @@ -1510,7 +1524,7 @@ static size_t next_mark(uint8_t *mark, size_t limit, uint64_t sweep_mask) { } for(; n < limit; n += 8) { - uint64_t bytes = load_mark_bytes(mark + n); + uint64_t bytes = load_eight_aligned_bytes(mark + n); bytes &= sweep_mask; if (bytes) return n + count_zero_bytes(bytes); @@ -2014,7 +2028,8 @@ static inline void print_start_gc_stats(struct heap *heap) { } static inline void print_end_gc_stats(struct heap *heap) { - printf("Completed %ld collections\n", heap->count); + printf("Completed %ld collections (%ld major)\n", + heap->count, heap->count - heap->minor_count); printf("Heap size with overhead is %zd (%zu slabs)\n", heap->size, heap_mark_space(heap)->nslabs); } From 0450a282dd393cda304e88570ce4221ee3efbc85 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 4 Aug 2022 09:04:27 +0200 Subject: [PATCH 119/403] Skip mostly-tenured blocks during sweep/allocate after minor GC --- whippet.h | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/whippet.h b/whippet.h index 322848d62..466f58778 100644 --- a/whippet.h +++ b/whippet.h @@ -127,8 +127,8 @@ enum block_summary_flag { BLOCK_NEEDS_SWEEP = 0x8, BLOCK_UNAVAILABLE = 0x10, BLOCK_EVACUATE = 0x20, - BLOCK_FLAG_UNUSED_6 = 0x40, - BLOCK_FLAG_UNUSED_7 = 0x80, + BLOCK_VENERABLE = 0x40, + BLOCK_VENERABLE_AFTER_SWEEP = 0x80, BLOCK_FLAG_UNUSED_8 = 0x100, BLOCK_FLAG_UNUSED_9 = 0x200, BLOCK_FLAG_UNUSED_10 = 0x400, @@ -311,6 +311,7 @@ struct mark_space { struct block_list evacuation_targets; double evacuation_minimum_reserve; double evacuation_reserve; + double venerable_threshold; ssize_t pending_unavailable_bytes; // atomically struct evacuation_allocator evacuation_allocator; struct slab *slabs; @@ 
-346,6 +347,7 @@ struct heap { struct mutator *mutator_trace_list; long count; long minor_count; + uint8_t last_collection_was_minor; struct mutator *deactivated_mutators; struct tracer tracer; double fragmentation_low_threshold; @@ -1478,7 +1480,8 @@ static void collect(struct mutator *mut) { mark_space_finish_gc(space, gc_kind); large_object_space_finish_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); heap->count++; - if (gc_kind & GC_KIND_FLAG_MINOR) + heap->last_collection_was_minor = gc_kind & GC_KIND_FLAG_MINOR; + if (heap->last_collection_was_minor) heap->minor_count++; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); allow_mutators_to_continue(heap); @@ -1563,6 +1566,15 @@ static void finish_block(struct mutator *mut) { atomic_fetch_add(&space->fragmentation_granules_since_last_collection, block->fragmentation_granules); + // If this block has mostly survivors, we should avoid sweeping it and + // trying to allocate into it for a minor GC. Sweep it next time to + // clear any garbage allocated in this cycle and mark it as + // "venerable" (i.e., old). + ASSERT(!block_summary_has_flag(block, BLOCK_VENERABLE)); + if (!block_summary_has_flag(block, BLOCK_VENERABLE_AFTER_SWEEP) && + block->free_granules < GRANULES_PER_BLOCK * space->venerable_threshold) + block_summary_set_flag(block, BLOCK_VENERABLE_AFTER_SWEEP); + mut->block = mut->alloc = mut->sweep = 0; } @@ -1693,7 +1705,31 @@ static size_t next_hole(struct mutator *mut) { // unavailable blocks, so skip and get the next block. if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) continue; + if (block_summary_has_flag(summary, BLOCK_VENERABLE)) { + // Skip venerable blocks after a minor GC -- we don't need to + // sweep as they weren't allocated into last cycle, and the + // mark bytes didn't rotate, so we have no cleanup to do; and + // we shouldn't try to allocate into them as it's not worth + // it. Any wasted space is measured as fragmentation. + if (mutator_heap(mut)->last_collection_was_minor) + continue; + else + block_summary_clear_flag(summary, BLOCK_VENERABLE); + } if (block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) { + // Prepare to sweep the block for holes. + mut->alloc = mut->sweep = mut->block = block; + if (block_summary_has_flag(summary, BLOCK_VENERABLE_AFTER_SWEEP)) { + // In the last cycle we noted that this block consists of + // mostly old data. Sweep any garbage, commit the mark as + // venerable, and avoid allocating into it. + block_summary_clear_flag(summary, BLOCK_VENERABLE_AFTER_SWEEP); + if (mutator_heap(mut)->last_collection_was_minor) { + finish_sweeping_in_block(mut); + block_summary_set_flag(summary, BLOCK_VENERABLE); + continue; + } + } // This block was marked in the last GC and needs sweeping. // As we sweep we'll want to record how many bytes were live // at the last collection. As we allocate we'll record how @@ -1702,8 +1738,6 @@ static size_t next_hole(struct mutator *mut) { summary->free_granules = 0; summary->holes_with_fragmentation = 0; summary->fragmentation_granules = 0; - // Prepare to sweep the block for holes. 
- mut->alloc = mut->sweep = mut->block = block; break; } else { // Otherwise this block is completely empty and is on the @@ -1934,6 +1968,7 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { space->next_block = 0; space->evacuation_minimum_reserve = 0.02; space->evacuation_reserve = space->evacuation_minimum_reserve; + space->venerable_threshold = 0.1; for (size_t slab = 0; slab < nslabs; slab++) { for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; From 67f9c89f2a96f1c4ea52dcec161554dcf921f692 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 4 Aug 2022 11:32:06 +0200 Subject: [PATCH 120/403] Use fragmentation_low_threshold for venerable_threshold This way fragmentation from venerable blocks doesn't cause the collector to keep evacuating. --- whippet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whippet.h b/whippet.h index 466f58778..f8232bf44 100644 --- a/whippet.h +++ b/whippet.h @@ -1968,7 +1968,7 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { space->next_block = 0; space->evacuation_minimum_reserve = 0.02; space->evacuation_reserve = space->evacuation_minimum_reserve; - space->venerable_threshold = 0.1; + space->venerable_threshold = heap->fragmentation_low_threshold; for (size_t slab = 0; slab < nslabs; slab++) { for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; From c824f17bd98cc696a0fb8f3e8e4a00aa6ab1f254 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Aug 2022 11:08:36 +0200 Subject: [PATCH 121/403] Rename gc-types.h to gc-api.h --- gc-types.h => gc-api.h | 0 gc.h | 2 +- heap-objects.h | 2 +- serial-tracer.h | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename gc-types.h => gc-api.h (100%) diff --git a/gc-types.h b/gc-api.h similarity index 100% rename from gc-types.h rename to gc-api.h diff --git a/gc.h b/gc.h index 931d86636..1de10afdd 100644 --- a/gc.h +++ b/gc.h @@ -1,7 +1,7 @@ #ifndef GC_H_ #define GC_H_ -#include "gc-types.h" +#include "gc-api.h" #if defined(GC_BDW) #include "bdw.h" diff --git a/heap-objects.h b/heap-objects.h index 44e282cc1..ea84d2b84 100644 --- a/heap-objects.h +++ b/heap-objects.h @@ -2,7 +2,7 @@ #define HEAP_OBJECTS_H #include "inline.h" -#include "gc-types.h" +#include "gc-api.h" #define DECLARE_NODE_TYPE(name, Name, NAME) \ struct Name; \ diff --git a/serial-tracer.h b/serial-tracer.h index 6b861a471..81664ddf3 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -6,7 +6,7 @@ #include "assert.h" #include "debug.h" -#include "gc-types.h" +#include "gc-api.h" struct gcobj; From 2e6dde66b3e0b7bf27d468cd4e37893133a144cd Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 9 Aug 2022 09:49:51 +0200 Subject: [PATCH 122/403] Attempt to start creating a proper API --- gc-api.h | 70 +++++++++++++++++++++++++++++++++++------------ mt-gcbench.c | 4 +-- parallel-tracer.h | 3 +- quads.c | 2 +- semi.h | 9 +++--- serial-tracer.h | 3 +- whippet.h | 41 ++++++++++++++------------- 7 files changed, 87 insertions(+), 45 deletions(-) diff --git a/gc-api.h b/gc-api.h index 4779cbd2c..6c072bc31 100644 --- a/gc-api.h +++ b/gc-api.h @@ -1,26 +1,62 @@ -#ifndef GC_TYPES_H_ -#define GC_TYPES_H_ +#ifndef GC_API_H_ +#define GC_API_H_ + +#include + +#ifndef GC_DEBUG +#define GC_DEBUG 0 +#endif + +#define GC_UNLIKELY(e) __builtin_expect(e, 0) +#define GC_LIKELY(e) __builtin_expect(e, 1) + +#if GC_DEBUG +#define GC_ASSERT(x) do { if 
(GC_UNLIKELY(!(x))) __builtin_trap(); } while (0) +#else +#define GC_ASSERT(x) do { } while (0) +#endif + +struct gc_ref { + uintptr_t value; +}; + +static inline struct gc_ref gc_ref(uintptr_t value) { + return (struct gc_ref){value}; +} +static inline uintptr_t gc_ref_value(struct gc_ref ref) { + return ref.value; +} + +static inline struct gc_ref gc_ref_null(void) { + return gc_ref(0); +} +static inline int gc_ref_is_heap_object(struct gc_ref ref) { + return ref.value != 0; +} +static inline struct gc_ref gc_ref_from_heap_object_or_null(void *obj) { + return gc_ref((uintptr_t) obj); +} +static inline struct gc_ref gc_ref_from_heap_object(void *obj) { + GC_ASSERT(obj); + return gc_ref_from_heap_object_or_null(obj); +} +static inline void* gc_ref_heap_object(struct gc_ref ref) { + GC_ASSERT(gc_ref_is_heap_object(ref)); + return (void *) gc_ref_value(ref); +} struct gc_edge { - union { - void *addr; - void **loc; - }; + struct gc_ref *dst; }; static inline struct gc_edge gc_edge(void* addr) { - struct gc_edge edge; - edge.addr = addr; - return edge; + return (struct gc_edge){addr}; } -static inline struct gc_edge object_field(void* addr) { - return gc_edge(addr); +static struct gc_ref gc_edge_ref(struct gc_edge edge) { + return *edge.dst; } -static inline void* dereference_edge(struct gc_edge edge) { - return *edge.loc; -} -static inline void update_edge(struct gc_edge edge, void *value) { - *edge.loc = value; +static inline void gc_edge_update(struct gc_edge edge, struct gc_ref ref) { + *edge.dst = ref; } -#endif // GC_TYPES_H_ +#endif // GC_API_H_ diff --git a/mt-gcbench.c b/mt-gcbench.c index 1819655fb..4d1c92255 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -88,8 +88,8 @@ static inline void visit_node_fields(Node *node, void (*visit)(struct gc_edge edge, void *visit_data), void *visit_data) { - visit(object_field(&node->left), visit_data); - visit(object_field(&node->right), visit_data); + visit(gc_edge(&node->left), visit_data); + visit(gc_edge(&node->right), visit_data); } static inline void visit_double_array_fields(DoubleArray *obj, diff --git a/parallel-tracer.h b/parallel-tracer.h index 02f641352..92c0a86d4 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -471,7 +471,8 @@ tracer_visit(struct gc_edge edge, void *trace_data) { if (trace_edge(trace->heap, edge)) { if (local_trace_queue_full(&trace->local)) tracer_share(trace); - local_trace_queue_push(&trace->local, dereference_edge(edge)); + local_trace_queue_push(&trace->local, + gc_ref_heap_object(gc_edge_ref(edge))); } } diff --git a/quads.c b/quads.c index 9743b88a8..d2cba2bf0 100644 --- a/quads.c +++ b/quads.c @@ -18,7 +18,7 @@ visit_quad_fields(Quad *quad, void (*visit)(struct gc_edge edge, void *visit_data), void *visit_data) { for (size_t i = 0; i < 4; i++) - visit(object_field(&quad->kids[i]), visit_data); + visit(gc_edge(&quad->kids[i]), visit_data); } typedef HANDLE_TO(Quad) QuadHandle; diff --git a/semi.h b/semi.h index e2769fe7f..7cd776398 100644 --- a/semi.h +++ b/semi.h @@ -144,7 +144,7 @@ static void* forward(struct semi_space *space, void *obj) { static void visit_semi_space(struct heap *heap, struct semi_space *space, struct gc_edge edge, void *obj) { - update_edge(edge, forward(space, obj)); + gc_edge_update(edge, gc_ref_from_heap_object(forward(space, obj))); } static void visit_large_object_space(struct heap *heap, @@ -160,10 +160,11 @@ static int semi_space_contains(struct semi_space *space, void *obj) { static void visit(struct gc_edge edge, void *visit_data) { struct heap *heap = visit_data; - 
void *obj = dereference_edge(edge); - if (obj == NULL) + struct gc_ref ref = gc_edge_ref(edge); + if (!gc_ref_is_heap_object(ref)) return; - else if (semi_space_contains(heap_semi_space(heap), obj)) + void *obj = gc_ref_heap_object(ref); + if (semi_space_contains(heap_semi_space(heap), obj)) visit_semi_space(heap, heap_semi_space(heap), edge, obj); else if (large_object_space_contains(heap_large_object_space(heap), obj)) visit_large_object_space(heap, heap_large_object_space(heap), obj); diff --git a/serial-tracer.h b/serial-tracer.h index 81664ddf3..3376d9608 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -156,7 +156,8 @@ static inline void tracer_visit(struct gc_edge edge, void *trace_data) { struct heap *heap = trace_data; if (trace_edge(heap, edge)) - tracer_enqueue_root(heap_tracer(heap), dereference_edge(edge)); + tracer_enqueue_root(heap_tracer(heap), + gc_ref_heap_object(gc_edge_ref(edge))); } static inline void tracer_trace(struct heap *heap) { diff --git a/whippet.h b/whippet.h index f8232bf44..93936aaae 100644 --- a/whippet.h +++ b/whippet.h @@ -423,8 +423,8 @@ static size_t mark_space_live_object_granules(uint8_t *metadata) { } static inline int mark_space_mark_object(struct mark_space *space, - struct gc_edge edge) { - struct gcobj *obj = dereference_edge(edge); + struct gc_ref ref) { + struct gcobj *obj = gc_ref_heap_object(ref); uint8_t *loc = object_metadata_byte(obj); uint8_t byte = *loc; if (byte & space->marked_mask) @@ -567,8 +567,9 @@ static struct gcobj *evacuation_allocate(struct mark_space *space, } static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, - struct gc_edge edge) { - struct gcobj *obj = dereference_edge(edge); + struct gc_edge edge, + struct gc_ref old_ref) { + struct gcobj *obj = gc_ref_heap_object(old_ref); uint8_t *metadata = object_metadata_byte(obj); uint8_t byte = *metadata; if (byte & space->marked_mask) @@ -588,8 +589,7 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, // The object has been evacuated already. Update the edge; // whoever forwarded the object will make sure it's eventually // traced. - struct gcobj *forwarded = (struct gcobj*) header_word; - update_edge(edge, forwarded); + gc_edge_update(edge, gc_ref(header_word)); return 0; } // Otherwise try to claim it for evacuation. @@ -613,7 +613,7 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, object_granules * GRANULE_SIZE - sizeof(header_word)); uint8_t *new_metadata = object_metadata_byte(new_obj); memcpy(new_metadata + 1, metadata + 1, object_granules - 1); - update_edge(edge, new_obj); + gc_edge_update(edge, gc_ref_from_heap_object(new_obj)); obj = new_obj; metadata = new_metadata; // Fall through to set mark bits. @@ -634,10 +634,8 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, break; yield_for_spin(spin_count); } - if ((header_word & gcobj_not_forwarded_bit) == 0) { - struct gcobj *forwarded = (struct gcobj*) header_word; - update_edge(edge, forwarded); - } + if ((header_word & gcobj_not_forwarded_bit) == 0) + gc_edge_update(edge, gc_ref(header_word)); // Either way, the other party is responsible for adding the // object to the mark queue. 
return 0; @@ -661,13 +659,15 @@ static inline int large_object_space_mark_object(struct large_object_space *spac } static inline int trace_edge(struct heap *heap, struct gc_edge edge) { - struct gcobj *obj = dereference_edge(edge); - if (!obj) + struct gc_ref ref = gc_edge_ref(edge); + if (!gc_ref_is_heap_object(ref)) return 0; - else if (LIKELY(mark_space_contains(heap_mark_space(heap), obj))) { + struct gcobj *obj = gc_ref_heap_object(ref); + if (LIKELY(mark_space_contains(heap_mark_space(heap), obj))) { if (heap_mark_space(heap)->evacuating) - return mark_space_evacuate_or_mark_object(heap_mark_space(heap), edge); - return mark_space_mark_object(heap_mark_space(heap), edge); + return mark_space_evacuate_or_mark_object(heap_mark_space(heap), edge, + ref); + return mark_space_mark_object(heap_mark_space(heap), ref); } else if (large_object_space_contains(heap_large_object_space(heap), obj)) return large_object_space_mark_object(heap_large_object_space(heap), @@ -955,7 +955,8 @@ static void mark_stopping_mutator_roots(struct mutator *mut) { for (struct handle *h = mut->roots; h; h = h->next) { struct gc_edge root = gc_edge(&h->v); if (trace_edge(heap, root)) - mutator_mark_buf_push(local_roots, dereference_edge(root)); + mutator_mark_buf_push(local_roots, + gc_ref_heap_object(gc_edge_ref(root))); } } @@ -965,7 +966,8 @@ static void mark_mutator_roots_with_lock(struct mutator *mut) { for (struct handle *h = mut->roots; h; h = h->next) { struct gc_edge root = gc_edge(&h->v); if (trace_edge(heap, root)) - tracer_enqueue_root(&heap->tracer, dereference_edge(root)); + tracer_enqueue_root(&heap->tracer, + gc_ref_heap_object(gc_edge_ref(root))); } } @@ -1018,7 +1020,8 @@ static void trace_global_roots(struct heap *heap) { for (struct handle *h = heap->global_roots; h; h = h->next) { struct gc_edge edge = gc_edge(&h->v); if (trace_edge(heap, edge)) - tracer_enqueue_root(&heap->tracer, dereference_edge(edge)); + tracer_enqueue_root(&heap->tracer, + gc_ref_heap_object(gc_edge_ref(edge))); } } From 4ccb489869268dfd6f05b452f39bbcaa2b5ddb9b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 9 Aug 2022 11:21:02 +0200 Subject: [PATCH 123/403] Set fixed heap size, parallelism via explicit options --- bdw.h | 87 +++++++++++++++++++++++++++++++++++++++++++---- gc-api.h | 23 +++++++++++++ mt-gcbench.c | 14 ++++++-- parallel-tracer.h | 11 ++---- quads.c | 14 ++++++-- semi.h | 74 ++++++++++++++++++++++++++++++++++++++-- serial-tracer.h | 2 +- whippet.h | 80 +++++++++++++++++++++++++++++++++++++++---- 8 files changed, 275 insertions(+), 30 deletions(-) diff --git a/bdw.h b/bdw.h index 69a147b56..2d30fb3b6 100644 --- a/bdw.h +++ b/bdw.h @@ -1,4 +1,5 @@ #include +#include #include "conservative-roots.h" @@ -120,17 +121,89 @@ static inline struct heap *mutator_heap(struct mutator *mutator) { return mutator->heap; } -static int initialize_gc(size_t heap_size, struct heap **heap, - struct mutator **mutator) { +#define FOR_EACH_GC_OPTION(M) \ + M(GC_OPTION_FIXED_HEAP_SIZE, "fixed-heap-size") \ + M(GC_OPTION_PARALLELISM, "parallelism") + +static void dump_available_gc_options(void) { + fprintf(stderr, "available gc options:"); +#define PRINT_OPTION(option, name) fprintf(stderr, " %s", name); + FOR_EACH_GC_OPTION(PRINT_OPTION) +#undef PRINT_OPTION + fprintf(stderr, "\n"); +} + +static int gc_option_from_string(const char *str) { +#define PARSE_OPTION(option, name) if (strcmp(str, name) == 0) return option; + FOR_EACH_GC_OPTION(PARSE_OPTION) +#undef PARSE_OPTION + if (strcmp(str, "fixed-heap-size") == 0) + return 
GC_OPTION_FIXED_HEAP_SIZE; + if (strcmp(str, "parallelism") == 0) + return GC_OPTION_PARALLELISM; + fprintf(stderr, "bad gc option: '%s'\n", str); + dump_available_gc_options(); + return -1; +} + +struct options { + size_t fixed_heap_size; + size_t parallelism; +}; + +static size_t parse_size_t(double value) { + ASSERT(value >= 0); + ASSERT(value <= (size_t) -1); + return value; +} + +static size_t number_of_current_processors(void) { return 1; } + +static int parse_options(int argc, struct gc_option argv[], + struct options *options) { + for (int i = 0; i < argc; i++) { + switch (argv[i].option) { + case GC_OPTION_FIXED_HEAP_SIZE: + options->fixed_heap_size = parse_size_t(argv[i].value); + break; + case GC_OPTION_PARALLELISM: + options->parallelism = parse_size_t(argv[i].value); + break; + default: + abort(); + } + } + + if (!options->fixed_heap_size) { + fprintf(stderr, "fixed heap size is currently required\n"); + return 0; + } + if (!options->parallelism) + options->parallelism = number_of_current_processors(); + + return 1; +} + +static int gc_init(int argc, struct gc_option argv[], + struct heap **heap, struct mutator **mutator) { + struct options options = { 0, }; + if (!parse_options(argc, argv, &options)) + return 0; + // GC_full_freq = 30; // GC_free_space_divisor = 16; // GC_enable_incremental(); - GC_INIT(); + + GC_set_max_heap_size(options.fixed_heap_size); + // Not part of 7.3, sigh. Have to set an env var. + // GC_set_markers_count(options.parallelism); + char markers[21] = {0,}; // 21 bytes enough for 2**64 in decimal + NUL. + snprintf(markers, sizeof(markers), "%zu", options.parallelism); + setenv("GC_MARKERS", markers, 1); + GC_init(); size_t current_heap_size = GC_get_heap_size(); - if (heap_size > current_heap_size) { - GC_set_max_heap_size (heap_size); - GC_expand_hp(heap_size - current_heap_size); - } + if (options.fixed_heap_size > current_heap_size) + GC_expand_hp(options.fixed_heap_size - current_heap_size); GC_allow_register_threads(); *heap = GC_malloc(sizeof(struct heap)); pthread_mutex_init(&(*heap)->lock, NULL); diff --git a/gc-api.h b/gc-api.h index 6c072bc31..7d6eead51 100644 --- a/gc-api.h +++ b/gc-api.h @@ -59,4 +59,27 @@ static inline void gc_edge_update(struct gc_edge edge, struct gc_ref ref) { *edge.dst = ref; } +// FIXME: prefix with gc_ +struct heap; +struct mutator; + +enum { + GC_OPTION_FIXED_HEAP_SIZE, + GC_OPTION_PARALLELISM +}; + +struct gc_option { + int option; + double value; +}; + +// FIXME: Conflict with bdw-gc GC_API. Switch prefix? 
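// GC_API_ (defined just below) defaults to `static` because each collector
// is textually included as a header (gc.h selects bdw.h, semi.h, or
// whippet.h), so these "API" entry points compile as ordinary internal
// definitions unless the embedder pre-defines GC_API_ to something else.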
+#ifndef GC_API_ +#define GC_API_ static +#endif + +GC_API_ int gc_option_from_string(const char *str); +GC_API_ int gc_init(int argc, struct gc_option argv[], + struct heap **heap, struct mutator **mutator); + #endif // GC_API_H_ diff --git a/mt-gcbench.c b/mt-gcbench.c index 4d1c92255..23fb235c2 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -380,13 +380,14 @@ int main(int argc, char *argv[]) { tree_size(long_lived_tree_depth) * sizeof_node + tree_size(max_tree_depth) * sizeof_node + sizeof_double_array + sizeof(double) * array_size; - if (argc != 3) { - fprintf(stderr, "usage: %s MULTIPLIER NTHREADS\n", argv[0]); + if (argc != 4) { + fprintf(stderr, "usage: %s MULTIPLIER NTHREADS PARALLELISM\n", argv[0]); return 1; } double multiplier = atof(argv[1]); size_t nthreads = atol(argv[2]); + size_t parallelism = atol(argv[3]); if (!(0.1 < multiplier && multiplier < 100)) { fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[1]); @@ -397,11 +398,18 @@ int main(int argc, char *argv[]) { (int)MAX_THREAD_COUNT, argv[2]); return 1; } + if (parallelism < 1 || parallelism > MAX_THREAD_COUNT) { + fprintf(stderr, "Expected integer between 1 and %d for parallelism, got '%s'\n", + (int)MAX_THREAD_COUNT, argv[3]); + return 1; + } size_t heap_size = heap_max_live * multiplier * nthreads; + struct gc_option options[] = { { GC_OPTION_FIXED_HEAP_SIZE, heap_size }, + { GC_OPTION_PARALLELISM, parallelism } }; struct heap *heap; struct mutator *mut; - if (!initialize_gc(heap_size, &heap, &mut)) { + if (!gc_init(sizeof options / sizeof options[0], options, &heap, &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); return 1; diff --git a/parallel-tracer.h b/parallel-tracer.h index 92c0a86d4..0634c91ec 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -325,8 +325,6 @@ struct local_tracer { struct context; static inline struct tracer* heap_tracer(struct heap *heap); -static size_t number_of_current_processors(void) { return 1; } - static int trace_worker_init(struct trace_worker *worker, struct heap *heap, struct tracer *tracer, size_t id) { @@ -416,18 +414,15 @@ trace_worker_request_stop(struct trace_worker *worker) { } static int -tracer_init(struct heap *heap) { +tracer_init(struct heap *heap, size_t parallelism) { struct tracer *tracer = heap_tracer(heap); atomic_init(&tracer->active_tracers, 0); atomic_init(&tracer->running_tracers, 0); tracer->count = 0; pthread_mutex_init(&tracer->lock, NULL); pthread_cond_init(&tracer->cond, NULL); - size_t desired_worker_count = 0; - if (getenv("GC_TRACERS")) - desired_worker_count = atoi(getenv("GC_TRACERS")); - if (desired_worker_count == 0) - desired_worker_count = number_of_current_processors(); + size_t desired_worker_count = parallelism; + ASSERT(desired_worker_count); if (desired_worker_count > TRACE_WORKERS_MAX_COUNT) desired_worker_count = TRACE_WORKERS_MAX_COUNT; for (size_t i = 0; i < desired_worker_count; i++) { diff --git a/quads.c b/quads.c index d2cba2bf0..9c6dedfaf 100644 --- a/quads.c +++ b/quads.c @@ -103,20 +103,27 @@ static size_t tree_size(size_t depth) { return nquads; } +#define MAX_THREAD_COUNT 256 int main(int argc, char *argv[]) { if (argc != 3) { - fprintf(stderr, "usage: %s DEPTH MULTIPLIER\n", argv[0]); + fprintf(stderr, "usage: %s DEPTH MULTIPLIER PARALLELISM\n", argv[0]); return 1; } size_t depth = parse_size(argv[1], "depth"); double multiplier = atof(argv[2]); + size_t parallelism = atol(argv[3]); if (!(1.0 < multiplier && multiplier < 100)) { fprintf(stderr, "Failed to parse heap 
multiplier '%s'\n", argv[2]); return 1; } + if (parallelism < 1 || parallelism > MAX_THREAD_COUNT) { + fprintf(stderr, "Expected integer between 1 and %d for parallelism, got '%s'\n", + (int)MAX_THREAD_COUNT, argv[3]); + return 1; + } // Compute byte size not counting any header word, so as to compute the same // heap size whether a header word is there or not. @@ -127,9 +134,12 @@ int main(int argc, char *argv[]) { unsigned long gc_start = current_time(); printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", heap_size / 1e9, multiplier); + + struct gc_option options[] = { { GC_OPTION_FIXED_HEAP_SIZE, heap_size }, + { GC_OPTION_PARALLELISM, parallelism } }; struct heap *heap; struct mutator *mut; - if (!initialize_gc(heap_size, &heap, &mut)) { + if (!gc_init(sizeof options / sizeof options[0], options, &heap, &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); return 1; diff --git a/semi.h b/semi.h index 7cd776398..0f6071266 100644 --- a/semi.h +++ b/semi.h @@ -285,14 +285,82 @@ static int initialize_semi_space(struct semi_space *space, size_t size) { return 1; } -static int initialize_gc(size_t heap_size, struct heap **heap, - struct mutator **mut) { +#define FOR_EACH_GC_OPTION(M) \ + M(GC_OPTION_FIXED_HEAP_SIZE, "fixed-heap-size") \ + M(GC_OPTION_PARALLELISM, "parallelism") + +static void dump_available_gc_options(void) { + fprintf(stderr, "available gc options:"); +#define PRINT_OPTION(option, name) fprintf(stderr, " %s", name); + FOR_EACH_GC_OPTION(PRINT_OPTION) +#undef PRINT_OPTION + fprintf(stderr, "\n"); +} + +static int gc_option_from_string(const char *str) { +#define PARSE_OPTION(option, name) if (strcmp(str, name) == 0) return option; + FOR_EACH_GC_OPTION(PARSE_OPTION) +#undef PARSE_OPTION + if (strcmp(str, "fixed-heap-size") == 0) + return GC_OPTION_FIXED_HEAP_SIZE; + if (strcmp(str, "parallelism") == 0) + return GC_OPTION_PARALLELISM; + fprintf(stderr, "bad gc option: '%s'\n", str); + dump_available_gc_options(); + return -1; +} + +struct options { + size_t fixed_heap_size; + size_t parallelism; +}; + +static size_t parse_size_t(double value) { + ASSERT(value >= 0); + ASSERT(value <= (size_t) -1); + return value; +} + +static int parse_options(int argc, struct gc_option argv[], + struct options *options) { + options->parallelism = 1; + for (int i = 0; i < argc; i++) { + switch (argv[i].option) { + case GC_OPTION_FIXED_HEAP_SIZE: + options->fixed_heap_size = parse_size_t(argv[i].value); + break; + case GC_OPTION_PARALLELISM: + options->parallelism = parse_size_t(argv[i].value); + break; + default: + abort(); + } + } + + if (!options->fixed_heap_size) { + fprintf(stderr, "fixed heap size is currently required\n"); + return 0; + } + if (options->parallelism != 1) { + fprintf(stderr, "parallelism unimplemented in semispace copying collector\n"); + return 0; + } + + return 1; +} + +static int gc_init(int argc, struct gc_option argv[], + struct heap **heap, struct mutator **mut) { + struct options options = { 0, }; + if (!parse_options(argc, argv, &options)) + return 0; + *mut = calloc(1, sizeof(struct mutator)); if (!*mut) abort(); *heap = mutator_heap(*mut); struct semi_space *space = mutator_semi_space(*mut); - if (!initialize_semi_space(space, heap_size)) + if (!initialize_semi_space(space, options.fixed_heap_size)) return 0; if (!large_object_space_init(heap_large_object_space(*heap), *heap)) return 0; diff --git a/serial-tracer.h b/serial-tracer.h index 3376d9608..474de34d7 100644 --- a/serial-tracer.h +++ 
b/serial-tracer.h @@ -129,7 +129,7 @@ struct heap; static inline struct tracer* heap_tracer(struct heap *heap); static int -tracer_init(struct heap *heap) { +tracer_init(struct heap *heap, size_t parallelism) { return trace_queue_init(&heap_tracer(heap)->queue); } static void tracer_prepare(struct heap *heap) {} diff --git a/whippet.h b/whippet.h index 93936aaae..b100ca04f 100644 --- a/whippet.h +++ b/whippet.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "assert.h" @@ -1910,6 +1911,69 @@ static inline void set_field(void *obj, void **addr, void *val) { *addr = val; } +#define FOR_EACH_GC_OPTION(M) \ + M(GC_OPTION_FIXED_HEAP_SIZE, "fixed-heap-size") \ + M(GC_OPTION_PARALLELISM, "parallelism") + +static void dump_available_gc_options(void) { + fprintf(stderr, "available gc options:"); +#define PRINT_OPTION(option, name) fprintf(stderr, " %s", name); + FOR_EACH_GC_OPTION(PRINT_OPTION) +#undef PRINT_OPTION + fprintf(stderr, "\n"); +} + +static int gc_option_from_string(const char *str) { +#define PARSE_OPTION(option, name) if (strcmp(str, name) == 0) return option; + FOR_EACH_GC_OPTION(PARSE_OPTION) +#undef PARSE_OPTION + if (strcmp(str, "fixed-heap-size") == 0) + return GC_OPTION_FIXED_HEAP_SIZE; + if (strcmp(str, "parallelism") == 0) + return GC_OPTION_PARALLELISM; + fprintf(stderr, "bad gc option: '%s'\n", str); + dump_available_gc_options(); + return -1; +} + +struct options { + size_t fixed_heap_size; + size_t parallelism; +}; + +static size_t parse_size_t(double value) { + ASSERT(value >= 0); + ASSERT(value <= (size_t) -1); + return value; +} + +static size_t number_of_current_processors(void) { return 1; } + +static int parse_options(int argc, struct gc_option argv[], + struct options *options) { + for (int i = 0; i < argc; i++) { + switch (argv[i].option) { + case GC_OPTION_FIXED_HEAP_SIZE: + options->fixed_heap_size = parse_size_t(argv[i].value); + break; + case GC_OPTION_PARALLELISM: + options->parallelism = parse_size_t(argv[i].value); + break; + default: + abort(); + } + } + + if (!options->fixed_heap_size) { + fprintf(stderr, "fixed heap size is currently required\n"); + return 0; + } + if (!options->parallelism) + options->parallelism = number_of_current_processors(); + + return 1; +} + static struct slab* allocate_slabs(size_t nslabs) { size_t size = nslabs * SLAB_SIZE; size_t extent = size + SLAB_SIZE; @@ -1934,15 +1998,15 @@ static struct slab* allocate_slabs(size_t nslabs) { return (struct slab*) aligned_base; } -static int heap_init(struct heap *heap, size_t size) { +static int heap_init(struct heap *heap, struct options *options) { // *heap is already initialized to 0. 
pthread_mutex_init(&heap->lock, NULL); pthread_cond_init(&heap->mutator_cond, NULL); pthread_cond_init(&heap->collector_cond, NULL); - heap->size = size; + heap->size = options->fixed_heap_size; - if (!tracer_init(heap)) + if (!tracer_init(heap, options->parallelism)) abort(); heap->fragmentation_low_threshold = 0.05; @@ -1987,12 +2051,16 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { return 1; } -static int initialize_gc(size_t size, struct heap **heap, - struct mutator **mut) { +static int gc_init(int argc, struct gc_option argv[], + struct heap **heap, struct mutator **mut) { + struct options options = { 0, }; + if (!parse_options(argc, argv, &options)) + return 0; + *heap = calloc(1, sizeof(struct heap)); if (!*heap) abort(); - if (!heap_init(*heap, size)) + if (!heap_init(*heap, &options)) abort(); struct mark_space *space = heap_mark_space(*heap); From d8bcbf2d743d33a6f1f9882dbbec61e24e8bcd70 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 9 Aug 2022 11:35:31 +0200 Subject: [PATCH 124/403] More API-ification --- bdw.h | 14 ++++++-------- gc-api.h | 11 ++++++++++- mt-gcbench.c | 6 +++--- semi.h | 10 +++++----- whippet.h | 14 ++++++-------- 5 files changed, 30 insertions(+), 25 deletions(-) diff --git a/bdw.h b/bdw.h index 2d30fb3b6..142e5bccb 100644 --- a/bdw.h +++ b/bdw.h @@ -211,8 +211,8 @@ static int gc_init(int argc, struct gc_option argv[], return 1; } -static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, - struct heap *heap) { +static struct mutator* gc_init_for_thread(uintptr_t *stack_base, + struct heap *heap) { pthread_mutex_lock(&heap->lock); if (!heap->multithreaded) { GC_allow_register_threads(); @@ -224,15 +224,13 @@ static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, GC_register_my_thread(&base); return add_mutator(heap); } -static void finish_gc_for_thread(struct mutator *mut) { +static void gc_finish_for_thread(struct mutator *mut) { GC_unregister_my_thread(); } -static void* call_without_gc(struct mutator *mut, void* (*f)(void*), - void *data) NEVER_INLINE; -static void* call_without_gc(struct mutator *mut, - void* (*f)(void*), - void *data) { +static void* gc_call_without_gc(struct mutator *mut, + void* (*f)(void*), + void *data) { return GC_do_blocking(f, data); } diff --git a/gc-api.h b/gc-api.h index 7d6eead51..c88fcde7b 100644 --- a/gc-api.h +++ b/gc-api.h @@ -7,6 +7,9 @@ #define GC_DEBUG 0 #endif +#define GC_ALWAYS_INLINE __attribute__((always_inline)) +#define GC_NEVER_INLINE __attribute__((noinline)) + #define GC_UNLIKELY(e) __builtin_expect(e, 0) #define GC_LIKELY(e) __builtin_expect(e, 1) @@ -52,7 +55,7 @@ struct gc_edge { static inline struct gc_edge gc_edge(void* addr) { return (struct gc_edge){addr}; } -static struct gc_ref gc_edge_ref(struct gc_edge edge) { +static inline struct gc_ref gc_edge_ref(struct gc_edge edge) { return *edge.dst; } static inline void gc_edge_update(struct gc_edge edge, struct gc_ref ref) { @@ -82,4 +85,10 @@ GC_API_ int gc_option_from_string(const char *str); GC_API_ int gc_init(int argc, struct gc_option argv[], struct heap **heap, struct mutator **mutator); +GC_API_ struct mutator* gc_init_for_thread(uintptr_t *stack_base, + struct heap *heap); +GC_API_ void gc_finish_for_thread(struct mutator *mut); +GC_API_ void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), + void *data) GC_NEVER_INLINE; + #endif // GC_API_H_ diff --git a/mt-gcbench.c b/mt-gcbench.c index 23fb235c2..c3d92fef0 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -307,9 +307,9 @@ 
struct call_with_gc_data { }; static void* call_with_gc_inner(uintptr_t *stack_base, void *arg) { struct call_with_gc_data *data = arg; - struct mutator *mut = initialize_gc_for_thread(stack_base, data->heap); + struct mutator *mut = gc_init_for_thread(stack_base, data->heap); void *ret = data->f(mut); - finish_gc_for_thread(mut); + gc_finish_for_thread(mut); return ret; } static void* call_with_gc(void* (*f)(struct mutator *), @@ -434,7 +434,7 @@ int main(int argc, char *argv[]) { run_one_test(mut); for (size_t i = 1; i < nthreads; i++) { struct join_data data = { 0, threads[i] }; - call_without_gc(mut, join_thread, &data); + gc_call_without_gc(mut, join_thread, &data); if (data.status) { errno = data.status; perror("Failed to join thread"); diff --git a/semi.h b/semi.h index 0f6071266..944ab88f0 100644 --- a/semi.h +++ b/semi.h @@ -370,17 +370,17 @@ static int gc_init(int argc, struct gc_option argv[], return 1; } -static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, - struct heap *heap) { +static struct mutator* gc_init_for_thread(uintptr_t *stack_base, + struct heap *heap) { fprintf(stderr, "Semispace copying collector not appropriate for multithreaded use.\n"); exit(1); } -static void finish_gc_for_thread(struct mutator *space) { +static void gc_finish_for_thread(struct mutator *space) { } -static void* call_without_gc(struct mutator *mut, void* (*f)(void*), - void *data) { +static void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), + void *data) { // Can't be threads, then there won't be collection. return f(data); } diff --git a/whippet.h b/whippet.h index b100ca04f..db59bc2db 100644 --- a/whippet.h +++ b/whippet.h @@ -2079,8 +2079,8 @@ static int gc_init(int argc, struct gc_option argv[], return 1; } -static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, - struct heap *heap) { +static struct mutator* gc_init_for_thread(uintptr_t *stack_base, + struct heap *heap) { struct mutator *ret = calloc(1, sizeof(struct mutator)); if (!ret) abort(); @@ -2088,7 +2088,7 @@ static struct mutator* initialize_gc_for_thread(uintptr_t *stack_base, return ret; } -static void finish_gc_for_thread(struct mutator *mut) { +static void gc_finish_for_thread(struct mutator *mut) { remove_mutator(mutator_heap(mut), mut); mutator_mark_buf_destroy(&mut->mark_buf); free(mut); @@ -2118,11 +2118,9 @@ static void reactivate_mutator(struct heap *heap, struct mutator *mut) { heap_unlock(heap); } -static void* call_without_gc(struct mutator *mut, void* (*f)(void*), - void *data) NEVER_INLINE; -static void* call_without_gc(struct mutator *mut, - void* (*f)(void*), - void *data) { +static void* gc_call_without_gc(struct mutator *mut, + void* (*f)(void*), + void *data) { struct heap *heap = mutator_heap(mut); deactivate_mutator(heap, mut); void *ret = f(data); From cacc28b5777f13412b8e8f053d8fa070abeb9921 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 9 Aug 2022 16:14:47 +0200 Subject: [PATCH 125/403] Always add a header onto objects We're targetting systems that need to be able to inspect the kind of an object, so this information has to be somewhere. If it's out-of-line, we might save memory, but we would lose locality. Concretely in Guile the tag bits are in the object itself. 
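(Editorial sketch, not part of the patch: the shape this commit moves toward is a one-word in-object header whose tag carries the allocation kind in bits 1-7, with bit 0 reserved as the not-forwarded bit; that is the same encoding whippet.h already uses and that later lands in simple-tagging-scheme.h. The Node layout and the read_kind/write_live_tag helpers below are illustrative names only, not code from the patch.)

#include <stdint.h>

struct gc_header { uintptr_t tag; };

// Kind lives in bits 1-7 of the tag for live objects; bit 0 set means
// "not yet forwarded".
static const uintptr_t alloc_kind_shift = 1;
static const uintptr_t alloc_kind_mask = 0x7f;
static const uintptr_t not_forwarded_bit = 0x1;

// A hypothetical embedder object: the header is the first member, so
// inspecting the kind is a single in-object load with good locality.
struct Node {
  struct gc_header header;
  struct Node *left, *right;
  int i, j;
};

static inline uint8_t read_kind(void *obj) {
  struct gc_header *header = obj;
  return (header->tag >> alloc_kind_shift) & alloc_kind_mask;
}

static inline void write_live_tag(void *obj, uint8_t kind) {
  struct gc_header *header = obj;
  header->tag = ((uintptr_t)kind << alloc_kind_shift) | not_forwarded_bit;
}

(An allocator would call something like write_live_tag right after allocation and a tracer would read the kind back with read_kind; keeping the word in the object costs a word per object but keeps kind inspection on the object's own cache line, which is the locality argument made above.)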
--- bdw.h | 2 -- gc-api.h | 4 ++++ mt-gcbench.c | 18 ++++++++---------- quads.c | 6 ++---- semi.h | 2 -- whippet.h | 2 -- 6 files changed, 14 insertions(+), 20 deletions(-) diff --git a/bdw.h b/bdw.h index 142e5bccb..bda8beb56 100644 --- a/bdw.h +++ b/bdw.h @@ -80,8 +80,6 @@ allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) { return head; } -#define GC_HEADER /**/ - static inline void* allocate(struct mutator *mut, enum alloc_kind kind, size_t size) { size_t idx = gc_inline_bytes_to_freelist_index(size); diff --git a/gc-api.h b/gc-api.h index c88fcde7b..d23c1c5f4 100644 --- a/gc-api.h +++ b/gc-api.h @@ -91,4 +91,8 @@ GC_API_ void gc_finish_for_thread(struct mutator *mut); GC_API_ void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; +struct gc_header { + uintptr_t tag; +}; + #endif // GC_API_H_ diff --git a/mt-gcbench.c b/mt-gcbench.c index c3d92fef0..f1fe9df3d 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -57,20 +57,20 @@ static const int min_tree_depth = 4; static const int max_tree_depth = 16; struct Node { - GC_HEADER; - struct Node * left; - struct Node * right; + struct gc_header header; + struct Node *left; + struct Node *right; int i, j; }; struct DoubleArray { - GC_HEADER; + struct gc_header header; size_t length; double values[0]; }; struct Hole { - GC_HEADER; + struct gc_header header; size_t length; uintptr_t values[0]; }; @@ -373,13 +373,11 @@ static void *join_thread(void *data) { } int main(int argc, char *argv[]) { - // Define size of Node without any GC header. - size_t sizeof_node = 2 * sizeof(Node*) + 2 * sizeof(int); size_t sizeof_double_array = sizeof(size_t); size_t heap_max_live = - tree_size(long_lived_tree_depth) * sizeof_node + - tree_size(max_tree_depth) * sizeof_node + - sizeof_double_array + sizeof(double) * array_size; + tree_size(long_lived_tree_depth) * sizeof(Node) + + tree_size(max_tree_depth) * sizeof(Node) + + sizeof(DoubleArray) + sizeof(double) * array_size; if (argc != 4) { fprintf(stderr, "usage: %s MULTIPLIER NTHREADS PARALLELISM\n", argv[0]); return 1; diff --git a/quads.c b/quads.c index 9c6dedfaf..ef639712f 100644 --- a/quads.c +++ b/quads.c @@ -7,7 +7,7 @@ #include "gc.h" typedef struct Quad { - GC_HEADER; + struct gc_header header; struct Quad *kids[4]; } Quad; static inline size_t quad_size(Quad *obj) { @@ -125,10 +125,8 @@ int main(int argc, char *argv[]) { return 1; } - // Compute byte size not counting any header word, so as to compute the same - // heap size whether a header word is there or not. 
size_t nquads = tree_size(depth); - size_t tree_bytes = nquads * 4 * sizeof(Quad*); + size_t tree_bytes = nquads * sizeof(Quad); size_t heap_size = tree_bytes * multiplier; unsigned long gc_start = current_time(); diff --git a/semi.h b/semi.h index 944ab88f0..2c1421d11 100644 --- a/semi.h +++ b/semi.h @@ -48,8 +48,6 @@ static uintptr_t align_up(uintptr_t addr, size_t align) { return (addr + align - 1) & ~(align-1); } -#define GC_HEADER uintptr_t _gc_header - static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } diff --git a/whippet.h b/whippet.h index db59bc2db..3cbcee9a8 100644 --- a/whippet.h +++ b/whippet.h @@ -392,8 +392,6 @@ static inline struct heap* mutator_heap(struct mutator *mutator) { return mutator->heap; } -#define GC_HEADER uintptr_t _gc_header - static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } From fb71c4c363ef550265aa3aee55a9ad9e4fa1fa57 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 12 Aug 2022 16:44:38 +0200 Subject: [PATCH 126/403] Separate tagging from collector The collector now has an abstract interface onto the embedder. The embedder has to supply some functionality, such as tracing and forwarding. This is a pretty big change in terms of lines but it's supposed to have no functional or performance change. --- bdw.h | 9 +- gc-api.h | 70 ++--------- gc-assert.h | 15 +++ gc-config.h | 8 ++ gc-edge.h | 20 +++ gc-embedder-api.h | 28 +++++ gc-forwarding.h | 20 +++ gc-inline.h | 7 ++ gc-ref.h | 37 ++++++ heap-objects.h | 8 +- inline.h | 7 -- mt-gcbench.c | 26 ++-- parallel-tracer.h | 8 +- quads.c | 6 +- semi.h | 68 +++-------- serial-tracer.h | 8 +- simple-allocator.h | 21 ++++ simple-gc-embedder.h | 98 +++++++++++++++ simple-tagging-scheme.h | 29 +++++ whippet.h | 265 ++++++++++++++++------------------------ 20 files changed, 452 insertions(+), 306 deletions(-) create mode 100644 gc-assert.h create mode 100644 gc-config.h create mode 100644 gc-edge.h create mode 100644 gc-embedder-api.h create mode 100644 gc-forwarding.h create mode 100644 gc-inline.h create mode 100644 gc-ref.h delete mode 100644 inline.h create mode 100644 simple-allocator.h create mode 100644 simple-gc-embedder.h create mode 100644 simple-tagging-scheme.h diff --git a/bdw.h b/bdw.h index bda8beb56..1eb841662 100644 --- a/bdw.h +++ b/bdw.h @@ -54,7 +54,7 @@ enum gc_inline_kind { }; static void* allocate_small_slow(void **freelist, size_t idx, - enum gc_inline_kind kind) NEVER_INLINE; + enum gc_inline_kind kind) GC_NEVER_INLINE; static void* allocate_small_slow(void **freelist, size_t idx, enum gc_inline_kind kind) { size_t bytes = gc_inline_freelist_object_size(idx); @@ -80,8 +80,7 @@ allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) { return head; } -static inline void* allocate(struct mutator *mut, enum alloc_kind kind, - size_t size) { +static inline void* gc_allocate(struct mutator *mut, size_t size) { size_t idx = gc_inline_bytes_to_freelist_index(size); if (UNLIKELY(idx >= GC_INLINE_FREELIST_COUNT)) @@ -90,8 +89,8 @@ static inline void* allocate(struct mutator *mut, enum alloc_kind kind, return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL); } -static inline void* allocate_pointerless(struct mutator *mut, - enum alloc_kind kind, size_t size) { +static inline void* gc_allocate_pointerless(struct mutator *mut, + size_t size) { // Because the BDW API requires us to implement a custom marker so // that the pointerless freelist gets traced, even though it's in a // 
pointerless region, we punt on thread-local pointerless freelists. diff --git a/gc-api.h b/gc-api.h index d23c1c5f4..9bcd69d70 100644 --- a/gc-api.h +++ b/gc-api.h @@ -1,67 +1,13 @@ #ifndef GC_API_H_ #define GC_API_H_ +#include "gc-config.h" +#include "gc-assert.h" +#include "gc-ref.h" +#include "gc-edge.h" + #include -#ifndef GC_DEBUG -#define GC_DEBUG 0 -#endif - -#define GC_ALWAYS_INLINE __attribute__((always_inline)) -#define GC_NEVER_INLINE __attribute__((noinline)) - -#define GC_UNLIKELY(e) __builtin_expect(e, 0) -#define GC_LIKELY(e) __builtin_expect(e, 1) - -#if GC_DEBUG -#define GC_ASSERT(x) do { if (GC_UNLIKELY(!(x))) __builtin_trap(); } while (0) -#else -#define GC_ASSERT(x) do { } while (0) -#endif - -struct gc_ref { - uintptr_t value; -}; - -static inline struct gc_ref gc_ref(uintptr_t value) { - return (struct gc_ref){value}; -} -static inline uintptr_t gc_ref_value(struct gc_ref ref) { - return ref.value; -} - -static inline struct gc_ref gc_ref_null(void) { - return gc_ref(0); -} -static inline int gc_ref_is_heap_object(struct gc_ref ref) { - return ref.value != 0; -} -static inline struct gc_ref gc_ref_from_heap_object_or_null(void *obj) { - return gc_ref((uintptr_t) obj); -} -static inline struct gc_ref gc_ref_from_heap_object(void *obj) { - GC_ASSERT(obj); - return gc_ref_from_heap_object_or_null(obj); -} -static inline void* gc_ref_heap_object(struct gc_ref ref) { - GC_ASSERT(gc_ref_is_heap_object(ref)); - return (void *) gc_ref_value(ref); -} - -struct gc_edge { - struct gc_ref *dst; -}; - -static inline struct gc_edge gc_edge(void* addr) { - return (struct gc_edge){addr}; -} -static inline struct gc_ref gc_edge_ref(struct gc_edge edge) { - return *edge.dst; -} -static inline void gc_edge_update(struct gc_edge edge, struct gc_ref ref) { - *edge.dst = ref; -} - // FIXME: prefix with gc_ struct heap; struct mutator; @@ -91,8 +37,8 @@ GC_API_ void gc_finish_for_thread(struct mutator *mut); GC_API_ void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; -struct gc_header { - uintptr_t tag; -}; +GC_API_ inline void* gc_allocate(struct mutator *mut, size_t bytes); +// FIXME: remove :P +GC_API_ inline void* gc_allocate_pointerless(struct mutator *mut, size_t bytes); #endif // GC_API_H_ diff --git a/gc-assert.h b/gc-assert.h new file mode 100644 index 000000000..472297a1e --- /dev/null +++ b/gc-assert.h @@ -0,0 +1,15 @@ +#ifndef GC_ASSERT_H +#define GC_ASSERT_H + +#include "gc-config.h" + +#define GC_UNLIKELY(e) __builtin_expect(e, 0) +#define GC_LIKELY(e) __builtin_expect(e, 1) + +#if GC_DEBUG +#define GC_ASSERT(x) do { if (GC_UNLIKELY(!(x))) __builtin_trap(); } while (0) +#else +#define GC_ASSERT(x) do { } while (0) +#endif + +#endif // GC_ASSERT_H diff --git a/gc-config.h b/gc-config.h new file mode 100644 index 000000000..cd78e23d5 --- /dev/null +++ b/gc-config.h @@ -0,0 +1,8 @@ +#ifndef GC_CONFIG_H +#define GC_CONFIG_H + +#ifndef GC_DEBUG +#define GC_DEBUG 0 +#endif + +#endif // GC_CONFIG_H diff --git a/gc-edge.h b/gc-edge.h new file mode 100644 index 000000000..cfd769c59 --- /dev/null +++ b/gc-edge.h @@ -0,0 +1,20 @@ +#ifndef GC_EDGE_H +#define GC_EDGE_H + +#include "gc-ref.h" + +struct gc_edge { + struct gc_ref *dst; +}; + +static inline struct gc_edge gc_edge(void* addr) { + return (struct gc_edge){addr}; +} +static inline struct gc_ref gc_edge_ref(struct gc_edge edge) { + return *edge.dst; +} +static inline void gc_edge_update(struct gc_edge edge, struct gc_ref ref) { + *edge.dst = ref; +} + +#endif // GC_EDGE_H diff --git 
a/gc-embedder-api.h b/gc-embedder-api.h new file mode 100644 index 000000000..f80ffe995 --- /dev/null +++ b/gc-embedder-api.h @@ -0,0 +1,28 @@ +#ifndef GC_EMBEDDER_API_H +#define GC_EMBEDDER_API_H + +#include "gc-edge.h" +#include "gc-forwarding.h" + +#ifndef GC_EMBEDDER_API +#define GC_EMBEDDER_API static +#endif + +GC_EMBEDDER_API inline void gc_trace_object(void *object, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data, + size_t *size) GC_ALWAYS_INLINE; + +GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(void *object); +GC_EMBEDDER_API inline void gc_object_forward_nonatomic(void *object, uintptr_t new_addr); + +GC_EMBEDDER_API inline struct gc_atomic_forward gc_atomic_forward_begin(void *obj); +GC_EMBEDDER_API inline void gc_atomic_forward_acquire(struct gc_atomic_forward *); +GC_EMBEDDER_API inline int gc_atomic_forward_retry_busy(struct gc_atomic_forward *); +GC_EMBEDDER_API inline void gc_atomic_forward_abort(struct gc_atomic_forward *); +GC_EMBEDDER_API inline void gc_atomic_forward_commit(struct gc_atomic_forward *, + uintptr_t new_addr); +GC_EMBEDDER_API inline uintptr_t gc_atomic_forward_address(struct gc_atomic_forward *); + +#endif // GC_EMBEDDER_API_H diff --git a/gc-forwarding.h b/gc-forwarding.h new file mode 100644 index 000000000..4fb1dec2c --- /dev/null +++ b/gc-forwarding.h @@ -0,0 +1,20 @@ +#ifndef GC_FORWARDING_H +#define GC_FORWARDING_H + +#include + +enum gc_forwarding_state { + GC_FORWARDING_STATE_FORWARDED, + GC_FORWARDING_STATE_BUSY, + GC_FORWARDING_STATE_ACQUIRED, + GC_FORWARDING_STATE_NOT_FORWARDED, + GC_FORWARDING_STATE_ABORTED +}; + +struct gc_atomic_forward { + void *object; + uintptr_t data; + enum gc_forwarding_state state; +}; + +#endif // GC_FORWARDING_H diff --git a/gc-inline.h b/gc-inline.h new file mode 100644 index 000000000..30eac54f3 --- /dev/null +++ b/gc-inline.h @@ -0,0 +1,7 @@ +#ifndef GC_INLINE_H_ +#define GC_INLINE_H_ + +#define GC_ALWAYS_INLINE __attribute__((always_inline)) +#define GC_NEVER_INLINE __attribute__((noinline)) + +#endif // GC_INLINE_H_ diff --git a/gc-ref.h b/gc-ref.h new file mode 100644 index 000000000..33ac5e73b --- /dev/null +++ b/gc-ref.h @@ -0,0 +1,37 @@ +#ifndef GC_REF_H +#define GC_REF_H + +#include "gc-assert.h" + +#include + +struct gc_ref { + uintptr_t value; +}; + +static inline struct gc_ref gc_ref(uintptr_t value) { + return (struct gc_ref){value}; +} +static inline uintptr_t gc_ref_value(struct gc_ref ref) { + return ref.value; +} + +static inline struct gc_ref gc_ref_null(void) { + return gc_ref(0); +} +static inline int gc_ref_is_heap_object(struct gc_ref ref) { + return ref.value != 0; +} +static inline struct gc_ref gc_ref_from_heap_object_or_null(void *obj) { + return gc_ref((uintptr_t) obj); +} +static inline struct gc_ref gc_ref_from_heap_object(void *obj) { + GC_ASSERT(obj); + return gc_ref_from_heap_object_or_null(obj); +} +static inline void* gc_ref_heap_object(struct gc_ref ref) { + GC_ASSERT(gc_ref_is_heap_object(ref)); + return (void *) gc_ref_value(ref); +} + +#endif // GC_REF_H diff --git a/heap-objects.h b/heap-objects.h index ea84d2b84..d76d5ee36 100644 --- a/heap-objects.h +++ b/heap-objects.h @@ -1,8 +1,8 @@ #ifndef HEAP_OBJECTS_H #define HEAP_OBJECTS_H -#include "inline.h" -#include "gc-api.h" +#include "gc-inline.h" +#include "gc-edge.h" #define DECLARE_NODE_TYPE(name, Name, NAME) \ struct Name; \ @@ -17,10 +17,10 @@ enum alloc_kind { #undef DEFINE_ENUM #define DEFINE_METHODS(name, Name, NAME) \ - static inline size_t name##_size(Name *obj) 
ALWAYS_INLINE; \ + static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \ static inline void visit_##name##_fields(Name *obj,\ void (*visit)(struct gc_edge edge, void *visit_data), \ - void *visit_data) ALWAYS_INLINE; + void *visit_data) GC_ALWAYS_INLINE; FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) #undef DEFINE_METHODS diff --git a/inline.h b/inline.h deleted file mode 100644 index 4e44690f5..000000000 --- a/inline.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef INLINE_H -#define INLINE_H - -#define ALWAYS_INLINE __attribute__((always_inline)) -#define NEVER_INLINE __attribute__((noinline)) - -#endif // INLINE_H diff --git a/mt-gcbench.c b/mt-gcbench.c index f1fe9df3d..90634c365 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -44,10 +44,17 @@ #include #include -#include "assert.h" +// Tracer will be specialized with respect to tags defined in this header. #include "mt-gcbench-types.h" + +#include "assert.h" +#include "simple-allocator.h" +#include "simple-gc-embedder.h" +#include "gc-api.h" + #include "gc.h" -#include "inline.h" + +#include "gc-inline.h" #define MAX_THREAD_COUNT 256 @@ -100,6 +107,7 @@ static inline void visit_hole_fields(Hole *obj, void (*visit)(struct gc_edge edge, void *visit_data), void *visit_data) { + abort(); } typedef HANDLE_TO(Node) NodeHandle; @@ -107,22 +115,22 @@ typedef HANDLE_TO(DoubleArray) DoubleArrayHandle; static Node* allocate_node(struct mutator *mut) { // memset to 0 by the collector. - return allocate(mut, ALLOC_KIND_NODE, sizeof (Node)); + return gc_allocate_with_kind(mut, ALLOC_KIND_NODE, sizeof (Node)); } static DoubleArray* allocate_double_array(struct mutator *mut, size_t size) { // May be uninitialized. + size_t bytes = sizeof(DoubleArray) + sizeof (double) * size; DoubleArray *ret = - allocate_pointerless(mut, ALLOC_KIND_DOUBLE_ARRAY, - sizeof(DoubleArray) + sizeof (double) * size); + gc_allocate_pointerless_with_kind(mut, ALLOC_KIND_DOUBLE_ARRAY, bytes); ret->length = size; return ret; } static Hole* allocate_hole(struct mutator *mut, size_t size) { - Hole *ret = allocate(mut, ALLOC_KIND_HOLE, - sizeof(Hole) + sizeof (uintptr_t) * size); + size_t bytes = sizeof(Hole) + sizeof (uintptr_t) * size; + Hole *ret = gc_allocate_with_kind(mut, ALLOC_KIND_HOLE, bytes); ret->length = size; return ret; } @@ -289,8 +297,8 @@ static void time_construction(struct thread *t, int depth) { POP_HANDLE(mut); } -static void* call_with_stack_base(void* (*)(uintptr_t*, void*), void*) NEVER_INLINE; -static void* call_with_stack_base_inner(void* (*)(uintptr_t*, void*), uintptr_t*, void*) NEVER_INLINE; +static void* call_with_stack_base(void* (*)(uintptr_t*, void*), void*) GC_NEVER_INLINE; +static void* call_with_stack_base_inner(void* (*)(uintptr_t*, void*), uintptr_t*, void*) GC_NEVER_INLINE; static void* call_with_stack_base_inner(void* (*f)(uintptr_t *stack_base, void *arg), uintptr_t *stack_base, void *arg) { return f(stack_base, arg); diff --git a/parallel-tracer.h b/parallel-tracer.h index 0634c91ec..4ee90de70 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -8,7 +8,7 @@ #include "assert.h" #include "debug.h" -#include "inline.h" +#include "gc-inline.h" #include "spin.h" // The Chase-Lev work-stealing deque, as initially described in "Dynamic @@ -448,10 +448,10 @@ static void tracer_release(struct heap *heap) { } struct gcobj; -static inline void tracer_visit(struct gc_edge edge, void *trace_data) ALWAYS_INLINE; -static inline void trace_one(struct gcobj *obj, void *trace_data) ALWAYS_INLINE; +static inline void tracer_visit(struct gc_edge edge, void 
*trace_data) GC_ALWAYS_INLINE; +static inline void trace_one(struct gcobj *obj, void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct heap *heap, - struct gc_edge edge) ALWAYS_INLINE; + struct gc_edge edge) GC_ALWAYS_INLINE; static inline void tracer_share(struct local_tracer *trace) { diff --git a/quads.c b/quads.c index ef639712f..f7ca4e56a 100644 --- a/quads.c +++ b/quads.c @@ -4,6 +4,8 @@ #include "assert.h" #include "quads-types.h" +#include "simple-allocator.h" +#include "simple-gc-embedder.h" #include "gc.h" typedef struct Quad { @@ -24,7 +26,7 @@ typedef HANDLE_TO(Quad) QuadHandle; static Quad* allocate_quad(struct mutator *mut) { // memset to 0 by the collector. - return allocate(mut, ALLOC_KIND_QUAD, sizeof (Quad)); + return gc_allocate_with_kind(mut, ALLOC_KIND_QUAD, sizeof (Quad)); } /* Get the current time in microseconds */ @@ -106,7 +108,7 @@ static size_t tree_size(size_t depth) { #define MAX_THREAD_COUNT 256 int main(int argc, char *argv[]) { - if (argc != 3) { + if (argc != 4) { fprintf(stderr, "usage: %s DEPTH MULTIPLIER PARALLELISM\n", argv[0]); return 1; } diff --git a/semi.h b/semi.h index 2c1421d11..02677d9c5 100644 --- a/semi.h +++ b/semi.h @@ -52,8 +52,8 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct mutator *mut) NEVER_INLINE; -static void collect_for_alloc(struct mutator *mut, size_t bytes) NEVER_INLINE; +static void collect(struct mutator *mut) GC_NEVER_INLINE; +static void collect_for_alloc(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; static void visit(struct gc_edge edge, void *visit_data); @@ -93,18 +93,9 @@ static void flip(struct semi_space *space) { space->count++; } -static void* copy(struct semi_space *space, uintptr_t kind, void *obj) { +static void* copy(struct semi_space *space, void *obj) { size_t size; - switch (kind) { -#define COMPUTE_SIZE(name, Name, NAME) \ - case ALLOC_KIND_##NAME: \ - size = name##_size(obj); \ - break; - FOR_EACH_HEAP_OBJECT_KIND(COMPUTE_SIZE) -#undef COMPUTE_SIZE - default: - abort (); - } + gc_trace_object(obj, NULL, NULL, &size); void *new_obj = (void*)space->hp; memcpy(new_obj, obj, size); *(uintptr_t*) obj = space->hp; @@ -113,31 +104,14 @@ static void* copy(struct semi_space *space, uintptr_t kind, void *obj) { } static uintptr_t scan(struct heap *heap, uintptr_t grey) { - void *obj = (void*)grey; - uintptr_t kind = *(uintptr_t*) obj; - switch (kind) { -#define SCAN_OBJECT(name, Name, NAME) \ - case ALLOC_KIND_##NAME: \ - visit_##name##_fields((Name*)obj, visit, heap); \ - return grey + align_up(name##_size((Name*)obj), ALIGNMENT); - FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) -#undef SCAN_OBJECT - default: - abort (); - } + size_t size; + gc_trace_object((void*)grey, visit, heap, &size); + return grey + align_up(size, ALIGNMENT); } static void* forward(struct semi_space *space, void *obj) { - uintptr_t header_word = *(uintptr_t*)obj; - switch (header_word) { -#define CASE_ALLOC_KIND(name, Name, NAME) \ - case ALLOC_KIND_##NAME: - FOR_EACH_HEAP_OBJECT_KIND(CASE_ALLOC_KIND) -#undef CASE_ALLOC_KIND - return copy(space, header_word, obj); - default: - return (void*)header_word; - } + uintptr_t forwarded = gc_object_forwarded_nonatomic(obj); + return forwarded ? 
(void*)forwarded : copy(space, obj); } static void visit_semi_space(struct heap *heap, struct semi_space *space, @@ -198,8 +172,7 @@ static void collect_for_alloc(struct mutator *mut, size_t bytes) { } static const size_t LARGE_OBJECT_THRESHOLD = 8192; -static void* allocate_large(struct mutator *mut, enum alloc_kind kind, - size_t size) { +static void* allocate_large(struct mutator *mut, size_t size) { struct heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); struct semi_space *semi_space = heap_semi_space(heap); @@ -222,14 +195,12 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, abort(); } - *(uintptr_t*)ret = kind; return ret; } -static inline void* allocate(struct mutator *mut, enum alloc_kind kind, - size_t size) { +static inline void* gc_allocate(struct mutator *mut, size_t size) { if (size >= LARGE_OBJECT_THRESHOLD) - return allocate_large(mut, kind, size); + return allocate_large(mut, size); struct semi_space *space = mutator_semi_space(mut); while (1) { @@ -240,18 +211,13 @@ static inline void* allocate(struct mutator *mut, enum alloc_kind kind, continue; } space->hp = new_hp; - void *ret = (void *)addr; - uintptr_t *header_word = ret; - *header_word = kind; - // FIXME: Allow allocator to avoid initializing pointerless memory? - // if (kind == NODE) - clear_memory(addr + sizeof(uintptr_t), size - sizeof(uintptr_t)); - return ret; + // FIXME: Allow allocator to avoid clearing memory? + clear_memory(addr, size); + return (void *)addr; } } -static inline void* allocate_pointerless(struct mutator *mut, - enum alloc_kind kind, size_t size) { - return allocate(mut, kind, size); +static inline void* gc_allocate_pointerless(struct mutator *mut, size_t size) { + return gc_allocate(mut, size); } static inline void init_field(void *obj, void **addr, void *val) { diff --git a/serial-tracer.h b/serial-tracer.h index 474de34d7..68c4d489f 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -52,7 +52,7 @@ trace_queue_put(struct trace_queue *q, size_t idx, struct gcobj *x) { q->buf[idx & (q->size - 1)] = x; } -static int trace_queue_grow(struct trace_queue *q) NEVER_INLINE; +static int trace_queue_grow(struct trace_queue *q) GC_NEVER_INLINE; static int trace_queue_grow(struct trace_queue *q) { @@ -138,10 +138,10 @@ static void tracer_release(struct heap *heap) { } struct gcobj; -static inline void tracer_visit(struct gc_edge edge, void *trace_data) ALWAYS_INLINE; -static inline void trace_one(struct gcobj *obj, void *trace_data) ALWAYS_INLINE; +static inline void tracer_visit(struct gc_edge edge, void *trace_data) GC_ALWAYS_INLINE; +static inline void trace_one(struct gcobj *obj, void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct heap *heap, - struct gc_edge edge) ALWAYS_INLINE; + struct gc_edge edge) GC_ALWAYS_INLINE; static inline void tracer_enqueue_root(struct tracer *tracer, struct gcobj *obj) { diff --git a/simple-allocator.h b/simple-allocator.h new file mode 100644 index 000000000..f1f02f341 --- /dev/null +++ b/simple-allocator.h @@ -0,0 +1,21 @@ +#ifndef SIMPLE_ALLOCATOR_H +#define SIMPLE_ALLOCATOR_H + +#include "simple-tagging-scheme.h" +#include "gc-api.h" + +static inline void* +gc_allocate_with_kind(struct mutator *mut, enum alloc_kind kind, size_t bytes) { + void *obj = gc_allocate(mut, bytes); + *tag_word(obj) = tag_live(kind); + return obj; +} + +static inline void* +gc_allocate_pointerless_with_kind(struct mutator *mut, enum alloc_kind kind, size_t bytes) { + void *obj = 
gc_allocate_pointerless(mut, bytes); + *tag_word(obj) = tag_live(kind); + return obj; +} + +#endif // SIMPLE_ALLOCATOR_H diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h new file mode 100644 index 000000000..a198a47ae --- /dev/null +++ b/simple-gc-embedder.h @@ -0,0 +1,98 @@ +#include + +#include "simple-tagging-scheme.h" +#include "gc-embedder-api.h" + +static inline void gc_trace_object(void *object, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data, + size_t *size) { + switch (tag_live_alloc_kind(*tag_word(object))) { +#define SCAN_OBJECT(name, Name, NAME) \ + case ALLOC_KIND_##NAME: \ + if (trace_edge) \ + visit_##name##_fields((Name*)object, trace_edge, trace_data); \ + if (size) \ + *size = name##_size(object); \ + break; + FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) +#undef SCAN_OBJECT + default: + abort (); + } +} + +static inline uintptr_t gc_object_forwarded_nonatomic(void *object) { + uintptr_t tag = *tag_word(object); + return (tag & gcobj_not_forwarded_bit) ? 0 : tag; +} + +static inline void gc_object_forward_nonatomic(void *object, + uintptr_t new_addr) { + *tag_word(object) = new_addr; +} + +static inline struct gc_atomic_forward +gc_atomic_forward_begin(void *object) { + uintptr_t tag = atomic_load_explicit(tag_word(object), memory_order_acquire); + enum gc_forwarding_state state; + if (tag == gcobj_busy) + state = GC_FORWARDING_STATE_BUSY; + else if (tag & gcobj_not_forwarded_bit) + state = GC_FORWARDING_STATE_NOT_FORWARDED; + else + state = GC_FORWARDING_STATE_FORWARDED; + return (struct gc_atomic_forward){ object, tag, state }; +} + +static inline int +gc_atomic_forward_retry_busy(struct gc_atomic_forward *fwd) { + GC_ASSERT(fwd->state == GC_FORWARDING_STATE_BUSY); + uintptr_t tag = atomic_load_explicit(tag_word(fwd->object), + memory_order_acquire); + if (tag == gcobj_busy) + return 0; + if (tag & gcobj_not_forwarded_bit) + fwd->state = GC_FORWARDING_STATE_ABORTED; + else { + fwd->state = GC_FORWARDING_STATE_FORWARDED; + fwd->data = tag; + } + return 1; +} + +static inline void +gc_atomic_forward_acquire(struct gc_atomic_forward *fwd) { + GC_ASSERT(fwd->state == GC_FORWARDING_STATE_NOT_FORWARDED); + if (atomic_compare_exchange_strong(tag_word(fwd->object), &fwd->data, + gcobj_busy)) + fwd->state = GC_FORWARDING_STATE_ACQUIRED; + else if (fwd->data == gcobj_busy) + fwd->state = GC_FORWARDING_STATE_BUSY; + else { + GC_ASSERT((fwd->data & gcobj_not_forwarded_bit) == 0); + fwd->state = GC_FORWARDING_STATE_FORWARDED; + } +} + +static inline void +gc_atomic_forward_abort(struct gc_atomic_forward *fwd) { + GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED); + atomic_store_explicit(tag_word(fwd->object), fwd->data, memory_order_release); + fwd->state = GC_FORWARDING_STATE_ABORTED; +} + +static inline void +gc_atomic_forward_commit(struct gc_atomic_forward *fwd, uintptr_t new_addr) { + GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED); + *tag_word((void*)new_addr) = fwd->data; + atomic_store_explicit(tag_word(fwd->object), new_addr, memory_order_release); + fwd->state = GC_FORWARDING_STATE_FORWARDED; +} + +static inline uintptr_t +gc_atomic_forward_address(struct gc_atomic_forward *fwd) { + GC_ASSERT(fwd->state == GC_FORWARDING_STATE_FORWARDED); + return fwd->data; +} diff --git a/simple-tagging-scheme.h b/simple-tagging-scheme.h new file mode 100644 index 000000000..fc431c575 --- /dev/null +++ b/simple-tagging-scheme.h @@ -0,0 +1,29 @@ +#ifndef SIMPLE_TAGGING_SCHEME_H +#define SIMPLE_TAGGING_SCHEME_H + +#include + +struct gc_header { + 
uintptr_t tag; +}; + +// Alloc kind is in bits 1-7, for live objects. +static const uintptr_t gcobj_alloc_kind_mask = 0x7f; +static const uintptr_t gcobj_alloc_kind_shift = 1; +static const uintptr_t gcobj_forwarded_mask = 0x1; +static const uintptr_t gcobj_not_forwarded_bit = 0x1; +static const uintptr_t gcobj_busy = 0; +static inline uint8_t tag_live_alloc_kind(uintptr_t tag) { + return (tag >> gcobj_alloc_kind_shift) & gcobj_alloc_kind_mask; +} +static inline uintptr_t tag_live(uint8_t alloc_kind) { + return ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift) + | gcobj_not_forwarded_bit; +} + +static inline uintptr_t* tag_word(void *object) { + struct gc_header *header = object; + return &header->tag; +} + +#endif // SIMPLE_TAGGING_SCHEME_H diff --git a/whippet.h b/whippet.h index 3cbcee9a8..875d1cd37 100644 --- a/whippet.h +++ b/whippet.h @@ -15,9 +15,8 @@ #include #include -#include "assert.h" #include "debug.h" -#include "inline.h" +#include "gc-inline.h" #include "large-object-space.h" #include "precise-roots.h" #if GC_PARALLEL_TRACE @@ -194,7 +193,7 @@ static uint8_t *object_metadata_byte(void *obj) { #define GRANULES_PER_BLOCK (BLOCK_SIZE / GRANULE_SIZE) #define GRANULES_PER_REMSET_BYTE (GRANULES_PER_BLOCK / REMSET_BYTES_PER_BLOCK) static uint8_t *object_remset_byte(void *obj) { - ASSERT(!heap_object_is_large(obj)); + GC_ASSERT(!heap_object_is_large(obj)); uintptr_t addr = (uintptr_t) obj; uintptr_t base = addr & ~(SLAB_SIZE - 1); uintptr_t granule = (addr & (SLAB_SIZE - 1)) >> GRANULE_SIZE_LOG_2; @@ -225,7 +224,7 @@ static uintptr_t block_summary_next(struct block_summary *summary) { } static void block_summary_set_next(struct block_summary *summary, uintptr_t next) { - ASSERT((next & (BLOCK_SIZE - 1)) == 0); + GC_ASSERT((next & (BLOCK_SIZE - 1)) == 0); summary->next_and_flags = (summary->next_and_flags & (BLOCK_SIZE - 1)) | next; } @@ -268,29 +267,7 @@ static inline size_t size_to_granules(size_t size) { return (size + GRANULE_SIZE - 1) >> GRANULE_SIZE_LOG_2; } -// Alloc kind is in bits 1-7, for live objects. 
-static const uintptr_t gcobj_alloc_kind_mask = 0x7f; -static const uintptr_t gcobj_alloc_kind_shift = 1; -static const uintptr_t gcobj_forwarded_mask = 0x1; -static const uintptr_t gcobj_not_forwarded_bit = 0x1; -static inline uint8_t tag_live_alloc_kind(uintptr_t tag) { - return (tag >> gcobj_alloc_kind_shift) & gcobj_alloc_kind_mask; -} -static inline uintptr_t tag_live(uint8_t alloc_kind) { - return ((uintptr_t)alloc_kind << gcobj_alloc_kind_shift) - | gcobj_not_forwarded_bit; -} -static inline uintptr_t tag_forwarded(struct gcobj *new_addr) { - return (uintptr_t)new_addr; -} - -struct gcobj { - union { - uintptr_t tag; - uintptr_t words[0]; - void *pointers[0]; - }; -}; +struct gcobj; struct evacuation_allocator { size_t allocated; // atomically @@ -396,18 +373,12 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct mutator *mut) NEVER_INLINE; +static void collect(struct mutator *mut) GC_NEVER_INLINE; static int heap_object_is_large(struct gcobj *obj) { - switch (tag_live_alloc_kind(obj->tag)) { -#define IS_LARGE(name, Name, NAME) \ - case ALLOC_KIND_##NAME: \ - return name##_size((Name*)obj) > LARGE_OBJECT_THRESHOLD; - break; - FOR_EACH_HEAP_OBJECT_KIND(IS_LARGE) -#undef IS_LARGE - } - abort(); + size_t size; + gc_trace_object(obj, NULL, NULL, &size); + return size > LARGE_OBJECT_THRESHOLD; } static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { @@ -436,7 +407,7 @@ static inline int mark_space_mark_object(struct mark_space *space, static uintptr_t make_evacuation_allocator_cursor(uintptr_t block, size_t allocated) { - ASSERT(allocated < (BLOCK_SIZE - 1) * (uint64_t) BLOCK_SIZE); + GC_ASSERT(allocated < (BLOCK_SIZE - 1) * (uint64_t) BLOCK_SIZE); return (block & ~(BLOCK_SIZE - 1)) | (allocated / BLOCK_SIZE); } @@ -453,17 +424,17 @@ static void prepare_evacuation_allocator(struct evacuation_allocator *alloc, static void clear_remaining_metadata_bytes_in_block(uintptr_t block, uintptr_t allocated) { - ASSERT((allocated & (GRANULE_SIZE - 1)) == 0); + GC_ASSERT((allocated & (GRANULE_SIZE - 1)) == 0); uintptr_t base = block + allocated; uintptr_t limit = block + BLOCK_SIZE; uintptr_t granules = (limit - base) >> GRANULE_SIZE_LOG_2; - ASSERT(granules <= GRANULES_PER_BLOCK); + GC_ASSERT(granules <= GRANULES_PER_BLOCK); memset(object_metadata_byte((void*)base), 0, granules); } static void finish_evacuation_allocator_block(uintptr_t block, uintptr_t allocated) { - ASSERT(allocated <= BLOCK_SIZE); + GC_ASSERT(allocated <= BLOCK_SIZE); struct block_summary *summary = block_summary_for_addr(block); block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); size_t fragmentation = (BLOCK_SIZE - allocated) >> GRANULE_SIZE_LOG_2; @@ -489,13 +460,13 @@ static void finish_evacuation_allocator(struct evacuation_allocator *alloc, allocated = alloc->limit; while (allocated >= BLOCK_SIZE) { uintptr_t block = pop_block(targets); - ASSERT(block); + GC_ASSERT(block); allocated -= BLOCK_SIZE; } if (allocated) { // Finish off the last partially-filled block. 
uintptr_t block = pop_block(targets); - ASSERT(block); + GC_ASSERT(block); finish_evacuation_allocator_block(block, allocated); } size_t remaining = atomic_load_explicit(&targets->count, memory_order_acquire); @@ -536,7 +507,7 @@ static struct gcobj *evacuation_allocate(struct mark_space *space, uintptr_t base = seq * BLOCK_SIZE; while ((base ^ next) & ~block_mask) { - ASSERT(base < next); + GC_ASSERT(base < next); if (base + BLOCK_SIZE > prev) { // The allocation straddles a block boundary, and the cursor has // caught up so that we identify the block for the previous @@ -549,10 +520,10 @@ static struct gcobj *evacuation_allocate(struct mark_space *space, base += BLOCK_SIZE; if (base >= alloc->limit) { // Ran out of blocks! - ASSERT(!block); + GC_ASSERT(!block); return NULL; } - ASSERT(block); + GC_ASSERT(block); // This store can race with other allocators, but that's OK as long // as it never advances the cursor beyond the allocation pointer, // which it won't because we updated the allocation pointer already. @@ -579,37 +550,28 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, ((byte & METADATA_BYTE_PINNED) == 0)) { // This is an evacuating collection, and we are attempting to // evacuate this block, and this particular object isn't pinned. - // First, see if someone evacuated this object already. - uintptr_t header_word = atomic_load_explicit(&obj->tag, - memory_order_relaxed); - uintptr_t busy_header_word = 0; - if (header_word != busy_header_word && - (header_word & gcobj_not_forwarded_bit) == 0) { - // The object has been evacuated already. Update the edge; - // whoever forwarded the object will make sure it's eventually - // traced. - gc_edge_update(edge, gc_ref(header_word)); - return 0; - } - // Otherwise try to claim it for evacuation. - if (header_word != busy_header_word && - atomic_compare_exchange_strong(&obj->tag, &header_word, - busy_header_word)) { + struct gc_atomic_forward fwd = gc_atomic_forward_begin(obj); + + if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) + gc_atomic_forward_acquire(&fwd); + + switch (fwd.state) { + case GC_FORWARDING_STATE_NOT_FORWARDED: + case GC_FORWARDING_STATE_ABORTED: + // Impossible. + abort(); + case GC_FORWARDING_STATE_ACQUIRED: { // We claimed the object successfully; evacuating is up to us. size_t object_granules = mark_space_live_object_granules(metadata); struct gcobj *new_obj = evacuation_allocate(space, object_granules); if (new_obj) { - // We were able to reserve space in which to evacuate this object. - // Commit the evacuation by overwriting the tag. - uintptr_t new_header_word = tag_forwarded(new_obj); - atomic_store_explicit(&obj->tag, new_header_word, - memory_order_release); - // Now copy the object contents, update extent metadata, and - // indicate to the caller that the object's fields need to be - // traced. - new_obj->tag = header_word; - memcpy(&new_obj->words[1], &obj->words[1], - object_granules * GRANULE_SIZE - sizeof(header_word)); + // Copy object contents before committing, as we don't know what + // part of the object (if any) will be overwritten by the + // commit. + memcpy(new_obj, obj, object_granules * GRANULE_SIZE); + gc_atomic_forward_commit(&fwd, (uintptr_t)new_obj); + // Now update extent metadata, and indicate to the caller that + // the object's fields need to be traced. 
uint8_t *new_metadata = object_metadata_byte(new_obj); memcpy(new_metadata + 1, metadata + 1, object_granules - 1); gc_edge_update(edge, gc_ref_from_heap_object(new_obj)); @@ -619,27 +581,33 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, } else { // Well shucks; allocation failed, marking the end of // opportunistic evacuation. No future evacuation of this - // object will succeed. Restore the original header word and - // mark instead. - atomic_store_explicit(&obj->tag, header_word, - memory_order_release); + // object will succeed. Mark in place instead. + gc_atomic_forward_abort(&fwd); } - } else { + break; + } + case GC_FORWARDING_STATE_BUSY: // Someone else claimed this object first. Spin until new address // known, or evacuation aborts. for (size_t spin_count = 0;; spin_count++) { - header_word = atomic_load_explicit(&obj->tag, memory_order_acquire); - if (header_word) + if (gc_atomic_forward_retry_busy(&fwd)) break; yield_for_spin(spin_count); } - if ((header_word & gcobj_not_forwarded_bit) == 0) - gc_edge_update(edge, gc_ref(header_word)); - // Either way, the other party is responsible for adding the - // object to the mark queue. + if (fwd.state == GC_FORWARDING_STATE_ABORTED) + // Remove evacuation aborted; remote will mark and enqueue. + return 0; + ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); + // Fall through. + case GC_FORWARDING_STATE_FORWARDED: + // The object has been evacuated already. Update the edge; + // whoever forwarded the object will make sure it's eventually + // traced. + gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); return 0; } } + uint8_t mask = METADATA_BYTE_YOUNG | METADATA_BYTE_MARK_0 | METADATA_BYTE_MARK_1 | METADATA_BYTE_MARK_2; *metadata = (byte & ~mask) | space->marked_mask; @@ -662,7 +630,7 @@ static inline int trace_edge(struct heap *heap, struct gc_edge edge) { if (!gc_ref_is_heap_object(ref)) return 0; struct gcobj *obj = gc_ref_heap_object(ref); - if (LIKELY(mark_space_contains(heap_mark_space(heap), obj))) { + if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), obj))) { if (heap_mark_space(heap)->evacuating) return mark_space_evacuate_or_mark_object(heap_mark_space(heap), edge, ref); @@ -676,16 +644,7 @@ static inline int trace_edge(struct heap *heap, struct gc_edge edge) { } static inline void trace_one(struct gcobj *obj, void *mark_data) { - switch (tag_live_alloc_kind(obj->tag)) { -#define SCAN_OBJECT(name, Name, NAME) \ - case ALLOC_KIND_##NAME: \ - visit_##name##_fields((Name*)obj, tracer_visit, mark_data); \ - break; - FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) -#undef SCAN_OBJECT - default: - abort (); - } + gc_trace_object(obj, tracer_visit, mark_data, NULL); } static int heap_has_multiple_mutators(struct heap *heap) { @@ -730,23 +689,23 @@ static void remove_mutator(struct heap *heap, struct mutator *mut) { } static void request_mutators_to_stop(struct heap *heap) { - ASSERT(!mutators_are_stopping(heap)); + GC_ASSERT(!mutators_are_stopping(heap)); atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); } static void allow_mutators_to_continue(struct heap *heap) { - ASSERT(mutators_are_stopping(heap)); - ASSERT(heap->active_mutator_count == 0); + GC_ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(heap->active_mutator_count == 0); heap->active_mutator_count++; atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); - ASSERT(!mutators_are_stopping(heap)); + GC_ASSERT(!mutators_are_stopping(heap)); pthread_cond_broadcast(&heap->mutator_cond); } static void 
push_unavailable_block(struct mark_space *space, uintptr_t block) { struct block_summary *summary = block_summary_for_addr(block); - ASSERT(!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)); - ASSERT(!block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); + GC_ASSERT(!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)); + GC_ASSERT(!block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); block_summary_set_flag(summary, BLOCK_UNAVAILABLE); madvise((void*)block, BLOCK_SIZE, MADV_DONTNEED); push_block(&space->unavailable, block); @@ -757,7 +716,7 @@ static uintptr_t pop_unavailable_block(struct mark_space *space) { if (!block) return 0; struct block_summary *summary = block_summary_for_addr(block); - ASSERT(block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); + GC_ASSERT(block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); block_summary_clear_flag(summary, BLOCK_UNAVAILABLE); return block; } @@ -768,7 +727,7 @@ static uintptr_t pop_empty_block(struct mark_space *space) { static int maybe_push_evacuation_target(struct mark_space *space, uintptr_t block, double reserve) { - ASSERT(!block_summary_has_flag(block_summary_for_addr(block), + GC_ASSERT(!block_summary_has_flag(block_summary_for_addr(block), BLOCK_NEEDS_SWEEP)); size_t targets = atomic_load_explicit(&space->evacuation_targets.count, memory_order_acquire); @@ -795,7 +754,7 @@ static int push_evacuation_target_if_possible(struct mark_space *space, } static void push_empty_block(struct mark_space *space, uintptr_t block) { - ASSERT(!block_summary_has_flag(block_summary_for_addr(block), + GC_ASSERT(!block_summary_has_flag(block_summary_for_addr(block), BLOCK_NEEDS_SWEEP)); push_block(&space->empty, block); } @@ -811,7 +770,7 @@ static void mark_space_reacquire_memory(struct mark_space *space, atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; while (pending + BLOCK_SIZE <= 0) { uintptr_t block = pop_unavailable_block(space); - ASSERT(block); + GC_ASSERT(block); if (push_evacuation_target_if_needed(space, block)) continue; push_empty_block(space, block); @@ -859,7 +818,7 @@ static int sweep_until_memory_released(struct mutator *mut) { static void heap_reset_large_object_pages(struct heap *heap, size_t npages) { size_t previous = heap->large_object_pages; heap->large_object_pages = npages; - ASSERT(npages <= previous); + GC_ASSERT(npages <= previous); size_t bytes = (previous - npages) << heap_large_object_space(heap)->page_size_log2; mark_space_reacquire_memory(heap_mark_space(heap), bytes); @@ -888,7 +847,7 @@ static void mutator_mark_buf_grow(struct mutator_mark_buf *buf) { static void mutator_mark_buf_push(struct mutator_mark_buf *buf, struct gcobj *val) { - if (UNLIKELY(buf->size == buf->capacity)) + if (GC_UNLIKELY(buf->size == buf->capacity)) mutator_mark_buf_grow(buf); buf->objects[buf->size++] = val; } @@ -908,7 +867,7 @@ static void mutator_mark_buf_destroy(struct mutator_mark_buf *buf) { static void enqueue_mutator_for_tracing(struct mutator *mut) { struct heap *heap = mutator_heap(mut); - ASSERT(mut->next == NULL); + GC_ASSERT(mut->next == NULL); struct mutator *next = atomic_load_explicit(&heap->mutator_trace_list, memory_order_acquire); do { @@ -948,7 +907,7 @@ static int mutator_should_mark_while_stopping(struct mutator *mut) { // Mark the roots of a mutator that is stopping for GC. We can't // enqueue them directly, so we send them to the controller in a buffer. 
static void mark_stopping_mutator_roots(struct mutator *mut) { - ASSERT(mutator_should_mark_while_stopping(mut)); + GC_ASSERT(mutator_should_mark_while_stopping(mut)); struct heap *heap = mutator_heap(mut); struct mutator_mark_buf *local_roots = &mut->mark_buf; for (struct handle *h = mut->roots; h; h = h->next) { @@ -1026,16 +985,16 @@ static void trace_global_roots(struct heap *heap) { static inline int heap_object_is_young(struct heap *heap, struct gcobj *obj) { - if (UNLIKELY(!mark_space_contains(heap_mark_space(heap), obj))) { + if (GC_UNLIKELY(!mark_space_contains(heap_mark_space(heap), obj))) { // No lospace nursery, for the moment. return 0; } - ASSERT(!heap_object_is_large(obj)); + GC_ASSERT(!heap_object_is_large(obj)); return (*object_metadata_byte(obj)) & METADATA_BYTE_YOUNG; } static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { - ASSERT(((uintptr_t)mark & 7) == 0); + GC_ASSERT(((uintptr_t)mark & 7) == 0); uint8_t * __attribute__((aligned(8))) aligned_mark = mark; uint64_t word; memcpy(&word, aligned_mark, 8); @@ -1073,7 +1032,7 @@ static void mark_space_trace_card(struct mark_space *space, size_t granule = granule_base + granule_offset; uintptr_t addr = first_addr_in_slab + granule * GRANULE_SIZE; struct gcobj *obj = (struct gcobj*)addr; - ASSERT(object_metadata_byte(obj) == &slab->metadata[granule]); + GC_ASSERT(object_metadata_byte(obj) == &slab->metadata[granule]); tracer_enqueue_root(&heap->tracer, obj); } } @@ -1081,7 +1040,7 @@ static void mark_space_trace_card(struct mark_space *space, static void mark_space_trace_remembered_set(struct mark_space *space, struct heap *heap) { - ASSERT(!space->evacuating); + GC_ASSERT(!space->evacuating); for (size_t s = 0; s < space->nslabs; s++) { struct slab *slab = &space->slabs[s]; uint8_t *remset = slab->remembered_set; @@ -1116,10 +1075,10 @@ static void trace_generational_roots(struct heap *heap) { } } -static void pause_mutator_for_collection(struct heap *heap) NEVER_INLINE; +static void pause_mutator_for_collection(struct heap *heap) GC_NEVER_INLINE; static void pause_mutator_for_collection(struct heap *heap) { - ASSERT(mutators_are_stopping(heap)); - ASSERT(heap->active_mutator_count); + GC_ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(heap->active_mutator_count); heap->active_mutator_count--; if (heap->active_mutator_count == 0) pthread_cond_signal(&heap->collector_cond); @@ -1139,10 +1098,10 @@ static void pause_mutator_for_collection(struct heap *heap) { heap->active_mutator_count++; } -static void pause_mutator_for_collection_with_lock(struct mutator *mut) NEVER_INLINE; +static void pause_mutator_for_collection_with_lock(struct mutator *mut) GC_NEVER_INLINE; static void pause_mutator_for_collection_with_lock(struct mutator *mut) { struct heap *heap = mutator_heap(mut); - ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(mutators_are_stopping(heap)); finish_sweeping_in_block(mut); if (mutator_should_mark_while_stopping(mut)) // No need to collect results in mark buf; we can enqueue roots directly. 
@@ -1152,10 +1111,10 @@ static void pause_mutator_for_collection_with_lock(struct mutator *mut) { pause_mutator_for_collection(heap); } -static void pause_mutator_for_collection_without_lock(struct mutator *mut) NEVER_INLINE; +static void pause_mutator_for_collection_without_lock(struct mutator *mut) GC_NEVER_INLINE; static void pause_mutator_for_collection_without_lock(struct mutator *mut) { struct heap *heap = mutator_heap(mut); - ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(mutators_are_stopping(heap)); finish_sweeping(mut); if (mutator_should_mark_while_stopping(mut)) mark_stopping_mutator_roots(mut); @@ -1310,7 +1269,7 @@ static enum gc_kind determine_collection_kind(struct heap *heap) { } else { DEBUG("keeping on with minor GC\n"); // Nursery has adequate space; keep trucking with minor GCs. - ASSERT(previous_gc_kind == GC_KIND_MINOR_IN_PLACE); + GC_ASSERT(previous_gc_kind == GC_KIND_MINOR_IN_PLACE); gc_kind = GC_KIND_MINOR_IN_PLACE; } @@ -1391,7 +1350,7 @@ static void prepare_for_evacuation(struct heap *heap) { // they have been removed from the pool and have the UNAVAILABLE flag // set, or because they are on the empties or evacuation target // lists. When evacuation starts, the empties list should be empty. - ASSERT(empties == target_blocks); + GC_ASSERT(empties == target_blocks); // Now select a number of blocks that is likely to fill the space in // the target blocks. Prefer candidate blocks with fewer survivors @@ -1560,7 +1519,7 @@ static uintptr_t mark_space_next_block_to_sweep(struct mark_space *space) { } static void finish_block(struct mutator *mut) { - ASSERT(mut->block); + GC_ASSERT(mut->block); struct block_summary *block = block_summary_for_addr(mut->block); struct mark_space *space = heap_mark_space(mutator_heap(mut)); atomic_fetch_add(&space->granules_freed_by_last_collection, @@ -1572,7 +1531,7 @@ static void finish_block(struct mutator *mut) { // trying to allocate into it for a minor GC. Sweep it next time to // clear any garbage allocated in this cycle and mark it as // "venerable" (i.e., old). 
- ASSERT(!block_summary_has_flag(block, BLOCK_VENERABLE)); + GC_ASSERT(!block_summary_has_flag(block, BLOCK_VENERABLE)); if (!block_summary_has_flag(block, BLOCK_VENERABLE_AFTER_SWEEP) && block->free_granules < GRANULES_PER_BLOCK * space->venerable_threshold) block_summary_set_flag(block, BLOCK_VENERABLE_AFTER_SWEEP); @@ -1590,7 +1549,7 @@ static size_t next_hole_in_block(struct mutator *mut) { uintptr_t sweep_mask = heap_mark_space(mutator_heap(mut))->sweep_mask; while (sweep != limit) { - ASSERT((sweep & (GRANULE_SIZE - 1)) == 0); + GC_ASSERT((sweep & (GRANULE_SIZE - 1)) == 0); uint8_t* metadata = object_metadata_byte((struct gcobj*)sweep); size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; @@ -1613,12 +1572,12 @@ static size_t next_hole_in_block(struct mutator *mut) { } size_t free_granules = next_mark(metadata, limit_granules, sweep_mask); - ASSERT(free_granules); - ASSERT(free_granules <= limit_granules); + GC_ASSERT(free_granules); + GC_ASSERT(free_granules <= limit_granules); struct block_summary *summary = block_summary_for_addr(sweep); summary->hole_count++; - ASSERT(free_granules <= GRANULES_PER_BLOCK - summary->free_granules); + GC_ASSERT(free_granules <= GRANULES_PER_BLOCK - summary->free_granules); summary->free_granules += free_granules; size_t free_bytes = free_granules * GRANULE_SIZE; @@ -1645,7 +1604,7 @@ static void finish_hole(struct mutator *mut) { } static int maybe_release_swept_empty_block(struct mutator *mut) { - ASSERT(mut->block); + GC_ASSERT(mut->block); struct mark_space *space = heap_mark_space(mutator_heap(mut)); uintptr_t block = mut->block; if (atomic_load_explicit(&space->pending_unavailable_bytes, @@ -1696,7 +1655,7 @@ static size_t next_hole(struct mutator *mut) { mut->alloc = mut->sweep = mut->block = 0; empties_countdown--; } - ASSERT(mut->block == 0); + GC_ASSERT(mut->block == 0); while (1) { uintptr_t block = mark_space_next_block_to_sweep(space); if (block) { @@ -1797,8 +1756,7 @@ static void trigger_collection(struct mutator *mut) { heap_unlock(heap); } -static void* allocate_large(struct mutator *mut, enum alloc_kind kind, - size_t granules) { +static void* allocate_large(struct mutator *mut, size_t granules) { struct heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); @@ -1821,14 +1779,11 @@ static void* allocate_large(struct mutator *mut, enum alloc_kind kind, abort(); } - *(uintptr_t*)ret = tag_live(kind); return ret; } -static void* allocate_small_slow(struct mutator *mut, enum alloc_kind kind, - size_t granules) NEVER_INLINE; -static void* allocate_small_slow(struct mutator *mut, enum alloc_kind kind, - size_t granules) { +static void* allocate_small_slow(struct mutator *mut, size_t granules) GC_NEVER_INLINE; +static void* allocate_small_slow(struct mutator *mut, size_t granules) { while (1) { size_t hole = next_hole(mut); if (hole >= granules) { @@ -1843,9 +1798,8 @@ static void* allocate_small_slow(struct mutator *mut, enum alloc_kind kind, return ret; } -static inline void* allocate_small(struct mutator *mut, enum alloc_kind kind, - size_t granules) { - ASSERT(granules > 0); // allocating 0 granules would be silly +static inline void* allocate_small(struct mutator *mut, size_t granules) { + GC_ASSERT(granules > 0); // allocating 0 granules would be silly uintptr_t alloc = mut->alloc; uintptr_t sweep = mut->sweep; uintptr_t new_alloc = alloc + granules * GRANULE_SIZE; @@ -1854,9 +1808,8 @@ static inline void* allocate_small(struct mutator *mut, enum alloc_kind kind, mut->alloc = 
new_alloc; obj = (struct gcobj *)alloc; } else { - obj = allocate_small_slow(mut, kind, granules); + obj = allocate_small_slow(mut, granules); } - obj->tag = tag_live(kind); uint8_t *metadata = object_metadata_byte(obj); if (granules == 1) { metadata[0] = METADATA_BYTE_YOUNG | METADATA_BYTE_END; @@ -1869,24 +1822,20 @@ static inline void* allocate_small(struct mutator *mut, enum alloc_kind kind, return obj; } -static inline void* allocate_medium(struct mutator *mut, enum alloc_kind kind, - size_t granules) { - return allocate_small(mut, kind, granules); +static inline void* allocate_medium(struct mutator *mut, size_t granules) { + return allocate_small(mut, granules); } -static inline void* allocate(struct mutator *mut, enum alloc_kind kind, - size_t size) { +static inline void* gc_allocate(struct mutator *mut, size_t size) { size_t granules = size_to_granules(size); if (granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) - return allocate_small(mut, kind, granules); + return allocate_small(mut, granules); if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) - return allocate_medium(mut, kind, granules); - return allocate_large(mut, kind, granules); + return allocate_medium(mut, granules); + return allocate_large(mut, granules); } -static inline void* allocate_pointerless(struct mutator *mut, - enum alloc_kind kind, - size_t size) { - return allocate(mut, kind, size); +static inline void* gc_allocate_pointerless(struct mutator *mut, size_t size) { + return gc_allocate(mut, size); } static inline void mark_space_write_barrier(void *obj) { @@ -1940,8 +1889,8 @@ struct options { }; static size_t parse_size_t(double value) { - ASSERT(value >= 0); - ASSERT(value <= (size_t) -1); + GC_ASSERT(value >= 0); + GC_ASSERT(value <= (size_t) -1); return value; } @@ -2093,7 +2042,7 @@ static void gc_finish_for_thread(struct mutator *mut) { } static void deactivate_mutator(struct heap *heap, struct mutator *mut) { - ASSERT(mut->next == NULL); + GC_ASSERT(mut->next == NULL); heap_lock(heap); mut->next = heap->deactivated_mutators; heap->deactivated_mutators = mut; From 4d8a7169d0040752f0d3e9993189c3ee60f578cf Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 14 Aug 2022 09:18:21 +0200 Subject: [PATCH 127/403] Add inline to gc-api.h --- gc-api.h | 1 + 1 file changed, 1 insertion(+) diff --git a/gc-api.h b/gc-api.h index 9bcd69d70..67169a99b 100644 --- a/gc-api.h +++ b/gc-api.h @@ -3,6 +3,7 @@ #include "gc-config.h" #include "gc-assert.h" +#include "gc-inline.h" #include "gc-ref.h" #include "gc-edge.h" From a75842be9002c00403d8eac656c2ec42353b8a4c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 15 Aug 2022 11:17:15 +0200 Subject: [PATCH 128/403] Mostly implementation-independent inline allocation This is a step towards separate compilation of the GC without losing performance. Only remaining task is the write barrier. 
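As an illustration of the embedder side of this split (an editor's sketch,
not part of the patch; allocate_node and Node are the names already used by
the benchmarks in this repository, but this body is assumed):

    static Node* allocate_node(struct mutator *mut) {
      // gc_allocate() now inlines the collector-specific fast path: a bump
      // pointer for semi and whippet, per-size freelists for bdw.  It only
      // calls out of line, via gc_allocate_small() or gc_allocate_large(),
      // when the fast path cannot satisfy the request.
      return gc_allocate(mut, sizeof(Node));
    }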
--- bdw.h | 40 +++++++++++++++++++--- gc-api.h | 96 +++++++++++++++++++++++++++++++++++++++++++++++++-- semi.h | 51 +++++++++++++++++++++------- whippet.h | 100 +++++++++++++++++++++++++++++++----------------------- 4 files changed, 225 insertions(+), 62 deletions(-) diff --git a/bdw.h b/bdw.h index 1eb841662..8bfa30feb 100644 --- a/bdw.h +++ b/bdw.h @@ -45,6 +45,34 @@ static inline size_t gc_inline_freelist_object_size(size_t idx) { return (idx + 1U) * GC_INLINE_GRANULE_BYTES; } +static inline enum gc_allocator_kind gc_allocator_kind(void) { + return GC_ALLOCATOR_INLINE_FREELIST; +} +static inline size_t gc_allocator_small_granule_size(void) { + return GC_INLINE_GRANULE_BYTES; +} +static inline size_t gc_allocator_large_threshold(void) { + return 256; +} + +static inline size_t gc_allocator_allocation_pointer_offset(void) { + abort(); +} +static inline size_t gc_allocator_allocation_limit_offset(void) { + abort(); +} + +static inline size_t gc_allocator_freelist_offset(size_t size) { + GC_ASSERT(size); + return sizeof(void*) * gc_inline_bytes_to_freelist_index(size); +} + +static inline void gc_allocator_inline_success(struct mutator *mut, + struct gc_ref obj, + uintptr_t aligned_size) {} +static inline void gc_allocator_inline_failure(struct mutator *mut, + uintptr_t aligned_size) {} + // The values of these must match the internal POINTERLESS and NORMAL // definitions in libgc, for which unfortunately there are no external // definitions. Alack. @@ -80,12 +108,14 @@ allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) { return head; } -static inline void* gc_allocate(struct mutator *mut, size_t size) { +static void* gc_allocate_large(struct mutator *mut, size_t size) { + return GC_malloc(size); +} + +static void* gc_allocate_small(struct mutator *mut, size_t size) { + GC_ASSERT(size != 0); + GC_ASSERT(size <= gc_allocator_large_threshold()); size_t idx = gc_inline_bytes_to_freelist_index(size); - - if (UNLIKELY(idx >= GC_INLINE_FREELIST_COUNT)) - return GC_malloc(size); - return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL); } diff --git a/gc-api.h b/gc-api.h index 67169a99b..65154cd5a 100644 --- a/gc-api.h +++ b/gc-api.h @@ -23,6 +23,10 @@ struct gc_option { double value; }; +struct gc_mutator { + void *user_data; +}; + // FIXME: Conflict with bdw-gc GC_API. Switch prefix? 
#ifndef GC_API_ #define GC_API_ static @@ -38,8 +42,96 @@ GC_API_ void gc_finish_for_thread(struct mutator *mut); GC_API_ void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; -GC_API_ inline void* gc_allocate(struct mutator *mut, size_t bytes); +GC_API_ void* gc_allocate_small(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; +GC_API_ void* gc_allocate_large(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; +static inline void* gc_allocate(struct mutator *mut, size_t bytes) GC_ALWAYS_INLINE; // FIXME: remove :P -GC_API_ inline void* gc_allocate_pointerless(struct mutator *mut, size_t bytes); +static inline void* gc_allocate_pointerless(struct mutator *mut, size_t bytes); + +enum gc_allocator_kind { + GC_ALLOCATOR_INLINE_BUMP_POINTER, + GC_ALLOCATOR_INLINE_FREELIST, + GC_ALLOCATOR_INLINE_NONE +}; + +static inline enum gc_allocator_kind gc_allocator_kind(void) GC_ALWAYS_INLINE; + +static inline size_t gc_allocator_large_threshold(void) GC_ALWAYS_INLINE; + +static inline size_t gc_allocator_small_granule_size(void) GC_ALWAYS_INLINE; + +static inline size_t gc_allocator_allocation_pointer_offset(void) GC_ALWAYS_INLINE; +static inline size_t gc_allocator_allocation_limit_offset(void) GC_ALWAYS_INLINE; + +static inline size_t gc_allocator_freelist_offset(size_t size) GC_ALWAYS_INLINE; + +static inline void gc_allocator_inline_success(struct mutator *mut, + struct gc_ref obj, + uintptr_t aligned_size); +static inline void gc_allocator_inline_failure(struct mutator *mut, + uintptr_t aligned_size); + +static inline void* +gc_allocate_bump_pointer(struct mutator *mut, size_t size) GC_ALWAYS_INLINE; +static inline void* gc_allocate_bump_pointer(struct mutator *mut, size_t size) { + GC_ASSERT(size <= gc_allocator_large_threshold()); + + size_t granule_size = gc_allocator_small_granule_size(); + size_t hp_offset = gc_allocator_allocation_pointer_offset(); + size_t limit_offset = gc_allocator_allocation_limit_offset(); + + uintptr_t base_addr = (uintptr_t)mut; + uintptr_t *hp_loc = (uintptr_t*)(base_addr + hp_offset); + uintptr_t *limit_loc = (uintptr_t*)(base_addr + limit_offset); + + size = (size + granule_size - 1) & ~(granule_size - 1); + uintptr_t hp = *hp_loc; + uintptr_t limit = *limit_loc; + uintptr_t new_hp = hp + size; + + if (GC_UNLIKELY (new_hp > limit)) { + gc_allocator_inline_failure(mut, size); + return gc_allocate_small(mut, size); + } + + gc_allocator_inline_success(mut, gc_ref(hp), size); + + *hp_loc = new_hp; + return (void*)hp; +} + +static inline void* gc_allocate_freelist(struct mutator *mut, + size_t size) GC_ALWAYS_INLINE; +static inline void* gc_allocate_freelist(struct mutator *mut, size_t size) { + GC_ASSERT(size <= gc_allocator_large_threshold()); + + size_t freelist_offset = gc_allocator_freelist_offset(size); + uintptr_t base_addr = (uintptr_t)mut; + void **freelist_loc = (void**)(base_addr + freelist_offset); + + void *head = *freelist_loc; + if (GC_UNLIKELY(!head)) + return gc_allocate_small(mut, size); + + *freelist_loc = *(void**)head; + return head; +} + +static inline void* gc_allocate(struct mutator *mut, size_t size) { + GC_ASSERT(size != 0); + if (size > gc_allocator_large_threshold()) + return gc_allocate_large(mut, size); + + switch (gc_allocator_kind()) { + case GC_ALLOCATOR_INLINE_BUMP_POINTER: + return gc_allocate_bump_pointer(mut, size); + case GC_ALLOCATOR_INLINE_FREELIST: + return gc_allocate_freelist(mut, size); + case GC_ALLOCATOR_INLINE_NONE: + return gc_allocate_small(mut, size); + default: + abort(); + } +} 
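/* Editor's note: each collector's header defines gc_allocator_kind() as a
 * static inline function returning a constant, so with optimization this
 * switch is expected to constant-fold and only the selected fast path
 * remains inlined at the allocation site; the other cases, including the
 * abort(), should disappear. */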
#endif // GC_API_H_ diff --git a/semi.h b/semi.h index 02677d9c5..38dae224e 100644 --- a/semi.h +++ b/semi.h @@ -29,6 +29,43 @@ struct mutator { struct handle *roots; }; +static const uintptr_t ALIGNMENT = 8; +static const size_t LARGE_OBJECT_THRESHOLD = 8192; + +static inline void clear_memory(uintptr_t addr, size_t size) { + memset((char*)addr, 0, size); +} + +static inline enum gc_allocator_kind gc_allocator_kind(void) { + return GC_ALLOCATOR_INLINE_BUMP_POINTER; +} +static inline size_t gc_allocator_small_granule_size(void) { + return ALIGNMENT; +} +static inline size_t gc_allocator_large_threshold(void) { + return LARGE_OBJECT_THRESHOLD; +} + +static inline size_t gc_allocator_allocation_pointer_offset(void) { + return offsetof(struct semi_space, hp); +} +static inline size_t gc_allocator_allocation_limit_offset(void) { + return offsetof(struct semi_space, limit); +} + +static inline size_t gc_allocator_freelist_offset(size_t size) { + abort(); +} + +static inline void gc_allocator_inline_success(struct mutator *mut, + struct gc_ref obj, + uintptr_t aligned_size) { + // FIXME: Allow allocator to avoid clearing memory? + clear_memory(gc_ref_value(obj), aligned_size); +} +static inline void gc_allocator_inline_failure(struct mutator *mut, + uintptr_t aligned_size) {} + static inline struct heap* mutator_heap(struct mutator *mut) { return &mut->heap; } @@ -42,16 +79,10 @@ static inline struct semi_space* mutator_semi_space(struct mutator *mut) { return heap_semi_space(mutator_heap(mut)); } -static const uintptr_t ALIGNMENT = 8; - static uintptr_t align_up(uintptr_t addr, size_t align) { return (addr + align - 1) & ~(align-1); } -static inline void clear_memory(uintptr_t addr, size_t size) { - memset((char*)addr, 0, size); -} - static void collect(struct mutator *mut) GC_NEVER_INLINE; static void collect_for_alloc(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; @@ -171,8 +202,7 @@ static void collect_for_alloc(struct mutator *mut, size_t bytes) { } } -static const size_t LARGE_OBJECT_THRESHOLD = 8192; -static void* allocate_large(struct mutator *mut, size_t size) { +static void* gc_allocate_large(struct mutator *mut, size_t size) { struct heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); struct semi_space *semi_space = heap_semi_space(heap); @@ -198,10 +228,7 @@ static void* allocate_large(struct mutator *mut, size_t size) { return ret; } -static inline void* gc_allocate(struct mutator *mut, size_t size) { - if (size >= LARGE_OBJECT_THRESHOLD) - return allocate_large(mut, size); - +static void* gc_allocate_small(struct mutator *mut, size_t size) { struct semi_space *space = mutator_semi_space(mut); while (1) { uintptr_t addr = space->hp; diff --git a/whippet.h b/whippet.h index 875d1cd37..405e87abc 100644 --- a/whippet.h +++ b/whippet.h @@ -369,6 +369,44 @@ static inline struct heap* mutator_heap(struct mutator *mutator) { return mutator->heap; } +static inline enum gc_allocator_kind gc_allocator_kind(void) { + return GC_ALLOCATOR_INLINE_BUMP_POINTER; +} +static inline size_t gc_allocator_small_granule_size(void) { + return GRANULE_SIZE; +} +static inline size_t gc_allocator_large_threshold(void) { + return LARGE_OBJECT_THRESHOLD; +} + +static inline size_t gc_allocator_allocation_pointer_offset(void) { + return offsetof(struct mutator, alloc); +} +static inline size_t gc_allocator_allocation_limit_offset(void) { + return offsetof(struct mutator, sweep); +} + +static inline size_t gc_allocator_freelist_offset(size_t size) { + abort(); +} 
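/* Editor's note: whippet's inline allocator is the bump-pointer kind, so
 * the generic code in gc-api.h never takes the freelist path for it.
 * Aborting here (semi.h does the same) turns any such call into an obvious
 * dispatch bug rather than a silently bogus offset. */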
+ +static inline void gc_allocator_inline_success(struct mutator *mut, + struct gc_ref obj, + uintptr_t aligned_size) { + uint8_t *metadata = object_metadata_byte(gc_ref_heap_object(obj)); + size_t granules = aligned_size >> GRANULE_SIZE_LOG_2; + if (granules == 1) { + metadata[0] = METADATA_BYTE_YOUNG | METADATA_BYTE_END; + } else { + metadata[0] = METADATA_BYTE_YOUNG; + if (granules > 2) + memset(metadata + 1, 0, granules - 2); + metadata[granules - 1] = METADATA_BYTE_END; + } +} +static inline void gc_allocator_inline_failure(struct mutator *mut, + uintptr_t aligned_size) {} + static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } @@ -1756,11 +1794,10 @@ static void trigger_collection(struct mutator *mut) { heap_unlock(heap); } -static void* allocate_large(struct mutator *mut, size_t granules) { +static void* gc_allocate_large(struct mutator *mut, size_t size) { struct heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); - size_t size = granules * GRANULE_SIZE; size_t npages = large_object_space_npages(space, size); mark_space_request_release_memory(heap_mark_space(heap), @@ -1782,58 +1819,35 @@ static void* allocate_large(struct mutator *mut, size_t granules) { return ret; } -static void* allocate_small_slow(struct mutator *mut, size_t granules) GC_NEVER_INLINE; -static void* allocate_small_slow(struct mutator *mut, size_t granules) { - while (1) { - size_t hole = next_hole(mut); - if (hole >= granules) { - clear_memory(mut->alloc, hole * GRANULE_SIZE); - break; - } - if (!hole) - trigger_collection(mut); - } - struct gcobj* ret = (struct gcobj*)mut->alloc; - mut->alloc += granules * GRANULE_SIZE; - return ret; -} - -static inline void* allocate_small(struct mutator *mut, size_t granules) { - GC_ASSERT(granules > 0); // allocating 0 granules would be silly +static void* gc_allocate_small(struct mutator *mut, size_t size) { + GC_ASSERT(size > 0); // allocating 0 bytes would be silly + GC_ASSERT(size <= gc_allocator_large_threshold()); + size = align_up(size, GRANULE_SIZE); uintptr_t alloc = mut->alloc; uintptr_t sweep = mut->sweep; - uintptr_t new_alloc = alloc + granules * GRANULE_SIZE; + uintptr_t new_alloc = alloc + size; struct gcobj *obj; if (new_alloc <= sweep) { mut->alloc = new_alloc; obj = (struct gcobj *)alloc; } else { - obj = allocate_small_slow(mut, granules); - } - uint8_t *metadata = object_metadata_byte(obj); - if (granules == 1) { - metadata[0] = METADATA_BYTE_YOUNG | METADATA_BYTE_END; - } else { - metadata[0] = METADATA_BYTE_YOUNG; - if (granules > 2) - memset(metadata + 1, 0, granules - 2); - metadata[granules - 1] = METADATA_BYTE_END; + size_t granules = size >> GRANULE_SIZE_LOG_2; + while (1) { + size_t hole = next_hole(mut); + if (hole >= granules) { + clear_memory(mut->alloc, hole * GRANULE_SIZE); + break; + } + if (!hole) + trigger_collection(mut); + } + obj = (struct gcobj*)mut->alloc; + mut->alloc += size; } + gc_allocator_inline_success(mut, gc_ref_from_heap_object(obj), size); return obj; } -static inline void* allocate_medium(struct mutator *mut, size_t granules) { - return allocate_small(mut, granules); -} - -static inline void* gc_allocate(struct mutator *mut, size_t size) { - size_t granules = size_to_granules(size); - if (granules <= MEDIUM_OBJECT_GRANULE_THRESHOLD) - return allocate_small(mut, granules); - if (granules <= LARGE_OBJECT_GRANULE_THRESHOLD) - return allocate_medium(mut, granules); - return allocate_large(mut, granules); -} static inline void* 
gc_allocate_pointerless(struct mutator *mut, size_t size) { return gc_allocate(mut, size); } From a00c83878ee2c65b08c0ace5ec8ffc3857558c50 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 15 Aug 2022 16:00:01 +0200 Subject: [PATCH 129/403] Inline post-allocation actions --- bdw.h | 25 ++++++++++-- gc-api.h | 109 +++++++++++++++++++++++++++++++++++++++++++-------- mt-gcbench.c | 15 +++++-- quads.c | 2 +- semi.h | 30 ++++++++------ whippet.h | 38 +++++++++++------- 6 files changed, 168 insertions(+), 51 deletions(-) diff --git a/bdw.h b/bdw.h index 8bfa30feb..21691ade8 100644 --- a/bdw.h +++ b/bdw.h @@ -131,11 +131,28 @@ static inline void collect(struct mutator *mut) { GC_gcollect(); } -static inline void init_field(void *obj, void **addr, void *val) { - *addr = val; +static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { + return GC_WRITE_BARRIER_NONE; } -static inline void set_field(void *obj, void **addr, void *val) { - *addr = val; +static inline size_t gc_small_write_barrier_card_table_alignment(void) { + abort(); +} +static inline size_t gc_small_write_barrier_card_size(void) { + abort(); +} + +static inline size_t gc_allocator_alloc_table_alignment(void) { + return 0; +} +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { + abort(); +} +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { + abort(); +} + +static inline int gc_allocator_needs_clear(void) { + return 0; } static inline struct mutator *add_mutator(struct heap *heap) { diff --git a/gc-api.h b/gc-api.h index 65154cd5a..df865025f 100644 --- a/gc-api.h +++ b/gc-api.h @@ -7,7 +7,9 @@ #include "gc-ref.h" #include "gc-edge.h" +#include #include +#include // FIXME: prefix with gc_ struct heap; @@ -42,12 +44,6 @@ GC_API_ void gc_finish_for_thread(struct mutator *mut); GC_API_ void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; -GC_API_ void* gc_allocate_small(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; -GC_API_ void* gc_allocate_large(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; -static inline void* gc_allocate(struct mutator *mut, size_t bytes) GC_ALWAYS_INLINE; -// FIXME: remove :P -static inline void* gc_allocate_pointerless(struct mutator *mut, size_t bytes); - enum gc_allocator_kind { GC_ALLOCATOR_INLINE_BUMP_POINTER, GC_ALLOCATOR_INLINE_FREELIST, @@ -65,11 +61,54 @@ static inline size_t gc_allocator_allocation_limit_offset(void) GC_ALWAYS_INLINE static inline size_t gc_allocator_freelist_offset(size_t size) GC_ALWAYS_INLINE; -static inline void gc_allocator_inline_success(struct mutator *mut, - struct gc_ref obj, - uintptr_t aligned_size); -static inline void gc_allocator_inline_failure(struct mutator *mut, - uintptr_t aligned_size); +static inline size_t gc_allocator_alloc_table_alignment(void) GC_ALWAYS_INLINE; +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) GC_ALWAYS_INLINE; +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) GC_ALWAYS_INLINE; + +static inline int gc_allocator_needs_clear(void) GC_ALWAYS_INLINE; + +static inline void gc_clear_fresh_allocation(struct gc_ref obj, + size_t size) GC_ALWAYS_INLINE; +static inline void gc_clear_fresh_allocation(struct gc_ref obj, + size_t size) { + if (!gc_allocator_needs_clear()) return; + memset(gc_ref_heap_object(obj), 0, size); +} + +static inline void gc_update_alloc_table(struct mutator *mut, + struct gc_ref obj, + size_t size) GC_ALWAYS_INLINE; +static inline void gc_update_alloc_table(struct mutator *mut, + struct gc_ref 
obj, + size_t size) { + size_t alignment = gc_allocator_alloc_table_alignment(); + if (!alignment) return; + + uintptr_t addr = gc_ref_value(obj); + uintptr_t base = addr & ~(alignment - 1); + size_t granule_size = gc_allocator_small_granule_size(); + uintptr_t granule = (addr & (alignment - 1)) / granule_size; + uint8_t *alloc = (uint8_t*)(base + granule); + + uint8_t begin_pattern = gc_allocator_alloc_table_begin_pattern(); + uint8_t end_pattern = gc_allocator_alloc_table_end_pattern(); + if (end_pattern) { + size_t granules = size / granule_size; + if (granules == 1) { + alloc[0] = begin_pattern | end_pattern; + } else { + alloc[0] = begin_pattern; + if (granules > 2) + memset(alloc + 1, 0, granules - 2); + alloc[granules - 1] = end_pattern; + } + } else { + alloc[0] = begin_pattern; + } +} + +GC_API_ void* gc_allocate_small(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; +GC_API_ void* gc_allocate_large(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; static inline void* gc_allocate_bump_pointer(struct mutator *mut, size_t size) GC_ALWAYS_INLINE; @@ -89,14 +128,14 @@ static inline void* gc_allocate_bump_pointer(struct mutator *mut, size_t size) { uintptr_t limit = *limit_loc; uintptr_t new_hp = hp + size; - if (GC_UNLIKELY (new_hp > limit)) { - gc_allocator_inline_failure(mut, size); + if (GC_UNLIKELY (new_hp > limit)) return gc_allocate_small(mut, size); - } - - gc_allocator_inline_success(mut, gc_ref(hp), size); *hp_loc = new_hp; + + gc_clear_fresh_allocation(gc_ref(hp), size); + gc_update_alloc_table(mut, gc_ref(hp), size); + return (void*)hp; } @@ -114,9 +153,14 @@ static inline void* gc_allocate_freelist(struct mutator *mut, size_t size) { return gc_allocate_small(mut, size); *freelist_loc = *(void**)head; + + gc_clear_fresh_allocation(gc_ref_from_heap_object(head), size); + gc_update_alloc_table(mut, gc_ref_from_heap_object(head), size); + return head; } +static inline void* gc_allocate(struct mutator *mut, size_t bytes) GC_ALWAYS_INLINE; static inline void* gc_allocate(struct mutator *mut, size_t size) { GC_ASSERT(size != 0); if (size > gc_allocator_large_threshold()) @@ -134,4 +178,37 @@ static inline void* gc_allocate(struct mutator *mut, size_t size) { } } +// FIXME: remove :P +static inline void* gc_allocate_pointerless(struct mutator *mut, size_t bytes); + +enum gc_write_barrier_kind { + GC_WRITE_BARRIER_NONE, + GC_WRITE_BARRIER_CARD +}; + +static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void); +static inline size_t gc_small_write_barrier_card_table_alignment(void); +static inline size_t gc_small_write_barrier_card_size(void); + +static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge, + struct gc_ref new_val) GC_ALWAYS_INLINE; +static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge, + struct gc_ref new_val) { + switch (gc_small_write_barrier_kind()) { + case GC_WRITE_BARRIER_NONE: + return; + case GC_WRITE_BARRIER_CARD: { + size_t card_table_alignment = gc_small_write_barrier_card_table_alignment(); + size_t card_size = gc_small_write_barrier_card_size(); + uintptr_t addr = gc_ref_value(obj); + uintptr_t base = addr & ~(card_table_alignment - 1); + uintptr_t card = (addr & (card_table_alignment - 1)) / card_size; + atomic_store_explicit((uint8_t*)(base + card), 1, memory_order_relaxed); + return; + } + default: + abort(); + } +} + #endif // GC_API_H_ diff --git a/mt-gcbench.c b/mt-gcbench.c index 90634c365..3d7ae8933 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -195,6 +195,13 @@ static void 
allocate_garbage(struct thread *t) { } } +static void set_field(Node *obj, Node **field, Node *val) { + gc_small_write_barrier(gc_ref_from_heap_object(obj), + gc_edge(field), + gc_ref_from_heap_object(val)); + *field = val; +} + // Build tree top down, assigning to older objects. static void populate(struct thread *t, int depth, Node *node) { struct mutator *mut = t->mut; @@ -210,8 +217,8 @@ static void populate(struct thread *t, int depth, Node *node) { NodeHandle r = { allocate_node(mut) }; PUSH_HANDLE(mut, r); - set_field(HANDLE_REF(self), (void**)&HANDLE_REF(self)->left, HANDLE_REF(l)); - set_field(HANDLE_REF(self), (void**)&HANDLE_REF(self)->right, HANDLE_REF(r)); + set_field(HANDLE_REF(self), &HANDLE_REF(self)->left, HANDLE_REF(l)); + set_field(HANDLE_REF(self), &HANDLE_REF(self)->right, HANDLE_REF(r)); // i is 0 because the memory is zeroed. HANDLE_REF(self)->j = depth; @@ -236,8 +243,8 @@ static Node* make_tree(struct thread *t, int depth) { allocate_garbage(t); Node *result = allocate_node(mut); - init_field(result, (void**)&result->left, HANDLE_REF(left)); - init_field(result, (void**)&result->right, HANDLE_REF(right)); + result->left = HANDLE_REF(left); + result->right = HANDLE_REF(right); // i is 0 because the memory is zeroed. result->j = depth; diff --git a/quads.c b/quads.c index f7ca4e56a..968c64163 100644 --- a/quads.c +++ b/quads.c @@ -51,7 +51,7 @@ static Quad* make_tree(struct mutator *mut, int depth) { Quad *result = allocate_quad(mut); for (size_t i = 0; i < 4; i++) - init_field(result, (void**)&result->kids[i], HANDLE_REF(kids[i])); + result->kids[i] = HANDLE_REF(kids[i]); for (size_t i = 0; i < 4; i++) POP_HANDLE(mut); diff --git a/semi.h b/semi.h index 38dae224e..cb50e445c 100644 --- a/semi.h +++ b/semi.h @@ -57,14 +57,19 @@ static inline size_t gc_allocator_freelist_offset(size_t size) { abort(); } -static inline void gc_allocator_inline_success(struct mutator *mut, - struct gc_ref obj, - uintptr_t aligned_size) { - // FIXME: Allow allocator to avoid clearing memory? 
- clear_memory(gc_ref_value(obj), aligned_size); +static inline int gc_allocator_needs_clear(void) { + return 1; +} + +static inline size_t gc_allocator_alloc_table_alignment(void) { + return 0; +} +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { + abort(); +} +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { + abort(); } -static inline void gc_allocator_inline_failure(struct mutator *mut, - uintptr_t aligned_size) {} static inline struct heap* mutator_heap(struct mutator *mut) { return &mut->heap; @@ -247,11 +252,14 @@ static inline void* gc_allocate_pointerless(struct mutator *mut, size_t size) { return gc_allocate(mut, size); } -static inline void init_field(void *obj, void **addr, void *val) { - *addr = val; +static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { + return GC_WRITE_BARRIER_NONE; } -static inline void set_field(void *obj, void **addr, void *val) { - *addr = val; +static inline size_t gc_small_write_barrier_card_table_alignment(void) { + abort(); +} +static inline size_t gc_small_write_barrier_card_size(void) { + abort(); } static int initialize_semi_space(struct semi_space *space, size_t size) { diff --git a/whippet.h b/whippet.h index 405e87abc..03c76349f 100644 --- a/whippet.h +++ b/whippet.h @@ -1852,24 +1852,32 @@ static inline void* gc_allocate_pointerless(struct mutator *mut, size_t size) { return gc_allocate(mut, size); } -static inline void mark_space_write_barrier(void *obj) { - // Unconditionally mark the card the object is in. Precondition: obj - // is in the mark space (is not a large object). - atomic_store_explicit(object_remset_byte(obj), 1, memory_order_relaxed); +static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { + if (GC_GENERATIONAL) + return GC_WRITE_BARRIER_CARD; + return GC_WRITE_BARRIER_NONE; +} +static inline size_t gc_small_write_barrier_card_table_alignment(void) { + GC_ASSERT(GC_GENERATIONAL); + return SLAB_SIZE; +} +static inline size_t gc_small_write_barrier_card_size(void) { + GC_ASSERT(GC_GENERATIONAL); + return GRANULES_PER_REMSET_BYTE * GRANULE_SIZE; } -// init_field is an optimization for the case in which there is no -// intervening allocation or safepoint between allocating an object and -// setting the value of a field in the object. For the purposes of -// generational collection, we can omit the barrier in that case, -// because we know the source object is in the nursery. It is always -// correct to replace it with set_field. 
-static inline void init_field(void *obj, void **addr, void *val) { - *addr = val; +static inline size_t gc_allocator_alloc_table_alignment(void) { + return SLAB_SIZE; } -static inline void set_field(void *obj, void **addr, void *val) { - if (GC_GENERATIONAL) mark_space_write_barrier(obj); - *addr = val; +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { + return METADATA_BYTE_YOUNG; +} +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { + return METADATA_BYTE_END; +} + +static inline int gc_allocator_needs_clear(void) { + return 0; } #define FOR_EACH_GC_OPTION(M) \ From 8f2f4f7c69fba0aea78c409fb38a3fe404aa403b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 15 Aug 2022 18:16:32 +0200 Subject: [PATCH 130/403] API-ify gc_print_stats; add semi-inline.h --- bdw.h | 4 +--- gc-api.h | 1 + mt-gcbench.c | 3 +-- quads.c | 4 +--- semi-inline.h | 54 +++++++++++++++++++++++++++++++++++++++++ semi.h | 66 ++++++++++----------------------------------------- whippet.h | 5 +--- 7 files changed, 71 insertions(+), 66 deletions(-) create mode 100644 semi-inline.h diff --git a/bdw.h b/bdw.h index 21691ade8..07ba9d7c9 100644 --- a/bdw.h +++ b/bdw.h @@ -278,9 +278,7 @@ static void* gc_call_without_gc(struct mutator *mut, return GC_do_blocking(f, data); } -static inline void print_start_gc_stats(struct heap *heap) { -} -static inline void print_end_gc_stats(struct heap *heap) { +static void gc_print_stats(struct heap *heap) { printf("Completed %ld collections\n", (long)GC_get_gc_no()); printf("Heap size is %ld\n", (long)GC_get_heap_size()); } diff --git a/gc-api.h b/gc-api.h index df865025f..86bde3aa4 100644 --- a/gc-api.h +++ b/gc-api.h @@ -43,6 +43,7 @@ GC_API_ struct mutator* gc_init_for_thread(uintptr_t *stack_base, GC_API_ void gc_finish_for_thread(struct mutator *mut); GC_API_ void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; +GC_API_ void gc_print_stats(struct heap *heap); enum gc_allocator_kind { GC_ALLOCATOR_INLINE_BUMP_POINTER, diff --git a/mt-gcbench.c b/mt-gcbench.c index 3d7ae8933..00f1d7e1a 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -430,7 +430,6 @@ int main(int argc, char *argv[]) { printf("Garbage Collector Test\n"); printf(" Live storage will peak at %zd bytes.\n\n", heap_max_live); - print_start_gc_stats(heap); unsigned long start = current_time(); @@ -456,5 +455,5 @@ int main(int argc, char *argv[]) { } printf("Completed in %.3f msec\n", elapsed_millis(start)); - print_end_gc_stats(heap); + gc_print_stats(heap); } diff --git a/quads.c b/quads.c index 968c64163..0b26c5476 100644 --- a/quads.c +++ b/quads.c @@ -149,8 +149,6 @@ int main(int argc, char *argv[]) { PUSH_HANDLE(mut, quad); - print_start_gc_stats(heap); - printf("Making quad tree of depth %zu (%zu nodes). 
Total size %.3fGB.\n", depth, nquads, (nquads * sizeof(Quad)) / 1e9); unsigned long start = current_time(); @@ -176,7 +174,7 @@ int main(int argc, char *argv[]) { print_elapsed("allocation loop", garbage_start); print_elapsed("quads test", gc_start); - print_end_gc_stats(heap); + gc_print_stats(heap); POP_HANDLE(mut); return 0; diff --git a/semi-inline.h b/semi-inline.h new file mode 100644 index 000000000..9a8342fd4 --- /dev/null +++ b/semi-inline.h @@ -0,0 +1,54 @@ +#ifndef SEMI_INLINE_H +#define SEMI_INLINE_H + +#include "gc-api.h" + +static const uintptr_t GC_ALIGNMENT = 8; +static const size_t GC_LARGE_OBJECT_THRESHOLD = 8192; + +static inline enum gc_allocator_kind gc_allocator_kind(void) { + return GC_ALLOCATOR_INLINE_BUMP_POINTER; +} +static inline size_t gc_allocator_small_granule_size(void) { + return GC_ALIGNMENT; +} +static inline size_t gc_allocator_large_threshold(void) { + return GC_LARGE_OBJECT_THRESHOLD; +} + +static inline size_t gc_allocator_allocation_pointer_offset(void) { + return sizeof(uintptr_t) * 0; +} +static inline size_t gc_allocator_allocation_limit_offset(void) { + return sizeof(uintptr_t) * 1; +} + +static inline size_t gc_allocator_freelist_offset(size_t size) { + abort(); +} + +static inline int gc_allocator_needs_clear(void) { + return 1; +} + +static inline size_t gc_allocator_alloc_table_alignment(void) { + return 0; +} +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { + abort(); +} +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { + abort(); +} + +static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { + return GC_WRITE_BARRIER_NONE; +} +static inline size_t gc_small_write_barrier_card_table_alignment(void) { + abort(); +} +static inline size_t gc_small_write_barrier_card_size(void) { + abort(); +} + +#endif // SEMI_INLINE_H diff --git a/semi.h b/semi.h index cb50e445c..90a42d05d 100644 --- a/semi.h +++ b/semi.h @@ -5,6 +5,7 @@ #include #include +#include "semi-inline.h" #include "large-object-space.h" #include "precise-roots.h" @@ -29,48 +30,11 @@ struct mutator { struct handle *roots; }; -static const uintptr_t ALIGNMENT = 8; -static const size_t LARGE_OBJECT_THRESHOLD = 8192; static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static inline enum gc_allocator_kind gc_allocator_kind(void) { - return GC_ALLOCATOR_INLINE_BUMP_POINTER; -} -static inline size_t gc_allocator_small_granule_size(void) { - return ALIGNMENT; -} -static inline size_t gc_allocator_large_threshold(void) { - return LARGE_OBJECT_THRESHOLD; -} - -static inline size_t gc_allocator_allocation_pointer_offset(void) { - return offsetof(struct semi_space, hp); -} -static inline size_t gc_allocator_allocation_limit_offset(void) { - return offsetof(struct semi_space, limit); -} - -static inline size_t gc_allocator_freelist_offset(size_t size) { - abort(); -} - -static inline int gc_allocator_needs_clear(void) { - return 1; -} - -static inline size_t gc_allocator_alloc_table_alignment(void) { - return 0; -} -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { - abort(); -} -static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { - abort(); -} - static inline struct heap* mutator_heap(struct mutator *mut) { return &mut->heap; } @@ -135,14 +99,14 @@ static void* copy(struct semi_space *space, void *obj) { void *new_obj = (void*)space->hp; memcpy(new_obj, obj, size); *(uintptr_t*) obj = space->hp; - space->hp += align_up (size, ALIGNMENT); + space->hp += align_up 
(size, GC_ALIGNMENT); return new_obj; } static uintptr_t scan(struct heap *heap, uintptr_t grey) { size_t size; gc_trace_object((void*)grey, visit, heap, &size); - return grey + align_up(size, ALIGNMENT); + return grey + align_up(size, GC_ALIGNMENT); } static void* forward(struct semi_space *space, void *obj) { @@ -237,7 +201,7 @@ static void* gc_allocate_small(struct mutator *mut, size_t size) { struct semi_space *space = mutator_semi_space(mut); while (1) { uintptr_t addr = space->hp; - uintptr_t new_hp = align_up (addr + size, ALIGNMENT); + uintptr_t new_hp = align_up (addr + size, GC_ALIGNMENT); if (space->limit < new_hp) { collect_for_alloc(mut, size); continue; @@ -252,16 +216,6 @@ static inline void* gc_allocate_pointerless(struct mutator *mut, size_t size) { return gc_allocate(mut, size); } -static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { - return GC_WRITE_BARRIER_NONE; -} -static inline size_t gc_small_write_barrier_card_table_alignment(void) { - abort(); -} -static inline size_t gc_small_write_barrier_card_size(void) { - abort(); -} - static int initialize_semi_space(struct semi_space *space, size_t size) { // Allocate even numbers of pages. size_t page_size = getpagesize(); @@ -348,8 +302,15 @@ static int parse_options(int argc, struct gc_option argv[], return 1; } +#define GC_ASSERT_EQ(a, b) GC_ASSERT((a) == (b)) + static int gc_init(int argc, struct gc_option argv[], struct heap **heap, struct mutator **mut) { + GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), + offsetof(struct semi_space, hp)); + GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), + offsetof(struct semi_space, limit)); + struct options options = { 0, }; if (!parse_options(argc, argv, &options)) return 0; @@ -384,10 +345,7 @@ static void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), return f(data); } -static inline void print_start_gc_stats(struct heap *heap) { -} - -static inline void print_end_gc_stats(struct heap *heap) { +static void gc_print_stats(struct heap *heap) { struct semi_space *space = heap_semi_space(heap); printf("Completed %ld collections\n", space->count); printf("Heap size is %zd\n", space->size); diff --git a/whippet.h b/whippet.h index 03c76349f..a070c0cb1 100644 --- a/whippet.h +++ b/whippet.h @@ -2097,10 +2097,7 @@ static void* gc_call_without_gc(struct mutator *mut, return ret; } -static inline void print_start_gc_stats(struct heap *heap) { -} - -static inline void print_end_gc_stats(struct heap *heap) { +static void gc_print_stats(struct heap *heap) { printf("Completed %ld collections (%ld major)\n", heap->count, heap->count - heap->minor_count); printf("Heap size with overhead is %zd (%zu slabs)\n", From 33aa5230dab63bd4b0255c725a11e2cc8d95ccdd Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 15 Aug 2022 18:30:42 +0200 Subject: [PATCH 131/403] Add bdw-inline.h --- bdw-inline.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++ bdw.h | 57 +++++----------------------------------------------- gc-assert.h | 2 ++ semi.h | 2 -- 4 files changed, 59 insertions(+), 54 deletions(-) create mode 100644 bdw-inline.h diff --git a/bdw-inline.h b/bdw-inline.h new file mode 100644 index 000000000..511c86d5f --- /dev/null +++ b/bdw-inline.h @@ -0,0 +1,52 @@ +#ifndef BDW_INLINE_H +#define BDW_INLINE_H + +#include "gc-api.h" + +static inline enum gc_allocator_kind gc_allocator_kind(void) { + return GC_ALLOCATOR_INLINE_FREELIST; +} +static inline size_t gc_allocator_small_granule_size(void) { + return 2 * sizeof(void *); +} +static inline size_t 
gc_allocator_large_threshold(void) { + return 256; +} + +static inline size_t gc_allocator_allocation_pointer_offset(void) { + abort(); +} +static inline size_t gc_allocator_allocation_limit_offset(void) { + abort(); +} + +static inline size_t gc_allocator_freelist_offset(size_t size) { + GC_ASSERT(size); + return sizeof(void*) * ((size - 1) / gc_allocator_small_granule_size()); +} + +static inline size_t gc_allocator_alloc_table_alignment(void) { + return 0; +} +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { + abort(); +} +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { + abort(); +} + +static inline int gc_allocator_needs_clear(void) { + return 0; +} + +static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { + return GC_WRITE_BARRIER_NONE; +} +static inline size_t gc_small_write_barrier_card_table_alignment(void) { + abort(); +} +static inline size_t gc_small_write_barrier_card_size(void) { + abort(); +} + +#endif // BDW_INLINE_H diff --git a/bdw.h b/bdw.h index 07ba9d7c9..b01155d4e 100644 --- a/bdw.h +++ b/bdw.h @@ -1,6 +1,7 @@ #include #include +#include "bdw-inline.h" #include "conservative-roots.h" // When pthreads are used, let `libgc' know about it and redirect @@ -45,34 +46,6 @@ static inline size_t gc_inline_freelist_object_size(size_t idx) { return (idx + 1U) * GC_INLINE_GRANULE_BYTES; } -static inline enum gc_allocator_kind gc_allocator_kind(void) { - return GC_ALLOCATOR_INLINE_FREELIST; -} -static inline size_t gc_allocator_small_granule_size(void) { - return GC_INLINE_GRANULE_BYTES; -} -static inline size_t gc_allocator_large_threshold(void) { - return 256; -} - -static inline size_t gc_allocator_allocation_pointer_offset(void) { - abort(); -} -static inline size_t gc_allocator_allocation_limit_offset(void) { - abort(); -} - -static inline size_t gc_allocator_freelist_offset(size_t size) { - GC_ASSERT(size); - return sizeof(void*) * gc_inline_bytes_to_freelist_index(size); -} - -static inline void gc_allocator_inline_success(struct mutator *mut, - struct gc_ref obj, - uintptr_t aligned_size) {} -static inline void gc_allocator_inline_failure(struct mutator *mut, - uintptr_t aligned_size) {} - // The values of these must match the internal POINTERLESS and NORMAL // definitions in libgc, for which unfortunately there are no external // definitions. Alack. 
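/* Editor's worked example (assuming a 64-bit target, where
 * GC_INLINE_GRANULE_BYTES is 2 * sizeof(void*) == 16): a 40-byte request
 * maps to freelist index (40 - 1) / 16 == 2, i.e. an offset of
 * 2 * sizeof(void*) into the mutator's freelist array, and
 * gc_inline_freelist_object_size(2) == 48, so the request is served from
 * the 48-byte size class. */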
@@ -131,30 +104,6 @@ static inline void collect(struct mutator *mut) { GC_gcollect(); } -static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { - return GC_WRITE_BARRIER_NONE; -} -static inline size_t gc_small_write_barrier_card_table_alignment(void) { - abort(); -} -static inline size_t gc_small_write_barrier_card_size(void) { - abort(); -} - -static inline size_t gc_allocator_alloc_table_alignment(void) { - return 0; -} -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { - abort(); -} -static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { - abort(); -} - -static inline int gc_allocator_needs_clear(void) { - return 0; -} - static inline struct mutator *add_mutator(struct heap *heap) { struct mutator *ret = GC_malloc(sizeof(struct mutator)); ret->heap = heap; @@ -230,6 +179,10 @@ static int parse_options(int argc, struct gc_option argv[], static int gc_init(int argc, struct gc_option argv[], struct heap **heap, struct mutator **mutator) { + GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES); + GC_ASSERT_EQ(gc_allocator_large_threshold(), + GC_INLINE_FREELIST_COUNT * GC_INLINE_GRANULE_BYTES); + struct options options = { 0, }; if (!parse_options(argc, argv, &options)) return 0; diff --git a/gc-assert.h b/gc-assert.h index 472297a1e..dc39bee25 100644 --- a/gc-assert.h +++ b/gc-assert.h @@ -12,4 +12,6 @@ #define GC_ASSERT(x) do { } while (0) #endif +#define GC_ASSERT_EQ(a, b) GC_ASSERT((a) == (b)) + #endif // GC_ASSERT_H diff --git a/semi.h b/semi.h index 90a42d05d..224819f2c 100644 --- a/semi.h +++ b/semi.h @@ -302,8 +302,6 @@ static int parse_options(int argc, struct gc_option argv[], return 1; } -#define GC_ASSERT_EQ(a, b) GC_ASSERT((a) == (b)) - static int gc_init(int argc, struct gc_option argv[], struct heap **heap, struct mutator **mut) { GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), From 607585e7f03e9264b47c1e97086e3e9c9c11b2d5 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 16 Aug 2022 10:25:23 +0200 Subject: [PATCH 132/403] Add whippet-inline.h --- whippet-inline.h | 56 ++++++++++++++++++++++++++++++++ whippet.h | 84 ++++++++++-------------------------------------- 2 files changed, 73 insertions(+), 67 deletions(-) create mode 100644 whippet-inline.h diff --git a/whippet-inline.h b/whippet-inline.h new file mode 100644 index 000000000..b61724cfa --- /dev/null +++ b/whippet-inline.h @@ -0,0 +1,56 @@ +#ifndef WHIPPET_INLINE_H +#define WHIPPET_INLINE_H + +#include "gc-config.h" +#include "gc-api.h" + +static inline enum gc_allocator_kind gc_allocator_kind(void) { + return GC_ALLOCATOR_INLINE_BUMP_POINTER; +} +static inline size_t gc_allocator_small_granule_size(void) { + return 16; +} +static inline size_t gc_allocator_large_threshold(void) { + return 8192; +} + +static inline size_t gc_allocator_allocation_pointer_offset(void) { + return sizeof(uintptr_t) * 0; +} +static inline size_t gc_allocator_allocation_limit_offset(void) { + return sizeof(uintptr_t) * 1; +} + +static inline size_t gc_allocator_freelist_offset(size_t size) { + abort(); +} + +static inline size_t gc_allocator_alloc_table_alignment(void) { + return 4 * 1024 * 1024; +} +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { + return 1; +} +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { + return 16; +} + +static inline int gc_allocator_needs_clear(void) { + return 0; +} + +static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { + if (GC_GENERATIONAL) + return 
GC_WRITE_BARRIER_CARD; + return GC_WRITE_BARRIER_NONE; +} +static inline size_t gc_small_write_barrier_card_table_alignment(void) { + GC_ASSERT(GC_GENERATIONAL); + return 4 * 1024 * 1024; +} +static inline size_t gc_small_write_barrier_card_size(void) { + GC_ASSERT(GC_GENERATIONAL); + return 256; +} + +#endif // WHIPPET_INLINE_H diff --git a/whippet.h b/whippet.h index a070c0cb1..6ac499635 100644 --- a/whippet.h +++ b/whippet.h @@ -25,6 +25,7 @@ #include "serial-tracer.h" #endif #include "spin.h" +#include "whippet-inline.h" #define GRANULE_SIZE 16 #define GRANULE_SIZE_LOG_2 4 @@ -369,44 +370,6 @@ static inline struct heap* mutator_heap(struct mutator *mutator) { return mutator->heap; } -static inline enum gc_allocator_kind gc_allocator_kind(void) { - return GC_ALLOCATOR_INLINE_BUMP_POINTER; -} -static inline size_t gc_allocator_small_granule_size(void) { - return GRANULE_SIZE; -} -static inline size_t gc_allocator_large_threshold(void) { - return LARGE_OBJECT_THRESHOLD; -} - -static inline size_t gc_allocator_allocation_pointer_offset(void) { - return offsetof(struct mutator, alloc); -} -static inline size_t gc_allocator_allocation_limit_offset(void) { - return offsetof(struct mutator, sweep); -} - -static inline size_t gc_allocator_freelist_offset(size_t size) { - abort(); -} - -static inline void gc_allocator_inline_success(struct mutator *mut, - struct gc_ref obj, - uintptr_t aligned_size) { - uint8_t *metadata = object_metadata_byte(gc_ref_heap_object(obj)); - size_t granules = aligned_size >> GRANULE_SIZE_LOG_2; - if (granules == 1) { - metadata[0] = METADATA_BYTE_YOUNG | METADATA_BYTE_END; - } else { - metadata[0] = METADATA_BYTE_YOUNG; - if (granules > 2) - memset(metadata + 1, 0, granules - 2); - metadata[granules - 1] = METADATA_BYTE_END; - } -} -static inline void gc_allocator_inline_failure(struct mutator *mut, - uintptr_t aligned_size) {} - static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } @@ -1844,7 +1807,7 @@ static void* gc_allocate_small(struct mutator *mut, size_t size) { obj = (struct gcobj*)mut->alloc; mut->alloc += size; } - gc_allocator_inline_success(mut, gc_ref_from_heap_object(obj), size); + gc_update_alloc_table(mut, gc_ref_from_heap_object(obj), size); return obj; } @@ -1852,34 +1815,6 @@ static inline void* gc_allocate_pointerless(struct mutator *mut, size_t size) { return gc_allocate(mut, size); } -static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { - if (GC_GENERATIONAL) - return GC_WRITE_BARRIER_CARD; - return GC_WRITE_BARRIER_NONE; -} -static inline size_t gc_small_write_barrier_card_table_alignment(void) { - GC_ASSERT(GC_GENERATIONAL); - return SLAB_SIZE; -} -static inline size_t gc_small_write_barrier_card_size(void) { - GC_ASSERT(GC_GENERATIONAL); - return GRANULES_PER_REMSET_BYTE * GRANULE_SIZE; -} - -static inline size_t gc_allocator_alloc_table_alignment(void) { - return SLAB_SIZE; -} -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { - return METADATA_BYTE_YOUNG; -} -static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { - return METADATA_BYTE_END; -} - -static inline int gc_allocator_needs_clear(void) { - return 0; -} - #define FOR_EACH_GC_OPTION(M) \ M(GC_OPTION_FIXED_HEAP_SIZE, "fixed-heap-size") \ M(GC_OPTION_PARALLELISM, "parallelism") @@ -2022,6 +1957,21 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { static int gc_init(int argc, struct gc_option argv[], struct heap **heap, struct mutator **mut) { + 
GC_ASSERT_EQ(gc_allocator_small_granule_size(), GRANULE_SIZE); + GC_ASSERT_EQ(gc_allocator_large_threshold(), LARGE_OBJECT_THRESHOLD); + GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), + offsetof(struct mutator, alloc)); + GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), + offsetof(struct mutator, sweep)); + GC_ASSERT_EQ(gc_allocator_alloc_table_alignment(), SLAB_SIZE); + GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(), METADATA_BYTE_YOUNG); + GC_ASSERT_EQ(gc_allocator_alloc_table_end_pattern(), METADATA_BYTE_END); + if (GC_GENERATIONAL) { + GC_ASSERT_EQ(gc_small_write_barrier_card_table_alignment(), SLAB_SIZE); + GC_ASSERT_EQ(gc_small_write_barrier_card_size(), + BLOCK_SIZE / REMSET_BYTES_PER_BLOCK); + } + struct options options = { 0, }; if (!parse_options(argc, argv, &options)) return 0; From 9e8940e59f42de18235037cf5badac68841f608e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 16 Aug 2022 11:53:32 +0200 Subject: [PATCH 133/403] Get handles out of collectors --- bdw.h | 6 ++++++ conservative-roots.h | 14 ++++++++++++ gc-api.h | 6 ++++++ gc-embedder-api.h | 9 ++++++++ mt-gcbench.c | 41 ++++++++++++++++++----------------- precise-roots.h | 12 +++++++++-- quads.c | 28 +++++++++++++++--------- semi.h | 14 +++++++++--- simple-gc-embedder.h | 25 ++++++++++++++++++++++ whippet.h | 51 ++++++++++++++++++++++++-------------------- 10 files changed, 149 insertions(+), 57 deletions(-) diff --git a/bdw.h b/bdw.h index b01155d4e..7731034ab 100644 --- a/bdw.h +++ b/bdw.h @@ -231,6 +231,12 @@ static void* gc_call_without_gc(struct mutator *mut, return GC_do_blocking(f, data); } +static void gc_mutator_set_roots(struct mutator *mut, + struct gc_mutator_roots *roots) { +} +static void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { +} + static void gc_print_stats(struct heap *heap) { printf("Completed %ld collections\n", (long)GC_get_gc_no()); printf("Heap size is %ld\n", (long)GC_get_heap_size()); diff --git a/conservative-roots.h b/conservative-roots.h index f5b1a5708..23fc38de7 100644 --- a/conservative-roots.h +++ b/conservative-roots.h @@ -5,3 +5,17 @@ struct handle { void *unused; }; #define HANDLE_SET(h,val) do { h.v = val; } while (0) #define PUSH_HANDLE(cx, h) do { (void) &h; } while (0) #define POP_HANDLE(cx) do { } while (0) + +static inline void visit_thread_roots(void *thread_roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + abort(); +} + +static inline void visit_roots(struct handle *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + GC_ASSERT(!roots); +} diff --git a/gc-api.h b/gc-api.h index 86bde3aa4..da513f415 100644 --- a/gc-api.h +++ b/gc-api.h @@ -38,6 +38,12 @@ GC_API_ int gc_option_from_string(const char *str); GC_API_ int gc_init(int argc, struct gc_option argv[], struct heap **heap, struct mutator **mutator); +struct gc_mutator_roots; +struct gc_heap_roots; +GC_API_ void gc_mutator_set_roots(struct mutator *mut, + struct gc_mutator_roots *roots); +GC_API_ void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots); + GC_API_ struct mutator* gc_init_for_thread(uintptr_t *stack_base, struct heap *heap); GC_API_ void gc_finish_for_thread(struct mutator *mut); diff --git a/gc-embedder-api.h b/gc-embedder-api.h index f80ffe995..b74bd0486 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -13,6 +13,14 @@ GC_EMBEDDER_API inline void gc_trace_object(void *object, void *trace_data), void *trace_data, size_t *size) GC_ALWAYS_INLINE; 
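/* Editor's note: with the two hooks added below, the collectors no longer
 * walk struct handle lists themselves.  The embedder chooses how
 * gc_mutator_roots and gc_heap_roots are represented and visits each root
 * edge through the callback; simple-gc-embedder.h later in this patch
 * implements both in terms of the existing handle lists. */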
+GC_EMBEDDER_API inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data); +GC_EMBEDDER_API inline void gc_trace_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data); GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(void *object); GC_EMBEDDER_API inline void gc_object_forward_nonatomic(void *object, uintptr_t new_addr); @@ -25,4 +33,5 @@ GC_EMBEDDER_API inline void gc_atomic_forward_commit(struct gc_atomic_forward *, uintptr_t new_addr); GC_EMBEDDER_API inline uintptr_t gc_atomic_forward_address(struct gc_atomic_forward *); + #endif // GC_EMBEDDER_API_H diff --git a/mt-gcbench.c b/mt-gcbench.c index 00f1d7e1a..418851ede 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -185,6 +185,7 @@ static size_t power_law(size_t *counter) { struct thread { struct mutator *mut; + struct gc_mutator_roots roots; size_t counter; }; @@ -209,13 +210,13 @@ static void populate(struct thread *t, int depth, Node *node) { return; NodeHandle self = { node }; - PUSH_HANDLE(mut, self); + PUSH_HANDLE(t, self); allocate_garbage(t); NodeHandle l = { allocate_node(mut) }; - PUSH_HANDLE(mut, l); + PUSH_HANDLE(t, l); allocate_garbage(t); NodeHandle r = { allocate_node(mut) }; - PUSH_HANDLE(mut, r); + PUSH_HANDLE(t, r); set_field(HANDLE_REF(self), &HANDLE_REF(self)->left, HANDLE_REF(l)); set_field(HANDLE_REF(self), &HANDLE_REF(self)->right, HANDLE_REF(r)); @@ -225,9 +226,9 @@ static void populate(struct thread *t, int depth, Node *node) { populate(t, depth-1, HANDLE_REF(self)->left); populate(t, depth-1, HANDLE_REF(self)->right); - POP_HANDLE(mut); - POP_HANDLE(mut); - POP_HANDLE(mut); + POP_HANDLE(t); + POP_HANDLE(t); + POP_HANDLE(t); } // Build tree bottom-up @@ -237,9 +238,9 @@ static Node* make_tree(struct thread *t, int depth) { return allocate_node(mut); NodeHandle left = { make_tree(t, depth-1) }; - PUSH_HANDLE(mut, left); + PUSH_HANDLE(t, left); NodeHandle right = { make_tree(t, depth-1) }; - PUSH_HANDLE(mut, right); + PUSH_HANDLE(t, right); allocate_garbage(t); Node *result = allocate_node(mut); @@ -248,8 +249,8 @@ static Node* make_tree(struct thread *t, int depth) { // i is 0 because the memory is zeroed. 
result->j = depth; - POP_HANDLE(mut); - POP_HANDLE(mut); + POP_HANDLE(t); + POP_HANDLE(t); return result; } @@ -274,7 +275,7 @@ static void time_construction(struct thread *t, int depth) { struct mutator *mut = t->mut; int num_iters = compute_num_iters(depth); NodeHandle temp_tree = { NULL }; - PUSH_HANDLE(mut, temp_tree); + PUSH_HANDLE(t, temp_tree); printf("Creating %d trees of depth %d\n", num_iters, depth); @@ -301,7 +302,7 @@ static void time_construction(struct thread *t, int depth) { elapsed_millis(start)); } - POP_HANDLE(mut); + POP_HANDLE(t); } static void* call_with_stack_base(void* (*)(uintptr_t*, void*), void*) GC_NEVER_INLINE; @@ -337,11 +338,12 @@ static void* run_one_test(struct mutator *mut) { NodeHandle long_lived_tree = { NULL }; NodeHandle temp_tree = { NULL }; DoubleArrayHandle array = { NULL }; - struct thread t = { mut, 0 }; + struct thread t = { mut, }; + gc_mutator_set_roots(mut, &t.roots); - PUSH_HANDLE(mut, long_lived_tree); - PUSH_HANDLE(mut, temp_tree); - PUSH_HANDLE(mut, array); + PUSH_HANDLE(&t, long_lived_tree); + PUSH_HANDLE(&t, temp_tree); + PUSH_HANDLE(&t, array); // Create a long lived object printf(" Creating a long-lived binary tree of depth %d\n", @@ -368,9 +370,10 @@ static void* run_one_test(struct mutator *mut) { || HANDLE_REF(array)->values[1000] != 1.0/1000) fprintf(stderr, "Failed\n"); - POP_HANDLE(mut); - POP_HANDLE(mut); - POP_HANDLE(mut); + POP_HANDLE(&t); + POP_HANDLE(&t); + POP_HANDLE(&t); + gc_mutator_set_roots(mut, NULL); return NULL; } diff --git a/precise-roots.h b/precise-roots.h index 0465083b9..2eedb60ed 100644 --- a/precise-roots.h +++ b/precise-roots.h @@ -6,8 +6,8 @@ struct handle { #define HANDLE_TO(T) union { T* v; struct handle handle; } #define HANDLE_REF(h) h.v #define HANDLE_SET(h,val) do { h.v = val; } while (0) -#define PUSH_HANDLE(cx, h) push_handle(&cx->roots, &h.handle) -#define POP_HANDLE(cx) pop_handle(&cx->roots) +#define PUSH_HANDLE(cx, h) push_handle(&(cx)->roots.roots, &h.handle) +#define POP_HANDLE(cx) pop_handle(&(cx)->roots.roots) static inline void push_handle(struct handle **roots, struct handle *handle) { handle->next = *roots; @@ -17,3 +17,11 @@ static inline void push_handle(struct handle **roots, struct handle *handle) { static inline void pop_handle(struct handle **roots) { *roots = (*roots)->next; } + +static inline void visit_roots(struct handle *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + for (struct handle *h = roots; h; h = h->next) + trace_edge(gc_edge(&h->v), trace_data); +} diff --git a/quads.c b/quads.c index 0b26c5476..d7438a486 100644 --- a/quads.c +++ b/quads.c @@ -38,23 +38,29 @@ static unsigned long current_time(void) return t.tv_sec * 1000 * 1000 + t.tv_usec; } +struct thread { + struct mutator *mut; + struct gc_mutator_roots roots; + size_t counter; +}; + // Build tree bottom-up -static Quad* make_tree(struct mutator *mut, int depth) { +static Quad* make_tree(struct thread *t, int depth) { if (depth<=0) { - return allocate_quad(mut); + return allocate_quad(t->mut); } else { QuadHandle kids[4] = { { NULL }, }; for (size_t i = 0; i < 4; i++) { - HANDLE_SET(kids[i], make_tree(mut, depth-1)); - PUSH_HANDLE(mut, kids[i]); + HANDLE_SET(kids[i], make_tree(t, depth-1)); + PUSH_HANDLE(t, kids[i]); } - Quad *result = allocate_quad(mut); + Quad *result = allocate_quad(t->mut); for (size_t i = 0; i < 4; i++) result->kids[i] = HANDLE_REF(kids[i]); for (size_t i = 0; i < 4; i++) - POP_HANDLE(mut); + POP_HANDLE(t); return result; } @@ -144,15 +150,17 @@ 
int main(int argc, char *argv[]) { heap_size); return 1; } + struct thread t = { mut, }; + gc_mutator_set_roots(mut, &t.roots); QuadHandle quad = { NULL }; - PUSH_HANDLE(mut, quad); + PUSH_HANDLE(&t, quad); printf("Making quad tree of depth %zu (%zu nodes). Total size %.3fGB.\n", depth, nquads, (nquads * sizeof(Quad)) / 1e9); unsigned long start = current_time(); - HANDLE_SET(quad, make_tree(mut, depth)); + HANDLE_SET(quad, make_tree(&t, depth)); print_elapsed("construction", start); validate_tree(HANDLE_REF(quad), depth); @@ -165,7 +173,7 @@ int main(int argc, char *argv[]) { size_t garbage_depth = 3; start = current_time(); for (size_t i = garbage_step/(tree_size(garbage_depth)*4*sizeof(Quad*)); i; i--) - make_tree(mut, garbage_depth); + make_tree(&t, garbage_depth); print_elapsed("allocating garbage", start); start = current_time(); @@ -176,7 +184,7 @@ int main(int argc, char *argv[]) { gc_print_stats(heap); - POP_HANDLE(mut); + POP_HANDLE(&t); return 0; } diff --git a/semi.h b/semi.h index 224819f2c..def7a607f 100644 --- a/semi.h +++ b/semi.h @@ -27,7 +27,7 @@ struct heap { // One mutator per space, can just store the heap in the mutator. struct mutator { struct heap heap; - struct handle *roots; + struct gc_mutator_roots *roots; }; @@ -152,8 +152,8 @@ static void collect(struct mutator *mut) { large_object_space_start_gc(large, 0); flip(semi); uintptr_t grey = semi->hp; - for (struct handle *h = mut->roots; h; h = h->next) - visit(gc_edge(&h->v), heap); + if (mut->roots) + gc_trace_mutator_roots(mut->roots, visit, heap); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); while(grey < semi->hp) grey = scan(heap, grey); @@ -328,6 +328,14 @@ static int gc_init(int argc, struct gc_option argv[], return 1; } +static void gc_mutator_set_roots(struct mutator *mut, + struct gc_mutator_roots *roots) { + mut->roots = roots; +} +static void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { + abort(); +} + static struct mutator* gc_init_for_thread(uintptr_t *stack_base, struct heap *heap) { fprintf(stderr, diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h index a198a47ae..5a8adf717 100644 --- a/simple-gc-embedder.h +++ b/simple-gc-embedder.h @@ -23,6 +23,31 @@ static inline void gc_trace_object(void *object, } } +struct handle; +struct gc_heap_roots { struct handle *roots; }; +struct gc_mutator_roots { struct handle *roots; }; + +static inline void visit_roots(struct handle *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data); + +static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + if (roots) + visit_roots(roots->roots, trace_edge, trace_data); +} + +static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + if (roots) + visit_roots(roots->roots, trace_edge, trace_data); +} + static inline uintptr_t gc_object_forwarded_nonatomic(void *object) { uintptr_t tag = *tag_word(object); return (tag & gcobj_not_forwarded_bit) ? 
0 : tag; diff --git a/whippet.h b/whippet.h index 6ac499635..4851c874d 100644 --- a/whippet.h +++ b/whippet.h @@ -322,7 +322,7 @@ struct heap { int allow_pinning; size_t active_mutator_count; size_t mutator_count; - struct handle *global_roots; + struct gc_heap_roots *roots; struct mutator *mutator_trace_list; long count; long minor_count; @@ -348,7 +348,7 @@ struct mutator { uintptr_t sweep; uintptr_t block; struct heap *heap; - struct handle *roots; + struct gc_mutator_roots *roots; struct mutator_mark_buf mark_buf; // Three uses for this in-object linked-list pointer: // - inactive (blocked in syscall) mutators @@ -905,29 +905,39 @@ static int mutator_should_mark_while_stopping(struct mutator *mut) { return heap_should_mark_while_stopping(mutator_heap(mut)); } +static void gc_mutator_set_roots(struct mutator *mut, + struct gc_mutator_roots *roots) { + mut->roots = roots; +} +static void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { + heap->roots = roots; +} + +static void trace_and_enqueue_locally(struct gc_edge edge, void *data) { + struct mutator *mut = data; + if (trace_edge(mutator_heap(mut), edge)) + mutator_mark_buf_push(&mut->mark_buf, + gc_ref_heap_object(gc_edge_ref(edge))); +} + +static void trace_and_enqueue_globally(struct gc_edge edge, void *data) { + struct heap *heap = data; + if (trace_edge(heap, edge)) + tracer_enqueue_root(&heap->tracer, + gc_ref_heap_object(gc_edge_ref(edge))); +} + // Mark the roots of a mutator that is stopping for GC. We can't // enqueue them directly, so we send them to the controller in a buffer. static void mark_stopping_mutator_roots(struct mutator *mut) { GC_ASSERT(mutator_should_mark_while_stopping(mut)); - struct heap *heap = mutator_heap(mut); - struct mutator_mark_buf *local_roots = &mut->mark_buf; - for (struct handle *h = mut->roots; h; h = h->next) { - struct gc_edge root = gc_edge(&h->v); - if (trace_edge(heap, root)) - mutator_mark_buf_push(local_roots, - gc_ref_heap_object(gc_edge_ref(root))); - } + gc_trace_mutator_roots(mut->roots, trace_and_enqueue_locally, mut); } // Precondition: the caller holds the heap lock. 
static void mark_mutator_roots_with_lock(struct mutator *mut) { - struct heap *heap = mutator_heap(mut); - for (struct handle *h = mut->roots; h; h = h->next) { - struct gc_edge root = gc_edge(&h->v); - if (trace_edge(heap, root)) - tracer_enqueue_root(&heap->tracer, - gc_ref_heap_object(gc_edge_ref(root))); - } + gc_trace_mutator_roots(mut->roots, trace_and_enqueue_globally, + mutator_heap(mut)); } static void trace_mutator_roots_with_lock(struct mutator *mut) { @@ -976,12 +986,7 @@ static void trace_mutator_roots_after_stop(struct heap *heap) { } static void trace_global_roots(struct heap *heap) { - for (struct handle *h = heap->global_roots; h; h = h->next) { - struct gc_edge edge = gc_edge(&h->v); - if (trace_edge(heap, edge)) - tracer_enqueue_root(&heap->tracer, - gc_ref_heap_object(gc_edge_ref(edge))); - } + gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap); } static inline int From 8a111256c6bb2d7431de61f77e8030db840aaa72 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 16 Aug 2022 12:04:56 +0200 Subject: [PATCH 134/403] Compile with -fvisibility=hidden; will be good for separate compilation --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4dc47225d..3945498c2 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ TESTS=quads mt-gcbench # MT_GCBench MT_GCBench2 COLLECTORS=bdw semi whippet parallel-whippet generational-whippet parallel-generational-whippet CC=gcc -CFLAGS=-Wall -O2 -g -fno-strict-aliasing -Wno-unused -DNDEBUG +CFLAGS=-Wall -O2 -g -fno-strict-aliasing -fvisibility=hidden -Wno-unused -DNDEBUG INCLUDES=-I. LDFLAGS=-lpthread COMPILE=$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) From 112f27b77b4f0c05c086b63e9b9f5e4868c708bb Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 16 Aug 2022 15:57:27 +0200 Subject: [PATCH 135/403] Simplify GC attributes for the inline allocator Don't require pulling in all of gc-api.h. 
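For orientation: after this change each collector-specific attribute header includes only gc-attrs.h, so user code that wants the inline allocation fast path can be compiled against, say, whippet-attrs.h without seeing the rest of gc-api.h. A minimal sketch of what that buys follows; round_to_granules and is_small are hypothetical helpers, not code from this patch, while the gc_allocator_* queries are the ones declared in gc-attrs.h and defined per collector below.

    // Compiled together with whippet-attrs.h, these queries are
    // constant-foldable always-inline functions, so the helpers below
    // reduce to plain arithmetic with no calls into the collector library.
    #include "whippet-attrs.h"

    // Round a small-object request up to the collector's granule size
    // (GRANULE_SIZE, 16 bytes, for whippet). Assumes the granule size is a
    // power of two, as it is here.
    static inline size_t round_to_granules(size_t bytes) {
      size_t granule = gc_allocator_small_granule_size();
      return (bytes + granule - 1) & ~(granule - 1);
    }

    // Requests at or below the large-object threshold take the inline
    // small-allocation path; larger ones go through gc_allocate_large().
    static inline int is_small(size_t bytes) {
      return bytes <= gc_allocator_large_threshold();
    }
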
--- bdw-inline.h => bdw-attrs.h | 8 +++---- bdw.h | 2 +- gc-api.h | 33 +---------------------------- semi-inline.h => semi-attrs.h | 8 +++---- semi.h | 2 +- whippet-inline.h => whippet-attrs.h | 8 +++---- whippet.h | 2 +- 7 files changed, 16 insertions(+), 47 deletions(-) rename bdw-inline.h => bdw-attrs.h (93%) rename semi-inline.h => semi-attrs.h (93%) rename whippet-inline.h => whippet-attrs.h (93%) diff --git a/bdw-inline.h b/bdw-attrs.h similarity index 93% rename from bdw-inline.h rename to bdw-attrs.h index 511c86d5f..5743cd3a2 100644 --- a/bdw-inline.h +++ b/bdw-attrs.h @@ -1,7 +1,7 @@ -#ifndef BDW_INLINE_H -#define BDW_INLINE_H +#ifndef BDW_ATTRS_H +#define BDW_ATTRS_H -#include "gc-api.h" +#include "gc-attrs.h" static inline enum gc_allocator_kind gc_allocator_kind(void) { return GC_ALLOCATOR_INLINE_FREELIST; @@ -49,4 +49,4 @@ static inline size_t gc_small_write_barrier_card_size(void) { abort(); } -#endif // BDW_INLINE_H +#endif // BDW_ATTRS_H diff --git a/bdw.h b/bdw.h index 7731034ab..0af332090 100644 --- a/bdw.h +++ b/bdw.h @@ -1,7 +1,7 @@ #include #include -#include "bdw-inline.h" +#include "bdw-attrs.h" #include "conservative-roots.h" // When pthreads are used, let `libgc' know about it and redirect diff --git a/gc-api.h b/gc-api.h index da513f415..c7d332176 100644 --- a/gc-api.h +++ b/gc-api.h @@ -3,6 +3,7 @@ #include "gc-config.h" #include "gc-assert.h" +#include "gc-attrs.h" #include "gc-inline.h" #include "gc-ref.h" #include "gc-edge.h" @@ -51,29 +52,6 @@ GC_API_ void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; GC_API_ void gc_print_stats(struct heap *heap); -enum gc_allocator_kind { - GC_ALLOCATOR_INLINE_BUMP_POINTER, - GC_ALLOCATOR_INLINE_FREELIST, - GC_ALLOCATOR_INLINE_NONE -}; - -static inline enum gc_allocator_kind gc_allocator_kind(void) GC_ALWAYS_INLINE; - -static inline size_t gc_allocator_large_threshold(void) GC_ALWAYS_INLINE; - -static inline size_t gc_allocator_small_granule_size(void) GC_ALWAYS_INLINE; - -static inline size_t gc_allocator_allocation_pointer_offset(void) GC_ALWAYS_INLINE; -static inline size_t gc_allocator_allocation_limit_offset(void) GC_ALWAYS_INLINE; - -static inline size_t gc_allocator_freelist_offset(size_t size) GC_ALWAYS_INLINE; - -static inline size_t gc_allocator_alloc_table_alignment(void) GC_ALWAYS_INLINE; -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) GC_ALWAYS_INLINE; -static inline uint8_t gc_allocator_alloc_table_end_pattern(void) GC_ALWAYS_INLINE; - -static inline int gc_allocator_needs_clear(void) GC_ALWAYS_INLINE; - static inline void gc_clear_fresh_allocation(struct gc_ref obj, size_t size) GC_ALWAYS_INLINE; static inline void gc_clear_fresh_allocation(struct gc_ref obj, @@ -188,15 +166,6 @@ static inline void* gc_allocate(struct mutator *mut, size_t size) { // FIXME: remove :P static inline void* gc_allocate_pointerless(struct mutator *mut, size_t bytes); -enum gc_write_barrier_kind { - GC_WRITE_BARRIER_NONE, - GC_WRITE_BARRIER_CARD -}; - -static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void); -static inline size_t gc_small_write_barrier_card_table_alignment(void); -static inline size_t gc_small_write_barrier_card_size(void); - static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge, struct gc_ref new_val) GC_ALWAYS_INLINE; static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge, diff --git a/semi-inline.h b/semi-attrs.h similarity index 93% rename from semi-inline.h rename to 
semi-attrs.h index 9a8342fd4..55691f047 100644 --- a/semi-inline.h +++ b/semi-attrs.h @@ -1,7 +1,7 @@ -#ifndef SEMI_INLINE_H -#define SEMI_INLINE_H +#ifndef SEMI_ATTRS_H +#define SEMI_ATTRS_H -#include "gc-api.h" +#include "gc-attrs.h" static const uintptr_t GC_ALIGNMENT = 8; static const size_t GC_LARGE_OBJECT_THRESHOLD = 8192; @@ -51,4 +51,4 @@ static inline size_t gc_small_write_barrier_card_size(void) { abort(); } -#endif // SEMI_INLINE_H +#endif // SEMI_ATTRS_H diff --git a/semi.h b/semi.h index def7a607f..bc32eecb2 100644 --- a/semi.h +++ b/semi.h @@ -5,7 +5,7 @@ #include #include -#include "semi-inline.h" +#include "semi-attrs.h" #include "large-object-space.h" #include "precise-roots.h" diff --git a/whippet-inline.h b/whippet-attrs.h similarity index 93% rename from whippet-inline.h rename to whippet-attrs.h index b61724cfa..72915b1a1 100644 --- a/whippet-inline.h +++ b/whippet-attrs.h @@ -1,8 +1,8 @@ -#ifndef WHIPPET_INLINE_H -#define WHIPPET_INLINE_H +#ifndef WHIPPET_ATTRS_H +#define WHIPPET_ATTRS_H #include "gc-config.h" -#include "gc-api.h" +#include "gc-attrs.h" static inline enum gc_allocator_kind gc_allocator_kind(void) { return GC_ALLOCATOR_INLINE_BUMP_POINTER; @@ -53,4 +53,4 @@ static inline size_t gc_small_write_barrier_card_size(void) { return 256; } -#endif // WHIPPET_INLINE_H +#endif // WHIPPET_ATTRS_H diff --git a/whippet.h b/whippet.h index 4851c874d..654897b5c 100644 --- a/whippet.h +++ b/whippet.h @@ -25,7 +25,7 @@ #include "serial-tracer.h" #endif #include "spin.h" -#include "whippet-inline.h" +#include "whippet-attrs.h" #define GRANULE_SIZE 16 #define GRANULE_SIZE_LOG_2 4 From fe9bdf6397930bc4f6973f0d3f8d2dd3e1a82e1a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 16 Aug 2022 16:09:36 +0200 Subject: [PATCH 136/403] Separate out embedder API from mt-gcbench, quads --- heap-objects.h | 8 ------- mt-gcbench-embedder.h | 44 +++++++++++++++++++++++++++++++++++++ mt-gcbench-types.h | 20 +++++++++++++++++ mt-gcbench.c | 50 +------------------------------------------ quads-embedder.h | 28 ++++++++++++++++++++++++ quads-types.h | 6 ++++++ quads.c | 18 +++------------- 7 files changed, 102 insertions(+), 72 deletions(-) create mode 100644 mt-gcbench-embedder.h create mode 100644 quads-embedder.h diff --git a/heap-objects.h b/heap-objects.h index d76d5ee36..14ec2e3d8 100644 --- a/heap-objects.h +++ b/heap-objects.h @@ -16,12 +16,4 @@ enum alloc_kind { }; #undef DEFINE_ENUM -#define DEFINE_METHODS(name, Name, NAME) \ - static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \ - static inline void visit_##name##_fields(Name *obj,\ - void (*visit)(struct gc_edge edge, void *visit_data), \ - void *visit_data) GC_ALWAYS_INLINE; -FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) -#undef DEFINE_METHODS - #endif // HEAP_OBJECTS_H diff --git a/mt-gcbench-embedder.h b/mt-gcbench-embedder.h new file mode 100644 index 000000000..63076f474 --- /dev/null +++ b/mt-gcbench-embedder.h @@ -0,0 +1,44 @@ +#ifndef MT_GCBENCH_EMBEDDER_H +#define MT_GCBENCH_EMBEDDER_H + +#include "mt-gcbench-types.h" + +#define DEFINE_METHODS(name, Name, NAME) \ + static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \ + static inline void visit_##name##_fields(Name *obj,\ + void (*visit)(struct gc_edge edge, void *visit_data), \ + void *visit_data) GC_ALWAYS_INLINE; +FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) +#undef DEFINE_METHODS + +static inline size_t node_size(Node *obj) { + return sizeof(Node); +} +static inline size_t double_array_size(DoubleArray *array) { + return sizeof(*array) + 
array->length * sizeof(double); +} +static inline size_t hole_size(Hole *hole) { + return sizeof(*hole) + hole->length * sizeof(uintptr_t); +} +static inline void +visit_node_fields(Node *node, + void (*visit)(struct gc_edge edge, void *visit_data), + void *visit_data) { + visit(gc_edge(&node->left), visit_data); + visit(gc_edge(&node->right), visit_data); +} +static inline void +visit_double_array_fields(DoubleArray *obj, + void (*visit)(struct gc_edge edge, void *visit_data), + void *visit_data) { +} +static inline void +visit_hole_fields(Hole *obj, + void (*visit)(struct gc_edge edge, void *visit_data), + void *visit_data) { + abort(); +} + +#include "simple-gc-embedder.h" + +#endif // MT_GCBENCH_EMBEDDER_H diff --git a/mt-gcbench-types.h b/mt-gcbench-types.h index 04bf6d258..32471d025 100644 --- a/mt-gcbench-types.h +++ b/mt-gcbench-types.h @@ -7,5 +7,25 @@ M(hole, Hole, HOLE) #include "heap-objects.h" +#include "simple-tagging-scheme.h" + +struct Node { + struct gc_header header; + struct Node *left; + struct Node *right; + int i, j; +}; + +struct DoubleArray { + struct gc_header header; + size_t length; + double values[0]; +}; + +struct Hole { + struct gc_header header; + size_t length; + uintptr_t values[0]; +}; #endif // GCBENCH_TYPES_H diff --git a/mt-gcbench.c b/mt-gcbench.c index 418851ede..4d304c636 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -44,14 +44,13 @@ #include #include -// Tracer will be specialized with respect to tags defined in this header. #include "mt-gcbench-types.h" #include "assert.h" #include "simple-allocator.h" -#include "simple-gc-embedder.h" #include "gc-api.h" +#include "mt-gcbench-embedder.h" #include "gc.h" #include "gc-inline.h" @@ -63,53 +62,6 @@ static const int array_size = 500000; // about 4Mb static const int min_tree_depth = 4; static const int max_tree_depth = 16; -struct Node { - struct gc_header header; - struct Node *left; - struct Node *right; - int i, j; -}; - -struct DoubleArray { - struct gc_header header; - size_t length; - double values[0]; -}; - -struct Hole { - struct gc_header header; - size_t length; - uintptr_t values[0]; -}; - -static inline size_t node_size(Node *obj) { - return sizeof(Node); -} -static inline size_t double_array_size(DoubleArray *array) { - return sizeof(*array) + array->length * sizeof(double); -} -static inline size_t hole_size(Hole *hole) { - return sizeof(*hole) + hole->length * sizeof(uintptr_t); -} -static inline void -visit_node_fields(Node *node, - void (*visit)(struct gc_edge edge, void *visit_data), - void *visit_data) { - visit(gc_edge(&node->left), visit_data); - visit(gc_edge(&node->right), visit_data); -} -static inline void -visit_double_array_fields(DoubleArray *obj, - void (*visit)(struct gc_edge edge, void *visit_data), - void *visit_data) { -} -static inline void -visit_hole_fields(Hole *obj, - void (*visit)(struct gc_edge edge, void *visit_data), - void *visit_data) { - abort(); -} - typedef HANDLE_TO(Node) NodeHandle; typedef HANDLE_TO(DoubleArray) DoubleArrayHandle; diff --git a/quads-embedder.h b/quads-embedder.h new file mode 100644 index 000000000..18047607c --- /dev/null +++ b/quads-embedder.h @@ -0,0 +1,28 @@ +#ifndef QUADS_EMBEDDER_H +#define QUADS_EMBEDDER_H + +#include "quads-types.h" + +#define DEFINE_METHODS(name, Name, NAME) \ + static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \ + static inline void visit_##name##_fields(Name *obj,\ + void (*visit)(struct gc_edge edge, void *visit_data), \ + void *visit_data) GC_ALWAYS_INLINE; 
+FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) +#undef DEFINE_METHODS + +static inline size_t quad_size(Quad *obj) { + return sizeof(Quad); +} + +static inline void +visit_quad_fields(Quad *quad, + void (*visit)(struct gc_edge edge, void *visit_data), + void *visit_data) { + for (size_t i = 0; i < 4; i++) + visit(gc_edge(&quad->kids[i]), visit_data); +} + +#include "simple-gc-embedder.h" + +#endif // QUADS_EMBEDDER_H diff --git a/quads-types.h b/quads-types.h index 16a1c62d0..935591ef2 100644 --- a/quads-types.h +++ b/quads-types.h @@ -5,5 +5,11 @@ M(quad, Quad, QUAD) #include "heap-objects.h" +#include "simple-tagging-scheme.h" + +struct Quad { + struct gc_header header; + struct Quad *kids[4]; +}; #endif // QUADS_TYPES_H diff --git a/quads.c b/quads.c index d7438a486..7797bbd42 100644 --- a/quads.c +++ b/quads.c @@ -5,23 +5,11 @@ #include "assert.h" #include "quads-types.h" #include "simple-allocator.h" -#include "simple-gc-embedder.h" +#include "gc-api.h" + +#include "quads-embedder.h" #include "gc.h" -typedef struct Quad { - struct gc_header header; - struct Quad *kids[4]; -} Quad; -static inline size_t quad_size(Quad *obj) { - return sizeof(Quad); -} -static inline void -visit_quad_fields(Quad *quad, - void (*visit)(struct gc_edge edge, void *visit_data), - void *visit_data) { - for (size_t i = 0; i < 4; i++) - visit(gc_edge(&quad->kids[i]), visit_data); -} typedef HANDLE_TO(Quad) QuadHandle; static Quad* allocate_quad(struct mutator *mut) { From b082f5f50d64edf11918f3d3bcb27e1e06bc3586 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 16 Aug 2022 17:54:15 +0200 Subject: [PATCH 137/403] Separate compilation!!!!! --- Makefile | 47 ++++++++++++------ bdw-attrs.h | 13 ++--- bdw.h => bdw.c | 56 ++++++++++++--------- conservative-roots-api.h | 12 +++++ conservative-roots-embedder.h | 21 ++++++++ conservative-roots-types.h | 8 +++ conservative-roots.h | 21 -------- gc-api.h | 8 +-- gc-assert.h | 6 ++- gc-attrs.h | 39 +++++++++++++++ gc-config.h | 12 +++++ gc-embedder-api.h | 4 ++ gc-visibility.h | 7 +++ mt-gcbench-embedder.h | 2 +- mt-gcbench-types.h | 3 ++ mt-gcbench.c | 16 +++--- parallel-tracer.h | 4 +- precise-roots.h => precise-roots-api.h | 16 ++---- precise-roots-embedder.h | 31 ++++++++++++ precise-roots-types.h | 17 +++++++ quads-embedder.h | 2 + quads.c | 11 +++-- semi-attrs.h | 11 +++-- semi.h => semi.c | 60 +++++++++++++---------- serial-tracer.h | 4 +- simple-gc-embedder.h | 31 +++--------- whippet-attrs.h | 3 +- whippet.h => whippet.c | 68 +++++++++++++------------- 28 files changed, 344 insertions(+), 189 deletions(-) rename bdw.h => bdw.c (84%) create mode 100644 conservative-roots-api.h create mode 100644 conservative-roots-embedder.h create mode 100644 conservative-roots-types.h delete mode 100644 conservative-roots.h create mode 100644 gc-attrs.h create mode 100644 gc-visibility.h rename precise-roots.h => precise-roots-api.h (54%) create mode 100644 precise-roots-embedder.h create mode 100644 precise-roots-types.h rename semi.h => semi.c (89%) rename whippet.h => whippet.c (98%) diff --git a/Makefile b/Makefile index 3945498c2..9244c246c 100644 --- a/Makefile +++ b/Makefile @@ -2,32 +2,49 @@ TESTS=quads mt-gcbench # MT_GCBench MT_GCBench2 COLLECTORS=bdw semi whippet parallel-whippet generational-whippet parallel-generational-whippet CC=gcc -CFLAGS=-Wall -O2 -g -fno-strict-aliasing -fvisibility=hidden -Wno-unused -DNDEBUG +CFLAGS=-Wall -O2 -g -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused -DNDEBUG INCLUDES=-I. 
-LDFLAGS=-lpthread -COMPILE=$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) +LDFLAGS=-lpthread -flto +COMPILE=$(CC) $(CFLAGS) $(INCLUDES) ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) all: $(ALL_TESTS) -bdw-%: bdw.h conservative-roots.h %-types.h %.c - $(COMPILE) `pkg-config --libs --cflags bdw-gc` -DGC_BDW -o $@ $*.c +bdw-%-gc.o: semi.c %-embedder.h %.c + $(COMPILE) `pkg-config --cflags bdw-gc` -include $*-embedder.h -o $@ -c bdw.c +bdw-%.o: semi.c %.c + $(COMPILE) -include bdw-attrs.h -o $@ -c $*.c +bdw-%: bdw-%.o bdw-%-gc.o + $(CC) $(LDFLAGS) `pkg-config --libs bdw-gc` -o $@ $^ -semi-%: semi.h precise-roots.h large-object-space.h %-types.h heap-objects.h %.c - $(COMPILE) -DGC_SEMI -o $@ $*.c +semi-%-gc.o: semi.c %-embedder.h large-object-space.h assert.h debug.h %.c + $(COMPILE) -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c semi.c +semi-%.o: semi.c %.c + $(COMPILE) -DGC_PRECISE=1 -include semi-attrs.h -o $@ -c $*.c -whippet-%: whippet.h precise-roots.h large-object-space.h serial-tracer.h assert.h debug.h %-types.h heap-objects.h %.c - $(COMPILE) -DGC_WHIPPET -o $@ $*.c +whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c +whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c -parallel-whippet-%: whippet.h precise-roots.h large-object-space.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL_WHIPPET -o $@ $*.c +parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c +parallel-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c -generational-whippet-%: whippet.h precise-roots.h large-object-space.h serial-tracer.h assert.h debug.h %-types.h heap-objects.h %.c - $(COMPILE) -DGC_GENERATIONAL_WHIPPET -o $@ $*.c +generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c +generational-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c -parallel-generational-whippet-%: whippet.h precise-roots.h large-object-space.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL_GENERATIONAL_WHIPPET -o $@ $*.c +parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c +parallel-generational-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c + +%: %.o %-gc.o + $(CC) $(LDFLAGS) $($*_LDFLAGS) -o $@ $^ check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/bdw-attrs.h b/bdw-attrs.h index 5743cd3a2..960e543b0 100644 --- a/bdw-attrs.h +++ b/bdw-attrs.h @@ -2,6 +2,7 @@ #define BDW_ATTRS_H #include "gc-attrs.h" +#include "gc-assert.h" static inline enum gc_allocator_kind gc_allocator_kind(void) { return GC_ALLOCATOR_INLINE_FREELIST; @@ -14,10 +15,10 @@ static inline size_t gc_allocator_large_threshold(void) { } static inline size_t 
gc_allocator_allocation_pointer_offset(void) { - abort(); + GC_CRASH(); } static inline size_t gc_allocator_allocation_limit_offset(void) { - abort(); + GC_CRASH(); } static inline size_t gc_allocator_freelist_offset(size_t size) { @@ -29,10 +30,10 @@ static inline size_t gc_allocator_alloc_table_alignment(void) { return 0; } static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { - abort(); + GC_CRASH(); } static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { - abort(); + GC_CRASH(); } static inline int gc_allocator_needs_clear(void) { @@ -43,10 +44,10 @@ static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { return GC_WRITE_BARRIER_NONE; } static inline size_t gc_small_write_barrier_card_table_alignment(void) { - abort(); + GC_CRASH(); } static inline size_t gc_small_write_barrier_card_size(void) { - abort(); + GC_CRASH(); } #endif // BDW_ATTRS_H diff --git a/bdw.h b/bdw.c similarity index 84% rename from bdw.h rename to bdw.c index 0af332090..7f3ec186b 100644 --- a/bdw.h +++ b/bdw.c @@ -1,8 +1,18 @@ #include +#include +#include #include +#define GC_API_ +#include "gc-api.h" + #include "bdw-attrs.h" -#include "conservative-roots.h" + +#if GC_PRECISE +#error bdw-gc is a conservative collector +#else +#include "conservative-roots-embedder.h" +#endif // When pthreads are used, let `libgc' know about it and redirect // allocation calls such as `GC_MALLOC ()' to (contention-free, faster) @@ -61,10 +71,10 @@ static void* allocate_small_slow(void **freelist, size_t idx, size_t bytes = gc_inline_freelist_object_size(idx); GC_generic_malloc_many(bytes, kind, freelist); void *head = *freelist; - if (UNLIKELY (!head)) { + if (GC_UNLIKELY (!head)) { fprintf(stderr, "ran out of space, heap size %zu\n", GC_get_heap_size()); - abort(); + GC_CRASH(); } *freelist = *(void **)(head); return head; @@ -74,25 +84,25 @@ static inline void * allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) { void *head = *freelist; - if (UNLIKELY (!head)) + if (GC_UNLIKELY (!head)) return allocate_small_slow(freelist, idx, kind); *freelist = *(void **)(head); return head; } -static void* gc_allocate_large(struct mutator *mut, size_t size) { +void* gc_allocate_large(struct mutator *mut, size_t size) { return GC_malloc(size); } -static void* gc_allocate_small(struct mutator *mut, size_t size) { +void* gc_allocate_small(struct mutator *mut, size_t size) { GC_ASSERT(size != 0); GC_ASSERT(size <= gc_allocator_large_threshold()); size_t idx = gc_inline_bytes_to_freelist_index(size); return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL); } -static inline void* gc_allocate_pointerless(struct mutator *mut, +void* gc_allocate_pointerless(struct mutator *mut, size_t size) { // Because the BDW API requires us to implement a custom marker so // that the pointerless freelist gets traced, even though it's in a @@ -126,7 +136,7 @@ static void dump_available_gc_options(void) { fprintf(stderr, "\n"); } -static int gc_option_from_string(const char *str) { +int gc_option_from_string(const char *str) { #define PARSE_OPTION(option, name) if (strcmp(str, name) == 0) return option; FOR_EACH_GC_OPTION(PARSE_OPTION) #undef PARSE_OPTION @@ -145,8 +155,8 @@ struct options { }; static size_t parse_size_t(double value) { - ASSERT(value >= 0); - ASSERT(value <= (size_t) -1); + GC_ASSERT(value >= 0); + GC_ASSERT(value <= (size_t) -1); return value; } @@ -163,7 +173,7 @@ static int parse_options(int argc, struct gc_option argv[], options->parallelism = 
parse_size_t(argv[i].value); break; default: - abort(); + GC_CRASH(); } } @@ -177,8 +187,8 @@ static int parse_options(int argc, struct gc_option argv[], return 1; } -static int gc_init(int argc, struct gc_option argv[], - struct heap **heap, struct mutator **mutator) { +int gc_init(int argc, struct gc_option argv[], + struct heap **heap, struct mutator **mutator) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES); GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_INLINE_FREELIST_COUNT * GC_INLINE_GRANULE_BYTES); @@ -208,8 +218,8 @@ static int gc_init(int argc, struct gc_option argv[], return 1; } -static struct mutator* gc_init_for_thread(uintptr_t *stack_base, - struct heap *heap) { +struct mutator* gc_init_for_thread(uintptr_t *stack_base, + struct heap *heap) { pthread_mutex_lock(&heap->lock); if (!heap->multithreaded) { GC_allow_register_threads(); @@ -221,23 +231,23 @@ static struct mutator* gc_init_for_thread(uintptr_t *stack_base, GC_register_my_thread(&base); return add_mutator(heap); } -static void gc_finish_for_thread(struct mutator *mut) { +void gc_finish_for_thread(struct mutator *mut) { GC_unregister_my_thread(); } -static void* gc_call_without_gc(struct mutator *mut, - void* (*f)(void*), - void *data) { +void* gc_call_without_gc(struct mutator *mut, + void* (*f)(void*), + void *data) { return GC_do_blocking(f, data); } -static void gc_mutator_set_roots(struct mutator *mut, - struct gc_mutator_roots *roots) { +void gc_mutator_set_roots(struct mutator *mut, + struct gc_mutator_roots *roots) { } -static void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { +void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { } -static void gc_print_stats(struct heap *heap) { +void gc_print_stats(struct heap *heap) { printf("Completed %ld collections\n", (long)GC_get_gc_no()); printf("Heap size is %ld\n", (long)GC_get_heap_size()); } diff --git a/conservative-roots-api.h b/conservative-roots-api.h new file mode 100644 index 000000000..1619cf640 --- /dev/null +++ b/conservative-roots-api.h @@ -0,0 +1,12 @@ +#ifndef CONSERVATIVE_ROOTS_API_H +#define CONSERVATIVE_ROOTS_API_H + +#include "conservative-roots-types.h" + +#define HANDLE_TO(T) union { T* v; struct handle handle; } +#define HANDLE_REF(h) h.v +#define HANDLE_SET(h,val) do { h.v = val; } while (0) +#define PUSH_HANDLE(cx, h) do { (void) &h; } while (0) +#define POP_HANDLE(cx) do { } while (0) + +#endif // CONSERVATIVE_ROOTS_API_H diff --git a/conservative-roots-embedder.h b/conservative-roots-embedder.h new file mode 100644 index 000000000..ae7120010 --- /dev/null +++ b/conservative-roots-embedder.h @@ -0,0 +1,21 @@ +#ifndef CONSERVATIVE_ROOTS_EMBEDDER_H +#define CONSERVATIVE_ROOTS_EMBEDDER_H + +#include "gc-assert.h" +#include "conservative-roots-types.h" + +static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + GC_CRASH(); +} + +static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + GC_CRASH(); +} + +#endif // CONSERVATIVE_ROOTS_EMBEDDER_H diff --git a/conservative-roots-types.h b/conservative-roots-types.h new file mode 100644 index 000000000..4744d746e --- /dev/null +++ b/conservative-roots-types.h @@ -0,0 +1,8 @@ +#ifndef CONSERVATIVE_ROOTS_TYPES_H +#define CONSERVATIVE_ROOTS_TYPES_H + +struct handle { void *unused; }; +struct gc_heap_roots { void *unused; }; +struct 
gc_mutator_roots { void *unused; }; + +#endif // CONSERVATIVE_ROOTS_TYPES_H diff --git a/conservative-roots.h b/conservative-roots.h deleted file mode 100644 index 23fc38de7..000000000 --- a/conservative-roots.h +++ /dev/null @@ -1,21 +0,0 @@ -struct handle { void *unused; }; - -#define HANDLE_TO(T) union { T* v; struct handle handle; } -#define HANDLE_REF(h) h.v -#define HANDLE_SET(h,val) do { h.v = val; } while (0) -#define PUSH_HANDLE(cx, h) do { (void) &h; } while (0) -#define POP_HANDLE(cx) do { } while (0) - -static inline void visit_thread_roots(void *thread_roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { - abort(); -} - -static inline void visit_roots(struct handle *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { - GC_ASSERT(!roots); -} diff --git a/gc-api.h b/gc-api.h index c7d332176..c00881f4f 100644 --- a/gc-api.h +++ b/gc-api.h @@ -32,7 +32,7 @@ struct gc_mutator { // FIXME: Conflict with bdw-gc GC_API. Switch prefix? #ifndef GC_API_ -#define GC_API_ static +#define GC_API_ __attribute__((visibility("hidden"))) #endif GC_API_ int gc_option_from_string(const char *str); @@ -159,12 +159,12 @@ static inline void* gc_allocate(struct mutator *mut, size_t size) { case GC_ALLOCATOR_INLINE_NONE: return gc_allocate_small(mut, size); default: - abort(); + GC_CRASH(); } } // FIXME: remove :P -static inline void* gc_allocate_pointerless(struct mutator *mut, size_t bytes); +GC_API_ void* gc_allocate_pointerless(struct mutator *mut, size_t bytes); static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge, struct gc_ref new_val) GC_ALWAYS_INLINE; @@ -183,7 +183,7 @@ static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge return; } default: - abort(); + GC_CRASH(); } } diff --git a/gc-assert.h b/gc-assert.h index dc39bee25..c3fa6b749 100644 --- a/gc-assert.h +++ b/gc-assert.h @@ -6,10 +6,14 @@ #define GC_UNLIKELY(e) __builtin_expect(e, 0) #define GC_LIKELY(e) __builtin_expect(e, 1) +#define GC_CRASH() __builtin_trap() + #if GC_DEBUG -#define GC_ASSERT(x) do { if (GC_UNLIKELY(!(x))) __builtin_trap(); } while (0) +#define GC_ASSERT(x) do { if (GC_UNLIKELY(!(x))) GC_CRASH(); } while (0) +#define GC_UNREACHABLE() GC_CRASH() #else #define GC_ASSERT(x) do { } while (0) +#define GC_UNREACHABLE() __builtin_unreachable() #endif #define GC_ASSERT_EQ(a, b) GC_ASSERT((a) == (b)) diff --git a/gc-attrs.h b/gc-attrs.h new file mode 100644 index 000000000..17ff2add5 --- /dev/null +++ b/gc-attrs.h @@ -0,0 +1,39 @@ +#ifndef GC_ATTRS_H +#define GC_ATTRS_H + +#include "gc-inline.h" + +#include +#include + +enum gc_allocator_kind { + GC_ALLOCATOR_INLINE_BUMP_POINTER, + GC_ALLOCATOR_INLINE_FREELIST, + GC_ALLOCATOR_INLINE_NONE +}; + +static inline enum gc_allocator_kind gc_allocator_kind(void) GC_ALWAYS_INLINE; +static inline size_t gc_allocator_large_threshold(void) GC_ALWAYS_INLINE; +static inline size_t gc_allocator_small_granule_size(void) GC_ALWAYS_INLINE; + +static inline size_t gc_allocator_allocation_pointer_offset(void) GC_ALWAYS_INLINE; +static inline size_t gc_allocator_allocation_limit_offset(void) GC_ALWAYS_INLINE; + +static inline size_t gc_allocator_freelist_offset(size_t size) GC_ALWAYS_INLINE; + +static inline size_t gc_allocator_alloc_table_alignment(void) GC_ALWAYS_INLINE; +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) GC_ALWAYS_INLINE; +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) GC_ALWAYS_INLINE; + +static inline int 
gc_allocator_needs_clear(void) GC_ALWAYS_INLINE; + +enum gc_write_barrier_kind { + GC_WRITE_BARRIER_NONE, + GC_WRITE_BARRIER_CARD +}; + +static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) GC_ALWAYS_INLINE; +static inline size_t gc_small_write_barrier_card_table_alignment(void) GC_ALWAYS_INLINE; +static inline size_t gc_small_write_barrier_card_size(void) GC_ALWAYS_INLINE; + +#endif // GC_ATTRS_H diff --git a/gc-config.h b/gc-config.h index cd78e23d5..5fc27b7e5 100644 --- a/gc-config.h +++ b/gc-config.h @@ -5,4 +5,16 @@ #define GC_DEBUG 0 #endif +#ifndef GC_PARALLEL +#define GC_PARALLEL 0 +#endif + +#ifndef GC_GENERATIONAL +#define GC_GENERATIONAL 0 +#endif + +#ifndef GC_PRECISE +#define GC_PRECISE 0 +#endif + #endif // GC_CONFIG_H diff --git a/gc-embedder-api.h b/gc-embedder-api.h index b74bd0486..26929b72e 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -8,6 +8,10 @@ #define GC_EMBEDDER_API static #endif +struct gc_mutator_roots; +struct gc_heap_roots; +struct gc_atomic_forward; + GC_EMBEDDER_API inline void gc_trace_object(void *object, void (*trace_edge)(struct gc_edge edge, void *trace_data), diff --git a/gc-visibility.h b/gc-visibility.h new file mode 100644 index 000000000..7360915a0 --- /dev/null +++ b/gc-visibility.h @@ -0,0 +1,7 @@ +#ifndef GC_VISIBILITY_H_ +#define GC_VISIBILITY_H_ + +#define GC_INTERNAL __attribute__((visibility("hidden"))) +#define GC_PUBLIC __attribute__((visibility("default"))) + +#endif // GC_VISIBILITY_H diff --git a/mt-gcbench-embedder.h b/mt-gcbench-embedder.h index 63076f474..3aec4808b 100644 --- a/mt-gcbench-embedder.h +++ b/mt-gcbench-embedder.h @@ -36,7 +36,7 @@ static inline void visit_hole_fields(Hole *obj, void (*visit)(struct gc_edge edge, void *visit_data), void *visit_data) { - abort(); + GC_CRASH(); } #include "simple-gc-embedder.h" diff --git a/mt-gcbench-types.h b/mt-gcbench-types.h index 32471d025..60bddc489 100644 --- a/mt-gcbench-types.h +++ b/mt-gcbench-types.h @@ -1,6 +1,9 @@ #ifndef GCBENCH_TYPES_H #define GCBENCH_TYPES_H +#include +#include + #define FOR_EACH_HEAP_OBJECT_KIND(M) \ M(node, Node, NODE) \ M(double_array, DoubleArray, DOUBLE_ARRAY) \ diff --git a/mt-gcbench.c b/mt-gcbench.c index 4d304c636..2d2012ae1 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -44,16 +44,16 @@ #include #include -#include "mt-gcbench-types.h" - #include "assert.h" -#include "simple-allocator.h" #include "gc-api.h" - -#include "mt-gcbench-embedder.h" -#include "gc.h" - -#include "gc-inline.h" +#include "mt-gcbench-types.h" +#if GC_PRECISE +#include "precise-roots-api.h" +#else +#include "conservative-roots-api.h" +#endif +#include "mt-gcbench-types.h" +#include "simple-allocator.h" #define MAX_THREAD_COUNT 256 diff --git a/parallel-tracer.h b/parallel-tracer.h index 4ee90de70..07561e215 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -149,7 +149,7 @@ static int trace_deque_grow(struct trace_deque *q, int cur, size_t b, size_t t) { if (!trace_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) { fprintf(stderr, "failed to grow deque!!\n"); - abort(); + GC_CRASH(); } cur++; @@ -359,7 +359,7 @@ trace_worker_thread(void *data) { pthread_mutex_unlock(&worker->lock); return NULL; default: - abort(); + GC_CRASH(); } } } diff --git a/precise-roots.h b/precise-roots-api.h similarity index 54% rename from precise-roots.h rename to precise-roots-api.h index 2eedb60ed..ced560d15 100644 --- a/precise-roots.h +++ b/precise-roots-api.h @@ -1,7 +1,7 @@ -struct handle { - void *v; - struct handle *next; -}; +#ifndef 
PRECISE_ROOTS_API_H +#define PRECISE_ROOTS_API_H + +#include "precise-roots-types.h" #define HANDLE_TO(T) union { T* v; struct handle handle; } #define HANDLE_REF(h) h.v @@ -18,10 +18,4 @@ static inline void pop_handle(struct handle **roots) { *roots = (*roots)->next; } -static inline void visit_roots(struct handle *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { - for (struct handle *h = roots; h; h = h->next) - trace_edge(gc_edge(&h->v), trace_data); -} +#endif // PRECISE_ROOTS_API_H diff --git a/precise-roots-embedder.h b/precise-roots-embedder.h new file mode 100644 index 000000000..f37b38e1a --- /dev/null +++ b/precise-roots-embedder.h @@ -0,0 +1,31 @@ +#ifndef PRECISE_ROOTS_EMBEDDER_H +#define PRECISE_ROOTS_EMBEDDER_H + +#include "gc-edge.h" +#include "precise-roots-types.h" + +static inline void visit_roots(struct handle *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + for (struct handle *h = roots; h; h = h->next) + trace_edge(gc_edge(&h->v), trace_data); +} + +static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + if (roots) + visit_roots(roots->roots, trace_edge, trace_data); +} + +static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { + if (roots) + visit_roots(roots->roots, trace_edge, trace_data); +} + +#endif // PRECISE_ROOTS_EMBEDDER_H diff --git a/precise-roots-types.h b/precise-roots-types.h new file mode 100644 index 000000000..d2dc96491 --- /dev/null +++ b/precise-roots-types.h @@ -0,0 +1,17 @@ +#ifndef PRECISE_ROOTS_TYPES_H +#define PRECISE_ROOTS_TYPES_H + +struct handle { + void *v; + struct handle *next; +}; + +struct gc_heap_roots { + struct handle *roots; +}; + +struct gc_mutator_roots { + struct handle *roots; +}; + +#endif // PRECISE_ROOTS_TYPES_H diff --git a/quads-embedder.h b/quads-embedder.h index 18047607c..714415dd0 100644 --- a/quads-embedder.h +++ b/quads-embedder.h @@ -1,6 +1,8 @@ #ifndef QUADS_EMBEDDER_H #define QUADS_EMBEDDER_H +#include + #include "quads-types.h" #define DEFINE_METHODS(name, Name, NAME) \ diff --git a/quads.c b/quads.c index 7797bbd42..7620ff693 100644 --- a/quads.c +++ b/quads.c @@ -1,14 +1,17 @@ #include +#include #include #include #include "assert.h" +#include "gc-api.h" +#if GC_PRECISE +#include "precise-roots-api.h" +#else +#include "conservative-roots-api.h" +#endif #include "quads-types.h" #include "simple-allocator.h" -#include "gc-api.h" - -#include "quads-embedder.h" -#include "gc.h" typedef HANDLE_TO(Quad) QuadHandle; diff --git a/semi-attrs.h b/semi-attrs.h index 55691f047..e6b429178 100644 --- a/semi-attrs.h +++ b/semi-attrs.h @@ -2,6 +2,7 @@ #define SEMI_ATTRS_H #include "gc-attrs.h" +#include "gc-assert.h" static const uintptr_t GC_ALIGNMENT = 8; static const size_t GC_LARGE_OBJECT_THRESHOLD = 8192; @@ -24,7 +25,7 @@ static inline size_t gc_allocator_allocation_limit_offset(void) { } static inline size_t gc_allocator_freelist_offset(size_t size) { - abort(); + GC_CRASH(); } static inline int gc_allocator_needs_clear(void) { @@ -35,20 +36,20 @@ static inline size_t gc_allocator_alloc_table_alignment(void) { return 0; } static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { - abort(); + GC_CRASH(); } static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { - abort(); + GC_CRASH(); } static inline enum 
gc_write_barrier_kind gc_small_write_barrier_kind(void) { return GC_WRITE_BARRIER_NONE; } static inline size_t gc_small_write_barrier_card_table_alignment(void) { - abort(); + GC_CRASH(); } static inline size_t gc_small_write_barrier_card_size(void) { - abort(); + GC_CRASH(); } #endif // SEMI_ATTRS_H diff --git a/semi.h b/semi.c similarity index 89% rename from semi.h rename to semi.c index bc32eecb2..64a9ee652 100644 --- a/semi.h +++ b/semi.c @@ -5,9 +5,17 @@ #include #include +#define GC_API_ +#include "gc-api.h" + #include "semi-attrs.h" #include "large-object-space.h" -#include "precise-roots.h" + +#if GC_PRECISE +#include "precise-roots-embedder.h" +#else +#error semi is a precise collector +#endif struct semi_space { uintptr_t hp; @@ -141,7 +149,7 @@ static void visit(struct gc_edge edge, void *visit_data) { else if (large_object_space_contains(heap_large_object_space(heap), obj)) visit_large_object_space(heap, heap_large_object_space(heap), obj); else - abort(); + GC_CRASH(); } static void collect(struct mutator *mut) { @@ -167,11 +175,11 @@ static void collect_for_alloc(struct mutator *mut, size_t bytes) { struct semi_space *space = mutator_semi_space(mut); if (space->limit - space->hp < bytes) { fprintf(stderr, "ran out of space, heap size %zu\n", space->size); - abort(); + GC_CRASH(); } } -static void* gc_allocate_large(struct mutator *mut, size_t size) { +void* gc_allocate_large(struct mutator *mut, size_t size) { struct heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); struct semi_space *semi_space = heap_semi_space(heap); @@ -181,7 +189,7 @@ static void* gc_allocate_large(struct mutator *mut, size_t size) { collect(mut); if (!semi_space_steal_pages(semi_space, npages)) { fprintf(stderr, "ran out of space, heap size %zu\n", semi_space->size); - abort(); + GC_CRASH(); } } @@ -191,13 +199,13 @@ static void* gc_allocate_large(struct mutator *mut, size_t size) { if (!ret) { perror("weird: we have the space but mmap didn't work"); - abort(); + GC_CRASH(); } return ret; } -static void* gc_allocate_small(struct mutator *mut, size_t size) { +void* gc_allocate_small(struct mutator *mut, size_t size) { struct semi_space *space = mutator_semi_space(mut); while (1) { uintptr_t addr = space->hp; @@ -212,7 +220,7 @@ static void* gc_allocate_small(struct mutator *mut, size_t size) { return (void *)addr; } } -static inline void* gc_allocate_pointerless(struct mutator *mut, size_t size) { +void* gc_allocate_pointerless(struct mutator *mut, size_t size) { return gc_allocate(mut, size); } @@ -250,7 +258,7 @@ static void dump_available_gc_options(void) { fprintf(stderr, "\n"); } -static int gc_option_from_string(const char *str) { +int gc_option_from_string(const char *str) { #define PARSE_OPTION(option, name) if (strcmp(str, name) == 0) return option; FOR_EACH_GC_OPTION(PARSE_OPTION) #undef PARSE_OPTION @@ -269,8 +277,8 @@ struct options { }; static size_t parse_size_t(double value) { - ASSERT(value >= 0); - ASSERT(value <= (size_t) -1); + GC_ASSERT(value >= 0); + GC_ASSERT(value <= (size_t) -1); return value; } @@ -286,7 +294,7 @@ static int parse_options(int argc, struct gc_option argv[], options->parallelism = parse_size_t(argv[i].value); break; default: - abort(); + GC_CRASH(); } } @@ -302,8 +310,8 @@ static int parse_options(int argc, struct gc_option argv[], return 1; } -static int gc_init(int argc, struct gc_option argv[], - struct heap **heap, struct mutator **mut) { +int gc_init(int argc, struct gc_option argv[], + struct heap **heap, 
struct mutator **mut) { GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), offsetof(struct semi_space, hp)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), @@ -314,7 +322,7 @@ static int gc_init(int argc, struct gc_option argv[], return 0; *mut = calloc(1, sizeof(struct mutator)); - if (!*mut) abort(); + if (!*mut) GC_CRASH(); *heap = mutator_heap(*mut); struct semi_space *space = mutator_semi_space(*mut); @@ -328,30 +336,30 @@ static int gc_init(int argc, struct gc_option argv[], return 1; } -static void gc_mutator_set_roots(struct mutator *mut, - struct gc_mutator_roots *roots) { +void gc_mutator_set_roots(struct mutator *mut, + struct gc_mutator_roots *roots) { mut->roots = roots; } -static void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { - abort(); +void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { + GC_CRASH(); } -static struct mutator* gc_init_for_thread(uintptr_t *stack_base, - struct heap *heap) { +struct mutator* gc_init_for_thread(uintptr_t *stack_base, + struct heap *heap) { fprintf(stderr, "Semispace copying collector not appropriate for multithreaded use.\n"); - exit(1); + GC_CRASH(); } -static void gc_finish_for_thread(struct mutator *space) { +void gc_finish_for_thread(struct mutator *space) { } -static void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), - void *data) { +void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), + void *data) { // Can't be threads, then there won't be collection. return f(data); } -static void gc_print_stats(struct heap *heap) { +void gc_print_stats(struct heap *heap) { struct semi_space *space = heap_semi_space(heap); printf("Completed %ld collections\n", space->count); printf("Heap size is %zd\n", space->size); diff --git a/serial-tracer.h b/serial-tracer.h index 68c4d489f..27ff36882 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -85,7 +85,7 @@ static inline void trace_queue_push(struct trace_queue *q, struct gcobj *p) { if (UNLIKELY(q->write - q->read == q->size)) { if (!trace_queue_grow(q)) - abort(); + GC_CRASH(); } trace_queue_put(q, q->write++, p); } @@ -94,7 +94,7 @@ static inline void trace_queue_push_many(struct trace_queue *q, struct gcobj **pv, size_t count) { while (q->size - (q->write - q->read) < count) { if (!trace_queue_grow(q)) - abort(); + GC_CRASH(); } for (size_t i = 0; i < count; i++) trace_queue_put(q, q->write++, pv[i]); diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h index 5a8adf717..42e04485e 100644 --- a/simple-gc-embedder.h +++ b/simple-gc-embedder.h @@ -19,34 +19,15 @@ static inline void gc_trace_object(void *object, FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) #undef SCAN_OBJECT default: - abort (); + GC_CRASH(); } } -struct handle; -struct gc_heap_roots { struct handle *roots; }; -struct gc_mutator_roots { struct handle *roots; }; - -static inline void visit_roots(struct handle *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data); - -static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { - if (roots) - visit_roots(roots->roots, trace_edge, trace_data); -} - -static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { - if (roots) - visit_roots(roots->roots, trace_edge, trace_data); -} +#if GC_PRECISE +#include "precise-roots-embedder.h" +#else +#include "conservative-roots-embedder.h" +#endif 
static inline uintptr_t gc_object_forwarded_nonatomic(void *object) { uintptr_t tag = *tag_word(object); diff --git a/whippet-attrs.h b/whippet-attrs.h index 72915b1a1..bfecc44db 100644 --- a/whippet-attrs.h +++ b/whippet-attrs.h @@ -2,6 +2,7 @@ #define WHIPPET_ATTRS_H #include "gc-config.h" +#include "gc-assert.h" #include "gc-attrs.h" static inline enum gc_allocator_kind gc_allocator_kind(void) { @@ -22,7 +23,7 @@ static inline size_t gc_allocator_allocation_limit_offset(void) { } static inline size_t gc_allocator_freelist_offset(size_t size) { - abort(); + GC_CRASH(); } static inline size_t gc_allocator_alloc_table_alignment(void) { diff --git a/whippet.h b/whippet.c similarity index 98% rename from whippet.h rename to whippet.c index 654897b5c..2fdafa1c7 100644 --- a/whippet.h +++ b/whippet.c @@ -1,11 +1,3 @@ -#ifndef GC_PARALLEL_TRACE -#error define GC_PARALLEL_TRACE to 1 or 0 -#endif - -#ifndef GC_GENERATIONAL -#error define GC_GENERATIONAL to 1 or 0 -#endif - #include #include #include @@ -15,11 +7,13 @@ #include #include +#define GC_API_ +#include "gc-api.h" + #include "debug.h" #include "gc-inline.h" #include "large-object-space.h" -#include "precise-roots.h" -#if GC_PARALLEL_TRACE +#if GC_PARALLEL #include "parallel-tracer.h" #else #include "serial-tracer.h" @@ -27,6 +21,12 @@ #include "spin.h" #include "whippet-attrs.h" +#if GC_PRECISE +#include "precise-roots-embedder.h" +#else +#error whippet only currently implements precise collection +#endif + #define GRANULE_SIZE 16 #define GRANULE_SIZE_LOG_2 4 #define MEDIUM_OBJECT_THRESHOLD 256 @@ -560,7 +560,7 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, case GC_FORWARDING_STATE_NOT_FORWARDED: case GC_FORWARDING_STATE_ABORTED: // Impossible. - abort(); + GC_CRASH(); case GC_FORWARDING_STATE_ACQUIRED: { // We claimed the object successfully; evacuating is up to us. size_t object_granules = mark_space_live_object_granules(metadata); @@ -641,7 +641,7 @@ static inline int trace_edge(struct heap *heap, struct gc_edge edge) { return large_object_space_mark_object(heap_large_object_space(heap), obj); else - abort(); + GC_CRASH(); } static inline void trace_one(struct gcobj *obj, void *mark_data) { @@ -836,7 +836,7 @@ static void mutator_mark_buf_grow(struct mutator_mark_buf *buf) { MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { perror("allocating mutator mark buffer failed"); - abort(); + GC_CRASH(); } if (old_bytes) { memcpy(mem, buf->objects, old_bytes); @@ -905,11 +905,11 @@ static int mutator_should_mark_while_stopping(struct mutator *mut) { return heap_should_mark_while_stopping(mutator_heap(mut)); } -static void gc_mutator_set_roots(struct mutator *mut, - struct gc_mutator_roots *roots) { +void gc_mutator_set_roots(struct mutator *mut, + struct gc_mutator_roots *roots) { mut->roots = roots; } -static void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { +void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { heap->roots = roots; } @@ -1213,7 +1213,7 @@ static void detect_out_of_memory(struct heap *heap) { // be able to yield more space: out of memory. 
fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", heap->size, mark_space->nslabs); - abort(); + GC_CRASH(); } static double clamp_major_gc_yield_threshold(struct heap *heap, @@ -1762,7 +1762,7 @@ static void trigger_collection(struct mutator *mut) { heap_unlock(heap); } -static void* gc_allocate_large(struct mutator *mut, size_t size) { +void* gc_allocate_large(struct mutator *mut, size_t size) { struct heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); @@ -1781,13 +1781,13 @@ static void* gc_allocate_large(struct mutator *mut, size_t size) { if (!ret) { perror("weird: we have the space but mmap didn't work"); - abort(); + GC_CRASH(); } return ret; } -static void* gc_allocate_small(struct mutator *mut, size_t size) { +void* gc_allocate_small(struct mutator *mut, size_t size) { GC_ASSERT(size > 0); // allocating 0 bytes would be silly GC_ASSERT(size <= gc_allocator_large_threshold()); size = align_up(size, GRANULE_SIZE); @@ -1816,7 +1816,7 @@ static void* gc_allocate_small(struct mutator *mut, size_t size) { return obj; } -static inline void* gc_allocate_pointerless(struct mutator *mut, size_t size) { +void* gc_allocate_pointerless(struct mutator *mut, size_t size) { return gc_allocate(mut, size); } @@ -1832,7 +1832,7 @@ static void dump_available_gc_options(void) { fprintf(stderr, "\n"); } -static int gc_option_from_string(const char *str) { +int gc_option_from_string(const char *str) { #define PARSE_OPTION(option, name) if (strcmp(str, name) == 0) return option; FOR_EACH_GC_OPTION(PARSE_OPTION) #undef PARSE_OPTION @@ -1869,7 +1869,7 @@ static int parse_options(int argc, struct gc_option argv[], options->parallelism = parse_size_t(argv[i].value); break; default: - abort(); + GC_CRASH(); } } @@ -1916,7 +1916,7 @@ static int heap_init(struct heap *heap, struct options *options) { heap->size = options->fixed_heap_size; if (!tracer_init(heap, options->parallelism)) - abort(); + GC_CRASH(); heap->fragmentation_low_threshold = 0.05; heap->fragmentation_high_threshold = 0.10; @@ -1960,7 +1960,7 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { return 1; } -static int gc_init(int argc, struct gc_option argv[], +int gc_init(int argc, struct gc_option argv[], struct heap **heap, struct mutator **mut) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GRANULE_SIZE); GC_ASSERT_EQ(gc_allocator_large_threshold(), LARGE_OBJECT_THRESHOLD); @@ -1982,10 +1982,10 @@ static int gc_init(int argc, struct gc_option argv[], return 0; *heap = calloc(1, sizeof(struct heap)); - if (!*heap) abort(); + if (!*heap) GC_CRASH(); if (!heap_init(*heap, &options)) - abort(); + GC_CRASH(); struct mark_space *space = heap_mark_space(*heap); if (!mark_space_init(space, *heap)) { @@ -1995,24 +1995,24 @@ static int gc_init(int argc, struct gc_option argv[], } if (!large_object_space_init(heap_large_object_space(*heap), *heap)) - abort(); + GC_CRASH(); *mut = calloc(1, sizeof(struct mutator)); - if (!*mut) abort(); + if (!*mut) GC_CRASH(); add_mutator(*heap, *mut); return 1; } -static struct mutator* gc_init_for_thread(uintptr_t *stack_base, +struct mutator* gc_init_for_thread(uintptr_t *stack_base, struct heap *heap) { struct mutator *ret = calloc(1, sizeof(struct mutator)); if (!ret) - abort(); + GC_CRASH(); add_mutator(heap, ret); return ret; } -static void gc_finish_for_thread(struct mutator *mut) { +void gc_finish_for_thread(struct mutator *mut) { remove_mutator(mutator_heap(mut), mut); mutator_mark_buf_destroy(&mut->mark_buf); free(mut); 
@@ -2042,7 +2042,7 @@ static void reactivate_mutator(struct heap *heap, struct mutator *mut) { heap_unlock(heap); } -static void* gc_call_without_gc(struct mutator *mut, +void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), void *data) { struct heap *heap = mutator_heap(mut); @@ -2052,7 +2052,7 @@ static void* gc_call_without_gc(struct mutator *mut, return ret; } -static void gc_print_stats(struct heap *heap) { +void gc_print_stats(struct heap *heap) { printf("Completed %ld collections (%ld major)\n", heap->count, heap->count - heap->minor_count); printf("Heap size with overhead is %zd (%zu slabs)\n", From 92b8f1e917b966241a6e5ce39ca6d76d1950a824 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 16 Aug 2022 21:35:16 +0200 Subject: [PATCH 138/403] Add gc_ prefix to struct heap, struct mutator --- bdw.c | 38 ++++---- gc-api.h | 47 +++++----- large-object-space.h | 4 +- mt-gcbench.c | 32 +++---- parallel-tracer.h | 20 ++-- quads.c | 8 +- semi.c | 58 ++++++------ serial-tracer.h | 16 ++-- simple-allocator.h | 4 +- whippet.c | 214 +++++++++++++++++++++---------------------- 10 files changed, 218 insertions(+), 223 deletions(-) diff --git a/bdw.c b/bdw.c index 7f3ec186b..9325726f5 100644 --- a/bdw.c +++ b/bdw.c @@ -39,14 +39,14 @@ up to 256 bytes. */ #define GC_INLINE_FREELIST_COUNT (256U / GC_INLINE_GRANULE_BYTES) -struct heap { +struct gc_heap { pthread_mutex_t lock; int multithreaded; }; -struct mutator { +struct gc_mutator { void *freelists[GC_INLINE_FREELIST_COUNT]; - struct heap *heap; + struct gc_heap *heap; }; static inline size_t gc_inline_bytes_to_freelist_index(size_t bytes) { @@ -91,18 +91,18 @@ allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) { return head; } -void* gc_allocate_large(struct mutator *mut, size_t size) { +void* gc_allocate_large(struct gc_mutator *mut, size_t size) { return GC_malloc(size); } -void* gc_allocate_small(struct mutator *mut, size_t size) { +void* gc_allocate_small(struct gc_mutator *mut, size_t size) { GC_ASSERT(size != 0); GC_ASSERT(size <= gc_allocator_large_threshold()); size_t idx = gc_inline_bytes_to_freelist_index(size); return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL); } -void* gc_allocate_pointerless(struct mutator *mut, +void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { // Because the BDW API requires us to implement a custom marker so // that the pointerless freelist gets traced, even though it's in a @@ -110,17 +110,17 @@ void* gc_allocate_pointerless(struct mutator *mut, return GC_malloc_atomic(size); } -static inline void collect(struct mutator *mut) { +static inline void collect(struct gc_mutator *mut) { GC_gcollect(); } -static inline struct mutator *add_mutator(struct heap *heap) { - struct mutator *ret = GC_malloc(sizeof(struct mutator)); +static inline struct gc_mutator *add_mutator(struct gc_heap *heap) { + struct gc_mutator *ret = GC_malloc(sizeof(struct gc_mutator)); ret->heap = heap; return ret; } -static inline struct heap *mutator_heap(struct mutator *mutator) { +static inline struct gc_heap *mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } @@ -188,7 +188,7 @@ static int parse_options(int argc, struct gc_option argv[], } int gc_init(int argc, struct gc_option argv[], - struct heap **heap, struct mutator **mutator) { + struct gc_heap **heap, struct gc_mutator **mutator) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES); GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_INLINE_FREELIST_COUNT * 
GC_INLINE_GRANULE_BYTES); @@ -212,14 +212,14 @@ int gc_init(int argc, struct gc_option argv[], if (options.fixed_heap_size > current_heap_size) GC_expand_hp(options.fixed_heap_size - current_heap_size); GC_allow_register_threads(); - *heap = GC_malloc(sizeof(struct heap)); + *heap = GC_malloc(sizeof(struct gc_heap)); pthread_mutex_init(&(*heap)->lock, NULL); *mutator = add_mutator(*heap); return 1; } -struct mutator* gc_init_for_thread(uintptr_t *stack_base, - struct heap *heap) { +struct gc_mutator* gc_init_for_thread(uintptr_t *stack_base, + struct gc_heap *heap) { pthread_mutex_lock(&heap->lock); if (!heap->multithreaded) { GC_allow_register_threads(); @@ -231,23 +231,23 @@ struct mutator* gc_init_for_thread(uintptr_t *stack_base, GC_register_my_thread(&base); return add_mutator(heap); } -void gc_finish_for_thread(struct mutator *mut) { +void gc_finish_for_thread(struct gc_mutator *mut) { GC_unregister_my_thread(); } -void* gc_call_without_gc(struct mutator *mut, +void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) { return GC_do_blocking(f, data); } -void gc_mutator_set_roots(struct mutator *mut, +void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots) { } -void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { +void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { } -void gc_print_stats(struct heap *heap) { +void gc_print_stats(struct gc_heap *heap) { printf("Completed %ld collections\n", (long)GC_get_gc_no()); printf("Heap size is %ld\n", (long)GC_get_heap_size()); } diff --git a/gc-api.h b/gc-api.h index c00881f4f..2c4a636a1 100644 --- a/gc-api.h +++ b/gc-api.h @@ -12,9 +12,8 @@ #include #include -// FIXME: prefix with gc_ -struct heap; -struct mutator; +struct gc_heap; +struct gc_mutator; enum { GC_OPTION_FIXED_HEAP_SIZE, @@ -26,10 +25,6 @@ struct gc_option { double value; }; -struct gc_mutator { - void *user_data; -}; - // FIXME: Conflict with bdw-gc GC_API. Switch prefix? 
#ifndef GC_API_ #define GC_API_ __attribute__((visibility("hidden"))) @@ -37,20 +32,20 @@ struct gc_mutator { GC_API_ int gc_option_from_string(const char *str); GC_API_ int gc_init(int argc, struct gc_option argv[], - struct heap **heap, struct mutator **mutator); + struct gc_heap **heap, struct gc_mutator **mutator); struct gc_mutator_roots; struct gc_heap_roots; -GC_API_ void gc_mutator_set_roots(struct mutator *mut, +GC_API_ void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots); -GC_API_ void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots); +GC_API_ void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots); -GC_API_ struct mutator* gc_init_for_thread(uintptr_t *stack_base, - struct heap *heap); -GC_API_ void gc_finish_for_thread(struct mutator *mut); -GC_API_ void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), +GC_API_ struct gc_mutator* gc_init_for_thread(uintptr_t *stack_base, + struct gc_heap *heap); +GC_API_ void gc_finish_for_thread(struct gc_mutator *mut); +GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; -GC_API_ void gc_print_stats(struct heap *heap); +GC_API_ void gc_print_stats(struct gc_heap *heap); static inline void gc_clear_fresh_allocation(struct gc_ref obj, size_t size) GC_ALWAYS_INLINE; @@ -60,10 +55,10 @@ static inline void gc_clear_fresh_allocation(struct gc_ref obj, memset(gc_ref_heap_object(obj), 0, size); } -static inline void gc_update_alloc_table(struct mutator *mut, +static inline void gc_update_alloc_table(struct gc_mutator *mut, struct gc_ref obj, size_t size) GC_ALWAYS_INLINE; -static inline void gc_update_alloc_table(struct mutator *mut, +static inline void gc_update_alloc_table(struct gc_mutator *mut, struct gc_ref obj, size_t size) { size_t alignment = gc_allocator_alloc_table_alignment(); @@ -92,12 +87,12 @@ static inline void gc_update_alloc_table(struct mutator *mut, } } -GC_API_ void* gc_allocate_small(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; -GC_API_ void* gc_allocate_large(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; +GC_API_ void* gc_allocate_small(struct gc_mutator *mut, size_t bytes) GC_NEVER_INLINE; +GC_API_ void* gc_allocate_large(struct gc_mutator *mut, size_t bytes) GC_NEVER_INLINE; static inline void* -gc_allocate_bump_pointer(struct mutator *mut, size_t size) GC_ALWAYS_INLINE; -static inline void* gc_allocate_bump_pointer(struct mutator *mut, size_t size) { +gc_allocate_bump_pointer(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; +static inline void* gc_allocate_bump_pointer(struct gc_mutator *mut, size_t size) { GC_ASSERT(size <= gc_allocator_large_threshold()); size_t granule_size = gc_allocator_small_granule_size(); @@ -124,9 +119,9 @@ static inline void* gc_allocate_bump_pointer(struct mutator *mut, size_t size) { return (void*)hp; } -static inline void* gc_allocate_freelist(struct mutator *mut, +static inline void* gc_allocate_freelist(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; -static inline void* gc_allocate_freelist(struct mutator *mut, size_t size) { +static inline void* gc_allocate_freelist(struct gc_mutator *mut, size_t size) { GC_ASSERT(size <= gc_allocator_large_threshold()); size_t freelist_offset = gc_allocator_freelist_offset(size); @@ -145,8 +140,8 @@ static inline void* gc_allocate_freelist(struct mutator *mut, size_t size) { return head; } -static inline void* gc_allocate(struct mutator *mut, size_t bytes) GC_ALWAYS_INLINE; -static inline void* 
gc_allocate(struct mutator *mut, size_t size) { +static inline void* gc_allocate(struct gc_mutator *mut, size_t bytes) GC_ALWAYS_INLINE; +static inline void* gc_allocate(struct gc_mutator *mut, size_t size) { GC_ASSERT(size != 0); if (size > gc_allocator_large_threshold()) return gc_allocate_large(mut, size); @@ -164,7 +159,7 @@ static inline void* gc_allocate(struct mutator *mut, size_t size) { } // FIXME: remove :P -GC_API_ void* gc_allocate_pointerless(struct mutator *mut, size_t bytes); +GC_API_ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t bytes); static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge, struct gc_ref new_val) GC_ALWAYS_INLINE; diff --git a/large-object-space.h b/large-object-space.h index 68f1cf1cf..dd223f94e 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -18,7 +18,7 @@ // objects are in which space. That way we slot into the abstraction of a // copying collector while not actually copying data. -struct heap; +struct gc_heap; struct gcobj; struct large_object_space { @@ -39,7 +39,7 @@ struct large_object_space { }; static int large_object_space_init(struct large_object_space *space, - struct heap *heap) { + struct gc_heap *heap) { pthread_mutex_init(&space->lock, NULL); space->page_size = getpagesize(); space->page_size_log2 = __builtin_ctz(space->page_size); diff --git a/mt-gcbench.c b/mt-gcbench.c index 2d2012ae1..9c6e6b5d4 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -65,12 +65,12 @@ static const int max_tree_depth = 16; typedef HANDLE_TO(Node) NodeHandle; typedef HANDLE_TO(DoubleArray) DoubleArrayHandle; -static Node* allocate_node(struct mutator *mut) { +static Node* allocate_node(struct gc_mutator *mut) { // memset to 0 by the collector. return gc_allocate_with_kind(mut, ALLOC_KIND_NODE, sizeof (Node)); } -static DoubleArray* allocate_double_array(struct mutator *mut, +static DoubleArray* allocate_double_array(struct gc_mutator *mut, size_t size) { // May be uninitialized. size_t bytes = sizeof(DoubleArray) + sizeof (double) * size; @@ -80,7 +80,7 @@ static DoubleArray* allocate_double_array(struct mutator *mut, return ret; } -static Hole* allocate_hole(struct mutator *mut, size_t size) { +static Hole* allocate_hole(struct gc_mutator *mut, size_t size) { size_t bytes = sizeof(Hole) + sizeof (uintptr_t) * size; Hole *ret = gc_allocate_with_kind(mut, ALLOC_KIND_HOLE, bytes); ret->length = size; @@ -136,7 +136,7 @@ static size_t power_law(size_t *counter) { } struct thread { - struct mutator *mut; + struct gc_mutator *mut; struct gc_mutator_roots roots; size_t counter; }; @@ -157,7 +157,7 @@ static void set_field(Node *obj, Node **field, Node *val) { // Build tree top down, assigning to older objects. 
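
[Editorial aside, not part of the patch: a hedged sketch of using the gc_call_without_gc entry point declared in gc-api.h above to make a blocking system call without stalling collection in other threads. The read_with_gc_parked wrapper and struct blocking_read are hypothetical names introduced only for this illustration.]

#include <stdint.h>
#include <unistd.h>
#include "gc-api.h"

struct blocking_read { int fd; void *buf; size_t len; };

static void* do_blocking_read(void *data) {
  struct blocking_read *req = data;
  return (void*)(intptr_t)read(req->fd, req->buf, req->len);
}

/* Perform a possibly-blocking read while this mutator is parked, so a
   collection can proceed without waiting for the syscall to return. */
static ssize_t read_with_gc_parked(struct gc_mutator *mut,
                                   int fd, void *buf, size_t len) {
  struct blocking_read req = { fd, buf, len };
  return (ssize_t)(intptr_t)gc_call_without_gc(mut, do_blocking_read, &req);
}
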
static void populate(struct thread *t, int depth, Node *node) { - struct mutator *mut = t->mut; + struct gc_mutator *mut = t->mut; if (depth <= 0) return; @@ -185,7 +185,7 @@ static void populate(struct thread *t, int depth, Node *node) { // Build tree bottom-up static Node* make_tree(struct thread *t, int depth) { - struct mutator *mut = t->mut; + struct gc_mutator *mut = t->mut; if (depth <= 0) return allocate_node(mut); @@ -224,7 +224,7 @@ static void validate_tree(Node *tree, int depth) { } static void time_construction(struct thread *t, int depth) { - struct mutator *mut = t->mut; + struct gc_mutator *mut = t->mut; int num_iters = compute_num_iters(depth); NodeHandle temp_tree = { NULL }; PUSH_HANDLE(t, temp_tree); @@ -270,23 +270,23 @@ static void* call_with_stack_base(void* (*f)(uintptr_t *stack_base, void *arg), } struct call_with_gc_data { - void* (*f)(struct mutator *); - struct heap *heap; + void* (*f)(struct gc_mutator *); + struct gc_heap *heap; }; static void* call_with_gc_inner(uintptr_t *stack_base, void *arg) { struct call_with_gc_data *data = arg; - struct mutator *mut = gc_init_for_thread(stack_base, data->heap); + struct gc_mutator *mut = gc_init_for_thread(stack_base, data->heap); void *ret = data->f(mut); gc_finish_for_thread(mut); return ret; } -static void* call_with_gc(void* (*f)(struct mutator *), - struct heap *heap) { +static void* call_with_gc(void* (*f)(struct gc_mutator *), + struct gc_heap *heap) { struct call_with_gc_data data = { f, heap }; return call_with_stack_base(call_with_gc_inner, &data); } -static void* run_one_test(struct mutator *mut) { +static void* run_one_test(struct gc_mutator *mut) { NodeHandle long_lived_tree = { NULL }; NodeHandle temp_tree = { NULL }; DoubleArrayHandle array = { NULL }; @@ -330,7 +330,7 @@ static void* run_one_test(struct mutator *mut) { } static void* run_one_test_in_thread(void *arg) { - struct heap *heap = arg; + struct gc_heap *heap = arg; return call_with_gc(run_one_test, heap); } @@ -375,8 +375,8 @@ int main(int argc, char *argv[]) { size_t heap_size = heap_max_live * multiplier * nthreads; struct gc_option options[] = { { GC_OPTION_FIXED_HEAP_SIZE, heap_size }, { GC_OPTION_PARALLELISM, parallelism } }; - struct heap *heap; - struct mutator *mut; + struct gc_heap *heap; + struct gc_mutator *mut; if (!gc_init(sizeof options / sizeof options[0], options, &heap, &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); diff --git a/parallel-tracer.h b/parallel-tracer.h index 07561e215..56dfa39b6 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -291,9 +291,9 @@ enum trace_worker_state { TRACE_WORKER_DEAD }; -struct heap; +struct gc_heap; struct trace_worker { - struct heap *heap; + struct gc_heap *heap; size_t id; size_t steal_id; pthread_t thread; @@ -318,15 +318,15 @@ struct tracer { struct local_tracer { struct trace_worker *worker; struct trace_deque *share_deque; - struct heap *heap; + struct gc_heap *heap; struct local_trace_queue local; }; struct context; -static inline struct tracer* heap_tracer(struct heap *heap); +static inline struct tracer* heap_tracer(struct gc_heap *heap); static int -trace_worker_init(struct trace_worker *worker, struct heap *heap, +trace_worker_init(struct trace_worker *worker, struct gc_heap *heap, struct tracer *tracer, size_t id) { worker->heap = heap; worker->id = id; @@ -414,7 +414,7 @@ trace_worker_request_stop(struct trace_worker *worker) { } static int -tracer_init(struct heap *heap, size_t parallelism) { +tracer_init(struct gc_heap 
*heap, size_t parallelism) { struct tracer *tracer = heap_tracer(heap); atomic_init(&tracer->active_tracers, 0); atomic_init(&tracer->running_tracers, 0); @@ -436,12 +436,12 @@ tracer_init(struct heap *heap, size_t parallelism) { return tracer->worker_count > 0; } -static void tracer_prepare(struct heap *heap) { +static void tracer_prepare(struct gc_heap *heap) { struct tracer *tracer = heap_tracer(heap); for (size_t i = 0; i < tracer->worker_count; i++) tracer->workers[i].steal_id = 0; } -static void tracer_release(struct heap *heap) { +static void tracer_release(struct gc_heap *heap) { struct tracer *tracer = heap_tracer(heap); for (size_t i = 0; i < tracer->worker_count; i++) trace_deque_release(&tracer->workers[i].deque); @@ -450,7 +450,7 @@ static void tracer_release(struct heap *heap) { struct gcobj; static inline void tracer_visit(struct gc_edge edge, void *trace_data) GC_ALWAYS_INLINE; static inline void trace_one(struct gcobj *obj, void *trace_data) GC_ALWAYS_INLINE; -static inline int trace_edge(struct heap *heap, +static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) GC_ALWAYS_INLINE; static inline void @@ -601,7 +601,7 @@ tracer_enqueue_roots(struct tracer *tracer, struct gcobj **objv, } static inline void -tracer_trace(struct heap *heap) { +tracer_trace(struct gc_heap *heap) { struct tracer *tracer = heap_tracer(heap); pthread_mutex_lock(&tracer->lock); diff --git a/quads.c b/quads.c index 7620ff693..6136988b8 100644 --- a/quads.c +++ b/quads.c @@ -15,7 +15,7 @@ typedef HANDLE_TO(Quad) QuadHandle; -static Quad* allocate_quad(struct mutator *mut) { +static Quad* allocate_quad(struct gc_mutator *mut) { // memset to 0 by the collector. return gc_allocate_with_kind(mut, ALLOC_KIND_QUAD, sizeof (Quad)); } @@ -30,7 +30,7 @@ static unsigned long current_time(void) } struct thread { - struct mutator *mut; + struct gc_mutator *mut; struct gc_mutator_roots roots; size_t counter; }; @@ -134,8 +134,8 @@ int main(int argc, char *argv[]) { struct gc_option options[] = { { GC_OPTION_FIXED_HEAP_SIZE, heap_size }, { GC_OPTION_PARALLELISM, parallelism } }; - struct heap *heap; - struct mutator *mut; + struct gc_heap *heap; + struct gc_mutator *mut; if (!gc_init(sizeof options / sizeof options[0], options, &heap, &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); diff --git a/semi.c b/semi.c index 64a9ee652..7e44527a1 100644 --- a/semi.c +++ b/semi.c @@ -28,13 +28,13 @@ struct semi_space { size_t size; long count; }; -struct heap { +struct gc_heap { struct semi_space semi_space; struct large_object_space large_object_space; }; // One mutator per space, can just store the heap in the mutator. 
-struct mutator { - struct heap heap; +struct gc_mutator { + struct gc_heap heap; struct gc_mutator_roots *roots; }; @@ -43,16 +43,16 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static inline struct heap* mutator_heap(struct mutator *mut) { +static inline struct gc_heap* mutator_heap(struct gc_mutator *mut) { return &mut->heap; } -static inline struct semi_space* heap_semi_space(struct heap *heap) { +static inline struct semi_space* heap_semi_space(struct gc_heap *heap) { return &heap->semi_space; } -static inline struct large_object_space* heap_large_object_space(struct heap *heap) { +static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { return &heap->large_object_space; } -static inline struct semi_space* mutator_semi_space(struct mutator *mut) { +static inline struct semi_space* mutator_semi_space(struct gc_mutator *mut) { return heap_semi_space(mutator_heap(mut)); } @@ -60,8 +60,8 @@ static uintptr_t align_up(uintptr_t addr, size_t align) { return (addr + align - 1) & ~(align-1); } -static void collect(struct mutator *mut) GC_NEVER_INLINE; -static void collect_for_alloc(struct mutator *mut, size_t bytes) GC_NEVER_INLINE; +static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; +static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) GC_NEVER_INLINE; static void visit(struct gc_edge edge, void *visit_data); @@ -111,7 +111,7 @@ static void* copy(struct semi_space *space, void *obj) { return new_obj; } -static uintptr_t scan(struct heap *heap, uintptr_t grey) { +static uintptr_t scan(struct gc_heap *heap, uintptr_t grey) { size_t size; gc_trace_object((void*)grey, visit, heap, &size); return grey + align_up(size, GC_ALIGNMENT); @@ -122,12 +122,12 @@ static void* forward(struct semi_space *space, void *obj) { return forwarded ? 
(void*)forwarded : copy(space, obj); } -static void visit_semi_space(struct heap *heap, struct semi_space *space, +static void visit_semi_space(struct gc_heap *heap, struct semi_space *space, struct gc_edge edge, void *obj) { gc_edge_update(edge, gc_ref_from_heap_object(forward(space, obj))); } -static void visit_large_object_space(struct heap *heap, +static void visit_large_object_space(struct gc_heap *heap, struct large_object_space *space, void *obj) { if (large_object_space_copy(space, (uintptr_t)obj)) @@ -139,7 +139,7 @@ static int semi_space_contains(struct semi_space *space, void *obj) { } static void visit(struct gc_edge edge, void *visit_data) { - struct heap *heap = visit_data; + struct gc_heap *heap = visit_data; struct gc_ref ref = gc_edge_ref(edge); if (!gc_ref_is_heap_object(ref)) return; @@ -152,8 +152,8 @@ static void visit(struct gc_edge edge, void *visit_data) { GC_CRASH(); } -static void collect(struct mutator *mut) { - struct heap *heap = mutator_heap(mut); +static void collect(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); struct semi_space *semi = heap_semi_space(heap); struct large_object_space *large = heap_large_object_space(heap); // fprintf(stderr, "start collect #%ld:\n", space->count); @@ -170,7 +170,7 @@ static void collect(struct mutator *mut) { // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); } -static void collect_for_alloc(struct mutator *mut, size_t bytes) { +static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) { collect(mut); struct semi_space *space = mutator_semi_space(mut); if (space->limit - space->hp < bytes) { @@ -179,8 +179,8 @@ static void collect_for_alloc(struct mutator *mut, size_t bytes) { } } -void* gc_allocate_large(struct mutator *mut, size_t size) { - struct heap *heap = mutator_heap(mut); +void* gc_allocate_large(struct gc_mutator *mut, size_t size) { + struct gc_heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); struct semi_space *semi_space = heap_semi_space(heap); @@ -205,7 +205,7 @@ void* gc_allocate_large(struct mutator *mut, size_t size) { return ret; } -void* gc_allocate_small(struct mutator *mut, size_t size) { +void* gc_allocate_small(struct gc_mutator *mut, size_t size) { struct semi_space *space = mutator_semi_space(mut); while (1) { uintptr_t addr = space->hp; @@ -220,7 +220,7 @@ void* gc_allocate_small(struct mutator *mut, size_t size) { return (void *)addr; } } -void* gc_allocate_pointerless(struct mutator *mut, size_t size) { +void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } @@ -311,7 +311,7 @@ static int parse_options(int argc, struct gc_option argv[], } int gc_init(int argc, struct gc_option argv[], - struct heap **heap, struct mutator **mut) { + struct gc_heap **heap, struct gc_mutator **mut) { GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), offsetof(struct semi_space, hp)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), @@ -321,7 +321,7 @@ int gc_init(int argc, struct gc_option argv[], if (!parse_options(argc, argv, &options)) return 0; - *mut = calloc(1, sizeof(struct mutator)); + *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); *heap = mutator_heap(*mut); @@ -336,30 +336,30 @@ int gc_init(int argc, struct gc_option argv[], return 1; } -void gc_mutator_set_roots(struct mutator *mut, +void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots) { mut->roots = roots; } -void 
gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { +void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { GC_CRASH(); } -struct mutator* gc_init_for_thread(uintptr_t *stack_base, - struct heap *heap) { +struct gc_mutator* gc_init_for_thread(uintptr_t *stack_base, + struct gc_heap *heap) { fprintf(stderr, "Semispace copying collector not appropriate for multithreaded use.\n"); GC_CRASH(); } -void gc_finish_for_thread(struct mutator *space) { +void gc_finish_for_thread(struct gc_mutator *space) { } -void* gc_call_without_gc(struct mutator *mut, void* (*f)(void*), +void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) { // Can't be threads, then there won't be collection. return f(data); } -void gc_print_stats(struct heap *heap) { +void gc_print_stats(struct gc_heap *heap) { struct semi_space *space = heap_semi_space(heap); printf("Completed %ld collections\n", space->count); printf("Heap size is %zd\n", space->size); diff --git a/serial-tracer.h b/serial-tracer.h index 27ff36882..c61acf04e 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -125,22 +125,22 @@ struct tracer { struct trace_queue queue; }; -struct heap; -static inline struct tracer* heap_tracer(struct heap *heap); +struct gc_heap; +static inline struct tracer* heap_tracer(struct gc_heap *heap); static int -tracer_init(struct heap *heap, size_t parallelism) { +tracer_init(struct gc_heap *heap, size_t parallelism) { return trace_queue_init(&heap_tracer(heap)->queue); } -static void tracer_prepare(struct heap *heap) {} -static void tracer_release(struct heap *heap) { +static void tracer_prepare(struct gc_heap *heap) {} +static void tracer_release(struct gc_heap *heap) { trace_queue_release(&heap_tracer(heap)->queue); } struct gcobj; static inline void tracer_visit(struct gc_edge edge, void *trace_data) GC_ALWAYS_INLINE; static inline void trace_one(struct gcobj *obj, void *trace_data) GC_ALWAYS_INLINE; -static inline int trace_edge(struct heap *heap, +static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) GC_ALWAYS_INLINE; static inline void @@ -154,13 +154,13 @@ tracer_enqueue_roots(struct tracer *tracer, struct gcobj **objs, } static inline void tracer_visit(struct gc_edge edge, void *trace_data) { - struct heap *heap = trace_data; + struct gc_heap *heap = trace_data; if (trace_edge(heap, edge)) tracer_enqueue_root(heap_tracer(heap), gc_ref_heap_object(gc_edge_ref(edge))); } static inline void -tracer_trace(struct heap *heap) { +tracer_trace(struct gc_heap *heap) { struct gcobj *obj; while ((obj = trace_queue_pop(&heap_tracer(heap)->queue))) trace_one(obj, heap); diff --git a/simple-allocator.h b/simple-allocator.h index f1f02f341..e7f5c6f15 100644 --- a/simple-allocator.h +++ b/simple-allocator.h @@ -5,14 +5,14 @@ #include "gc-api.h" static inline void* -gc_allocate_with_kind(struct mutator *mut, enum alloc_kind kind, size_t bytes) { +gc_allocate_with_kind(struct gc_mutator *mut, enum alloc_kind kind, size_t bytes) { void *obj = gc_allocate(mut, bytes); *tag_word(obj) = tag_live(kind); return obj; } static inline void* -gc_allocate_pointerless_with_kind(struct mutator *mut, enum alloc_kind kind, size_t bytes) { +gc_allocate_pointerless_with_kind(struct gc_mutator *mut, enum alloc_kind kind, size_t bytes) { void *obj = gc_allocate_pointerless(mut, bytes); *tag_word(obj) = tag_live(kind); return obj; diff --git a/whippet.c b/whippet.c index 2fdafa1c7..c8f657a55 100644 --- a/whippet.c +++ b/whippet.c @@ -308,7 +308,7 @@ enum gc_kind { 
GC_KIND_MAJOR_EVACUATING = GC_KIND_FLAG_EVACUATING, }; -struct heap { +struct gc_heap { struct mark_space mark_space; struct large_object_space large_object_space; size_t large_object_pages; @@ -323,11 +323,11 @@ struct heap { size_t active_mutator_count; size_t mutator_count; struct gc_heap_roots *roots; - struct mutator *mutator_trace_list; + struct gc_mutator *mutator_trace_list; long count; long minor_count; uint8_t last_collection_was_minor; - struct mutator *deactivated_mutators; + struct gc_mutator *deactivated_mutators; struct tracer tracer; double fragmentation_low_threshold; double fragmentation_high_threshold; @@ -336,37 +336,37 @@ struct heap { double minimum_major_gc_yield_threshold; }; -struct mutator_mark_buf { +struct gc_mutator_mark_buf { size_t size; size_t capacity; struct gcobj **objects; }; -struct mutator { +struct gc_mutator { // Bump-pointer allocation into holes. uintptr_t alloc; uintptr_t sweep; uintptr_t block; - struct heap *heap; + struct gc_heap *heap; struct gc_mutator_roots *roots; - struct mutator_mark_buf mark_buf; + struct gc_mutator_mark_buf mark_buf; // Three uses for this in-object linked-list pointer: // - inactive (blocked in syscall) mutators // - grey objects when stopping active mutators for mark-in-place // - untraced mutators when stopping active mutators for evacuation - struct mutator *next; + struct gc_mutator *next; }; -static inline struct tracer* heap_tracer(struct heap *heap) { +static inline struct tracer* heap_tracer(struct gc_heap *heap) { return &heap->tracer; } -static inline struct mark_space* heap_mark_space(struct heap *heap) { +static inline struct mark_space* heap_mark_space(struct gc_heap *heap) { return &heap->mark_space; } -static inline struct large_object_space* heap_large_object_space(struct heap *heap) { +static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { return &heap->large_object_space; } -static inline struct heap* mutator_heap(struct mutator *mutator) { +static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } @@ -374,7 +374,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct mutator *mut) GC_NEVER_INLINE; +static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; static int heap_object_is_large(struct gcobj *obj) { size_t size; @@ -626,7 +626,7 @@ static inline int large_object_space_mark_object(struct large_object_space *spac return large_object_space_copy(space, (uintptr_t)obj); } -static inline int trace_edge(struct heap *heap, struct gc_edge edge) { +static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { struct gc_ref ref = gc_edge_ref(edge); if (!gc_ref_is_heap_object(ref)) return 0; @@ -648,22 +648,22 @@ static inline void trace_one(struct gcobj *obj, void *mark_data) { gc_trace_object(obj, tracer_visit, mark_data, NULL); } -static int heap_has_multiple_mutators(struct heap *heap) { +static int heap_has_multiple_mutators(struct gc_heap *heap) { return atomic_load_explicit(&heap->multithreaded, memory_order_relaxed); } -static int mutators_are_stopping(struct heap *heap) { +static int mutators_are_stopping(struct gc_heap *heap) { return atomic_load_explicit(&heap->collecting, memory_order_relaxed); } -static inline void heap_lock(struct heap *heap) { +static inline void heap_lock(struct gc_heap *heap) { pthread_mutex_lock(&heap->lock); } -static inline void heap_unlock(struct heap *heap) { +static inline void heap_unlock(struct gc_heap 
*heap) { pthread_mutex_unlock(&heap->lock); } -static void add_mutator(struct heap *heap, struct mutator *mut) { +static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = heap; heap_lock(heap); // We have no roots. If there is a GC currently in progress, we have @@ -677,7 +677,7 @@ static void add_mutator(struct heap *heap, struct mutator *mut) { heap_unlock(heap); } -static void remove_mutator(struct heap *heap, struct mutator *mut) { +static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = NULL; heap_lock(heap); heap->active_mutator_count--; @@ -689,12 +689,12 @@ static void remove_mutator(struct heap *heap, struct mutator *mut) { heap_unlock(heap); } -static void request_mutators_to_stop(struct heap *heap) { +static void request_mutators_to_stop(struct gc_heap *heap) { GC_ASSERT(!mutators_are_stopping(heap)); atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); } -static void allow_mutators_to_continue(struct heap *heap) { +static void allow_mutators_to_continue(struct gc_heap *heap) { GC_ASSERT(mutators_are_stopping(heap)); GC_ASSERT(heap->active_mutator_count == 0); heap->active_mutator_count++; @@ -780,9 +780,9 @@ static void mark_space_reacquire_memory(struct mark_space *space, } } -static size_t next_hole(struct mutator *mut); +static size_t next_hole(struct gc_mutator *mut); -static int sweep_until_memory_released(struct mutator *mut) { +static int sweep_until_memory_released(struct gc_mutator *mut) { struct mark_space *space = heap_mark_space(mutator_heap(mut)); ssize_t pending = atomic_load_explicit(&space->pending_unavailable_bytes, memory_order_acquire); @@ -816,7 +816,7 @@ static int sweep_until_memory_released(struct mutator *mut) { return pending <= 0; } -static void heap_reset_large_object_pages(struct heap *heap, size_t npages) { +static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { size_t previous = heap->large_object_pages; heap->large_object_pages = npages; GC_ASSERT(npages <= previous); @@ -825,7 +825,7 @@ static void heap_reset_large_object_pages(struct heap *heap, size_t npages) { mark_space_reacquire_memory(heap_mark_space(heap), bytes); } -static void mutator_mark_buf_grow(struct mutator_mark_buf *buf) { +static void mutator_mark_buf_grow(struct gc_mutator_mark_buf *buf) { size_t old_capacity = buf->capacity; size_t old_bytes = old_capacity * sizeof(struct gcobj*); @@ -846,30 +846,30 @@ static void mutator_mark_buf_grow(struct mutator_mark_buf *buf) { buf->capacity = new_capacity; } -static void mutator_mark_buf_push(struct mutator_mark_buf *buf, +static void mutator_mark_buf_push(struct gc_mutator_mark_buf *buf, struct gcobj *val) { if (GC_UNLIKELY(buf->size == buf->capacity)) mutator_mark_buf_grow(buf); buf->objects[buf->size++] = val; } -static void mutator_mark_buf_release(struct mutator_mark_buf *buf) { +static void mutator_mark_buf_release(struct gc_mutator_mark_buf *buf) { size_t bytes = buf->size * sizeof(struct gcobj*); if (bytes >= getpagesize()) madvise(buf->objects, align_up(bytes, getpagesize()), MADV_DONTNEED); buf->size = 0; } -static void mutator_mark_buf_destroy(struct mutator_mark_buf *buf) { +static void mutator_mark_buf_destroy(struct gc_mutator_mark_buf *buf) { size_t bytes = buf->capacity * sizeof(struct gcobj*); if (bytes) munmap(buf->objects, bytes); } -static void enqueue_mutator_for_tracing(struct mutator *mut) { - struct heap *heap = mutator_heap(mut); +static void enqueue_mutator_for_tracing(struct gc_mutator *mut) { + struct gc_heap 
*heap = mutator_heap(mut); GC_ASSERT(mut->next == NULL); - struct mutator *next = + struct gc_mutator *next = atomic_load_explicit(&heap->mutator_trace_list, memory_order_acquire); do { mut->next = next; @@ -877,7 +877,7 @@ static void enqueue_mutator_for_tracing(struct mutator *mut) { &next, mut)); } -static int heap_should_mark_while_stopping(struct heap *heap) { +static int heap_should_mark_while_stopping(struct gc_heap *heap) { if (heap->allow_pinning) { // The metadata byte is mostly used for marking and object extent. // For marking, we allow updates to race, because the state @@ -901,27 +901,27 @@ static int heap_should_mark_while_stopping(struct heap *heap) { return (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_EVACUATING) == 0; } -static int mutator_should_mark_while_stopping(struct mutator *mut) { +static int mutator_should_mark_while_stopping(struct gc_mutator *mut) { return heap_should_mark_while_stopping(mutator_heap(mut)); } -void gc_mutator_set_roots(struct mutator *mut, +void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots) { mut->roots = roots; } -void gc_heap_set_roots(struct heap *heap, struct gc_heap_roots *roots) { +void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { heap->roots = roots; } static void trace_and_enqueue_locally(struct gc_edge edge, void *data) { - struct mutator *mut = data; + struct gc_mutator *mut = data; if (trace_edge(mutator_heap(mut), edge)) mutator_mark_buf_push(&mut->mark_buf, gc_ref_heap_object(gc_edge_ref(edge))); } static void trace_and_enqueue_globally(struct gc_edge edge, void *data) { - struct heap *heap = data; + struct gc_heap *heap = data; if (trace_edge(heap, edge)) tracer_enqueue_root(&heap->tracer, gc_ref_heap_object(gc_edge_ref(edge))); @@ -929,43 +929,43 @@ static void trace_and_enqueue_globally(struct gc_edge edge, void *data) { // Mark the roots of a mutator that is stopping for GC. We can't // enqueue them directly, so we send them to the controller in a buffer. -static void mark_stopping_mutator_roots(struct mutator *mut) { +static void mark_stopping_mutator_roots(struct gc_mutator *mut) { GC_ASSERT(mutator_should_mark_while_stopping(mut)); gc_trace_mutator_roots(mut->roots, trace_and_enqueue_locally, mut); } // Precondition: the caller holds the heap lock. 
-static void mark_mutator_roots_with_lock(struct mutator *mut) { +static void mark_mutator_roots_with_lock(struct gc_mutator *mut) { gc_trace_mutator_roots(mut->roots, trace_and_enqueue_globally, mutator_heap(mut)); } -static void trace_mutator_roots_with_lock(struct mutator *mut) { +static void trace_mutator_roots_with_lock(struct gc_mutator *mut) { mark_mutator_roots_with_lock(mut); } -static void trace_mutator_roots_with_lock_before_stop(struct mutator *mut) { +static void trace_mutator_roots_with_lock_before_stop(struct gc_mutator *mut) { if (mutator_should_mark_while_stopping(mut)) mark_mutator_roots_with_lock(mut); else enqueue_mutator_for_tracing(mut); } -static void release_stopping_mutator_roots(struct mutator *mut) { +static void release_stopping_mutator_roots(struct gc_mutator *mut) { mutator_mark_buf_release(&mut->mark_buf); } -static void wait_for_mutators_to_stop(struct heap *heap) { +static void wait_for_mutators_to_stop(struct gc_heap *heap) { heap->active_mutator_count--; while (heap->active_mutator_count) pthread_cond_wait(&heap->collector_cond, &heap->lock); } -static void finish_sweeping(struct mutator *mut); -static void finish_sweeping_in_block(struct mutator *mut); +static void finish_sweeping(struct gc_mutator *mut); +static void finish_sweeping_in_block(struct gc_mutator *mut); -static void trace_mutator_roots_after_stop(struct heap *heap) { - struct mutator *mut = atomic_load(&heap->mutator_trace_list); +static void trace_mutator_roots_after_stop(struct gc_heap *heap) { + struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); int active_mutators_already_marked = heap_should_mark_while_stopping(heap); while (mut) { if (active_mutators_already_marked) @@ -973,24 +973,24 @@ static void trace_mutator_roots_after_stop(struct heap *heap) { mut->mark_buf.objects, mut->mark_buf.size); else trace_mutator_roots_with_lock(mut); - struct mutator *next = mut->next; + struct gc_mutator *next = mut->next; mut->next = NULL; mut = next; } atomic_store(&heap->mutator_trace_list, NULL); - for (struct mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) { + for (struct gc_mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) { finish_sweeping_in_block(mut); trace_mutator_roots_with_lock(mut); } } -static void trace_global_roots(struct heap *heap) { +static void trace_global_roots(struct gc_heap *heap) { gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap); } static inline int -heap_object_is_young(struct heap *heap, struct gcobj *obj) { +heap_object_is_young(struct gc_heap *heap, struct gcobj *obj) { if (GC_UNLIKELY(!mark_space_contains(heap_mark_space(heap), obj))) { // No lospace nursery, for the moment. return 0; @@ -1023,7 +1023,7 @@ static uint64_t broadcast_byte(uint8_t byte) { // byte doesn't hold any roots, if all stores were to nursery objects. 
STATIC_ASSERT_EQ(GRANULES_PER_REMSET_BYTE % 8, 0); static void mark_space_trace_card(struct mark_space *space, - struct heap *heap, struct slab *slab, + struct gc_heap *heap, struct slab *slab, size_t card) { uintptr_t first_addr_in_slab = (uintptr_t) &slab->blocks[0]; size_t granule_base = card * GRANULES_PER_REMSET_BYTE; @@ -1045,7 +1045,7 @@ static void mark_space_trace_card(struct mark_space *space, } static void mark_space_trace_remembered_set(struct mark_space *space, - struct heap *heap) { + struct gc_heap *heap) { GC_ASSERT(!space->evacuating); for (size_t s = 0; s < space->nslabs; s++) { struct slab *slab = &space->slabs[s]; @@ -1072,7 +1072,7 @@ static void mark_space_clear_remembered_set(struct mark_space *space) { } } -static void trace_generational_roots(struct heap *heap) { +static void trace_generational_roots(struct gc_heap *heap) { // TODO: Add lospace nursery. if (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_MINOR) { mark_space_trace_remembered_set(heap_mark_space(heap), heap); @@ -1081,8 +1081,8 @@ static void trace_generational_roots(struct heap *heap) { } } -static void pause_mutator_for_collection(struct heap *heap) GC_NEVER_INLINE; -static void pause_mutator_for_collection(struct heap *heap) { +static void pause_mutator_for_collection(struct gc_heap *heap) GC_NEVER_INLINE; +static void pause_mutator_for_collection(struct gc_heap *heap) { GC_ASSERT(mutators_are_stopping(heap)); GC_ASSERT(heap->active_mutator_count); heap->active_mutator_count--; @@ -1104,9 +1104,9 @@ static void pause_mutator_for_collection(struct heap *heap) { heap->active_mutator_count++; } -static void pause_mutator_for_collection_with_lock(struct mutator *mut) GC_NEVER_INLINE; -static void pause_mutator_for_collection_with_lock(struct mutator *mut) { - struct heap *heap = mutator_heap(mut); +static void pause_mutator_for_collection_with_lock(struct gc_mutator *mut) GC_NEVER_INLINE; +static void pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); finish_sweeping_in_block(mut); if (mutator_should_mark_while_stopping(mut)) @@ -1117,9 +1117,9 @@ static void pause_mutator_for_collection_with_lock(struct mutator *mut) { pause_mutator_for_collection(heap); } -static void pause_mutator_for_collection_without_lock(struct mutator *mut) GC_NEVER_INLINE; -static void pause_mutator_for_collection_without_lock(struct mutator *mut) { - struct heap *heap = mutator_heap(mut); +static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) GC_NEVER_INLINE; +static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); finish_sweeping(mut); if (mutator_should_mark_while_stopping(mut)) @@ -1131,7 +1131,7 @@ static void pause_mutator_for_collection_without_lock(struct mutator *mut) { release_stopping_mutator_roots(mut); } -static inline void maybe_pause_mutator_for_collection(struct mutator *mut) { +static inline void maybe_pause_mutator_for_collection(struct gc_mutator *mut) { while (mutators_are_stopping(mutator_heap(mut))) pause_mutator_for_collection_without_lock(mut); } @@ -1155,11 +1155,11 @@ static void reset_statistics(struct mark_space *space) { space->fragmentation_granules_since_last_collection = 0; } -static int maybe_grow_heap(struct heap *heap) { +static int maybe_grow_heap(struct gc_heap *heap) { return 0; } -static double heap_last_gc_yield(struct heap *heap) { +static double 
heap_last_gc_yield(struct gc_heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); size_t mark_space_yield = mark_space->granules_freed_by_last_collection; mark_space_yield <<= GRANULE_SIZE_LOG_2; @@ -1180,7 +1180,7 @@ static double heap_last_gc_yield(struct heap *heap) { return yield / heap->size; } -static double heap_fragmentation(struct heap *heap) { +static double heap_fragmentation(struct gc_heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); size_t fragmentation_granules = mark_space->fragmentation_granules_since_last_collection; @@ -1189,7 +1189,7 @@ static double heap_fragmentation(struct heap *heap) { return ((double)fragmentation_granules) / heap_granules; } -static void detect_out_of_memory(struct heap *heap) { +static void detect_out_of_memory(struct gc_heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); @@ -1216,7 +1216,7 @@ static void detect_out_of_memory(struct heap *heap) { GC_CRASH(); } -static double clamp_major_gc_yield_threshold(struct heap *heap, +static double clamp_major_gc_yield_threshold(struct gc_heap *heap, double threshold) { if (threshold < heap->minimum_major_gc_yield_threshold) threshold = heap->minimum_major_gc_yield_threshold; @@ -1226,7 +1226,7 @@ static double clamp_major_gc_yield_threshold(struct heap *heap, return threshold; } -static enum gc_kind determine_collection_kind(struct heap *heap) { +static enum gc_kind determine_collection_kind(struct gc_heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); enum gc_kind previous_gc_kind = atomic_load(&heap->gc_kind); enum gc_kind gc_kind; @@ -1305,7 +1305,7 @@ static void release_evacuation_target_blocks(struct mark_space *space) { reserve); } -static void prepare_for_evacuation(struct heap *heap) { +static void prepare_for_evacuation(struct gc_heap *heap) { struct mark_space *space = heap_mark_space(heap); if ((heap->gc_kind & GC_KIND_FLAG_EVACUATING) == 0) { @@ -1397,13 +1397,13 @@ static void prepare_for_evacuation(struct heap *heap) { space->evacuating = 1; } -static void trace_conservative_roots_after_stop(struct heap *heap) { +static void trace_conservative_roots_after_stop(struct gc_heap *heap) { // FIXME: Visit conservative roots, if the collector is configured in // that way. Mark them in place, preventing any subsequent // evacuation. 
} -static void trace_precise_roots_after_stop(struct heap *heap) { +static void trace_precise_roots_after_stop(struct gc_heap *heap) { trace_mutator_roots_after_stop(heap); trace_global_roots(heap); trace_generational_roots(heap); @@ -1418,8 +1418,8 @@ static void mark_space_finish_gc(struct mark_space *space, release_evacuation_target_blocks(space); } -static void collect(struct mutator *mut) { - struct heap *heap = mutator_heap(mut); +static void collect(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); if (maybe_grow_heap(heap)) { @@ -1524,7 +1524,7 @@ static uintptr_t mark_space_next_block_to_sweep(struct mark_space *space) { return block; } -static void finish_block(struct mutator *mut) { +static void finish_block(struct gc_mutator *mut) { GC_ASSERT(mut->block); struct block_summary *block = block_summary_for_addr(mut->block); struct mark_space *space = heap_mark_space(mutator_heap(mut)); @@ -1547,7 +1547,7 @@ static void finish_block(struct mutator *mut) { // Sweep some heap to reclaim free space, resetting mut->alloc and // mut->sweep. Return the size of the hole in granules. -static size_t next_hole_in_block(struct mutator *mut) { +static size_t next_hole_in_block(struct gc_mutator *mut) { uintptr_t sweep = mut->sweep; if (sweep == 0) return 0; @@ -1596,7 +1596,7 @@ static size_t next_hole_in_block(struct mutator *mut) { return 0; } -static void finish_hole(struct mutator *mut) { +static void finish_hole(struct gc_mutator *mut) { size_t granules = (mut->sweep - mut->alloc) / GRANULE_SIZE; if (granules) { struct block_summary *summary = block_summary_for_addr(mut->block); @@ -1609,7 +1609,7 @@ static void finish_hole(struct mutator *mut) { // FIXME: add to fragmentation } -static int maybe_release_swept_empty_block(struct mutator *mut) { +static int maybe_release_swept_empty_block(struct gc_mutator *mut) { GC_ASSERT(mut->block); struct mark_space *space = heap_mark_space(mutator_heap(mut)); uintptr_t block = mut->block; @@ -1623,7 +1623,7 @@ static int maybe_release_swept_empty_block(struct mutator *mut) { return 1; } -static size_t next_hole(struct mutator *mut) { +static size_t next_hole(struct gc_mutator *mut) { finish_hole(mut); // As we sweep if we find that a block is empty, we return it to the // empties list. Empties are precious. But if we return 10 blocks in @@ -1740,20 +1740,20 @@ static size_t next_hole(struct mutator *mut) { } } -static void finish_sweeping_in_block(struct mutator *mut) { +static void finish_sweeping_in_block(struct gc_mutator *mut) { while (next_hole_in_block(mut)) finish_hole(mut); } // Another thread is triggering GC. Before we stop, finish clearing the // dead mark bytes for the mutator's block, and release the block. 
-static void finish_sweeping(struct mutator *mut) { +static void finish_sweeping(struct gc_mutator *mut) { while (next_hole(mut)) finish_hole(mut); } -static void trigger_collection(struct mutator *mut) { - struct heap *heap = mutator_heap(mut); +static void trigger_collection(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); heap_lock(heap); if (mutators_are_stopping(heap)) pause_mutator_for_collection_with_lock(mut); @@ -1762,8 +1762,8 @@ static void trigger_collection(struct mutator *mut) { heap_unlock(heap); } -void* gc_allocate_large(struct mutator *mut, size_t size) { - struct heap *heap = mutator_heap(mut); +void* gc_allocate_large(struct gc_mutator *mut, size_t size) { + struct gc_heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); size_t npages = large_object_space_npages(space, size); @@ -1787,7 +1787,7 @@ void* gc_allocate_large(struct mutator *mut, size_t size) { return ret; } -void* gc_allocate_small(struct mutator *mut, size_t size) { +void* gc_allocate_small(struct gc_mutator *mut, size_t size) { GC_ASSERT(size > 0); // allocating 0 bytes would be silly GC_ASSERT(size <= gc_allocator_large_threshold()); size = align_up(size, GRANULE_SIZE); @@ -1816,7 +1816,7 @@ void* gc_allocate_small(struct mutator *mut, size_t size) { return obj; } -void* gc_allocate_pointerless(struct mutator *mut, size_t size) { +void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } @@ -1907,7 +1907,7 @@ static struct slab* allocate_slabs(size_t nslabs) { return (struct slab*) aligned_base; } -static int heap_init(struct heap *heap, struct options *options) { +static int heap_init(struct gc_heap *heap, struct options *options) { // *heap is already initialized to 0. 
pthread_mutex_init(&heap->lock, NULL); @@ -1928,7 +1928,7 @@ static int heap_init(struct heap *heap, struct options *options) { return 1; } -static int mark_space_init(struct mark_space *space, struct heap *heap) { +static int mark_space_init(struct mark_space *space, struct gc_heap *heap) { size_t size = align_up(heap->size, SLAB_SIZE); size_t nslabs = size / SLAB_SIZE; struct slab *slabs = allocate_slabs(nslabs); @@ -1961,13 +1961,13 @@ static int mark_space_init(struct mark_space *space, struct heap *heap) { } int gc_init(int argc, struct gc_option argv[], - struct heap **heap, struct mutator **mut) { + struct gc_heap **heap, struct gc_mutator **mut) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GRANULE_SIZE); GC_ASSERT_EQ(gc_allocator_large_threshold(), LARGE_OBJECT_THRESHOLD); GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), - offsetof(struct mutator, alloc)); + offsetof(struct gc_mutator, alloc)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), - offsetof(struct mutator, sweep)); + offsetof(struct gc_mutator, sweep)); GC_ASSERT_EQ(gc_allocator_alloc_table_alignment(), SLAB_SIZE); GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(), METADATA_BYTE_YOUNG); GC_ASSERT_EQ(gc_allocator_alloc_table_end_pattern(), METADATA_BYTE_END); @@ -1981,7 +1981,7 @@ int gc_init(int argc, struct gc_option argv[], if (!parse_options(argc, argv, &options)) return 0; - *heap = calloc(1, sizeof(struct heap)); + *heap = calloc(1, sizeof(struct gc_heap)); if (!*heap) GC_CRASH(); if (!heap_init(*heap, &options)) @@ -1997,28 +1997,28 @@ int gc_init(int argc, struct gc_option argv[], if (!large_object_space_init(heap_large_object_space(*heap), *heap)) GC_CRASH(); - *mut = calloc(1, sizeof(struct mutator)); + *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); add_mutator(*heap, *mut); return 1; } -struct mutator* gc_init_for_thread(uintptr_t *stack_base, - struct heap *heap) { - struct mutator *ret = calloc(1, sizeof(struct mutator)); +struct gc_mutator* gc_init_for_thread(uintptr_t *stack_base, + struct gc_heap *heap) { + struct gc_mutator *ret = calloc(1, sizeof(struct gc_mutator)); if (!ret) GC_CRASH(); add_mutator(heap, ret); return ret; } -void gc_finish_for_thread(struct mutator *mut) { +void gc_finish_for_thread(struct gc_mutator *mut) { remove_mutator(mutator_heap(mut), mut); mutator_mark_buf_destroy(&mut->mark_buf); free(mut); } -static void deactivate_mutator(struct heap *heap, struct mutator *mut) { +static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); heap_lock(heap); mut->next = heap->deactivated_mutators; @@ -2029,11 +2029,11 @@ static void deactivate_mutator(struct heap *heap, struct mutator *mut) { heap_unlock(heap); } -static void reactivate_mutator(struct heap *heap, struct mutator *mut) { +static void reactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { heap_lock(heap); while (mutators_are_stopping(heap)) pthread_cond_wait(&heap->mutator_cond, &heap->lock); - struct mutator **prev = &heap->deactivated_mutators; + struct gc_mutator **prev = &heap->deactivated_mutators; while (*prev != mut) prev = &(*prev)->next; *prev = mut->next; @@ -2042,17 +2042,17 @@ static void reactivate_mutator(struct heap *heap, struct mutator *mut) { heap_unlock(heap); } -void* gc_call_without_gc(struct mutator *mut, - void* (*f)(void*), - void *data) { - struct heap *heap = mutator_heap(mut); +void* gc_call_without_gc(struct gc_mutator *mut, + void* (*f)(void*), + void *data) { + struct gc_heap *heap = 
mutator_heap(mut); deactivate_mutator(heap, mut); void *ret = f(data); reactivate_mutator(heap, mut); return ret; } -void gc_print_stats(struct heap *heap) { +void gc_print_stats(struct gc_heap *heap) { printf("Completed %ld collections (%ld major)\n", heap->count, heap->count - heap->minor_count); printf("Heap size with overhead is %zd (%zu slabs)\n", From 6ecf226570d15291d75df4a412a2cd18eee0d602 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 16 Aug 2022 22:48:46 +0200 Subject: [PATCH 139/403] More typesafety, more gc_ref --- gc-embedder-api.h | 11 +++--- gc-forwarding.h | 3 +- large-object-space.h | 10 +++--- parallel-tracer.h | 6 ++-- semi.c | 50 ++++++++++++++-------------- serial-tracer.h | 4 +-- simple-allocator.h | 4 +-- simple-gc-embedder.h | 38 +++++++++++---------- simple-tagging-scheme.h | 4 +-- whippet.c | 74 +++++++++++++---------------------------- 10 files changed, 90 insertions(+), 114 deletions(-) diff --git a/gc-embedder-api.h b/gc-embedder-api.h index 26929b72e..9756cd30b 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -12,7 +12,7 @@ struct gc_mutator_roots; struct gc_heap_roots; struct gc_atomic_forward; -GC_EMBEDDER_API inline void gc_trace_object(void *object, +GC_EMBEDDER_API inline void gc_trace_object(struct gc_ref ref, void (*trace_edge)(struct gc_edge edge, void *trace_data), void *trace_data, @@ -26,15 +26,16 @@ GC_EMBEDDER_API inline void gc_trace_heap_roots(struct gc_heap_roots *roots, void *trace_data), void *trace_data); -GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(void *object); -GC_EMBEDDER_API inline void gc_object_forward_nonatomic(void *object, uintptr_t new_addr); +GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref); +GC_EMBEDDER_API inline void gc_object_forward_nonatomic(struct gc_ref ref, + struct gc_ref new_ref); -GC_EMBEDDER_API inline struct gc_atomic_forward gc_atomic_forward_begin(void *obj); +GC_EMBEDDER_API inline struct gc_atomic_forward gc_atomic_forward_begin(struct gc_ref ref); GC_EMBEDDER_API inline void gc_atomic_forward_acquire(struct gc_atomic_forward *); GC_EMBEDDER_API inline int gc_atomic_forward_retry_busy(struct gc_atomic_forward *); GC_EMBEDDER_API inline void gc_atomic_forward_abort(struct gc_atomic_forward *); GC_EMBEDDER_API inline void gc_atomic_forward_commit(struct gc_atomic_forward *, - uintptr_t new_addr); + struct gc_ref new_ref); GC_EMBEDDER_API inline uintptr_t gc_atomic_forward_address(struct gc_atomic_forward *); diff --git a/gc-forwarding.h b/gc-forwarding.h index 4fb1dec2c..b598e47a1 100644 --- a/gc-forwarding.h +++ b/gc-forwarding.h @@ -2,6 +2,7 @@ #define GC_FORWARDING_H #include +#include "gc-ref.h" enum gc_forwarding_state { GC_FORWARDING_STATE_FORWARDED, @@ -12,7 +13,7 @@ enum gc_forwarding_state { }; struct gc_atomic_forward { - void *object; + struct gc_ref ref; uintptr_t data; enum gc_forwarding_state state; }; diff --git a/large-object-space.h b/large-object-space.h index dd223f94e..01bc4cfc7 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -9,6 +9,7 @@ #include #include +#include "gc-ref.h" #include "address-map.h" #include "address-set.h" @@ -19,7 +20,6 @@ // copying collector while not actually copying data. 
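
[Editorial aside, not part of the patch: a sketch of how an evacuating collector might drive the gc_atomic_forward protocol declared in gc-embedder-api.h above, now expressed in terms of struct gc_ref. It assumes the embedder's forwarding implementation (e.g. simple-gc-embedder.h) is in scope; evacuation_copy is a hypothetical stand-in for the collector's copy routine, and a real collector such as whippet.c interleaves this with its own allocation and marking logic.]

#include "gc-forwarding.h"

static struct gc_ref evacuation_copy(struct gc_ref old);  /* hypothetical */

static struct gc_ref forward_or_evacuate(struct gc_ref old) {
  struct gc_atomic_forward fwd = gc_atomic_forward_begin(old);
  for (;;) {
    switch (fwd.state) {
    case GC_FORWARDING_STATE_NOT_FORWARDED:
      /* Try to claim the object for copying. */
      gc_atomic_forward_acquire(&fwd);
      break;
    case GC_FORWARDING_STATE_ACQUIRED: {
      /* We won the race: copy, then publish the new address in the tag word. */
      struct gc_ref new_ref = evacuation_copy(old);
      gc_atomic_forward_commit(&fwd, new_ref);
      return new_ref;
    }
    case GC_FORWARDING_STATE_FORWARDED:
      /* Another thread already copied it; follow the forwarding address. */
      return gc_ref(gc_atomic_forward_address(&fwd));
    case GC_FORWARDING_STATE_BUSY:
      /* A copy is in progress; spin until the tag word resolves. */
      while (!gc_atomic_forward_retry_busy(&fwd))
        ;
      break;
    default:
      /* GC_FORWARDING_STATE_ABORTED only arises after gc_atomic_forward_abort,
         which this sketch never calls. */
      return old;
    }
  }
}
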
struct gc_heap; -struct gcobj; struct large_object_space { pthread_mutex_t lock; @@ -71,8 +71,9 @@ static void large_object_space_start_gc(struct large_object_space *space, } static int large_object_space_copy(struct large_object_space *space, - uintptr_t addr) { + struct gc_ref ref) { int copied = 0; + uintptr_t addr = gc_ref_value(ref); pthread_mutex_lock(&space->lock); if (!address_set_contains(&space->from_space, addr)) // Already copied; object is grey or white. @@ -145,12 +146,11 @@ static void large_object_space_finish_gc(struct large_object_space *space, } static inline int large_object_space_contains(struct large_object_space *space, - struct gcobj *ptr) { - int ret; + struct gc_ref ref) { pthread_mutex_lock(&space->lock); // ptr might be in fromspace or tospace. Just check the object_pages table, which // contains both, as well as object_pages for free blocks. - ret = address_map_contains(&space->object_pages, (uintptr_t)ptr); + int ret = address_map_contains(&space->object_pages, gc_ref_value(ref)); pthread_mutex_unlock(&space->lock); return ret; } diff --git a/parallel-tracer.h b/parallel-tracer.h index 56dfa39b6..180f8d09b 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -449,7 +449,7 @@ static void tracer_release(struct gc_heap *heap) { struct gcobj; static inline void tracer_visit(struct gc_edge edge, void *trace_data) GC_ALWAYS_INLINE; -static inline void trace_one(struct gcobj *obj, void *trace_data) GC_ALWAYS_INLINE; +static inline void trace_one(struct gc_ref ref, void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) GC_ALWAYS_INLINE; @@ -571,7 +571,7 @@ trace_worker_trace(struct trace_worker *worker) { size_t n = 0; DEBUG("tracer #%zu: running trace loop\n", worker->id); while (1) { - struct gcobj * obj; + void *obj; if (!local_trace_queue_empty(&trace.local)) { obj = local_trace_queue_pop(&trace.local); } else { @@ -579,7 +579,7 @@ trace_worker_trace(struct trace_worker *worker) { if (!obj) break; } - trace_one(obj, &trace); + trace_one(gc_ref_from_heap_object(obj), &trace); n++; } DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); diff --git a/semi.c b/semi.c index 7e44527a1..d67ed68ac 100644 --- a/semi.c +++ b/semi.c @@ -101,41 +101,42 @@ static void flip(struct semi_space *space) { space->count++; } -static void* copy(struct semi_space *space, void *obj) { +static struct gc_ref copy(struct semi_space *space, struct gc_ref ref) { size_t size; - gc_trace_object(obj, NULL, NULL, &size); - void *new_obj = (void*)space->hp; - memcpy(new_obj, obj, size); - *(uintptr_t*) obj = space->hp; - space->hp += align_up (size, GC_ALIGNMENT); - return new_obj; + gc_trace_object(ref, NULL, NULL, &size); + struct gc_ref new_ref = gc_ref(space->hp); + memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(ref), size); + gc_object_forward_nonatomic(ref, new_ref); + space->hp += align_up(size, GC_ALIGNMENT); + return new_ref; } -static uintptr_t scan(struct gc_heap *heap, uintptr_t grey) { +static uintptr_t scan(struct gc_heap *heap, struct gc_ref grey) { size_t size; - gc_trace_object((void*)grey, visit, heap, &size); - return grey + align_up(size, GC_ALIGNMENT); + gc_trace_object(grey, visit, heap, &size); + return gc_ref_value(grey) + align_up(size, GC_ALIGNMENT); } -static void* forward(struct semi_space *space, void *obj) { +static struct gc_ref forward(struct semi_space *space, struct gc_ref obj) { uintptr_t forwarded = gc_object_forwarded_nonatomic(obj); - return forwarded ? 
(void*)forwarded : copy(space, obj); + return forwarded ? gc_ref(forwarded) : copy(space, obj); } static void visit_semi_space(struct gc_heap *heap, struct semi_space *space, - struct gc_edge edge, void *obj) { - gc_edge_update(edge, gc_ref_from_heap_object(forward(space, obj))); + struct gc_edge edge, struct gc_ref ref) { + gc_edge_update(edge, forward(space, ref)); } static void visit_large_object_space(struct gc_heap *heap, struct large_object_space *space, - void *obj) { - if (large_object_space_copy(space, (uintptr_t)obj)) - scan(heap, (uintptr_t)obj); + struct gc_ref ref) { + if (large_object_space_copy(space, ref)) + gc_trace_object(ref, visit, heap, NULL); } -static int semi_space_contains(struct semi_space *space, void *obj) { - return (((uintptr_t)obj) - space->base) < space->size; +static int semi_space_contains(struct semi_space *space, struct gc_ref ref) { + uintptr_t addr = gc_ref_value(ref); + return addr - space->base < space->size; } static void visit(struct gc_edge edge, void *visit_data) { @@ -143,11 +144,10 @@ static void visit(struct gc_edge edge, void *visit_data) { struct gc_ref ref = gc_edge_ref(edge); if (!gc_ref_is_heap_object(ref)) return; - void *obj = gc_ref_heap_object(ref); - if (semi_space_contains(heap_semi_space(heap), obj)) - visit_semi_space(heap, heap_semi_space(heap), edge, obj); - else if (large_object_space_contains(heap_large_object_space(heap), obj)) - visit_large_object_space(heap, heap_large_object_space(heap), obj); + if (semi_space_contains(heap_semi_space(heap), ref)) + visit_semi_space(heap, heap_semi_space(heap), edge, ref); + else if (large_object_space_contains(heap_large_object_space(heap), ref)) + visit_large_object_space(heap, heap_large_object_space(heap), ref); else GC_CRASH(); } @@ -164,7 +164,7 @@ static void collect(struct gc_mutator *mut) { gc_trace_mutator_roots(mut->roots, visit, heap); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); while(grey < semi->hp) - grey = scan(heap, grey); + grey = scan(heap, gc_ref(grey)); large_object_space_finish_gc(large, 0); semi_space_set_stolen_pages(semi, large->live_pages_at_last_collection); // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); diff --git a/serial-tracer.h b/serial-tracer.h index c61acf04e..c2202d841 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -139,7 +139,7 @@ static void tracer_release(struct gc_heap *heap) { struct gcobj; static inline void tracer_visit(struct gc_edge edge, void *trace_data) GC_ALWAYS_INLINE; -static inline void trace_one(struct gcobj *obj, void *trace_data) GC_ALWAYS_INLINE; +static inline void trace_one(struct gc_ref ref, void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) GC_ALWAYS_INLINE; @@ -163,7 +163,7 @@ static inline void tracer_trace(struct gc_heap *heap) { struct gcobj *obj; while ((obj = trace_queue_pop(&heap_tracer(heap)->queue))) - trace_one(obj, heap); + trace_one(gc_ref_from_heap_object(obj), heap); } #endif // SERIAL_TRACER_H diff --git a/simple-allocator.h b/simple-allocator.h index e7f5c6f15..1edba85d3 100644 --- a/simple-allocator.h +++ b/simple-allocator.h @@ -7,14 +7,14 @@ static inline void* gc_allocate_with_kind(struct gc_mutator *mut, enum alloc_kind kind, size_t bytes) { void *obj = gc_allocate(mut, bytes); - *tag_word(obj) = tag_live(kind); + *tag_word(gc_ref_from_heap_object(obj)) = tag_live(kind); return obj; } static inline void* gc_allocate_pointerless_with_kind(struct gc_mutator *mut, enum alloc_kind kind, 
size_t bytes) { void *obj = gc_allocate_pointerless(mut, bytes); - *tag_word(obj) = tag_live(kind); + *tag_word(gc_ref_from_heap_object(obj)) = tag_live(kind); return obj; } diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h index 42e04485e..71255256d 100644 --- a/simple-gc-embedder.h +++ b/simple-gc-embedder.h @@ -3,18 +3,19 @@ #include "simple-tagging-scheme.h" #include "gc-embedder-api.h" -static inline void gc_trace_object(void *object, +static inline void gc_trace_object(struct gc_ref ref, void (*trace_edge)(struct gc_edge edge, void *trace_data), void *trace_data, size_t *size) { - switch (tag_live_alloc_kind(*tag_word(object))) { + switch (tag_live_alloc_kind(*tag_word(ref))) { #define SCAN_OBJECT(name, Name, NAME) \ case ALLOC_KIND_##NAME: \ if (trace_edge) \ - visit_##name##_fields((Name*)object, trace_edge, trace_data); \ + visit_##name##_fields(gc_ref_heap_object(ref), trace_edge, \ + trace_data); \ if (size) \ - *size = name##_size(object); \ + *size = name##_size(gc_ref_heap_object(ref)); \ break; FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) #undef SCAN_OBJECT @@ -29,19 +30,19 @@ static inline void gc_trace_object(void *object, #include "conservative-roots-embedder.h" #endif -static inline uintptr_t gc_object_forwarded_nonatomic(void *object) { - uintptr_t tag = *tag_word(object); +static inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref) { + uintptr_t tag = *tag_word(ref); return (tag & gcobj_not_forwarded_bit) ? 0 : tag; } -static inline void gc_object_forward_nonatomic(void *object, - uintptr_t new_addr) { - *tag_word(object) = new_addr; +static inline void gc_object_forward_nonatomic(struct gc_ref ref, + struct gc_ref new_ref) { + *tag_word(ref) = gc_ref_value(new_ref); } static inline struct gc_atomic_forward -gc_atomic_forward_begin(void *object) { - uintptr_t tag = atomic_load_explicit(tag_word(object), memory_order_acquire); +gc_atomic_forward_begin(struct gc_ref ref) { + uintptr_t tag = atomic_load_explicit(tag_word(ref), memory_order_acquire); enum gc_forwarding_state state; if (tag == gcobj_busy) state = GC_FORWARDING_STATE_BUSY; @@ -49,13 +50,13 @@ gc_atomic_forward_begin(void *object) { state = GC_FORWARDING_STATE_NOT_FORWARDED; else state = GC_FORWARDING_STATE_FORWARDED; - return (struct gc_atomic_forward){ object, tag, state }; + return (struct gc_atomic_forward){ ref, tag, state }; } static inline int gc_atomic_forward_retry_busy(struct gc_atomic_forward *fwd) { GC_ASSERT(fwd->state == GC_FORWARDING_STATE_BUSY); - uintptr_t tag = atomic_load_explicit(tag_word(fwd->object), + uintptr_t tag = atomic_load_explicit(tag_word(fwd->ref), memory_order_acquire); if (tag == gcobj_busy) return 0; @@ -71,7 +72,7 @@ gc_atomic_forward_retry_busy(struct gc_atomic_forward *fwd) { static inline void gc_atomic_forward_acquire(struct gc_atomic_forward *fwd) { GC_ASSERT(fwd->state == GC_FORWARDING_STATE_NOT_FORWARDED); - if (atomic_compare_exchange_strong(tag_word(fwd->object), &fwd->data, + if (atomic_compare_exchange_strong(tag_word(fwd->ref), &fwd->data, gcobj_busy)) fwd->state = GC_FORWARDING_STATE_ACQUIRED; else if (fwd->data == gcobj_busy) @@ -85,15 +86,16 @@ gc_atomic_forward_acquire(struct gc_atomic_forward *fwd) { static inline void gc_atomic_forward_abort(struct gc_atomic_forward *fwd) { GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED); - atomic_store_explicit(tag_word(fwd->object), fwd->data, memory_order_release); + atomic_store_explicit(tag_word(fwd->ref), fwd->data, memory_order_release); fwd->state = GC_FORWARDING_STATE_ABORTED; } static inline 
void -gc_atomic_forward_commit(struct gc_atomic_forward *fwd, uintptr_t new_addr) { +gc_atomic_forward_commit(struct gc_atomic_forward *fwd, struct gc_ref new_ref) { GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED); - *tag_word((void*)new_addr) = fwd->data; - atomic_store_explicit(tag_word(fwd->object), new_addr, memory_order_release); + *tag_word(new_ref) = fwd->data; + atomic_store_explicit(tag_word(fwd->ref), gc_ref_value(new_ref), + memory_order_release); fwd->state = GC_FORWARDING_STATE_FORWARDED; } diff --git a/simple-tagging-scheme.h b/simple-tagging-scheme.h index fc431c575..b6b8a924c 100644 --- a/simple-tagging-scheme.h +++ b/simple-tagging-scheme.h @@ -21,8 +21,8 @@ static inline uintptr_t tag_live(uint8_t alloc_kind) { | gcobj_not_forwarded_bit; } -static inline uintptr_t* tag_word(void *object) { - struct gc_header *header = object; +static inline uintptr_t* tag_word(struct gc_ref ref) { + struct gc_header *header = gc_ref_heap_object(ref); return &header->tag; } diff --git a/whippet.c b/whippet.c index c8f657a55..13ac6c260 100644 --- a/whippet.c +++ b/whippet.c @@ -182,8 +182,6 @@ static struct slab *object_slab(void *obj) { return (struct slab*) base; } -static int heap_object_is_large(struct gcobj *obj); - static uint8_t *object_metadata_byte(void *obj) { uintptr_t addr = (uintptr_t) obj; uintptr_t base = addr & ~(SLAB_SIZE - 1); @@ -193,14 +191,6 @@ static uint8_t *object_metadata_byte(void *obj) { #define GRANULES_PER_BLOCK (BLOCK_SIZE / GRANULE_SIZE) #define GRANULES_PER_REMSET_BYTE (GRANULES_PER_BLOCK / REMSET_BYTES_PER_BLOCK) -static uint8_t *object_remset_byte(void *obj) { - GC_ASSERT(!heap_object_is_large(obj)); - uintptr_t addr = (uintptr_t) obj; - uintptr_t base = addr & ~(SLAB_SIZE - 1); - uintptr_t granule = (addr & (SLAB_SIZE - 1)) >> GRANULE_SIZE_LOG_2; - uintptr_t remset_byte = granule / GRANULES_PER_REMSET_BYTE; - return (uint8_t*) (base + remset_byte); -} static struct block_summary* block_summary_for_addr(uintptr_t addr) { uintptr_t base = addr & ~(SLAB_SIZE - 1); @@ -376,12 +366,6 @@ static inline void clear_memory(uintptr_t addr, size_t size) { static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; -static int heap_object_is_large(struct gcobj *obj) { - size_t size; - gc_trace_object(obj, NULL, NULL, &size); - return size > LARGE_OBJECT_THRESHOLD; -} - static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { return object_metadata_byte(obj); } @@ -475,7 +459,7 @@ static void finish_evacuation_allocator(struct evacuation_allocator *alloc, push_block(empties, pop_block(targets)); } -static struct gcobj *evacuation_allocate(struct mark_space *space, +static struct gc_ref evacuation_allocate(struct mark_space *space, size_t granules) { // All collector threads compete to allocate from what is logically a // single bump-pointer arena, which is actually composed of a linked @@ -490,7 +474,7 @@ static struct gcobj *evacuation_allocate(struct mark_space *space, do { if (prev >= alloc->limit) // No more space. - return NULL; + return gc_ref_null(); next = prev + bytes; if ((prev ^ next) & ~block_mask) // Allocation straddles a block boundary; advance so it starts a @@ -522,7 +506,7 @@ static struct gcobj *evacuation_allocate(struct mark_space *space, if (base >= alloc->limit) { // Ran out of blocks! 
GC_ASSERT(!block); - return NULL; + return gc_ref_null(); } GC_ASSERT(block); // This store can race with other allocators, but that's OK as long @@ -534,24 +518,23 @@ static struct gcobj *evacuation_allocate(struct mark_space *space, } uintptr_t addr = block + (next & block_mask) - bytes; - return (struct gcobj*) addr; + return gc_ref(addr); } static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, struct gc_edge edge, struct gc_ref old_ref) { - struct gcobj *obj = gc_ref_heap_object(old_ref); - uint8_t *metadata = object_metadata_byte(obj); + uint8_t *metadata = object_metadata_byte(gc_ref_heap_object(old_ref)); uint8_t byte = *metadata; if (byte & space->marked_mask) return 0; if (space->evacuating && - block_summary_has_flag(block_summary_for_addr((uintptr_t)obj), + block_summary_has_flag(block_summary_for_addr(gc_ref_value(old_ref)), BLOCK_EVACUATE) && ((byte & METADATA_BYTE_PINNED) == 0)) { // This is an evacuating collection, and we are attempting to // evacuate this block, and this particular object isn't pinned. - struct gc_atomic_forward fwd = gc_atomic_forward_begin(obj); + struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) gc_atomic_forward_acquire(&fwd); @@ -564,19 +547,19 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, case GC_FORWARDING_STATE_ACQUIRED: { // We claimed the object successfully; evacuating is up to us. size_t object_granules = mark_space_live_object_granules(metadata); - struct gcobj *new_obj = evacuation_allocate(space, object_granules); - if (new_obj) { + struct gc_ref new_ref = evacuation_allocate(space, object_granules); + if (gc_ref_is_heap_object(new_ref)) { // Copy object contents before committing, as we don't know what // part of the object (if any) will be overwritten by the // commit. - memcpy(new_obj, obj, object_granules * GRANULE_SIZE); - gc_atomic_forward_commit(&fwd, (uintptr_t)new_obj); + memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), + object_granules * GRANULE_SIZE); + gc_atomic_forward_commit(&fwd, new_ref); // Now update extent metadata, and indicate to the caller that // the object's fields need to be traced. - uint8_t *new_metadata = object_metadata_byte(new_obj); + uint8_t *new_metadata = object_metadata_byte(gc_ref_heap_object(new_ref)); memcpy(new_metadata + 1, metadata + 1, object_granules - 1); - gc_edge_update(edge, gc_ref_from_heap_object(new_obj)); - obj = new_obj; + gc_edge_update(edge, new_ref); metadata = new_metadata; // Fall through to set mark bits. 
} else { @@ -616,36 +599,35 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, } static inline int mark_space_contains(struct mark_space *space, - struct gcobj *obj) { - uintptr_t addr = (uintptr_t)obj; + struct gc_ref ref) { + uintptr_t addr = gc_ref_value(ref); return addr - space->low_addr < space->extent; } static inline int large_object_space_mark_object(struct large_object_space *space, - struct gcobj *obj) { - return large_object_space_copy(space, (uintptr_t)obj); + struct gc_ref ref) { + return large_object_space_copy(space, ref); } static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { struct gc_ref ref = gc_edge_ref(edge); if (!gc_ref_is_heap_object(ref)) return 0; - struct gcobj *obj = gc_ref_heap_object(ref); - if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), obj))) { + if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { if (heap_mark_space(heap)->evacuating) return mark_space_evacuate_or_mark_object(heap_mark_space(heap), edge, ref); return mark_space_mark_object(heap_mark_space(heap), ref); } - else if (large_object_space_contains(heap_large_object_space(heap), obj)) + else if (large_object_space_contains(heap_large_object_space(heap), ref)) return large_object_space_mark_object(heap_large_object_space(heap), - obj); + ref); else GC_CRASH(); } -static inline void trace_one(struct gcobj *obj, void *mark_data) { - gc_trace_object(obj, tracer_visit, mark_data, NULL); +static inline void trace_one(struct gc_ref ref, void *mark_data) { + gc_trace_object(ref, tracer_visit, mark_data, NULL); } static int heap_has_multiple_mutators(struct gc_heap *heap) { @@ -989,16 +971,6 @@ static void trace_global_roots(struct gc_heap *heap) { gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap); } -static inline int -heap_object_is_young(struct gc_heap *heap, struct gcobj *obj) { - if (GC_UNLIKELY(!mark_space_contains(heap_mark_space(heap), obj))) { - // No lospace nursery, for the moment. - return 0; - } - GC_ASSERT(!heap_object_is_large(obj)); - return (*object_metadata_byte(obj)) & METADATA_BYTE_YOUNG; -} - static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { GC_ASSERT(((uintptr_t)mark & 7) == 0); uint8_t * __attribute__((aligned(8))) aligned_mark = mark; From 2199d5f48d487ce92b8d4fae7fdfa8031caa9f0a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 16 Aug 2022 23:21:16 +0200 Subject: [PATCH 140/403] Excise struct gcobj --- parallel-tracer.h | 82 ++++++++++++++++++++++------------------------- serial-tracer.h | 51 +++++++++++++++-------------- whippet.c | 65 +++++++++++++++++-------------------- 3 files changed, 93 insertions(+), 105 deletions(-) diff --git a/parallel-tracer.h b/parallel-tracer.h index 180f8d09b..467ad1bf4 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -18,12 +18,10 @@ // for Weak Memory Models" (Lê et al, PPoPP'13) // (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf). -struct gcobj; - struct trace_buf { unsigned log_size; size_t size; - struct gcobj **data; + uintptr_t *data; }; // Min size: 8 kB on 64-bit systems, 4 kB on 32-bit. 
@@ -35,7 +33,7 @@ static int trace_buf_init(struct trace_buf *buf, unsigned log_size) { ASSERT(log_size >= trace_buf_min_log_size); ASSERT(log_size <= trace_buf_max_log_size); - size_t size = (1 << log_size) * sizeof(struct gcobj *); + size_t size = (1 << log_size) * sizeof(uintptr_t); void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { @@ -56,7 +54,7 @@ trace_buf_size(struct trace_buf *buf) { static inline size_t trace_buf_byte_size(struct trace_buf *buf) { - return trace_buf_size(buf) * sizeof(struct gcobj *); + return trace_buf_size(buf) * sizeof(uintptr_t); } static void @@ -75,16 +73,16 @@ trace_buf_destroy(struct trace_buf *buf) { } } -static inline struct gcobj * +static inline struct gc_ref trace_buf_get(struct trace_buf *buf, size_t i) { - return atomic_load_explicit(&buf->data[i & (buf->size - 1)], - memory_order_relaxed); + return gc_ref(atomic_load_explicit(&buf->data[i & (buf->size - 1)], + memory_order_relaxed)); } static inline void -trace_buf_put(struct trace_buf *buf, size_t i, struct gcobj * o) { +trace_buf_put(struct trace_buf *buf, size_t i, struct gc_ref ref) { return atomic_store_explicit(&buf->data[i & (buf->size - 1)], - o, + gc_ref_value(ref), memory_order_relaxed); } @@ -158,7 +156,7 @@ trace_deque_grow(struct trace_deque *q, int cur, size_t b, size_t t) { } static void -trace_deque_push(struct trace_deque *q, struct gcobj * x) { +trace_deque_push(struct trace_deque *q, struct gc_ref x) { size_t b = LOAD_RELAXED(&q->bottom); size_t t = LOAD_ACQUIRE(&q->top); int active = LOAD_RELAXED(&q->active); @@ -172,7 +170,7 @@ trace_deque_push(struct trace_deque *q, struct gcobj * x) { } static void -trace_deque_push_many(struct trace_deque *q, struct gcobj **objv, size_t count) { +trace_deque_push_many(struct trace_deque *q, struct gc_ref *objv, size_t count) { size_t b = LOAD_RELAXED(&q->bottom); size_t t = LOAD_ACQUIRE(&q->top); int active = LOAD_RELAXED(&q->active); @@ -186,7 +184,7 @@ trace_deque_push_many(struct trace_deque *q, struct gcobj **objv, size_t count) STORE_RELAXED(&q->bottom, b + count); } -static struct gcobj * +static struct gc_ref trace_deque_try_pop(struct trace_deque *q) { size_t b = LOAD_RELAXED(&q->bottom); b = b - 1; @@ -194,7 +192,7 @@ trace_deque_try_pop(struct trace_deque *q) { STORE_RELAXED(&q->bottom, b); atomic_thread_fence(memory_order_seq_cst); size_t t = LOAD_RELAXED(&q->top); - struct gcobj * x; + struct gc_ref x; if (t <= b) { // Non-empty queue. x = trace_buf_get(&q->bufs[active], b); if (t == b) { // Single last element in queue. @@ -202,32 +200,32 @@ trace_deque_try_pop(struct trace_deque *q) { memory_order_seq_cst, memory_order_relaxed)) // Failed race. - x = NULL; + x = gc_ref_null(); STORE_RELAXED(&q->bottom, b + 1); } } else { // Empty queue. - x = NULL; + x = gc_ref_null(); STORE_RELAXED(&q->bottom, b + 1); } return x; } -static struct gcobj * +static struct gc_ref trace_deque_steal(struct trace_deque *q) { while (1) { size_t t = LOAD_ACQUIRE(&q->top); atomic_thread_fence(memory_order_seq_cst); size_t b = LOAD_ACQUIRE(&q->bottom); if (t >= b) - return NULL; + return gc_ref_null(); int active = LOAD_CONSUME(&q->active); - struct gcobj *x = x = trace_buf_get(&q->bufs[active], t); + struct gc_ref ref = trace_buf_get(&q->bufs[active], t); if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, memory_order_seq_cst, memory_order_relaxed)) // Failed race. 
continue; - return x; + return ref; } } @@ -251,7 +249,7 @@ trace_deque_can_steal(struct trace_deque *q) { struct local_trace_queue { size_t read; size_t write; - struct gcobj * data[LOCAL_TRACE_QUEUE_SIZE]; + struct gc_ref data[LOCAL_TRACE_QUEUE_SIZE]; }; static inline void @@ -275,10 +273,10 @@ local_trace_queue_full(struct local_trace_queue *q) { return local_trace_queue_size(q) >= LOCAL_TRACE_QUEUE_SIZE; } static inline void -local_trace_queue_push(struct local_trace_queue *q, struct gcobj * v) { +local_trace_queue_push(struct local_trace_queue *q, struct gc_ref v) { q->data[q->write++ & LOCAL_TRACE_QUEUE_MASK] = v; } -static inline struct gcobj * +static inline struct gc_ref local_trace_queue_pop(struct local_trace_queue *q) { return q->data[q->read++ & LOCAL_TRACE_QUEUE_MASK]; } @@ -447,7 +445,6 @@ static void tracer_release(struct gc_heap *heap) { trace_deque_release(&tracer->workers[i].deque); } -struct gcobj; static inline void tracer_visit(struct gc_edge edge, void *trace_data) GC_ALWAYS_INLINE; static inline void trace_one(struct gc_ref ref, void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct gc_heap *heap, @@ -466,12 +463,11 @@ tracer_visit(struct gc_edge edge, void *trace_data) { if (trace_edge(trace->heap, edge)) { if (local_trace_queue_full(&trace->local)) tracer_share(trace); - local_trace_queue_push(&trace->local, - gc_ref_heap_object(gc_edge_ref(edge))); + local_trace_queue_push(&trace->local, gc_edge_ref(edge)); } } -static struct gcobj * +static struct gc_ref tracer_steal_from_worker(struct tracer *tracer, size_t id) { ASSERT(id < tracer->worker_count); return trace_deque_steal(&tracer->workers[id].deque); @@ -483,21 +479,21 @@ tracer_can_steal_from_worker(struct tracer *tracer, size_t id) { return trace_deque_can_steal(&tracer->workers[id].deque); } -static struct gcobj * +static struct gc_ref trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer) { size_t steal_id = worker->steal_id; for (size_t i = 0; i < tracer->worker_count; i++) { steal_id = (steal_id + 1) % tracer->worker_count; DEBUG("tracer #%zu: stealing from #%zu\n", worker->id, steal_id); - struct gcobj * obj = tracer_steal_from_worker(tracer, steal_id); - if (obj) { + struct gc_ref obj = tracer_steal_from_worker(tracer, steal_id); + if (gc_ref_is_heap_object(obj)) { DEBUG("tracer #%zu: stealing got %p\n", worker->id, obj); worker->steal_id = steal_id; return obj; } } DEBUG("tracer #%zu: failed to steal\n", worker->id); - return 0; + return gc_ref_null(); } static int @@ -544,19 +540,19 @@ trace_worker_check_termination(struct trace_worker *worker, } } -static struct gcobj * +static struct gc_ref trace_worker_steal(struct local_tracer *trace) { struct tracer *tracer = heap_tracer(trace->heap); struct trace_worker *worker = trace->worker; while (1) { DEBUG("tracer #%zu: trying to steal\n", worker->id); - struct gcobj *obj = trace_worker_steal_from_any(worker, tracer); - if (obj) + struct gc_ref obj = trace_worker_steal_from_any(worker, tracer); + if (gc_ref_is_heap_object(obj)) return obj; if (trace_worker_check_termination(worker, tracer)) - return NULL; + return gc_ref_null(); } } @@ -571,15 +567,15 @@ trace_worker_trace(struct trace_worker *worker) { size_t n = 0; DEBUG("tracer #%zu: running trace loop\n", worker->id); while (1) { - void *obj; + struct gc_ref ref; if (!local_trace_queue_empty(&trace.local)) { - obj = local_trace_queue_pop(&trace.local); + ref = local_trace_queue_pop(&trace.local); } else { - obj = trace_worker_steal(&trace); - if (!obj) + ref = 
trace_worker_steal(&trace); + if (!gc_ref_is_heap_object(ref)) break; } - trace_one(gc_ref_from_heap_object(obj), &trace); + trace_one(ref, &trace); n++; } DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); @@ -588,13 +584,13 @@ trace_worker_trace(struct trace_worker *worker) { } static inline void -tracer_enqueue_root(struct tracer *tracer, struct gcobj *obj) { +tracer_enqueue_root(struct tracer *tracer, struct gc_ref ref) { struct trace_deque *worker0_deque = &tracer->workers[0].deque; - trace_deque_push(worker0_deque, obj); + trace_deque_push(worker0_deque, ref); } static inline void -tracer_enqueue_roots(struct tracer *tracer, struct gcobj **objv, +tracer_enqueue_roots(struct tracer *tracer, struct gc_ref *objv, size_t count) { struct trace_deque *worker0_deque = &tracer->workers[0].deque; trace_deque_push_many(worker0_deque, objv, count); diff --git a/serial-tracer.h b/serial-tracer.h index c2202d841..b4194c160 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -8,22 +8,20 @@ #include "debug.h" #include "gc-api.h" -struct gcobj; - struct trace_queue { size_t size; size_t read; size_t write; - struct gcobj **buf; + struct gc_ref *buf; }; static const size_t trace_queue_max_size = - (1ULL << (sizeof(struct gcobj *) * 8 - 1)) / sizeof(struct gcobj *); + (1ULL << (sizeof(struct gc_ref) * 8 - 1)) / sizeof(struct gc_ref); static const size_t trace_queue_release_byte_threshold = 1 * 1024 * 1024; -static struct gcobj ** +static struct gc_ref * trace_queue_alloc(size_t size) { - void *mem = mmap(NULL, size * sizeof(struct gcobj *), PROT_READ|PROT_WRITE, + void *mem = mmap(NULL, size * sizeof(struct gc_ref), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { perror("Failed to grow trace queue"); @@ -35,20 +33,20 @@ trace_queue_alloc(size_t size) { static int trace_queue_init(struct trace_queue *q) { - q->size = getpagesize() / sizeof(struct gcobj *); + q->size = getpagesize() / sizeof(struct gc_ref); q->read = 0; q->write = 0; q->buf = trace_queue_alloc(q->size); return !!q->buf; } -static inline struct gcobj * +static inline struct gc_ref trace_queue_get(struct trace_queue *q, size_t idx) { return q->buf[idx & (q->size - 1)]; } static inline void -trace_queue_put(struct trace_queue *q, size_t idx, struct gcobj *x) { +trace_queue_put(struct trace_queue *q, size_t idx, struct gc_ref x) { q->buf[idx & (q->size - 1)] = x; } @@ -57,14 +55,14 @@ static int trace_queue_grow(struct trace_queue *q) GC_NEVER_INLINE; static int trace_queue_grow(struct trace_queue *q) { size_t old_size = q->size; - struct gcobj **old_buf = q->buf; + struct gc_ref *old_buf = q->buf; if (old_size >= trace_queue_max_size) { DEBUG("trace queue already at max size of %zu bytes", old_size); return 0; } size_t new_size = old_size * 2; - struct gcobj **new_buf = trace_queue_alloc(new_size); + struct gc_ref *new_buf = trace_queue_alloc(new_size); if (!new_buf) return 0; @@ -74,7 +72,7 @@ trace_queue_grow(struct trace_queue *q) { for (size_t i = q->read; i < q->write; i++) new_buf[i & new_mask] = old_buf[i & old_mask]; - munmap(old_buf, old_size * sizeof(struct gcobj *)); + munmap(old_buf, old_size * sizeof(struct gc_ref)); q->size = new_size; q->buf = new_buf; @@ -82,7 +80,7 @@ trace_queue_grow(struct trace_queue *q) { } static inline void -trace_queue_push(struct trace_queue *q, struct gcobj *p) { +trace_queue_push(struct trace_queue *q, struct gc_ref p) { if (UNLIKELY(q->write - q->read == q->size)) { if (!trace_queue_grow(q)) GC_CRASH(); @@ -91,7 +89,7 @@ 
trace_queue_push(struct trace_queue *q, struct gcobj *p) { } static inline void -trace_queue_push_many(struct trace_queue *q, struct gcobj **pv, size_t count) { +trace_queue_push_many(struct trace_queue *q, struct gc_ref *pv, size_t count) { while (q->size - (q->write - q->read) < count) { if (!trace_queue_grow(q)) GC_CRASH(); @@ -100,16 +98,16 @@ trace_queue_push_many(struct trace_queue *q, struct gcobj **pv, size_t count) { trace_queue_put(q, q->write++, pv[i]); } -static inline struct gcobj* +static inline struct gc_ref trace_queue_pop(struct trace_queue *q) { if (UNLIKELY(q->read == q->write)) - return NULL; + return gc_ref_null(); return trace_queue_get(q, q->read++); } static void trace_queue_release(struct trace_queue *q) { - size_t byte_size = q->size * sizeof(struct gcobj *); + size_t byte_size = q->size * sizeof(struct gc_ref); if (byte_size >= trace_queue_release_byte_threshold) madvise(q->buf, byte_size, MADV_DONTNEED); q->read = q->write = 0; @@ -117,7 +115,7 @@ trace_queue_release(struct trace_queue *q) { static void trace_queue_destroy(struct trace_queue *q) { - size_t byte_size = q->size * sizeof(struct gcobj *); + size_t byte_size = q->size * sizeof(struct gc_ref); munmap(q->buf, byte_size); } @@ -137,18 +135,17 @@ static void tracer_release(struct gc_heap *heap) { trace_queue_release(&heap_tracer(heap)->queue); } -struct gcobj; static inline void tracer_visit(struct gc_edge edge, void *trace_data) GC_ALWAYS_INLINE; static inline void trace_one(struct gc_ref ref, void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) GC_ALWAYS_INLINE; static inline void -tracer_enqueue_root(struct tracer *tracer, struct gcobj *obj) { +tracer_enqueue_root(struct tracer *tracer, struct gc_ref obj) { trace_queue_push(&tracer->queue, obj); } static inline void -tracer_enqueue_roots(struct tracer *tracer, struct gcobj **objs, +tracer_enqueue_roots(struct tracer *tracer, struct gc_ref *objs, size_t count) { trace_queue_push_many(&tracer->queue, objs, count); } @@ -156,14 +153,16 @@ static inline void tracer_visit(struct gc_edge edge, void *trace_data) { struct gc_heap *heap = trace_data; if (trace_edge(heap, edge)) - tracer_enqueue_root(heap_tracer(heap), - gc_ref_heap_object(gc_edge_ref(edge))); + tracer_enqueue_root(heap_tracer(heap), gc_edge_ref(edge)); } static inline void tracer_trace(struct gc_heap *heap) { - struct gcobj *obj; - while ((obj = trace_queue_pop(&heap_tracer(heap)->queue))) - trace_one(gc_ref_from_heap_object(obj), heap); + do { + struct gc_ref obj = trace_queue_pop(&heap_tracer(heap)->queue); + if (!gc_ref_is_heap_object(obj)) + break; + trace_one(obj, heap); + } while (1); } #endif // SERIAL_TRACER_H diff --git a/whippet.c b/whippet.c index 13ac6c260..ab951c306 100644 --- a/whippet.c +++ b/whippet.c @@ -182,13 +182,16 @@ static struct slab *object_slab(void *obj) { return (struct slab*) base; } -static uint8_t *object_metadata_byte(void *obj) { - uintptr_t addr = (uintptr_t) obj; +static uint8_t *metadata_byte_for_addr(uintptr_t addr) { uintptr_t base = addr & ~(SLAB_SIZE - 1); uintptr_t granule = (addr & (SLAB_SIZE - 1)) >> GRANULE_SIZE_LOG_2; return (uint8_t*) (base + granule); } +static uint8_t *metadata_byte_for_object(struct gc_ref ref) { + return metadata_byte_for_addr(gc_ref_value(ref)); +} + #define GRANULES_PER_BLOCK (BLOCK_SIZE / GRANULE_SIZE) #define GRANULES_PER_REMSET_BYTE (GRANULES_PER_BLOCK / REMSET_BYTES_PER_BLOCK) @@ -258,8 +261,6 @@ static inline size_t size_to_granules(size_t size) { return (size + 
GRANULE_SIZE - 1) >> GRANULE_SIZE_LOG_2; } -struct gcobj; - struct evacuation_allocator { size_t allocated; // atomically size_t limit; @@ -329,7 +330,7 @@ struct gc_heap { struct gc_mutator_mark_buf { size_t size; size_t capacity; - struct gcobj **objects; + struct gc_ref *objects; }; struct gc_mutator { @@ -366,10 +367,6 @@ static inline void clear_memory(uintptr_t addr, size_t size) { static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; -static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) { - return object_metadata_byte(obj); -} - static size_t mark_space_live_object_granules(uint8_t *metadata) { size_t n = 0; while ((metadata[n] & METADATA_BYTE_END) == 0) @@ -379,8 +376,7 @@ static size_t mark_space_live_object_granules(uint8_t *metadata) { static inline int mark_space_mark_object(struct mark_space *space, struct gc_ref ref) { - struct gcobj *obj = gc_ref_heap_object(ref); - uint8_t *loc = object_metadata_byte(obj); + uint8_t *loc = metadata_byte_for_object(ref); uint8_t byte = *loc; if (byte & space->marked_mask) return 0; @@ -414,7 +410,7 @@ static void clear_remaining_metadata_bytes_in_block(uintptr_t block, uintptr_t limit = block + BLOCK_SIZE; uintptr_t granules = (limit - base) >> GRANULE_SIZE_LOG_2; GC_ASSERT(granules <= GRANULES_PER_BLOCK); - memset(object_metadata_byte((void*)base), 0, granules); + memset(metadata_byte_for_addr(base), 0, granules); } static void finish_evacuation_allocator_block(uintptr_t block, @@ -524,7 +520,7 @@ static struct gc_ref evacuation_allocate(struct mark_space *space, static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, struct gc_edge edge, struct gc_ref old_ref) { - uint8_t *metadata = object_metadata_byte(gc_ref_heap_object(old_ref)); + uint8_t *metadata = metadata_byte_for_object(old_ref); uint8_t byte = *metadata; if (byte & space->marked_mask) return 0; @@ -557,7 +553,7 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, gc_atomic_forward_commit(&fwd, new_ref); // Now update extent metadata, and indicate to the caller that // the object's fields need to be traced. - uint8_t *new_metadata = object_metadata_byte(gc_ref_heap_object(new_ref)); + uint8_t *new_metadata = metadata_byte_for_object(new_ref); memcpy(new_metadata + 1, metadata + 1, object_granules - 1); gc_edge_update(edge, new_ref); metadata = new_metadata; @@ -809,10 +805,10 @@ static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { static void mutator_mark_buf_grow(struct gc_mutator_mark_buf *buf) { size_t old_capacity = buf->capacity; - size_t old_bytes = old_capacity * sizeof(struct gcobj*); + size_t old_bytes = old_capacity * sizeof(struct gc_ref); size_t new_bytes = old_bytes ? 
old_bytes * 2 : getpagesize(); - size_t new_capacity = new_bytes / sizeof(struct gcobj*); + size_t new_capacity = new_bytes / sizeof(struct gc_ref); void *mem = mmap(NULL, new_bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); @@ -829,21 +825,21 @@ static void mutator_mark_buf_grow(struct gc_mutator_mark_buf *buf) { } static void mutator_mark_buf_push(struct gc_mutator_mark_buf *buf, - struct gcobj *val) { + struct gc_ref ref) { if (GC_UNLIKELY(buf->size == buf->capacity)) mutator_mark_buf_grow(buf); - buf->objects[buf->size++] = val; + buf->objects[buf->size++] = ref; } static void mutator_mark_buf_release(struct gc_mutator_mark_buf *buf) { - size_t bytes = buf->size * sizeof(struct gcobj*); + size_t bytes = buf->size * sizeof(struct gc_ref); if (bytes >= getpagesize()) madvise(buf->objects, align_up(bytes, getpagesize()), MADV_DONTNEED); buf->size = 0; } static void mutator_mark_buf_destroy(struct gc_mutator_mark_buf *buf) { - size_t bytes = buf->capacity * sizeof(struct gcobj*); + size_t bytes = buf->capacity * sizeof(struct gc_ref); if (bytes) munmap(buf->objects, bytes); } @@ -898,15 +894,13 @@ void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { static void trace_and_enqueue_locally(struct gc_edge edge, void *data) { struct gc_mutator *mut = data; if (trace_edge(mutator_heap(mut), edge)) - mutator_mark_buf_push(&mut->mark_buf, - gc_ref_heap_object(gc_edge_ref(edge))); + mutator_mark_buf_push(&mut->mark_buf, gc_edge_ref(edge)); } static void trace_and_enqueue_globally(struct gc_edge edge, void *data) { struct gc_heap *heap = data; if (trace_edge(heap, edge)) - tracer_enqueue_root(&heap->tracer, - gc_ref_heap_object(gc_edge_ref(edge))); + tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge)); } // Mark the roots of a mutator that is stopping for GC. 
We can't @@ -951,8 +945,8 @@ static void trace_mutator_roots_after_stop(struct gc_heap *heap) { int active_mutators_already_marked = heap_should_mark_while_stopping(heap); while (mut) { if (active_mutators_already_marked) - tracer_enqueue_roots(&heap->tracer, - mut->mark_buf.objects, mut->mark_buf.size); + tracer_enqueue_roots(&heap->tracer, mut->mark_buf.objects, + mut->mark_buf.size); else trace_mutator_roots_with_lock(mut); struct gc_mutator *next = mut->next; @@ -1009,9 +1003,8 @@ static void mark_space_trace_card(struct mark_space *space, mark_bytes &= ~(((uint64_t)0xff) << (granule_offset * 8)); size_t granule = granule_base + granule_offset; uintptr_t addr = first_addr_in_slab + granule * GRANULE_SIZE; - struct gcobj *obj = (struct gcobj*)addr; - GC_ASSERT(object_metadata_byte(obj) == &slab->metadata[granule]); - tracer_enqueue_root(&heap->tracer, obj); + GC_ASSERT(metadata_byte_for_addr(addr) == &slab->metadata[granule]); + tracer_enqueue_root(&heap->tracer, gc_ref(addr)); } } } @@ -1528,7 +1521,7 @@ static size_t next_hole_in_block(struct gc_mutator *mut) { while (sweep != limit) { GC_ASSERT((sweep & (GRANULE_SIZE - 1)) == 0); - uint8_t* metadata = object_metadata_byte((struct gcobj*)sweep); + uint8_t* metadata = metadata_byte_for_addr(sweep); size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; // Except for when we first get a block, mut->sweep is positioned @@ -1574,7 +1567,7 @@ static void finish_hole(struct gc_mutator *mut) { struct block_summary *summary = block_summary_for_addr(mut->block); summary->holes_with_fragmentation++; summary->fragmentation_granules += granules; - uint8_t *metadata = object_metadata_byte((void*)mut->alloc); + uint8_t *metadata = metadata_byte_for_addr(mut->alloc); memset(metadata, 0, granules); mut->alloc = mut->sweep; } @@ -1766,10 +1759,10 @@ void* gc_allocate_small(struct gc_mutator *mut, size_t size) { uintptr_t alloc = mut->alloc; uintptr_t sweep = mut->sweep; uintptr_t new_alloc = alloc + size; - struct gcobj *obj; + struct gc_ref ret; if (new_alloc <= sweep) { mut->alloc = new_alloc; - obj = (struct gcobj *)alloc; + ret = gc_ref(alloc); } else { size_t granules = size >> GRANULE_SIZE_LOG_2; while (1) { @@ -1781,11 +1774,11 @@ void* gc_allocate_small(struct gc_mutator *mut, size_t size) { if (!hole) trigger_collection(mut); } - obj = (struct gcobj*)mut->alloc; + ret = gc_ref(mut->alloc); mut->alloc += size; } - gc_update_alloc_table(mut, gc_ref_from_heap_object(obj), size); - return obj; + gc_update_alloc_table(mut, ret, size); + return gc_ref_heap_object(ret); } void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { From 8a51117763c59ff72d2818fbfd7dca2074db1498 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 22 Aug 2022 21:11:30 +0200 Subject: [PATCH 141/403] Rework pinning, prepare for conservative tracing We don't need a pin bit: we just need to mark pinned objects before evacuation starts. This way we can remove the stopping / marking race so that we can always mark while stopping. 
--- conservative-roots-embedder.h | 38 +++++-- gc-embedder-api.h | 21 +++- precise-roots-embedder.h | 35 ++++-- whippet.c | 195 ++++++++++++++++++++++------------ 4 files changed, 200 insertions(+), 89 deletions(-) diff --git a/conservative-roots-embedder.h b/conservative-roots-embedder.h index ae7120010..5b3d6fba9 100644 --- a/conservative-roots-embedder.h +++ b/conservative-roots-embedder.h @@ -4,18 +4,36 @@ #include "gc-assert.h" #include "conservative-roots-types.h" -static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { - GC_CRASH(); +static inline int gc_has_conservative_roots(void) { + return 1; +} +static inline int gc_has_conservative_intraheap_edges(void) { + // FIXME: Implement both ways. + return 0; } -static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { - GC_CRASH(); +static inline void gc_trace_conservative_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_ref)(struct gc_ref edge, + void *trace_data), + void *trace_data) { +} + +static inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { +} + +static inline void gc_trace_conservative_heap_roots(struct gc_heap_roots *roots, + void (*trace_ref)(struct gc_ref ref, + void *trace_data), + void *trace_data) { +} + +static inline void gc_trace_precise_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { } #endif // CONSERVATIVE_ROOTS_EMBEDDER_H diff --git a/gc-embedder-api.h b/gc-embedder-api.h index 9756cd30b..2e0029e97 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -12,19 +12,30 @@ struct gc_mutator_roots; struct gc_heap_roots; struct gc_atomic_forward; +GC_EMBEDDER_API inline int gc_has_conservative_roots(void); +GC_EMBEDDER_API inline int gc_has_conservative_intraheap_edges(void); + GC_EMBEDDER_API inline void gc_trace_object(struct gc_ref ref, void (*trace_edge)(struct gc_edge edge, void *trace_data), void *trace_data, size_t *size) GC_ALWAYS_INLINE; -GC_EMBEDDER_API inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, +GC_EMBEDDER_API inline void gc_trace_conservative_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_ref)(struct gc_ref edge, + void *trace_data), + void *trace_data); +GC_EMBEDDER_API inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots, void (*trace_edge)(struct gc_edge edge, void *trace_data), void *trace_data); -GC_EMBEDDER_API inline void gc_trace_heap_roots(struct gc_heap_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data); +GC_EMBEDDER_API inline void gc_trace_precise_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data); +GC_EMBEDDER_API inline void gc_trace_conservative_heap_roots(struct gc_heap_roots *roots, + void (*trace_ref)(struct gc_ref ref, + void *trace_data), + void *trace_data); GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref); GC_EMBEDDER_API inline void gc_object_forward_nonatomic(struct gc_ref ref, diff --git a/precise-roots-embedder.h b/precise-roots-embedder.h index f37b38e1a..8b4deb481 100644 --- a/precise-roots-embedder.h +++ b/precise-roots-embedder.h @@ -4,6 +4,13 @@ #include "gc-edge.h" #include 
"precise-roots-types.h" +static inline int gc_has_conservative_roots(void) { + return 0; +} +static inline int gc_has_conservative_intraheap_edges(void) { + return 0; +} + static inline void visit_roots(struct handle *roots, void (*trace_edge)(struct gc_edge edge, void *trace_data), @@ -12,18 +19,30 @@ static inline void visit_roots(struct handle *roots, trace_edge(gc_edge(&h->v), trace_data); } -static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { +static inline void gc_trace_conservative_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_ref)(struct gc_ref edge, + void *trace_data), + void *trace_data) { +} + +static inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { if (roots) visit_roots(roots->roots, trace_edge, trace_data); } -static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { +static inline void gc_trace_conservative_heap_roots(struct gc_heap_roots *roots, + void (*trace_ref)(struct gc_ref ref, + void *trace_data), + void *trace_data) { +} + +static inline void gc_trace_precise_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { if (roots) visit_roots(roots->roots, trace_edge, trace_data); } diff --git a/whippet.c b/whippet.c index ab951c306..500869cf0 100644 --- a/whippet.c +++ b/whippet.c @@ -40,22 +40,24 @@ STATIC_ASSERT_EQ(MEDIUM_OBJECT_THRESHOLD, STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, LARGE_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); -// Each granule has one metadata byte stored in a side table, used for -// mark bits but also for other per-object metadata. Already we were -// using a byte instead of a bit to facilitate parallel marking. -// (Parallel markers are allowed to race.) Turns out we can put a -// pinned bit there too, for objects that can't be moved (perhaps -// because they have been passed to unmanaged C code). (Objects can -// also be temporarily pinned if they are referenced by a conservative -// root, but that doesn't need a separate bit; we can just use the mark -// bit.) +// Each granule has one mark byte stored in a side table. A granule's +// mark state is a whole byte instead of a bit to facilitate parallel +// marking. (Parallel markers are allowed to race.) We also use this +// byte to compute object extent, via a bit flag indicating +// end-of-object. // -// Getting back to mark bits -- because we want to allow for -// conservative roots, we need to know whether an address indicates an -// object or not. That means that when an object is allocated, it has -// to set a bit, somewhere. In our case we use the metadata byte, and -// set the "young" bit. In future we could use this for generational -// GC, with the sticky mark bit strategy. +// Because we want to allow for conservative roots, we need to know +// whether an address indicates an object or not. That means that when +// an object is allocated, it has to set a bit, somewhere. We use the +// metadata byte for this purpose, setting the "young" bit. +// +// The "young" bit's name might make you think about generational +// collection, and indeed all objects collected in a minor collection +// will have this bit set. 
However, whippet never needs to check for +// the young bit; if it weren't for the need to identify conservative +// roots, we wouldn't need a young bit at all. Perhaps in an +// all-precise system, we would be able to avoid the overhead of +// initializing mark byte upon each fresh allocation. // // When an object becomes dead after a GC, it will still have a bit set // -- maybe the young bit, or maybe a survivor bit. The sweeper has to @@ -75,9 +77,9 @@ enum metadata_byte { METADATA_BYTE_MARK_1 = 4, METADATA_BYTE_MARK_2 = 8, METADATA_BYTE_END = 16, - METADATA_BYTE_PINNED = 32, - METADATA_BYTE_UNUSED_1 = 64, - METADATA_BYTE_UNUSED_2 = 128 + METADATA_BYTE_UNUSED_1 = 32, + METADATA_BYTE_UNUSED_2 = 64, + METADATA_BYTE_UNUSED_3 = 128 }; static uint8_t rotate_dead_survivor_marked(uint8_t mask) { @@ -310,7 +312,6 @@ struct gc_heap { int collecting; enum gc_kind gc_kind; int multithreaded; - int allow_pinning; size_t active_mutator_count; size_t mutator_count; struct gc_heap_roots *roots; @@ -526,10 +527,10 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, return 0; if (space->evacuating && block_summary_has_flag(block_summary_for_addr(gc_ref_value(old_ref)), - BLOCK_EVACUATE) && - ((byte & METADATA_BYTE_PINNED) == 0)) { + BLOCK_EVACUATE)) { // This is an evacuating collection, and we are attempting to - // evacuate this block, and this particular object isn't pinned. + // evacuate this block, and we are tracing this particular object + // for what appears to be the first time. struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) @@ -622,6 +623,20 @@ static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { GC_CRASH(); } +static inline int trace_ref(struct gc_heap *heap, struct gc_ref ref) { + if (!gc_ref_is_heap_object(ref)) + return 0; + if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { + GC_ASSERT(!heap_mark_space(heap)->evacuating); + return mark_space_mark_object(heap_mark_space(heap), ref); + } + else if (large_object_space_contains(heap_large_object_space(heap), ref)) + return large_object_space_mark_object(heap_large_object_space(heap), + ref); + else + GC_CRASH(); +} + static inline void trace_one(struct gc_ref ref, void *mark_data) { gc_trace_object(ref, tracer_visit, mark_data, NULL); } @@ -856,27 +871,20 @@ static void enqueue_mutator_for_tracing(struct gc_mutator *mut) { } static int heap_should_mark_while_stopping(struct gc_heap *heap) { - if (heap->allow_pinning) { - // The metadata byte is mostly used for marking and object extent. - // For marking, we allow updates to race, because the state - // transition space is limited. However during ragged stop there is - // the possibility of races between the marker and updates from the - // mutator to the pinned bit in the metadata byte. - // - // Losing the pinned bit would be bad. Perhaps this means we should - // store the pinned bit elsewhere. Or, perhaps for this reason (and - // in all cases?) markers should use proper synchronization to - // update metadata mark bits instead of racing. But for now it is - // sufficient to simply avoid ragged stops if we allow pins. + // Generally speaking, we allow mutators to mark their own stacks + // before pausing. This is a limited form of concurrent marking, as + // other mutators might be running, not having received the signal to + // stop yet. 
In a compacting collection, this results in pinned + // roots, because we haven't started evacuating yet and instead mark + // in place; avoid this pinning only if we're trying to reclaim free + // blocks. + GC_ASSERT(!heap_mark_space(heap)->evacuating); + if ((atomic_load(&heap->gc_kind) & GC_KIND_FLAG_EVACUATING) + && atomic_load_explicit(&heap_mark_space(heap)->pending_unavailable_bytes, + memory_order_acquire) > 0) return 0; - } - // If we are marking in place, we allow mutators to mark their own - // stacks before pausing. This is a limited form of concurrent - // marking, as other mutators might be running, not having received - // the signal to stop yet. We can't do this for a compacting - // collection, however, as that would become concurrent evacuation, - // which is a different kettle of fish. - return (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_EVACUATING) == 0; + + return 1; } static int mutator_should_mark_while_stopping(struct gc_mutator *mut) { @@ -897,32 +905,52 @@ static void trace_and_enqueue_locally(struct gc_edge edge, void *data) { mutator_mark_buf_push(&mut->mark_buf, gc_edge_ref(edge)); } +static void trace_ref_and_enqueue_locally(struct gc_ref ref, void *data) { + struct gc_mutator *mut = data; + if (trace_ref(mutator_heap(mut), ref)) + mutator_mark_buf_push(&mut->mark_buf, ref); +} + static void trace_and_enqueue_globally(struct gc_edge edge, void *data) { struct gc_heap *heap = data; if (trace_edge(heap, edge)) tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge)); } -// Mark the roots of a mutator that is stopping for GC. We can't -// enqueue them directly, so we send them to the controller in a buffer. -static void mark_stopping_mutator_roots(struct gc_mutator *mut) { - GC_ASSERT(mutator_should_mark_while_stopping(mut)); - gc_trace_mutator_roots(mut->roots, trace_and_enqueue_locally, mut); +static void trace_ref_and_enqueue_globally(struct gc_ref ref, void *data) { + struct gc_heap *heap = data; + if (trace_ref(heap, ref)) + tracer_enqueue_root(&heap->tracer, ref); } -// Precondition: the caller holds the heap lock. -static void mark_mutator_roots_with_lock(struct gc_mutator *mut) { - gc_trace_mutator_roots(mut->roots, trace_and_enqueue_globally, - mutator_heap(mut)); +// Mark the roots of a mutator that is stopping for GC. We can't +// enqueue them directly, so we send them to the controller in a buffer. 
+static void trace_stopping_mutator_roots(struct gc_mutator *mut) { + GC_ASSERT(mutator_should_mark_while_stopping(mut)); + gc_trace_conservative_mutator_roots(mut->roots, trace_ref_and_enqueue_locally, + mut); + gc_trace_precise_mutator_roots(mut->roots, trace_and_enqueue_locally, mut); +} + +static void trace_precise_mutator_roots_with_lock(struct gc_mutator *mut) { + gc_trace_precise_mutator_roots(mut->roots, trace_and_enqueue_globally, + mutator_heap(mut)); +} + +static void trace_conservative_mutator_roots_with_lock(struct gc_mutator *mut) { + gc_trace_conservative_mutator_roots(mut->roots, + trace_ref_and_enqueue_globally, + mutator_heap(mut)); } static void trace_mutator_roots_with_lock(struct gc_mutator *mut) { - mark_mutator_roots_with_lock(mut); + trace_conservative_mutator_roots_with_lock(mut); + trace_precise_mutator_roots_with_lock(mut); } static void trace_mutator_roots_with_lock_before_stop(struct gc_mutator *mut) { if (mutator_should_mark_while_stopping(mut)) - mark_mutator_roots_with_lock(mut); + trace_mutator_roots_with_lock(mut); else enqueue_mutator_for_tracing(mut); } @@ -940,15 +968,33 @@ static void wait_for_mutators_to_stop(struct gc_heap *heap) { static void finish_sweeping(struct gc_mutator *mut); static void finish_sweeping_in_block(struct gc_mutator *mut); -static void trace_mutator_roots_after_stop(struct gc_heap *heap) { +static void trace_conservative_mutator_roots_after_stop(struct gc_heap *heap) { + int active_mutators_already_marked = heap_should_mark_while_stopping(heap); + if (!active_mutators_already_marked) { + for (struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); + mut; + mut = mut->next) + trace_conservative_mutator_roots_with_lock(mut); + } + + for (struct gc_mutator *mut = heap->deactivated_mutators; + mut; + mut = mut->next) + trace_conservative_mutator_roots_with_lock(mut); +} + +static void trace_precise_mutator_roots_after_stop(struct gc_heap *heap) { struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); int active_mutators_already_marked = heap_should_mark_while_stopping(heap); while (mut) { + // Also collect any already-marked grey objects and put them on the + // global trace queue. if (active_mutators_already_marked) tracer_enqueue_roots(&heap->tracer, mut->mark_buf.objects, mut->mark_buf.size); else - trace_mutator_roots_with_lock(mut); + trace_precise_mutator_roots_with_lock(mut); + // Also unlink mutator_trace_list chain. struct gc_mutator *next = mut->next; mut->next = NULL; mut = next; @@ -957,12 +1003,16 @@ static void trace_mutator_roots_after_stop(struct gc_heap *heap) { for (struct gc_mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) { finish_sweeping_in_block(mut); - trace_mutator_roots_with_lock(mut); + trace_precise_mutator_roots_with_lock(mut); } } -static void trace_global_roots(struct gc_heap *heap) { - gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap); +static void trace_precise_global_roots(struct gc_heap *heap) { + gc_trace_precise_heap_roots(heap->roots, trace_and_enqueue_globally, heap); +} + +static void trace_conservative_global_roots(struct gc_heap *heap) { + gc_trace_conservative_heap_roots(heap->roots, trace_ref_and_enqueue_globally, heap); } static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { @@ -1076,7 +1126,7 @@ static void pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { finish_sweeping_in_block(mut); if (mutator_should_mark_while_stopping(mut)) // No need to collect results in mark buf; we can enqueue roots directly. 
- mark_mutator_roots_with_lock(mut); + trace_mutator_roots_with_lock(mut); else enqueue_mutator_for_tracing(mut); pause_mutator_for_collection(heap); @@ -1088,7 +1138,7 @@ static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { GC_ASSERT(mutators_are_stopping(heap)); finish_sweeping(mut); if (mutator_should_mark_while_stopping(mut)) - mark_stopping_mutator_roots(mut); + trace_stopping_mutator_roots(mut); enqueue_mutator_for_tracing(mut); heap_lock(heap); pause_mutator_for_collection(heap); @@ -1244,6 +1294,12 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { gc_kind = GC_KIND_MINOR_IN_PLACE; } + if (gc_has_conservative_intraheap_edges() && + (gc_kind & GC_KIND_FLAG_EVACUATING)) { + DEBUG("welp. conservative heap scanning, no evacuation for you\n"); + gc_kind = GC_KIND_MAJOR_IN_PLACE; + } + // If this is the first in a series of minor collections, reset the // threshold at which we should do a major GC. if ((gc_kind & GC_KIND_FLAG_MINOR) && @@ -1363,14 +1419,21 @@ static void prepare_for_evacuation(struct gc_heap *heap) { } static void trace_conservative_roots_after_stop(struct gc_heap *heap) { - // FIXME: Visit conservative roots, if the collector is configured in - // that way. Mark them in place, preventing any subsequent - // evacuation. + GC_ASSERT(!heap_mark_space(heap)->evacuating); + GC_ASSERT(gc_has_conservative_roots()); + trace_conservative_mutator_roots_after_stop(heap); + trace_conservative_global_roots(heap); +} + +static void trace_pinned_roots_after_stop(struct gc_heap *heap) { + GC_ASSERT(!heap_mark_space(heap)->evacuating); + if (gc_has_conservative_roots()) + trace_conservative_roots_after_stop(heap); } static void trace_precise_roots_after_stop(struct gc_heap *heap) { - trace_mutator_roots_after_stop(heap); - trace_global_roots(heap); + trace_precise_mutator_roots_after_stop(heap); + trace_precise_global_roots(heap); trace_generational_roots(heap); } @@ -1404,7 +1467,7 @@ static void collect(struct gc_mutator *mut) { double fragmentation = heap_fragmentation(heap); fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); detect_out_of_memory(heap); - trace_conservative_roots_after_stop(heap); + trace_pinned_roots_after_stop(heap); prepare_for_evacuation(heap); trace_precise_roots_after_stop(heap); tracer_trace(heap); From 1228e346fa1ce645304a120c50cae2c85dbbc316 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 16 Sep 2022 14:50:01 +0200 Subject: [PATCH 142/403] Fix semi-space collector for refactor --- semi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/semi.c b/semi.c index d67ed68ac..e29e63c26 100644 --- a/semi.c +++ b/semi.c @@ -161,7 +161,7 @@ static void collect(struct gc_mutator *mut) { flip(semi); uintptr_t grey = semi->hp; if (mut->roots) - gc_trace_mutator_roots(mut->roots, visit, heap); + gc_trace_precise_mutator_roots(mut->roots, visit, heap); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); while(grey < semi->hp) grey = scan(heap, gc_ref(grey)); From f77cf923c101d1174e51eeb5a3f9580ee78c5f72 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 2 Oct 2022 09:54:06 +0200 Subject: [PATCH 143/403] Fix parallel tracer for gc_ref API change --- parallel-tracer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parallel-tracer.h b/parallel-tracer.h index 467ad1bf4..54a31d896 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -487,7 +487,8 @@ trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer) 
DEBUG("tracer #%zu: stealing from #%zu\n", worker->id, steal_id); struct gc_ref obj = tracer_steal_from_worker(tracer, steal_id); if (gc_ref_is_heap_object(obj)) { - DEBUG("tracer #%zu: stealing got %p\n", worker->id, obj); + DEBUG("tracer #%zu: stealing got %p\n", worker->id, + gc_ref_heap_object(obj)); worker->steal_id = steal_id; return obj; } From 56aad402c9504b63d4f7b9f250e7bd931228d7c4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 3 Oct 2022 15:27:10 +0200 Subject: [PATCH 144/403] Fix bug in try_pop on chase-lev deque The counters are unsigned, so that they can overflow. (Is that really necessary though?) In any case try_pop can decrement a counter, leading to a situation where you can think you have (size_t)-1 elements; not good. Instead when computing the queue size, use a signed value. Limits total queue size to half the unsigned space; fine. --- parallel-tracer.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/parallel-tracer.h b/parallel-tracer.h index 54a31d896..67a4b96c7 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -161,7 +161,8 @@ trace_deque_push(struct trace_deque *q, struct gc_ref x) { size_t t = LOAD_ACQUIRE(&q->top); int active = LOAD_RELAXED(&q->active); - if (b - t > trace_buf_size(&q->bufs[active]) - 1) /* Full queue. */ + ssize_t size = b - t; + if (size > trace_buf_size(&q->bufs[active]) - 1) /* Full queue. */ active = trace_deque_grow(q, active, b, t); trace_buf_put(&q->bufs[active], b, x); @@ -175,7 +176,8 @@ trace_deque_push_many(struct trace_deque *q, struct gc_ref *objv, size_t count) size_t t = LOAD_ACQUIRE(&q->top); int active = LOAD_RELAXED(&q->active); - while (b - t > trace_buf_size(&q->bufs[active]) - count) /* Full queue. */ + ssize_t size = b - t; + while (size > trace_buf_size(&q->bufs[active]) - count) /* Full queue. */ active = trace_deque_grow(q, active, b, t); for (size_t i = 0; i < count; i++) @@ -187,25 +189,25 @@ trace_deque_push_many(struct trace_deque *q, struct gc_ref *objv, size_t count) static struct gc_ref trace_deque_try_pop(struct trace_deque *q) { size_t b = LOAD_RELAXED(&q->bottom); - b = b - 1; int active = LOAD_RELAXED(&q->active); - STORE_RELAXED(&q->bottom, b); + STORE_RELAXED(&q->bottom, b - 1); atomic_thread_fence(memory_order_seq_cst); size_t t = LOAD_RELAXED(&q->top); struct gc_ref x; - if (t <= b) { // Non-empty queue. - x = trace_buf_get(&q->bufs[active], b); - if (t == b) { // Single last element in queue. + ssize_t size = b - t; + if (size > 0) { // Non-empty queue. + x = trace_buf_get(&q->bufs[active], b - 1); + if (size == 1) { // Single last element in queue. if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, memory_order_seq_cst, memory_order_relaxed)) // Failed race. x = gc_ref_null(); - STORE_RELAXED(&q->bottom, b + 1); + STORE_RELAXED(&q->bottom, b); } } else { // Empty queue. 
x = gc_ref_null(); - STORE_RELAXED(&q->bottom, b + 1); + STORE_RELAXED(&q->bottom, b); } return x; } @@ -216,7 +218,8 @@ trace_deque_steal(struct trace_deque *q) { size_t t = LOAD_ACQUIRE(&q->top); atomic_thread_fence(memory_order_seq_cst); size_t b = LOAD_ACQUIRE(&q->bottom); - if (t >= b) + ssize_t size = b - t; + if (size <= 0) return gc_ref_null(); int active = LOAD_CONSUME(&q->active); struct gc_ref ref = trace_buf_get(&q->bufs[active], t); @@ -234,7 +237,8 @@ trace_deque_can_steal(struct trace_deque *q) { size_t t = LOAD_ACQUIRE(&q->top); atomic_thread_fence(memory_order_seq_cst); size_t b = LOAD_ACQUIRE(&q->bottom); - return t < b; + ssize_t size = b - t; + return size > 0; } #undef LOAD_RELAXED From 8b8ddaf6e297648bd6be7a756bc85bfab3895923 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 3 Oct 2022 15:05:14 +0200 Subject: [PATCH 145/403] work-stealing optimization: stay with last-stolen worker Previously we were always going round-robin. Now a thief tries to plunder its victim again directly. Should result in less churn. --- parallel-tracer.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/parallel-tracer.h b/parallel-tracer.h index 67a4b96c7..a9600927b 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -485,17 +485,15 @@ tracer_can_steal_from_worker(struct tracer *tracer, size_t id) { static struct gc_ref trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer) { - size_t steal_id = worker->steal_id; for (size_t i = 0; i < tracer->worker_count; i++) { - steal_id = (steal_id + 1) % tracer->worker_count; - DEBUG("tracer #%zu: stealing from #%zu\n", worker->id, steal_id); - struct gc_ref obj = tracer_steal_from_worker(tracer, steal_id); + DEBUG("tracer #%zu: stealing from #%zu\n", worker->id, worker->steal_id); + struct gc_ref obj = tracer_steal_from_worker(tracer, worker->steal_id); if (gc_ref_is_heap_object(obj)) { DEBUG("tracer #%zu: stealing got %p\n", worker->id, gc_ref_heap_object(obj)); - worker->steal_id = steal_id; return obj; } + worker->steal_id = (worker->steal_id + 1) % tracer->worker_count; } DEBUG("tracer #%zu: failed to steal\n", worker->id); return gc_ref_null(); @@ -503,16 +501,15 @@ trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer) static int trace_worker_can_steal_from_any(struct trace_worker *worker, struct tracer *tracer) { - size_t steal_id = worker->steal_id; DEBUG("tracer #%zu: checking if any worker has tasks\n", worker->id); for (size_t i = 0; i < tracer->worker_count; i++) { - steal_id = (steal_id + 1) % tracer->worker_count; - int res = tracer_can_steal_from_worker(tracer, steal_id); + int res = tracer_can_steal_from_worker(tracer, worker->steal_id); if (res) { - DEBUG("tracer #%zu: worker #%zu has tasks!\n", worker->id, steal_id); - worker->steal_id = steal_id; + DEBUG("tracer #%zu: worker #%zu has tasks!\n", worker->id, + worker->steal_id); return 1; } + worker->steal_id = (worker->steal_id + 1) % tracer->worker_count; } DEBUG("tracer #%zu: nothing to steal\n", worker->id); return 0; From 1e3122d054c584d4b1b6ca9cbea2dbe1e7651141 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 3 Oct 2022 15:29:14 +0200 Subject: [PATCH 146/403] trace_worker_steal first does a try_pop on its own deque Before asking other threads for values, see if there is any pending data that overflowed from the local mark stack. 
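In other words the lookup order becomes: own deque first, then steal.  A
condensed sketch (the wrapper name is illustrative; the real change is the
new block at the top of trace_worker_steal below):

    // Prefer the worker's own deque, which holds overflow from its local
    // mark stack, and only reach into other workers' deques when it is empty.
    static struct gc_ref next_trace_target(struct trace_worker *worker,
                                           struct tracer *tracer) {
      struct gc_ref obj = trace_deque_try_pop(&worker->deque);
      if (gc_ref_is_heap_object(obj))
        return obj;
      return trace_worker_steal_from_any(worker, tracer);
    }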
--- parallel-tracer.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/parallel-tracer.h b/parallel-tracer.h index a9600927b..bcc7910c2 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -547,6 +547,16 @@ trace_worker_steal(struct local_tracer *trace) { struct tracer *tracer = heap_tracer(trace->heap); struct trace_worker *worker = trace->worker; + // It could be that the worker's local trace queue has simply + // overflowed. In that case avoid contention by trying to pop + // something from the worker's own queue. + { + DEBUG("tracer #%zu: trying to pop worker's own deque\n", worker->id); + struct gc_ref obj = trace_deque_try_pop(&worker->deque); + if (gc_ref_is_heap_object(obj)) + return obj; + } + while (1) { DEBUG("tracer #%zu: trying to steal\n", worker->id); struct gc_ref obj = trace_worker_steal_from_any(worker, tracer); From 24bd94d9f7be2d4aec10635a1b784f4107cc30aa Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 2 Oct 2022 10:51:20 +0200 Subject: [PATCH 147/403] Fix race condition in computation of mark-while-stopping Choose the ragged stop strategy when the GC kind is determined, so that we do so with respect to a single measurement of pending unavailable bytes. Also remove assert in heap_should_mark_while_stopping, as it can be called after stopping too, when evacuation is enabled. --- whippet.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/whippet.c b/whippet.c index 500869cf0..ed3ecf3df 100644 --- a/whippet.c +++ b/whippet.c @@ -310,6 +310,7 @@ struct gc_heap { pthread_cond_t mutator_cond; size_t size; int collecting; + int mark_while_stopping; enum gc_kind gc_kind; int multithreaded; size_t active_mutator_count; @@ -871,20 +872,7 @@ static void enqueue_mutator_for_tracing(struct gc_mutator *mut) { } static int heap_should_mark_while_stopping(struct gc_heap *heap) { - // Generally speaking, we allow mutators to mark their own stacks - // before pausing. This is a limited form of concurrent marking, as - // other mutators might be running, not having received the signal to - // stop yet. In a compacting collection, this results in pinned - // roots, because we haven't started evacuating yet and instead mark - // in place; avoid this pinning only if we're trying to reclaim free - // blocks. 
- GC_ASSERT(!heap_mark_space(heap)->evacuating); - if ((atomic_load(&heap->gc_kind) & GC_KIND_FLAG_EVACUATING) - && atomic_load_explicit(&heap_mark_space(heap)->pending_unavailable_bytes, - memory_order_acquire) > 0) - return 0; - - return 1; + return atomic_load_explicit(&heap->mark_while_stopping, memory_order_acquire); } static int mutator_should_mark_while_stopping(struct gc_mutator *mut) { @@ -1245,18 +1233,32 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); enum gc_kind previous_gc_kind = atomic_load(&heap->gc_kind); enum gc_kind gc_kind; + int mark_while_stopping = 1; double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); + ssize_t pending = atomic_load_explicit(&mark_space->pending_unavailable_bytes, + memory_order_acquire); if (heap->count == 0) { DEBUG("first collection is always major\n"); gc_kind = GC_KIND_MAJOR_IN_PLACE; - } else if (atomic_load_explicit(&mark_space->pending_unavailable_bytes, - memory_order_acquire) > 0) { + } else if (pending > 0) { + DEBUG("evacuating due to need to reclaim %zd bytes\n", pending); // During the last cycle, a large allocation could not find enough // free blocks, and we decided not to expand the heap. Let's do an // evacuating major collection to maximize the free block yield. gc_kind = GC_KIND_MAJOR_EVACUATING; + + // Generally speaking, we allow mutators to mark their own stacks + // before pausing. This is a limited form of concurrent marking, as + // other mutators might be running, not having received the signal + // to stop yet. In a compacting collection, this results in pinned + // roots, because we haven't started evacuating yet and instead mark + // in place. However as in this case we are trying to reclaim free + // blocks, try to avoid any pinning caused by the ragged-stop + // marking. Of course if the mutator has conservative roots we will + // have pinning anyway and might as well allow ragged stops. + mark_while_stopping = gc_has_conservative_roots(); } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING && fragmentation >= heap->fragmentation_low_threshold) { DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", @@ -1298,6 +1300,7 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { (gc_kind & GC_KIND_FLAG_EVACUATING)) { DEBUG("welp. conservative heap scanning, no evacuation for you\n"); gc_kind = GC_KIND_MAJOR_IN_PLACE; + mark_while_stopping = 1; } // If this is the first in a series of minor collections, reset the @@ -1311,6 +1314,10 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { DEBUG("first minor collection at yield %.2f%%, threshold %.2f%%\n", yield * 100., clamped * 100.); } + + atomic_store_explicit(&heap->mark_while_stopping, mark_while_stopping, + memory_order_release); + atomic_store(&heap->gc_kind, gc_kind); return gc_kind; } From e328346bbdc1177e0e70976f38ed1d821c57dd9f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 16 Sep 2022 13:45:13 +0200 Subject: [PATCH 148/403] Refactor alignment utilities in whippet.c Add align_up and align_down helpers. 
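Both are the usual power-of-two mask tricks.  For reference, a standalone
version with example values (the in-tree definitions below assume align is a
power of two):

    #include <stdint.h>
    #include <stddef.h>

    static inline uintptr_t align_down(uintptr_t addr, size_t align) {
      return addr & ~(align - 1);                  // align: power of two
    }
    static inline uintptr_t align_up(uintptr_t addr, size_t align) {
      return align_down(addr + align - 1, align);
    }

    // align_down(0x12345, 0x1000) == 0x12000
    // align_up(0x12345, 0x1000)   == 0x13000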
--- whippet.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/whippet.c b/whippet.c index ed3ecf3df..3e8e52185 100644 --- a/whippet.c +++ b/whippet.c @@ -178,14 +178,21 @@ struct slab { }; STATIC_ASSERT_EQ(sizeof(struct slab), SLAB_SIZE); +static inline uintptr_t align_down(uintptr_t addr, size_t align) { + return addr & ~(align - 1); +} +static inline uintptr_t align_up(uintptr_t addr, size_t align) { + return align_down(addr + align - 1, align); +} + static struct slab *object_slab(void *obj) { uintptr_t addr = (uintptr_t) obj; - uintptr_t base = addr & ~(SLAB_SIZE - 1); + uintptr_t base = align_down(addr, SLAB_SIZE); return (struct slab*) base; } static uint8_t *metadata_byte_for_addr(uintptr_t addr) { - uintptr_t base = addr & ~(SLAB_SIZE - 1); + uintptr_t base = align_down(addr, SLAB_SIZE); uintptr_t granule = (addr & (SLAB_SIZE - 1)) >> GRANULE_SIZE_LOG_2; return (uint8_t*) (base + granule); } @@ -198,7 +205,7 @@ static uint8_t *metadata_byte_for_object(struct gc_ref ref) { #define GRANULES_PER_REMSET_BYTE (GRANULES_PER_BLOCK / REMSET_BYTES_PER_BLOCK) static struct block_summary* block_summary_for_addr(uintptr_t addr) { - uintptr_t base = addr & ~(SLAB_SIZE - 1); + uintptr_t base = align_down(addr, SLAB_SIZE); uintptr_t block = (addr & (SLAB_SIZE - 1)) / BLOCK_SIZE; return (struct block_summary*) (base + block * sizeof(struct block_summary)); } @@ -216,7 +223,7 @@ static void block_summary_clear_flag(struct block_summary *summary, summary->next_and_flags &= ~(uintptr_t)flag; } static uintptr_t block_summary_next(struct block_summary *summary) { - return summary->next_and_flags & ~(BLOCK_SIZE - 1); + return align_down(summary->next_and_flags, BLOCK_SIZE); } static void block_summary_set_next(struct block_summary *summary, uintptr_t next) { @@ -255,10 +262,6 @@ static uintptr_t pop_block(struct block_list *list) { return head; } -static uintptr_t align_up(uintptr_t addr, size_t align) { - return (addr + align - 1) & ~(align-1); -} - static inline size_t size_to_granules(size_t size) { return (size + GRANULE_SIZE - 1) >> GRANULE_SIZE_LOG_2; } @@ -391,7 +394,7 @@ static inline int mark_space_mark_object(struct mark_space *space, static uintptr_t make_evacuation_allocator_cursor(uintptr_t block, size_t allocated) { GC_ASSERT(allocated < (BLOCK_SIZE - 1) * (uint64_t) BLOCK_SIZE); - return (block & ~(BLOCK_SIZE - 1)) | (allocated / BLOCK_SIZE); + return align_down(block, BLOCK_SIZE) | (allocated / BLOCK_SIZE); } static void prepare_evacuation_allocator(struct evacuation_allocator *alloc, From 05d2c959505019ac1be03ca2e6f1885103fd7ec6 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 3 Oct 2022 16:08:46 +0200 Subject: [PATCH 149/403] mt-gcbench: Only crash when tracing holes for precise GC --- mt-gcbench-embedder.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mt-gcbench-embedder.h b/mt-gcbench-embedder.h index 3aec4808b..3162a8bf3 100644 --- a/mt-gcbench-embedder.h +++ b/mt-gcbench-embedder.h @@ -36,7 +36,9 @@ static inline void visit_hole_fields(Hole *obj, void (*visit)(struct gc_edge edge, void *visit_data), void *visit_data) { +#if GC_PRECISE GC_CRASH(); +#endif } #include "simple-gc-embedder.h" From a5b1a66d21437328a97004e256cb148114354a69 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 16 Sep 2022 13:23:05 +0200 Subject: [PATCH 150/403] Add platform abstraction This will allow us to iterate conservative roots from stacks and static data segments. 
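The interface is callback-based: the platform layer finds the address ranges
(here, writable ELF data segments, plus a query for the current thread's
stack base) and the collector decides what to do with each word.  Rough usage
sketch, where visit_range and maybe_mark_conservative_root are illustrative
names, not part of this patch:

    static void visit_range(uintptr_t start, uintptr_t end, void *data) {
      struct gc_heap *heap = data;
      // Treat every aligned word in [start, end) as a potential pointer.
      for (uintptr_t addr = start; addr + sizeof(uintptr_t) <= end;
           addr += sizeof(uintptr_t))
        maybe_mark_conservative_root(heap, *(uintptr_t*)addr);
    }

    // Somewhere in the collector's root-tracing phase:
    //   gc_platform_visit_global_conservative_roots(visit_range, heap);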
--- Makefile | 6 ++- gc-platform-gnu-linux.c | 101 ++++++++++++++++++++++++++++++++++++++++ gc-platform.h | 20 ++++++++ 3 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 gc-platform-gnu-linux.c create mode 100644 gc-platform.h diff --git a/Makefile b/Makefile index 9244c246c..f8e2c9ba9 100644 --- a/Makefile +++ b/Makefile @@ -6,11 +6,15 @@ CFLAGS=-Wall -O2 -g -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused - INCLUDES=-I. LDFLAGS=-lpthread -flto COMPILE=$(CC) $(CFLAGS) $(INCLUDES) +PLATFORM=gnu-linux ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) all: $(ALL_TESTS) +gc-platform.o: gc-platform.h gc-platform-$(PLATFORM).c gc-visibility.h + $(COMPILE) -o $@ -c gc-platform-$(PLATFORM).c + bdw-%-gc.o: semi.c %-embedder.h %.c $(COMPILE) `pkg-config --cflags bdw-gc` -include $*-embedder.h -o $@ -c bdw.c bdw-%.o: semi.c %.c @@ -43,7 +47,7 @@ parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space. parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c -%: %.o %-gc.o +%: %.o %-gc.o gc-platform.o $(CC) $(LDFLAGS) $($*_LDFLAGS) -o $@ $^ check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/gc-platform-gnu-linux.c b/gc-platform-gnu-linux.c new file mode 100644 index 000000000..b0b9f0983 --- /dev/null +++ b/gc-platform-gnu-linux.c @@ -0,0 +1,101 @@ +// For pthread_getattr_np. +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#define GC_IMPL 1 + +#include "debug.h" +#include "gc-assert.h" +#include "gc-inline.h" +//#include "gc-stack.h" + +void gc_platform_init(void) { + // Nothing to do. +} + +static uintptr_t fallback_current_thread_stack_base(void) GC_NEVER_INLINE; +static uintptr_t fallback_current_thread_stack_base(void) { + // Sloppily assume that there are very few frames between us and the + // thread entry or main function, and that therefore we haven't + // consumed more than a page of stack; we can then just round up the + // stack pointer to the page boundary. + fprintf(stderr, + "Using fallback strategy to capture stack base for thread %p.\n", + (void*)pthread_self()); + int local; + uintptr_t hot = (uintptr_t)&local; + size_t page_size = getpagesize(); + return (hot + page_size) & ~(page_size - 1); +} + +uintptr_t gc_platform_current_thread_stack_base(void) { + pthread_t me = pthread_self(); + pthread_attr_t attr; + int err = pthread_getattr_np(me, &attr); + if (err) { + errno = err; + // This case can occur for the main thread when running in a + // filesystem without /proc/stat. + perror("Failed to capture stack base via pthread_getattr_np"); + return fallback_current_thread_stack_base(); + } + + void *stack_low_addr; + size_t stack_size; + err = pthread_attr_getstack(&attr, &stack_low_addr, &stack_size); + pthread_attr_destroy(&attr); + if (err) { + // Should never occur. 
+ errno = err; + perror("pthread_attr_getstack"); + return fallback_current_thread_stack_base(); + } + + return (uintptr_t)stack_low_addr + stack_size; +} + +struct visit_data { + void (*f)(uintptr_t start, uintptr_t end, void *data); + void *data; +}; + +static int visit_roots(struct dl_phdr_info *info, size_t size, void *data) { + struct visit_data *visit_data = data; + uintptr_t object_addr = info->dlpi_addr; + const char *object_name = info->dlpi_name; + const ElfW(Phdr) *program_headers = info->dlpi_phdr; + size_t program_headers_count = info->dlpi_phnum; + + // From the loader's perspective, an ELF image is broken up into + // "segments", each of which is described by a "program header". + // Treat all writable data segments as potential edges into the + // GC-managed heap. + // + // Note that there are some RELRO segments which are initially + // writable but then remapped read-only. BDW-GC will exclude these, + // but we just punt for the time being and treat them as roots + for (size_t i = 0; i < program_headers_count; i++) { + const ElfW(Phdr) *p = &program_headers[i]; + if (p->p_type == PT_LOAD && (p->p_flags & PF_W)) { + uintptr_t start = p->p_vaddr + object_addr; + uintptr_t end = start + p->p_memsz; + DEBUG("found roots for '%s': [%p,%p)\n", object_name, + (void*)start, (void*)end); + visit_data->f(start, end, visit_data->data); + } + } + + return 0; +} + +void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start, + uintptr_t end, + void *data), + void *data) { + struct visit_data visit_data = { f, data }; + dl_iterate_phdr(visit_roots, &visit_data); +} diff --git a/gc-platform.h b/gc-platform.h new file mode 100644 index 000000000..acc0096f9 --- /dev/null +++ b/gc-platform.h @@ -0,0 +1,20 @@ +#ifndef GC_PLATFORM_H +#define GC_PLATFORM_H + +#ifndef GC_IMPL +#error internal header file, not part of API +#endif + +#include + +#include "gc-visibility.h" + +GC_INTERNAL void gc_platform_init(void); +GC_INTERNAL uintptr_t gc_platform_current_thread_stack_base(void); +GC_INTERNAL +void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start, + uintptr_t end, + void *data), + void *data); + +#endif // GC_PLATFORM_H From d2bde8319f60f7a4b04c1b00c509c6096581db02 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 16 Sep 2022 13:40:55 +0200 Subject: [PATCH 151/403] Add conservative stack capture This isn't really wired up yet anywhere, but add a precursor to conservative stack scanning. 
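The intended flow is roughly as follows (visit_range is an illustrative
callback, not part of this patch):

    struct gc_stack stack;
    gc_stack_init(&stack, base);    // cold end: the gc_stack_addr passed in,
                                    // or the platform stack base if NULL
    /* ... mutator runs ... */
    gc_stack_capture_hot(&stack);   // at a stop: setjmp the registers and
                                    // record the hot end of the stack
    gc_stack_visit(&stack, visit_range, data);   // registers + [hot, cold)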
--- Makefile | 7 ++-- bdw.c | 9 ++++-- gc-align.h | 17 ++++++++++ gc-api.h | 22 +++++++++---- gc-stack.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++ gc-stack.h | 29 +++++++++++++++++ mt-gcbench.c | 58 ++++++++++++++------------------- quads.c | 3 +- semi.c | 8 +++-- whippet.c | 15 ++++----- 10 files changed, 200 insertions(+), 58 deletions(-) create mode 100644 gc-align.h create mode 100644 gc-stack.c create mode 100644 gc-stack.h diff --git a/Makefile b/Makefile index f8e2c9ba9..1770f1a3c 100644 --- a/Makefile +++ b/Makefile @@ -15,11 +15,14 @@ all: $(ALL_TESTS) gc-platform.o: gc-platform.h gc-platform-$(PLATFORM).c gc-visibility.h $(COMPILE) -o $@ -c gc-platform-$(PLATFORM).c +gc-stack.o: gc-stack.c + $(COMPILE) -o $@ -c $< + bdw-%-gc.o: semi.c %-embedder.h %.c $(COMPILE) `pkg-config --cflags bdw-gc` -include $*-embedder.h -o $@ -c bdw.c bdw-%.o: semi.c %.c $(COMPILE) -include bdw-attrs.h -o $@ -c $*.c -bdw-%: bdw-%.o bdw-%-gc.o +bdw-%: bdw-%.o bdw-%-gc.o gc-stack.o gc-platform.o $(CC) $(LDFLAGS) `pkg-config --libs bdw-gc` -o $@ $^ semi-%-gc.o: semi.c %-embedder.h large-object-space.h assert.h debug.h %.c @@ -47,7 +50,7 @@ parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space. parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c -%: %.o %-gc.o gc-platform.o +%: %.o %-gc.o gc-platform.o gc-stack.o $(CC) $(LDFLAGS) $($*_LDFLAGS) -o $@ $^ check: $(addprefix test-$(TARGET),$(TARGETS)) diff --git a/bdw.c b/bdw.c index 9325726f5..caf161e0b 100644 --- a/bdw.c +++ b/bdw.c @@ -188,7 +188,8 @@ static int parse_options(int argc, struct gc_option argv[], } int gc_init(int argc, struct gc_option argv[], - struct gc_heap **heap, struct gc_mutator **mutator) { + struct gc_stack_addr *stack_base, struct gc_heap **heap, + struct gc_mutator **mutator) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES); GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_INLINE_FREELIST_COUNT * GC_INLINE_GRANULE_BYTES); @@ -201,6 +202,8 @@ int gc_init(int argc, struct gc_option argv[], // GC_free_space_divisor = 16; // GC_enable_incremental(); + // Ignore stack base for main thread. + GC_set_max_heap_size(options.fixed_heap_size); // Not part of 7.3, sigh. Have to set an env var. // GC_set_markers_count(options.parallelism); @@ -218,8 +221,8 @@ int gc_init(int argc, struct gc_option argv[], return 1; } -struct gc_mutator* gc_init_for_thread(uintptr_t *stack_base, - struct gc_heap *heap) { +struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, + struct gc_heap *heap) { pthread_mutex_lock(&heap->lock); if (!heap->multithreaded) { GC_allow_register_threads(); diff --git a/gc-align.h b/gc-align.h new file mode 100644 index 000000000..117d1cb47 --- /dev/null +++ b/gc-align.h @@ -0,0 +1,17 @@ +#ifndef GC_ALIGN_H +#define GC_ALIGN_H + +#ifndef GC_IMPL +#error internal header file, not part of API +#endif + +#include + +static inline uintptr_t align_down(uintptr_t addr, size_t align) { + return addr & ~(align - 1); +} +static inline uintptr_t align_up(uintptr_t addr, size_t align) { + return align_down(addr + align - 1, align); +} + +#endif // GC_ALIGN_H diff --git a/gc-api.h b/gc-api.h index 2c4a636a1..a33fd7b12 100644 --- a/gc-api.h +++ b/gc-api.h @@ -7,6 +7,7 @@ #include "gc-inline.h" #include "gc-ref.h" #include "gc-edge.h" +#include "gc-visibility.h" #include #include @@ -27,21 +28,30 @@ struct gc_option { // FIXME: Conflict with bdw-gc GC_API. 
Switch prefix? #ifndef GC_API_ -#define GC_API_ __attribute__((visibility("hidden"))) +#define GC_API_ GC_INTERNAL #endif GC_API_ int gc_option_from_string(const char *str); + +struct gc_stack_addr; +GC_API_ void* gc_call_with_stack_addr(void* (*f)(struct gc_stack_addr *, + void *), + void *data) GC_NEVER_INLINE; + GC_API_ int gc_init(int argc, struct gc_option argv[], - struct gc_heap **heap, struct gc_mutator **mutator); + struct gc_stack_addr *base, struct gc_heap **heap, + struct gc_mutator **mutator); struct gc_mutator_roots; -struct gc_heap_roots; GC_API_ void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots); -GC_API_ void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots); -GC_API_ struct gc_mutator* gc_init_for_thread(uintptr_t *stack_base, - struct gc_heap *heap); +struct gc_heap_roots; +GC_API_ void gc_heap_set_roots(struct gc_heap *heap, + struct gc_heap_roots *roots); + +GC_API_ struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *base, + struct gc_heap *heap); GC_API_ void gc_finish_for_thread(struct gc_mutator *mut); GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; diff --git a/gc-stack.c b/gc-stack.c new file mode 100644 index 000000000..bc8c9e64f --- /dev/null +++ b/gc-stack.c @@ -0,0 +1,90 @@ +// For pthread_getattr_np. +#define _GNU_SOURCE +#include +#include +#include +#include + +#define GC_IMPL 1 + +#include "debug.h" +#include "gc-align.h" +#include "gc-assert.h" +#include "gc-inline.h" +#include "gc-platform.h" +#include "gc-stack.h" + +static uintptr_t current_thread_hot_stack_addr(void) { +#ifdef __GCC__ + return (uintptr_t)__builtin_frame_address(0); +#else + uintptr_t local; + return (uintptr_t)&local; +#endif +} + +// FIXME: check platform stack growth direction. 
+#define HOTTER_THAN <= + +static void capture_current_thread_hot_stack_addr(struct gc_stack_addr *addr) { + addr->addr = current_thread_hot_stack_addr(); +} + +static void capture_current_thread_cold_stack_addr(struct gc_stack_addr *addr) { + addr->addr = gc_platform_current_thread_stack_base(); +} + +void gc_stack_init(struct gc_stack *stack, struct gc_stack_addr *base) { + if (base) + stack->cold = *base; + else + capture_current_thread_cold_stack_addr(&stack->cold); + stack->hot = stack->cold; +} + +void gc_stack_capture_hot(struct gc_stack *stack) { + capture_current_thread_hot_stack_addr(&stack->hot); + setjmp(stack->registers); + GC_ASSERT(stack->hot.addr HOTTER_THAN stack->cold.addr); +} + +static void* call_with_stack(void* (*)(struct gc_stack_addr*, void*), + struct gc_stack_addr*, void*) GC_NEVER_INLINE; +static void* call_with_stack(void* (*f)(struct gc_stack_addr *, void *), + struct gc_stack_addr *addr, void *arg) { + return f(addr, arg); +} +void* gc_call_with_stack_addr(void* (*f)(struct gc_stack_addr *base, + void *arg), + void *arg) { + struct gc_stack_addr base; + capture_current_thread_hot_stack_addr(&base); + return call_with_stack(f, &base, arg); +} + +void gc_stack_visit(struct gc_stack *stack, + void (*visit)(uintptr_t low, uintptr_t high, void *data), + void *data) { + { + uintptr_t low = (uintptr_t)stack->registers; + GC_ASSERT(low == align_down(low, sizeof(uintptr_t))); + uintptr_t high = low + sizeof(jmp_buf); + DEBUG("found mutator register roots for %p: [%p,%p)\n", stack, + (void*)low, (void*)high); + visit(low, high, data); + } + + if (0 HOTTER_THAN 1) { + DEBUG("found mutator stack roots for %p: [%p,%p)\n", stack, + (void*)stack->hot.addr, (void*)stack->cold.addr); + visit(align_up(stack->hot.addr, sizeof(uintptr_t)), + align_down(stack->cold.addr, sizeof(uintptr_t)), + data); + } else { + DEBUG("found mutator stack roots for %p: [%p,%p)\n", stack, + (void*)stack->cold.addr, (void*)stack->hot.addr); + visit(align_up(stack->cold.addr, sizeof(uintptr_t)), + align_down(stack->hot.addr, sizeof(uintptr_t)), + data); + } +} diff --git a/gc-stack.h b/gc-stack.h new file mode 100644 index 000000000..fa228b210 --- /dev/null +++ b/gc-stack.h @@ -0,0 +1,29 @@ +#ifndef GC_STACK_H +#define GC_STACK_H + +#ifndef GC_IMPL +#error internal header file, not part of API +#endif + +#include "gc-inline.h" +#include + +struct gc_stack_addr { + uintptr_t addr; +}; + +struct gc_stack { + struct gc_stack_addr cold; + struct gc_stack_addr hot; + jmp_buf registers; +}; + +GC_INTERNAL void gc_stack_init(struct gc_stack *stack, + struct gc_stack_addr *base); +GC_INTERNAL void gc_stack_capture_hot(struct gc_stack *stack); +GC_INTERNAL void gc_stack_visit(struct gc_stack *stack, + void (*visit)(uintptr_t low, uintptr_t high, + void *data), + void *data); + +#endif // GC_STACK_H diff --git a/mt-gcbench.c b/mt-gcbench.c index 9c6e6b5d4..4789a4f7d 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -257,61 +257,49 @@ static void time_construction(struct thread *t, int depth) { POP_HANDLE(t); } -static void* call_with_stack_base(void* (*)(uintptr_t*, void*), void*) GC_NEVER_INLINE; -static void* call_with_stack_base_inner(void* (*)(uintptr_t*, void*), uintptr_t*, void*) GC_NEVER_INLINE; -static void* call_with_stack_base_inner(void* (*f)(uintptr_t *stack_base, void *arg), - uintptr_t *stack_base, void *arg) { - return f(stack_base, arg); -} -static void* call_with_stack_base(void* (*f)(uintptr_t *stack_base, void *arg), - void *arg) { - uintptr_t x; - return call_with_stack_base_inner(f, &x, 
arg); -} - struct call_with_gc_data { - void* (*f)(struct gc_mutator *); + void* (*f)(struct thread *); struct gc_heap *heap; }; -static void* call_with_gc_inner(uintptr_t *stack_base, void *arg) { +static void* call_with_gc_inner(struct gc_stack_addr *addr, void *arg) { struct call_with_gc_data *data = arg; - struct gc_mutator *mut = gc_init_for_thread(stack_base, data->heap); - void *ret = data->f(mut); + struct gc_mutator *mut = gc_init_for_thread(addr, data->heap); + struct thread t = { mut, }; + gc_mutator_set_roots(mut, &t.roots); + void *ret = data->f(&t); gc_finish_for_thread(mut); return ret; } -static void* call_with_gc(void* (*f)(struct gc_mutator *), +static void* call_with_gc(void* (*f)(struct thread *), struct gc_heap *heap) { struct call_with_gc_data data = { f, heap }; - return call_with_stack_base(call_with_gc_inner, &data); + return gc_call_with_stack_addr(call_with_gc_inner, &data); } -static void* run_one_test(struct gc_mutator *mut) { +static void* run_one_test(struct thread *t) { NodeHandle long_lived_tree = { NULL }; NodeHandle temp_tree = { NULL }; DoubleArrayHandle array = { NULL }; - struct thread t = { mut, }; - gc_mutator_set_roots(mut, &t.roots); - PUSH_HANDLE(&t, long_lived_tree); - PUSH_HANDLE(&t, temp_tree); - PUSH_HANDLE(&t, array); + PUSH_HANDLE(t, long_lived_tree); + PUSH_HANDLE(t, temp_tree); + PUSH_HANDLE(t, array); // Create a long lived object printf(" Creating a long-lived binary tree of depth %d\n", long_lived_tree_depth); - HANDLE_SET(long_lived_tree, allocate_node(mut)); - populate(&t, long_lived_tree_depth, HANDLE_REF(long_lived_tree)); + HANDLE_SET(long_lived_tree, allocate_node(t->mut)); + populate(t, long_lived_tree_depth, HANDLE_REF(long_lived_tree)); // Create long-lived array, filling half of it printf(" Creating a long-lived array of %d doubles\n", array_size); - HANDLE_SET(array, allocate_double_array(mut, array_size)); + HANDLE_SET(array, allocate_double_array(t->mut, array_size)); for (int i = 0; i < array_size/2; ++i) { HANDLE_REF(array)->values[i] = 1.0/i; } for (int d = min_tree_depth; d <= max_tree_depth; d += 2) { - time_construction(&t, d); + time_construction(t, d); } validate_tree(HANDLE_REF(long_lived_tree), long_lived_tree_depth); @@ -322,10 +310,9 @@ static void* run_one_test(struct gc_mutator *mut) { || HANDLE_REF(array)->values[1000] != 1.0/1000) fprintf(stderr, "Failed\n"); - POP_HANDLE(&t); - POP_HANDLE(&t); - POP_HANDLE(&t); - gc_mutator_set_roots(mut, NULL); + POP_HANDLE(t); + POP_HANDLE(t); + POP_HANDLE(t); return NULL; } @@ -377,11 +364,14 @@ int main(int argc, char *argv[]) { { GC_OPTION_PARALLELISM, parallelism } }; struct gc_heap *heap; struct gc_mutator *mut; - if (!gc_init(sizeof options / sizeof options[0], options, &heap, &mut)) { + if (!gc_init(sizeof options / sizeof options[0], options, NULL, &heap, + &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); return 1; } + struct thread main_thread = { mut, }; + gc_mutator_set_roots(mut, &main_thread.roots); printf("Garbage Collector Test\n"); printf(" Live storage will peak at %zd bytes.\n\n", heap_max_live); @@ -398,7 +388,7 @@ int main(int argc, char *argv[]) { return 1; } } - run_one_test(mut); + run_one_test(&main_thread); for (size_t i = 1; i < nthreads; i++) { struct join_data data = { 0, threads[i] }; gc_call_without_gc(mut, join_thread, &data); diff --git a/quads.c b/quads.c index 6136988b8..1318adf9f 100644 --- a/quads.c +++ b/quads.c @@ -136,7 +136,8 @@ int main(int argc, char *argv[]) { { GC_OPTION_PARALLELISM, 
parallelism } }; struct gc_heap *heap; struct gc_mutator *mut; - if (!gc_init(sizeof options / sizeof options[0], options, &heap, &mut)) { + if (!gc_init(sizeof options / sizeof options[0], options, NULL, &heap, + &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); return 1; diff --git a/semi.c b/semi.c index e29e63c26..e771f7f3d 100644 --- a/semi.c +++ b/semi.c @@ -311,7 +311,8 @@ static int parse_options(int argc, struct gc_option argv[], } int gc_init(int argc, struct gc_option argv[], - struct gc_heap **heap, struct gc_mutator **mut) { + struct gc_stack_addr *stack_base, struct gc_heap **heap, + struct gc_mutator **mut) { GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), offsetof(struct semi_space, hp)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), @@ -331,6 +332,7 @@ int gc_init(int argc, struct gc_option argv[], if (!large_object_space_init(heap_large_object_space(*heap), *heap)) return 0; + // Ignore stack base, as we are precise. (*mut)->roots = NULL; return 1; @@ -344,8 +346,8 @@ void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { GC_CRASH(); } -struct gc_mutator* gc_init_for_thread(uintptr_t *stack_base, - struct gc_heap *heap) { +struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *base, + struct gc_heap *heap) { fprintf(stderr, "Semispace copying collector not appropriate for multithreaded use.\n"); GC_CRASH(); diff --git a/whippet.c b/whippet.c index 3e8e52185..dccb2a470 100644 --- a/whippet.c +++ b/whippet.c @@ -10,7 +10,10 @@ #define GC_API_ #include "gc-api.h" +#define GC_IMPL 1 + #include "debug.h" +#include "gc-align.h" #include "gc-inline.h" #include "large-object-space.h" #if GC_PARALLEL @@ -178,13 +181,6 @@ struct slab { }; STATIC_ASSERT_EQ(sizeof(struct slab), SLAB_SIZE); -static inline uintptr_t align_down(uintptr_t addr, size_t align) { - return addr & ~(align - 1); -} -static inline uintptr_t align_up(uintptr_t addr, size_t align) { - return align_down(addr + align - 1, align); -} - static struct slab *object_slab(void *obj) { uintptr_t addr = (uintptr_t) obj; uintptr_t base = align_down(addr, SLAB_SIZE); @@ -1999,7 +1995,8 @@ static int mark_space_init(struct mark_space *space, struct gc_heap *heap) { } int gc_init(int argc, struct gc_option argv[], - struct gc_heap **heap, struct gc_mutator **mut) { + struct gc_stack_addr *stack_base, struct gc_heap **heap, + struct gc_mutator **mut) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GRANULE_SIZE); GC_ASSERT_EQ(gc_allocator_large_threshold(), LARGE_OBJECT_THRESHOLD); GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), @@ -2041,7 +2038,7 @@ int gc_init(int argc, struct gc_option argv[], return 1; } -struct gc_mutator* gc_init_for_thread(uintptr_t *stack_base, +struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, struct gc_heap *heap) { struct gc_mutator *ret = calloc(1, sizeof(struct gc_mutator)); if (!ret) From deed415a0631a12566e1d4d6bb6200edc81e2e8b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 21 Sep 2022 10:33:31 +0200 Subject: [PATCH 152/403] Whippet captures stack when stopping mutators This is part of work to enable conservative GC. 
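Concretely, each struct gc_mutator gains a struct gc_stack whose cold end is
recorded at registration; the hot end is re-captured whenever the mutator
parks.  The pattern, condensed from the diff below:

    // At mutator registration (gc_init / gc_init_for_thread):
    gc_stack_init(&mut->stack, stack_base);

    // On the stop paths (pause_mutator_for_collection_with_lock and
    // _without_lock) and in deactivate_mutator, before waiting:
    gc_stack_capture_hot(&mut->stack);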
--- conservative-roots-embedder.h | 24 +++++--------- gc-embedder-api.h | 12 ++----- precise-roots-embedder.h | 21 +++++------- whippet.c | 60 ++++++++++++++++++++++++----------- 4 files changed, 60 insertions(+), 57 deletions(-) diff --git a/conservative-roots-embedder.h b/conservative-roots-embedder.h index 5b3d6fba9..2ac2d2b78 100644 --- a/conservative-roots-embedder.h +++ b/conservative-roots-embedder.h @@ -1,35 +1,27 @@ #ifndef CONSERVATIVE_ROOTS_EMBEDDER_H #define CONSERVATIVE_ROOTS_EMBEDDER_H -#include "gc-assert.h" -#include "conservative-roots-types.h" +#include "gc-embedder-api.h" -static inline int gc_has_conservative_roots(void) { +static inline int gc_has_mutator_conservative_roots(void) { + return 1; +} +static inline int gc_mutator_conservative_roots_may_be_interior(void) { + return 1; +} +static inline int gc_has_global_conservative_roots(void) { return 1; } static inline int gc_has_conservative_intraheap_edges(void) { - // FIXME: Implement both ways. return 0; } -static inline void gc_trace_conservative_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_ref)(struct gc_ref edge, - void *trace_data), - void *trace_data) { -} - static inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots, void (*trace_edge)(struct gc_edge edge, void *trace_data), void *trace_data) { } -static inline void gc_trace_conservative_heap_roots(struct gc_heap_roots *roots, - void (*trace_ref)(struct gc_ref ref, - void *trace_data), - void *trace_data) { -} - static inline void gc_trace_precise_heap_roots(struct gc_heap_roots *roots, void (*trace_edge)(struct gc_edge edge, void *trace_data), diff --git a/gc-embedder-api.h b/gc-embedder-api.h index 2e0029e97..b24483245 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -12,18 +12,16 @@ struct gc_mutator_roots; struct gc_heap_roots; struct gc_atomic_forward; -GC_EMBEDDER_API inline int gc_has_conservative_roots(void); +GC_EMBEDDER_API inline int gc_has_mutator_conservative_roots(void); +GC_EMBEDDER_API inline int gc_has_global_conservative_roots(void); GC_EMBEDDER_API inline int gc_has_conservative_intraheap_edges(void); +GC_EMBEDDER_API inline int gc_mutator_conservative_roots_may_be_interior(void); GC_EMBEDDER_API inline void gc_trace_object(struct gc_ref ref, void (*trace_edge)(struct gc_edge edge, void *trace_data), void *trace_data, size_t *size) GC_ALWAYS_INLINE; -GC_EMBEDDER_API inline void gc_trace_conservative_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_ref)(struct gc_ref edge, - void *trace_data), - void *trace_data); GC_EMBEDDER_API inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots, void (*trace_edge)(struct gc_edge edge, void *trace_data), @@ -32,10 +30,6 @@ GC_EMBEDDER_API inline void gc_trace_precise_heap_roots(struct gc_heap_roots *ro void (*trace_edge)(struct gc_edge edge, void *trace_data), void *trace_data); -GC_EMBEDDER_API inline void gc_trace_conservative_heap_roots(struct gc_heap_roots *roots, - void (*trace_ref)(struct gc_ref ref, - void *trace_data), - void *trace_data); GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref); GC_EMBEDDER_API inline void gc_object_forward_nonatomic(struct gc_ref ref, diff --git a/precise-roots-embedder.h b/precise-roots-embedder.h index 8b4deb481..cf649e14d 100644 --- a/precise-roots-embedder.h +++ b/precise-roots-embedder.h @@ -2,9 +2,16 @@ #define PRECISE_ROOTS_EMBEDDER_H #include "gc-edge.h" +#include "gc-embedder-api.h" #include "precise-roots-types.h" -static inline int 
gc_has_conservative_roots(void) { +static inline int gc_has_mutator_conservative_roots(void) { + return 0; +} +static inline int gc_mutator_conservative_roots_may_be_interior(void) { + return 0; +} +static inline int gc_has_global_conservative_roots(void) { return 0; } static inline int gc_has_conservative_intraheap_edges(void) { @@ -19,12 +26,6 @@ static inline void visit_roots(struct handle *roots, trace_edge(gc_edge(&h->v), trace_data); } -static inline void gc_trace_conservative_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_ref)(struct gc_ref edge, - void *trace_data), - void *trace_data) { -} - static inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots, void (*trace_edge)(struct gc_edge edge, void *trace_data), @@ -33,12 +34,6 @@ static inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots visit_roots(roots->roots, trace_edge, trace_data); } -static inline void gc_trace_conservative_heap_roots(struct gc_heap_roots *roots, - void (*trace_ref)(struct gc_ref ref, - void *trace_data), - void *trace_data) { -} - static inline void gc_trace_precise_heap_roots(struct gc_heap_roots *roots, void (*trace_edge)(struct gc_edge edge, void *trace_data), diff --git a/whippet.c b/whippet.c index dccb2a470..ad1e02260 100644 --- a/whippet.c +++ b/whippet.c @@ -15,6 +15,7 @@ #include "debug.h" #include "gc-align.h" #include "gc-inline.h" +#include "gc-stack.h" #include "large-object-space.h" #if GC_PARALLEL #include "parallel-tracer.h" @@ -27,7 +28,7 @@ #if GC_PRECISE #include "precise-roots-embedder.h" #else -#error whippet only currently implements precise collection +#include "conservative-roots-embedder.h" #endif #define GRANULE_SIZE 16 @@ -340,6 +341,7 @@ struct gc_mutator { uintptr_t sweep; uintptr_t block; struct gc_heap *heap; + struct gc_stack stack; struct gc_mutator_roots *roots; struct gc_mutator_mark_buf mark_buf; // Three uses for this in-object linked-list pointer: @@ -914,8 +916,11 @@ static void trace_ref_and_enqueue_globally(struct gc_ref ref, void *data) { // enqueue them directly, so we send them to the controller in a buffer. 
static void trace_stopping_mutator_roots(struct gc_mutator *mut) { GC_ASSERT(mutator_should_mark_while_stopping(mut)); - gc_trace_conservative_mutator_roots(mut->roots, trace_ref_and_enqueue_locally, - mut); + /* + trace_mutator_conservative_roots(mut, + mark_and_locally_enqueue_conservative_roots, + mut); + */ gc_trace_precise_mutator_roots(mut->roots, trace_and_enqueue_locally, mut); } @@ -924,18 +929,21 @@ static void trace_precise_mutator_roots_with_lock(struct gc_mutator *mut) { mutator_heap(mut)); } -static void trace_conservative_mutator_roots_with_lock(struct gc_mutator *mut) { - gc_trace_conservative_mutator_roots(mut->roots, - trace_ref_and_enqueue_globally, - mutator_heap(mut)); +static void trace_mutator_conservative_roots_with_lock(struct gc_mutator *mut) { + /* + trace_mutator_conservative_roots(mut, + mark_and_globally_enqueue_conservative_roots, + mutator_heap(mut)); + */ } static void trace_mutator_roots_with_lock(struct gc_mutator *mut) { - trace_conservative_mutator_roots_with_lock(mut); + trace_mutator_conservative_roots_with_lock(mut); trace_precise_mutator_roots_with_lock(mut); } static void trace_mutator_roots_with_lock_before_stop(struct gc_mutator *mut) { + gc_stack_capture_hot(&mut->stack); if (mutator_should_mark_while_stopping(mut)) trace_mutator_roots_with_lock(mut); else @@ -955,19 +963,19 @@ static void wait_for_mutators_to_stop(struct gc_heap *heap) { static void finish_sweeping(struct gc_mutator *mut); static void finish_sweeping_in_block(struct gc_mutator *mut); -static void trace_conservative_mutator_roots_after_stop(struct gc_heap *heap) { +static void trace_mutator_conservative_roots_after_stop(struct gc_heap *heap) { int active_mutators_already_marked = heap_should_mark_while_stopping(heap); if (!active_mutators_already_marked) { for (struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); mut; mut = mut->next) - trace_conservative_mutator_roots_with_lock(mut); + trace_mutator_conservative_roots_with_lock(mut); } for (struct gc_mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) - trace_conservative_mutator_roots_with_lock(mut); + trace_mutator_conservative_roots_with_lock(mut); } static void trace_precise_mutator_roots_after_stop(struct gc_heap *heap) { @@ -998,8 +1006,12 @@ static void trace_precise_global_roots(struct gc_heap *heap) { gc_trace_precise_heap_roots(heap->roots, trace_and_enqueue_globally, heap); } -static void trace_conservative_global_roots(struct gc_heap *heap) { - gc_trace_conservative_heap_roots(heap->roots, trace_ref_and_enqueue_globally, heap); +static void trace_global_conservative_roots(struct gc_heap *heap) { + /* + if (gc_has_global_conservative_roots()) + gc_platform_visit_global_conservative_roots + (mark_and_globally_enqueue_conservative_roots, heap); + */ } static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { @@ -1111,6 +1123,7 @@ static void pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); finish_sweeping_in_block(mut); + gc_stack_capture_hot(&mut->stack); if (mutator_should_mark_while_stopping(mut)) // No need to collect results in mark buf; we can enqueue roots directly. 
trace_mutator_roots_with_lock(mut); @@ -1124,6 +1137,7 @@ static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); finish_sweeping(mut); + gc_stack_capture_hot(&mut->stack); if (mutator_should_mark_while_stopping(mut)) trace_stopping_mutator_roots(mut); enqueue_mutator_for_tracing(mut); @@ -1228,6 +1242,11 @@ static double clamp_major_gc_yield_threshold(struct gc_heap *heap, return threshold; } +static inline int has_conservative_roots(void) { + return gc_has_mutator_conservative_roots() || + gc_has_global_conservative_roots(); +} + static enum gc_kind determine_collection_kind(struct gc_heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); enum gc_kind previous_gc_kind = atomic_load(&heap->gc_kind); @@ -1257,7 +1276,7 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { // blocks, try to avoid any pinning caused by the ragged-stop // marking. Of course if the mutator has conservative roots we will // have pinning anyway and might as well allow ragged stops. - mark_while_stopping = gc_has_conservative_roots(); + mark_while_stopping = has_conservative_roots(); } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING && fragmentation >= heap->fragmentation_low_threshold) { DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", @@ -1426,15 +1445,15 @@ static void prepare_for_evacuation(struct gc_heap *heap) { static void trace_conservative_roots_after_stop(struct gc_heap *heap) { GC_ASSERT(!heap_mark_space(heap)->evacuating); - GC_ASSERT(gc_has_conservative_roots()); - trace_conservative_mutator_roots_after_stop(heap); - trace_conservative_global_roots(heap); + if (gc_has_mutator_conservative_roots()) + trace_mutator_conservative_roots_after_stop(heap); + if (gc_has_global_conservative_roots()) + trace_global_conservative_roots(heap); } static void trace_pinned_roots_after_stop(struct gc_heap *heap) { GC_ASSERT(!heap_mark_space(heap)->evacuating); - if (gc_has_conservative_roots()) - trace_conservative_roots_after_stop(heap); + trace_conservative_roots_after_stop(heap); } static void trace_precise_roots_after_stop(struct gc_heap *heap) { @@ -2034,6 +2053,7 @@ int gc_init(int argc, struct gc_option argv[], *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); + gc_stack_init(&(*mut)->stack, stack_base); add_mutator(*heap, *mut); return 1; } @@ -2043,6 +2063,7 @@ struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, struct gc_mutator *ret = calloc(1, sizeof(struct gc_mutator)); if (!ret) GC_CRASH(); + gc_stack_init(&ret->stack, stack_base); add_mutator(heap, ret); return ret; } @@ -2059,6 +2080,7 @@ static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->next = heap->deactivated_mutators; heap->deactivated_mutators = mut; heap->active_mutator_count--; + gc_stack_capture_hot(&mut->stack); if (!heap->active_mutator_count && mutators_are_stopping(heap)) pthread_cond_signal(&heap->collector_cond); heap_unlock(heap); From 1944b54a192f104d6c32bcc2ac7579ceac1d58c6 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 21 Sep 2022 10:55:26 +0200 Subject: [PATCH 153/403] Whippet can trace conservative roots Next up, enabling it via the makefiles. 
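The heart of it is a filter-then-dispatch step on every conservatively
discovered word, condensed here from trace_conservative_ref in the diff below
(the metadata-byte search and interior-pointer adjustment live in
mark_space_mark_conservative_ref):

    static struct gc_ref trace_conservative_ref(struct gc_heap *heap,
                                                struct gc_conservative_ref ref,
                                                int possibly_interior) {
      // Cheap rejection: low addresses, invalid displacements, etc.
      if (!gc_conservative_ref_might_be_a_heap_object(ref, possibly_interior))
        return gc_ref_null();
      // Otherwise the owning space validates the address and marks it,
      // returning the canonical object reference (or null).
      if (mark_space_contains_conservative_ref(heap_mark_space(heap), ref))
        return mark_space_mark_conservative_ref(heap_mark_space(heap), ref,
                                                possibly_interior);
      return large_object_space_mark_conservative_ref(
          heap_large_object_space(heap), ref, possibly_interior);
    }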
--- conservative-roots-embedder.h | 34 +++-- gc-conservative-ref.h | 17 +++ gc-embedder-api.h | 16 ++- large-object-space.h | 38 ++++++ precise-roots-embedder.h | 26 ++-- semi.c | 2 +- simple-gc-embedder.h | 6 + whippet.c | 245 +++++++++++++++++++++++++++------- 8 files changed, 311 insertions(+), 73 deletions(-) create mode 100644 gc-conservative-ref.h diff --git a/conservative-roots-embedder.h b/conservative-roots-embedder.h index 2ac2d2b78..15447a2c4 100644 --- a/conservative-roots-embedder.h +++ b/conservative-roots-embedder.h @@ -16,16 +16,34 @@ static inline int gc_has_conservative_intraheap_edges(void) { return 0; } -static inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { +static inline int +gc_is_valid_conservative_ref_displacement(uintptr_t displacement) { + // Here is where you would allow tagged heap object references. + return displacement == 0; +} +static inline int +gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref ref, + int possibly_interior) { + // Assume that the minimum page size is 4096, and that the first page + // will contain no heap objects. + if (gc_conservative_ref_value(ref) < 4096) + return 0; + if (possibly_interior) + return 1; + return gc_is_valid_conservative_ref_displacement + (gc_conservative_ref_value(ref) & (sizeof(uintptr_t) - 1)); } -static inline void gc_trace_precise_heap_roots(struct gc_heap_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { +static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { +} + +static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { } #endif // CONSERVATIVE_ROOTS_EMBEDDER_H diff --git a/gc-conservative-ref.h b/gc-conservative-ref.h new file mode 100644 index 000000000..a2b260384 --- /dev/null +++ b/gc-conservative-ref.h @@ -0,0 +1,17 @@ +#ifndef GC_CONSERVATIVE_REF_H +#define GC_CONSERVATIVE_REF_H + +#include + +struct gc_conservative_ref { + uintptr_t value; +}; + +static inline struct gc_conservative_ref gc_conservative_ref(uintptr_t value) { + return (struct gc_conservative_ref){value}; +} +static inline uintptr_t gc_conservative_ref_value(struct gc_conservative_ref ref) { + return ref.value; +} + +#endif // GC_CONSERVATIVE_REF_H diff --git a/gc-embedder-api.h b/gc-embedder-api.h index b24483245..2d74ed0a4 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -1,6 +1,7 @@ #ifndef GC_EMBEDDER_API_H #define GC_EMBEDDER_API_H +#include "gc-conservative-ref.h" #include "gc-edge.h" #include "gc-forwarding.h" @@ -17,19 +18,24 @@ GC_EMBEDDER_API inline int gc_has_global_conservative_roots(void); GC_EMBEDDER_API inline int gc_has_conservative_intraheap_edges(void); GC_EMBEDDER_API inline int gc_mutator_conservative_roots_may_be_interior(void); +GC_EMBEDDER_API inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement); +GC_EMBEDDER_API inline int gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref, + int possibly_interior); + GC_EMBEDDER_API inline void gc_trace_object(struct gc_ref ref, void (*trace_edge)(struct gc_edge edge, void *trace_data), void *trace_data, size_t *size) GC_ALWAYS_INLINE; -GC_EMBEDDER_API inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots, + +GC_EMBEDDER_API inline void 
gc_trace_mutator_roots(struct gc_mutator_roots *roots, void (*trace_edge)(struct gc_edge edge, void *trace_data), void *trace_data); -GC_EMBEDDER_API inline void gc_trace_precise_heap_roots(struct gc_heap_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data); +GC_EMBEDDER_API inline void gc_trace_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data); GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref); GC_EMBEDDER_API inline void gc_object_forward_nonatomic(struct gc_ref ref, diff --git a/large-object-space.h b/large-object-space.h index 01bc4cfc7..6bb7a5af7 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -10,6 +10,7 @@ #include #include "gc-ref.h" +#include "gc-conservative-ref.h" #include "address-map.h" #include "address-set.h" @@ -90,6 +91,11 @@ done: return copied; } +static int large_object_space_mark_object(struct large_object_space *space, + struct gc_ref ref) { + return large_object_space_copy(space, ref); +} + static void large_object_space_reclaim_one(uintptr_t addr, void *data) { struct large_object_space *space = data; size_t npages = address_map_lookup(&space->object_pages, addr, 0); @@ -145,6 +151,38 @@ static void large_object_space_finish_gc(struct large_object_space *space, pthread_mutex_unlock(&space->lock); } +static inline struct gc_ref +large_object_space_mark_conservative_ref(struct large_object_space *space, + struct gc_conservative_ref ref, + int possibly_interior) { + uintptr_t addr = gc_conservative_ref_value(ref); + + if (possibly_interior) { + // FIXME: This only allows interior pointers within the first page. + // BDW-GC doesn't have all-interior-pointers on for intraheap edges + // or edges originating in static data but by default does allow + // them from stack edges; probably we should too. + addr &= ~(space->page_size - 1); + } else { + // Addr not aligned on page boundary? Not a large object. + uintptr_t displacement = addr & (space->page_size - 1); + if (!gc_is_valid_conservative_ref_displacement(displacement)) + return gc_ref_null(); + addr -= displacement; + } + + pthread_mutex_lock(&space->lock); + // ptr might be in fromspace or tospace. Just check the object_pages table, which + // contains both, as well as object_pages for free blocks. 
+ int found = address_map_contains(&space->object_pages, addr); + pthread_mutex_unlock(&space->lock); + + if (found && large_object_space_copy(space, gc_ref(addr))) + return gc_ref(addr); + + return gc_ref_null(); +} + static inline int large_object_space_contains(struct large_object_space *space, struct gc_ref ref) { pthread_mutex_lock(&space->lock); diff --git a/precise-roots-embedder.h b/precise-roots-embedder.h index cf649e14d..bde6be36e 100644 --- a/precise-roots-embedder.h +++ b/precise-roots-embedder.h @@ -18,6 +18,16 @@ static inline int gc_has_conservative_intraheap_edges(void) { return 0; } +static inline int +gc_is_valid_conservative_ref_displacement(uintptr_t displacement) { + GC_CRASH(); +} +static inline int +gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref ref, + int possibly_interior) { + GC_CRASH(); +} + static inline void visit_roots(struct handle *roots, void (*trace_edge)(struct gc_edge edge, void *trace_data), @@ -26,18 +36,18 @@ static inline void visit_roots(struct handle *roots, trace_edge(gc_edge(&h->v), trace_data); } -static inline void gc_trace_precise_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { +static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { if (roots) visit_roots(roots->roots, trace_edge, trace_data); } -static inline void gc_trace_precise_heap_roots(struct gc_heap_roots *roots, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), - void *trace_data) { +static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + void *trace_data), + void *trace_data) { if (roots) visit_roots(roots->roots, trace_edge, trace_data); } diff --git a/semi.c b/semi.c index e771f7f3d..3ad765416 100644 --- a/semi.c +++ b/semi.c @@ -161,7 +161,7 @@ static void collect(struct gc_mutator *mut) { flip(semi); uintptr_t grey = semi->hp; if (mut->roots) - gc_trace_precise_mutator_roots(mut->roots, visit, heap); + gc_trace_mutator_roots(mut->roots, visit, heap); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); while(grey < semi->hp) grey = scan(heap, gc_ref(grey)); diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h index 71255256d..457d9b09e 100644 --- a/simple-gc-embedder.h +++ b/simple-gc-embedder.h @@ -104,3 +104,9 @@ gc_atomic_forward_address(struct gc_atomic_forward *fwd) { GC_ASSERT(fwd->state == GC_FORWARDING_STATE_FORWARDED); return fwd->data; } + +static inline uintptr_t +gc_conservative_ref_heap_address(struct gc_conservative_ref ref) { + // The specific spaces are responsible for checking alignment. 
+ return gc_conservative_ref_value(ref); +} diff --git a/whippet.c b/whippet.c index ad1e02260..687d6e307 100644 --- a/whippet.c +++ b/whippet.c @@ -15,6 +15,7 @@ #include "debug.h" #include "gc-align.h" #include "gc-inline.h" +#include "gc-platform.h" #include "gc-stack.h" #include "large-object-space.h" #if GC_PARALLEL @@ -597,15 +598,19 @@ static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, return 1; } -static inline int mark_space_contains(struct mark_space *space, - struct gc_ref ref) { - uintptr_t addr = gc_ref_value(ref); +static inline int mark_space_contains_address(struct mark_space *space, + uintptr_t addr) { return addr - space->low_addr < space->extent; } -static inline int large_object_space_mark_object(struct large_object_space *space, - struct gc_ref ref) { - return large_object_space_copy(space, ref); +static inline int mark_space_contains_conservative_ref(struct mark_space *space, + struct gc_conservative_ref ref) { + return mark_space_contains_address(space, gc_conservative_ref_value(ref)); +} + +static inline int mark_space_contains(struct mark_space *space, + struct gc_ref ref) { + return mark_space_contains_address(space, gc_ref_value(ref)); } static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { @@ -625,18 +630,84 @@ static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { GC_CRASH(); } -static inline int trace_ref(struct gc_heap *heap, struct gc_ref ref) { - if (!gc_ref_is_heap_object(ref)) - return 0; - if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { - GC_ASSERT(!heap_mark_space(heap)->evacuating); - return mark_space_mark_object(heap_mark_space(heap), ref); +static inline struct gc_ref mark_space_mark_conservative_ref(struct mark_space *space, + struct gc_conservative_ref ref, + int possibly_interior) { + uintptr_t addr = gc_conservative_ref_value(ref); + + if (possibly_interior) { + addr = align_down(addr, GRANULE_SIZE); + } else { + // Addr not an aligned granule? Not an object. + uintptr_t displacement = addr & (GRANULE_SIZE - 1); + if (!gc_is_valid_conservative_ref_displacement(displacement)) + return gc_ref_null(); + addr -= displacement; } - else if (large_object_space_contains(heap_large_object_space(heap), ref)) - return large_object_space_mark_object(heap_large_object_space(heap), - ref); + + // Addr in meta block? Not an object. + if ((addr & (SLAB_SIZE - 1)) < META_BLOCKS_PER_SLAB * BLOCK_SIZE) + return gc_ref_null(); + + // Addr in block that has been paged out? Not an object. + struct block_summary *summary = block_summary_for_addr(addr); + if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) + return gc_ref_null(); + + uint8_t *loc = metadata_byte_for_addr(addr); + uint8_t byte = atomic_load_explicit(loc, memory_order_relaxed); + + // Already marked object? Nothing to do. + if (byte & space->marked_mask) + return gc_ref_null(); + + // Addr is the not start of an unmarked object? Search backwards if + // we have interior pointers, otherwise not an object. + uint8_t object_start_mask = space->live_mask | METADATA_BYTE_YOUNG; + if (!(byte & object_start_mask)) { + if (!possibly_interior) + return gc_ref_null(); + + uintptr_t block_base = align_down(addr, BLOCK_SIZE); + uint8_t *loc_base = metadata_byte_for_addr(block_base); + do { + // Searched past block? Not an object. + if (loc-- == loc_base) + return gc_ref_null(); + + byte = atomic_load_explicit(loc, memory_order_relaxed); + + // Ran into the end of some other allocation? Not an object, then. 
+ if (byte & METADATA_BYTE_END) + return gc_ref_null(); + + // Continue until we find object start. + } while (!(byte & object_start_mask)); + + // Found object start, and object is unmarked; adjust addr. + addr = block_base + (loc - loc_base) * GRANULE_SIZE; + } + + uint8_t mask = METADATA_BYTE_YOUNG | METADATA_BYTE_MARK_0 + | METADATA_BYTE_MARK_1 | METADATA_BYTE_MARK_2; + atomic_store_explicit(loc, (byte & ~mask) | space->marked_mask, + memory_order_relaxed); + + return gc_ref(addr); +} + +static inline struct gc_ref trace_conservative_ref(struct gc_heap *heap, + struct gc_conservative_ref ref, + int possibly_interior) { + if (!gc_conservative_ref_might_be_a_heap_object(ref, possibly_interior)) + return gc_ref_null(); + + if (GC_LIKELY(mark_space_contains_conservative_ref(heap_mark_space(heap), ref))) + return mark_space_mark_conservative_ref(heap_mark_space(heap), ref, + possibly_interior); else - GC_CRASH(); + return large_object_space_mark_conservative_ref(heap_large_object_space(heap), + ref, possibly_interior); } static inline void trace_one(struct gc_ref ref, void *mark_data) { @@ -894,10 +965,24 @@ static void trace_and_enqueue_locally(struct gc_edge edge, void *data) { mutator_mark_buf_push(&mut->mark_buf, gc_edge_ref(edge)); } -static void trace_ref_and_enqueue_locally(struct gc_ref ref, void *data) { +static inline void do_trace_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, + void *data, + int possibly_interior) { struct gc_mutator *mut = data; - if (trace_ref(mutator_heap(mut), ref)) - mutator_mark_buf_push(&mut->mark_buf, ref); + struct gc_ref object = trace_conservative_ref(mutator_heap(mut), ref, + possibly_interior); + if (gc_ref_is_heap_object(object)) + mutator_mark_buf_push(&mut->mark_buf, object); +} + +static void trace_possibly_interior_conservative_ref_and_enqueue_locally + (struct gc_conservative_ref ref, void *data) { + return do_trace_conservative_ref_and_enqueue_locally(ref, data, 1); +} + +static void trace_conservative_ref_and_enqueue_locally + (struct gc_conservative_ref ref, void *data) { + return do_trace_conservative_ref_and_enqueue_locally(ref, data, 0); } static void trace_and_enqueue_globally(struct gc_edge edge, void *data) { @@ -906,40 +991,105 @@ static void trace_and_enqueue_globally(struct gc_edge edge, void *data) { tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge)); } -static void trace_ref_and_enqueue_globally(struct gc_ref ref, void *data) { +static inline void do_trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, + void *data, + int possibly_interior) { struct gc_heap *heap = data; - if (trace_ref(heap, ref)) - tracer_enqueue_root(&heap->tracer, ref); + struct gc_ref object = trace_conservative_ref(heap, ref, possibly_interior); + if (gc_ref_is_heap_object(object)) + tracer_enqueue_root(&heap->tracer, object); +} + +static void trace_possibly_interior_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, + void *data) { + return do_trace_conservative_ref_and_enqueue_globally(ref, data, 1); +} + +static void trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, + void *data) { + return do_trace_conservative_ref_and_enqueue_globally(ref, data, 0); +} + +static inline struct gc_conservative_ref +load_conservative_ref(uintptr_t addr) { + GC_ASSERT((addr & (sizeof(uintptr_t) - 1)) == 0); + uintptr_t val; + memcpy(&val, (char*)addr, sizeof(uintptr_t)); + return gc_conservative_ref(val); +} + +static inline void +trace_conservative_edges(uintptr_t low, + 
uintptr_t high, + void (*trace)(struct gc_conservative_ref, void *), + void *data) { + GC_ASSERT(low == align_down(low, sizeof(uintptr_t))); + GC_ASSERT(high == align_down(high, sizeof(uintptr_t))); + for (uintptr_t addr = low; addr < high; addr += sizeof(uintptr_t)) + trace(load_conservative_ref(addr), data); +} + +static void +mark_and_globally_enqueue_mutator_conservative_roots(uintptr_t low, + uintptr_t high, + void *data) { + trace_conservative_edges(low, high, + gc_mutator_conservative_roots_may_be_interior() + ? trace_possibly_interior_conservative_ref_and_enqueue_globally + : trace_conservative_ref_and_enqueue_globally, + data); +} + +static void +mark_and_globally_enqueue_heap_conservative_roots(uintptr_t low, + uintptr_t high, + void *data) { + trace_conservative_edges(low, high, + trace_conservative_ref_and_enqueue_globally, + data); +} + +static void +mark_and_locally_enqueue_mutator_conservative_roots(uintptr_t low, + uintptr_t high, + void *data) { + trace_conservative_edges(low, high, + gc_mutator_conservative_roots_may_be_interior() + ? trace_possibly_interior_conservative_ref_and_enqueue_locally + : trace_conservative_ref_and_enqueue_locally, + data); +} + +static inline void +trace_mutator_conservative_roots(struct gc_mutator *mut, + void (*trace_range)(uintptr_t low, + uintptr_t high, + void *data), + void *data) { + if (gc_has_mutator_conservative_roots()) + gc_stack_visit(&mut->stack, trace_range, data); } // Mark the roots of a mutator that is stopping for GC. We can't // enqueue them directly, so we send them to the controller in a buffer. static void trace_stopping_mutator_roots(struct gc_mutator *mut) { GC_ASSERT(mutator_should_mark_while_stopping(mut)); - /* trace_mutator_conservative_roots(mut, - mark_and_locally_enqueue_conservative_roots, + mark_and_locally_enqueue_mutator_conservative_roots, mut); - */ - gc_trace_precise_mutator_roots(mut->roots, trace_and_enqueue_locally, mut); -} - -static void trace_precise_mutator_roots_with_lock(struct gc_mutator *mut) { - gc_trace_precise_mutator_roots(mut->roots, trace_and_enqueue_globally, - mutator_heap(mut)); + gc_trace_mutator_roots(mut->roots, trace_and_enqueue_locally, mut); } static void trace_mutator_conservative_roots_with_lock(struct gc_mutator *mut) { - /* trace_mutator_conservative_roots(mut, - mark_and_globally_enqueue_conservative_roots, + mark_and_globally_enqueue_mutator_conservative_roots, mutator_heap(mut)); - */ } static void trace_mutator_roots_with_lock(struct gc_mutator *mut) { trace_mutator_conservative_roots_with_lock(mut); - trace_precise_mutator_roots_with_lock(mut); + gc_trace_mutator_roots(mut->roots, trace_and_enqueue_globally, + mutator_heap(mut)); } static void trace_mutator_roots_with_lock_before_stop(struct gc_mutator *mut) { @@ -965,12 +1115,11 @@ static void finish_sweeping_in_block(struct gc_mutator *mut); static void trace_mutator_conservative_roots_after_stop(struct gc_heap *heap) { int active_mutators_already_marked = heap_should_mark_while_stopping(heap); - if (!active_mutators_already_marked) { + if (!active_mutators_already_marked) for (struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); mut; mut = mut->next) trace_mutator_conservative_roots_with_lock(mut); - } for (struct gc_mutator *mut = heap->deactivated_mutators; mut; @@ -978,7 +1127,7 @@ static void trace_mutator_conservative_roots_after_stop(struct gc_heap *heap) { trace_mutator_conservative_roots_with_lock(mut); } -static void trace_precise_mutator_roots_after_stop(struct gc_heap *heap) { +static void 
trace_mutator_roots_after_stop(struct gc_heap *heap) { struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); int active_mutators_already_marked = heap_should_mark_while_stopping(heap); while (mut) { @@ -988,7 +1137,7 @@ static void trace_precise_mutator_roots_after_stop(struct gc_heap *heap) { tracer_enqueue_roots(&heap->tracer, mut->mark_buf.objects, mut->mark_buf.size); else - trace_precise_mutator_roots_with_lock(mut); + trace_mutator_roots_with_lock(mut); // Also unlink mutator_trace_list chain. struct gc_mutator *next = mut->next; mut->next = NULL; @@ -998,20 +1147,14 @@ static void trace_precise_mutator_roots_after_stop(struct gc_heap *heap) { for (struct gc_mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) { finish_sweeping_in_block(mut); - trace_precise_mutator_roots_with_lock(mut); + trace_mutator_roots_with_lock(mut); } } -static void trace_precise_global_roots(struct gc_heap *heap) { - gc_trace_precise_heap_roots(heap->roots, trace_and_enqueue_globally, heap); -} - static void trace_global_conservative_roots(struct gc_heap *heap) { - /* if (gc_has_global_conservative_roots()) gc_platform_visit_global_conservative_roots - (mark_and_globally_enqueue_conservative_roots, heap); - */ + (mark_and_globally_enqueue_heap_conservative_roots, heap); } static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { @@ -1456,9 +1599,9 @@ static void trace_pinned_roots_after_stop(struct gc_heap *heap) { trace_conservative_roots_after_stop(heap); } -static void trace_precise_roots_after_stop(struct gc_heap *heap) { - trace_precise_mutator_roots_after_stop(heap); - trace_precise_global_roots(heap); +static void trace_roots_after_stop(struct gc_heap *heap) { + trace_mutator_roots_after_stop(heap); + gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap); trace_generational_roots(heap); } @@ -1494,7 +1637,7 @@ static void collect(struct gc_mutator *mut) { detect_out_of_memory(heap); trace_pinned_roots_after_stop(heap); prepare_for_evacuation(heap); - trace_precise_roots_after_stop(heap); + trace_roots_after_stop(heap); tracer_trace(heap); tracer_release(heap); mark_space_finish_gc(space, gc_kind); From 703bb30e19f39c11b89b843fa042c05ccb47903d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 2 Oct 2022 09:30:44 +0200 Subject: [PATCH 154/403] add conservative makefile targets --- Makefile | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1770f1a3c..1f1e8ba8e 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ TESTS=quads mt-gcbench # MT_GCBench MT_GCBench2 -COLLECTORS=bdw semi whippet parallel-whippet generational-whippet parallel-generational-whippet +COLLECTORS=bdw semi whippet conservative-whippet parallel-whippet conservative-parallel-whippet generational-whippet conservative-generational-whippet parallel-generational-whippet conservative-parallel-generational-whippet CC=gcc CFLAGS=-Wall -O2 -g -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused -DNDEBUG @@ -35,21 +35,41 @@ whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h asse whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c +conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_PRECISE=0 -include $*-embedder.h -o $@ -c whippet.c +conservative-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_PRECISE=0 -include whippet-attrs.h -o $@ -c $*.c + parallel-whippet-%-gc.o: whippet.c 
%-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c +conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=0 -include $*-embedder.h -o $@ -c whippet.c +conservative-parallel-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=0 -include whippet-attrs.h -o $@ -c $*.c + generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c +conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=0 -include $*-embedder.h -o $@ -c whippet.c +conservative-generational-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=0 -include whippet-attrs.h -o $@ -c $*.c + parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c +conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=0 -include $*-embedder.h -o $@ -c whippet.c +conservative-parallel-generational-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=0 -include whippet-attrs.h -o $@ -c $*.c + %: %.o %-gc.o gc-platform.o gc-stack.o $(CC) $(LDFLAGS) $($*_LDFLAGS) -o $@ $^ From 5e986e84e947657bc0b3c788fe755037ee7c4ec4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 3 Oct 2022 16:12:06 +0200 Subject: [PATCH 155/403] Update README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bab4d80a1..ced078698 100644 --- a/README.md +++ b/README.md @@ -142,9 +142,9 @@ large majority of use cases. ### Missing features before Guile can use Whippet - - [ ] Pinning - - [ ] Conservative stacks - - [ ] Conservative data segments + - [X] Pinning + - [X] Conservative stacks + - [X] Conservative data segments - [ ] Heap growth/shrinking - [ ] Debugging/tracing - [ ] Finalizers From 053dbf0b61d201f738e1d29e1d5318b333424e07 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 25 Oct 2022 14:25:55 +0200 Subject: [PATCH 156/403] Pass heap to tracer functions This will allow conservative intra-heap edges. Hopefully no overhead? 
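
Concretely, every visit callback now takes the heap as an explicit
argument instead of fishing it out of the opaque data pointer, which
frees that pointer for tracer-local state (a mutator's mark buffer, a
worker's local queue) and gives conservative intra-heap tracing the
heap it will need to find an object's extent.  A minimal sketch of the
shape before and after; the function name here is invented, the real
signatures are in the diff below:

    // Before: the heap rode along in the void* payload.
    static void visit_edge(struct gc_edge edge, void *data) {
      struct gc_heap *heap = data;
      if (trace_edge(heap, edge))
        tracer_enqueue_root(heap_tracer(heap), gc_edge_ref(edge));
    }

    // After: the heap is threaded through explicitly.
    static void visit_edge(struct gc_edge edge, struct gc_heap *heap,
                           void *data) {
      if (trace_edge(heap, edge))
        tracer_enqueue_root(heap_tracer(heap), gc_edge_ref(edge));
    }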
--- conservative-roots-embedder.h | 4 ++ gc-embedder-api.h | 11 +++++- gc-platform-gnu-linux.c | 11 ++++-- gc-platform.h | 4 ++ gc-stack.c | 10 +++-- gc-stack.h | 4 ++ mt-gcbench-embedder.h | 26 ++++++++----- parallel-tracer.h | 19 +++++----- precise-roots-embedder.h | 12 ++++-- quads-embedder.h | 13 +++++-- semi.c | 17 +++++---- serial-tracer.h | 11 +++--- simple-gc-embedder.h | 4 +- whippet.c | 69 +++++++++++++++++++++-------------- 14 files changed, 141 insertions(+), 74 deletions(-) diff --git a/conservative-roots-embedder.h b/conservative-roots-embedder.h index 15447a2c4..c8004f00c 100644 --- a/conservative-roots-embedder.h +++ b/conservative-roots-embedder.h @@ -36,13 +36,17 @@ gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref ref, static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, void *trace_data), + struct gc_heap *heap, void *trace_data) { } static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, void *trace_data), + struct gc_heap *heap, void *trace_data) { } diff --git a/gc-embedder-api.h b/gc-embedder-api.h index 2d74ed0a4..3b3682a71 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -12,6 +12,7 @@ struct gc_mutator_roots; struct gc_heap_roots; struct gc_atomic_forward; +struct gc_heap; GC_EMBEDDER_API inline int gc_has_mutator_conservative_roots(void); GC_EMBEDDER_API inline int gc_has_global_conservative_roots(void); @@ -23,18 +24,24 @@ GC_EMBEDDER_API inline int gc_conservative_ref_might_be_a_heap_object(struct gc_ int possibly_interior); GC_EMBEDDER_API inline void gc_trace_object(struct gc_ref ref, - void (*trace_edge)(struct gc_edge edge, - void *trace_data), + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, void *trace_data, size_t *size) GC_ALWAYS_INLINE; GC_EMBEDDER_API inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, void *trace_data), + struct gc_heap *heap, void *trace_data); GC_EMBEDDER_API inline void gc_trace_heap_roots(struct gc_heap_roots *roots, void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, void *trace_data), + struct gc_heap *heap, void *trace_data); GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref); diff --git a/gc-platform-gnu-linux.c b/gc-platform-gnu-linux.c index b0b9f0983..66e2a73df 100644 --- a/gc-platform-gnu-linux.c +++ b/gc-platform-gnu-linux.c @@ -11,7 +11,7 @@ #include "debug.h" #include "gc-assert.h" #include "gc-inline.h" -//#include "gc-stack.h" +#include "gc-platform.h" void gc_platform_init(void) { // Nothing to do. 
@@ -59,7 +59,8 @@ uintptr_t gc_platform_current_thread_stack_base(void) { } struct visit_data { - void (*f)(uintptr_t start, uintptr_t end, void *data); + void (*f)(uintptr_t start, uintptr_t end, struct gc_heap *heap, void *data); + struct gc_heap *heap; void *data; }; @@ -85,7 +86,7 @@ static int visit_roots(struct dl_phdr_info *info, size_t size, void *data) { uintptr_t end = start + p->p_memsz; DEBUG("found roots for '%s': [%p,%p)\n", object_name, (void*)start, (void*)end); - visit_data->f(start, end, visit_data->data); + visit_data->f(start, end, visit_data->heap, visit_data->data); } } @@ -94,8 +95,10 @@ static int visit_roots(struct dl_phdr_info *info, size_t size, void *data) { void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start, uintptr_t end, + struct gc_heap*, void *data), + struct gc_heap *heap, void *data) { - struct visit_data visit_data = { f, data }; + struct visit_data visit_data = { f, heap, data }; dl_iterate_phdr(visit_roots, &visit_data); } diff --git a/gc-platform.h b/gc-platform.h index acc0096f9..b22787d19 100644 --- a/gc-platform.h +++ b/gc-platform.h @@ -9,12 +9,16 @@ #include "gc-visibility.h" +struct gc_heap; + GC_INTERNAL void gc_platform_init(void); GC_INTERNAL uintptr_t gc_platform_current_thread_stack_base(void); GC_INTERNAL void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start, uintptr_t end, + struct gc_heap *heap, void *data), + struct gc_heap *heap, void *data); #endif // GC_PLATFORM_H diff --git a/gc-stack.c b/gc-stack.c index bc8c9e64f..54c6fdb0c 100644 --- a/gc-stack.c +++ b/gc-stack.c @@ -63,7 +63,9 @@ void* gc_call_with_stack_addr(void* (*f)(struct gc_stack_addr *base, } void gc_stack_visit(struct gc_stack *stack, - void (*visit)(uintptr_t low, uintptr_t high, void *data), + void (*visit)(uintptr_t low, uintptr_t high, + struct gc_heap *heap, void *data), + struct gc_heap *heap, void *data) { { uintptr_t low = (uintptr_t)stack->registers; @@ -71,7 +73,7 @@ void gc_stack_visit(struct gc_stack *stack, uintptr_t high = low + sizeof(jmp_buf); DEBUG("found mutator register roots for %p: [%p,%p)\n", stack, (void*)low, (void*)high); - visit(low, high, data); + visit(low, high, heap, data); } if (0 HOTTER_THAN 1) { @@ -79,12 +81,12 @@ void gc_stack_visit(struct gc_stack *stack, (void*)stack->hot.addr, (void*)stack->cold.addr); visit(align_up(stack->hot.addr, sizeof(uintptr_t)), align_down(stack->cold.addr, sizeof(uintptr_t)), - data); + heap, data); } else { DEBUG("found mutator stack roots for %p: [%p,%p)\n", stack, (void*)stack->cold.addr, (void*)stack->hot.addr); visit(align_up(stack->cold.addr, sizeof(uintptr_t)), align_down(stack->hot.addr, sizeof(uintptr_t)), - data); + heap, data); } } diff --git a/gc-stack.h b/gc-stack.h index fa228b210..15df9df6d 100644 --- a/gc-stack.h +++ b/gc-stack.h @@ -18,12 +18,16 @@ struct gc_stack { jmp_buf registers; }; +struct gc_heap; + GC_INTERNAL void gc_stack_init(struct gc_stack *stack, struct gc_stack_addr *base); GC_INTERNAL void gc_stack_capture_hot(struct gc_stack *stack); GC_INTERNAL void gc_stack_visit(struct gc_stack *stack, void (*visit)(uintptr_t low, uintptr_t high, + struct gc_heap *heap, void *data), + struct gc_heap *heap, void *data); #endif // GC_STACK_H diff --git a/mt-gcbench-embedder.h b/mt-gcbench-embedder.h index 3162a8bf3..1ac42a327 100644 --- a/mt-gcbench-embedder.h +++ b/mt-gcbench-embedder.h @@ -3,10 +3,15 @@ #include "mt-gcbench-types.h" +struct gc_heap; + #define DEFINE_METHODS(name, Name, NAME) \ static inline size_t name##_size(Name *obj) 
GC_ALWAYS_INLINE; \ static inline void visit_##name##_fields(Name *obj,\ - void (*visit)(struct gc_edge edge, void *visit_data), \ + void (*visit)(struct gc_edge edge, \ + struct gc_heap *heap, \ + void *visit_data), \ + struct gc_heap *heap, \ void *visit_data) GC_ALWAYS_INLINE; FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) #undef DEFINE_METHODS @@ -22,20 +27,23 @@ static inline size_t hole_size(Hole *hole) { } static inline void visit_node_fields(Node *node, - void (*visit)(struct gc_edge edge, void *visit_data), - void *visit_data) { - visit(gc_edge(&node->left), visit_data); - visit(gc_edge(&node->right), visit_data); + void (*visit)(struct gc_edge edge, struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, void *visit_data) { + visit(gc_edge(&node->left), heap, visit_data); + visit(gc_edge(&node->right), heap, visit_data); } static inline void visit_double_array_fields(DoubleArray *obj, - void (*visit)(struct gc_edge edge, void *visit_data), - void *visit_data) { + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, void *visit_data), + struct gc_heap *heap, void *visit_data) { } static inline void visit_hole_fields(Hole *obj, - void (*visit)(struct gc_edge edge, void *visit_data), - void *visit_data) { + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, void *visit_data), + struct gc_heap *heap, void *visit_data) { #if GC_PRECISE GC_CRASH(); #endif diff --git a/parallel-tracer.h b/parallel-tracer.h index bcc7910c2..df6cc89ae 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -320,7 +320,6 @@ struct tracer { struct local_tracer { struct trace_worker *worker; struct trace_deque *share_deque; - struct gc_heap *heap; struct local_trace_queue local; }; @@ -449,8 +448,10 @@ static void tracer_release(struct gc_heap *heap) { trace_deque_release(&tracer->workers[i].deque); } -static inline void tracer_visit(struct gc_edge edge, void *trace_data) GC_ALWAYS_INLINE; -static inline void trace_one(struct gc_ref ref, void *trace_data) GC_ALWAYS_INLINE; +static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; +static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) GC_ALWAYS_INLINE; @@ -462,9 +463,9 @@ tracer_share(struct local_tracer *trace) { } static inline void -tracer_visit(struct gc_edge edge, void *trace_data) { - struct local_tracer *trace = trace_data; - if (trace_edge(trace->heap, edge)) { +tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { + if (trace_edge(heap, edge)) { + struct local_tracer *trace = trace_data; if (local_trace_queue_full(&trace->local)) tracer_share(trace); local_trace_queue_push(&trace->local, gc_edge_ref(edge)); @@ -544,8 +545,8 @@ trace_worker_check_termination(struct trace_worker *worker, static struct gc_ref trace_worker_steal(struct local_tracer *trace) { - struct tracer *tracer = heap_tracer(trace->heap); struct trace_worker *worker = trace->worker; + struct tracer *tracer = heap_tracer(worker->heap); // It could be that the worker's local trace queue has simply // overflowed. 
In that case avoid contention by trying to pop @@ -573,7 +574,7 @@ trace_worker_trace(struct trace_worker *worker) { struct local_tracer trace; trace.worker = worker; trace.share_deque = &worker->deque; - trace.heap = worker->heap; + struct gc_heap *heap = worker->heap; local_trace_queue_init(&trace.local); size_t n = 0; @@ -587,7 +588,7 @@ trace_worker_trace(struct trace_worker *worker) { if (!gc_ref_is_heap_object(ref)) break; } - trace_one(ref, &trace); + trace_one(ref, heap, &trace); n++; } DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); diff --git a/precise-roots-embedder.h b/precise-roots-embedder.h index bde6be36e..94192cb51 100644 --- a/precise-roots-embedder.h +++ b/precise-roots-embedder.h @@ -30,26 +30,32 @@ gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref ref, static inline void visit_roots(struct handle *roots, void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, void *trace_data), + struct gc_heap *heap, void *trace_data) { for (struct handle *h = roots; h; h = h->next) - trace_edge(gc_edge(&h->v), trace_data); + trace_edge(gc_edge(&h->v), heap, trace_data); } static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, void *trace_data), + struct gc_heap *heap, void *trace_data) { if (roots) - visit_roots(roots->roots, trace_edge, trace_data); + visit_roots(roots->roots, trace_edge, heap, trace_data); } static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, void *trace_data), + struct gc_heap *heap, void *trace_data) { if (roots) - visit_roots(roots->roots, trace_edge, trace_data); + visit_roots(roots->roots, trace_edge, heap, trace_data); } #endif // PRECISE_ROOTS_EMBEDDER_H diff --git a/quads-embedder.h b/quads-embedder.h index 714415dd0..1d9d3f71c 100644 --- a/quads-embedder.h +++ b/quads-embedder.h @@ -5,10 +5,15 @@ #include "quads-types.h" +struct gc_heap; + #define DEFINE_METHODS(name, Name, NAME) \ static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \ static inline void visit_##name##_fields(Name *obj,\ - void (*visit)(struct gc_edge edge, void *visit_data), \ + void (*visit)(struct gc_edge edge, \ + struct gc_heap *heap, \ + void *visit_data), \ + struct gc_heap *heap, \ void *visit_data) GC_ALWAYS_INLINE; FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) #undef DEFINE_METHODS @@ -19,10 +24,12 @@ static inline size_t quad_size(Quad *obj) { static inline void visit_quad_fields(Quad *quad, - void (*visit)(struct gc_edge edge, void *visit_data), + void (*visit)(struct gc_edge edge, struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, void *visit_data) { for (size_t i = 0; i < 4; i++) - visit(gc_edge(&quad->kids[i]), visit_data); + visit(gc_edge(&quad->kids[i]), heap, visit_data); } #include "simple-gc-embedder.h" diff --git a/semi.c b/semi.c index 3ad765416..2c1eae600 100644 --- a/semi.c +++ b/semi.c @@ -63,7 +63,7 @@ static uintptr_t align_up(uintptr_t addr, size_t align) { static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) GC_NEVER_INLINE; -static void visit(struct gc_edge edge, void *visit_data); +static void trace(struct gc_edge edge, struct gc_heap *heap, void *visit_data); static int semi_space_steal_pages(struct semi_space *space, size_t npages) { size_t stolen_pages = space->stolen_pages + npages; @@ -103,7 +103,7 @@ static void flip(struct semi_space *space) { 
static struct gc_ref copy(struct semi_space *space, struct gc_ref ref) { size_t size; - gc_trace_object(ref, NULL, NULL, &size); + gc_trace_object(ref, NULL, NULL, NULL, &size); struct gc_ref new_ref = gc_ref(space->hp); memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(ref), size); gc_object_forward_nonatomic(ref, new_ref); @@ -113,7 +113,7 @@ static struct gc_ref copy(struct semi_space *space, struct gc_ref ref) { static uintptr_t scan(struct gc_heap *heap, struct gc_ref grey) { size_t size; - gc_trace_object(grey, visit, heap, &size); + gc_trace_object(grey, trace, heap, NULL, &size); return gc_ref_value(grey) + align_up(size, GC_ALIGNMENT); } @@ -131,7 +131,7 @@ static void visit_large_object_space(struct gc_heap *heap, struct large_object_space *space, struct gc_ref ref) { if (large_object_space_copy(space, ref)) - gc_trace_object(ref, visit, heap, NULL); + gc_trace_object(ref, trace, heap, NULL, NULL); } static int semi_space_contains(struct semi_space *space, struct gc_ref ref) { @@ -139,8 +139,7 @@ static int semi_space_contains(struct semi_space *space, struct gc_ref ref) { return addr - space->base < space->size; } -static void visit(struct gc_edge edge, void *visit_data) { - struct gc_heap *heap = visit_data; +static void visit(struct gc_edge edge, struct gc_heap *heap) { struct gc_ref ref = gc_edge_ref(edge); if (!gc_ref_is_heap_object(ref)) return; @@ -152,6 +151,10 @@ static void visit(struct gc_edge edge, void *visit_data) { GC_CRASH(); } +static void trace(struct gc_edge edge, struct gc_heap *heap, void *visit_data) { + return visit(edge, heap); +} + static void collect(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); struct semi_space *semi = heap_semi_space(heap); @@ -161,7 +164,7 @@ static void collect(struct gc_mutator *mut) { flip(semi); uintptr_t grey = semi->hp; if (mut->roots) - gc_trace_mutator_roots(mut->roots, visit, heap); + gc_trace_mutator_roots(mut->roots, trace, heap, NULL); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); while(grey < semi->hp) grey = scan(heap, gc_ref(grey)); diff --git a/serial-tracer.h b/serial-tracer.h index b4194c160..7c0bdcca9 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -135,8 +135,10 @@ static void tracer_release(struct gc_heap *heap) { trace_queue_release(&heap_tracer(heap)->queue); } -static inline void tracer_visit(struct gc_edge edge, void *trace_data) GC_ALWAYS_INLINE; -static inline void trace_one(struct gc_ref ref, void *trace_data) GC_ALWAYS_INLINE; +static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; +static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) GC_ALWAYS_INLINE; @@ -150,8 +152,7 @@ tracer_enqueue_roots(struct tracer *tracer, struct gc_ref *objs, trace_queue_push_many(&tracer->queue, objs, count); } static inline void -tracer_visit(struct gc_edge edge, void *trace_data) { - struct gc_heap *heap = trace_data; +tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { if (trace_edge(heap, edge)) tracer_enqueue_root(heap_tracer(heap), gc_edge_ref(edge)); } @@ -161,7 +162,7 @@ tracer_trace(struct gc_heap *heap) { struct gc_ref obj = trace_queue_pop(&heap_tracer(heap)->queue); if (!gc_ref_is_heap_object(obj)) break; - trace_one(obj, heap); + trace_one(obj, heap, NULL); } while (1); } diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h index 457d9b09e..7b691acfa 
100644 --- a/simple-gc-embedder.h +++ b/simple-gc-embedder.h @@ -5,7 +5,9 @@ static inline void gc_trace_object(struct gc_ref ref, void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, void *trace_data), + struct gc_heap *heap, void *trace_data, size_t *size) { switch (tag_live_alloc_kind(*tag_word(ref))) { @@ -13,7 +15,7 @@ static inline void gc_trace_object(struct gc_ref ref, case ALLOC_KIND_##NAME: \ if (trace_edge) \ visit_##name##_fields(gc_ref_heap_object(ref), trace_edge, \ - trace_data); \ + heap, trace_data); \ if (size) \ *size = name##_size(gc_ref_heap_object(ref)); \ break; diff --git a/whippet.c b/whippet.c index 687d6e307..5dbfc80ec 100644 --- a/whippet.c +++ b/whippet.c @@ -710,8 +710,9 @@ static inline struct gc_ref trace_conservative_ref(struct gc_heap *heap, ref, possibly_interior); } -static inline void trace_one(struct gc_ref ref, void *mark_data) { - gc_trace_object(ref, tracer_visit, mark_data, NULL); +static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, + void *mark_data) { + gc_trace_object(ref, tracer_visit, heap, mark_data, NULL); } static int heap_has_multiple_mutators(struct gc_heap *heap) { @@ -959,55 +960,60 @@ void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { heap->roots = roots; } -static void trace_and_enqueue_locally(struct gc_edge edge, void *data) { +static void trace_and_enqueue_locally(struct gc_edge edge, + struct gc_heap *heap, + void *data) { struct gc_mutator *mut = data; - if (trace_edge(mutator_heap(mut), edge)) + if (trace_edge(heap, edge)) mutator_mark_buf_push(&mut->mark_buf, gc_edge_ref(edge)); } static inline void do_trace_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, + struct gc_heap *heap, void *data, int possibly_interior) { struct gc_mutator *mut = data; - struct gc_ref object = trace_conservative_ref(mutator_heap(mut), ref, - possibly_interior); + struct gc_ref object = trace_conservative_ref(heap, ref, possibly_interior); if (gc_ref_is_heap_object(object)) mutator_mark_buf_push(&mut->mark_buf, object); } static void trace_possibly_interior_conservative_ref_and_enqueue_locally - (struct gc_conservative_ref ref, void *data) { - return do_trace_conservative_ref_and_enqueue_locally(ref, data, 1); + (struct gc_conservative_ref ref, struct gc_heap *heap, void *data) { + return do_trace_conservative_ref_and_enqueue_locally(ref, heap, data, 1); } static void trace_conservative_ref_and_enqueue_locally - (struct gc_conservative_ref ref, void *data) { - return do_trace_conservative_ref_and_enqueue_locally(ref, data, 0); + (struct gc_conservative_ref ref, struct gc_heap *heap, void *data) { + return do_trace_conservative_ref_and_enqueue_locally(ref, heap, data, 0); } -static void trace_and_enqueue_globally(struct gc_edge edge, void *data) { - struct gc_heap *heap = data; +static void trace_and_enqueue_globally(struct gc_edge edge, + struct gc_heap *heap, + void *unused) { if (trace_edge(heap, edge)) tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge)); } static inline void do_trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, + struct gc_heap *heap, void *data, int possibly_interior) { - struct gc_heap *heap = data; struct gc_ref object = trace_conservative_ref(heap, ref, possibly_interior); if (gc_ref_is_heap_object(object)) tracer_enqueue_root(&heap->tracer, object); } static void trace_possibly_interior_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, + struct gc_heap *heap, void *data) { - return 
do_trace_conservative_ref_and_enqueue_globally(ref, data, 1); + return do_trace_conservative_ref_and_enqueue_globally(ref, heap, data, 1); } static void trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, + struct gc_heap *heap, void *data) { - return do_trace_conservative_ref_and_enqueue_globally(ref, data, 0); + return do_trace_conservative_ref_and_enqueue_globally(ref, heap, data, 0); } static inline struct gc_conservative_ref @@ -1021,75 +1027,84 @@ load_conservative_ref(uintptr_t addr) { static inline void trace_conservative_edges(uintptr_t low, uintptr_t high, - void (*trace)(struct gc_conservative_ref, void *), + void (*trace)(struct gc_conservative_ref, + struct gc_heap *, void *), + struct gc_heap *heap, void *data) { GC_ASSERT(low == align_down(low, sizeof(uintptr_t))); GC_ASSERT(high == align_down(high, sizeof(uintptr_t))); for (uintptr_t addr = low; addr < high; addr += sizeof(uintptr_t)) - trace(load_conservative_ref(addr), data); + trace(load_conservative_ref(addr), heap, data); } static void mark_and_globally_enqueue_mutator_conservative_roots(uintptr_t low, uintptr_t high, + struct gc_heap *heap, void *data) { trace_conservative_edges(low, high, gc_mutator_conservative_roots_may_be_interior() ? trace_possibly_interior_conservative_ref_and_enqueue_globally : trace_conservative_ref_and_enqueue_globally, - data); + heap, data); } static void mark_and_globally_enqueue_heap_conservative_roots(uintptr_t low, uintptr_t high, + struct gc_heap *heap, void *data) { trace_conservative_edges(low, high, trace_conservative_ref_and_enqueue_globally, - data); + heap, data); } static void mark_and_locally_enqueue_mutator_conservative_roots(uintptr_t low, uintptr_t high, + struct gc_heap *heap, void *data) { trace_conservative_edges(low, high, gc_mutator_conservative_roots_may_be_interior() ? trace_possibly_interior_conservative_ref_and_enqueue_locally : trace_conservative_ref_and_enqueue_locally, - data); + heap, data); } static inline void trace_mutator_conservative_roots(struct gc_mutator *mut, void (*trace_range)(uintptr_t low, uintptr_t high, + struct gc_heap *heap, void *data), + struct gc_heap *heap, void *data) { if (gc_has_mutator_conservative_roots()) - gc_stack_visit(&mut->stack, trace_range, data); + gc_stack_visit(&mut->stack, trace_range, heap, data); } // Mark the roots of a mutator that is stopping for GC. We can't // enqueue them directly, so we send them to the controller in a buffer. 
static void trace_stopping_mutator_roots(struct gc_mutator *mut) { GC_ASSERT(mutator_should_mark_while_stopping(mut)); + struct gc_heap *heap = mutator_heap(mut); trace_mutator_conservative_roots(mut, mark_and_locally_enqueue_mutator_conservative_roots, - mut); - gc_trace_mutator_roots(mut->roots, trace_and_enqueue_locally, mut); + heap, mut); + gc_trace_mutator_roots(mut->roots, trace_and_enqueue_locally, heap, mut); } static void trace_mutator_conservative_roots_with_lock(struct gc_mutator *mut) { trace_mutator_conservative_roots(mut, mark_and_globally_enqueue_mutator_conservative_roots, - mutator_heap(mut)); + mutator_heap(mut), + NULL); } static void trace_mutator_roots_with_lock(struct gc_mutator *mut) { trace_mutator_conservative_roots_with_lock(mut); gc_trace_mutator_roots(mut->roots, trace_and_enqueue_globally, - mutator_heap(mut)); + mutator_heap(mut), NULL); } static void trace_mutator_roots_with_lock_before_stop(struct gc_mutator *mut) { @@ -1154,7 +1169,7 @@ static void trace_mutator_roots_after_stop(struct gc_heap *heap) { static void trace_global_conservative_roots(struct gc_heap *heap) { if (gc_has_global_conservative_roots()) gc_platform_visit_global_conservative_roots - (mark_and_globally_enqueue_heap_conservative_roots, heap); + (mark_and_globally_enqueue_heap_conservative_roots, heap, NULL); } static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { @@ -1601,7 +1616,7 @@ static void trace_pinned_roots_after_stop(struct gc_heap *heap) { static void trace_roots_after_stop(struct gc_heap *heap) { trace_mutator_roots_after_stop(heap); - gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap); + gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap, NULL); trace_generational_roots(heap); } From 910b62af8f8264c1a23fe13833e4c346f772af6f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 26 Oct 2022 10:37:55 +0200 Subject: [PATCH 157/403] Add conservative heap tracing (not just roots) Also accelerate mark_space_live_object_granules. 
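
The acceleration reuses the sweeper's word-at-a-time metadata scan:
rather than testing one metadata byte per iteration for
METADATA_BYTE_END, mark_space_live_object_granules now calls next_mark,
which loads eight metadata bytes as a 64-bit word, masks it against a
broadcast of the byte being sought, and counts trailing zero bytes to
locate the first hit.  A stand-alone sketch of the trick; the helper
name is invented for the example, and it assumes a little-endian
target, whereas the patch byte-swaps on big-endian and aligns the
pointer first:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    // Broadcast a byte into all eight lanes of a 64-bit word.
    static uint64_t broadcast_byte(uint8_t byte) {
      return byte * 0x0101010101010101ULL;
    }

    // Index of the first byte in the 8-byte group at p with any bit of
    // mask set, or 8 if none.
    static size_t first_masked_byte(const uint8_t *p, uint64_t mask) {
      uint64_t word;
      memcpy(&word, p, 8);
      word &= mask;
      return word ? (size_t)(__builtin_ctzll(word) / 8) : 8;
    }

    int main(void) {
      uint8_t meta[16] = { 0 };
      meta[11] = 0x10;  // stand-in for the END byte of a 12-granule object
      uint64_t mask = broadcast_byte(0x10);
      size_t n;
      for (n = 0; n < sizeof meta; n += 8) {
        size_t hit = first_masked_byte(meta + n, mask);
        if (hit < 8) { n += hit; break; }
      }
      printf("object covers %zu granules\n", n + 1);  // prints 12
      return 0;
    }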
--- Makefile | 84 +++++++++++++++------ bdw.c | 14 +++- conservative-roots-embedder.h | 3 +- gc-config.h | 16 +++- large-object-space.h | 9 +++ mt-gcbench-embedder.h | 6 +- mt-gcbench.c | 5 +- parallel-tracer.h | 16 ++++ quads.c | 5 +- semi.c | 6 +- serial-tracer.h | 8 +- simple-gc-embedder.h | 6 +- whippet.c | 137 +++++++++++++++++++++------------- 13 files changed, 221 insertions(+), 94 deletions(-) diff --git a/Makefile b/Makefile index 1f1e8ba8e..3d9eed197 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,23 @@ TESTS=quads mt-gcbench # MT_GCBench MT_GCBench2 -COLLECTORS=bdw semi whippet conservative-whippet parallel-whippet conservative-parallel-whippet generational-whippet conservative-generational-whippet parallel-generational-whippet conservative-parallel-generational-whippet +COLLECTORS= \ + bdw \ + semi \ + \ + whippet \ + conservative-whippet \ + fully-conservative-whippet \ + \ + parallel-whippet \ + conservative-parallel-whippet \ + fully-conservative-parallel-whippet \ + \ + generational-whippet \ + conservative-generational-whippet \ + fully-conservative-generational-whippet \ + \ + parallel-generational-whippet \ + conservative-parallel-generational-whippet \ + fully-conservative-parallel-generational-whippet CC=gcc CFLAGS=-Wall -O2 -g -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused -DNDEBUG @@ -18,57 +36,77 @@ gc-platform.o: gc-platform.h gc-platform-$(PLATFORM).c gc-visibility.h gc-stack.o: gc-stack.c $(COMPILE) -o $@ -c $< -bdw-%-gc.o: semi.c %-embedder.h %.c - $(COMPILE) `pkg-config --cflags bdw-gc` -include $*-embedder.h -o $@ -c bdw.c -bdw-%.o: semi.c %.c - $(COMPILE) -include bdw-attrs.h -o $@ -c $*.c +bdw-%-gc.o: bdw.c %-embedder.h %.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include $*-embedder.h -o $@ -c bdw.c +bdw-%.o: bdw.c %.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include bdw-attrs.h -o $@ -c $*.c bdw-%: bdw-%.o bdw-%-gc.o gc-stack.o gc-platform.o $(CC) $(LDFLAGS) `pkg-config --libs bdw-gc` -o $@ $^ semi-%-gc.o: semi.c %-embedder.h large-object-space.h assert.h debug.h %.c - $(COMPILE) -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c semi.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c semi.c semi-%.o: semi.c %.c - $(COMPILE) -DGC_PRECISE=1 -include semi-attrs.h -o $@ -c $*.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include semi-attrs.h -o $@ -c $*.c whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PRECISE=0 -include $*-embedder.h -o $@ -c whippet.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c conservative-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PRECISE=0 -include whippet-attrs.h -o $@ -c $*.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c + +fully-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c 
+fully-conservative-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c parallel-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=0 -include $*-embedder.h -o $@ -c whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c conservative-parallel-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE=0 -include whippet-attrs.h -o $@ -c $*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c + +fully-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c +fully-conservative-parallel-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include whippet-attrs.h -o $@ -c $*.c generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=0 -include $*-embedder.h -o $@ -c whippet.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c conservative-generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE=0 -include whippet-attrs.h -o $@ -c $*.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c + +fully-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c +fully-conservative-generational-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include $*-embedder.h -o $@ -c whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include 
$*-embedder.h -o $@ -c whippet.c parallel-generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=1 -include whippet-attrs.h -o $@ -c $*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=0 -include $*-embedder.h -o $@ -c whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c conservative-parallel-generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE=0 -include whippet-attrs.h -o $@ -c $*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c + +fully-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c +fully-conservative-parallel-generational-whippet-%.o: whippet.c %.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c %: %.o %-gc.o gc-platform.o gc-stack.o $(CC) $(LDFLAGS) $($*_LDFLAGS) -o $@ $^ diff --git a/bdw.c b/bdw.c index caf161e0b..caf53b69f 100644 --- a/bdw.c +++ b/bdw.c @@ -8,12 +8,20 @@ #include "bdw-attrs.h" -#if GC_PRECISE +#if GC_PRECISE_ROOTS #error bdw-gc is a conservative collector -#else -#include "conservative-roots-embedder.h" #endif +#if !GC_CONSERVATIVE_ROOTS +#error bdw-gc is a conservative collector +#endif + +#if !GC_CONSERVATIVE_TRACE +#error bdw-gc is a conservative collector +#endif + +#include "conservative-roots-embedder.h" + // When pthreads are used, let `libgc' know about it and redirect // allocation calls such as `GC_MALLOC ()' to (contention-free, faster) // thread-local allocation. diff --git a/conservative-roots-embedder.h b/conservative-roots-embedder.h index c8004f00c..4d2c4fa7b 100644 --- a/conservative-roots-embedder.h +++ b/conservative-roots-embedder.h @@ -1,6 +1,7 @@ #ifndef CONSERVATIVE_ROOTS_EMBEDDER_H #define CONSERVATIVE_ROOTS_EMBEDDER_H +#include "gc-config.h" #include "gc-embedder-api.h" static inline int gc_has_mutator_conservative_roots(void) { @@ -13,7 +14,7 @@ static inline int gc_has_global_conservative_roots(void) { return 1; } static inline int gc_has_conservative_intraheap_edges(void) { - return 0; + return GC_CONSERVATIVE_TRACE; } static inline int diff --git a/gc-config.h b/gc-config.h index 5fc27b7e5..91dd555e2 100644 --- a/gc-config.h +++ b/gc-config.h @@ -13,8 +13,20 @@ #define GC_GENERATIONAL 0 #endif -#ifndef GC_PRECISE -#define GC_PRECISE 0 +// Though you normally wouldn't configure things this way, it's possible +// to have both precise and conservative roots. However we have to +// either have precise or conservative tracing; not a mix. 
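+//
+// GC_PRECISE_ROOTS: the embedder enumerates roots for the collector via
+// the gc_trace_mutator_roots and gc_trace_heap_roots hooks.
+// GC_CONSERVATIVE_ROOTS: mutator stacks and global data segments are
+// instead scanned for anything that looks like a heap address.
+// GC_CONSERVATIVE_TRACE: the fields of heap objects are also scanned
+// conservatively, as bdw-gc requires.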
+ +#ifndef GC_PRECISE_ROOTS +#define GC_PRECISE_ROOTS 0 +#endif + +#ifndef GC_CONSERVATIVE_ROOTS +#define GC_CONSERVATIVE_ROOTS 0 +#endif + +#ifndef GC_CONSERVATIVE_TRACE +#define GC_CONSERVATIVE_TRACE 0 #endif #endif // GC_CONFIG_H diff --git a/large-object-space.h b/large-object-space.h index 6bb7a5af7..ddd1bfcde 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -9,6 +9,7 @@ #include #include +#include "gc-assert.h" #include "gc-ref.h" #include "gc-conservative-ref.h" #include "address-map.h" @@ -96,6 +97,14 @@ static int large_object_space_mark_object(struct large_object_space *space, return large_object_space_copy(space, ref); } +static inline size_t large_object_space_object_size(struct large_object_space *space, + struct gc_ref ref) { + size_t npages = address_map_lookup(&space->object_pages, + gc_ref_value(ref), 0); + GC_ASSERT(npages != 0); + return npages * space->page_size; +} + static void large_object_space_reclaim_one(uintptr_t addr, void *data) { struct large_object_space *space = data; size_t npages = address_map_lookup(&space->object_pages, addr, 0); diff --git a/mt-gcbench-embedder.h b/mt-gcbench-embedder.h index 1ac42a327..110e7e05e 100644 --- a/mt-gcbench-embedder.h +++ b/mt-gcbench-embedder.h @@ -1,6 +1,7 @@ #ifndef MT_GCBENCH_EMBEDDER_H #define MT_GCBENCH_EMBEDDER_H +#include "gc-config.h" #include "mt-gcbench-types.h" struct gc_heap; @@ -44,9 +45,8 @@ visit_hole_fields(Hole *obj, void (*visit)(struct gc_edge edge, struct gc_heap *heap, void *visit_data), struct gc_heap *heap, void *visit_data) { -#if GC_PRECISE - GC_CRASH(); -#endif + if (GC_PRECISE_ROOTS) + GC_CRASH(); } #include "simple-gc-embedder.h" diff --git a/mt-gcbench.c b/mt-gcbench.c index 4789a4f7d..744a7e66b 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -47,9 +47,10 @@ #include "assert.h" #include "gc-api.h" #include "mt-gcbench-types.h" -#if GC_PRECISE +#if GC_PRECISE_ROOTS #include "precise-roots-api.h" -#else +#endif +#if GC_CONSERVATIVE_ROOTS #include "conservative-roots-api.h" #endif #include "mt-gcbench-types.h" diff --git a/parallel-tracer.h b/parallel-tracer.h index df6cc89ae..9711ed03a 100644 --- a/parallel-tracer.h +++ b/parallel-tracer.h @@ -450,6 +450,8 @@ static void tracer_release(struct gc_heap *heap) { static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; +static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct gc_heap *heap, @@ -462,8 +464,22 @@ tracer_share(struct local_tracer *trace) { trace_deque_push(trace->share_deque, local_trace_queue_pop(&trace->local)); } +static inline void +tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { + struct local_tracer *trace = trace_data; + if (local_trace_queue_full(&trace->local)) + tracer_share(trace); + local_trace_queue_push(&trace->local, ref); +} + static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { + if (trace_edge(heap, edge)) + tracer_enqueue(gc_edge_ref(edge), heap, trace_data); +} + +static inline void +tracer_visit_(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { if (trace_edge(heap, edge)) { struct local_tracer *trace = trace_data; if (local_trace_queue_full(&trace->local)) diff --git a/quads.c b/quads.c index 1318adf9f..b7d1bccc3 100644 --- a/quads.c +++ b/quads.c @@ -5,9 +5,10 @@ 
#include "assert.h" #include "gc-api.h" -#if GC_PRECISE +#if GC_PRECISE_ROOTS #include "precise-roots-api.h" -#else +#endif +#if GC_CONSERVATIVE_ROOTS #include "conservative-roots-api.h" #endif #include "quads-types.h" diff --git a/semi.c b/semi.c index 2c1eae600..2a3b19f23 100644 --- a/semi.c +++ b/semi.c @@ -11,9 +11,11 @@ #include "semi-attrs.h" #include "large-object-space.h" -#if GC_PRECISE +#if GC_PRECISE_ROOTS #include "precise-roots-embedder.h" -#else +#endif + +#if GC_CONSERVATIVE_ROOTS #error semi is a precise collector #endif diff --git a/serial-tracer.h b/serial-tracer.h index 7c0bdcca9..d189b1c7c 100644 --- a/serial-tracer.h +++ b/serial-tracer.h @@ -137,6 +137,8 @@ static void tracer_release(struct gc_heap *heap) { static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; +static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; static inline int trace_edge(struct gc_heap *heap, @@ -152,9 +154,13 @@ tracer_enqueue_roots(struct tracer *tracer, struct gc_ref *objs, trace_queue_push_many(&tracer->queue, objs, count); } static inline void +tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { + tracer_enqueue_root(heap_tracer(heap), ref); +} +static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { if (trace_edge(heap, edge)) - tracer_enqueue_root(heap_tracer(heap), gc_edge_ref(edge)); + tracer_enqueue(gc_edge_ref(edge), heap, trace_data); } static inline void tracer_trace(struct gc_heap *heap) { diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h index 7b691acfa..b97d1d7f0 100644 --- a/simple-gc-embedder.h +++ b/simple-gc-embedder.h @@ -26,9 +26,11 @@ static inline void gc_trace_object(struct gc_ref ref, } } -#if GC_PRECISE +#if GC_PRECISE_ROOTS #include "precise-roots-embedder.h" -#else +#endif + +#if GC_CONSERVATIVE_ROOTS #include "conservative-roots-embedder.h" #endif diff --git a/whippet.c b/whippet.c index 5dbfc80ec..7ae78671b 100644 --- a/whippet.c +++ b/whippet.c @@ -26,9 +26,11 @@ #include "spin.h" #include "whippet-attrs.h" -#if GC_PRECISE +#if GC_PRECISE_ROOTS #include "precise-roots-embedder.h" -#else +#endif + +#if GC_CONSERVATIVE_ROOTS #include "conservative-roots-embedder.h" #endif @@ -371,11 +373,52 @@ static inline void clear_memory(uintptr_t addr, size_t size) { static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; -static size_t mark_space_live_object_granules(uint8_t *metadata) { +static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { + GC_ASSERT(((uintptr_t)mark & 7) == 0); + uint8_t * __attribute__((aligned(8))) aligned_mark = mark; + uint64_t word; + memcpy(&word, aligned_mark, 8); +#ifdef WORDS_BIGENDIAN + word = __builtin_bswap64(word); +#endif + return word; +} + +static inline size_t count_zero_bytes(uint64_t bytes) { + return bytes ? (__builtin_ctzll(bytes) / 8) : sizeof(bytes); +} + +static uint64_t broadcast_byte(uint8_t byte) { + uint64_t result = byte; + return result * 0x0101010101010101ULL; +} + +static size_t next_mark(uint8_t *mark, size_t limit, uint64_t sweep_mask) { size_t n = 0; - while ((metadata[n] & METADATA_BYTE_END) == 0) - n++; - return n + 1; + // If we have a hole, it is likely to be more that 8 granules long. + // Assuming that it's better to make aligned loads, first we align the + // sweep pointer, then we load aligned mark words. 
+ size_t unaligned = ((uintptr_t) mark) & 7; + if (unaligned) { + uint64_t bytes = load_eight_aligned_bytes(mark - unaligned) >> (unaligned * 8); + bytes &= sweep_mask; + if (bytes) + return count_zero_bytes(bytes); + n += 8 - unaligned; + } + + for(; n < limit; n += 8) { + uint64_t bytes = load_eight_aligned_bytes(mark + n); + bytes &= sweep_mask; + if (bytes) + return n + count_zero_bytes(bytes); + } + + return limit; +} + +static size_t mark_space_live_object_granules(uint8_t *metadata) { + return next_mark(metadata, -1, broadcast_byte(METADATA_BYTE_END)) + 1; } static inline int mark_space_mark_object(struct mark_space *space, @@ -710,9 +753,18 @@ static inline struct gc_ref trace_conservative_ref(struct gc_heap *heap, ref, possibly_interior); } -static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, - void *mark_data) { - gc_trace_object(ref, tracer_visit, heap, mark_data, NULL); +static inline size_t mark_space_object_size(struct mark_space *space, + struct gc_ref ref) { + uint8_t *loc = metadata_byte_for_object(ref); + size_t granules = mark_space_live_object_granules(loc); + return granules * GRANULE_SIZE; +} + +static inline size_t gc_object_allocation_size(struct gc_heap *heap, + struct gc_ref ref) { + if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) + return mark_space_object_size(heap_mark_space(heap), ref); + return large_object_space_object_size(heap_large_object_space(heap), ref); } static int heap_has_multiple_mutators(struct gc_heap *heap) { @@ -1037,6 +1089,29 @@ trace_conservative_edges(uintptr_t low, trace(load_conservative_ref(addr), heap, data); } +static inline void tracer_trace_conservative_ref(struct gc_conservative_ref ref, + struct gc_heap *heap, + void *data) { + int possibly_interior = 0; + struct gc_ref resolved = trace_conservative_ref(heap, ref, possibly_interior); + if (gc_ref_is_heap_object(resolved)) + tracer_enqueue(resolved, heap, data); +} + +static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, + void *mark_data) { + if (gc_has_conservative_intraheap_edges()) { + size_t bytes = GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref)) + ? mark_space_object_size(heap_mark_space(heap), ref) + : large_object_space_object_size(heap_large_object_space(heap), ref); + trace_conservative_edges(gc_ref_value(ref), + gc_ref_value(ref) + bytes, + tracer_trace_conservative_ref, heap, mark_data); + } else { + gc_trace_object(ref, tracer_visit, heap, mark_data, NULL); + } +} + static void mark_and_globally_enqueue_mutator_conservative_roots(uintptr_t low, uintptr_t high, @@ -1172,26 +1247,6 @@ static void trace_global_conservative_roots(struct gc_heap *heap) { (mark_and_globally_enqueue_heap_conservative_roots, heap, NULL); } -static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { - GC_ASSERT(((uintptr_t)mark & 7) == 0); - uint8_t * __attribute__((aligned(8))) aligned_mark = mark; - uint64_t word; - memcpy(&word, aligned_mark, 8); -#ifdef WORDS_BIGENDIAN - word = __builtin_bswap64(word); -#endif - return word; -} - -static inline size_t count_zero_bytes(uint64_t bytes) { - return bytes ? (__builtin_ctzll(bytes) / 8) : sizeof(bytes); -} - -static uint64_t broadcast_byte(uint8_t byte) { - uint64_t result = byte; - return result * 0x0101010101010101ULL; -} - // Note that it's quite possible (and even likely) that any given remset // byte doesn't hold any roots, if all stores were to nursery objects. 
STATIC_ASSERT_EQ(GRANULES_PER_REMSET_BYTE % 8, 0); @@ -1690,30 +1745,6 @@ static int sweep_word(uintptr_t *loc, uintptr_t sweep_mask) { return 0; } -static size_t next_mark(uint8_t *mark, size_t limit, uint64_t sweep_mask) { - size_t n = 0; - // If we have a hole, it is likely to be more that 8 granules long. - // Assuming that it's better to make aligned loads, first we align the - // sweep pointer, then we load aligned mark words. - size_t unaligned = ((uintptr_t) mark) & 7; - if (unaligned) { - uint64_t bytes = load_eight_aligned_bytes(mark - unaligned) >> (unaligned * 8); - bytes &= sweep_mask; - if (bytes) - return count_zero_bytes(bytes); - n += 8 - unaligned; - } - - for(; n < limit; n += 8) { - uint64_t bytes = load_eight_aligned_bytes(mark + n); - bytes &= sweep_mask; - if (bytes) - return n + count_zero_bytes(bytes); - } - - return limit; -} - static uintptr_t mark_space_next_block_to_sweep(struct mark_space *space) { uintptr_t block = atomic_load_explicit(&space->next_block, memory_order_acquire); From c614c2e40b6f8531ce096b6c3a42f814ca32dc13 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 26 Oct 2022 11:59:56 +0200 Subject: [PATCH 158/403] Refactor embedder interface for conservative GC Now users don't have to #ifdef on conservative vs precise tracing; it's just a generic embedder concern. --- bdw.c | 2 - conservative-roots-api.h | 12 ---- conservative-roots-embedder.h | 54 ---------------- conservative-roots-types.h | 8 --- gc-embedder-api.h | 9 +-- gc-trace.h | 44 +++++++++++++ mt-gcbench.c | 8 +-- precise-roots-embedder.h | 61 ------------------- quads.c | 7 +-- semi.c | 4 -- simple-gc-embedder.h | 58 ++++++++++++++---- precise-roots-api.h => simple-roots-api.h | 18 +++--- precise-roots-types.h => simple-roots-types.h | 6 +- whippet.c | 16 +---- 14 files changed, 109 insertions(+), 198 deletions(-) delete mode 100644 conservative-roots-api.h delete mode 100644 conservative-roots-embedder.h delete mode 100644 conservative-roots-types.h create mode 100644 gc-trace.h delete mode 100644 precise-roots-embedder.h rename precise-roots-api.h => simple-roots-api.h (60%) rename precise-roots-types.h => simple-roots-types.h (63%) diff --git a/bdw.c b/bdw.c index caf53b69f..c8fdc5a80 100644 --- a/bdw.c +++ b/bdw.c @@ -20,8 +20,6 @@ #error bdw-gc is a conservative collector #endif -#include "conservative-roots-embedder.h" - // When pthreads are used, let `libgc' know about it and redirect // allocation calls such as `GC_MALLOC ()' to (contention-free, faster) // thread-local allocation. 
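[Editorial sketch, not part of the patch.] What this refactor means for embedder code: the benchmarks now include the single simple-roots-api.h header (renamed from precise-roots-api.h later in this patch) and use the handle macros unconditionally; in conservative-roots builds the push/pop operations compile away. The struct thread layout mirrors the benchmarks; struct Node and allocate_node() are made-up stand-ins for an embedder object type and its allocation wrapper.

    #include "gc-api.h"
    #include "simple-tagging-scheme.h"
    #include "simple-roots-api.h"

    struct thread { struct gc_mutator *mut; struct gc_mutator_roots roots; };

    struct Node { struct gc_header header; struct Node *next; };
    typedef HANDLE_TO(struct Node) NodeHandle;

    /* Hypothetical allocator, e.g. a gc_allocate_with_kind() wrapper. */
    static struct Node* allocate_node(struct gc_mutator *mut);

    static struct Node* prepend(struct thread *t, struct Node *tail) {
      NodeHandle h = { tail };
      PUSH_HANDLE(t, h);                    /* registers a root only if GC_PRECISE_ROOTS */
      struct Node *head = allocate_node(t->mut);   /* may trigger a collection */
      head->next = HANDLE_REF(h);           /* tail survives the allocation via the handle */
      POP_HANDLE(t);
      return head;
    }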
diff --git a/conservative-roots-api.h b/conservative-roots-api.h deleted file mode 100644 index 1619cf640..000000000 --- a/conservative-roots-api.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef CONSERVATIVE_ROOTS_API_H -#define CONSERVATIVE_ROOTS_API_H - -#include "conservative-roots-types.h" - -#define HANDLE_TO(T) union { T* v; struct handle handle; } -#define HANDLE_REF(h) h.v -#define HANDLE_SET(h,val) do { h.v = val; } while (0) -#define PUSH_HANDLE(cx, h) do { (void) &h; } while (0) -#define POP_HANDLE(cx) do { } while (0) - -#endif // CONSERVATIVE_ROOTS_API_H diff --git a/conservative-roots-embedder.h b/conservative-roots-embedder.h deleted file mode 100644 index 4d2c4fa7b..000000000 --- a/conservative-roots-embedder.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef CONSERVATIVE_ROOTS_EMBEDDER_H -#define CONSERVATIVE_ROOTS_EMBEDDER_H - -#include "gc-config.h" -#include "gc-embedder-api.h" - -static inline int gc_has_mutator_conservative_roots(void) { - return 1; -} -static inline int gc_mutator_conservative_roots_may_be_interior(void) { - return 1; -} -static inline int gc_has_global_conservative_roots(void) { - return 1; -} -static inline int gc_has_conservative_intraheap_edges(void) { - return GC_CONSERVATIVE_TRACE; -} - -static inline int -gc_is_valid_conservative_ref_displacement(uintptr_t displacement) { - // Here is where you would allow tagged heap object references. - return displacement == 0; -} -static inline int -gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref ref, - int possibly_interior) { - // Assume that the minimum page size is 4096, and that the first page - // will contain no heap objects. - if (gc_conservative_ref_value(ref) < 4096) - return 0; - if (possibly_interior) - return 1; - return gc_is_valid_conservative_ref_displacement - (gc_conservative_ref_value(ref) & (sizeof(uintptr_t) - 1)); -} - -static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_edge)(struct gc_edge edge, - struct gc_heap *heap, - void *trace_data), - struct gc_heap *heap, - void *trace_data) { -} - -static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, - void (*trace_edge)(struct gc_edge edge, - struct gc_heap *heap, - void *trace_data), - struct gc_heap *heap, - void *trace_data) { -} - -#endif // CONSERVATIVE_ROOTS_EMBEDDER_H diff --git a/conservative-roots-types.h b/conservative-roots-types.h deleted file mode 100644 index 4744d746e..000000000 --- a/conservative-roots-types.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef CONSERVATIVE_ROOTS_TYPES_H -#define CONSERVATIVE_ROOTS_TYPES_H - -struct handle { void *unused; }; -struct gc_heap_roots { void *unused; }; -struct gc_mutator_roots { void *unused; }; - -#endif // CONSERVATIVE_ROOTS_TYPES_H diff --git a/gc-embedder-api.h b/gc-embedder-api.h index 3b3682a71..e0a6b3b5a 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -1,7 +1,7 @@ #ifndef GC_EMBEDDER_API_H #define GC_EMBEDDER_API_H -#include "gc-conservative-ref.h" +#include "gc-config.h" #include "gc-edge.h" #include "gc-forwarding.h" @@ -14,14 +14,7 @@ struct gc_heap_roots; struct gc_atomic_forward; struct gc_heap; -GC_EMBEDDER_API inline int gc_has_mutator_conservative_roots(void); -GC_EMBEDDER_API inline int gc_has_global_conservative_roots(void); -GC_EMBEDDER_API inline int gc_has_conservative_intraheap_edges(void); -GC_EMBEDDER_API inline int gc_mutator_conservative_roots_may_be_interior(void); - GC_EMBEDDER_API inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement); -GC_EMBEDDER_API inline int 
gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref, - int possibly_interior); GC_EMBEDDER_API inline void gc_trace_object(struct gc_ref ref, void (*visit)(struct gc_edge edge, diff --git a/gc-trace.h b/gc-trace.h new file mode 100644 index 000000000..b9e4691e8 --- /dev/null +++ b/gc-trace.h @@ -0,0 +1,44 @@ +#ifndef GC_TRACE_H +#define GC_TRACE_H + +#ifndef GC_IMPL +#error internal header file, not part of API +#endif + +#include "gc-config.h" +#include "gc-assert.h" +#include "gc-conservative-ref.h" +#include "gc-embedder-api.h" + +static inline int gc_has_mutator_conservative_roots(void) { + return GC_CONSERVATIVE_ROOTS; +} +static inline int gc_mutator_conservative_roots_may_be_interior(void) { + return 1; +} +static inline int gc_has_global_conservative_roots(void) { + return GC_CONSERVATIVE_ROOTS; +} +static inline int gc_has_conservative_intraheap_edges(void) { + return GC_CONSERVATIVE_TRACE; +} + +static inline int gc_has_conservative_roots(void) { + return gc_has_mutator_conservative_roots() || + gc_has_global_conservative_roots(); +} + +static inline int +gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref ref, + int possibly_interior) { + // Assume that the minimum page size is 4096, and that the first page + // will contain no heap objects. + if (gc_conservative_ref_value(ref) < 4096) + return 0; + if (possibly_interior) + return 1; + return gc_is_valid_conservative_ref_displacement + (gc_conservative_ref_value(ref) & (sizeof(uintptr_t) - 1)); +} + +#endif // GC_TRACE_H diff --git a/mt-gcbench.c b/mt-gcbench.c index 744a7e66b..ac0eb1ff9 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -47,13 +47,7 @@ #include "assert.h" #include "gc-api.h" #include "mt-gcbench-types.h" -#if GC_PRECISE_ROOTS -#include "precise-roots-api.h" -#endif -#if GC_CONSERVATIVE_ROOTS -#include "conservative-roots-api.h" -#endif -#include "mt-gcbench-types.h" +#include "simple-roots-api.h" #include "simple-allocator.h" #define MAX_THREAD_COUNT 256 diff --git a/precise-roots-embedder.h b/precise-roots-embedder.h deleted file mode 100644 index 94192cb51..000000000 --- a/precise-roots-embedder.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef PRECISE_ROOTS_EMBEDDER_H -#define PRECISE_ROOTS_EMBEDDER_H - -#include "gc-edge.h" -#include "gc-embedder-api.h" -#include "precise-roots-types.h" - -static inline int gc_has_mutator_conservative_roots(void) { - return 0; -} -static inline int gc_mutator_conservative_roots_may_be_interior(void) { - return 0; -} -static inline int gc_has_global_conservative_roots(void) { - return 0; -} -static inline int gc_has_conservative_intraheap_edges(void) { - return 0; -} - -static inline int -gc_is_valid_conservative_ref_displacement(uintptr_t displacement) { - GC_CRASH(); -} -static inline int -gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref ref, - int possibly_interior) { - GC_CRASH(); -} - -static inline void visit_roots(struct handle *roots, - void (*trace_edge)(struct gc_edge edge, - struct gc_heap *heap, - void *trace_data), - struct gc_heap *heap, - void *trace_data) { - for (struct handle *h = roots; h; h = h->next) - trace_edge(gc_edge(&h->v), heap, trace_data); -} - -static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, - void (*trace_edge)(struct gc_edge edge, - struct gc_heap *heap, - void *trace_data), - struct gc_heap *heap, - void *trace_data) { - if (roots) - visit_roots(roots->roots, trace_edge, heap, trace_data); -} - -static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, - void 
(*trace_edge)(struct gc_edge edge, - struct gc_heap *heap, - void *trace_data), - struct gc_heap *heap, - void *trace_data) { - if (roots) - visit_roots(roots->roots, trace_edge, heap, trace_data); -} - -#endif // PRECISE_ROOTS_EMBEDDER_H diff --git a/quads.c b/quads.c index b7d1bccc3..27923f1f7 100644 --- a/quads.c +++ b/quads.c @@ -5,12 +5,7 @@ #include "assert.h" #include "gc-api.h" -#if GC_PRECISE_ROOTS -#include "precise-roots-api.h" -#endif -#if GC_CONSERVATIVE_ROOTS -#include "conservative-roots-api.h" -#endif +#include "simple-roots-api.h" #include "quads-types.h" #include "simple-allocator.h" diff --git a/semi.c b/semi.c index 2a3b19f23..4c505b60e 100644 --- a/semi.c +++ b/semi.c @@ -11,10 +11,6 @@ #include "semi-attrs.h" #include "large-object-space.h" -#if GC_PRECISE_ROOTS -#include "precise-roots-embedder.h" -#endif - #if GC_CONSERVATIVE_ROOTS #error semi is a precise collector #endif diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h index b97d1d7f0..758e56462 100644 --- a/simple-gc-embedder.h +++ b/simple-gc-embedder.h @@ -1,8 +1,21 @@ #include #include "simple-tagging-scheme.h" +#include "simple-roots-types.h" +#include "gc-config.h" #include "gc-embedder-api.h" +static inline int +gc_is_valid_conservative_ref_displacement(uintptr_t displacement) { +#if GC_CONSERVATIVE_ROOTS || GC_CONSERVATIVE_TRACE + // Here is where you would allow tagged heap object references. + return displacement == 0; +#else + // Shouldn't get here. + GC_CRASH(); +#endif +} + static inline void gc_trace_object(struct gc_ref ref, void (*trace_edge)(struct gc_edge edge, struct gc_heap *heap, @@ -10,6 +23,10 @@ static inline void gc_trace_object(struct gc_ref ref, struct gc_heap *heap, void *trace_data, size_t *size) { +#if GC_CONSERVATIVE_TRACE + // Shouldn't get here. + GC_CRASH(); +#else switch (tag_live_alloc_kind(*tag_word(ref))) { #define SCAN_OBJECT(name, Name, NAME) \ case ALLOC_KIND_##NAME: \ @@ -24,15 +41,38 @@ static inline void gc_trace_object(struct gc_ref ref, default: GC_CRASH(); } +#endif } -#if GC_PRECISE_ROOTS -#include "precise-roots-embedder.h" -#endif +static inline void visit_roots(struct handle *roots, + void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, + void *trace_data), + struct gc_heap *heap, + void *trace_data) { + for (struct handle *h = roots; h; h = h->next) + trace_edge(gc_edge(&h->v), heap, trace_data); +} -#if GC_CONSERVATIVE_ROOTS -#include "conservative-roots-embedder.h" -#endif +static inline void gc_trace_mutator_roots(struct gc_mutator_roots *roots, + void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, + void *trace_data), + struct gc_heap *heap, + void *trace_data) { + if (roots) + visit_roots(roots->roots, trace_edge, heap, trace_data); +} + +static inline void gc_trace_heap_roots(struct gc_heap_roots *roots, + void (*trace_edge)(struct gc_edge edge, + struct gc_heap *heap, + void *trace_data), + struct gc_heap *heap, + void *trace_data) { + if (roots) + visit_roots(roots->roots, trace_edge, heap, trace_data); +} static inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref) { uintptr_t tag = *tag_word(ref); @@ -108,9 +148,3 @@ gc_atomic_forward_address(struct gc_atomic_forward *fwd) { GC_ASSERT(fwd->state == GC_FORWARDING_STATE_FORWARDED); return fwd->data; } - -static inline uintptr_t -gc_conservative_ref_heap_address(struct gc_conservative_ref ref) { - // The specific spaces are responsible for checking alignment. 
- return gc_conservative_ref_value(ref); -} diff --git a/precise-roots-api.h b/simple-roots-api.h similarity index 60% rename from precise-roots-api.h rename to simple-roots-api.h index ced560d15..1cdfc15e0 100644 --- a/precise-roots-api.h +++ b/simple-roots-api.h @@ -1,7 +1,8 @@ -#ifndef PRECISE_ROOTS_API_H -#define PRECISE_ROOTS_API_H +#ifndef SIMPLE_ROOTS_API_H +#define SIMPLE_ROOTS_API_H -#include "precise-roots-types.h" +#include "gc-config.h" +#include "simple-roots-types.h" #define HANDLE_TO(T) union { T* v; struct handle handle; } #define HANDLE_REF(h) h.v @@ -10,12 +11,15 @@ #define POP_HANDLE(cx) pop_handle(&(cx)->roots.roots) static inline void push_handle(struct handle **roots, struct handle *handle) { - handle->next = *roots; - *roots = handle; + if (GC_PRECISE_ROOTS) { + handle->next = *roots; + *roots = handle; + } } static inline void pop_handle(struct handle **roots) { - *roots = (*roots)->next; + if (GC_PRECISE_ROOTS) + *roots = (*roots)->next; } -#endif // PRECISE_ROOTS_API_H +#endif // SIMPLE_ROOTS_API_H diff --git a/precise-roots-types.h b/simple-roots-types.h similarity index 63% rename from precise-roots-types.h rename to simple-roots-types.h index d2dc96491..6d47fa788 100644 --- a/precise-roots-types.h +++ b/simple-roots-types.h @@ -1,5 +1,5 @@ -#ifndef PRECISE_ROOTS_TYPES_H -#define PRECISE_ROOTS_TYPES_H +#ifndef SIMPLE_ROOTS_TYPES_H +#define SIMPLE_ROOTS_TYPES_H struct handle { void *v; @@ -14,4 +14,4 @@ struct gc_mutator_roots { struct handle *roots; }; -#endif // PRECISE_ROOTS_TYPES_H +#endif // SIMPLE_ROOTS_TYPES_H diff --git a/whippet.c b/whippet.c index 7ae78671b..90f2b470c 100644 --- a/whippet.c +++ b/whippet.c @@ -17,6 +17,7 @@ #include "gc-inline.h" #include "gc-platform.h" #include "gc-stack.h" +#include "gc-trace.h" #include "large-object-space.h" #if GC_PARALLEL #include "parallel-tracer.h" @@ -26,14 +27,6 @@ #include "spin.h" #include "whippet-attrs.h" -#if GC_PRECISE_ROOTS -#include "precise-roots-embedder.h" -#endif - -#if GC_CONSERVATIVE_ROOTS -#include "conservative-roots-embedder.h" -#endif - #define GRANULE_SIZE 16 #define GRANULE_SIZE_LOG_2 4 #define MEDIUM_OBJECT_THRESHOLD 256 @@ -1455,11 +1448,6 @@ static double clamp_major_gc_yield_threshold(struct gc_heap *heap, return threshold; } -static inline int has_conservative_roots(void) { - return gc_has_mutator_conservative_roots() || - gc_has_global_conservative_roots(); -} - static enum gc_kind determine_collection_kind(struct gc_heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); enum gc_kind previous_gc_kind = atomic_load(&heap->gc_kind); @@ -1489,7 +1477,7 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { // blocks, try to avoid any pinning caused by the ragged-stop // marking. Of course if the mutator has conservative roots we will // have pinning anyway and might as well allow ragged stops. - mark_while_stopping = has_conservative_roots(); + mark_while_stopping = gc_has_conservative_roots(); } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING && fragmentation >= heap->fragmentation_low_threshold) { DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", From 1b3bc2f3b50762df8b652b9a57b365e1bff414a1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 2 Dec 2022 09:04:59 +0100 Subject: [PATCH 159/403] Use "stack-conservative" and "heap-conservative" terms Thanks to Steve Blackburn for the terms. 
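[Editorial note, distilled from the Makefile rules below.] "stack-conservative" builds pass only -DGC_CONSERVATIVE_ROOTS=1, so stacks and globals are scanned conservatively while intraheap edges remain precise; "heap-conservative" builds (the former "fully-conservative" targets) additionally pass -DGC_CONSERVATIVE_TRACE=1, making intraheap edges conservative as well. A hedged sketch of how collector code can distinguish the flavors via the gc-trace.h predicates added in the previous patch; build_flavor() is an invented helper, not part of the tree.

    #include "gc-trace.h"   /* internal header: predicates over the GC_CONSERVATIVE_* macros */

    static const char* build_flavor(void) {
      if (gc_has_conservative_intraheap_edges())
        return "heap-conservative";    /* GC_CONSERVATIVE_ROOTS + GC_CONSERVATIVE_TRACE */
      if (gc_has_mutator_conservative_roots())
        return "stack-conservative";   /* GC_CONSERVATIVE_ROOTS only */
      return "precise";                /* GC_PRECISE_ROOTS */
    }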
--- Makefile | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 3d9eed197..94d25606d 100644 --- a/Makefile +++ b/Makefile @@ -4,20 +4,20 @@ COLLECTORS= \ semi \ \ whippet \ - conservative-whippet \ - fully-conservative-whippet \ + stack-conservative-whippet \ + heap-conservative-whippet \ \ parallel-whippet \ - conservative-parallel-whippet \ - fully-conservative-parallel-whippet \ + stack-conservative-parallel-whippet \ + heap-conservative-parallel-whippet \ \ generational-whippet \ - conservative-generational-whippet \ - fully-conservative-generational-whippet \ + stack-conservative-generational-whippet \ + heap-conservative-generational-whippet \ \ parallel-generational-whippet \ - conservative-parallel-generational-whippet \ - fully-conservative-parallel-generational-whippet + stack-conservative-parallel-generational-whippet \ + heap-conservative-parallel-generational-whippet CC=gcc CFLAGS=-Wall -O2 -g -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused -DNDEBUG @@ -53,14 +53,14 @@ whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h asse whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c +stack-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -conservative-whippet-%.o: whippet.c %.c +stack-conservative-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -fully-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c +heap-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c -fully-conservative-whippet-%.o: whippet.c %.c +heap-conservative-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c @@ -68,14 +68,14 @@ parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tr parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c +stack-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -conservative-parallel-whippet-%.o: whippet.c %.c +stack-conservative-parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -fully-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c +heap-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) 
-DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c -fully-conservative-parallel-whippet-%.o: whippet.c %.c +heap-conservative-parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include whippet-attrs.h -o $@ -c $*.c generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c @@ -83,14 +83,14 @@ generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial- generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c +stack-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -conservative-generational-whippet-%.o: whippet.c %.c +stack-conservative-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -fully-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c +heap-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c -fully-conservative-generational-whippet-%.o: whippet.c %.c +heap-conservative-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c @@ -98,14 +98,14 @@ parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space. 
parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c +stack-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -conservative-parallel-generational-whippet-%.o: whippet.c %.c +stack-conservative-parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -fully-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c +heap-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c -fully-conservative-parallel-generational-whippet-%.o: whippet.c %.c +heap-conservative-parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c %: %.o %-gc.o gc-platform.o gc-stack.o From 44f37a373ceae45d9fa1ff090d75830062cc86be Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 22 Jan 2023 21:04:20 +0100 Subject: [PATCH 160/403] Add gc_collect(mutator) API --- bdw.c | 2 +- gc-api.h | 2 ++ semi.c | 4 ++++ whippet.c | 4 ++++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bdw.c b/bdw.c index c8fdc5a80..d4aac0a56 100644 --- a/bdw.c +++ b/bdw.c @@ -116,7 +116,7 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, return GC_malloc_atomic(size); } -static inline void collect(struct gc_mutator *mut) { +void gc_collect(struct gc_mutator *mut) { GC_gcollect(); } diff --git a/gc-api.h b/gc-api.h index a33fd7b12..bec889cf4 100644 --- a/gc-api.h +++ b/gc-api.h @@ -57,6 +57,8 @@ GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; GC_API_ void gc_print_stats(struct gc_heap *heap); +GC_API_ void gc_collect(struct gc_mutator *mut); + static inline void gc_clear_fresh_allocation(struct gc_ref obj, size_t size) GC_ALWAYS_INLINE; static inline void gc_clear_fresh_allocation(struct gc_ref obj, diff --git a/semi.c b/semi.c index 4c505b60e..d9e1110cf 100644 --- a/semi.c +++ b/semi.c @@ -171,6 +171,10 @@ static void collect(struct gc_mutator *mut) { // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); } +void gc_collect(struct gc_mutator *mut) { + collect(mut); +} + static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) { collect(mut); struct semi_space *space = mutator_semi_space(mut); diff --git a/whippet.c b/whippet.c index 90f2b470c..ee18445de 100644 --- a/whippet.c +++ b/whippet.c @@ -1992,6 +1992,10 @@ static void trigger_collection(struct gc_mutator *mut) { heap_unlock(heap); } +void gc_collect(struct gc_mutator *mut) { + trigger_collection(mut); +} + void* gc_allocate_large(struct gc_mutator *mut, size_t size) { struct gc_heap *heap = mutator_heap(mut); struct large_object_space *space = 
heap_large_object_space(heap); From 78da8d5811601c324bef722cdb443252e26303a7 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 26 Nov 2022 22:28:57 +0100 Subject: [PATCH 161/403] Add ephemeron implementation This commit adds support for ephemerons to the API and wires it into the collectors. It also adds a new test. --- Makefile | 58 ++-- bdw.c | 94 ++++++- ephemerons-embedder.h | 54 ++++ ephemerons-types.h | 21 ++ ephemerons.c | 270 +++++++++++++++++++ gc-api.h | 5 - gc-edge.h | 3 + gc-embedder-api.h | 4 + gc-ephemeron-internal.h | 51 ++++ gc-ephemeron.c | 582 ++++++++++++++++++++++++++++++++++++++++ gc-ephemeron.h | 42 +++ gc-internal.h | 10 + gc-visibility.h | 5 + large-object-space.h | 10 + semi.c | 101 ++++++- simple-gc-embedder.h | 2 + simple-roots-api.h | 5 +- whippet.c | 194 ++++++++++++-- 18 files changed, 1455 insertions(+), 56 deletions(-) create mode 100644 ephemerons-embedder.h create mode 100644 ephemerons-types.h create mode 100644 ephemerons.c create mode 100644 gc-ephemeron-internal.h create mode 100644 gc-ephemeron.c create mode 100644 gc-ephemeron.h create mode 100644 gc-internal.h diff --git a/Makefile b/Makefile index 94d25606d..6f5652ad5 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -TESTS=quads mt-gcbench # MT_GCBench MT_GCBench2 +TESTS=quads mt-gcbench ephemerons # MT_GCBench MT_GCBench2 COLLECTORS= \ bdw \ semi \ @@ -19,8 +19,16 @@ COLLECTORS= \ stack-conservative-parallel-generational-whippet \ heap-conservative-parallel-generational-whippet +DEFAULT_BUILD:=opt + +BUILD_CFLAGS_opt=-O2 -g -DNDEBUG +BUILD_CFLAGS_optdebug=-Og -g -DGC_DEBUG=1 +BUILD_CFLAGS_debug=-O0 -g -DGC_DEBUG=1 + +BUILD_CFLAGS=$(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) + CC=gcc -CFLAGS=-Wall -O2 -g -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused -DNDEBUG +CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) INCLUDES=-I. 
LDFLAGS=-lpthread -flto COMPILE=$(CC) $(CFLAGS) $(INCLUDES) @@ -36,92 +44,106 @@ gc-platform.o: gc-platform.h gc-platform-$(PLATFORM).c gc-visibility.h gc-stack.o: gc-stack.c $(COMPILE) -o $@ -c $< +gc-ephemeron-%.o: gc-ephemeron.c gc-ephemeron.h gc-ephemeron-internal.h %-embedder.h + $(COMPILE) -include $*-embedder.h -o $@ -c $< + bdw-%-gc.o: bdw.c %-embedder.h %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include $*-embedder.h -o $@ -c bdw.c bdw-%.o: bdw.c %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include bdw-attrs.h -o $@ -c $*.c -bdw-%: bdw-%.o bdw-%-gc.o gc-stack.o gc-platform.o +bdw-%: bdw-%.o bdw-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) `pkg-config --libs bdw-gc` -o $@ $^ semi-%-gc.o: semi.c %-embedder.h large-object-space.h assert.h debug.h %.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c semi.c semi-%.o: semi.c %.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include semi-attrs.h -o $@ -c $*.c +semi-%: semi-%.o semi-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +whippet-%: whippet-%.o whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ stack-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c stack-conservative-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +stack-conservative-whippet-%: stack-conservative-whippet-%.o stack-conservative-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ heap-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c heap-conservative-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c +heap-conservative-whippet-%: heap-conservative-whippet-%.o heap-conservative-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +parallel-whippet-%: parallel-whippet-%.o parallel-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ stack-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c stack-conservative-parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +stack-conservative-parallel-whippet-%: stack-conservative-parallel-whippet-%.o stack-conservative-parallel-whippet-%-gc.o gc-stack.o 
gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ heap-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c heap-conservative-parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include whippet-attrs.h -o $@ -c $*.c +heap-conservative-parallel-whippet-%: heap-conservative-parallel-whippet-%.o heap-conservative-parallel-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +generational-whippet-%: generational-whippet-%.o generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ stack-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c stack-conservative-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +stack-conservative-generational-whippet-%: stack-conservative-generational-whippet-%.o stack-conservative-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ heap-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c heap-conservative-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c +heap-conservative-generational-whippet-%: heap-conservative-generational-whippet-%.o heap-conservative-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +parallel-generational-whippet-%: parallel-generational-whippet-%.o parallel-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ stack-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c stack-conservative-parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +stack-conservative-parallel-generational-whippet-%: 
stack-conservative-parallel-generational-whippet-%.o stack-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ heap-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c heap-conservative-parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c - -%: %.o %-gc.o gc-platform.o gc-stack.o - $(CC) $(LDFLAGS) $($*_LDFLAGS) -o $@ $^ - -check: $(addprefix test-$(TARGET),$(TARGETS)) - -test-%: $(ALL_TESTS) - @echo "Running unit tests..." - @set -e; for test in $?; do \ - echo "Testing: $$test"; \ - ./$$test; \ - done - @echo "Success." - -.PHONY: check +heap-conservative-parallel-generational-whippet-%: heap-conservative-parallel-generational-whippet-%.o heap-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o + $(CC) $(LDFLAGS) -o $@ $^ .PRECIOUS: $(ALL_TESTS) diff --git a/bdw.c b/bdw.c index d4aac0a56..eb05c1b44 100644 --- a/bdw.c +++ b/bdw.c @@ -3,8 +3,11 @@ #include #include -#define GC_API_ #include "gc-api.h" +#include "gc-ephemeron.h" + +#define GC_IMPL 1 +#include "gc-internal.h" #include "bdw-attrs.h" @@ -34,6 +37,7 @@ #include #include /* GC_generic_malloc_many */ +#include /* GC_generic_malloc */ #define GC_INLINE_GRANULE_WORDS 2 #define GC_INLINE_GRANULE_BYTES (sizeof(void *) * GC_INLINE_GRANULE_WORDS) @@ -120,6 +124,85 @@ void gc_collect(struct gc_mutator *mut) { GC_gcollect(); } +// In BDW-GC, we can't hook into the mark phase to call +// gc_trace_ephemerons_for_object, so the advertised ephemeron strategy +// doesn't really work. The primitives that we have are mark functions, +// which run during GC and can't allocate; finalizers, which run after +// GC and can allocate but can't add to the connectivity graph; and +// disappearing links, which are cleared at the end of marking, in the +// stop-the-world phase. It does not appear to be possible to implement +// ephemerons using these primitives. Instead fall back to weak-key +// tables. + +static int ephemeron_gc_kind; + +struct gc_ref gc_allocate_ephemeron(struct gc_mutator *mut) { + void *ret = GC_generic_malloc(gc_ephemeron_size(), ephemeron_gc_kind); + return gc_ref_from_heap_object(ret); +} + +unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { + return 0; +} + +void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, + struct gc_ref key, struct gc_ref value) { + gc_ephemeron_init_internal(mut->heap, ephemeron, key, value); + if (GC_base((void*)gc_ref_value(key))) { + struct gc_ref *loc = gc_edge_loc(gc_ephemeron_key_edge(ephemeron)); + GC_register_disappearing_link((void**)loc); + } +} + +struct ephemeron_mark_state { + struct GC_ms_entry *mark_stack_ptr; + struct GC_ms_entry *mark_stack_limit; +}; + +int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { + // Pretend the key is traced, to avoid adding this ephemeron to the + // global table. 
+ return 1; +} +static void trace_ephemeron_edge(struct gc_edge edge, struct gc_heap *heap, + void *visit_data) { + struct ephemeron_mark_state *state = visit_data; + uintptr_t addr = gc_ref_value(gc_edge_ref(edge)); + state->mark_stack_ptr = GC_MARK_AND_PUSH ((void *) addr, + state->mark_stack_ptr, + state->mark_stack_limit, + NULL); +} + +static struct GC_ms_entry * +mark_ephemeron(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, + struct GC_ms_entry *mark_stack_limit, GC_word env) { + + struct ephemeron_mark_state state = { + mark_stack_ptr, + mark_stack_limit, + }; + + struct gc_ephemeron *ephemeron = (struct gc_ephemeron*) addr; + + // If this ephemeron is on a freelist, its first word will be a + // freelist link and everything else will be NULL. + if (!gc_ref_value(gc_edge_ref(gc_ephemeron_value_edge(ephemeron)))) { + trace_ephemeron_edge(gc_edge(addr), NULL, &state); + return state.mark_stack_ptr; + } + + if (!gc_ref_value(gc_edge_ref(gc_ephemeron_key_edge(ephemeron)))) { + // If the key died in a previous collection, the disappearing link + // will have been cleared. Mark the ephemeron as dead. + gc_ephemeron_mark_dead(ephemeron); + } + + gc_trace_ephemeron(ephemeron, trace_ephemeron_edge, NULL, &state); + + return state.mark_stack_ptr; +} + static inline struct gc_mutator *add_mutator(struct gc_heap *heap) { struct gc_mutator *ret = GC_malloc(sizeof(struct gc_mutator)); ret->heap = heap; @@ -224,6 +307,15 @@ int gc_init(int argc, struct gc_option argv[], *heap = GC_malloc(sizeof(struct gc_heap)); pthread_mutex_init(&(*heap)->lock, NULL); *mutator = add_mutator(*heap); + + { + GC_word descriptor = GC_MAKE_PROC(GC_new_proc(mark_ephemeron), 0); + int add_size_to_descriptor = 0; + int clear_memory = 1; + ephemeron_gc_kind = GC_new_kind(GC_new_free_list(), descriptor, + add_size_to_descriptor, clear_memory); + } + return 1; } diff --git a/ephemerons-embedder.h b/ephemerons-embedder.h new file mode 100644 index 000000000..5b17178cd --- /dev/null +++ b/ephemerons-embedder.h @@ -0,0 +1,54 @@ +#ifndef EPHEMERONS_EMBEDDER_H +#define EPHEMERONS_EMBEDDER_H + +#include + +#include "ephemerons-types.h" +#include "gc-ephemeron.h" + +struct gc_heap; + +#define DEFINE_METHODS(name, Name, NAME) \ + static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \ + static inline void visit_##name##_fields(Name *obj,\ + void (*visit)(struct gc_edge edge, \ + struct gc_heap *heap, \ + void *visit_data), \ + struct gc_heap *heap, \ + void *visit_data) GC_ALWAYS_INLINE; +FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) +#undef DEFINE_METHODS + +static inline size_t small_object_size(SmallObject *obj) { return sizeof(*obj); } +static inline size_t ephemeron_size(Ephemeron *obj) { return gc_ephemeron_size(); } +static inline size_t box_size(Box *obj) { return sizeof(*obj); } + +static inline void +visit_small_object_fields(SmallObject *obj, + void (*visit)(struct gc_edge edge, struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *visit_data) {} + +static inline void +visit_ephemeron_fields(Ephemeron *ephemeron, + void (*visit)(struct gc_edge edge, struct gc_heap *heap, + void *visit_data), + + struct gc_heap *heap, + void *visit_data) { + gc_trace_ephemeron((struct gc_ephemeron*)ephemeron, visit, heap, visit_data); +} + +static inline void +visit_box_fields(Box *box, + void (*visit)(struct gc_edge edge, struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *visit_data) { + visit(gc_edge(&box->obj), heap, visit_data); +} + +#include "simple-gc-embedder.h" + +#endif // 
EPHEMERONS_EMBEDDER_H diff --git a/ephemerons-types.h b/ephemerons-types.h new file mode 100644 index 000000000..d2a4b9a5b --- /dev/null +++ b/ephemerons-types.h @@ -0,0 +1,21 @@ +#ifndef EPHEMERONS_TYPES_H +#define EPHEMERONS_TYPES_H + +#define FOR_EACH_HEAP_OBJECT_KIND(M) \ + M(box, Box, BOX) \ + M(ephemeron, Ephemeron, EPHEMERON) \ + M(small_object, SmallObject, SMALL_OBJECT) + +#include "heap-objects.h" +#include "simple-tagging-scheme.h" + +struct SmallObject { + struct gc_header header; +}; + +struct Box { + struct gc_header header; + void *obj; +}; + +#endif // EPHEMERONS_TYPES_H diff --git a/ephemerons.c b/ephemerons.c new file mode 100644 index 000000000..84fc308f2 --- /dev/null +++ b/ephemerons.c @@ -0,0 +1,270 @@ +#include +#include +#include +#include +#include +#include + +#include "assert.h" +#include "gc-api.h" +#include "gc-ephemeron.h" +#include "simple-roots-api.h" +#include "ephemerons-types.h" +#include "simple-allocator.h" + +typedef HANDLE_TO(SmallObject) SmallObjectHandle; +typedef HANDLE_TO(struct gc_ephemeron) EphemeronHandle; +typedef HANDLE_TO(Box) BoxHandle; + +static SmallObject* allocate_small_object(struct gc_mutator *mut) { + return gc_allocate_with_kind(mut, ALLOC_KIND_SMALL_OBJECT, sizeof(SmallObject)); +} + +static Box* allocate_box(struct gc_mutator *mut) { + return gc_allocate_with_kind(mut, ALLOC_KIND_BOX, sizeof(Box)); +} + +static struct gc_ephemeron* allocate_ephemeron(struct gc_mutator *mut) { + struct gc_ref ret = gc_allocate_ephemeron(mut); + *tag_word(ret) = tag_live(ALLOC_KIND_EPHEMERON); + return gc_ref_heap_object(ret); +} + +/* Get the current time in microseconds */ +static unsigned long current_time(void) +{ + struct timeval t; + if (gettimeofday(&t, NULL) == -1) + return 0; + return t.tv_sec * 1000 * 1000 + t.tv_usec; +} + +struct thread { + struct gc_mutator *mut; + struct gc_mutator_roots roots; +}; + +static void print_elapsed(const char *what, unsigned long start) { + unsigned long end = current_time(); + unsigned long msec = (end - start) / 1000; + unsigned long usec = (end - start) % 1000; + printf("Completed %s in %lu.%.3lu msec\n", what, msec, usec); +} + +struct call_with_gc_data { + void* (*f)(struct thread *); + struct gc_heap *heap; +}; +static void* call_with_gc_inner(struct gc_stack_addr *addr, void *arg) { + struct call_with_gc_data *data = arg; + struct gc_mutator *mut = gc_init_for_thread(addr, data->heap); + struct thread t = { mut, }; + gc_mutator_set_roots(mut, &t.roots); + void *ret = data->f(&t); + gc_finish_for_thread(mut); + return ret; +} +static void* call_with_gc(void* (*f)(struct thread *), + struct gc_heap *heap) { + struct call_with_gc_data data = { f, heap }; + return gc_call_with_stack_addr(call_with_gc_inner, &data); +} + +#define CHECK(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "%s:%d: check failed: %s\n", __FILE__, __LINE__, #x); \ + exit(1); \ + } \ + } while (0) + +#define CHECK_EQ(x, y) CHECK((x) == (y)) +#define CHECK_NE(x, y) CHECK((x) != (y)) +#define CHECK_NULL(x) CHECK_EQ(x, NULL) +#define CHECK_NOT_NULL(x) CHECK_NE(x, NULL) + +static size_t ephemeron_chain_length(struct gc_ephemeron **loc, + SmallObject *key) { + struct gc_ephemeron *head = gc_ephemeron_chain_head(loc); + size_t len = 0; + while (head) { + CHECK_EQ(key, (SmallObject*)gc_ref_value(gc_ephemeron_key(head))); + Box *value = gc_ref_heap_object(gc_ephemeron_value(head)); + CHECK_NOT_NULL(value); + key = value->obj; + CHECK_NOT_NULL(key); + head = gc_ephemeron_chain_next(head); + len++; + } + return len; +} + +static double 
heap_size; +static double heap_multiplier; +static size_t nthreads; + +static void cause_gc(struct gc_mutator *mut) { + gc_collect(mut); +} + +static void make_ephemeron_chain(struct thread *t, EphemeronHandle *head, + SmallObjectHandle *head_key, size_t length) { + BoxHandle tail_box = { NULL }; + PUSH_HANDLE(t, tail_box); + + CHECK_NULL(HANDLE_REF(*head_key)); + HANDLE_SET(*head_key, allocate_small_object(t->mut)); + + for (size_t i = 0; i < length; i++) { + HANDLE_SET(tail_box, allocate_box(t->mut)); + HANDLE_REF(tail_box)->obj = HANDLE_REF(*head_key); + HANDLE_SET(*head_key, allocate_small_object(t->mut)); + struct gc_ephemeron *ephemeron = allocate_ephemeron(t->mut); + gc_ephemeron_init(t->mut, ephemeron, + gc_ref_from_heap_object(HANDLE_REF(*head_key)), + gc_ref_from_heap_object(HANDLE_REF(tail_box))); + gc_ephemeron_chain_push(HANDLE_LOC(*head), ephemeron); + } + + POP_HANDLE(t); +} + +static void* run_one_test(struct thread *t) { + size_t unit_size = gc_ephemeron_size() + sizeof(Box); + size_t list_length = heap_size / nthreads / heap_multiplier / unit_size; + + printf("Allocating ephemeron list %zu nodes long. Total size %.3fGB.\n", + list_length, list_length * unit_size / 1e9); + + unsigned long thread_start = current_time(); + + SmallObjectHandle head_key = { NULL }; + EphemeronHandle head = { NULL }; + + PUSH_HANDLE(t, head_key); + PUSH_HANDLE(t, head); + + make_ephemeron_chain(t, &head, &head_key, list_length); + + size_t measured_length = ephemeron_chain_length(HANDLE_LOC(head), + HANDLE_REF(head_key)); + CHECK_EQ(measured_length, list_length); + + cause_gc(t->mut); + measured_length = ephemeron_chain_length(HANDLE_LOC(head), + HANDLE_REF(head_key)); + CHECK_EQ(measured_length, list_length); + + if (!GC_CONSERVATIVE_ROOTS) { + HANDLE_SET(head_key, NULL); + cause_gc(t->mut); + measured_length = ephemeron_chain_length(HANDLE_LOC(head), + HANDLE_REF(head_key)); + CHECK_EQ(measured_length, 0); + } + + // swap head_key for a key halfway in, cause gc + // check length is expected half-length; warn, or error if precise + // clear and return + + print_elapsed("thread", thread_start); + + POP_HANDLE(t); + POP_HANDLE(t); + + return NULL; +} + +static void* run_one_test_in_thread(void *arg) { + struct gc_heap *heap = arg; + return call_with_gc(run_one_test, heap); +} + +struct join_data { int status; pthread_t thread; }; +static void *join_thread(void *data) { + struct join_data *join_data = data; + void *ret; + join_data->status = pthread_join(join_data->thread, &ret); + return ret; +} + +#define MAX_THREAD_COUNT 256 + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "usage: %s HEAP_SIZE MULTIPLIER NTHREADS PARALLELISM\n", argv[0]); + return 1; + } + + heap_size = atof(argv[1]); + heap_multiplier = atof(argv[2]); + nthreads = atol(argv[3]); + size_t parallelism = atol(argv[4]); + + if (heap_size < 8192) { + fprintf(stderr, + "Heap size should probably be at least 8192, right? 
'%s'\n", + argv[1]); + return 1; + } + if (!(1.0 < heap_multiplier && heap_multiplier < 100)) { + fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]); + return 1; + } + if (nthreads < 1 || nthreads > MAX_THREAD_COUNT) { + fprintf(stderr, "Expected integer between 1 and %d for thread count, got '%s'\n", + (int)MAX_THREAD_COUNT, argv[2]); + return 1; + } + if (parallelism < 1 || parallelism > MAX_THREAD_COUNT) { + fprintf(stderr, "Expected integer between 1 and %d for parallelism, got '%s'\n", + (int)MAX_THREAD_COUNT, argv[3]); + return 1; + } + + printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", + heap_size / 1e9, heap_multiplier); + + struct gc_option options[] = { { GC_OPTION_FIXED_HEAP_SIZE, (size_t) heap_size }, + { GC_OPTION_PARALLELISM, parallelism } }; + struct gc_heap *heap; + struct gc_mutator *mut; + if (!gc_init(sizeof options / sizeof options[0], options, NULL, &heap, + &mut)) { + fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", + (size_t)heap_size); + return 1; + } + struct thread main_thread = { mut, }; + gc_mutator_set_roots(mut, &main_thread.roots); + + unsigned long test_start = current_time(); + + pthread_t threads[MAX_THREAD_COUNT]; + // Run one of the threads in the main thread. + for (size_t i = 1; i < nthreads; i++) { + int status = pthread_create(&threads[i], NULL, run_one_test_in_thread, heap); + if (status) { + errno = status; + perror("Failed to create thread"); + return 1; + } + } + run_one_test(&main_thread); + for (size_t i = 1; i < nthreads; i++) { + struct join_data data = { 0, threads[i] }; + gc_call_without_gc(mut, join_thread, &data); + if (data.status) { + errno = data.status; + perror("Failed to join thread"); + return 1; + } + } + + print_elapsed("test", test_start); + + gc_print_stats(heap); + + return 0; +} + diff --git a/gc-api.h b/gc-api.h index bec889cf4..4ffee3fc7 100644 --- a/gc-api.h +++ b/gc-api.h @@ -26,11 +26,6 @@ struct gc_option { double value; }; -// FIXME: Conflict with bdw-gc GC_API. Switch prefix? 
-#ifndef GC_API_ -#define GC_API_ GC_INTERNAL -#endif - GC_API_ int gc_option_from_string(const char *str); struct gc_stack_addr; diff --git a/gc-edge.h b/gc-edge.h index cfd769c59..72d7b3e5b 100644 --- a/gc-edge.h +++ b/gc-edge.h @@ -13,6 +13,9 @@ static inline struct gc_edge gc_edge(void* addr) { static inline struct gc_ref gc_edge_ref(struct gc_edge edge) { return *edge.dst; } +static inline struct gc_ref* gc_edge_loc(struct gc_edge edge) { + return edge.dst; +} static inline void gc_edge_update(struct gc_edge edge, struct gc_ref ref) { *edge.dst = ref; } diff --git a/gc-embedder-api.h b/gc-embedder-api.h index e0a6b3b5a..8ae45ef61 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -1,8 +1,11 @@ #ifndef GC_EMBEDDER_API_H #define GC_EMBEDDER_API_H +#include + #include "gc-config.h" #include "gc-edge.h" +#include "gc-inline.h" #include "gc-forwarding.h" #ifndef GC_EMBEDDER_API @@ -13,6 +16,7 @@ struct gc_mutator_roots; struct gc_heap_roots; struct gc_atomic_forward; struct gc_heap; +struct gc_ephemeron; GC_EMBEDDER_API inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement); diff --git a/gc-ephemeron-internal.h b/gc-ephemeron-internal.h new file mode 100644 index 000000000..8894bbd8f --- /dev/null +++ b/gc-ephemeron-internal.h @@ -0,0 +1,51 @@ +#ifndef GC_EPHEMERON_INTERNAL_H +#define GC_EPHEMERON_INTERNAL_H + +#ifndef GC_IMPL +#error internal header file, not part of API +#endif + +#include "gc-ephemeron.h" + +struct gc_pending_ephemerons; + +// API implemented by collector, for use by ephemerons: +GC_INTERNAL int gc_visit_ephemeron_key(struct gc_edge edge, + struct gc_heap *heap); +GC_INTERNAL struct gc_pending_ephemerons* +gc_heap_pending_ephemerons(struct gc_heap *heap); +GC_INTERNAL unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap); + +// API implemented by ephemerons, for use by collector: +GC_INTERNAL struct gc_edge gc_ephemeron_key_edge(struct gc_ephemeron *eph); +GC_INTERNAL struct gc_edge gc_ephemeron_value_edge(struct gc_ephemeron *eph); + +GC_INTERNAL struct gc_pending_ephemerons* +gc_prepare_pending_ephemerons(struct gc_pending_ephemerons *state, + size_t target_size, double slop); + +GC_INTERNAL void +gc_resolve_pending_ephemerons(struct gc_ref obj, struct gc_heap *heap); + +GC_INTERNAL void +gc_scan_pending_ephemerons(struct gc_pending_ephemerons *state, + struct gc_heap *heap, size_t shard, + size_t nshards); + +GC_INTERNAL int +gc_pop_resolved_ephemerons(struct gc_heap *heap, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + void *trace_data); + +GC_INTERNAL void +gc_sweep_pending_ephemerons(struct gc_pending_ephemerons *state, + size_t shard, size_t nshards); + +GC_INTERNAL void gc_ephemeron_init_internal(struct gc_heap *heap, + struct gc_ephemeron *ephemeron, + struct gc_ref key, + struct gc_ref value); + +#endif // GC_EPHEMERON_INTERNAL_H diff --git a/gc-ephemeron.c b/gc-ephemeron.c new file mode 100644 index 000000000..a13c4bb98 --- /dev/null +++ b/gc-ephemeron.c @@ -0,0 +1,582 @@ +#include +#include +#include + +#define GC_IMPL 1 + +#include "address-hash.h" +#include "debug.h" +#include "gc-embedder-api.h" +#include "gc-ephemeron-internal.h" + +// # Overview +// +// An ephemeron is a conjunction consisting of the ephemeron object +// itself, a "key" object, and a "value" object. If the ephemeron and +// the key are live, then the value is kept live and can be looked up +// given the ephemeron object. +// +// Sometimes we write this as E×K⇒V, indicating that you need both E and +// K to get V. 
We'll use this notation in these comments sometimes. +// +// The key and the value of an ephemeron are never modified, except +// possibly via forwarding during GC. +// +// If the key of an ephemeron ever becomes unreachable, the ephemeron +// object will be marked as dead by the collector, and neither key nor +// value will be accessible. Users can also explicitly mark an +// ephemeron as dead. +// +// Users can build collections of ephemerons by chaining them together. +// If an ephemeron ever becomes dead, the ephemeron will be removed from +// the chain by the garbage collector. +// +// # Tracing algorithm +// +// Tracing ephemerons is somewhat complicated. Tracing the live objects +// in a heap is usually a parallelizable fan-out kind of operation, +// requiring minimal synchronization between tracing worker threads. +// However with ephemerons, each worker thread may need to check if +// there is a pending ephemeron E for an object K, marking the +// associated V for later traversal by the tracer. Doing this without +// introducing excessive global serialization points is the motivation +// for the complications that follow. +// +// From the viewpoint of the garbage collector, an ephemeron E×K⇒V has 4 +// possible states: +// +// - Traced: An E that was already fully traced as of a given GC epoch. +// +// - Claimed: GC discovers E for the first time in a GC epoch +// +// - Pending: K's liveness is unknown +// +// - Resolved: K is live; V needs tracing +// +// The ephemeron state is kept in an atomic variable. The pending and +// resolved states also have associated atomic list link fields as well; +// it doesn't appear possible to coalesce them into a single field +// without introducing serialization. Finally, there is a bit to +// indicate whether a "traced" ephemeron is live or dead, and a field to +// indicate the epoch at which it was last traced. +// +// Here is a diagram of the state transitions: +// +// ,----->Traced<-----. +// , | | . +// , v / . +// | Claimed | +// | ,-----/ \---. | +// | v v | +// Pending--------->Resolved +// +// Ephemerons are born in the traced state, for the current GC epoch. +// +// When the tracer sees an ephemeron E in the traced state it checks the +// epoch. If the epoch is up to date, E stays in the traced state and +// we are done. +// +// Otherwise, E transitions from traced to claimed. The thread that +// claims E is then responsible for resetting E's pending and resolved +// links, updating E's epoch, and tracing E's user-controlled chain +// link. +// +// If the claiming thread sees that E was already marked dead by a +// previous GC, or explicitly by the user, the ephemeron then +// transitions from back to traced, ready for the next epoch. +// +// If the claiming thread sees K to already be known to be live, then E +// is added to the global resolved set and E's state becomes resolved. +// +// Otherwise the claiming thread publishes K⇒E to the global pending +// ephemeron table, via the pending link, and E transitions to pending. +// +// A pending ephemeron is a link in a buckets-of-chains concurrent hash +// table. If its K is ever determined to be live, it becomes resolved, +// and is added to a global set of resolved ephemerons. At the end of +// GC, any ephemerons still pending are marked dead, transitioning their +// states to traced. 
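// [Editorial aside, not part of the patch: a concrete pass through the state
//  machine described above, for one ephemeron E (key K, value V) allocated in
//  epoch N and encountered by the tracer in epoch N+1.
//
//    allocation:  E is born Traced, with epoch N.
//    GC begins:   a tracer thread reaches E, sees the stale epoch, and moves
//                 E from Traced to Claimed; it resets E's pending/resolved
//                 links, bumps E's epoch to N+1, and traces E's chain link.
//    K unknown:   K has not been traced yet, so the claiming thread publishes
//                 K⇒E to the pending table and E becomes Pending.
//    K is live:   a thread that later traces K finds K⇒E in the pending
//                 table and pushes E onto the resolved stack; E becomes
//                 Resolved.
//    drain:       whichever thread pops E from the resolved stack traces V
//                 (and K, to fix up forwarding) and E returns to Traced.
//    K is dead:   had K never been traced, E would still be Pending at the
//                 end of the cycle and would be marked dead, returning to
//                 Traced with key and value no longer accessible.]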
+// +// Note that the claiming thread -- the one that publishes K⇒E to the +// global pending ephemeron table -- needs to re-check that K is still +// untraced after adding K⇒E to the pending table, and move to resolved +// if so. +// +// A resolved ephemeron needs its V to be traced. Incidentally its K +// also needs tracing, to relocate any forwarding pointer. The thread +// that pops an ephemeron from the resolved set is responsible for +// tracing and for moving E's state to traced. +// +// # Concurrency +// +// All operations on ephemerons are wait-free. Sometimes only one +// thread can make progress (for example for an ephemeron in the claimed +// state), but no thread will be stalled waiting on other threads to +// proceed. +// +// There is one interesting (from a concurrency point of view) data +// structure used by the implementation of ephemerons, the singly-linked +// list. Actually there are three of these; one is used as a stack and +// the other two is used as sets. +// +// The resolved set is implemented via a global `struct gc_ephemeron +// *resolved` variable. Resolving an ephemeron does an atomic push to +// this stack, via compare-and-swap (CAS); popping from the stack (also +// via CAS) yields an ephemeron for tracing. Ephemerons are added to +// the resolved set at most once per GC cycle, and the resolved set is +// empty outside of GC. +// +// The operations that are supported on atomic stacks are: +// +// push(LOC, E, OFFSET) -> void +// +// The user-visible chain link and the link for the pending ephemeron +// table are used to build atomic sets. In these you can add an +// ephemeron to the beginning of the list, traverse the list link by +// link to the end (indicated by NULL), and remove any list item. +// Removing a list node proceeds in two phases: one, you mark the node +// for removal, by changing the ephemeron's state; then, possibly on a +// subsequent traversal, any predecessor may forward its link past +// removed nodes. Because node values never change and nodes only go +// from live to dead, the live list tail can always be reached by any +// node, even from dead nodes. +// +// The operations that are supported on these atomic lists: +// +// push(LOC, E, OFFSET) -> void +// pop(LOC, OFFSET) -> ephemeron or null +// follow(LOC, OFFSET, STATE_OFFSET, LIVE_STATE) -> ephemeron or null +// +// These operations are all wait-free. The "push" operation is shared +// between stack and set use cases. "pop" is for stack-like use cases. +// The "follow" operation traverses a list, opportunistically eliding +// nodes that have been marked dead, atomically updating the location +// storing the next item. +// +// There are also accessors on ephemerons to their fields: +// +// key(E) -> value or null +// value(E) -> value or null +// +// These operations retrieve the key and value, respectively, provided +// that the ephemeron is not marked dead. + +//////////////////////////////////////////////////////////////////////// +// Concurrent operations on ephemeron lists +//////////////////////////////////////////////////////////////////////// + +static void +ephemeron_list_push(struct gc_ephemeron **loc, + struct gc_ephemeron *head, + struct gc_ephemeron** (*get_next)(struct gc_ephemeron*)) { + struct gc_ephemeron *tail = atomic_load_explicit(loc, memory_order_acquire); + while (1) { + // There must be no concurrent readers of HEAD, a precondition that + // we ensure by only publishing HEAD to LOC at most once per cycle. 
+ // Therefore we can use a normal store for the tail pointer. + *get_next(head) = tail; + if (atomic_compare_exchange_weak(loc, &tail, head)) + break; + } +} + +static struct gc_ephemeron* +ephemeron_list_pop(struct gc_ephemeron **loc, + struct gc_ephemeron** (*get_next)(struct gc_ephemeron*)) { + struct gc_ephemeron *head = atomic_load_explicit(loc, memory_order_acquire); + while (head) { + // Precondition: the result of get_next on an ephemeron is never + // updated concurrently; OK to load non-atomically. + struct gc_ephemeron *tail = *get_next(head); + if (atomic_compare_exchange_weak(loc, &head, tail)) + break; + } + return head; +} + +static struct gc_ephemeron* +ephemeron_list_follow(struct gc_ephemeron **loc, + struct gc_ephemeron** (*get_next)(struct gc_ephemeron*), + int (*is_live)(struct gc_ephemeron*)) { + struct gc_ephemeron *head = atomic_load_explicit(loc, memory_order_acquire); + + while (1) { + struct gc_ephemeron *new_head = head; + + // Skip past any dead nodes. + while (new_head && !is_live(new_head)) + new_head = atomic_load_explicit(get_next(new_head), memory_order_acquire); + + if (// If we didn't have to advance past any dead nodes, no need to + // update LOC. + (head == new_head) + // Otherwise if we succeed in updating LOC, we're done. + || atomic_compare_exchange_strong(loc, &head, new_head) + // Someone else managed to advance LOC; that's fine too. + || (head == new_head)) + return new_head; + + // Otherwise we lost a race; loop and retry. + } +} + +//////////////////////////////////////////////////////////////////////// +// The ephemeron object type +//////////////////////////////////////////////////////////////////////// + +#ifndef GC_EMBEDDER_EPHEMERON_HEADER +#error Embedder should define GC_EMBEDDER_EPHEMERON_HEADER +#endif + +enum { + EPHEMERON_STATE_TRACED, + EPHEMERON_STATE_CLAIMED, + EPHEMERON_STATE_PENDING, + EPHEMERON_STATE_RESOLVED, +}; + +struct gc_ephemeron { + GC_EMBEDDER_EPHEMERON_HEADER + uint8_t state; + uint8_t is_dead; + unsigned epoch; + struct gc_ephemeron *chain; + struct gc_ephemeron *pending; + struct gc_ephemeron *resolved; + struct gc_ref key; + struct gc_ref value; +}; + +size_t gc_ephemeron_size(void) { return sizeof(struct gc_ephemeron); } + +struct gc_edge gc_ephemeron_key_edge(struct gc_ephemeron *e) { + return gc_edge(&e->key); +} +struct gc_edge gc_ephemeron_value_edge(struct gc_ephemeron *e) { + return gc_edge(&e->value); +} + +//////////////////////////////////////////////////////////////////////// +// Operations on the user-controlled chain field +//////////////////////////////////////////////////////////////////////// + +static struct gc_ephemeron** ephemeron_chain(struct gc_ephemeron *e) { + return &e->chain; +} +static int ephemeron_is_dead(struct gc_ephemeron *e) { + return atomic_load_explicit(&e->is_dead, memory_order_acquire); +} +static int ephemeron_is_not_dead(struct gc_ephemeron *e) { + return !ephemeron_is_dead(e); +} + +void gc_ephemeron_chain_push(struct gc_ephemeron **loc, + struct gc_ephemeron *e) { + ephemeron_list_push(loc, e, ephemeron_chain); +} +static struct gc_ephemeron* follow_chain(struct gc_ephemeron **loc) { + return ephemeron_list_follow(loc, ephemeron_chain, ephemeron_is_not_dead); +} +struct gc_ephemeron* gc_ephemeron_chain_head(struct gc_ephemeron **loc) { + return follow_chain(loc); +} +struct gc_ephemeron* gc_ephemeron_chain_next(struct gc_ephemeron *e) { + return follow_chain(ephemeron_chain(e)); +} +void gc_ephemeron_mark_dead(struct gc_ephemeron *e) { + atomic_store_explicit(&e->is_dead, 
1, memory_order_release); +} + +//////////////////////////////////////////////////////////////////////// +// Operations on the GC-managed pending link +//////////////////////////////////////////////////////////////////////// + +static struct gc_ephemeron** ephemeron_pending(struct gc_ephemeron *e) { + return &e->pending; +} +static uint8_t ephemeron_state(struct gc_ephemeron *e) { + return atomic_load_explicit(&e->state, memory_order_acquire); +} +static int ephemeron_is_pending(struct gc_ephemeron *e) { + return ephemeron_state(e) == EPHEMERON_STATE_PENDING; +} + +static void push_pending(struct gc_ephemeron **loc, struct gc_ephemeron *e) { + ephemeron_list_push(loc, e, ephemeron_pending); +} +static struct gc_ephemeron* follow_pending(struct gc_ephemeron **loc) { + return ephemeron_list_follow(loc, ephemeron_pending, ephemeron_is_pending); +} + +//////////////////////////////////////////////////////////////////////// +// Operations on the GC-managed resolved link +//////////////////////////////////////////////////////////////////////// + +static struct gc_ephemeron** ephemeron_resolved(struct gc_ephemeron *e) { + return &e->resolved; +} +static void push_resolved(struct gc_ephemeron **loc, struct gc_ephemeron *e) { + ephemeron_list_push(loc, e, ephemeron_resolved); +} +static struct gc_ephemeron* pop_resolved(struct gc_ephemeron **loc) { + return ephemeron_list_pop(loc, ephemeron_resolved); +} + +//////////////////////////////////////////////////////////////////////// +// Access to the association +//////////////////////////////////////////////////////////////////////// + +struct gc_ref gc_ephemeron_key(struct gc_ephemeron *e) { + return ephemeron_is_dead(e) ? gc_ref_null() : e->key; +} + +struct gc_ref gc_ephemeron_value(struct gc_ephemeron *e) { + return ephemeron_is_dead(e) ? gc_ref_null() : e->value; +} + +//////////////////////////////////////////////////////////////////////// +// Tracing ephemerons +//////////////////////////////////////////////////////////////////////// + +struct gc_pending_ephemerons { + struct gc_ephemeron* resolved; + size_t nbuckets; + double scale; + struct gc_ephemeron* buckets[0]; +}; + +static const size_t MIN_PENDING_EPHEMERONS_SIZE = 32; + +static size_t pending_ephemerons_byte_size(size_t nbuckets) { + return sizeof(struct gc_pending_ephemerons) + + sizeof(struct gc_ephemeron*) * nbuckets; +} + +static struct gc_pending_ephemerons* +gc_make_pending_ephemerons(size_t byte_size) { + size_t nbuckets = byte_size / sizeof(struct gc_ephemeron*); + if (nbuckets < MIN_PENDING_EPHEMERONS_SIZE) + nbuckets = MIN_PENDING_EPHEMERONS_SIZE; + + struct gc_pending_ephemerons *ret = + malloc(pending_ephemerons_byte_size(nbuckets)); + if (!ret) + return NULL; + + ret->resolved = NULL; + ret->nbuckets = nbuckets; + ret->scale = nbuckets / pow(2.0, sizeof(uintptr_t) * 8); + for (size_t i = 0; i < nbuckets; i++) + ret->buckets[i] = NULL; + + return ret; +} + +struct gc_pending_ephemerons* +gc_prepare_pending_ephemerons(struct gc_pending_ephemerons *state, + size_t target_byte_size, double slop) { + size_t existing = + state ? 
pending_ephemerons_byte_size(state->nbuckets) : 0; + slop += 1.0; + if (existing * slop > target_byte_size && existing < target_byte_size * slop) + return state; + + struct gc_pending_ephemerons *new_state = + gc_make_pending_ephemerons(target_byte_size); + + if (!new_state) + return state; + + free(state); + return new_state; +} + +static struct gc_ephemeron** +pending_ephemeron_bucket(struct gc_pending_ephemerons *state, + struct gc_ref ref) { + uintptr_t hash = hash_address(gc_ref_value(ref)); + size_t idx = hash * state->scale; + GC_ASSERT(idx < state->nbuckets); + return &state->buckets[idx]; +} + +static void +add_pending_ephemeron(struct gc_pending_ephemerons *state, + struct gc_ephemeron *e) { + struct gc_ephemeron **bucket = pending_ephemeron_bucket(state, e->key); + atomic_store_explicit(&e->state, EPHEMERON_STATE_PENDING, + memory_order_release); + push_pending(bucket, e); +} + +static void maybe_resolve_ephemeron(struct gc_pending_ephemerons *state, + struct gc_ephemeron *e) { + uint8_t expected = EPHEMERON_STATE_PENDING; + if (atomic_compare_exchange_strong(&e->state, &expected, + EPHEMERON_STATE_RESOLVED)) + push_resolved(&state->resolved, e); +} + +// Precondition: OBJ has already been copied to tospace, but OBJ is a +// fromspace ref. +void gc_resolve_pending_ephemerons(struct gc_ref obj, struct gc_heap *heap) { + struct gc_pending_ephemerons *state = gc_heap_pending_ephemerons(heap); + struct gc_ephemeron **bucket = pending_ephemeron_bucket(state, obj); + for (struct gc_ephemeron *link = follow_pending(bucket); + link; + link = follow_pending(&link->pending)) { + if (gc_ref_value(obj) == gc_ref_value(link->key)) { + gc_visit_ephemeron_key(gc_ephemeron_key_edge(link), heap); + // PENDING -> RESOLVED, if it was pending. + maybe_resolve_ephemeron(state, link); + } + } +} + +void gc_trace_ephemeron(struct gc_ephemeron *e, + void (*visit)(struct gc_edge edge, struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *trace_data) { + unsigned epoch = gc_heap_ephemeron_trace_epoch(heap); + uint8_t expected = EPHEMERON_STATE_TRACED; + // TRACED[_] -> CLAIMED[_]. + if (!atomic_compare_exchange_strong(&e->state, &expected, + EPHEMERON_STATE_CLAIMED)) + return; + + + if (e->epoch == epoch) { + // CLAIMED[epoch] -> TRACED[epoch]. + atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED, + memory_order_release); + return; + } + + // CLAIMED[!epoch] -> CLAIMED[epoch]. + e->epoch = epoch; + e->pending = NULL; + e->resolved = NULL; + + // Trace chain successors, eliding any intermediate dead links. Note + // that there is a race between trace-time evacuation of the next link + // in the chain and any mutation of that link pointer by the mutator + // (which can only be to advance the chain forward past dead links). + // Collectors using this API have to eliminate this race, for example + // by not evacuating while the mutator is running. + follow_chain(&e->chain); + visit(gc_edge(&e->chain), heap, trace_data); + + // Similarly there is a race between the mutator marking an ephemeron + // as dead and here; the consequence would be that we treat an + // ephemeron as live when it's not, but only for this cycle. No big + // deal. + if (atomic_load_explicit(&e->is_dead, memory_order_acquire)) { + // CLAIMED[epoch] -> TRACED[epoch]. + atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED, + memory_order_release); + return; + } + + // If K is live, trace V and we are done. 
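+  // gc_visit_ephemeron_key returns nonzero if and only if K is already
+  // known to be live at this point, updating the key edge with K's new
+  // address if K has been moved.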
+ if (gc_visit_ephemeron_key(gc_ephemeron_key_edge(e), heap)) { + visit(gc_ephemeron_value_edge(e), heap, trace_data); + // CLAIMED[epoch] -> TRACED[epoch]. + atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED, + memory_order_release); + return; + } + + // Otherwise K is not yet traced, so we don't know if it is live. + // Publish the ephemeron to a global table. + struct gc_pending_ephemerons *state = gc_heap_pending_ephemerons(heap); + // CLAIMED[epoch] -> PENDING. + add_pending_ephemeron(state, e); + + // Given an ephemeron E×K⇒V, there is a race between marking K and E. + // One thread could go to mark E and see that K is unmarked, so we get + // here. Meanwhile another thread could go to mark K and not see E in + // the global table yet. Therefore after publishing E, we have to + // check the mark on K again. + if (gc_visit_ephemeron_key(gc_ephemeron_key_edge(e), heap)) + // K visited by another thread while we published E; PENDING -> + // RESOLVED, if still PENDING. + maybe_resolve_ephemeron(state, e); +} + +void +gc_scan_pending_ephemerons(struct gc_pending_ephemerons *state, + struct gc_heap *heap, size_t shard, + size_t nshards) { + GC_ASSERT(shard < nshards); + size_t start = state->nbuckets * 1.0 * shard / nshards; + size_t end = state->nbuckets * 1.0 * (shard + 1) / nshards; + for (size_t idx = start; idx < end; idx++) { + for (struct gc_ephemeron *e = follow_pending(&state->buckets[idx]); + e; + e = follow_pending(&e->pending)) { + if (gc_visit_ephemeron_key(gc_ephemeron_key_edge(e), heap)) + // PENDING -> RESOLVED, if PENDING. + maybe_resolve_ephemeron(state, e); + } + } +} + +int +gc_pop_resolved_ephemerons(struct gc_heap *heap, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + void *trace_data) { + struct gc_pending_ephemerons *state = gc_heap_pending_ephemerons(heap); + struct gc_ephemeron *resolved = atomic_exchange(&state->resolved, NULL); + if (!resolved) + return 0; + for (; resolved; resolved = resolved->resolved) { + visit(gc_ephemeron_value_edge(resolved), heap, trace_data); + // RESOLVED -> TRACED. + atomic_store_explicit(&resolved->state, EPHEMERON_STATE_TRACED, + memory_order_release); + } + return 1; +} + +void +gc_sweep_pending_ephemerons(struct gc_pending_ephemerons *state, + size_t shard, size_t nshards) { + GC_ASSERT(shard < nshards); + size_t start = state->nbuckets * 1.0 * shard / nshards; + size_t end = state->nbuckets * 1.0 * (shard + 1) / nshards; + for (size_t idx = start; idx < end; idx++) { + struct gc_ephemeron **bucket = &state->buckets[idx]; + for (struct gc_ephemeron *e = follow_pending(bucket); + e; + e = follow_pending(&e->pending)) { + // PENDING -> TRACED, but dead. + atomic_store_explicit(&e->is_dead, 1, memory_order_release); + atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED, + memory_order_release); + } + atomic_store_explicit(bucket, NULL, memory_order_release); + } +} + +//////////////////////////////////////////////////////////////////////// +// Allocation & initialization +//////////////////////////////////////////////////////////////////////// + +void gc_ephemeron_init_internal(struct gc_heap *heap, + struct gc_ephemeron *ephemeron, + struct gc_ref key, struct gc_ref value) { + // Caller responsible for any write barrier, though really the + // assumption is that the ephemeron is younger than the key and the + // value. 
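+  //
+  // The epoch is initialized to the previous collection's epoch, so
+  // that the first trace of this ephemeron in the next collection sees
+  // a stale epoch and takes the full claim-and-trace path.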
+ ephemeron->state = EPHEMERON_STATE_TRACED; + ephemeron->is_dead = 0; + ephemeron->epoch = gc_heap_ephemeron_trace_epoch(heap) - 1; + ephemeron->chain = NULL; + ephemeron->pending = NULL; + ephemeron->resolved = NULL; + ephemeron->key = key; + ephemeron->value = value; +} diff --git a/gc-ephemeron.h b/gc-ephemeron.h new file mode 100644 index 000000000..d5159dff3 --- /dev/null +++ b/gc-ephemeron.h @@ -0,0 +1,42 @@ +#ifndef GC_EPHEMERON_H_ +#define GC_EPHEMERON_H_ + +#include "gc-edge.h" +#include "gc-ref.h" +#include "gc-visibility.h" + +// Ephemerons establish an association between a "key" object and a +// "value" object. If the ephemeron and the key are live, then the +// value is live, and can be retrieved from the ephemeron. Ephemerons +// can be chained together, which allows them to function as links in a +// buckets-and-chains hash table. +// +// This file defines the user-facing API for ephemerons. + +struct gc_heap; +struct gc_mutator; +struct gc_ephemeron; + +GC_API_ size_t gc_ephemeron_size(void); +GC_API_ struct gc_ref gc_allocate_ephemeron(struct gc_mutator *mut); +GC_API_ void gc_ephemeron_init(struct gc_mutator *mut, + struct gc_ephemeron *ephemeron, + struct gc_ref key, struct gc_ref value); + +GC_API_ struct gc_ref gc_ephemeron_key(struct gc_ephemeron *ephemeron); +GC_API_ struct gc_ref gc_ephemeron_value(struct gc_ephemeron *ephemeron); + +GC_API_ struct gc_ephemeron* gc_ephemeron_chain_head(struct gc_ephemeron **loc); +GC_API_ void gc_ephemeron_chain_push(struct gc_ephemeron **loc, + struct gc_ephemeron *ephemeron); +GC_API_ struct gc_ephemeron* gc_ephemeron_chain_next(struct gc_ephemeron *ephemeron); +GC_API_ void gc_ephemeron_mark_dead(struct gc_ephemeron *ephemeron); + +GC_API_ void gc_trace_ephemeron(struct gc_ephemeron *ephemeron, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *trace_data); + +#endif // GC_EPHEMERON_H_ diff --git a/gc-internal.h b/gc-internal.h new file mode 100644 index 000000000..f74336dc9 --- /dev/null +++ b/gc-internal.h @@ -0,0 +1,10 @@ +#ifndef GC_INTERNAL_H +#define GC_INTERNAL_H + +#ifndef GC_IMPL +#error internal header file, not part of API +#endif + +#include "gc-ephemeron-internal.h" + +#endif // GC_INTERNAL_H diff --git a/gc-visibility.h b/gc-visibility.h index 7360915a0..b7e1995df 100644 --- a/gc-visibility.h +++ b/gc-visibility.h @@ -4,4 +4,9 @@ #define GC_INTERNAL __attribute__((visibility("hidden"))) #define GC_PUBLIC __attribute__((visibility("default"))) +// FIXME: Conflict with bdw-gc GC_API. Switch prefix? 
+#ifndef GC_API_ +#define GC_API_ GC_INTERNAL +#endif + #endif // GC_VISIBILITY_H diff --git a/large-object-space.h b/large-object-space.h index ddd1bfcde..de41dea60 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -92,6 +92,16 @@ done: return copied; } +static int large_object_space_is_copied(struct large_object_space *space, + struct gc_ref ref) { + int copied = 0; + uintptr_t addr = gc_ref_value(ref); + pthread_mutex_lock(&space->lock); + copied = address_set_contains(&space->from_space, addr); + pthread_mutex_unlock(&space->lock); + return copied; +} + static int large_object_space_mark_object(struct large_object_space *space, struct gc_ref ref) { return large_object_space_copy(space, ref); diff --git a/semi.c b/semi.c index d9e1110cf..e7c2b59cd 100644 --- a/semi.c +++ b/semi.c @@ -5,9 +5,11 @@ #include #include -#define GC_API_ #include "gc-api.h" +#define GC_IMPL 1 +#include "gc-internal.h" + #include "semi-attrs.h" #include "large-object-space.h" @@ -24,11 +26,16 @@ struct semi_space { size_t stolen_pages; uintptr_t base; size_t size; - long count; }; struct gc_heap { struct semi_space semi_space; struct large_object_space large_object_space; + struct gc_pending_ephemerons *pending_ephemerons; + double pending_ephemerons_size_factor; + double pending_ephemerons_size_slop; + size_t size; + long count; + int check_pending_ephemerons; }; // One mutator per space, can just store the heap in the mutator. struct gc_mutator { @@ -96,16 +103,20 @@ static void flip(struct semi_space *space) { space->from_space = space->to_space; space->to_space = space->hp; space->limit = space->hp + space->size / 2; - space->count++; } -static struct gc_ref copy(struct semi_space *space, struct gc_ref ref) { +static struct gc_ref copy(struct gc_heap *heap, struct semi_space *space, + struct gc_ref ref) { size_t size; gc_trace_object(ref, NULL, NULL, NULL, &size); struct gc_ref new_ref = gc_ref(space->hp); memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(ref), size); gc_object_forward_nonatomic(ref, new_ref); space->hp += align_up(size, GC_ALIGNMENT); + + if (GC_UNLIKELY(heap->check_pending_ephemerons)) + gc_resolve_pending_ephemerons(ref, heap); + return new_ref; } @@ -115,21 +126,26 @@ static uintptr_t scan(struct gc_heap *heap, struct gc_ref grey) { return gc_ref_value(grey) + align_up(size, GC_ALIGNMENT); } -static struct gc_ref forward(struct semi_space *space, struct gc_ref obj) { +static struct gc_ref forward(struct gc_heap *heap, struct semi_space *space, + struct gc_ref obj) { uintptr_t forwarded = gc_object_forwarded_nonatomic(obj); - return forwarded ? gc_ref(forwarded) : copy(space, obj); + return forwarded ? 
gc_ref(forwarded) : copy(heap, space, obj); } static void visit_semi_space(struct gc_heap *heap, struct semi_space *space, struct gc_edge edge, struct gc_ref ref) { - gc_edge_update(edge, forward(space, ref)); + gc_edge_update(edge, forward(heap, space, ref)); } static void visit_large_object_space(struct gc_heap *heap, struct large_object_space *space, struct gc_ref ref) { - if (large_object_space_copy(space, ref)) + if (large_object_space_copy(space, ref)) { + if (GC_UNLIKELY(heap->check_pending_ephemerons)) + gc_resolve_pending_ephemerons(ref, heap); + gc_trace_object(ref, trace, heap, NULL, NULL); + } } static int semi_space_contains(struct semi_space *space, struct gc_ref ref) { @@ -149,6 +165,26 @@ static void visit(struct gc_edge edge, struct gc_heap *heap) { GC_CRASH(); } +struct gc_pending_ephemerons * +gc_heap_pending_ephemerons(struct gc_heap *heap) { + return heap->pending_ephemerons; +} + +int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { + struct gc_ref ref = gc_edge_ref(edge); + GC_ASSERT(gc_ref_is_heap_object(ref)); + if (semi_space_contains(heap_semi_space(heap), ref)) { + uintptr_t forwarded = gc_object_forwarded_nonatomic(ref); + if (!forwarded) + return 0; + gc_edge_update(edge, gc_ref(forwarded)); + return 1; + } else if (large_object_space_contains(heap_large_object_space(heap), ref)) { + return large_object_space_is_copied(heap_large_object_space(heap), ref); + } + GC_CRASH(); +} + static void trace(struct gc_edge edge, struct gc_heap *heap, void *visit_data) { return visit(edge, heap); } @@ -160,14 +196,22 @@ static void collect(struct gc_mutator *mut) { // fprintf(stderr, "start collect #%ld:\n", space->count); large_object_space_start_gc(large, 0); flip(semi); + heap->count++; + heap->check_pending_ephemerons = 0; uintptr_t grey = semi->hp; if (mut->roots) gc_trace_mutator_roots(mut->roots, trace, heap, NULL); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); while(grey < semi->hp) grey = scan(heap, gc_ref(grey)); + gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); + heap->check_pending_ephemerons = 1; + while (gc_pop_resolved_ephemerons(heap, trace, NULL)) + while(grey < semi->hp) + grey = scan(heap, gc_ref(grey)); large_object_space_finish_gc(large, 0); semi_space_set_stolen_pages(semi, large->live_pages_at_last_collection); + gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); } @@ -229,6 +273,15 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } +struct gc_ref gc_allocate_ephemeron(struct gc_mutator *mut) { + return gc_ref_from_heap_object(gc_allocate(mut, gc_ephemeron_size())); +} + +void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, + struct gc_ref key, struct gc_ref value) { + gc_ephemeron_init_internal(mutator_heap(mut), ephemeron, key, value); +} + static int initialize_semi_space(struct semi_space *space, size_t size) { // Allocate even numbers of pages. 
size_t page_size = getpagesize(); @@ -246,7 +299,6 @@ static int initialize_semi_space(struct semi_space *space, size_t size) { space->page_size = page_size; space->stolen_pages = 0; space->size = size; - space->count = 0; return 1; } @@ -315,6 +367,29 @@ static int parse_options(int argc, struct gc_option argv[], return 1; } +static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { + struct gc_pending_ephemerons *cur = heap->pending_ephemerons; + size_t target = heap->size * heap->pending_ephemerons_size_factor; + double slop = heap->pending_ephemerons_size_slop; + + heap->pending_ephemerons = gc_prepare_pending_ephemerons(cur, target, slop); + + return !!heap->pending_ephemerons; +} + +unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { + return heap->count; +} + +static int heap_init(struct gc_heap *heap, size_t size) { + heap->pending_ephemerons_size_factor = 0.01; + heap->pending_ephemerons_size_slop = 0.5; + heap->count = 0; + heap->size = size; + + return heap_prepare_pending_ephemerons(heap); +} + int gc_init(int argc, struct gc_option argv[], struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut) { @@ -331,6 +406,9 @@ int gc_init(int argc, struct gc_option argv[], if (!*mut) GC_CRASH(); *heap = mutator_heap(*mut); + if (!heap_init(*heap, options.fixed_heap_size)) + return 0; + struct semi_space *space = mutator_semi_space(*mut); if (!initialize_semi_space(space, options.fixed_heap_size)) return 0; @@ -367,7 +445,6 @@ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), } void gc_print_stats(struct gc_heap *heap) { - struct semi_space *space = heap_semi_space(heap); - printf("Completed %ld collections\n", space->count); - printf("Heap size is %zd\n", space->size); + printf("Completed %ld collections\n", heap->count); + printf("Heap size is %zd\n", heap->size); } diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h index 758e56462..70fd5c7a8 100644 --- a/simple-gc-embedder.h +++ b/simple-gc-embedder.h @@ -5,6 +5,8 @@ #include "gc-config.h" #include "gc-embedder-api.h" +#define GC_EMBEDDER_EPHEMERON_HEADER struct gc_header header; + static inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement) { #if GC_CONSERVATIVE_ROOTS || GC_CONSERVATIVE_TRACE diff --git a/simple-roots-api.h b/simple-roots-api.h index 1cdfc15e0..d94397adf 100644 --- a/simple-roots-api.h +++ b/simple-roots-api.h @@ -5,8 +5,9 @@ #include "simple-roots-types.h" #define HANDLE_TO(T) union { T* v; struct handle handle; } -#define HANDLE_REF(h) h.v -#define HANDLE_SET(h,val) do { h.v = val; } while (0) +#define HANDLE_LOC(h) &(h).v +#define HANDLE_REF(h) (h).v +#define HANDLE_SET(h,val) do { (h).v = val; } while (0) #define PUSH_HANDLE(cx, h) push_handle(&(cx)->roots.roots, &h.handle) #define POP_HANDLE(cx) pop_handle(&(cx)->roots.roots) diff --git a/whippet.c b/whippet.c index ee18445de..9f81948a7 100644 --- a/whippet.c +++ b/whippet.c @@ -7,10 +7,10 @@ #include #include -#define GC_API_ #include "gc-api.h" #define GC_IMPL 1 +#include "gc-internal.h" #include "debug.h" #include "gc-align.h" @@ -77,9 +77,9 @@ enum metadata_byte { METADATA_BYTE_MARK_1 = 4, METADATA_BYTE_MARK_2 = 8, METADATA_BYTE_END = 16, - METADATA_BYTE_UNUSED_1 = 32, - METADATA_BYTE_UNUSED_2 = 64, - METADATA_BYTE_UNUSED_3 = 128 + METADATA_BYTE_EPHEMERON = 32, + METADATA_BYTE_PINNED = 64, + METADATA_BYTE_UNUSED_1 = 128 }; static uint8_t rotate_dead_survivor_marked(uint8_t mask) { @@ -307,6 +307,8 @@ struct gc_heap { size_t size; int collecting; int 
mark_while_stopping; + int check_pending_ephemerons; + struct gc_pending_ephemerons *pending_ephemerons; enum gc_kind gc_kind; int multithreaded; size_t active_mutator_count; @@ -323,6 +325,8 @@ struct gc_heap { double minor_gc_yield_threshold; double major_gc_yield_threshold; double minimum_major_gc_yield_threshold; + double pending_ephemerons_size_factor; + double pending_ephemerons_size_slop; }; struct gc_mutator_mark_buf { @@ -649,8 +653,8 @@ static inline int mark_space_contains(struct mark_space *space, return mark_space_contains_address(space, gc_ref_value(ref)); } -static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { - struct gc_ref ref = gc_edge_ref(edge); +static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, + struct gc_ref ref) { if (!gc_ref_is_heap_object(ref)) return 0; if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { @@ -666,6 +670,63 @@ static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { GC_CRASH(); } +static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { + struct gc_ref ref = gc_edge_ref(edge); + int is_new = do_trace(heap, edge, ref); + + if (GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, + memory_order_relaxed))) + gc_resolve_pending_ephemerons(ref, heap); + + return is_new; +} + +int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { + struct gc_ref ref = gc_edge_ref(edge); + if (!gc_ref_is_heap_object(ref)) + return 0; + if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { + struct mark_space *space = heap_mark_space(heap); + uint8_t *metadata = metadata_byte_for_object(ref); + uint8_t byte = *metadata; + if (byte & space->marked_mask) + return 1; + + if (!space->evacuating) + return 0; + if (!block_summary_has_flag(block_summary_for_addr(gc_ref_value(ref)), + BLOCK_EVACUATE)) + return 0; + + struct gc_atomic_forward fwd = gc_atomic_forward_begin(ref); + switch (fwd.state) { + case GC_FORWARDING_STATE_NOT_FORWARDED: + return 0; + case GC_FORWARDING_STATE_BUSY: + // Someone else claimed this object first. Spin until new address + // known, or evacuation aborts. + for (size_t spin_count = 0;; spin_count++) { + if (gc_atomic_forward_retry_busy(&fwd)) + break; + yield_for_spin(spin_count); + } + if (fwd.state == GC_FORWARDING_STATE_ABORTED) + // Remote evacuation aborted; remote will mark and enqueue. + return 1; + ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); + // Fall through. 
+ case GC_FORWARDING_STATE_FORWARDED: + gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); + return 1; + default: + GC_CRASH(); + } + } else if (large_object_space_contains(heap_large_object_space(heap), ref)) { + return large_object_space_is_copied(heap_large_object_space(heap), ref); + } + GC_CRASH(); +} + static inline struct gc_ref mark_space_mark_conservative_ref(struct mark_space *space, struct gc_conservative_ref ref, int possibly_interior) { @@ -732,9 +793,9 @@ static inline struct gc_ref mark_space_mark_conservative_ref(struct mark_space * return gc_ref(addr); } -static inline struct gc_ref trace_conservative_ref(struct gc_heap *heap, - struct gc_conservative_ref ref, - int possibly_interior) { +static inline struct gc_ref do_trace_conservative_ref(struct gc_heap *heap, + struct gc_conservative_ref ref, + int possibly_interior) { if (!gc_conservative_ref_might_be_a_heap_object(ref, possibly_interior)) return gc_ref_null(); @@ -746,6 +807,19 @@ static inline struct gc_ref trace_conservative_ref(struct gc_heap *heap, ref, possibly_interior); } +static inline struct gc_ref trace_conservative_ref(struct gc_heap *heap, + struct gc_conservative_ref ref, + int possibly_interior) { + struct gc_ref ret = do_trace_conservative_ref(heap, ref, possibly_interior); + + if (gc_ref_is_heap_object(ret) && + GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, + memory_order_relaxed))) + gc_resolve_pending_ephemerons(ret, heap); + + return ret; +} + static inline size_t mark_space_object_size(struct mark_space *space, struct gc_ref ref) { uint8_t *loc = metadata_byte_for_object(ref); @@ -1091,18 +1165,37 @@ static inline void tracer_trace_conservative_ref(struct gc_conservative_ref ref, tracer_enqueue(resolved, heap, data); } +static inline void trace_one_conservatively(struct gc_ref ref, + struct gc_heap *heap, + void *mark_data) { + size_t bytes; + if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { + // Generally speaking we trace conservatively and don't allow much + // in the way of incremental precise marking on a + // conservative-by-default heap. But, we make an exception for + // ephemerons. + uint8_t meta = *metadata_byte_for_addr(gc_ref_value(ref)); + if (GC_UNLIKELY(meta & METADATA_BYTE_EPHEMERON)) { + gc_trace_ephemeron(gc_ref_heap_object(ref), tracer_visit, heap, + mark_data); + return; + } + bytes = mark_space_object_size(heap_mark_space(heap), ref); + } else { + bytes = large_object_space_object_size(heap_large_object_space(heap), ref); + } + trace_conservative_edges(gc_ref_value(ref), + gc_ref_value(ref) + bytes, + tracer_trace_conservative_ref, heap, + mark_data); +} + static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, void *mark_data) { - if (gc_has_conservative_intraheap_edges()) { - size_t bytes = GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref)) - ? 
mark_space_object_size(heap_mark_space(heap), ref) - : large_object_space_object_size(heap_large_object_space(heap), ref); - trace_conservative_edges(gc_ref_value(ref), - gc_ref_value(ref) + bytes, - tracer_trace_conservative_ref, heap, mark_data); - } else { + if (gc_has_conservative_intraheap_edges()) + trace_one_conservatively(ref, heap, mark_data); + else gc_trace_object(ref, tracer_visit, heap, mark_data, NULL); - } } static void @@ -1672,6 +1765,26 @@ static void mark_space_finish_gc(struct mark_space *space, release_evacuation_target_blocks(space); } +static void resolve_ephemerons_lazily(struct gc_heap *heap) { + atomic_store_explicit(&heap->check_pending_ephemerons, 0, + memory_order_release); +} + +static void resolve_ephemerons_eagerly(struct gc_heap *heap) { + atomic_store_explicit(&heap->check_pending_ephemerons, 1, + memory_order_release); + gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); +} + +static int enqueue_resolved_ephemerons(struct gc_heap *heap) { + return gc_pop_resolved_ephemerons(heap, trace_and_enqueue_globally, + NULL); +} + +static void sweep_ephemerons(struct gc_heap *heap) { + return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); +} + static void collect(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); @@ -1684,6 +1797,7 @@ static void collect(struct gc_mutator *mut) { enum gc_kind gc_kind = determine_collection_kind(heap); update_mark_patterns(space, !(gc_kind & GC_KIND_FLAG_MINOR)); large_object_space_start_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); + resolve_ephemerons_lazily(heap); tracer_prepare(heap); request_mutators_to_stop(heap); trace_mutator_roots_with_lock_before_stop(mut); @@ -1697,6 +1811,10 @@ static void collect(struct gc_mutator *mut) { prepare_for_evacuation(heap); trace_roots_after_stop(heap); tracer_trace(heap); + resolve_ephemerons_eagerly(heap); + while (enqueue_resolved_ephemerons(heap)) + tracer_trace(heap); + sweep_ephemerons(heap); tracer_release(heap); mark_space_finish_gc(space, gc_kind); large_object_space_finish_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); @@ -2054,6 +2172,31 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } +struct gc_ref gc_allocate_ephemeron(struct gc_mutator *mut) { + struct gc_ref ret = + gc_ref_from_heap_object(gc_allocate(mut, gc_ephemeron_size())); + if (gc_has_conservative_intraheap_edges()) { + uint8_t *metadata = metadata_byte_for_addr(gc_ref_value(ret)); + *metadata |= METADATA_BYTE_EPHEMERON; + } + return ret; +} + +void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, + struct gc_ref key, struct gc_ref value) { + gc_ephemeron_init_internal(mutator_heap(mut), ephemeron, key, value); + // No write barrier: we require that the ephemeron be newer than the + // key or the value. 
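+  //
+  // A typical embedder usage sketch (`table` and its `head` field are
+  // hypothetical, owned by the embedder):
+  //
+  //   struct gc_ephemeron *e =
+  //     gc_ref_heap_object(gc_allocate_ephemeron(mut));
+  //   gc_ephemeron_init(mut, e, key, value);
+  //   gc_ephemeron_chain_push(&table->head, e);
+  //
+  //   // Lookup: walk the chain; dead links are elided as we go.
+  //   for (struct gc_ephemeron *l = gc_ephemeron_chain_head(&table->head);
+  //        l; l = gc_ephemeron_chain_next(l))
+  //     if (gc_ref_value(gc_ephemeron_key(l)) == gc_ref_value(key))
+  //       return gc_ephemeron_value(l);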
+} + +struct gc_pending_ephemerons *gc_heap_pending_ephemerons(struct gc_heap *heap) { + return heap->pending_ephemerons; +} + +unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { + return heap->count; +} + #define FOR_EACH_GC_OPTION(M) \ M(GC_OPTION_FIXED_HEAP_SIZE, "fixed-heap-size") \ M(GC_OPTION_PARALLELISM, "parallelism") @@ -2141,6 +2284,16 @@ static struct slab* allocate_slabs(size_t nslabs) { return (struct slab*) aligned_base; } +static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { + struct gc_pending_ephemerons *cur = heap->pending_ephemerons; + size_t target = heap->size * heap->pending_ephemerons_size_factor; + double slop = heap->pending_ephemerons_size_slop; + + heap->pending_ephemerons = gc_prepare_pending_ephemerons(cur, target, slop); + + return !!heap->pending_ephemerons; +} + static int heap_init(struct gc_heap *heap, struct options *options) { // *heap is already initialized to 0. @@ -2152,6 +2305,8 @@ static int heap_init(struct gc_heap *heap, struct options *options) { if (!tracer_init(heap, options->parallelism)) GC_CRASH(); + heap->pending_ephemerons_size_factor = 0.005; + heap->pending_ephemerons_size_slop = 0.5; heap->fragmentation_low_threshold = 0.05; heap->fragmentation_high_threshold = 0.10; heap->minor_gc_yield_threshold = 0.30; @@ -2159,6 +2314,9 @@ static int heap_init(struct gc_heap *heap, struct options *options) { heap->major_gc_yield_threshold = clamp_major_gc_yield_threshold(heap, heap->minor_gc_yield_threshold); + if (!heap_prepare_pending_ephemerons(heap)) + GC_CRASH(); + return 1; } From 499ff1fe765e8393a31e5b9d50ef479b58df5e1f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 23 Jan 2023 20:57:02 +0100 Subject: [PATCH 162/403] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ced078698..a4dcdbd21 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ large majority of use cases. - [ ] Heap growth/shrinking - [ ] Debugging/tracing - [ ] Finalizers - - [ ] Weak references / weak maps + - [X] Weak references / weak maps ### Features that would improve Whippet performance From 4cb26e0144880dcf8ff0cf4973e2a4e9e5a8cded Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 15 Feb 2023 10:50:17 +0100 Subject: [PATCH 163/403] Rework options interface Users will want to set options from an environment variable or something like that. Particular GC implementations will want to expose an expanded set of options. For these reasons we make the options interface a bit more generalized and include parsing. 
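For example, an embedder might now set up a heap along these lines (a
sketch: the setup_gc helper and the GC_OPTIONS environment variable are
just illustrations, not something defined by this patch):

    #include <stdio.h>
    #include <stdlib.h>

    #include "gc-api.h"

    static int setup_gc(struct gc_heap **heap, struct gc_mutator **mut) {
      struct gc_options *options = gc_allocate_options();
      gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED);
      gc_options_set_size(options, GC_OPTION_HEAP_SIZE, 64 * 1024 * 1024);
      gc_options_set_int(options, GC_OPTION_PARALLELISM, 4);

      // Optionally let users override settings from a comma-delimited
      // string, e.g. GC_OPTIONS="heap-size=1g,parallelism=8".
      const char *str = getenv("GC_OPTIONS");
      if (str && !gc_options_parse_and_set_many(options, str)) {
        fprintf(stderr, "failed to parse GC options: %s\n", str);
        return 0;
      }

      return gc_init(options, NULL, heap, mut);
    }

Unknown options and out-of-range values are rejected by the setters and
the parser instead of crashing at init time.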
--- Makefile | 31 +++---- bdw.c | 103 +++++++---------------- ephemerons.c | 10 ++- gc-api.h | 22 ++--- gc-internal.h | 1 + gc-options-internal.h | 32 +++++++ gc-options.c | 181 ++++++++++++++++++++++++++++++++++++++++ gc-options.h | 39 +++++++++ gc-platform-gnu-linux.c | 8 ++ gc-platform.h | 1 + mt-gcbench.c | 11 ++- quads.c | 10 ++- semi.c | 111 +++++++++--------------- whippet.c | 108 ++++++++---------------- 14 files changed, 411 insertions(+), 257 deletions(-) create mode 100644 gc-options-internal.h create mode 100644 gc-options.c create mode 100644 gc-options.h diff --git a/Makefile b/Makefile index 6f5652ad5..3cfc68013 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,9 @@ gc-platform.o: gc-platform.h gc-platform-$(PLATFORM).c gc-visibility.h gc-stack.o: gc-stack.c $(COMPILE) -o $@ -c $< +gc-options.o: gc-options.c gc-options.h gc-options-internal.h + $(COMPILE) -o $@ -c $< + gc-ephemeron-%.o: gc-ephemeron.c gc-ephemeron.h gc-ephemeron-internal.h %-embedder.h $(COMPILE) -include $*-embedder.h -o $@ -c $< @@ -51,98 +54,98 @@ bdw-%-gc.o: bdw.c %-embedder.h %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include $*-embedder.h -o $@ -c bdw.c bdw-%.o: bdw.c %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include bdw-attrs.h -o $@ -c $*.c -bdw-%: bdw-%.o bdw-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +bdw-%: bdw-%.o bdw-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) `pkg-config --libs bdw-gc` -o $@ $^ semi-%-gc.o: semi.c %-embedder.h large-object-space.h assert.h debug.h %.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c semi.c semi-%.o: semi.c %.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include semi-attrs.h -o $@ -c $*.c -semi-%: semi-%.o semi-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +semi-%: semi-%.o semi-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -whippet-%: whippet-%.o whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +whippet-%: whippet-%.o whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ stack-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c stack-conservative-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -stack-conservative-whippet-%: stack-conservative-whippet-%.o stack-conservative-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +stack-conservative-whippet-%: stack-conservative-whippet-%.o stack-conservative-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ heap-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c heap-conservative-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c -heap-conservative-whippet-%: heap-conservative-whippet-%.o 
heap-conservative-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +heap-conservative-whippet-%: heap-conservative-whippet-%.o heap-conservative-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -parallel-whippet-%: parallel-whippet-%.o parallel-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +parallel-whippet-%: parallel-whippet-%.o parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ stack-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c stack-conservative-parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -stack-conservative-parallel-whippet-%: stack-conservative-parallel-whippet-%.o stack-conservative-parallel-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +stack-conservative-parallel-whippet-%: stack-conservative-parallel-whippet-%.o stack-conservative-parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ heap-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c heap-conservative-parallel-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include whippet-attrs.h -o $@ -c $*.c -heap-conservative-parallel-whippet-%: heap-conservative-parallel-whippet-%.o heap-conservative-parallel-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +heap-conservative-parallel-whippet-%: heap-conservative-parallel-whippet-%.o heap-conservative-parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -generational-whippet-%: generational-whippet-%.o generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +generational-whippet-%: generational-whippet-%.o generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ stack-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c stack-conservative-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -stack-conservative-generational-whippet-%: stack-conservative-generational-whippet-%.o 
stack-conservative-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +stack-conservative-generational-whippet-%: stack-conservative-generational-whippet-%.o stack-conservative-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ heap-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c heap-conservative-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c -heap-conservative-generational-whippet-%: heap-conservative-generational-whippet-%.o heap-conservative-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +heap-conservative-generational-whippet-%: heap-conservative-generational-whippet-%.o heap-conservative-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -parallel-generational-whippet-%: parallel-generational-whippet-%.o parallel-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +parallel-generational-whippet-%: parallel-generational-whippet-%.o parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ stack-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c stack-conservative-parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c -stack-conservative-parallel-generational-whippet-%: stack-conservative-parallel-generational-whippet-%.o stack-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +stack-conservative-parallel-generational-whippet-%: stack-conservative-parallel-generational-whippet-%.o stack-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ heap-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c heap-conservative-parallel-generational-whippet-%.o: whippet.c %.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c -heap-conservative-parallel-generational-whippet-%: heap-conservative-parallel-generational-whippet-%.o heap-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-platform.o gc-ephemeron-%.o +heap-conservative-parallel-generational-whippet-%: 
heap-conservative-parallel-generational-whippet-%.o heap-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ .PRECIOUS: $(ALL_TESTS) diff --git a/bdw.c b/bdw.c index eb05c1b44..b77847612 100644 --- a/bdw.c +++ b/bdw.c @@ -213,79 +213,40 @@ static inline struct gc_heap *mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } -#define FOR_EACH_GC_OPTION(M) \ - M(GC_OPTION_FIXED_HEAP_SIZE, "fixed-heap-size") \ - M(GC_OPTION_PARALLELISM, "parallelism") - -static void dump_available_gc_options(void) { - fprintf(stderr, "available gc options:"); -#define PRINT_OPTION(option, name) fprintf(stderr, " %s", name); - FOR_EACH_GC_OPTION(PRINT_OPTION) -#undef PRINT_OPTION - fprintf(stderr, "\n"); -} - -int gc_option_from_string(const char *str) { -#define PARSE_OPTION(option, name) if (strcmp(str, name) == 0) return option; - FOR_EACH_GC_OPTION(PARSE_OPTION) -#undef PARSE_OPTION - if (strcmp(str, "fixed-heap-size") == 0) - return GC_OPTION_FIXED_HEAP_SIZE; - if (strcmp(str, "parallelism") == 0) - return GC_OPTION_PARALLELISM; - fprintf(stderr, "bad gc option: '%s'\n", str); - dump_available_gc_options(); - return -1; -} - -struct options { - size_t fixed_heap_size; - size_t parallelism; +struct gc_options { + struct gc_common_options common; }; - -static size_t parse_size_t(double value) { - GC_ASSERT(value >= 0); - GC_ASSERT(value <= (size_t) -1); - return value; +int gc_option_from_string(const char *str) { + return gc_common_option_from_string(str); +} +struct gc_options* gc_allocate_options(void) { + struct gc_options *ret = malloc(sizeof(struct gc_options)); + gc_init_common_options(&ret->common); + return ret; +} +int gc_options_set_int(struct gc_options *options, int option, int value) { + return gc_common_options_set_int(&options->common, option, value); +} +int gc_options_set_size(struct gc_options *options, int option, + size_t value) { + return gc_common_options_set_size(&options->common, option, value); +} +int gc_options_set_double(struct gc_options *options, int option, + double value) { + return gc_common_options_set_double(&options->common, option, value); +} +int gc_options_parse_and_set(struct gc_options *options, int option, + const char *value) { + return gc_common_options_parse_and_set(&options->common, option, value); } -static size_t number_of_current_processors(void) { return 1; } - -static int parse_options(int argc, struct gc_option argv[], - struct options *options) { - for (int i = 0; i < argc; i++) { - switch (argv[i].option) { - case GC_OPTION_FIXED_HEAP_SIZE: - options->fixed_heap_size = parse_size_t(argv[i].value); - break; - case GC_OPTION_PARALLELISM: - options->parallelism = parse_size_t(argv[i].value); - break; - default: - GC_CRASH(); - } - } - - if (!options->fixed_heap_size) { - fprintf(stderr, "fixed heap size is currently required\n"); - return 0; - } - if (!options->parallelism) - options->parallelism = number_of_current_processors(); - - return 1; -} - -int gc_init(int argc, struct gc_option argv[], - struct gc_stack_addr *stack_base, struct gc_heap **heap, - struct gc_mutator **mutator) { +int gc_init(struct gc_options *options, struct gc_stack_addr *stack_base, + struct gc_heap **heap, struct gc_mutator **mutator) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES); GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_INLINE_FREELIST_COUNT * GC_INLINE_GRANULE_BYTES); - struct options options = { 0, }; - if (!parse_options(argc, argv, &options)) - 
return 0; + if (!options) options = gc_allocate_options(); // GC_full_freq = 30; // GC_free_space_divisor = 16; @@ -293,16 +254,16 @@ int gc_init(int argc, struct gc_option argv[], // Ignore stack base for main thread. - GC_set_max_heap_size(options.fixed_heap_size); + GC_set_max_heap_size(options->common.heap_size); // Not part of 7.3, sigh. Have to set an env var. - // GC_set_markers_count(options.parallelism); + // GC_set_markers_count(options->common.parallelism); char markers[21] = {0,}; // 21 bytes enough for 2**64 in decimal + NUL. - snprintf(markers, sizeof(markers), "%zu", options.parallelism); + snprintf(markers, sizeof(markers), "%d", options->common.parallelism); setenv("GC_MARKERS", markers, 1); GC_init(); size_t current_heap_size = GC_get_heap_size(); - if (options.fixed_heap_size > current_heap_size) - GC_expand_hp(options.fixed_heap_size - current_heap_size); + if (options->common.heap_size > current_heap_size) + GC_expand_hp(options->common.heap_size - current_heap_size); GC_allow_register_threads(); *heap = GC_malloc(sizeof(struct gc_heap)); pthread_mutex_init(&(*heap)->lock, NULL); diff --git a/ephemerons.c b/ephemerons.c index 84fc308f2..40779d741 100644 --- a/ephemerons.c +++ b/ephemerons.c @@ -225,12 +225,14 @@ int main(int argc, char *argv[]) { printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", heap_size / 1e9, heap_multiplier); - struct gc_option options[] = { { GC_OPTION_FIXED_HEAP_SIZE, (size_t) heap_size }, - { GC_OPTION_PARALLELISM, parallelism } }; + struct gc_options *options = gc_allocate_options(); + gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED); + gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size); + gc_options_set_int(options, GC_OPTION_PARALLELISM, parallelism); + struct gc_heap *heap; struct gc_mutator *mut; - if (!gc_init(sizeof options / sizeof options[0], options, NULL, &heap, - &mut)) { + if (!gc_init(options, NULL, &heap, &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", (size_t)heap_size); return 1; diff --git a/gc-api.h b/gc-api.h index 4ffee3fc7..183603338 100644 --- a/gc-api.h +++ b/gc-api.h @@ -4,9 +4,10 @@ #include "gc-config.h" #include "gc-assert.h" #include "gc-attrs.h" -#include "gc-inline.h" -#include "gc-ref.h" #include "gc-edge.h" +#include "gc-inline.h" +#include "gc-options.h" +#include "gc-ref.h" #include "gc-visibility.h" #include @@ -16,26 +17,13 @@ struct gc_heap; struct gc_mutator; -enum { - GC_OPTION_FIXED_HEAP_SIZE, - GC_OPTION_PARALLELISM -}; - -struct gc_option { - int option; - double value; -}; - -GC_API_ int gc_option_from_string(const char *str); - struct gc_stack_addr; GC_API_ void* gc_call_with_stack_addr(void* (*f)(struct gc_stack_addr *, void *), void *data) GC_NEVER_INLINE; -GC_API_ int gc_init(int argc, struct gc_option argv[], - struct gc_stack_addr *base, struct gc_heap **heap, - struct gc_mutator **mutator); +GC_API_ int gc_init(struct gc_options *options, struct gc_stack_addr *base, + struct gc_heap **heap, struct gc_mutator **mutator); struct gc_mutator_roots; GC_API_ void gc_mutator_set_roots(struct gc_mutator *mut, diff --git a/gc-internal.h b/gc-internal.h index f74336dc9..abc9bd83a 100644 --- a/gc-internal.h +++ b/gc-internal.h @@ -6,5 +6,6 @@ #endif #include "gc-ephemeron-internal.h" +#include "gc-options-internal.h" #endif // GC_INTERNAL_H diff --git a/gc-options-internal.h b/gc-options-internal.h new file mode 100644 index 000000000..4190cb841 --- /dev/null +++ b/gc-options-internal.h @@ -0,0 +1,32 @@ +#ifndef 
GC_OPTIONS_INTERNAL_H +#define GC_OPTIONS_INTERNAL_H + +#ifndef GC_IMPL +#error internal header file, not part of API +#endif + +#include "gc-options.h" + +struct gc_common_options { + enum gc_heap_size_policy heap_size_policy; + size_t heap_size; + size_t maximum_heap_size; + double heap_size_multiplier; + double heap_frugality; + int parallelism; +}; + +GC_INTERNAL void gc_init_common_options(struct gc_common_options *options); + +GC_INTERNAL int gc_common_option_from_string(const char *str); + +GC_INTERNAL int gc_common_options_set_int(struct gc_common_options *options, + int option, int value); +GC_INTERNAL int gc_common_options_set_size(struct gc_common_options *options, + int option, size_t value); +GC_INTERNAL int gc_common_options_set_double(struct gc_common_options *options, + int option, double value); +GC_INTERNAL int gc_common_options_parse_and_set(struct gc_common_options *options, + int option, const char *value); + +#endif // GC_OPTIONS_INTERNAL_H diff --git a/gc-options.c b/gc-options.c new file mode 100644 index 000000000..77373b170 --- /dev/null +++ b/gc-options.c @@ -0,0 +1,181 @@ +#include +#include +#include +#include + +#define GC_IMPL 1 + +#include "gc-options-internal.h" +#include "gc-platform.h" + +// M(UPPER, lower, repr, parser, default, min, max) +#define FOR_EACH_INT_GC_OPTION(M) \ + M(HEAP_SIZE_POLICY, heap_size_policy, "heap-size-policy", \ + int, GC_HEAP_SIZE_FIXED, GC_HEAP_SIZE_FIXED, GC_HEAP_SIZE_ADAPTIVE) \ + M(PARALLELISM, parallelism, "parallelism", \ + int, default_parallelism(), 1, 64) + +#define FOR_EACH_SIZE_GC_OPTION(M) \ + M(HEAP_SIZE, heap_size, "heap-size", \ + size, 6 * 1024 * 1024, 0, -1) \ + M(MAXIMUM_HEAP_SIZE, maximum_heap_size, "maximum-heap-size", \ + size, 0, 0, -1) + +#define FOR_EACH_DOUBLE_GC_OPTION(M) \ + M(HEAP_SIZE_MULTIPLIER, heap_size_multiplier, "heap-size-multiplier", \ + double, 1.75, 1.0, 1e6) \ + M(HEAP_FRUGALITY, heap_frugality, "heap-frugality", \ + double, 1e-1, 1e-6, 1e6) + +typedef int gc_option_int; +typedef size_t gc_option_size; +typedef double gc_option_double; + +#define FOR_EACH_COMMON_GC_OPTION(M) \ + FOR_EACH_INT_GC_OPTION(M) \ + FOR_EACH_SIZE_GC_OPTION(M) \ + FOR_EACH_DOUBLE_GC_OPTION(M) + +static int clamp_int(int n, int lo, int hi) { + return n < lo ? lo : n > hi ? hi : n; +} +static size_t clamp_size(size_t n, size_t lo, size_t hi) { + return n < lo ? lo : n > hi ? hi : n; +} +static double clamp_double(double n, double lo, double hi) { + return n < lo ? lo : n > hi ? 
hi : n; +} + +static int default_parallelism(void) { + return clamp_int(gc_platform_processor_count(), 1, 8); +} + +void gc_init_common_options(struct gc_common_options *options) { +#define INIT(UPPER, lower, repr, parser, default, min, max) \ + options->lower = default; + FOR_EACH_COMMON_GC_OPTION(INIT) +#undef INIT +} + +int gc_common_option_from_string(const char *str) { +#define GET_OPTION(UPPER, lower, repr, parser, default, min, max) \ + if (strcmp(str, repr) == 0) return GC_OPTION_##UPPER; + FOR_EACH_COMMON_GC_OPTION(GET_OPTION) +#undef GET_OPTION + return -1; +} + +#define SET_OPTION(UPPER, lower, repr, parser, default, min, max) \ + case GC_OPTION_##UPPER: \ + if (value != clamp_##parser(value, min, max)) return 0; \ + options->lower = value; \ + return 1; +#define DEFINE_SETTER(STEM, stem, type) \ + int gc_common_options_set_##stem(struct gc_common_options *options, \ + int option, type value) { \ + switch (option) { \ + FOR_EACH_##STEM##_GC_OPTION(SET_OPTION) \ + default: return 0; \ + } \ + } +DEFINE_SETTER(INT, int, int) +DEFINE_SETTER(SIZE, size, size_t) +DEFINE_SETTER(DOUBLE, double, double) +#undef SET_OPTION +#undef DEFINE_SETTER + +static int parse_size(const char *arg, size_t *val) { + char *end; + long i = strtol(arg, &end, 0); + if (i < 0 || i == LONG_MAX) return 0; + if (end == arg) return 0; + char delim = *end; + if (delim == 'k' || delim == 'K') + ++end, i *= 1024L; + else if (delim == 'm' || delim == 'M') + ++end, i *= 1024L * 1024L; + else if (delim == 'g' || delim == 'G') + ++end, i *= 1024L * 1024L * 1024L; + else if (delim == 't' || delim == 'T') + ++end, i *= 1024L * 1024L * 1024L * 1024L; + + if (*end != '\0') return 0; + *val = i; + return 1; +} + +static int parse_int(const char *arg, int *val) { + char *end; + long i = strtol(arg, &end, 0); + if (i == LONG_MIN || i == LONG_MAX || end == arg || *end) + return 0; + *val = i; + return 1; +} + +static int parse_double(const char *arg, double *val) { + char *end; + double d = strtod(arg, &end); + if (end == arg || *end) + return 0; + *val = d; + return 1; +} + +int gc_common_options_parse_and_set(struct gc_common_options *options, + int option, const char *value) { + switch (option) { +#define SET_OPTION(UPPER, lower, repr, parser, default, min, max) \ + case GC_OPTION_##UPPER: { \ + gc_option_##parser v; \ + if (!parse_##parser(value, &v)) return 0; \ + return gc_common_options_set_##parser(options, option, v); \ + } + FOR_EACH_COMMON_GC_OPTION(SET_OPTION) + default: return 0; + } +} + +static int is_lower(char c) { return 'a' <= c && c <= 'z'; } +static int is_digit(char c) { return '0' <= c && c <= '9'; } +static int is_option(char c) { return is_lower(c) || c == '-'; } +static int is_option_end(char c) { return c == '='; } +static int is_value(char c) { + return is_lower(c) || is_digit(c) || c == '-' || c == '+' || c == '.'; +} +static int is_value_end(char c) { return c == '\0' || c == ','; } +static char* read_token(char *p, int (*is_tok)(char c), int (*is_end)(char c), + char *delim) { + char c; + for (c = *p; is_tok(c); p++); + if (!is_end(c)) return NULL; + *delim = c; + *p = '\0'; + return p + 1; +} +int gc_options_parse_and_set_many(struct gc_options *options, + const char *str) { + if (!*str) return 1; + char *copy = strdup(str); + char *cur = copy; + int ret = 0; + while (1) { + char delim; + char *next = read_token(cur, is_option, is_option_end, &delim); + if (!next) break; + int option = gc_option_from_string(cur); + if (option < 0) break; + + cur = next; + next = read_token(cur, is_value, 
is_value_end, &delim); + if (!next) break; + if (!gc_options_parse_and_set(options, option, cur)) break; + cur = next; + if (delim == '\0') { + ret = 1; + break; + } + } + free(copy); + return ret; +} diff --git a/gc-options.h b/gc-options.h new file mode 100644 index 000000000..35cb7aacf --- /dev/null +++ b/gc-options.h @@ -0,0 +1,39 @@ +#ifndef GC_OPTIONS_H +#define GC_OPTIONS_H + +#include "gc-visibility.h" + +enum gc_heap_size_policy { + GC_HEAP_SIZE_FIXED, + GC_HEAP_SIZE_GROWABLE, + GC_HEAP_SIZE_ADAPTIVE, +}; + +enum { + GC_OPTION_HEAP_SIZE_POLICY, + GC_OPTION_HEAP_SIZE, + GC_OPTION_MAXIMUM_HEAP_SIZE, + GC_OPTION_HEAP_SIZE_MULTIPLIER, + GC_OPTION_HEAP_FRUGALITY, + GC_OPTION_PARALLELISM +}; + +struct gc_options; + +GC_API_ int gc_option_from_string(const char *str); + +GC_API_ struct gc_options* gc_allocate_options(void); + +GC_API_ int gc_options_set_int(struct gc_options *options, int option, + int value); +GC_API_ int gc_options_set_size(struct gc_options *options, int option, + size_t value); +GC_API_ int gc_options_set_double(struct gc_options *options, int option, + double value); + +GC_API_ int gc_options_parse_and_set(struct gc_options *options, + int option, const char *value); +GC_API_ int gc_options_parse_and_set_many(struct gc_options *options, + const char *str); + +#endif // GC_OPTIONS_H diff --git a/gc-platform-gnu-linux.c b/gc-platform-gnu-linux.c index 66e2a73df..82390d445 100644 --- a/gc-platform-gnu-linux.c +++ b/gc-platform-gnu-linux.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -102,3 +103,10 @@ void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start, struct visit_data visit_data = { f, heap, data }; dl_iterate_phdr(visit_roots, &visit_data); } + +int gc_platform_processor_count(void) { + cpu_set_t set; + if (sched_getaffinity(0, sizeof (set), &set) != 0) + return 1; + return CPU_COUNT(&set); +} diff --git a/gc-platform.h b/gc-platform.h index b22787d19..ea6a6aa18 100644 --- a/gc-platform.h +++ b/gc-platform.h @@ -20,5 +20,6 @@ void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start, void *data), struct gc_heap *heap, void *data); +GC_INTERNAL int gc_platform_processor_count(void); #endif // GC_PLATFORM_H diff --git a/mt-gcbench.c b/mt-gcbench.c index ac0eb1ff9..1ab50a4e0 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -355,12 +355,15 @@ int main(int argc, char *argv[]) { } size_t heap_size = heap_max_live * multiplier * nthreads; - struct gc_option options[] = { { GC_OPTION_FIXED_HEAP_SIZE, heap_size }, - { GC_OPTION_PARALLELISM, parallelism } }; + + struct gc_options *options = gc_allocate_options(); + gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED); + gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size); + gc_options_set_int(options, GC_OPTION_PARALLELISM, parallelism); + struct gc_heap *heap; struct gc_mutator *mut; - if (!gc_init(sizeof options / sizeof options[0], options, NULL, &heap, - &mut)) { + if (!gc_init(options, NULL, &heap, &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); return 1; diff --git a/quads.c b/quads.c index 27923f1f7..097735fad 100644 --- a/quads.c +++ b/quads.c @@ -128,12 +128,14 @@ int main(int argc, char *argv[]) { printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", heap_size / 1e9, multiplier); - struct gc_option options[] = { { GC_OPTION_FIXED_HEAP_SIZE, heap_size }, - { GC_OPTION_PARALLELISM, parallelism } }; + struct gc_options *options = gc_allocate_options(); + 
gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED); + gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size); + gc_options_set_int(options, GC_OPTION_PARALLELISM, 1); + struct gc_heap *heap; struct gc_mutator *mut; - if (!gc_init(sizeof options / sizeof options[0], options, NULL, &heap, - &mut)) { + if (!gc_init(options, NULL, &heap, &mut)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); return 1; diff --git a/semi.c b/semi.c index e7c2b59cd..5f3db5473 100644 --- a/semi.c +++ b/semi.c @@ -303,70 +303,6 @@ static int initialize_semi_space(struct semi_space *space, size_t size) { return 1; } -#define FOR_EACH_GC_OPTION(M) \ - M(GC_OPTION_FIXED_HEAP_SIZE, "fixed-heap-size") \ - M(GC_OPTION_PARALLELISM, "parallelism") - -static void dump_available_gc_options(void) { - fprintf(stderr, "available gc options:"); -#define PRINT_OPTION(option, name) fprintf(stderr, " %s", name); - FOR_EACH_GC_OPTION(PRINT_OPTION) -#undef PRINT_OPTION - fprintf(stderr, "\n"); -} - -int gc_option_from_string(const char *str) { -#define PARSE_OPTION(option, name) if (strcmp(str, name) == 0) return option; - FOR_EACH_GC_OPTION(PARSE_OPTION) -#undef PARSE_OPTION - if (strcmp(str, "fixed-heap-size") == 0) - return GC_OPTION_FIXED_HEAP_SIZE; - if (strcmp(str, "parallelism") == 0) - return GC_OPTION_PARALLELISM; - fprintf(stderr, "bad gc option: '%s'\n", str); - dump_available_gc_options(); - return -1; -} - -struct options { - size_t fixed_heap_size; - size_t parallelism; -}; - -static size_t parse_size_t(double value) { - GC_ASSERT(value >= 0); - GC_ASSERT(value <= (size_t) -1); - return value; -} - -static int parse_options(int argc, struct gc_option argv[], - struct options *options) { - options->parallelism = 1; - for (int i = 0; i < argc; i++) { - switch (argv[i].option) { - case GC_OPTION_FIXED_HEAP_SIZE: - options->fixed_heap_size = parse_size_t(argv[i].value); - break; - case GC_OPTION_PARALLELISM: - options->parallelism = parse_size_t(argv[i].value); - break; - default: - GC_CRASH(); - } - } - - if (!options->fixed_heap_size) { - fprintf(stderr, "fixed heap size is currently required\n"); - return 0; - } - if (options->parallelism != 1) { - fprintf(stderr, "parallelism unimplemented in semispace copying collector\n"); - return 0; - } - - return 1; -} - static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { struct gc_pending_ephemerons *cur = heap->pending_ephemerons; size_t target = heap->size * heap->pending_ephemerons_size_factor; @@ -390,27 +326,60 @@ static int heap_init(struct gc_heap *heap, size_t size) { return heap_prepare_pending_ephemerons(heap); } -int gc_init(int argc, struct gc_option argv[], - struct gc_stack_addr *stack_base, struct gc_heap **heap, - struct gc_mutator **mut) { +struct gc_options { + struct gc_common_options common; +}; +int gc_option_from_string(const char *str) { + return gc_common_option_from_string(str); +} +struct gc_options* gc_allocate_options(void) { + struct gc_options *ret = malloc(sizeof(struct gc_options)); + gc_init_common_options(&ret->common); + return ret; +} +int gc_options_set_int(struct gc_options *options, int option, int value) { + return gc_common_options_set_int(&options->common, option, value); +} +int gc_options_set_size(struct gc_options *options, int option, + size_t value) { + return gc_common_options_set_size(&options->common, option, value); +} +int gc_options_set_double(struct gc_options *options, int option, + double value) { + return 
gc_common_options_set_double(&options->common, option, value); +} +int gc_options_parse_and_set(struct gc_options *options, int option, + const char *value) { + return gc_common_options_parse_and_set(&options->common, option, value); +} + +int gc_init(struct gc_options *options, struct gc_stack_addr *stack_base, + struct gc_heap **heap, struct gc_mutator **mut) { GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), offsetof(struct semi_space, hp)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), offsetof(struct semi_space, limit)); - struct options options = { 0, }; - if (!parse_options(argc, argv, &options)) + if (!options) options = gc_allocate_options(); + + if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { + fprintf(stderr, "fixed heap size is currently required\n"); return 0; + } + if (options->common.parallelism != 1) { + fprintf(stderr, "parallelism unimplemented in semispace copying collector\n"); + return 0; + } *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); *heap = mutator_heap(*mut); - if (!heap_init(*heap, options.fixed_heap_size)) + if (!heap_init(*heap, options->common.heap_size)) return 0; struct semi_space *space = mutator_semi_space(*mut); - if (!initialize_semi_space(space, options.fixed_heap_size)) + if (!initialize_semi_space(space, options->common.heap_size)) return 0; if (!large_object_space_init(heap_large_object_space(*heap), *heap)) return 0; diff --git a/whippet.c b/whippet.c index 9f81948a7..aee9086fd 100644 --- a/whippet.c +++ b/whippet.c @@ -2197,69 +2197,6 @@ unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { return heap->count; } -#define FOR_EACH_GC_OPTION(M) \ - M(GC_OPTION_FIXED_HEAP_SIZE, "fixed-heap-size") \ - M(GC_OPTION_PARALLELISM, "parallelism") - -static void dump_available_gc_options(void) { - fprintf(stderr, "available gc options:"); -#define PRINT_OPTION(option, name) fprintf(stderr, " %s", name); - FOR_EACH_GC_OPTION(PRINT_OPTION) -#undef PRINT_OPTION - fprintf(stderr, "\n"); -} - -int gc_option_from_string(const char *str) { -#define PARSE_OPTION(option, name) if (strcmp(str, name) == 0) return option; - FOR_EACH_GC_OPTION(PARSE_OPTION) -#undef PARSE_OPTION - if (strcmp(str, "fixed-heap-size") == 0) - return GC_OPTION_FIXED_HEAP_SIZE; - if (strcmp(str, "parallelism") == 0) - return GC_OPTION_PARALLELISM; - fprintf(stderr, "bad gc option: '%s'\n", str); - dump_available_gc_options(); - return -1; -} - -struct options { - size_t fixed_heap_size; - size_t parallelism; -}; - -static size_t parse_size_t(double value) { - GC_ASSERT(value >= 0); - GC_ASSERT(value <= (size_t) -1); - return value; -} - -static size_t number_of_current_processors(void) { return 1; } - -static int parse_options(int argc, struct gc_option argv[], - struct options *options) { - for (int i = 0; i < argc; i++) { - switch (argv[i].option) { - case GC_OPTION_FIXED_HEAP_SIZE: - options->fixed_heap_size = parse_size_t(argv[i].value); - break; - case GC_OPTION_PARALLELISM: - options->parallelism = parse_size_t(argv[i].value); - break; - default: - GC_CRASH(); - } - } - - if (!options->fixed_heap_size) { - fprintf(stderr, "fixed heap size is currently required\n"); - return 0; - } - if (!options->parallelism) - options->parallelism = number_of_current_processors(); - - return 1; -} - static struct slab* allocate_slabs(size_t nslabs) { size_t size = nslabs * SLAB_SIZE; size_t extent = size + SLAB_SIZE; @@ -2294,15 +2231,42 @@ static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { return !!heap->pending_ephemerons; } 
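Taken together, these per-collector shims give every collector the same embedder-facing setup flow. As a rough illustration only (the helper name, heap size, and option string below are hypothetical, not taken from the benchmarks), an embedder would now do something like:

#include <stdio.h>
#include "gc-api.h"   // pulls in gc-options.h

// Hypothetical embedder setup using the new options API.
static int setup_gc(struct gc_heap **heap, struct gc_mutator **mut,
                    const char *option_string) {
  struct gc_options *options = gc_allocate_options();
  gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED);
  gc_options_set_size(options, GC_OPTION_HEAP_SIZE, 256 * 1024 * 1024);
  // Optionally override the defaults from a comma-separated string,
  // e.g. "heap-size=512m,parallelism=4".
  if (option_string && !gc_options_parse_and_set_many(options, option_string)) {
    fprintf(stderr, "bad GC options: '%s'\n", option_string);
    return 0;
  }
  return gc_init(options, NULL, heap, mut);
}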
-static int heap_init(struct gc_heap *heap, struct options *options) { +struct gc_options { + struct gc_common_options common; +}; +int gc_option_from_string(const char *str) { + return gc_common_option_from_string(str); +} +struct gc_options* gc_allocate_options(void) { + struct gc_options *ret = malloc(sizeof(struct gc_options)); + gc_init_common_options(&ret->common); + return ret; +} +int gc_options_set_int(struct gc_options *options, int option, int value) { + return gc_common_options_set_int(&options->common, option, value); +} +int gc_options_set_size(struct gc_options *options, int option, + size_t value) { + return gc_common_options_set_size(&options->common, option, value); +} +int gc_options_set_double(struct gc_options *options, int option, + double value) { + return gc_common_options_set_double(&options->common, option, value); +} +int gc_options_parse_and_set(struct gc_options *options, int option, + const char *value) { + return gc_common_options_parse_and_set(&options->common, option, value); +} + +static int heap_init(struct gc_heap *heap, struct gc_options *options) { // *heap is already initialized to 0. pthread_mutex_init(&heap->lock, NULL); pthread_cond_init(&heap->mutator_cond, NULL); pthread_cond_init(&heap->collector_cond, NULL); - heap->size = options->fixed_heap_size; + heap->size = options->common.heap_size; - if (!tracer_init(heap, options->parallelism)) + if (!tracer_init(heap, options->common.parallelism)) GC_CRASH(); heap->pending_ephemerons_size_factor = 0.005; @@ -2352,9 +2316,8 @@ static int mark_space_init(struct mark_space *space, struct gc_heap *heap) { return 1; } -int gc_init(int argc, struct gc_option argv[], - struct gc_stack_addr *stack_base, struct gc_heap **heap, - struct gc_mutator **mut) { +int gc_init(struct gc_options *options, struct gc_stack_addr *stack_base, + struct gc_heap **heap, struct gc_mutator **mut) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GRANULE_SIZE); GC_ASSERT_EQ(gc_allocator_large_threshold(), LARGE_OBJECT_THRESHOLD); GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), @@ -2370,14 +2333,15 @@ int gc_init(int argc, struct gc_option argv[], BLOCK_SIZE / REMSET_BYTES_PER_BLOCK); } - struct options options = { 0, }; - if (!parse_options(argc, argv, &options)) + if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { + fprintf(stderr, "fixed heap size is currently required\n"); return 0; + } *heap = calloc(1, sizeof(struct gc_heap)); if (!*heap) GC_CRASH(); - if (!heap_init(*heap, &options)) + if (!heap_init(*heap, options)) GC_CRASH(); struct mark_space *space = heap_mark_space(*heap); From f0ad02d6eef99312a3ee633b27fccf66119c7f58 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 28 Feb 2023 09:39:15 +0100 Subject: [PATCH 164/403] Fix parsing options from string, doh --- gc-options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gc-options.c b/gc-options.c index 77373b170..076c2700f 100644 --- a/gc-options.c +++ b/gc-options.c @@ -147,7 +147,7 @@ static int is_value_end(char c) { return c == '\0' || c == ','; } static char* read_token(char *p, int (*is_tok)(char c), int (*is_end)(char c), char *delim) { char c; - for (c = *p; is_tok(c); p++); + for (c = *p; is_tok(c); c = *++p); if (!is_end(c)) return NULL; *delim = c; *p = '\0'; From 8edfd42ca1d4cc77b57e81806e409176a3491f2a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 18 Feb 2023 10:00:59 +0100 Subject: [PATCH 165/403] Rework semi-space collector to separate regions This is a preparation for heap growth and shrinking. 
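As orientation for the diff below: the core of the change is that each semi-space half becomes an explicit region. The following is a condensed sketch with simplified names (the real patch also tracks stolen pages for large objects), not the patch itself:

#include <stdint.h>
#include <stddef.h>

// Each half of the heap is now its own mapping, so the two halves can be
// trimmed (and, in later patches, resized) independently.
struct region {
  uintptr_t base;
  size_t size;         // bytes mapped
  size_t unavailable;  // tail handed back to the OS via madvise(DONTNEED)
};

struct semi_space_sketch {
  uintptr_t hp, limit;                // bump-pointer allocation into to_space
  struct region from_space, to_space;
};

// A collection flips the roles, then copies survivors into the new to_space.
static void flip_sketch(struct semi_space_sketch *s) {
  struct region tmp = s->from_space;
  s->from_space = s->to_space;
  s->to_space = tmp;
  s->hp = s->to_space.base;
  s->limit = s->hp + s->to_space.size;
}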
--- semi.c | 129 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 92 insertions(+), 37 deletions(-) diff --git a/semi.c b/semi.c index 5f3db5473..1f0bde97b 100644 --- a/semi.c +++ b/semi.c @@ -17,15 +17,19 @@ #error semi is a precise collector #endif +struct region { + uintptr_t base; + size_t size; + size_t unavailable; +}; struct semi_space { uintptr_t hp; uintptr_t limit; - uintptr_t from_space; - uintptr_t to_space; + struct region from_space; + struct region to_space; size_t page_size; size_t stolen_pages; - uintptr_t base; - size_t size; + size_t reserve_pages; }; struct gc_heap { struct semi_space semi_space; @@ -70,39 +74,68 @@ static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) GC_NEVER_INL static void trace(struct gc_edge edge, struct gc_heap *heap, void *visit_data); -static int semi_space_steal_pages(struct semi_space *space, size_t npages) { - size_t stolen_pages = space->stolen_pages + npages; - size_t old_limit_size = space->limit - space->to_space; - size_t new_limit_size = - (space->size - align_up(stolen_pages, 2) * space->page_size) / 2; +static void region_trim_by(struct region *region, size_t newly_unavailable) { + size_t old_available = region->size - region->unavailable; + GC_ASSERT(newly_unavailable <= old_available); - if (space->to_space + new_limit_size < space->hp) + madvise((void*)(region->base + old_available - newly_unavailable), + newly_unavailable, + MADV_DONTNEED); + region->unavailable += newly_unavailable; +} + +static void region_reset_unavailable(struct region *region, + size_t unavailable) { + GC_ASSERT(unavailable <= region->unavailable); + region->unavailable = unavailable; +} + +static int semi_space_steal_pages(struct semi_space *space, size_t npages) { + size_t old_unavailable_pages = space->stolen_pages + space->reserve_pages; + size_t old_region_unavailable_pages = align_up(old_unavailable_pages, 2) / 2; + size_t new_unavailable_pages = old_unavailable_pages + npages; + size_t new_region_unavailable_pages = align_up(new_unavailable_pages, 2) / 2; + size_t region_newly_unavailable_pages = + new_region_unavailable_pages - old_region_unavailable_pages; + size_t region_newly_unavailable_bytes = + region_newly_unavailable_pages * space->page_size; + + if (space->limit - space->hp < region_newly_unavailable_bytes) return 0; - space->limit = space->to_space + new_limit_size; - space->stolen_pages = stolen_pages; + space->stolen_pages += npages; - madvise((void*)(space->to_space + new_limit_size), - old_limit_size - new_limit_size, - MADV_DONTNEED); - madvise((void*)(space->from_space + new_limit_size), - old_limit_size - new_limit_size, - MADV_DONTNEED); + if (region_newly_unavailable_bytes == 0) + return 1; + + space->limit -= region_newly_unavailable_bytes; + region_trim_by(&space->to_space, region_newly_unavailable_bytes); + region_trim_by(&space->from_space, region_newly_unavailable_bytes); return 1; } -static void semi_space_set_stolen_pages(struct semi_space *space, size_t npages) { +static void semi_space_set_stolen_pages(struct semi_space *space, + size_t npages) { space->stolen_pages = npages; - size_t limit_size = - (space->size - align_up(npages, 2) * space->page_size) / 2; - space->limit = space->to_space + limit_size; + size_t unavailable_pages = space->stolen_pages + space->reserve_pages; + size_t region_unavailable_pages = align_up(unavailable_pages, 2) / 2; + size_t region_unavailable_bytes = region_unavailable_pages * space->page_size; + + region_reset_unavailable(&space->to_space, 
region_unavailable_bytes); + region_reset_unavailable(&space->from_space, region_unavailable_bytes); + + space->limit = + space->to_space.base + space->to_space.size - space->to_space.unavailable; } static void flip(struct semi_space *space) { - space->hp = space->from_space; - space->from_space = space->to_space; - space->to_space = space->hp; - space->limit = space->hp + space->size / 2; + struct region tmp; + memcpy(&tmp, &space->from_space, sizeof(tmp)); + memcpy(&space->from_space, &space->to_space, sizeof(tmp)); + memcpy(&space->to_space, &tmp, sizeof(tmp)); + + space->hp = space->to_space.base; + space->limit = space->hp + space->to_space.size; } static struct gc_ref copy(struct gc_heap *heap, struct semi_space *space, @@ -148,9 +181,16 @@ static void visit_large_object_space(struct gc_heap *heap, } } +static int region_contains(struct region *region, uintptr_t addr) { + return addr - region->base < region->size; +} + static int semi_space_contains(struct semi_space *space, struct gc_ref ref) { + // As each live object is traced exactly once, its edges have not been + // visited, so its refs are to fromspace and not tospace. uintptr_t addr = gc_ref_value(ref); - return addr - space->base < space->size; + GC_ASSERT(!region_contains(&space->to_space, addr)); + return region_contains(&space->from_space, addr); } static void visit(struct gc_edge edge, struct gc_heap *heap) { @@ -223,7 +263,8 @@ static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) { collect(mut); struct semi_space *space = mutator_semi_space(mut); if (space->limit - space->hp < bytes) { - fprintf(stderr, "ran out of space, heap size %zu\n", space->size); + fprintf(stderr, "ran out of space, heap size %zu\n", + mutator_heap(mut)->size); GC_CRASH(); } } @@ -237,7 +278,8 @@ void* gc_allocate_large(struct gc_mutator *mut, size_t size) { if (!semi_space_steal_pages(semi_space, npages)) { collect(mut); if (!semi_space_steal_pages(semi_space, npages)) { - fprintf(stderr, "ran out of space, heap size %zu\n", semi_space->size); + fprintf(stderr, "ran out of space, heap size %zu\n", + mutator_heap(mut)->size); GC_CRASH(); } } @@ -282,11 +324,7 @@ void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, gc_ephemeron_init_internal(mutator_heap(mut), ephemeron, key, value); } -static int initialize_semi_space(struct semi_space *space, size_t size) { - // Allocate even numbers of pages. - size_t page_size = getpagesize(); - size = align_up(size, page_size * 2); - +static int initialize_region(struct region *region, size_t size) { void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { @@ -294,11 +332,28 @@ static int initialize_semi_space(struct semi_space *space, size_t size) { return 0; } - space->to_space = space->hp = space->base = (uintptr_t) mem; - space->from_space = space->base + size / 2; + region->base = (uintptr_t)mem; + region->size = size; + region->unavailable = 0; + return 1; +} + +static int initialize_semi_space(struct semi_space *space, size_t size) { + // Allocate even numbers of pages. 
+ size_t page_size = getpagesize(); + size = align_up(size, page_size * 2); + + if (!initialize_region(&space->from_space, size / 2)) + return 0; + if (!initialize_region(&space->to_space, size / 2)) + return 0; + + space->hp = space->to_space.base; + space->limit = space->hp + space->to_space.size; + space->page_size = page_size; space->stolen_pages = 0; - space->size = size; + space->reserve_pages = 0; return 1; } From c42c538aaa3f17ac7a444d29acdd7ab6dccf2d46 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 28 Feb 2023 09:27:59 +0100 Subject: [PATCH 166/403] gc_init takes const gc_options --- bdw.c | 2 +- gc-api.h | 5 +++-- semi.c | 2 +- whippet.c | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bdw.c b/bdw.c index b77847612..41533e1a2 100644 --- a/bdw.c +++ b/bdw.c @@ -240,7 +240,7 @@ int gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } -int gc_init(struct gc_options *options, struct gc_stack_addr *stack_base, +int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mutator) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES); GC_ASSERT_EQ(gc_allocator_large_threshold(), diff --git a/gc-api.h b/gc-api.h index 183603338..47222706a 100644 --- a/gc-api.h +++ b/gc-api.h @@ -22,8 +22,9 @@ GC_API_ void* gc_call_with_stack_addr(void* (*f)(struct gc_stack_addr *, void *), void *data) GC_NEVER_INLINE; -GC_API_ int gc_init(struct gc_options *options, struct gc_stack_addr *base, - struct gc_heap **heap, struct gc_mutator **mutator); +GC_API_ int gc_init(const struct gc_options *options, + struct gc_stack_addr *base, struct gc_heap **heap, + struct gc_mutator **mutator); struct gc_mutator_roots; GC_API_ void gc_mutator_set_roots(struct gc_mutator *mut, diff --git a/semi.c b/semi.c index 1f0bde97b..7c5d8b07d 100644 --- a/semi.c +++ b/semi.c @@ -408,7 +408,7 @@ int gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } -int gc_init(struct gc_options *options, struct gc_stack_addr *stack_base, +int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut) { GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), offsetof(struct semi_space, hp)); diff --git a/whippet.c b/whippet.c index aee9086fd..686ac5cda 100644 --- a/whippet.c +++ b/whippet.c @@ -2258,7 +2258,7 @@ int gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } -static int heap_init(struct gc_heap *heap, struct gc_options *options) { +static int heap_init(struct gc_heap *heap, const struct gc_options *options) { // *heap is already initialized to 0. 
pthread_mutex_init(&heap->lock, NULL); @@ -2316,7 +2316,7 @@ static int mark_space_init(struct mark_space *space, struct gc_heap *heap) { return 1; } -int gc_init(struct gc_options *options, struct gc_stack_addr *stack_base, +int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GRANULE_SIZE); GC_ASSERT_EQ(gc_allocator_large_threshold(), LARGE_OBJECT_THRESHOLD); From 898f7aa935474e7bfcf3d2433ffa49118202af30 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 28 Feb 2023 09:28:12 +0100 Subject: [PATCH 167/403] Implement resizing of semi-space heap Not yet hooked up to any demo, though. --- semi.c | 274 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 188 insertions(+), 86 deletions(-) diff --git a/semi.c b/semi.c index 7c5d8b07d..fe4ee382b 100644 --- a/semi.c +++ b/semi.c @@ -17,10 +17,13 @@ #error semi is a precise collector #endif +struct gc_options { + struct gc_common_options common; +}; struct region { uintptr_t base; - size_t size; - size_t unavailable; + size_t active_size; + size_t mapped_size; }; struct semi_space { uintptr_t hp; @@ -29,7 +32,6 @@ struct semi_space { struct region to_space; size_t page_size; size_t stolen_pages; - size_t reserve_pages; }; struct gc_heap { struct semi_space semi_space; @@ -40,6 +42,7 @@ struct gc_heap { size_t size; long count; int check_pending_ephemerons; + const struct gc_options *options; }; // One mutator per space, can just store the heap in the mutator. struct gc_mutator { @@ -47,7 +50,6 @@ struct gc_mutator { struct gc_mutator_roots *roots; }; - static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } @@ -68,37 +70,42 @@ static inline struct semi_space* mutator_semi_space(struct gc_mutator *mut) { static uintptr_t align_up(uintptr_t addr, size_t align) { return (addr + align - 1) & ~(align-1); } +static size_t min_size(size_t a, size_t b) { return a < b ? a : b; } +static size_t max_size(size_t a, size_t b) { return a < b ? 
b : a; } -static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; -static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) GC_NEVER_INLINE; +static void collect(struct gc_mutator *mut, size_t for_alloc) GC_NEVER_INLINE; +static void collect_for_alloc(struct gc_mutator *mut, + size_t bytes) GC_NEVER_INLINE; static void trace(struct gc_edge edge, struct gc_heap *heap, void *visit_data); static void region_trim_by(struct region *region, size_t newly_unavailable) { - size_t old_available = region->size - region->unavailable; - GC_ASSERT(newly_unavailable <= old_available); + size_t old_available = region->active_size; + GC_ASSERT(newly_unavailable <= region->active_size); - madvise((void*)(region->base + old_available - newly_unavailable), - newly_unavailable, + region->active_size -= newly_unavailable; + madvise((void*)(region->base + region->active_size), newly_unavailable, MADV_DONTNEED); - region->unavailable += newly_unavailable; } -static void region_reset_unavailable(struct region *region, - size_t unavailable) { - GC_ASSERT(unavailable <= region->unavailable); - region->unavailable = unavailable; +static void region_set_active_size(struct region *region, size_t size) { + GC_ASSERT(size <= region->mapped_size); + GC_ASSERT(size == align_up(size, getpagesize())); + if (size < region->active_size) + region_trim_by(region, region->active_size - size); + else + region->active_size = size; } static int semi_space_steal_pages(struct semi_space *space, size_t npages) { - size_t old_unavailable_pages = space->stolen_pages + space->reserve_pages; - size_t old_region_unavailable_pages = align_up(old_unavailable_pages, 2) / 2; - size_t new_unavailable_pages = old_unavailable_pages + npages; - size_t new_region_unavailable_pages = align_up(new_unavailable_pages, 2) / 2; - size_t region_newly_unavailable_pages = - new_region_unavailable_pages - old_region_unavailable_pages; + size_t old_stolen_pages = space->stolen_pages; + size_t old_region_stolen_pages = align_up(old_stolen_pages,2)/2; + size_t new_stolen_pages = old_stolen_pages + npages; + size_t new_region_stolen_pages = align_up(new_stolen_pages,2)/2; + size_t region_newly_stolen_pages = + new_region_stolen_pages - old_region_stolen_pages; size_t region_newly_unavailable_bytes = - region_newly_unavailable_pages * space->page_size; + region_newly_stolen_pages * space->page_size; if (space->limit - space->hp < region_newly_unavailable_bytes) return 0; @@ -114,28 +121,23 @@ static int semi_space_steal_pages(struct semi_space *space, size_t npages) { return 1; } -static void semi_space_set_stolen_pages(struct semi_space *space, - size_t npages) { - space->stolen_pages = npages; - size_t unavailable_pages = space->stolen_pages + space->reserve_pages; - size_t region_unavailable_pages = align_up(unavailable_pages, 2) / 2; - size_t region_unavailable_bytes = region_unavailable_pages * space->page_size; - - region_reset_unavailable(&space->to_space, region_unavailable_bytes); - region_reset_unavailable(&space->from_space, region_unavailable_bytes); - - space->limit = - space->to_space.base + space->to_space.size - space->to_space.unavailable; +static void semi_space_finish_gc(struct semi_space *space, + size_t large_object_pages) { + space->stolen_pages = large_object_pages; + space->limit = 0; // set in adjust_heap_size_and_limits } static void flip(struct semi_space *space) { struct region tmp; + GC_ASSERT(space->hp <= space->limit); + GC_ASSERT(space->limit - space->to_space.base <= space->to_space.active_size); + 
GC_ASSERT(space->to_space.active_size <= space->from_space.mapped_size); memcpy(&tmp, &space->from_space, sizeof(tmp)); memcpy(&space->from_space, &space->to_space, sizeof(tmp)); memcpy(&space->to_space, &tmp, sizeof(tmp)); space->hp = space->to_space.base; - space->limit = space->hp + space->to_space.size; + space->limit = space->hp + space->to_space.active_size; } static struct gc_ref copy(struct gc_heap *heap, struct semi_space *space, @@ -182,7 +184,7 @@ static void visit_large_object_space(struct gc_heap *heap, } static int region_contains(struct region *region, uintptr_t addr) { - return addr - region->base < region->size; + return addr - region->base < region->active_size; } static int semi_space_contains(struct semi_space *space, struct gc_ref ref) { @@ -229,7 +231,99 @@ static void trace(struct gc_edge edge, struct gc_heap *heap, void *visit_data) { return visit(edge, heap); } -static void collect(struct gc_mutator *mut) { +static int grow_region_if_needed(struct region *region, size_t new_size) { + if (new_size <= region->mapped_size) + return 1; + + new_size = max_size(new_size, region->mapped_size * 2); + + void *mem = mmap(NULL, new_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("mmap failed"); + return 0; + } + if (region->mapped_size) + munmap((void*)region->base, region->mapped_size); + region->base = (uintptr_t)mem; + region->active_size = 0; + region->mapped_size = new_size; + return 1; +} + +static void truncate_region(struct region *region, size_t new_size) { + GC_ASSERT(new_size <= region->mapped_size); + + size_t bytes = region->mapped_size - new_size; + if (bytes) { + munmap((void*)(region->base + new_size), bytes); + region->mapped_size = new_size; + if (region->active_size > new_size) + region->active_size = new_size; + } +} + +static size_t compute_new_heap_size(struct gc_heap *heap, size_t for_alloc) { + struct semi_space *semi = heap_semi_space(heap); + struct large_object_space *large = heap_large_object_space(heap); + size_t live_bytes = semi->hp - semi->to_space.base; + live_bytes += large->live_pages_at_last_collection * semi->page_size; + live_bytes += for_alloc; + + size_t new_heap_size = heap->size; + switch (heap->options->common.heap_size_policy) { + case GC_HEAP_SIZE_FIXED: + break; + + case GC_HEAP_SIZE_GROWABLE: { + new_heap_size = + max_size(heap->size, + live_bytes * heap->options->common.heap_size_multiplier); + break; + } + + case GC_HEAP_SIZE_ADAPTIVE: + default: + GC_CRASH(); + } + return align_up(new_heap_size, semi->page_size * 2); +} + +static void adjust_heap_size_and_limits(struct gc_heap *heap, + size_t for_alloc) { + struct semi_space *semi = heap_semi_space(heap); + size_t new_heap_size = compute_new_heap_size(heap, for_alloc); + size_t new_region_size = new_heap_size / 2; + + // Note that there is an asymmetry in how heap size is adjusted: we + // grow in two cycles (first the fromspace, then the tospace after it + // becomes the fromspace in the next collection) but shrink in one (by + // returning pages to the OS). + + // If we are growing the heap now, grow the fromspace mapping. Also, + // always try to grow the fromspace if it is smaller than the tospace. + grow_region_if_needed(&semi->from_space, + max_size(new_region_size, semi->to_space.mapped_size)); + + // We may have grown fromspace. Find out what our actual new region + // size will be. 
+ new_region_size = min_size(new_region_size, + min_size(semi->to_space.mapped_size, + semi->from_space.mapped_size)); + heap->size = new_region_size * 2; + size_t stolen = align_up(semi->stolen_pages, 2) * semi->page_size; + GC_ASSERT(new_region_size > stolen/2); + size_t new_active_region_size = new_region_size - stolen/2; + + region_set_active_size(&semi->from_space, new_active_region_size); + region_set_active_size(&semi->to_space, new_active_region_size); + + size_t new_limit = semi->to_space.base + new_active_region_size; + GC_ASSERT(semi->hp <= new_limit); + semi->limit = new_limit; +} + +static void collect(struct gc_mutator *mut, size_t for_alloc) { struct gc_heap *heap = mutator_heap(mut); struct semi_space *semi = heap_semi_space(heap); struct large_object_space *large = heap_large_object_space(heap); @@ -250,23 +344,39 @@ static void collect(struct gc_mutator *mut) { while(grey < semi->hp) grey = scan(heap, gc_ref(grey)); large_object_space_finish_gc(large, 0); - semi_space_set_stolen_pages(semi, large->live_pages_at_last_collection); + semi_space_finish_gc(semi, large->live_pages_at_last_collection); gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); + adjust_heap_size_and_limits(heap, for_alloc); + // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); } -void gc_collect(struct gc_mutator *mut) { - collect(mut); +static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) { + collect(mut, bytes); + + struct semi_space *space = mutator_semi_space(mut); + if (bytes < space->limit - space->hp) + return; + + struct gc_heap *heap = mutator_heap(mut); + if (heap->options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { + // Each collection can potentially resize only the inactive + // fromspace, so if we really run out of space we will need to + // collect again in order to resize the other half. + collect(mut, bytes); + if (bytes < space->limit - space->hp) + return; + } + fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); + GC_CRASH(); } -static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) { - collect(mut); - struct semi_space *space = mutator_semi_space(mut); - if (space->limit - space->hp < bytes) { - fprintf(stderr, "ran out of space, heap size %zu\n", - mutator_heap(mut)->size); - GC_CRASH(); - } +void gc_collect(struct gc_mutator *mut) { + collect(mut, 0); +} + +static void collect_for_large_alloc(struct gc_mutator *mut, size_t npages) { + collect_for_alloc(mut, npages * mutator_semi_space(mut)->page_size); } void* gc_allocate_large(struct gc_mutator *mut, size_t size) { @@ -275,14 +385,8 @@ void* gc_allocate_large(struct gc_mutator *mut, size_t size) { struct semi_space *semi_space = heap_semi_space(heap); size_t npages = large_object_space_npages(space, size); - if (!semi_space_steal_pages(semi_space, npages)) { - collect(mut); - if (!semi_space_steal_pages(semi_space, npages)) { - fprintf(stderr, "ran out of space, heap size %zu\n", - mutator_heap(mut)->size); - GC_CRASH(); - } - } + while (!semi_space_steal_pages(semi_space, npages)) + collect_for_large_alloc(mut, npages); void *ret = large_object_space_alloc(space, npages); if (!ret) @@ -302,7 +406,8 @@ void* gc_allocate_small(struct gc_mutator *mut, size_t size) { uintptr_t addr = space->hp; uintptr_t new_hp = align_up (addr + size, GC_ALIGNMENT); if (space->limit < new_hp) { - collect_for_alloc(mut, size); + // The factor of 2 is for both regions. 
+ collect_for_alloc(mut, size * 2); continue; } space->hp = new_hp; @@ -324,36 +429,36 @@ void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, gc_ephemeron_init_internal(mutator_heap(mut), ephemeron, key, value); } -static int initialize_region(struct region *region, size_t size) { - void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("mmap failed"); +static int region_init(struct region *region, size_t size) { + region->base = 0; + region->active_size = 0; + region->mapped_size = 0; + + if (!grow_region_if_needed(region, size)) { + fprintf(stderr, "failed to allocated %zu bytes\n", size); return 0; } - region->base = (uintptr_t)mem; - region->size = size; - region->unavailable = 0; + region->active_size = size; + return 1; } -static int initialize_semi_space(struct semi_space *space, size_t size) { +static int semi_space_init(struct semi_space *space, struct gc_heap *heap) { // Allocate even numbers of pages. size_t page_size = getpagesize(); - size = align_up(size, page_size * 2); - - if (!initialize_region(&space->from_space, size / 2)) - return 0; - if (!initialize_region(&space->to_space, size / 2)) - return 0; - - space->hp = space->to_space.base; - space->limit = space->hp + space->to_space.size; + size_t size = align_up(heap->size, page_size * 2); space->page_size = page_size; space->stolen_pages = 0; - space->reserve_pages = 0; + + if (!region_init(&space->from_space, size / 2)) + return 0; + if (!region_init(&space->to_space, size / 2)) + return 0; + + space->hp = space->to_space.base; + space->limit = space->hp + space->to_space.active_size; return 1; } @@ -372,18 +477,16 @@ unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { return heap->count; } -static int heap_init(struct gc_heap *heap, size_t size) { +static int heap_init(struct gc_heap *heap, const struct gc_options *options) { heap->pending_ephemerons_size_factor = 0.01; heap->pending_ephemerons_size_slop = 0.5; heap->count = 0; - heap->size = size; + heap->options = options; + heap->size = options->common.heap_size; return heap_prepare_pending_ephemerons(heap); } -struct gc_options { - struct gc_common_options common; -}; int gc_option_from_string(const char *str) { return gc_common_option_from_string(str); } @@ -417,8 +520,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!options) options = gc_allocate_options(); - if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { - fprintf(stderr, "fixed heap size is currently required\n"); + if (options->common.heap_size_policy == GC_HEAP_SIZE_ADAPTIVE) { + fprintf(stderr, "adaptive heap size is currently unimplemented\n"); return 0; } if (options->common.parallelism != 1) { @@ -430,11 +533,10 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!*mut) GC_CRASH(); *heap = mutator_heap(*mut); - if (!heap_init(*heap, options->common.heap_size)) + if (!heap_init(*heap, options)) return 0; - struct semi_space *space = mutator_semi_space(*mut); - if (!initialize_semi_space(space, options->common.heap_size)) + if (!semi_space_init(heap_semi_space(*heap), *heap)) return 0; if (!large_object_space_init(heap_large_object_space(*heap), *heap)) return 0; From f15eb3bd10e20f195d41d795c358e2ff55803b4e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 28 Feb 2023 10:00:27 +0100 Subject: [PATCH 168/403] Add enum heap policy parsing --- gc-options.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 
file changed, 33 insertions(+), 16 deletions(-) diff --git a/gc-options.c b/gc-options.c index 076c2700f..c41b8fe51 100644 --- a/gc-options.c +++ b/gc-options.c @@ -8,24 +8,25 @@ #include "gc-options-internal.h" #include "gc-platform.h" -// M(UPPER, lower, repr, parser, default, min, max) +// M(UPPER, lower, repr, type, parser, default, min, max) #define FOR_EACH_INT_GC_OPTION(M) \ M(HEAP_SIZE_POLICY, heap_size_policy, "heap-size-policy", \ - int, GC_HEAP_SIZE_FIXED, GC_HEAP_SIZE_FIXED, GC_HEAP_SIZE_ADAPTIVE) \ + int, heap_size_policy, GC_HEAP_SIZE_FIXED, GC_HEAP_SIZE_FIXED, \ + GC_HEAP_SIZE_ADAPTIVE) \ M(PARALLELISM, parallelism, "parallelism", \ - int, default_parallelism(), 1, 64) + int, int, default_parallelism(), 1, 64) #define FOR_EACH_SIZE_GC_OPTION(M) \ M(HEAP_SIZE, heap_size, "heap-size", \ - size, 6 * 1024 * 1024, 0, -1) \ + size, size, 6 * 1024 * 1024, 0, -1) \ M(MAXIMUM_HEAP_SIZE, maximum_heap_size, "maximum-heap-size", \ - size, 0, 0, -1) + size, size, 0, 0, -1) #define FOR_EACH_DOUBLE_GC_OPTION(M) \ M(HEAP_SIZE_MULTIPLIER, heap_size_multiplier, "heap-size-multiplier", \ - double, 1.75, 1.0, 1e6) \ + double, double, 1.75, 1.0, 1e6) \ M(HEAP_FRUGALITY, heap_frugality, "heap-frugality", \ - double, 1e-1, 1e-6, 1e6) + double, double, 1e-1, 1e-6, 1e6) typedef int gc_option_int; typedef size_t gc_option_size; @@ -51,23 +52,23 @@ static int default_parallelism(void) { } void gc_init_common_options(struct gc_common_options *options) { -#define INIT(UPPER, lower, repr, parser, default, min, max) \ +#define INIT(UPPER, lower, repr, type, parser, default, min, max) \ options->lower = default; FOR_EACH_COMMON_GC_OPTION(INIT) #undef INIT } int gc_common_option_from_string(const char *str) { -#define GET_OPTION(UPPER, lower, repr, parser, default, min, max) \ +#define GET_OPTION(UPPER, lower, repr, type, parser, default, min, max) \ if (strcmp(str, repr) == 0) return GC_OPTION_##UPPER; FOR_EACH_COMMON_GC_OPTION(GET_OPTION) #undef GET_OPTION return -1; } -#define SET_OPTION(UPPER, lower, repr, parser, default, min, max) \ +#define SET_OPTION(UPPER, lower, repr, type, parser, default, min, max) \ case GC_OPTION_##UPPER: \ - if (value != clamp_##parser(value, min, max)) return 0; \ + if (value != clamp_##type(value, min, max)) return 0; \ options->lower = value; \ return 1; #define DEFINE_SETTER(STEM, stem, type) \ @@ -113,6 +114,22 @@ static int parse_int(const char *arg, int *val) { return 1; } +static int parse_heap_size_policy(const char *arg, int *val) { + if (strcmp(arg, "fixed") == 0) { + *val = GC_HEAP_SIZE_FIXED; + return 1; + } + if (strcmp(arg, "growable") == 0) { + *val = GC_HEAP_SIZE_GROWABLE; + return 1; + } + if (strcmp(arg, "adaptive") == 0) { + *val = GC_HEAP_SIZE_ADAPTIVE; + return 1; + } + return parse_int(arg, val); +} + static int parse_double(const char *arg, double *val) { char *end; double d = strtod(arg, &end); @@ -125,11 +142,11 @@ static int parse_double(const char *arg, double *val) { int gc_common_options_parse_and_set(struct gc_common_options *options, int option, const char *value) { switch (option) { -#define SET_OPTION(UPPER, lower, repr, parser, default, min, max) \ - case GC_OPTION_##UPPER: { \ - gc_option_##parser v; \ - if (!parse_##parser(value, &v)) return 0; \ - return gc_common_options_set_##parser(options, option, v); \ +#define SET_OPTION(UPPER, lower, repr, type, parser, default, min, max) \ + case GC_OPTION_##UPPER: { \ + gc_option_##type v; \ + if (!parse_##parser(value, &v)) return 0; \ + return gc_common_options_set_##type(options, option, v); \ 
} FOR_EACH_COMMON_GC_OPTION(SET_OPTION) default: return 0; From 9576558a344b39a2ad48192543988b027fa974af Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 28 Feb 2023 09:40:41 +0100 Subject: [PATCH 169/403] Rework mtbench to take GC options string instead of parallelism --- mt-gcbench.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mt-gcbench.c b/mt-gcbench.c index 1ab50a4e0..f72dac66e 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -330,14 +330,13 @@ int main(int argc, char *argv[]) { tree_size(long_lived_tree_depth) * sizeof(Node) + tree_size(max_tree_depth) * sizeof(Node) + sizeof(DoubleArray) + sizeof(double) * array_size; - if (argc != 4) { - fprintf(stderr, "usage: %s MULTIPLIER NTHREADS PARALLELISM\n", argv[0]); + if (argc < 3 || argc > 4) { + fprintf(stderr, "usage: %s MULTIPLIER NTHREADS [GC-OPTIONS]\n", argv[0]); return 1; } double multiplier = atof(argv[1]); size_t nthreads = atol(argv[2]); - size_t parallelism = atol(argv[3]); if (!(0.1 < multiplier && multiplier < 100)) { fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[1]); @@ -348,18 +347,18 @@ int main(int argc, char *argv[]) { (int)MAX_THREAD_COUNT, argv[2]); return 1; } - if (parallelism < 1 || parallelism > MAX_THREAD_COUNT) { - fprintf(stderr, "Expected integer between 1 and %d for parallelism, got '%s'\n", - (int)MAX_THREAD_COUNT, argv[3]); - return 1; - } size_t heap_size = heap_max_live * multiplier * nthreads; struct gc_options *options = gc_allocate_options(); gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED); gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size); - gc_options_set_int(options, GC_OPTION_PARALLELISM, parallelism); + if (argc == 4) { + if (!gc_options_parse_and_set_many(options, argv[3])) { + fprintf(stderr, "Failed to set GC options: '%s'\n", argv[3]); + return 1; + } + } struct gc_heap *heap; struct gc_mutator *mut; From 157037dd1f30ce56c1ca3ca1188a945d0567d0f8 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 28 Feb 2023 11:20:50 +0100 Subject: [PATCH 170/403] Fix parallelism in quads benchmark --- quads.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quads.c b/quads.c index 097735fad..5e0449875 100644 --- a/quads.c +++ b/quads.c @@ -131,7 +131,7 @@ int main(int argc, char *argv[]) { struct gc_options *options = gc_allocate_options(); gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED); gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size); - gc_options_set_int(options, GC_OPTION_PARALLELISM, 1); + gc_options_set_int(options, GC_OPTION_PARALLELISM, parallelism); struct gc_heap *heap; struct gc_mutator *mut; From f657cd38471f6a20da9068fd8379a8e953034eb9 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 28 Feb 2023 11:24:33 +0100 Subject: [PATCH 171/403] quads benchmark takes gc-options param --- quads.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/quads.c b/quads.c index 5e0449875..11d8e5e1f 100644 --- a/quads.c +++ b/quads.c @@ -101,24 +101,18 @@ static size_t tree_size(size_t depth) { #define MAX_THREAD_COUNT 256 int main(int argc, char *argv[]) { - if (argc != 4) { - fprintf(stderr, "usage: %s DEPTH MULTIPLIER PARALLELISM\n", argv[0]); + if (argc < 3 || 4 < argc) { + fprintf(stderr, "usage: %s DEPTH MULTIPLIER [GC-OPTIONS]\n", argv[0]); return 1; } size_t depth = parse_size(argv[1], "depth"); double multiplier = atof(argv[2]); - size_t parallelism = atol(argv[3]); if (!(1.0 < multiplier && multiplier < 100)) { 
fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]); return 1; } - if (parallelism < 1 || parallelism > MAX_THREAD_COUNT) { - fprintf(stderr, "Expected integer between 1 and %d for parallelism, got '%s'\n", - (int)MAX_THREAD_COUNT, argv[3]); - return 1; - } size_t nquads = tree_size(depth); size_t tree_bytes = nquads * sizeof(Quad); @@ -131,7 +125,12 @@ int main(int argc, char *argv[]) { struct gc_options *options = gc_allocate_options(); gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED); gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size); - gc_options_set_int(options, GC_OPTION_PARALLELISM, parallelism); + if (argc == 4) { + if (!gc_options_parse_and_set_many(options, argv[3])) { + fprintf(stderr, "Failed to set GC options: '%s'\n", argv[3]); + return 1; + } + } struct gc_heap *heap; struct gc_mutator *mut; From 51168fd96e11fc1c0051f4fa180878404526dedc Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 28 Feb 2023 11:30:51 +0100 Subject: [PATCH 172/403] ephemerons benchmark takes gc options --- ephemerons.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ephemerons.c b/ephemerons.c index 40779d741..c11fa755c 100644 --- a/ephemerons.c +++ b/ephemerons.c @@ -191,15 +191,14 @@ static void *join_thread(void *data) { #define MAX_THREAD_COUNT 256 int main(int argc, char *argv[]) { - if (argc != 5) { - fprintf(stderr, "usage: %s HEAP_SIZE MULTIPLIER NTHREADS PARALLELISM\n", argv[0]); + if (argc < 4 || 5 < argc) { + fprintf(stderr, "usage: %s HEAP_SIZE MULTIPLIER NTHREADS [GC-OPTIONS]\n", argv[0]); return 1; } heap_size = atof(argv[1]); heap_multiplier = atof(argv[2]); nthreads = atol(argv[3]); - size_t parallelism = atol(argv[4]); if (heap_size < 8192) { fprintf(stderr, @@ -216,11 +215,6 @@ int main(int argc, char *argv[]) { (int)MAX_THREAD_COUNT, argv[2]); return 1; } - if (parallelism < 1 || parallelism > MAX_THREAD_COUNT) { - fprintf(stderr, "Expected integer between 1 and %d for parallelism, got '%s'\n", - (int)MAX_THREAD_COUNT, argv[3]); - return 1; - } printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", heap_size / 1e9, heap_multiplier); @@ -228,7 +222,12 @@ int main(int argc, char *argv[]) { struct gc_options *options = gc_allocate_options(); gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED); gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size); - gc_options_set_int(options, GC_OPTION_PARALLELISM, parallelism); + if (argc == 5) { + if (!gc_options_parse_and_set_many(options, argv[4])) { + fprintf(stderr, "Failed to set GC options: '%s'\n", argv[4]); + return 1; + } + } struct gc_heap *heap; struct gc_mutator *mut; From 62f4b045f8117461f1ce0a5cc426d83aea0a7748 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 28 Feb 2023 22:02:49 +0100 Subject: [PATCH 173/403] BDW supports growable heap --- bdw.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/bdw.c b/bdw.c index 41533e1a2..d4c664eeb 100644 --- a/bdw.c +++ b/bdw.c @@ -248,13 +248,35 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!options) options = gc_allocate_options(); - // GC_full_freq = 30; - // GC_free_space_divisor = 16; - // GC_enable_incremental(); - // Ignore stack base for main thread. 
- GC_set_max_heap_size(options->common.heap_size); + switch (options->common.heap_size_policy) { + case GC_HEAP_SIZE_FIXED: + GC_set_max_heap_size(options->common.heap_size); + break; + case GC_HEAP_SIZE_GROWABLE: { + if (options->common.maximum_heap_size) + GC_set_max_heap_size(options->common.maximum_heap_size); + // BDW uses a pretty weird heap-sizing heuristic: + // + // heap-size = live-data * (1 + (2 / GC_free_space_divisor)) + // heap-size-multiplier = heap-size/live-data = 1 + 2/GC_free_space_divisor + // GC_free_space_divisor = 2/(heap-size-multiplier-1) + // + // (Assumption: your heap is mostly "composite", i.e. not + // "atomic". See bdw's alloc.c:min_bytes_allocd.) + double fsd = 2.0/(options->common.heap_size_multiplier - 1); + // But, the divisor is an integer. WTF. This caps the effective + // maximum heap multiplier at 3. Oh well. + GC_set_free_space_divisor(fsd + 0.51); + break; + } + case GC_HEAP_SIZE_ADAPTIVE: + default: + fprintf(stderr, "adaptive heap sizing unsupported by bdw-gc\n"); + return 0; + } + // Not part of 7.3, sigh. Have to set an env var. // GC_set_markers_count(options->common.parallelism); char markers[21] = {0,}; // 21 bytes enough for 2**64 in decimal + NUL. From d0b8f6838dddc6c7f9dec5156cb6ea985394df19 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 14 Mar 2023 14:35:20 +0100 Subject: [PATCH 174/403] Rework fast/slow path alloc API This lets users do gc_allocate_fast() and know that if they got a non-NULL result, we could allocate without GC, and so no object was moved. --- bdw.c | 43 +++++++++++++++++-------------------------- gc-api.h | 48 ++++++++++++++++++++++++++++++++---------------- semi.c | 7 +++++-- whippet.c | 9 ++++++--- 4 files changed, 60 insertions(+), 47 deletions(-) diff --git a/bdw.c b/bdw.c index d4c664eeb..29958f133 100644 --- a/bdw.c +++ b/bdw.c @@ -74,42 +74,33 @@ enum gc_inline_kind { GC_INLINE_KIND_NORMAL }; -static void* allocate_small_slow(void **freelist, size_t idx, - enum gc_inline_kind kind) GC_NEVER_INLINE; -static void* allocate_small_slow(void **freelist, size_t idx, - enum gc_inline_kind kind) { - size_t bytes = gc_inline_freelist_object_size(idx); - GC_generic_malloc_many(bytes, kind, freelist); - void *head = *freelist; - if (GC_UNLIKELY (!head)) { - fprintf(stderr, "ran out of space, heap size %zu\n", - GC_get_heap_size()); - GC_CRASH(); - } - *freelist = *(void **)(head); - return head; -} - static inline void * allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) { void *head = *freelist; - if (GC_UNLIKELY (!head)) - return allocate_small_slow(freelist, idx, kind); + if (!head) { + size_t bytes = gc_inline_freelist_object_size(idx); + GC_generic_malloc_many(bytes, kind, freelist); + head = *freelist; + if (GC_UNLIKELY (!head)) { + fprintf(stderr, "ran out of space, heap size %zu\n", + GC_get_heap_size()); + GC_CRASH(); + } + } *freelist = *(void **)(head); return head; } -void* gc_allocate_large(struct gc_mutator *mut, size_t size) { - return GC_malloc(size); -} - -void* gc_allocate_small(struct gc_mutator *mut, size_t size) { +void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { GC_ASSERT(size != 0); - GC_ASSERT(size <= gc_allocator_large_threshold()); - size_t idx = gc_inline_bytes_to_freelist_index(size); - return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL); + if (size <= gc_allocator_large_threshold()) { + size_t idx = gc_inline_bytes_to_freelist_index(size); + return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL); + } else { + return 
GC_malloc(size); + } } void* gc_allocate_pointerless(struct gc_mutator *mut, diff --git a/gc-api.h b/gc-api.h index 47222706a..5f0d3c1aa 100644 --- a/gc-api.h +++ b/gc-api.h @@ -83,12 +83,11 @@ static inline void gc_update_alloc_table(struct gc_mutator *mut, } } -GC_API_ void* gc_allocate_small(struct gc_mutator *mut, size_t bytes) GC_NEVER_INLINE; -GC_API_ void* gc_allocate_large(struct gc_mutator *mut, size_t bytes) GC_NEVER_INLINE; +GC_API_ void* gc_allocate_slow(struct gc_mutator *mut, size_t bytes) GC_NEVER_INLINE; static inline void* -gc_allocate_bump_pointer(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; -static inline void* gc_allocate_bump_pointer(struct gc_mutator *mut, size_t size) { +gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; +static inline void* gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, size_t size) { GC_ASSERT(size <= gc_allocator_large_threshold()); size_t granule_size = gc_allocator_small_granule_size(); @@ -105,7 +104,7 @@ static inline void* gc_allocate_bump_pointer(struct gc_mutator *mut, size_t size uintptr_t new_hp = hp + size; if (GC_UNLIKELY (new_hp > limit)) - return gc_allocate_small(mut, size); + return NULL; *hp_loc = new_hp; @@ -115,9 +114,9 @@ static inline void* gc_allocate_bump_pointer(struct gc_mutator *mut, size_t size return (void*)hp; } -static inline void* gc_allocate_freelist(struct gc_mutator *mut, - size_t size) GC_ALWAYS_INLINE; -static inline void* gc_allocate_freelist(struct gc_mutator *mut, size_t size) { +static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut, + size_t size) GC_ALWAYS_INLINE; +static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut, size_t size) { GC_ASSERT(size <= gc_allocator_large_threshold()); size_t freelist_offset = gc_allocator_freelist_offset(size); @@ -126,7 +125,7 @@ static inline void* gc_allocate_freelist(struct gc_mutator *mut, size_t size) { void *head = *freelist_loc; if (GC_UNLIKELY(!head)) - return gc_allocate_small(mut, size); + return NULL; *freelist_loc = *(void**)head; @@ -136,24 +135,41 @@ static inline void* gc_allocate_freelist(struct gc_mutator *mut, size_t size) { return head; } -static inline void* gc_allocate(struct gc_mutator *mut, size_t bytes) GC_ALWAYS_INLINE; -static inline void* gc_allocate(struct gc_mutator *mut, size_t size) { +static inline void* gc_allocate_small_fast(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; +static inline void* gc_allocate_small_fast(struct gc_mutator *mut, size_t size) { GC_ASSERT(size != 0); - if (size > gc_allocator_large_threshold()) - return gc_allocate_large(mut, size); + GC_ASSERT(size <= gc_allocator_large_threshold()); switch (gc_allocator_kind()) { case GC_ALLOCATOR_INLINE_BUMP_POINTER: - return gc_allocate_bump_pointer(mut, size); + return gc_allocate_small_fast_bump_pointer(mut, size); case GC_ALLOCATOR_INLINE_FREELIST: - return gc_allocate_freelist(mut, size); + return gc_allocate_small_fast_freelist(mut, size); case GC_ALLOCATOR_INLINE_NONE: - return gc_allocate_small(mut, size); + return NULL; default: GC_CRASH(); } } +static inline void* gc_allocate_fast(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; +static inline void* gc_allocate_fast(struct gc_mutator *mut, size_t size) { + GC_ASSERT(size != 0); + if (size > gc_allocator_large_threshold()) + return NULL; + + return gc_allocate_small_fast(mut, size); +} + +static inline void* gc_allocate(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; +static inline void* 
gc_allocate(struct gc_mutator *mut, size_t size) { + void *ret = gc_allocate_fast(mut, size); + if (GC_LIKELY(ret != NULL)) + return ret; + + return gc_allocate_slow(mut, size); +} + // FIXME: remove :P GC_API_ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t bytes); diff --git a/semi.c b/semi.c index fe4ee382b..5d85ec8c6 100644 --- a/semi.c +++ b/semi.c @@ -379,7 +379,7 @@ static void collect_for_large_alloc(struct gc_mutator *mut, size_t npages) { collect_for_alloc(mut, npages * mutator_semi_space(mut)->page_size); } -void* gc_allocate_large(struct gc_mutator *mut, size_t size) { +static void* allocate_large(struct gc_mutator *mut, size_t size) { struct gc_heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); struct semi_space *semi_space = heap_semi_space(heap); @@ -400,7 +400,10 @@ void* gc_allocate_large(struct gc_mutator *mut, size_t size) { return ret; } -void* gc_allocate_small(struct gc_mutator *mut, size_t size) { +void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { + if (size > gc_allocator_large_threshold()) + return allocate_large(mut, size); + struct semi_space *space = mutator_semi_space(mut); while (1) { uintptr_t addr = space->hp; diff --git a/whippet.c b/whippet.c index 686ac5cda..5d692b728 100644 --- a/whippet.c +++ b/whippet.c @@ -2114,7 +2114,7 @@ void gc_collect(struct gc_mutator *mut) { trigger_collection(mut); } -void* gc_allocate_large(struct gc_mutator *mut, size_t size) { +static void* allocate_large(struct gc_mutator *mut, size_t size) { struct gc_heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); @@ -2139,9 +2139,12 @@ void* gc_allocate_large(struct gc_mutator *mut, size_t size) { return ret; } -void* gc_allocate_small(struct gc_mutator *mut, size_t size) { +void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { GC_ASSERT(size > 0); // allocating 0 bytes would be silly - GC_ASSERT(size <= gc_allocator_large_threshold()); + + if (size > gc_allocator_large_threshold()) + return allocate_large(mut, size); + size = align_up(size, GRANULE_SIZE); uintptr_t alloc = mut->alloc; uintptr_t sweep = mut->sweep; From e27029024206380c1def273a5f902fbccd4927e7 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 15 Mar 2023 09:34:12 +0100 Subject: [PATCH 175/403] Allow large object space to be part of remembered set --- bdw-attrs.h | 6 +++--- bdw.c | 4 ++++ gc-api.h | 18 ++++++++++++++--- gc-attrs.h | 9 +++++---- gc-embedder-api.h | 8 ++++++++ large-object-space.h | 43 +++++++++++++++++++++++++++++++++++++++++ mt-gcbench.c | 6 +++--- semi-attrs.h | 6 +++--- semi.c | 4 ++++ simple-gc-embedder.h | 19 ++++++++++++++++++ simple-tagging-scheme.h | 8 +++++--- whippet-attrs.h | 13 ++++++++----- whippet.c | 16 ++++++++++++++- 13 files changed, 135 insertions(+), 25 deletions(-) diff --git a/bdw-attrs.h b/bdw-attrs.h index 960e543b0..e7a08100d 100644 --- a/bdw-attrs.h +++ b/bdw-attrs.h @@ -40,13 +40,13 @@ static inline int gc_allocator_needs_clear(void) { return 0; } -static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { +static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t) { return GC_WRITE_BARRIER_NONE; } -static inline size_t gc_small_write_barrier_card_table_alignment(void) { +static inline size_t gc_write_barrier_card_table_alignment(void) { GC_CRASH(); } -static inline size_t gc_small_write_barrier_card_size(void) { +static inline size_t gc_write_barrier_card_size(void) { GC_CRASH(); } diff --git a/bdw.c b/bdw.c index 
29958f133..cf17f19e7 100644 --- a/bdw.c +++ b/bdw.c @@ -115,6 +115,10 @@ void gc_collect(struct gc_mutator *mut) { GC_gcollect(); } +void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) { +} + // In BDW-GC, we can't hook into the mark phase to call // gc_trace_ephemerons_for_object, so the advertised ephemeron strategy // doesn't really work. The primitives that we have are mark functions, diff --git a/gc-api.h b/gc-api.h index 5f0d3c1aa..6cf783703 100644 --- a/gc-api.h +++ b/gc-api.h @@ -177,18 +177,30 @@ static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge struct gc_ref new_val) GC_ALWAYS_INLINE; static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge, struct gc_ref new_val) { - switch (gc_small_write_barrier_kind()) { +} + +GC_API_ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) GC_NEVER_INLINE; + +static inline void gc_write_barrier(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) GC_ALWAYS_INLINE; +static inline void gc_write_barrier(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) { + switch (gc_write_barrier_kind(obj_size)) { case GC_WRITE_BARRIER_NONE: return; case GC_WRITE_BARRIER_CARD: { - size_t card_table_alignment = gc_small_write_barrier_card_table_alignment(); - size_t card_size = gc_small_write_barrier_card_size(); + size_t card_table_alignment = gc_write_barrier_card_table_alignment(); + size_t card_size = gc_write_barrier_card_size(); uintptr_t addr = gc_ref_value(obj); uintptr_t base = addr & ~(card_table_alignment - 1); uintptr_t card = (addr & (card_table_alignment - 1)) / card_size; atomic_store_explicit((uint8_t*)(base + card), 1, memory_order_relaxed); return; } + case GC_WRITE_BARRIER_EXTERN: + gc_write_barrier_extern(obj, obj_size, edge, new_val); + return; default: GC_CRASH(); } diff --git a/gc-attrs.h b/gc-attrs.h index 17ff2add5..60d8e3351 100644 --- a/gc-attrs.h +++ b/gc-attrs.h @@ -29,11 +29,12 @@ static inline int gc_allocator_needs_clear(void) GC_ALWAYS_INLINE; enum gc_write_barrier_kind { GC_WRITE_BARRIER_NONE, - GC_WRITE_BARRIER_CARD + GC_WRITE_BARRIER_CARD, + GC_WRITE_BARRIER_EXTERN }; -static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) GC_ALWAYS_INLINE; -static inline size_t gc_small_write_barrier_card_table_alignment(void) GC_ALWAYS_INLINE; -static inline size_t gc_small_write_barrier_card_size(void) GC_ALWAYS_INLINE; +static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) GC_ALWAYS_INLINE; +static inline size_t gc_write_barrier_card_table_alignment(void) GC_ALWAYS_INLINE; +static inline size_t gc_write_barrier_card_size(void) GC_ALWAYS_INLINE; #endif // GC_ATTRS_H diff --git a/gc-embedder-api.h b/gc-embedder-api.h index 8ae45ef61..6e39f05ea 100644 --- a/gc-embedder-api.h +++ b/gc-embedder-api.h @@ -41,6 +41,14 @@ GC_EMBEDDER_API inline void gc_trace_heap_roots(struct gc_heap_roots *roots, struct gc_heap *heap, void *trace_data); +// Some heap objects have space for a "remembered" bit, indicating they +// are in the remembered set. Large or potentially large objects +// (e.g. a vector whose size is a run-time property) must have a +// remembered set bit. Small objects may or may not have such a bit. 
+GC_EMBEDDER_API inline void gc_object_set_remembered(struct gc_ref ref); +GC_EMBEDDER_API inline int gc_object_is_remembered_nonatomic(struct gc_ref ref); +GC_EMBEDDER_API inline void gc_object_clear_remembered_nonatomic(struct gc_ref ref); + GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref); GC_EMBEDDER_API inline void gc_object_forward_nonatomic(struct gc_ref ref, struct gc_ref new_ref); diff --git a/large-object-space.h b/large-object-space.h index de41dea60..9d8d0d06a 100644 --- a/large-object-space.h +++ b/large-object-space.h @@ -58,6 +58,49 @@ static size_t large_object_space_npages(struct large_object_space *space, return (bytes + space->page_size - 1) >> space->page_size_log2; } +static void large_object_space_clear_one_remembered(uintptr_t addr, + void *unused) { + struct gc_ref ref = gc_ref(addr); + if (gc_object_is_remembered_nonatomic(ref)) + gc_object_clear_remembered_nonatomic(ref); +} + +static void +large_object_space_clear_remembered_set(struct large_object_space *space) { + if (!GC_GENERATIONAL) + return; + address_set_for_each(&space->to_space, + large_object_space_clear_one_remembered, NULL); +} + +struct large_object_space_trace_remembered_data { + void (*trace)(struct gc_ref, struct gc_heap*); + struct gc_heap *heap; +}; + +static void large_object_space_trace_one_remembered(uintptr_t addr, + void *data) { + struct gc_ref ref = gc_ref(addr); + if (gc_object_is_remembered_nonatomic(ref)) { + gc_object_clear_remembered_nonatomic(ref); + struct large_object_space_trace_remembered_data *vdata = data; + vdata->trace(ref, vdata->heap); + } +} + +static void +large_object_space_trace_remembered_set(struct large_object_space *space, + void (*trace)(struct gc_ref, + struct gc_heap*), + struct gc_heap *heap) { + struct large_object_space_trace_remembered_data vdata = { trace, heap }; + + if (!GC_GENERATIONAL) + return; + address_set_for_each(&space->to_space, + large_object_space_trace_one_remembered, &vdata); +} + static void large_object_space_start_gc(struct large_object_space *space, int is_minor_gc) { if (is_minor_gc) diff --git a/mt-gcbench.c b/mt-gcbench.c index f72dac66e..e7e7d8a58 100644 --- a/mt-gcbench.c +++ b/mt-gcbench.c @@ -144,9 +144,9 @@ static void allocate_garbage(struct thread *t) { } static void set_field(Node *obj, Node **field, Node *val) { - gc_small_write_barrier(gc_ref_from_heap_object(obj), - gc_edge(field), - gc_ref_from_heap_object(val)); + gc_write_barrier(gc_ref_from_heap_object(obj), sizeof(Node), + gc_edge(field), + gc_ref_from_heap_object(val)); *field = val; } diff --git a/semi-attrs.h b/semi-attrs.h index e6b429178..3bf9584b8 100644 --- a/semi-attrs.h +++ b/semi-attrs.h @@ -42,13 +42,13 @@ static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { GC_CRASH(); } -static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { +static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t) { return GC_WRITE_BARRIER_NONE; } -static inline size_t gc_small_write_barrier_card_table_alignment(void) { +static inline size_t gc_write_barrier_card_table_alignment(void) { GC_CRASH(); } -static inline size_t gc_small_write_barrier_card_size(void) { +static inline size_t gc_write_barrier_card_size(void) { GC_CRASH(); } diff --git a/semi.c b/semi.c index 5d85ec8c6..11b74ec5f 100644 --- a/semi.c +++ b/semi.c @@ -375,6 +375,10 @@ void gc_collect(struct gc_mutator *mut) { collect(mut, 0); } +void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref 
new_val) { +} + static void collect_for_large_alloc(struct gc_mutator *mut, size_t npages) { collect_for_alloc(mut, npages * mutator_semi_space(mut)->page_size); } diff --git a/simple-gc-embedder.h b/simple-gc-embedder.h index 70fd5c7a8..14fb142e7 100644 --- a/simple-gc-embedder.h +++ b/simple-gc-embedder.h @@ -86,6 +86,25 @@ static inline void gc_object_forward_nonatomic(struct gc_ref ref, *tag_word(ref) = gc_ref_value(new_ref); } +static inline void gc_object_set_remembered(struct gc_ref ref) { + uintptr_t *loc = tag_word(ref); + uintptr_t tag = *loc; + while (!(tag & gcobj_remembered_bit)) + atomic_compare_exchange_weak(loc, &tag, tag | gcobj_remembered_bit); +} + +static inline int gc_object_is_remembered_nonatomic(struct gc_ref ref) { + uintptr_t *loc = tag_word(ref); + uintptr_t tag = *loc; + return tag & gcobj_remembered_bit; +} + +static inline void gc_object_clear_remembered_nonatomic(struct gc_ref ref) { + uintptr_t *loc = tag_word(ref); + uintptr_t tag = *loc; + *loc = tag & ~(uintptr_t)gcobj_remembered_bit; +} + static inline struct gc_atomic_forward gc_atomic_forward_begin(struct gc_ref ref) { uintptr_t tag = atomic_load_explicit(tag_word(ref), memory_order_acquire); diff --git a/simple-tagging-scheme.h b/simple-tagging-scheme.h index b6b8a924c..aa0b707e4 100644 --- a/simple-tagging-scheme.h +++ b/simple-tagging-scheme.h @@ -7,9 +7,11 @@ struct gc_header { uintptr_t tag; }; -// Alloc kind is in bits 1-7, for live objects. -static const uintptr_t gcobj_alloc_kind_mask = 0x7f; -static const uintptr_t gcobj_alloc_kind_shift = 1; +// Alloc kind is in bits 2-7, for live objects. +static const uintptr_t gcobj_alloc_kind_mask = 0x3f; +static const uintptr_t gcobj_alloc_kind_shift = 2; +static const uintptr_t gcobj_remembered_mask = 0x2; +static const uintptr_t gcobj_remembered_bit = 0x2; static const uintptr_t gcobj_forwarded_mask = 0x1; static const uintptr_t gcobj_not_forwarded_bit = 0x1; static const uintptr_t gcobj_busy = 0; diff --git a/whippet-attrs.h b/whippet-attrs.h index bfecc44db..b26d79ad3 100644 --- a/whippet-attrs.h +++ b/whippet-attrs.h @@ -40,16 +40,19 @@ static inline int gc_allocator_needs_clear(void) { return 0; } -static inline enum gc_write_barrier_kind gc_small_write_barrier_kind(void) { - if (GC_GENERATIONAL) - return GC_WRITE_BARRIER_CARD; +static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { + if (GC_GENERATIONAL) { + if (obj_size <= gc_allocator_large_threshold()) + return GC_WRITE_BARRIER_CARD; + return GC_WRITE_BARRIER_EXTERN; + } return GC_WRITE_BARRIER_NONE; } -static inline size_t gc_small_write_barrier_card_table_alignment(void) { +static inline size_t gc_write_barrier_card_table_alignment(void) { GC_ASSERT(GC_GENERATIONAL); return 4 * 1024 * 1024; } -static inline size_t gc_small_write_barrier_card_size(void) { +static inline size_t gc_write_barrier_card_size(void) { GC_ASSERT(GC_GENERATIONAL); return 256; } diff --git a/whippet.c b/whippet.c index 5d692b728..4771e37e9 100644 --- a/whippet.c +++ b/whippet.c @@ -1333,6 +1333,10 @@ static void trace_global_conservative_roots(struct gc_heap *heap) { (mark_and_globally_enqueue_heap_conservative_roots, heap, NULL); } +static void enqueue_generational_root(struct gc_ref ref, struct gc_heap *heap) { + tracer_enqueue_root(&heap->tracer, ref); +} + // Note that it's quite possible (and even likely) that any given remset // byte doesn't hold any roots, if all stores were to nursery objects. 
STATIC_ASSERT_EQ(GRANULES_PER_REMSET_BYTE % 8, 0); @@ -1352,7 +1356,7 @@ static void mark_space_trace_card(struct mark_space *space, size_t granule = granule_base + granule_offset; uintptr_t addr = first_addr_in_slab + granule * GRANULE_SIZE; GC_ASSERT(metadata_byte_for_addr(addr) == &slab->metadata[granule]); - tracer_enqueue_root(&heap->tracer, gc_ref(addr)); + enqueue_generational_root(gc_ref(addr), heap); } } } @@ -1385,12 +1389,22 @@ static void mark_space_clear_remembered_set(struct mark_space *space) { } } +void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) { + GC_ASSERT(size > gc_allocator_large_threshold()); + gc_object_set_remembered(obj); +} + static void trace_generational_roots(struct gc_heap *heap) { // TODO: Add lospace nursery. if (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_MINOR) { mark_space_trace_remembered_set(heap_mark_space(heap), heap); + large_object_space_trace_remembered_set(heap_large_object_space(heap), + enqueue_generational_root, + heap); } else { mark_space_clear_remembered_set(heap_mark_space(heap)); + large_object_space_clear_remembered_set(heap_large_object_space(heap)); } } From 5fbd21a7c33c377be98d06a9abecc31ec348e9c5 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 3 Aug 2023 11:21:00 +0200 Subject: [PATCH 176/403] Add USER-GUIDE.md --- USER-GUIDE.md | 544 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 544 insertions(+) create mode 100644 USER-GUIDE.md diff --git a/USER-GUIDE.md b/USER-GUIDE.md new file mode 100644 index 000000000..434a6f6cb --- /dev/null +++ b/USER-GUIDE.md @@ -0,0 +1,544 @@ +# Whippet user's guide + +Whippet is an embed-only library: it should be copied into the source +tree of the program that uses it. The program's build system needs to +be wired up to compile Whippet, then link it into the program that uses +it. + +## Subtree merges + +One way is get Whippet is just to manually copy the files present in a +Whippet checkout into your project. However probably the best way is to +perform a [subtree +merge](https://docs.github.com/en/get-started/using-git/about-git-subtree-merges) +of Whippet into your project's Git repository, so that you can easily +update your copy of Whippet in the future. + +Performing the first subtree merge is annoying and full of arcane +incantations. Follow the [subtree merge +page](https://docs.github.com/en/get-started/using-git/about-git-subtree-merges) +for full details, but for a cheat sheet, you might do something like +this to copy Whippet into the `whippet/` directory of your project root: + +``` +git remote add whippet https://github.com/wingo/whippet-gc +git fetch whippet +git merge -s ours --no-commit --allow-unrelated-histories whippet/main +git read-tree --prefix=whippet/ -u whippet/main +git commit -m 'Added initial Whippet merge' +``` + +Then to later update your copy of whippet, assuming you still have the +`whippet` remote, just do: + +``` +git pull -s subtree whippet main +``` + +## `gc-embedder-api.h` + +To determine the live set of objects, a tracing garbage collector starts +with a set of root objects, and then transitively visits all reachable +object edges. Exactly how it goes about doing this depends on the +program that is using the garbage collector; different programs will +have different object representations, different strategies for +recording roots, and so on. 
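As a purely hypothetical illustration of that variability, a program whose heap objects all begin with a kind byte might enumerate outgoing edges with a switch like the one below; a program using tagged pointer words would do something entirely different. None of the names in this sketch are part of Whippet's API.

```c
#include <stddef.h>
#include <stdint.h>

// Hypothetical object layout, just to fix ideas; not part of Whippet.
enum kind { KIND_LEAF, KIND_PAIR, KIND_VECTOR };
struct obj { uint8_t kind; };
struct pair { uint8_t kind; struct obj *car, *cdr; };
struct vec { uint8_t kind; size_t len; struct obj *vals[]; };

// One program's way of enumerating the outgoing edges of an object.
static void visit_edges(struct obj *o, void (*visit)(struct obj **edge)) {
  switch (o->kind) {
  case KIND_PAIR: {
    struct pair *p = (struct pair *) o;
    visit(&p->car);
    visit(&p->cdr);
    break;
  }
  case KIND_VECTOR: {
    struct vec *v = (struct vec *) o;
    for (size_t i = 0; i < v->len; i++)
      visit(&v->vals[i]);
    break;
  }
  default:
    break;  // leaf objects have no outgoing edges
  }
}
```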
+ +To traverse the heap in a program-specific way but without imposing an +abstraction overhead, Whippet requires that a number of data types and +inline functions be implemented by the program, for use by Whippet +itself. This is the *embedder API*, and this document describes what +Whippet requires from a program. + +A program should provide a header file implementing the API in +[`gc-embedder-api.h`](./gc-embedder-api.h). This header should only be +included when compiling Whippet itself; it is not part of the API that +Whippet exposes to the program. + +### Identifying roots + +The collector uses two opaque struct types, `struct gc_mutator_roots` +and `struct gc_heap_roots`, that are used by the program to record +object roots. Probably you should put the definition of these data +types in a separate header that is included both by Whippet, via the +embedder API, and via users of Whippet, so that programs can populate +the root set. In any case the embedder-API use of these structs is via +`gc_trace_mutator_roots` and `gc_trace_heap_roots`, two functions that +are passed a trace visitor function `trace_edge`, and which should call +that function on all edges from a given mutator or heap. (Usually +mutator roots are per-thread roots, such as from the stack, and heap +roots are global roots.) + +### Tracing objects + +The `gc_trace_object` is responsible for calling the `trace_edge` +visitor function on all outgoing edges in an object. It also includes a +`size` out-parameter, for when the collector wants to measure the size +of an object. `trace_edge` and `size` may be `NULL`, in which case no +tracing or size computation should be performed. + +### Tracing ephemerons + +Most kinds of GC-managed object are defined by the program, but the GC +itself has support for a specific object kind: ephemerons. If the +program allocates ephemerons, it should trace them in the +`gc_trace_object` function by calling `gc_trace_ephemeron` from +[`gc-ephemerons.h`](./gc-ephemerons.h). + +### Remembered-set bits + +When built to support generational garbage collection, Whippet requires +that all "large" or potentially large objects have a flag bit reserved +for use of the garbage collector. A large object is one whose size +exceeds the `gc_allocator_large_threshold()` (see +[`gc-attrs.h`](./gc-attrs.h)), which is a collector-specific value. +Currently the only generational collector is the in-place Whippet +collector, whose large object threshold is 4096 bytes. The +`gc_object_set_remembered`, `gc_object_is_remembered_nonatomic`, and +`gc_object_clear_remembered_nonatomic` embedder functions manage the +remembered bit. Setting the remembered bit should be idempotent; +multiple threads can race to call `gc_object_set_remembered` and do not +synchronize. The query and clear functions are called without +concurrent accessors and so don't have to be atomic. + +### Forwarding objects + +When built with a collector that moves objects, the embedder must also +allow for forwarding pointers to be installed in an object. There are +two forwarding APIs: one that is atomic and one that isn't. + +The nonatomic API is relatively simple; there is a +`gc_object_forwarded_nonatomic` function that returns an embedded +forwarding address, or 0 if the object is not yet forwarded, and +`gc_object_forward_nonatomic`, which installs a forwarding pointer. + +The atomic API is gnarly. It is used by parallel collectors, in which +multiple collector threads can race to evacuate an object. 
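Before getting to the atomic machinery, here is roughly what the nonatomic hooks can look like when the forwarding pointer lives in an initial tag word whose low bit is set while the object is live and unforwarded, as in the `simple-tagging-scheme.h` used by the benchmarks. This is only a sketch under that tag-word assumption; an embedder with a dedicated forwarding word would do it differently.

```c
// Sketch: nonatomic forwarding with the forwarding pointer stored in an
// initial tag word (an assumption of this example, not a requirement).
// A live, unforwarded object keeps a low tag bit set; forwarding
// overwrites the tag with the new address, whose alignment keeps that
// bit clear.
static const uintptr_t not_forwarded_bit = 0x1;

static inline uintptr_t* tag_word(struct gc_ref ref) {
  return (uintptr_t *) gc_ref_value(ref);
}

static inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref) {
  uintptr_t tag = *tag_word(ref);
  return (tag & not_forwarded_bit) ? 0 : tag;
}

static inline void gc_object_forward_nonatomic(struct gc_ref ref,
                                               struct gc_ref new_ref) {
  *tag_word(ref) = gc_ref_value(new_ref);
}
```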
+ +There is a state machine associated with the `gc_atomic_forward` +structure from [`gc-forwarding.h`](./gc-forwarding.h); the embedder API +implements the state changes. The collector calls +`gc_atomic_forward_begin` on an object to begin a forwarding attempt, +and the resulting `gc_atomic_forward` can be in the `NOT_FORWARDED`, +`FORWARDED`, or `BUSY` state. + +If the `gc_atomic_forward`'s state is `BUSY`, the collector will call +`gc_atomic_forward_retry_busy`; a return value of 0 means the object is +still busy, because another thread is attempting to forward it. +Otherwise the forwarding state becomes either `FORWARDED`, if the other +thread succeeded in forwarding it, or `ABORTED`, indicating that the +other thread failed to forward it. + +If the forwarding state is `FORWARDED`, the collector will call +`gc_atomic_forward_address` to get the new address. + +If the forwarding state is `NOT_FORWARDED`, the collector may begin a +forwarding attempt by calling `gc_atomic_forward_acquire`. The +resulting state is `ACQUIRED` on success, or `BUSY` if another thread +acquired the object in the meantime, or `FORWARDED` if another thread +acquired and completed the forwarding attempt. + +An `ACQUIRED` object can then be forwarded via +`gc_atomic_forward_commit`, or the forwarding attempt can be aborted via +`gc_atomic_forward_abort`. + +All of these `gc_atomic_forward` functions are to be implemented by the +embedder. Some programs may allocate a dedicated forwarding word in all +objects; some will manage to store the forwarding word in an initial +"tag" word, via a specific pattern for the low 3 bits of the tag that no +non-forwarded object will have. The low-bits approach takes advantage +of the collector's minimum object alignment, in which objects are +aligned at least to an 8-byte boundary, so all objects have 0 for the +low 3 bits of their address. + +### Conservative references + +Finally, when configured in a mode in which root edges or intra-object +edges are *conservative*, the embedder can filter out which bit patterns +might be an object reference by implementing +`gc_is_valid_conservative_ref_displacement`. Here, the collector masks +off the low bits of a conservative reference, and asks the embedder if a +value with those low bits might point to an object. Usually the +embedder should return 1 only if the displacement is 0, but if the +program allows low-bit tagged pointers, then it should also return 1 for +those pointer tags. + +## Configuration, compilation, and linking + +To the user, Whippet presents an abstract API that does not encode the +specificities of any given collector. Whippet currently includes three +implementations of that API: `semi`, a simple semi-space collector; +`bdw`, an implementation via the third-party +[Boehm-Demers-Weiser](https://github.com/ivmai/bdwgc) conservative +collector; and `whippet`, an Immix-like collector. + +There is a bit of name overloading between the Whippet abstract API, the +collection of GC implementations, and the specific Whippet collector; +our apologies. It's just like that, and we hope to make the usage +obvious from context. + +The program that embeds Whippet selects the collector implementation at +build-time. In the case of the specific Whippet collector, the program +also configures a specific collector mode, again at build-time: +generational or not, parallel or not, stack-conservative or not, and +heap-conservative or not. 
It may be nice in the future to be able to +configure these at run-time, but for the time being they are +compile-time options so that adding new features doesn't change the +footprint of a more minimal collector. + +Different collectors have different allocation strategies: for example, +the BDW collector allocates from thread-local freelists, whereas the +semi-space collector has a bump-pointer allocator. A collector may also +expose a write barrier, for example to enable generational collection. +For performance reasons, many of these details can't be hidden behind an +opaque functional API: they must be inlined into call sites. Whippet's +approach is to expose fast paths as part of its inline API, but which +are *parameterized* on attributes of the selected garbage collector. +The goal is to keep the user's code generic and avoid any code +dependency on the choice of garbage collector. Because of inlining, +however, the choice of garbage collector does need to be specified when +compiling user code. + +### Compiling the collector + +Building the collector is not as easy as it should be. As an embed-only +library, we don't get to choose the One True Build System and then just +build the software in that way; instead Whippet needs to be buildable +with any build system. At some point we will have snippets that +embedders can include in their various build systems, but for now we +document the low-level structure, so that people can craft the +appropriate incantations for their program's build system. + +Whippet consists of some collector-implementation-agnostic independent +modules, and then the collector implementation itself. Though Whippet +tries to put performance-sensitive interfaces in header files, users +should also compile with link-time optimization (LTO) to remove any +overhead imposed by the division of code into separate compilation +units. + +Usually you want to build with maximum optimization and no debugging +assertions. Sometimes you want minimal optimization and all assertions. +Here's what we do, as a `Makefile` snippet: + +``` +DEFAULT_BUILD=opt +BUILD_CFLAGS_opt=-O2 -g -DNDEBUG +BUILD_CFLAGS_optdebug=-Og -g -DGC_DEBUG=1 +BUILD_CFLAGS_debug=-O0 -g -DGC_DEBUG=1 +BUILD_CFLAGS=$(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) +``` + +So if you do just plain `make`, it will do an `opt` build. You can +specify the build mode by setting `BUILD` on the command line, as in +`make BUILD=debug`. + +Then for the actual compilation flags, we do: + +``` +CC=gcc +CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) +INCLUDES=-I. +LDFLAGS=-lpthread -flto +COMPILE=$(CC) $(CFLAGS) $(INCLUDES) +``` + +The actual include directory (the dot in `-I.`) should be adjusted as +appropriate. + +#### Collector-implementation-agnostic independent modules + +There are currently four generic modules that don't depend on the choice +of collector. The first is `gc-stack.o`, which has supporting code to +associate mutators (threads) with slices of the native stack, in order +to support conservative root-finding. + +``` +$(COMPILE) -o gc-stack.o -c gc-stack.c +``` + +The next is a generic options interface, to allow the user to +parameterize the collector at run-time, for example to implement a +specific heap sizing strategy. + +``` +$(COMPILE) -o gc-options.o -c gc-options.c +``` + +Next, where Whippet needs to get data from the operating system, for +example the number of processors available, it does so behind an +abstract interface that is selected at compile-time. 
The only +implementation currently is for GNU/Linux, but it's a pretty thin layer, +so adding more systems should not be difficult. + +``` +PLATFORM=gnu-linux +$(COMPILE) -o gc-platform.o -c gc-platform-$(PLATFORM).c +``` + +Finally, something a little more complicated: ephemerons. Ephemerons +are objects that make a weak association between a key and a value. As +first-class objects, they need to be classifiable by the user system, +and notably via the `gc_trace_object` procedure, and therefore need to +have a header whose shape is understandable by the embedding program. +We do this by including the `gc-embedder-api.h` implementation, via +`-include`, in this case providing `foo-embedder.h`: + +``` +$(COMPILE) -include foo-embedder.h -o gc-ephemeron.o -c gc-ephemeron.c +``` + +#### Compile-time options + +There are a number of pre-processor definitions that can parameterize +the collector at build-time: + + * `GC_DEBUG`: If nonzero, then enable debugging assertions. + * `NDEBUG`: This one is a bit weird; if not defined, then enable + debugging assertions and some debugging printouts. Probably + Whippet's use of `NDEBUG` should be folded in to `GC_DEBUG`. + * `GC_PARALLEL`: If nonzero, then enable parallelism in the collector. + Defaults to 0. + * `GC_GENERATIONAL`: If nonzero, then enable generational collection. + Defaults to zero. + * `GC_PRECISE_ROOTS`: If nonzero, then collect precise roots via + `gc_heap_roots` and `gc_mutator_roots`. Defaults to zero. + * `GC_CONSERVATIVE_ROOTS`: If nonzero, then scan the stack and static + data sections for conservative roots. Defaults to zero. Not + mutually exclusive with `GC_PRECISE_ROOTS`. + * `GC_CONSERVATIVE_TRACE`: If nonzero, heap edges are scanned + conservatively. Defaults to zero. + +Some collectors require specific compile-time options. For example, the +semi-space collector has to be able to move all objects; this is not +compatible with conservative roots or heap edges. + +#### Building `semi` + +Finally, let's build a collector. The simplest collector is the +semi-space collector. The entirety of the implementation can be had by +compiling `semi.c`, providing the program's embedder API implementation +via `-include`: + +``` +$(COMPILE) -DGC_PRECISE_ROOTS=1 -include foo-embedder.h -o gc.o -c semi.c +``` + +#### Building `bdw` + +The next simplest collector uses +[BDW-GC](https://github.com/ivmai/bdwgc). This collector must scan the +roots and heap conservatively. The collector is parallel if BDW-GC +itself was compiled with parallelism enabled. + +``` +$(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 \ + `pkg-config --cflags bdw-gc` \ + -include foo-embedder.h -o gc.o -c bdw.c +``` + +#### Building `whippet` + +Finally, there is the whippet collector. It can collect roots precisely +or conservatively, trace precisely or conservatively, be parallel or +not, and be generational or not. + +``` +$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \ + -include foo-embedder.h -o gc.o -c whippet.c +``` + +### Compiling your program + +Any compilation unit that uses the GC API should have the same set of +compile-time options defined as when compiling the collector. +Additionally those compilation units should include the "attributes" +header for the collector in question, namely `semi-attrs.h`, +`bdw-attrs.h`, or `whippet-attrs.h`. 
For example, for parallel +generational whippet, you might have: + +``` +$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \ + -include whippet-attrs.h -o my-program.o -c my-program.c +``` + +### Linking the collector into your program + +Finally to link, pass all objects to the linker. You will want to +ensure that the linker enables `-flto`, for link-time optimization. We +do it like this: + +``` +$(CC) $(LDFLAGS) -o my-program \ + my-program.o gc-stack.o gc-platform.o gc-options.o gc-ephemeron.o +``` + +## Using the collector + +Whew! So you finally built the thing! Did you also link it into your +program? No, because your program isn't written yet? Well this section +is for you: we describe the user-facing API of Whippet, where "user" in +this case denotes the embedding program. + +What is the API, you ask? It is in [`gc-api.h`](./gc-api.h). + +### Heaps and mutators + +To start with, you create a *heap*. Usually an application will create +just one heap. A heap has one or more associated *mutators*. A mutator +is a thread-specific handle on the heap. Allocating objects requires a +mutator. + +The initial heap and mutator are created via `gc_init`, which takes two +input parameters: the *options*, and a stack base address. The options +specify the initial heap size and so on. `gc_init` returns the new heap +as an out parameter, and also returns a mutator for the current thread. + +To make a new mutator for a new thread, use `gc_init_for_thread`. When +a thread is finished with its mutator, call `gc_finish_for_thread`. +Each thread that allocates or accesses GC-managed objects should have +its own mutator. + +The stack base address allows the collector to scan the mutator's stack, +if conservative root-finding is enabled. It may be omitted in the call +to `gc_init` and `gc_init_for_thread`; passing `NULL` tells Whippet to +ask the platform for the stack bounds of the current thread. Generally +speaking, this works on all platforms for the main thread, but not +necessarily on other threads. The most reliable solution is to +explicitly obtain a base address by trampolining through +`gc_call_with_stack_addr`. + +### Options + +There are some run-time parameters that programs and users might want to +set explicitly; these are encapsulated in the *options*. Make an +options object with `gc_allocate_options()`; this object will be +consumed by its `gc_init`. Then, the most convenient thing is to set +those options from `gc_options_parse_and_set_many` from a string passed +on the command line or an environment variable, but to get there we have +to explain the low-level first. There are a few options that are +defined for all collectors: + + * `GC_OPTION_HEAP_SIZE_POLICY`: How should we size the heap? Either + it's `GC_HEAP_SIZE_FIXED` (which is 0), in which the heap size is + fixed at startup; or `GC_HEAP_SIZE_GROWABLE` (1), in which the heap + may grow but will never shrink; or `GC_HEAP_SIZE_ADAPTIVE` (2), in + which we take an + [adaptive](https://wingolog.org/archives/2023/01/27/three-approaches-to-heap-sizing) + approach, depending on the rate of allocation and the cost of + collection. Really you want the adaptive strategy, but if you are + benchmarking you definitely want the fixed policy. + * `GC_OPTION_HEAP_SIZE`: The initial heap size. For a + `GC_HEAP_SIZE_FIXED` policy, this is also the final heap size. In + bytes. + * `GC_OPTION_MAXIMUM_HEAP_SIZE`: For growable and adaptive heaps, the + maximum heap size, in bytes. 
+ * `GC_OPTION_HEAP_SIZE_MULTIPLIER`: For growable heaps, the target heap + multiplier. A heap multiplier of 2.5 means that for 100 MB of live + data, the heap should be 250 MB. + * `GC_OPTION_HEAP_FRUGALITY`: Something that will be used in adaptive + heaps, apparently! Not yet implemented. + * `GC_OPTION_PARALLELISM`: How many threads to devote to collection + tasks during GC pauses. By default, the current number of + processors, with a maximum of 8. + +You can set these options via `gc_option_set_int` and so on; see +[`gc-options.h`](./gc-options.h). Or, you can parse options from +strings: `heap-size-policy`, `heap-size`, `maximum-heap-size`, and so +on. Use `gc_option_from_string` to determine if a string is really an +option. Use `gc_option_parse_and_set` to parse a value for an option. +Use `gc_options_parse_and_set_many` to parse a number of comma-delimited +*key=value* settings from a string. + +### Allocation + +So you have a heap and a mutator; great! Let's allocate! Call +`gc_allocate`, passing the mutator and the number of bytes to allocate. + +There is also `gc_allocate_fast`, which is an inlined fast-path. If +that returns NULL, you need to call `gc_allocate_slow`. The advantage +of this API is that you can punt some root-saving overhead to the slow +path. + +Allocation always succeeds. If it doesn't, it kills your program. The +bytes in the resulting allocation will be initialized to 0. + +The allocation fast path is parameterized by collector-specific +attributes. JIT compilers can also read those attributes to emit +appropriate inline code that replicates the logic of `gc_allocate_fast`. + +### Write barriers + +For some collectors, mutators have to tell the collector whenever they +mutate an object. They tell the collector by calling a *write barrier*; +in Whippet this is currently the case only for generational collectors. + +The write barrier is `gc_write_barrier`; see `gc-api.h` for its +parameters. + +As with allocation, the fast path for the write barrier is parameterized +by collector-specific attributes, to allow JIT compilers to inline write +barriers. + +### Safepoints + +Sometimes Whippet will need to synchronize all threads, for example as +part of the "stop" phase of a stop-and-copy semi-space collector. +Whippet stops at *safepoints*. At a safepoint, all mutators must be +able to enumerate all of their edges to live objects. + +Whippet has cooperative safepoints: mutators have to periodically call +into the collector to potentially synchronize with other mutators. +`gc_allocate_slow` is a safepoint, so if you a bunch of threads that are +all allocating, usually safepoints are reached in a more-or-less prompt +fashion. But if a mutator isn't allocating, it either needs to +temporarily mark itself as inactive by trampolining through +`gc_call_without_gc`, or it should arrange to periodically call +`gc_safepoint`. Marking a mutator as inactive is the right strategy +for, for example, system calls that might block. Periodic safepoints is +better for code that is active but not allocating. + +Thing is, though, `gc_safepoint` is not yet implemented :) It will be, +though! + +Also, the BDW collector actually uses pre-emptive safepoints: it stops +threads via POSIX signals. `gc_safepoint` is (or will be) a no-op with +BDW. + +### Statistics + +Sometimes a program would like some information from the GC: how many +bytes and objects have been allocated? How much time has been spent in +the GC? How many times has GC run, and how many of those were minor +collections? 
What's the maximum pause time? Stuff like that. Whippet +doesn't collect very much info right now, and this should probably +change. For the moment, all you get is `gc_print_stats`. + +### Ephemerons + +Whippet supports ephemerons, first-class objects that weakly associate +keys with values. If the an ephemeron's key ever becomes unreachable, +the ephemeron becomes dead and loses its value. + +The user-facing API is in [`gc-ephemeron.h`](./gc-ephemeron.h). To +allocate an ephemeron, call `gc_allocate_ephemeron`, then initialize its +key and value via `gc_ephemeron_init`. Get the key and value via +`gc_ephemeron_key` and `gc_ephemeron_value`, respectively. + +In Whippet, ephemerons can be linked together in a chain. During GC, if +an ephemeron's chain points to a dead ephemeron, that link will be +elided, allowing the dead ephemeron itself to be collected. In that +way, ephemerons can be used to build weak data structures such as weak +maps. + +Weak data structures are often shared across multiple threads, so all +routines to access and modify chain links are atomic. Use +`gc_ephemeron_chain_head` to access the head of a storage location that +points to an ephemeron; push a new ephemeron on a location with +`gc_ephemeron_chain_push`; and traverse a chain with +`gc_ephemeron_chain_next`. + +An ephemeron association can be removed via `gc_ephemeron_mark_dead`. + +### Finalizers + +Not yet implemented! From e6de2fd633083210220681a0a04c0a2202f2acae Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 6 Aug 2023 12:04:33 +0200 Subject: [PATCH 177/403] Reorganize source tree and document --- Makefile | 126 ++++++------- README.md | 165 ++++-------------- bdw-attrs.h => api/bdw-attrs.h | 0 gc-api.h => api/gc-api.h | 0 gc-assert.h => api/gc-assert.h | 0 gc-attrs.h => api/gc-attrs.h | 0 gc-config.h => api/gc-config.h | 0 .../gc-conservative-ref.h | 0 gc-edge.h => api/gc-edge.h | 0 gc-embedder-api.h => api/gc-embedder-api.h | 0 gc-ephemeron.h => api/gc-ephemeron.h | 0 gc-forwarding.h => api/gc-forwarding.h | 0 gc-inline.h => api/gc-inline.h | 0 gc-options.h => api/gc-options.h | 0 gc-ref.h => api/gc-ref.h | 0 gc-visibility.h => api/gc-visibility.h | 0 semi-attrs.h => api/semi-attrs.h | 0 whippet-attrs.h => api/whippet-attrs.h | 0 benchmarks/README.md | 35 ++++ .../ephemerons-embedder.h | 0 .../ephemerons-types.h | 0 ephemerons.c => benchmarks/ephemerons.c | 0 heap-objects.h => benchmarks/heap-objects.h | 0 .../mt-gcbench-embedder.h | 0 .../mt-gcbench-types.h | 0 mt-gcbench.c => benchmarks/mt-gcbench.c | 0 .../quads-embedder.h | 0 quads-types.h => benchmarks/quads-types.h | 0 quads.c => benchmarks/quads.c | 0 .../simple-allocator.h | 0 .../simple-gc-embedder.h | 0 .../simple-roots-api.h | 0 .../simple-roots-types.h | 0 .../simple-tagging-scheme.h | 0 doc/design.md | 64 +++++++ doc/guile.md | 26 +++ USER-GUIDE.md => doc/manual.md | 14 +- gc.h | 30 ---- address-hash.h => src/address-hash.h | 0 address-map.h => src/address-map.h | 0 address-set.h => src/address-set.h | 0 assert.h => src/assert.h | 0 bdw.c => src/bdw.c | 0 debug.h => src/debug.h | 0 gc-align.h => src/gc-align.h | 0 .../gc-ephemeron-internal.h | 0 gc-ephemeron.c => src/gc-ephemeron.c | 0 gc-internal.h => src/gc-internal.h | 0 .../gc-options-internal.h | 0 gc-options.c => src/gc-options.c | 0 .../gc-platform-gnu-linux.c | 0 gc-platform.h => src/gc-platform.h | 0 gc-stack.c => src/gc-stack.c | 0 gc-stack.h => src/gc-stack.h | 0 gc-trace.h => src/gc-trace.h | 0 .../large-object-space.h | 0 parallel-tracer.h => src/parallel-tracer.h | 0 semi.c 
=> src/semi.c | 0 serial-tracer.h => src/serial-tracer.h | 0 spin.h => src/spin.h | 0 whippet.c => src/whippet.c | 0 test-address-map.c => test/test-address-map.c | 0 test-address-set.c => test/test-address-set.c | 0 63 files changed, 225 insertions(+), 235 deletions(-) rename bdw-attrs.h => api/bdw-attrs.h (100%) rename gc-api.h => api/gc-api.h (100%) rename gc-assert.h => api/gc-assert.h (100%) rename gc-attrs.h => api/gc-attrs.h (100%) rename gc-config.h => api/gc-config.h (100%) rename gc-conservative-ref.h => api/gc-conservative-ref.h (100%) rename gc-edge.h => api/gc-edge.h (100%) rename gc-embedder-api.h => api/gc-embedder-api.h (100%) rename gc-ephemeron.h => api/gc-ephemeron.h (100%) rename gc-forwarding.h => api/gc-forwarding.h (100%) rename gc-inline.h => api/gc-inline.h (100%) rename gc-options.h => api/gc-options.h (100%) rename gc-ref.h => api/gc-ref.h (100%) rename gc-visibility.h => api/gc-visibility.h (100%) rename semi-attrs.h => api/semi-attrs.h (100%) rename whippet-attrs.h => api/whippet-attrs.h (100%) create mode 100644 benchmarks/README.md rename ephemerons-embedder.h => benchmarks/ephemerons-embedder.h (100%) rename ephemerons-types.h => benchmarks/ephemerons-types.h (100%) rename ephemerons.c => benchmarks/ephemerons.c (100%) rename heap-objects.h => benchmarks/heap-objects.h (100%) rename mt-gcbench-embedder.h => benchmarks/mt-gcbench-embedder.h (100%) rename mt-gcbench-types.h => benchmarks/mt-gcbench-types.h (100%) rename mt-gcbench.c => benchmarks/mt-gcbench.c (100%) rename quads-embedder.h => benchmarks/quads-embedder.h (100%) rename quads-types.h => benchmarks/quads-types.h (100%) rename quads.c => benchmarks/quads.c (100%) rename simple-allocator.h => benchmarks/simple-allocator.h (100%) rename simple-gc-embedder.h => benchmarks/simple-gc-embedder.h (100%) rename simple-roots-api.h => benchmarks/simple-roots-api.h (100%) rename simple-roots-types.h => benchmarks/simple-roots-types.h (100%) rename simple-tagging-scheme.h => benchmarks/simple-tagging-scheme.h (100%) create mode 100644 doc/design.md create mode 100644 doc/guile.md rename USER-GUIDE.md => doc/manual.md (97%) delete mode 100644 gc.h rename address-hash.h => src/address-hash.h (100%) rename address-map.h => src/address-map.h (100%) rename address-set.h => src/address-set.h (100%) rename assert.h => src/assert.h (100%) rename bdw.c => src/bdw.c (100%) rename debug.h => src/debug.h (100%) rename gc-align.h => src/gc-align.h (100%) rename gc-ephemeron-internal.h => src/gc-ephemeron-internal.h (100%) rename gc-ephemeron.c => src/gc-ephemeron.c (100%) rename gc-internal.h => src/gc-internal.h (100%) rename gc-options-internal.h => src/gc-options-internal.h (100%) rename gc-options.c => src/gc-options.c (100%) rename gc-platform-gnu-linux.c => src/gc-platform-gnu-linux.c (100%) rename gc-platform.h => src/gc-platform.h (100%) rename gc-stack.c => src/gc-stack.c (100%) rename gc-stack.h => src/gc-stack.h (100%) rename gc-trace.h => src/gc-trace.h (100%) rename large-object-space.h => src/large-object-space.h (100%) rename parallel-tracer.h => src/parallel-tracer.h (100%) rename semi.c => src/semi.c (100%) rename serial-tracer.h => src/serial-tracer.h (100%) rename spin.h => src/spin.h (100%) rename whippet.c => src/whippet.c (100%) rename test-address-map.c => test/test-address-map.c (100%) rename test-address-set.c => test/test-address-set.c (100%) diff --git a/Makefile b/Makefile index 3cfc68013..ee20d5647 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ BUILD_CFLAGS=$(BUILD_CFLAGS_$(or 
$(BUILD),$(DEFAULT_BUILD))) CC=gcc CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) -INCLUDES=-I. +INCLUDES=-Iapi LDFLAGS=-lpthread -flto COMPILE=$(CC) $(CFLAGS) $(INCLUDES) PLATFORM=gnu-linux @@ -38,113 +38,113 @@ ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) all: $(ALL_TESTS) -gc-platform.o: gc-platform.h gc-platform-$(PLATFORM).c gc-visibility.h - $(COMPILE) -o $@ -c gc-platform-$(PLATFORM).c +gc-platform.o: src/gc-platform.h src/gc-platform-$(PLATFORM).c api/gc-visibility.h + $(COMPILE) -o $@ -c src/gc-platform-$(PLATFORM).c -gc-stack.o: gc-stack.c +gc-stack.o: src/gc-stack.c $(COMPILE) -o $@ -c $< -gc-options.o: gc-options.c gc-options.h gc-options-internal.h +gc-options.o: src/gc-options.c api/gc-options.h src/gc-options-internal.h $(COMPILE) -o $@ -c $< -gc-ephemeron-%.o: gc-ephemeron.c gc-ephemeron.h gc-ephemeron-internal.h %-embedder.h - $(COMPILE) -include $*-embedder.h -o $@ -c $< +gc-ephemeron-%.o: src/gc-ephemeron.c api/gc-ephemeron.h src/gc-ephemeron-internal.h benchmarks/%-embedder.h + $(COMPILE) -include benchmarks/$*-embedder.h -o $@ -c $< -bdw-%-gc.o: bdw.c %-embedder.h %.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include $*-embedder.h -o $@ -c bdw.c -bdw-%.o: bdw.c %.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include bdw-attrs.h -o $@ -c $*.c +bdw-%-gc.o: src/bdw.c benchmarks/%-embedder.h benchmarks/%.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include benchmarks/$*-embedder.h -o $@ -c src/bdw.c +bdw-%.o: src/bdw.c benchmarks/%.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/bdw-attrs.h -o $@ -c benchmarks/$*.c bdw-%: bdw-%.o bdw-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) `pkg-config --libs bdw-gc` -o $@ $^ -semi-%-gc.o: semi.c %-embedder.h large-object-space.h assert.h debug.h %.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c semi.c -semi-%.o: semi.c %.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include semi-attrs.h -o $@ -c $*.c +semi-%-gc.o: src/semi.c benchmarks/%-embedder.h + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/semi.c +semi-%.o: src/semi.c benchmarks/%.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/semi-attrs.h -o $@ -c benchmarks/$*.c semi-%: semi-%.o semi-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c whippet-%: whippet-%.o whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -stack-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -stack-conservative-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ 
-c $*.c +stack-conservative-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +stack-conservative-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c stack-conservative-whippet-%: stack-conservative-whippet-%.o stack-conservative-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -heap-conservative-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c -heap-conservative-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c +heap-conservative-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +heap-conservative-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c heap-conservative-whippet-%: heap-conservative-whippet-%.o heap-conservative-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -parallel-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +parallel-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +parallel-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c parallel-whippet-%: parallel-whippet-%.o parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -stack-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -stack-conservative-parallel-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +stack-conservative-parallel-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +stack-conservative-parallel-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c stack-conservative-parallel-whippet-%: stack-conservative-parallel-whippet-%.o stack-conservative-parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -heap-conservative-parallel-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c -heap-conservative-parallel-whippet-%.o: whippet.c %.c - 
$(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include whippet-attrs.h -o $@ -c $*.c +heap-conservative-parallel-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +heap-conservative-parallel-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c heap-conservative-parallel-whippet-%: heap-conservative-parallel-whippet-%.o heap-conservative-parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +generational-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c generational-whippet-%: generational-whippet-%.o generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -stack-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -stack-conservative-generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +stack-conservative-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +stack-conservative-generational-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c stack-conservative-generational-whippet-%: stack-conservative-generational-whippet-%.o stack-conservative-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -heap-conservative-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h serial-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c -heap-conservative-generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c +heap-conservative-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +heap-conservative-generational-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -o $@ -c 
benchmarks/$*.c heap-conservative-generational-whippet-%: heap-conservative-generational-whippet-%.o heap-conservative-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -parallel-generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +parallel-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +parallel-generational-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c parallel-generational-whippet-%: parallel-generational-whippet-%.o parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -stack-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include $*-embedder.h -o $@ -c whippet.c -stack-conservative-parallel-generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include whippet-attrs.h -o $@ -c $*.c +stack-conservative-parallel-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +stack-conservative-parallel-generational-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c stack-conservative-parallel-generational-whippet-%: stack-conservative-parallel-generational-whippet-%.o stack-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ -heap-conservative-parallel-generational-whippet-%-gc.o: whippet.c %-embedder.h large-object-space.h parallel-tracer.h assert.h debug.h heap-objects.h %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include $*-embedder.h -o $@ -c whippet.c -heap-conservative-parallel-generational-whippet-%.o: whippet.c %.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include whippet-attrs.h -o $@ -c $*.c +heap-conservative-parallel-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c +heap-conservative-parallel-generational-whippet-%.o: src/whippet.c benchmarks/%.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c heap-conservative-parallel-generational-whippet-%: heap-conservative-parallel-generational-whippet-%.o heap-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o 
gc-ephemeron-%.o $(CC) $(LDFLAGS) -o $@ $^ diff --git a/README.md b/README.md index a4dcdbd21..e1ac66150 100644 --- a/README.md +++ b/README.md @@ -4,139 +4,37 @@ This repository is for development of Whippet, a new garbage collector implementation, eventually for use in [Guile Scheme](https://gnu.org/s/guile). -## Design +Whippet is an embed-only C library, designed to be copied into a +program's source tree. It exposes an abstract C API for managed memory +allocation, and provides a number of implementations of that API. -Whippet is mainly a mark-region collector, like -[Immix](http://users.cecs.anu.edu.au/~steveb/pubs/papers/immix-pldi-2008.pdf). -See also the lovely detailed [Rust -implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf). +One of the implementations is also called "whippet", and is the +motivation for creating this library. For a detailed introduction, see +[Whippet: Towards a new local +maximum](https://wingolog.org/archives/2023/02/07/whippet-towards-a-new-local-maximum), +a talk given at FOSDEM 2023. -To a first approximation, Whippet is a whole-heap Immix collector with a -large object space on the side. See the Immix paper for full details, -but basically Immix divides the heap into 32kB blocks, and then divides -those blocks into 128B lines. An Immix allocation never spans blocks; -allocations larger than 8kB go into a separate large object space. -Mutators request blocks from the global store and allocate into those -blocks using bump-pointer allocation. When all blocks are consumed, -Immix stops the world and traces the object graph, marking objects but -also the lines that objects are on. After marking, blocks contain some -lines with live objects and others that are completely free. Spans of -free lines are called holes. When a mutator gets a recycled block from -the global block store, it allocates into those holes. Also, sometimes -Immix can choose to evacuate rather than mark. Bump-pointer-into-holes -allocation is quite compatible with conservative roots, so it's an -interesting option for Guile, which has a lot of legacy C API users. +## Documentation -The essential difference of Whippet from Immix stems from a simple -observation: Immix needs a side table of line mark bytes and also a mark -bit or bits in each object (or in a side table). But if instead you -choose to store mark bytes instead of bits (for concurrency reasons) in -a side table, with one mark byte per granule (unit of allocation, -perhaps 16 bytes), then you effectively have a line mark table where the -granule size is the line size. You can bump-pointer allocate into holes -in the mark byte table. + * [Design](./doc/design.md): What's the general idea? + * [Manual](./doc/manual.md): How do you get your program to use + Whippet? What is the API? + * [Guile](./doc/guile.md): Some notes on a potential rebase of Guile on + top of Whippet. -You might think this is a bad tradeoff, and perhaps it is: I don't know -yet. If your granule size is two pointers, then one mark byte per -granule is 6.25% overhead on 64-bit, or 12.5% on 32-bit. Especially on -32-bit, it's a lot! On the other hand, instead of the worst case of one -survivor object wasting a line (or two, in the case of conservative line -marking), granule-size-is-line-size instead wastes nothing. Also, you -don't need GC bits in the object itself, and you can use the mark byte -array to record the object end, so that finding holes in a block can -just read the mark table and can avoid looking at object memory. 
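The size tradeoff discussed just above (and restated in the new `doc/design.md`) follows directly from the quoted figures: 32 kB blocks, 128 B lines, and one mark byte per granule of two pointers. The following stand-alone C sketch is not part of the Whippet sources; it only re-derives the 6.25% / 12.5% overhead numbers from those stated assumptions.

```c
/* Stand-alone sketch: re-derive the mark-byte overhead quoted in the
   design text.  Not part of Whippet; the block and granule sizes are
   the ones mentioned above. */
#include <stddef.h>
#include <stdio.h>

static void report(const char *target, size_t pointer_size) {
  size_t block_size = 32 * 1024;            /* Immix-style 32 kB block */
  size_t granule_size = 2 * pointer_size;   /* granule = two pointers */
  size_t granules_per_block = block_size / granule_size;
  size_t mark_bytes = granules_per_block;   /* one mark byte per granule */
  printf("%s: %zu-byte granule, %zu mark bytes per block (%.2f%% overhead)\n",
         target, granule_size, mark_bytes,
         100.0 * (double)mark_bytes / (double)block_size);
}

int main(void) {
  report("64-bit", 8);   /* 16-byte granules ->  6.25% */
  report("32-bit", 4);   /*  8-byte granules -> 12.50% */
  return 0;
}
```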
+## Source repository structure -Other ideas in Whippet: - - * Minimize stop-the-world phase via parallel marking and punting all - sweeping to mutators - - * Enable mutator parallelism via lock-free block acquisition and lazy - statistics collation - - * Allocate block space using aligned 4 MB slabs, with embedded metadata - to allow metadata bytes, slab headers, and block metadata to be - located via address arithmetic - - * Facilitate conservative collection via mark byte array, oracle for - "does this address start an object" - - * Enable in-place generational collection via card table with one entry - per 256B or so - - * Enable concurrent marking by having three mark bit states (dead, - survivor, marked) that rotate at each collection, and sweeping a - block clears metadata for dead objects; but concurrent marking and - associated SATB barrier not yet implemented - -## What's there - -This repository is a workspace for Whippet implementation. As such, it -has files implementing Whippet itself. It also has some benchmarks to -use in optimizing Whippet: - - - [`mt-gcbench.c`](./mt-gcbench.c): The multi-threaded [GCBench - benchmark](https://hboehm.info/gc/gc_bench.html). An old but - standard benchmark that allocates different sizes of binary trees. - As parameters it takes a heap multiplier and a number of mutator - threads. We analytically compute the peak amount of live data and - then size the GC heap as a multiplier of that size. It has a peak - heap consumption of 10 MB or so per mutator thread: not very large. - At a 2x heap multiplier, it causes about 30 collections for the - whippet collector, and runs somewhere around 200-400 milliseconds in - single-threaded mode, on the machines I have in 2022. For low thread - counts, the GCBench benchmark is small; but then again many Guile - processes also are quite short-lived, so perhaps it is useful to - ensure that small heaps remain lightweight. - - To stress Whippet's handling of fragmentation, we modified this - benchmark to intersperse pseudorandomly-sized holes between tree - nodes. - - - [`quads.c`](./quads.c): A synthetic benchmark that allocates quad - trees. The mutator begins by allocating one long-lived tree of depth - N, and then allocates 13% of the heap in depth-3 trees, 20 times, - simulating a fixed working set and otherwise an allocation-heavy - workload. By observing the times to allocate 13% of the heap in - garbage we can infer mutator overheads, and also note the variance - for the cycles in which GC hits. - -The repository has two other collector implementations, to appropriately -situate Whippet's performance in context: - - - `bdw.h`: The external BDW-GC conservative parallel stop-the-world - mark-sweep segregated-fits collector with lazy sweeping. - - `semi.h`: Semispace copying collector. - - `whippet.h`: The whippet collector. Two different marking - implementations: single-threaded and parallel. Generational and - non-generational variants, also. - -## Guile - -If the Whippet collector works out, it could replace Guile's garbage -collector. Guile currently uses BDW-GC. Guile has a widely used C API -and implements part of its run-time in C. For this reason it may be -infeasible to require precise enumeration of GC roots -- we may need to -allow GC roots to be conservatively identified from data sections and -from stacks. Such conservative roots would be pinned, but other objects -can be moved by the collector if it chooses to do so. 
We assume that -object references within a heap object can be precisely identified. -(However, Guile currently uses BDW-GC in its default configuration, -which scans for references conservatively even on the heap.) - -The existing C API allows direct access to mutable object fields, -without the mediation of read or write barriers. Therefore it may be -impossible to switch to collector strategies that need barriers, such as -generational or concurrent collectors. However, we shouldn't write off -this possibility entirely; an ideal replacement for Guile's GC will -offer the possibility of migration to other GC designs without imposing -new requirements on C API users in the initial phase. - -In this regard, the Whippet experiment also has the goal of identifying -a smallish GC abstraction in Guile, so that we might consider evolving -GC implementation in the future without too much pain. If we switch -away from BDW-GC, we should be able to evaluate that it's a win for a -large majority of use cases. + * [api/](./api/): The user-facing API. Also, the "embedder API"; see + the [manual](./doc/manual.md) for more. + * [doc/](./doc/): Documentation, such as it is. + * [src/](./src/): The actual GC implementation. The specific + implementations of the Whippet API are [`semi.c`](./src/semi.c), a + semi-space collector; [`bdw.c`](./src/bdw.c), the third-party + [BDW-GC](https://github.com/ivmai/bdwgc) conservative parallel + stop-the-world mark-sweep segregated-fits collector with lazy + sweeping; and [`whippet.c`](./src/whippet.c), the whippet collector. + * [benchmarks/](./benchmarks/): Benchmarks. A work in progress. + * [test/](./test/): A dusty attic of minimal testing. ## To do @@ -166,14 +64,8 @@ size. It would be nice if whippet-gc turns out to have this property. ## License -gcbench.c, MT_GCBench.c, and MT_GCBench2.c are from -https://hboehm.info/gc/gc_bench/ and have a somewhat unclear license. I -have modified GCBench significantly so that I can slot in different GC -implementations. The GC implementations themselves are available under -a MIT-style license, the text of which follows: - ``` -Copyright (c) 2022 Andy Wingo +Copyright (c) 2022-2023 Andy Wingo Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -194,3 +86,6 @@ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ``` + +Note that some benchmarks have other licenses; see +[`benchmarks/README.md`](./benchmarks/README.md) for more. 
diff --git a/bdw-attrs.h b/api/bdw-attrs.h similarity index 100% rename from bdw-attrs.h rename to api/bdw-attrs.h diff --git a/gc-api.h b/api/gc-api.h similarity index 100% rename from gc-api.h rename to api/gc-api.h diff --git a/gc-assert.h b/api/gc-assert.h similarity index 100% rename from gc-assert.h rename to api/gc-assert.h diff --git a/gc-attrs.h b/api/gc-attrs.h similarity index 100% rename from gc-attrs.h rename to api/gc-attrs.h diff --git a/gc-config.h b/api/gc-config.h similarity index 100% rename from gc-config.h rename to api/gc-config.h diff --git a/gc-conservative-ref.h b/api/gc-conservative-ref.h similarity index 100% rename from gc-conservative-ref.h rename to api/gc-conservative-ref.h diff --git a/gc-edge.h b/api/gc-edge.h similarity index 100% rename from gc-edge.h rename to api/gc-edge.h diff --git a/gc-embedder-api.h b/api/gc-embedder-api.h similarity index 100% rename from gc-embedder-api.h rename to api/gc-embedder-api.h diff --git a/gc-ephemeron.h b/api/gc-ephemeron.h similarity index 100% rename from gc-ephemeron.h rename to api/gc-ephemeron.h diff --git a/gc-forwarding.h b/api/gc-forwarding.h similarity index 100% rename from gc-forwarding.h rename to api/gc-forwarding.h diff --git a/gc-inline.h b/api/gc-inline.h similarity index 100% rename from gc-inline.h rename to api/gc-inline.h diff --git a/gc-options.h b/api/gc-options.h similarity index 100% rename from gc-options.h rename to api/gc-options.h diff --git a/gc-ref.h b/api/gc-ref.h similarity index 100% rename from gc-ref.h rename to api/gc-ref.h diff --git a/gc-visibility.h b/api/gc-visibility.h similarity index 100% rename from gc-visibility.h rename to api/gc-visibility.h diff --git a/semi-attrs.h b/api/semi-attrs.h similarity index 100% rename from semi-attrs.h rename to api/semi-attrs.h diff --git a/whippet-attrs.h b/api/whippet-attrs.h similarity index 100% rename from whippet-attrs.h rename to api/whippet-attrs.h diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..1a9f1ac87 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,35 @@ +# Benchmarks + + - [`mt-gcbench.c`](./mt-gcbench.c): The multi-threaded [GCBench + benchmark](https://hboehm.info/gc/gc_bench.html). An old but + standard benchmark that allocates different sizes of binary trees. + As parameters it takes a heap multiplier and a number of mutator + threads. We analytically compute the peak amount of live data and + then size the GC heap as a multiplier of that size. It has a peak + heap consumption of 10 MB or so per mutator thread: not very large. + At a 2x heap multiplier, it causes about 30 collections for the + whippet collector, and runs somewhere around 200-400 milliseconds in + single-threaded mode, on the machines I have in 2022. For low thread + counts, the GCBench benchmark is small; but then again many Guile + processes also are quite short-lived, so perhaps it is useful to + ensure that small heaps remain lightweight. + + To stress Whippet's handling of fragmentation, we modified this + benchmark to intersperse pseudorandomly-sized holes between tree + nodes. + + - [`quads.c`](./quads.c): A synthetic benchmark that allocates quad + trees. The mutator begins by allocating one long-lived tree of depth + N, and then allocates 13% of the heap in depth-3 trees, 20 times, + simulating a fixed working set and otherwise an allocation-heavy + workload. 
By observing the times to allocate 13% of the heap in + garbage we can infer mutator overheads, and also note the variance + for the cycles in which GC hits. + +## License + +mt-gcbench.c was originally from https://hboehm.info/gc/gc_bench/, which +has a somewhat unclear license. I have modified GCBench significantly +so that I can slot in different GC implementations. Other files are +distributed under the Whippet license; see the top-level +[README.md](../README.md) for more. diff --git a/ephemerons-embedder.h b/benchmarks/ephemerons-embedder.h similarity index 100% rename from ephemerons-embedder.h rename to benchmarks/ephemerons-embedder.h diff --git a/ephemerons-types.h b/benchmarks/ephemerons-types.h similarity index 100% rename from ephemerons-types.h rename to benchmarks/ephemerons-types.h diff --git a/ephemerons.c b/benchmarks/ephemerons.c similarity index 100% rename from ephemerons.c rename to benchmarks/ephemerons.c diff --git a/heap-objects.h b/benchmarks/heap-objects.h similarity index 100% rename from heap-objects.h rename to benchmarks/heap-objects.h diff --git a/mt-gcbench-embedder.h b/benchmarks/mt-gcbench-embedder.h similarity index 100% rename from mt-gcbench-embedder.h rename to benchmarks/mt-gcbench-embedder.h diff --git a/mt-gcbench-types.h b/benchmarks/mt-gcbench-types.h similarity index 100% rename from mt-gcbench-types.h rename to benchmarks/mt-gcbench-types.h diff --git a/mt-gcbench.c b/benchmarks/mt-gcbench.c similarity index 100% rename from mt-gcbench.c rename to benchmarks/mt-gcbench.c diff --git a/quads-embedder.h b/benchmarks/quads-embedder.h similarity index 100% rename from quads-embedder.h rename to benchmarks/quads-embedder.h diff --git a/quads-types.h b/benchmarks/quads-types.h similarity index 100% rename from quads-types.h rename to benchmarks/quads-types.h diff --git a/quads.c b/benchmarks/quads.c similarity index 100% rename from quads.c rename to benchmarks/quads.c diff --git a/simple-allocator.h b/benchmarks/simple-allocator.h similarity index 100% rename from simple-allocator.h rename to benchmarks/simple-allocator.h diff --git a/simple-gc-embedder.h b/benchmarks/simple-gc-embedder.h similarity index 100% rename from simple-gc-embedder.h rename to benchmarks/simple-gc-embedder.h diff --git a/simple-roots-api.h b/benchmarks/simple-roots-api.h similarity index 100% rename from simple-roots-api.h rename to benchmarks/simple-roots-api.h diff --git a/simple-roots-types.h b/benchmarks/simple-roots-types.h similarity index 100% rename from simple-roots-types.h rename to benchmarks/simple-roots-types.h diff --git a/simple-tagging-scheme.h b/benchmarks/simple-tagging-scheme.h similarity index 100% rename from simple-tagging-scheme.h rename to benchmarks/simple-tagging-scheme.h diff --git a/doc/design.md b/doc/design.md new file mode 100644 index 000000000..1a1c69bee --- /dev/null +++ b/doc/design.md @@ -0,0 +1,64 @@ +# Design + +Whippet is mainly a mark-region collector, like +[Immix](http://users.cecs.anu.edu.au/~steveb/pubs/papers/immix-pldi-2008.pdf). +See also the lovely detailed [Rust +implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf). + +To a first approximation, Whippet is a whole-heap Immix collector with a +large object space on the side. See the Immix paper for full details, +but basically Immix divides the heap into 32kB blocks, and then divides +those blocks into 128B lines. An Immix allocation never spans blocks; +allocations larger than 8kB go into a separate large object space. 
+Mutators request blocks from the global store and allocate into those +blocks using bump-pointer allocation. When all blocks are consumed, +Immix stops the world and traces the object graph, marking objects but +also the lines that objects are on. After marking, blocks contain some +lines with live objects and others that are completely free. Spans of +free lines are called holes. When a mutator gets a recycled block from +the global block store, it allocates into those holes. Also, sometimes +Immix can choose to evacuate rather than mark. Bump-pointer-into-holes +allocation is quite compatible with conservative roots, so it's an +interesting option for Guile, which has a lot of legacy C API users. + +The essential difference of Whippet from Immix stems from a simple +observation: Immix needs a side table of line mark bytes and also a mark +bit or bits in each object (or in a side table). But if instead you +choose to store mark bytes instead of bits (for concurrency reasons) in +a side table, with one mark byte per granule (unit of allocation, +perhaps 16 bytes), then you effectively have a line mark table where the +granule size is the line size. You can bump-pointer allocate into holes +in the mark byte table. + +You might think this is a bad tradeoff, and perhaps it is: I don't know +yet. If your granule size is two pointers, then one mark byte per +granule is 6.25% overhead on 64-bit, or 12.5% on 32-bit. Especially on +32-bit, it's a lot! On the other hand, instead of the worst case of one +survivor object wasting a line (or two, in the case of conservative line +marking), granule-size-is-line-size instead wastes nothing. Also, you +don't need GC bits in the object itself, and you can use the mark byte +array to record the object end, so that finding holes in a block can +just read the mark table and can avoid looking at object memory. + +Other ideas in Whippet: + + * Minimize stop-the-world phase via parallel marking and punting all + sweeping to mutators + + * Enable mutator parallelism via lock-free block acquisition and lazy + statistics collation + + * Allocate block space using aligned 4 MB slabs, with embedded metadata + to allow metadata bytes, slab headers, and block metadata to be + located via address arithmetic + + * Facilitate conservative collection via mark byte array, oracle for + "does this address start an object" + + * Enable in-place generational collection via card table with one entry + per 256B or so + + * Enable concurrent marking by having three mark bit states (dead, + survivor, marked) that rotate at each collection, and sweeping a + block clears metadata for dead objects; but concurrent marking and + associated SATB barrier not yet implemented diff --git a/doc/guile.md b/doc/guile.md new file mode 100644 index 000000000..05bc17e15 --- /dev/null +++ b/doc/guile.md @@ -0,0 +1,26 @@ +# Whippet and Guile + +If the Whippet collector works out, it could replace Guile's garbage +collector. Guile currently uses BDW-GC. Guile has a widely used C API +and implements part of its run-time in C. For this reason it may be +infeasible to require precise enumeration of GC roots -- we may need to +allow GC roots to be conservatively identified from data sections and +from stacks. Such conservative roots would be pinned, but other objects +can be moved by the collector if it chooses to do so. We assume that +object references within a heap object can be precisely identified. 
+(However, Guile currently uses BDW-GC in its default configuration, +which scans for references conservatively even on the heap.) + +The existing C API allows direct access to mutable object fields, +without the mediation of read or write barriers. Therefore it may be +impossible to switch to collector strategies that need barriers, such as +generational or concurrent collectors. However, we shouldn't write off +this possibility entirely; an ideal replacement for Guile's GC will +offer the possibility of migration to other GC designs without imposing +new requirements on C API users in the initial phase. + +In this regard, the Whippet experiment also has the goal of identifying +a smallish GC abstraction in Guile, so that we might consider evolving +GC implementation in the future without too much pain. If we switch +away from BDW-GC, we should be able to evaluate that it's a win for a +large majority of use cases. diff --git a/USER-GUIDE.md b/doc/manual.md similarity index 97% rename from USER-GUIDE.md rename to doc/manual.md index 434a6f6cb..41ff83d91 100644 --- a/USER-GUIDE.md +++ b/doc/manual.md @@ -51,7 +51,7 @@ itself. This is the *embedder API*, and this document describes what Whippet requires from a program. A program should provide a header file implementing the API in -[`gc-embedder-api.h`](./gc-embedder-api.h). This header should only be +[`gc-embedder-api.h`](../api/gc-embedder-api.h). This header should only be included when compiling Whippet itself; it is not part of the API that Whippet exposes to the program. @@ -83,7 +83,7 @@ Most kinds of GC-managed object are defined by the program, but the GC itself has support for a specific object kind: ephemerons. If the program allocates ephemerons, it should trace them in the `gc_trace_object` function by calling `gc_trace_ephemeron` from -[`gc-ephemerons.h`](./gc-ephemerons.h). +[`gc-ephemerons.h`](../api/gc-ephemerons.h). ### Remembered-set bits @@ -91,7 +91,7 @@ When built to support generational garbage collection, Whippet requires that all "large" or potentially large objects have a flag bit reserved for use of the garbage collector. A large object is one whose size exceeds the `gc_allocator_large_threshold()` (see -[`gc-attrs.h`](./gc-attrs.h)), which is a collector-specific value. +[`gc-attrs.h`](../api/gc-attrs.h)), which is a collector-specific value. Currently the only generational collector is the in-place Whippet collector, whose large object threshold is 4096 bytes. The `gc_object_set_remembered`, `gc_object_is_remembered_nonatomic`, and @@ -116,7 +116,7 @@ The atomic API is gnarly. It is used by parallel collectors, in which multiple collector threads can race to evacuate an object. There is a state machine associated with the `gc_atomic_forward` -structure from [`gc-forwarding.h`](./gc-forwarding.h); the embedder API +structure from [`gc-forwarding.h`](../api/gc-forwarding.h); the embedder API implements the state changes. The collector calls `gc_atomic_forward_begin` on an object to begin a forwarding attempt, and the resulting `gc_atomic_forward` can be in the `NOT_FORWARDED`, @@ -379,7 +379,7 @@ program? No, because your program isn't written yet? Well this section is for you: we describe the user-facing API of Whippet, where "user" in this case denotes the embedding program. -What is the API, you ask? It is in [`gc-api.h`](./gc-api.h). +What is the API, you ask? It is in [`gc-api.h`](../api/gc-api.h). ### Heaps and mutators @@ -442,7 +442,7 @@ defined for all collectors: processors, with a maximum of 8. 
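The manual text that follows names `gc_option_from_string` and `gc_option_parse_and_set` for configuring the options listed above from strings. As a minimal hedged sketch of how an embedder might use them: the prototypes and the `struct gc_options` handle below are assumptions made for illustration only; the authoritative declarations live in `api/gc-options.h`.

```c
/* Hedged sketch of string-based option handling as described in
   doc/manual.md.  The declarations below are ASSUMED for illustration;
   consult api/gc-options.h for the real prototypes. */
#include <stdio.h>

struct gc_options;                                   /* assumed opaque handle */
extern int gc_option_from_string(const char *name);  /* assumed: negative if unknown */
extern int gc_option_parse_and_set(struct gc_options *options,
                                   int option, const char *value);  /* assumed */

/* Apply an option given as a name string and a value string, for example
   name = "heap-size", as taken from a command line or the environment. */
static int set_gc_option(struct gc_options *options,
                         const char *name, const char *value) {
  int option = gc_option_from_string(name);
  if (option < 0) {
    fprintf(stderr, "unknown GC option: %s\n", name);
    return 0;
  }
  return gc_option_parse_and_set(options, option, value);
}
```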
You can set these options via `gc_option_set_int` and so on; see -[`gc-options.h`](./gc-options.h). Or, you can parse options from +[`gc-options.h`](../api/gc-options.h). Or, you can parse options from strings: `heap-size-policy`, `heap-size`, `maximum-heap-size`, and so on. Use `gc_option_from_string` to determine if a string is really an option. Use `gc_option_parse_and_set` to parse a value for an option. @@ -519,7 +519,7 @@ Whippet supports ephemerons, first-class objects that weakly associate keys with values. If the an ephemeron's key ever becomes unreachable, the ephemeron becomes dead and loses its value. -The user-facing API is in [`gc-ephemeron.h`](./gc-ephemeron.h). To +The user-facing API is in [`gc-ephemeron.h`](../api/gc-ephemeron.h). To allocate an ephemeron, call `gc_allocate_ephemeron`, then initialize its key and value via `gc_ephemeron_init`. Get the key and value via `gc_ephemeron_key` and `gc_ephemeron_value`, respectively. diff --git a/gc.h b/gc.h deleted file mode 100644 index 1de10afdd..000000000 --- a/gc.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef GC_H_ -#define GC_H_ - -#include "gc-api.h" - -#if defined(GC_BDW) -#include "bdw.h" -#elif defined(GC_SEMI) -#include "semi.h" -#elif defined(GC_WHIPPET) -#define GC_PARALLEL_TRACE 0 -#define GC_GENERATIONAL 0 -#include "whippet.h" -#elif defined(GC_PARALLEL_WHIPPET) -#define GC_PARALLEL_TRACE 1 -#define GC_GENERATIONAL 0 -#include "whippet.h" -#elif defined(GC_GENERATIONAL_WHIPPET) -#define GC_PARALLEL_TRACE 0 -#define GC_GENERATIONAL 1 -#include "whippet.h" -#elif defined(GC_PARALLEL_GENERATIONAL_WHIPPET) -#define GC_PARALLEL_TRACE 1 -#define GC_GENERATIONAL 1 -#include "whippet.h" -#else -#error unknown gc -#endif - -#endif // GC_H_ diff --git a/address-hash.h b/src/address-hash.h similarity index 100% rename from address-hash.h rename to src/address-hash.h diff --git a/address-map.h b/src/address-map.h similarity index 100% rename from address-map.h rename to src/address-map.h diff --git a/address-set.h b/src/address-set.h similarity index 100% rename from address-set.h rename to src/address-set.h diff --git a/assert.h b/src/assert.h similarity index 100% rename from assert.h rename to src/assert.h diff --git a/bdw.c b/src/bdw.c similarity index 100% rename from bdw.c rename to src/bdw.c diff --git a/debug.h b/src/debug.h similarity index 100% rename from debug.h rename to src/debug.h diff --git a/gc-align.h b/src/gc-align.h similarity index 100% rename from gc-align.h rename to src/gc-align.h diff --git a/gc-ephemeron-internal.h b/src/gc-ephemeron-internal.h similarity index 100% rename from gc-ephemeron-internal.h rename to src/gc-ephemeron-internal.h diff --git a/gc-ephemeron.c b/src/gc-ephemeron.c similarity index 100% rename from gc-ephemeron.c rename to src/gc-ephemeron.c diff --git a/gc-internal.h b/src/gc-internal.h similarity index 100% rename from gc-internal.h rename to src/gc-internal.h diff --git a/gc-options-internal.h b/src/gc-options-internal.h similarity index 100% rename from gc-options-internal.h rename to src/gc-options-internal.h diff --git a/gc-options.c b/src/gc-options.c similarity index 100% rename from gc-options.c rename to src/gc-options.c diff --git a/gc-platform-gnu-linux.c b/src/gc-platform-gnu-linux.c similarity index 100% rename from gc-platform-gnu-linux.c rename to src/gc-platform-gnu-linux.c diff --git a/gc-platform.h b/src/gc-platform.h similarity index 100% rename from gc-platform.h rename to src/gc-platform.h diff --git a/gc-stack.c b/src/gc-stack.c similarity index 100% rename 
from gc-stack.c rename to src/gc-stack.c diff --git a/gc-stack.h b/src/gc-stack.h similarity index 100% rename from gc-stack.h rename to src/gc-stack.h diff --git a/gc-trace.h b/src/gc-trace.h similarity index 100% rename from gc-trace.h rename to src/gc-trace.h diff --git a/large-object-space.h b/src/large-object-space.h similarity index 100% rename from large-object-space.h rename to src/large-object-space.h diff --git a/parallel-tracer.h b/src/parallel-tracer.h similarity index 100% rename from parallel-tracer.h rename to src/parallel-tracer.h diff --git a/semi.c b/src/semi.c similarity index 100% rename from semi.c rename to src/semi.c diff --git a/serial-tracer.h b/src/serial-tracer.h similarity index 100% rename from serial-tracer.h rename to src/serial-tracer.h diff --git a/spin.h b/src/spin.h similarity index 100% rename from spin.h rename to src/spin.h diff --git a/whippet.c b/src/whippet.c similarity index 100% rename from whippet.c rename to src/whippet.c diff --git a/test-address-map.c b/test/test-address-map.c similarity index 100% rename from test-address-map.c rename to test/test-address-map.c diff --git a/test-address-set.c b/test/test-address-set.c similarity index 100% rename from test-address-set.c rename to test/test-address-set.c From 2e8a0b3874f121bdbbbc7a2984ef06a8ade809f4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 6 Aug 2023 22:12:08 +0200 Subject: [PATCH 178/403] Add automatic dependency tracking --- .gitignore | 5 +++ Makefile | 106 ++++++++++++++++++++++++++++------------------------- 2 files changed, 61 insertions(+), 50 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..64f222418 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +/*.d +/*.o +/*-ephemerons +/*-mt-gcbench +/*-quads diff --git a/Makefile b/Makefile index ee20d5647..61165242b 100644 --- a/Makefile +++ b/Makefile @@ -29,9 +29,11 @@ BUILD_CFLAGS=$(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) CC=gcc CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) -INCLUDES=-Iapi +CPPFLAGS=-Iapi LDFLAGS=-lpthread -flto -COMPILE=$(CC) $(CFLAGS) $(INCLUDES) +OUTPUT_OPTION=-MMD -MP -o $@ +COMPILE=$(CC) $(CFLAGS) $(CPPFLAGS) $(OUTPUT_OPTION) +LINK=$(CC) $(LDFLAGS) -o $@ PLATFORM=gnu-linux ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) @@ -39,116 +41,120 @@ ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) all: $(ALL_TESTS) gc-platform.o: src/gc-platform.h src/gc-platform-$(PLATFORM).c api/gc-visibility.h - $(COMPILE) -o $@ -c src/gc-platform-$(PLATFORM).c + $(COMPILE) -c src/gc-platform-$(PLATFORM).c gc-stack.o: src/gc-stack.c - $(COMPILE) -o $@ -c $< + $(COMPILE) -c $< gc-options.o: src/gc-options.c api/gc-options.h src/gc-options-internal.h - $(COMPILE) -o $@ -c $< + $(COMPILE) -c $< gc-ephemeron-%.o: src/gc-ephemeron.c api/gc-ephemeron.h src/gc-ephemeron-internal.h benchmarks/%-embedder.h - $(COMPILE) -include benchmarks/$*-embedder.h -o $@ -c $< + $(COMPILE) -include benchmarks/$*-embedder.h -c $< bdw-%-gc.o: src/bdw.c benchmarks/%-embedder.h benchmarks/%.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include benchmarks/$*-embedder.h -o $@ -c src/bdw.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include benchmarks/$*-embedder.h -c src/bdw.c bdw-%.o: src/bdw.c benchmarks/%.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 
-include api/bdw-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/bdw-attrs.h -c benchmarks/$*.c bdw-%: bdw-%.o bdw-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) `pkg-config --libs bdw-gc` -o $@ $^ + $(LINK) `pkg-config --libs bdw-gc` $^ semi-%-gc.o: src/semi.c benchmarks/%-embedder.h - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/semi.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/semi.c semi-%.o: src/semi.c benchmarks/%.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/semi-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/semi-attrs.h -c benchmarks/$*.c semi-%: semi-%.o semi-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c whippet-%: whippet-%.o whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ stack-conservative-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c stack-conservative-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c stack-conservative-whippet-%: stack-conservative-whippet-%.o stack-conservative-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ heap-conservative-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c heap-conservative-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c heap-conservative-whippet-%: heap-conservative-whippet-%.o heap-conservative-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ parallel-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c parallel-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c parallel-whippet-%: parallel-whippet-%.o parallel-whippet-%-gc.o gc-stack.o gc-options.o 
gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ stack-conservative-parallel-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c stack-conservative-parallel-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c stack-conservative-parallel-whippet-%: stack-conservative-parallel-whippet-%.o stack-conservative-parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ heap-conservative-parallel-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c heap-conservative-parallel-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include api/whippet-attrs.h -c benchmarks/$*.c heap-conservative-parallel-whippet-%: heap-conservative-parallel-whippet-%.o heap-conservative-parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c generational-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c generational-whippet-%: generational-whippet-%.o generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ stack-conservative-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c stack-conservative-generational-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c stack-conservative-generational-whippet-%: stack-conservative-generational-whippet-%.o stack-conservative-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ heap-conservative-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 
-include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c heap-conservative-generational-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c heap-conservative-generational-whippet-%: heap-conservative-generational-whippet-%.o heap-conservative-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ parallel-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c parallel-generational-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c parallel-generational-whippet-%: parallel-generational-whippet-%.o parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ stack-conservative-parallel-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c stack-conservative-parallel-generational-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c stack-conservative-parallel-generational-whippet-%: stack-conservative-parallel-generational-whippet-%.o stack-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) $(LDFLAGS) -o $@ $^ + $(LINK) $^ heap-conservative-parallel-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -o $@ -c src/whippet.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c heap-conservative-parallel-generational-whippet-%.o: src/whippet.c benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -o $@ -c benchmarks/$*.c + $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c heap-conservative-parallel-generational-whippet-%: heap-conservative-parallel-generational-whippet-%.o heap-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o - $(CC) 
$(LDFLAGS) -o $@ $^ + $(LINK) $^ -.PRECIOUS: $(ALL_TESTS) +-include gc-platform.d gc-stack.d gc-options.d +-include $(foreach COLLECTOR,$(COLLECTORS),gc-ephemeron-$(COLLECTOR).d) +-include $(foreach TEST,$(ALL_TESTS),$(TEST)-gc.d $(TEST).d) + +.PRECIOUS: $(ALL_TESTS) $(foreach TEST,$(ALL_TESTS),$(TEST)-gc.o $(TEST).o) clean: - rm -f $(ALL_TESTS) + rm -f $(ALL_TESTS) *.d *.o From 07d7df195208c8dcc1edae825426613dc3e99507 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 7 Aug 2023 09:28:45 +0200 Subject: [PATCH 179/403] Update benchmark target names --- .gitignore | 17 ++++++++-- Makefile | 92 +++++++++++++++++++++++++++--------------------------- 2 files changed, 60 insertions(+), 49 deletions(-) diff --git a/.gitignore b/.gitignore index 64f222418..8dd74cd6b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,16 @@ /*.d /*.o -/*-ephemerons -/*-mt-gcbench -/*-quads +/*.bdw +/*.semi +/*.whippet +/*.generational-whippet +/*.parallel-whippet +/*.parallel-generational-whippet +/*.stack-conservative-whippet +/*.stack-conservative-generational-whippet +/*.stack-conservative-parallel-whippet +/*.stack-conservative-parallel-generational-whippet +/*.heap-conservative-whippet +/*.heap-conservative-generational-whippet +/*.heap-conservative-parallel-whippet +/*.heap-conservative-parallel-generational-whippet diff --git a/Makefile b/Makefile index 61165242b..5f959c210 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ COMPILE=$(CC) $(CFLAGS) $(CPPFLAGS) $(OUTPUT_OPTION) LINK=$(CC) $(LDFLAGS) -o $@ PLATFORM=gnu-linux -ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addprefix $(COLLECTOR)-,$(TESTS))) +ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addsuffix .$(COLLECTOR),$(TESTS))) all: $(ALL_TESTS) @@ -49,112 +49,112 @@ gc-stack.o: src/gc-stack.c gc-options.o: src/gc-options.c api/gc-options.h src/gc-options-internal.h $(COMPILE) -c $< -gc-ephemeron-%.o: src/gc-ephemeron.c api/gc-ephemeron.h src/gc-ephemeron-internal.h benchmarks/%-embedder.h +%.gc-ephemeron.o: src/gc-ephemeron.c api/gc-ephemeron.h src/gc-ephemeron-internal.h benchmarks/%-embedder.h $(COMPILE) -include benchmarks/$*-embedder.h -c $< -bdw-%-gc.o: src/bdw.c benchmarks/%-embedder.h benchmarks/%.c +%.bdw.gc.o: src/bdw.c benchmarks/%-embedder.h benchmarks/%.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include benchmarks/$*-embedder.h -c src/bdw.c -bdw-%.o: src/bdw.c benchmarks/%.c +%.bdw.o: src/bdw.c benchmarks/%.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/bdw-attrs.h -c benchmarks/$*.c -bdw-%: bdw-%.o bdw-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.bdw: %.bdw.o %.bdw.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) `pkg-config --libs bdw-gc` $^ -semi-%-gc.o: src/semi.c benchmarks/%-embedder.h +%.semi.gc.o: src/semi.c benchmarks/%-embedder.h $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/semi.c -semi-%.o: src/semi.c benchmarks/%.c +%.semi.o: src/semi.c benchmarks/%.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/semi-attrs.h -c benchmarks/$*.c -semi-%: semi-%.o semi-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.semi: %.semi.o %.semi.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -whippet-%.o: src/whippet.c benchmarks/%.c +%.whippet.o: src/whippet.c 
benchmarks/%.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -whippet-%: whippet-%.o whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.whippet: %.whippet.o %.whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -stack-conservative-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.stack-conservative-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -stack-conservative-whippet-%.o: src/whippet.c benchmarks/%.c +%.stack-conservative-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -stack-conservative-whippet-%: stack-conservative-whippet-%.o stack-conservative-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.stack-conservative-whippet: %.stack-conservative-whippet.o %.stack-conservative-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -heap-conservative-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.heap-conservative-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -heap-conservative-whippet-%.o: src/whippet.c benchmarks/%.c +%.heap-conservative-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c -heap-conservative-whippet-%: heap-conservative-whippet-%.o heap-conservative-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.heap-conservative-whippet: %.heap-conservative-whippet.o %.heap-conservative-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -parallel-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.parallel-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -parallel-whippet-%.o: src/whippet.c benchmarks/%.c +%.parallel-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -parallel-whippet-%: parallel-whippet-%.o parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.parallel-whippet: %.parallel-whippet.o %.parallel-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -stack-conservative-parallel-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.stack-conservative-parallel-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -stack-conservative-parallel-whippet-%.o: src/whippet.c benchmarks/%.c +%.stack-conservative-parallel-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -stack-conservative-parallel-whippet-%: stack-conservative-parallel-whippet-%.o stack-conservative-parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.stack-conservative-parallel-whippet: %.stack-conservative-parallel-whippet.o %.stack-conservative-parallel-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -heap-conservative-parallel-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.heap-conservative-parallel-whippet.gc.o: src/whippet.c 
benchmarks/%-embedder.h $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -heap-conservative-parallel-whippet-%.o: src/whippet.c benchmarks/%.c +%.heap-conservative-parallel-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include api/whippet-attrs.h -c benchmarks/$*.c -heap-conservative-parallel-whippet-%: heap-conservative-parallel-whippet-%.o heap-conservative-parallel-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.heap-conservative-parallel-whippet: %.heap-conservative-parallel-whippet.o %.heap-conservative-parallel-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -generational-whippet-%.o: src/whippet.c benchmarks/%.c +%.generational-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -generational-whippet-%: generational-whippet-%.o generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.generational-whippet: %.generational-whippet.o %.generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -stack-conservative-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.stack-conservative-generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -stack-conservative-generational-whippet-%.o: src/whippet.c benchmarks/%.c +%.stack-conservative-generational-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -stack-conservative-generational-whippet-%: stack-conservative-generational-whippet-%.o stack-conservative-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.stack-conservative-generational-whippet: %.stack-conservative-generational-whippet.o %.stack-conservative-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -heap-conservative-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.heap-conservative-generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -heap-conservative-generational-whippet-%.o: src/whippet.c benchmarks/%.c +%.heap-conservative-generational-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c -heap-conservative-generational-whippet-%: heap-conservative-generational-whippet-%.o heap-conservative-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.heap-conservative-generational-whippet: %.heap-conservative-generational-whippet.o %.heap-conservative-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -parallel-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.parallel-generational-whippet.gc.o: 
src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -parallel-generational-whippet-%.o: src/whippet.c benchmarks/%.c +%.parallel-generational-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -parallel-generational-whippet-%: parallel-generational-whippet-%.o parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.parallel-generational-whippet: %.parallel-generational-whippet.o %.parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -stack-conservative-parallel-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.stack-conservative-parallel-generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -stack-conservative-parallel-generational-whippet-%.o: src/whippet.c benchmarks/%.c +%.stack-conservative-parallel-generational-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -stack-conservative-parallel-generational-whippet-%: stack-conservative-parallel-generational-whippet-%.o stack-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.stack-conservative-parallel-generational-whippet: %.stack-conservative-parallel-generational-whippet.o %.stack-conservative-parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -heap-conservative-parallel-generational-whippet-%-gc.o: src/whippet.c benchmarks/%-embedder.h +%.heap-conservative-parallel-generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -heap-conservative-parallel-generational-whippet-%.o: src/whippet.c benchmarks/%.c +%.heap-conservative-parallel-generational-whippet.o: src/whippet.c benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c -heap-conservative-parallel-generational-whippet-%: heap-conservative-parallel-generational-whippet-%.o heap-conservative-parallel-generational-whippet-%-gc.o gc-stack.o gc-options.o gc-platform.o gc-ephemeron-%.o +%.heap-conservative-parallel-generational-whippet: %.heap-conservative-parallel-generational-whippet.o %.heap-conservative-parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -include gc-platform.d gc-stack.d gc-options.d -include $(foreach COLLECTOR,$(COLLECTORS),gc-ephemeron-$(COLLECTOR).d) --include $(foreach TEST,$(ALL_TESTS),$(TEST)-gc.d $(TEST).d) +-include $(foreach TEST,$(ALL_TESTS),$(TEST).gc.d $(TEST).d) -.PRECIOUS: $(ALL_TESTS) $(foreach TEST,$(ALL_TESTS),$(TEST)-gc.o $(TEST).o) +.PRECIOUS: $(ALL_TESTS) $(foreach TEST,$(ALL_TESTS),$(TEST).gc.o $(TEST).o) clean: rm -f $(ALL_TESTS) *.d *.o From 1c76cdcf072e71937f15c17ad014641de9eeb78b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 7 Aug 2023 10:55:30 +0200 Subject: [PATCH 180/403] Put deps in subdir --- .gitignore | 2 +- Makefile | 19 ++++++++++++------- 2 files changed, 13 
insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 8dd74cd6b..231806c14 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -/*.d /*.o /*.bdw /*.semi @@ -14,3 +13,4 @@ /*.heap-conservative-generational-whippet /*.heap-conservative-parallel-whippet /*.heap-conservative-parallel-generational-whippet +/.deps/ diff --git a/Makefile b/Makefile index 5f959c210..e959d9384 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,8 @@ CC=gcc CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) CPPFLAGS=-Iapi LDFLAGS=-lpthread -flto -OUTPUT_OPTION=-MMD -MP -o $@ +DEPFLAGS=-MMD -MP -MF $(@:%.o=.deps/%.d) +OUTPUT_OPTION=$(DEPFLAGS) -o $@ COMPILE=$(CC) $(CFLAGS) $(CPPFLAGS) $(OUTPUT_OPTION) LINK=$(CC) $(LDFLAGS) -o $@ PLATFORM=gnu-linux @@ -40,6 +41,14 @@ ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addsuffix .$(COLLECTOR),$(TESTS))) all: $(ALL_TESTS) +OBJS=gc-platform.o gc-stack.o gc-options.o +OBJS+=$(foreach TEST,$(TESTS),$(TEST).gc-ephemeron.o) +OBJS+=$(foreach TEST,$(ALL_TESTS),$(TEST).gc.o $(TEST).o) +DEPS=$(OBJS:%.o=.deps/%.d) +$(OBJS): | .deps +.deps: ; mkdir -p .deps +-include $(DEPS) + gc-platform.o: src/gc-platform.h src/gc-platform-$(PLATFORM).c api/gc-visibility.h $(COMPILE) -c src/gc-platform-$(PLATFORM).c @@ -150,11 +159,7 @@ gc-options.o: src/gc-options.c api/gc-options.h src/gc-options-internal.h %.heap-conservative-parallel-generational-whippet: %.heap-conservative-parallel-generational-whippet.o %.heap-conservative-parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ --include gc-platform.d gc-stack.d gc-options.d --include $(foreach COLLECTOR,$(COLLECTORS),gc-ephemeron-$(COLLECTOR).d) --include $(foreach TEST,$(ALL_TESTS),$(TEST).gc.d $(TEST).d) - -.PRECIOUS: $(ALL_TESTS) $(foreach TEST,$(ALL_TESTS),$(TEST).gc.o $(TEST).o) +.PRECIOUS: $(ALL_TESTS) $(OBJS) clean: - rm -f $(ALL_TESTS) *.d *.o + rm -f $(ALL_TESTS) $(OBJS) $(DEPS) From 498d0537f3002c3a3f43a568fa2750250bfe83de Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 7 Aug 2023 10:11:56 +0200 Subject: [PATCH 181/403] Simplify makefile --- Makefile | 69 +++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index e959d9384..d749fe14f 100644 --- a/Makefile +++ b/Makefile @@ -47,114 +47,111 @@ OBJS+=$(foreach TEST,$(ALL_TESTS),$(TEST).gc.o $(TEST).o) DEPS=$(OBJS:%.o=.deps/%.d) $(OBJS): | .deps .deps: ; mkdir -p .deps --include $(DEPS) - -gc-platform.o: src/gc-platform.h src/gc-platform-$(PLATFORM).c api/gc-visibility.h - $(COMPILE) -c src/gc-platform-$(PLATFORM).c +include $(wildcard $(DEPS)) +gc-platform.o: src/gc-platform-$(PLATFORM).c + $(COMPILE) -c $< gc-stack.o: src/gc-stack.c $(COMPILE) -c $< - -gc-options.o: src/gc-options.c api/gc-options.h src/gc-options-internal.h +gc-options.o: src/gc-options.c $(COMPILE) -c $< - -%.gc-ephemeron.o: src/gc-ephemeron.c api/gc-ephemeron.h src/gc-ephemeron-internal.h benchmarks/%-embedder.h +%.gc-ephemeron.o: src/gc-ephemeron.c $(COMPILE) -include benchmarks/$*-embedder.h -c $< -%.bdw.gc.o: src/bdw.c benchmarks/%-embedder.h benchmarks/%.c +%.bdw.gc.o: src/bdw.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include benchmarks/$*-embedder.h -c src/bdw.c -%.bdw.o: src/bdw.c benchmarks/%.c +%.bdw.o: benchmarks/%.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/bdw-attrs.h -c benchmarks/$*.c %.bdw: %.bdw.o %.bdw.gc.o gc-stack.o 
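# A self-contained sketch of the .deps/ auto-dependency scheme the patch
# above introduces (foo.c and bar.c are hypothetical, not from the patch):
# -MMD -MP make the compiler emit a makefile fragment listing each object's
# header dependencies, -MF redirects that fragment into .deps/, and the
# fragments are read back in on later runs.
OBJS = foo.o bar.o
DEPS = $(OBJS:%.o=.deps/%.d)
DEPFLAGS = -MMD -MP -MF $(@:%.o=.deps/%.d)

# Order-only prerequisite: the directory must exist, but its timestamp
# should not trigger rebuilds.
$(OBJS): | .deps
.deps: ; mkdir -p .deps

%.o: %.c ; $(CC) $(DEPFLAGS) -c -o $@ $<

# The fragments do not exist on the first build, hence the tolerant '-'.
-include $(DEPS)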
gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) `pkg-config --libs bdw-gc` $^ -%.semi.gc.o: src/semi.c benchmarks/%-embedder.h +%.semi.gc.o: src/semi.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/semi.c -%.semi.o: src/semi.c benchmarks/%.c +%.semi.o: benchmarks/%.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/semi-attrs.h -c benchmarks/$*.c %.semi: %.semi.o %.semi.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.whippet.gc.o: src/whippet.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.whippet.o: src/whippet.c benchmarks/%.c +%.whippet.o: benchmarks/%.c $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.whippet: %.whippet.o %.whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.stack-conservative-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.stack-conservative-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.stack-conservative-whippet.o: src/whippet.c benchmarks/%.c +%.stack-conservative-whippet.o: benchmarks/%.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.stack-conservative-whippet: %.stack-conservative-whippet.o %.stack-conservative-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.heap-conservative-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.heap-conservative-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.heap-conservative-whippet.o: src/whippet.c benchmarks/%.c +%.heap-conservative-whippet.o: benchmarks/%.c $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.heap-conservative-whippet: %.heap-conservative-whippet.o %.heap-conservative-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.parallel-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.parallel-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.parallel-whippet.o: src/whippet.c benchmarks/%.c +%.parallel-whippet.o: benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.parallel-whippet: %.parallel-whippet.o %.parallel-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.stack-conservative-parallel-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.stack-conservative-parallel-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.stack-conservative-parallel-whippet.o: src/whippet.c benchmarks/%.c +%.stack-conservative-parallel-whippet.o: benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.stack-conservative-parallel-whippet: %.stack-conservative-parallel-whippet.o %.stack-conservative-parallel-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.heap-conservative-parallel-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.heap-conservative-parallel-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c 
-%.heap-conservative-parallel-whippet.o: src/whippet.c benchmarks/%.c +%.heap-conservative-parallel-whippet.o: benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.heap-conservative-parallel-whippet: %.heap-conservative-parallel-whippet.o %.heap-conservative-parallel-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.generational-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.generational-whippet.o: src/whippet.c benchmarks/%.c +%.generational-whippet.o: benchmarks/%.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.generational-whippet: %.generational-whippet.o %.generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.stack-conservative-generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.stack-conservative-generational-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.stack-conservative-generational-whippet.o: src/whippet.c benchmarks/%.c +%.stack-conservative-generational-whippet.o: benchmarks/%.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.stack-conservative-generational-whippet: %.stack-conservative-generational-whippet.o %.stack-conservative-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.heap-conservative-generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.heap-conservative-generational-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.heap-conservative-generational-whippet.o: src/whippet.c benchmarks/%.c +%.heap-conservative-generational-whippet.o: benchmarks/%.c $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.heap-conservative-generational-whippet: %.heap-conservative-generational-whippet.o %.heap-conservative-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.parallel-generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.parallel-generational-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.parallel-generational-whippet.o: src/whippet.c benchmarks/%.c +%.parallel-generational-whippet.o: benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.parallel-generational-whippet: %.parallel-generational-whippet.o %.parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.stack-conservative-parallel-generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.stack-conservative-parallel-generational-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.stack-conservative-parallel-generational-whippet.o: src/whippet.c benchmarks/%.c +%.stack-conservative-parallel-generational-whippet.o: 
benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.stack-conservative-parallel-generational-whippet: %.stack-conservative-parallel-generational-whippet.o %.stack-conservative-parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ -%.heap-conservative-parallel-generational-whippet.gc.o: src/whippet.c benchmarks/%-embedder.h +%.heap-conservative-parallel-generational-whippet.gc.o: src/whippet.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.heap-conservative-parallel-generational-whippet.o: src/whippet.c benchmarks/%.c +%.heap-conservative-parallel-generational-whippet.o: benchmarks/%.c $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c %.heap-conservative-parallel-generational-whippet: %.heap-conservative-parallel-generational-whippet.o %.heap-conservative-parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o $(LINK) $^ From 5fdfd1175e23d9951a846a8610fb2f80db37fab2 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 8 Aug 2023 14:39:31 +0200 Subject: [PATCH 182/403] Use foreach and eval to generate benchmark x collector rules --- Makefile | 135 +++++++++++++++++++++---------------------------------- 1 file changed, 51 insertions(+), 84 deletions(-) diff --git a/Makefile b/Makefile index d749fe14f..e8b3647d6 100644 --- a/Makefile +++ b/Makefile @@ -58,103 +58,70 @@ gc-options.o: src/gc-options.c %.gc-ephemeron.o: src/gc-ephemeron.c $(COMPILE) -include benchmarks/$*-embedder.h -c $< -%.bdw.gc.o: src/bdw.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 `pkg-config --cflags bdw-gc` -include benchmarks/$*-embedder.h -c src/bdw.c -%.bdw.o: benchmarks/%.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/bdw-attrs.h -c benchmarks/$*.c -%.bdw: %.bdw.o %.bdw.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) `pkg-config --libs bdw-gc` $^ +GC_STEM_bdw=bdw +GC_CFLAGS_bdw=-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 +GC_IMPL_CFLAGS_bdw=`pkg-config --cflags bdw-gc` +GC_LIBS_bdw=`pkg-config --libs bdw-gc` -%.semi.gc.o: src/semi.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/semi.c -%.semi.o: benchmarks/%.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/semi-attrs.h -c benchmarks/$*.c -%.semi: %.semi.o %.semi.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_semi=semi +GC_CFLAGS_semi=-DGC_PRECISE_ROOTS=1 -%.whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.whippet.o: benchmarks/%.c - $(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.whippet: %.whippet.o %.whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_whippet=whippet +GC_CFLAGS_whippet=-DGC_PRECISE_ROOTS=1 -%.stack-conservative-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.stack-conservative-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.stack-conservative-whippet: %.stack-conservative-whippet.o %.stack-conservative-whippet.gc.o gc-stack.o gc-options.o gc-platform.o 
%.gc-ephemeron.o - $(LINK) $^ +GC_STEM_stack_conservative_whippet=whippet +GC_CFLAGS_stack_conservative_whippet=-DGC_CONSERVATIVE_ROOTS=1 -%.heap-conservative-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.heap-conservative-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.heap-conservative-whippet: %.heap-conservative-whippet.o %.heap-conservative-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_heap_conservative_whippet=whippet +GC_CFLAGS_heap_conservative_whippet=-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -%.parallel-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.parallel-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.parallel-whippet: %.parallel-whippet.o %.parallel-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_parallel_whippet=whippet +GC_CFLAGS_parallel_whippet=-DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 -%.stack-conservative-parallel-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.stack-conservative-parallel-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.stack-conservative-parallel-whippet: %.stack-conservative-parallel-whippet.o %.stack-conservative-parallel-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_stack_conservative_parallel_whippet=whippet +GC_CFLAGS_stack_conservative_parallel_whippet=-DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -%.heap-conservative-parallel-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.heap-conservative-parallel-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_FULLY_CONSERVATIVE=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.heap-conservative-parallel-whippet: %.heap-conservative-parallel-whippet.o %.heap-conservative-parallel-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_heap_conservative_parallel_whippet=whippet +GC_CFLAGS_heap_conservative_parallel_whippet=-DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -%.generational-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.generational-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.generational-whippet: %.generational-whippet.o %.generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_generational_whippet=whippet +GC_CFLAGS_generational_whippet=-DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -%.stack-conservative-generational-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.stack-conservative-generational-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c 
benchmarks/$*.c -%.stack-conservative-generational-whippet: %.stack-conservative-generational-whippet.o %.stack-conservative-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_stack_conservative_generational_whippet=whippet +GC_CFLAGS_stack_conservative_generational_whippet=-DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -%.heap-conservative-generational-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.heap-conservative-generational-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.heap-conservative-generational-whippet: %.heap-conservative-generational-whippet.o %.heap-conservative-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_heap_conservative_generational_whippet=whippet +GC_CFLAGS_heap_conservative_generational_whippet=-DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -%.parallel-generational-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.parallel-generational-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.parallel-generational-whippet: %.parallel-generational-whippet.o %.parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_parallel_generational_whippet=whippet +GC_CFLAGS_parallel_generational_whippet=-DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 -%.stack-conservative-parallel-generational-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.stack-conservative-parallel-generational-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.stack-conservative-parallel-generational-whippet: %.stack-conservative-parallel-generational-whippet.o %.stack-conservative-parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_stack_conservative_parallel_generational_whippet=whippet +GC_CFLAGS_stack_conservative_parallel_generational_whippet=-DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -%.heap-conservative-parallel-generational-whippet.gc.o: src/whippet.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include benchmarks/$*-embedder.h -c src/whippet.c -%.heap-conservative-parallel-generational-whippet.o: benchmarks/%.c - $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -include api/whippet-attrs.h -c benchmarks/$*.c -%.heap-conservative-parallel-generational-whippet: %.heap-conservative-parallel-generational-whippet.o %.heap-conservative-parallel-generational-whippet.gc.o gc-stack.o gc-options.o gc-platform.o %.gc-ephemeron.o - $(LINK) $^ +GC_STEM_heap_conservative_parallel_generational_whippet=whippet +GC_CFLAGS_heap_conservative_parallel_generational_whippet=-DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 + +# $(1) is the benchmark, $(2) is the collector 
configuration +# gc_stem for bdw: bdw +make_gc_var=$$($(1)$(subst -,_,$(2))) +gc_impl=$(call make_gc_var,GC_STEM_,$(1)).c +gc_attrs=$(call make_gc_var,GC_STEM_,$(1))-attrs.h +gc_cflags=$(call make_gc_var,GC_CFLAGS_,$(1)) +gc_impl_cflags=$(call make_gc_var,GC_IMPL_CFLAGS_,$(1)) +gc_libs=$(call make_gc_var,GC_LIBS_,$(1)) +define benchmark_template +$(1).$(2).gc.o: src/$(call gc_impl,$(2)) + $$(COMPILE) $(call gc_cflags,$(2)) $(call gc_impl_cflags,$(2)) -include benchmarks/$(1)-embedder.h -c $$< +$(1).$(2).o: benchmarks/$(1).c + $$(COMPILE) $(call gc_cflags,$(2)) -include api/$(call gc_attrs,$(2)) -c $$< +$(1).$(2): $(1).$(2).gc.o $(1).$(2).o gc-stack.o gc-options.o gc-platform.o $(1).gc-ephemeron.o + $$(LINK) $(call gc_libs,$(2)) $$^ +endef + +$(foreach BENCHMARK,$(TESTS),\ + $(foreach COLLECTOR,$(COLLECTORS),\ + $(eval $(call benchmark_template,$(BENCHMARK),$(COLLECTOR))))) .PRECIOUS: $(ALL_TESTS) $(OBJS) From f9330f789c69b79850fb206cc369c44a319d9802 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 8 Aug 2023 14:56:30 +0200 Subject: [PATCH 183/403] Attempt to do more DRY with whippet variants --- Makefile | 59 +++++++++++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/Makefile b/Makefile index e8b3647d6..2e3cf35f4 100644 --- a/Makefile +++ b/Makefile @@ -66,41 +66,38 @@ GC_LIBS_bdw=`pkg-config --libs bdw-gc` GC_STEM_semi=semi GC_CFLAGS_semi=-DGC_PRECISE_ROOTS=1 -GC_STEM_whippet=whippet -GC_CFLAGS_whippet=-DGC_PRECISE_ROOTS=1 +define whippet_variant +GC_STEM_$(1)=whippet +GC_CFLAGS_$(1)=$(2) +endef -GC_STEM_stack_conservative_whippet=whippet -GC_CFLAGS_stack_conservative_whippet=-DGC_CONSERVATIVE_ROOTS=1 +$(eval $(call whippet_variant,whippet,\ + -DGC_PRECISE_ROOTS=1)) +$(eval $(call whippet_variant,stack_conservative_whippet,\ + -DGC_CONSERVATIVE_ROOTS=1)) +$(eval $(call whippet_variant,heap_conservative_whippet,\ + -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)) -GC_STEM_heap_conservative_whippet=whippet -GC_CFLAGS_heap_conservative_whippet=-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 +$(eval $(call whippet_variant,parallel_whippet,\ + -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1)) +$(eval $(call whippet_variant,stack_conservative_parallel_whippet,\ + -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1)) +$(eval $(call whippet_variant,heap_conservative_parallel_whippet,\ + -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)) -GC_STEM_parallel_whippet=whippet -GC_CFLAGS_parallel_whippet=-DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 +$(eval $(call whippet_variant,generational_whippet,\ + -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1)) +$(eval $(call whippet_variant,stack_conservative_generational_whippet,\ + -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1)) +$(eval $(call whippet_variant,heap_conservative_generational_whippet,\ + -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)) -GC_STEM_stack_conservative_parallel_whippet=whippet -GC_CFLAGS_stack_conservative_parallel_whippet=-DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 - -GC_STEM_heap_conservative_parallel_whippet=whippet -GC_CFLAGS_heap_conservative_parallel_whippet=-DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 - -GC_STEM_generational_whippet=whippet -GC_CFLAGS_generational_whippet=-DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 - -GC_STEM_stack_conservative_generational_whippet=whippet -GC_CFLAGS_stack_conservative_generational_whippet=-DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 - -GC_STEM_heap_conservative_generational_whippet=whippet 
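# For reference, here is (approximately, modulo whitespace) what one
# instantiation of the template above, $(eval $(call
# benchmark_template,ephemerons,semi)), evaluates to -- reconstructed from
# the per-collector rules that this change deletes:
ephemerons.semi.gc.o: src/semi.c
	$(COMPILE) -DGC_PRECISE_ROOTS=1 -include benchmarks/ephemerons-embedder.h -c $<
ephemerons.semi.o: benchmarks/ephemerons.c
	$(COMPILE) -DGC_PRECISE_ROOTS=1 -include api/semi-attrs.h -c $<
ephemerons.semi: ephemerons.semi.gc.o ephemerons.semi.o gc-stack.o gc-options.o gc-platform.o ephemerons.gc-ephemeron.o
	$(LINK) $^
# For the bdw collector the same template additionally splices in
# GC_IMPL_CFLAGS_bdw (`pkg-config --cflags bdw-gc`) when compiling the .gc.o
# and GC_LIBS_bdw (`pkg-config --libs bdw-gc`) into the link line.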
-GC_CFLAGS_heap_conservative_generational_whippet=-DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 - -GC_STEM_parallel_generational_whippet=whippet -GC_CFLAGS_parallel_generational_whippet=-DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 - -GC_STEM_stack_conservative_parallel_generational_whippet=whippet -GC_CFLAGS_stack_conservative_parallel_generational_whippet=-DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 - -GC_STEM_heap_conservative_parallel_generational_whippet=whippet -GC_CFLAGS_heap_conservative_parallel_generational_whippet=-DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 +$(eval $(call whippet_variant,parallel_generational_whippet,\ + -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1)) +$(eval $(call whippet_variant,stack_conservative_parallel_generational_whippet,\ + -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1)) +$(eval $(call whippet_variant,heap_conservative_parallel_generational_whippet,\ + -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)) # $(1) is the benchmark, $(2) is the collector configuration # gc_stem for bdw: bdw From 8f93e23a5f2f402347b5a95db6f6cf80ecb4e5c5 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 9 Aug 2023 21:24:41 +0200 Subject: [PATCH 184/403] DRY for whippet variant cflags --- Makefile | 58 +++++++++++++++++++++++++------------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index 2e3cf35f4..fcd6c85d4 100644 --- a/Makefile +++ b/Makefile @@ -32,8 +32,7 @@ CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_ CPPFLAGS=-Iapi LDFLAGS=-lpthread -flto DEPFLAGS=-MMD -MP -MF $(@:%.o=.deps/%.d) -OUTPUT_OPTION=$(DEPFLAGS) -o $@ -COMPILE=$(CC) $(CFLAGS) $(CPPFLAGS) $(OUTPUT_OPTION) +COMPILE=$(CC) $(CFLAGS) $(CPPFLAGS) $(DEPFLAGS) -o $@ LINK=$(CC) $(LDFLAGS) -o $@ PLATFORM=gnu-linux @@ -71,42 +70,32 @@ GC_STEM_$(1)=whippet GC_CFLAGS_$(1)=$(2) endef -$(eval $(call whippet_variant,whippet,\ - -DGC_PRECISE_ROOTS=1)) -$(eval $(call whippet_variant,stack_conservative_whippet,\ - -DGC_CONSERVATIVE_ROOTS=1)) -$(eval $(call whippet_variant,heap_conservative_whippet,\ - -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)) +define generational_whippet_variants +$(call whippet_variant,$(1)whippet,$(2)) +$(call whippet_variant,$(1)generational_whippet,$(2) -DGC_GENERATIONAL=1) +endef -$(eval $(call whippet_variant,parallel_whippet,\ - -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1)) -$(eval $(call whippet_variant,stack_conservative_parallel_whippet,\ - -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1)) -$(eval $(call whippet_variant,heap_conservative_parallel_whippet,\ - -DGC_PARALLEL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)) +define parallel_whippet_variants +$(call generational_whippet_variants,$(1),$(2)) +$(call generational_whippet_variants,$(1)parallel_,$(2) -DGC_PARALLEL=1) +endef -$(eval $(call whippet_variant,generational_whippet,\ - -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1)) -$(eval $(call whippet_variant,stack_conservative_generational_whippet,\ - -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1)) -$(eval $(call whippet_variant,heap_conservative_generational_whippet,\ - -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)) +define trace_whippet_variants +$(call parallel_whippet_variants,,-DGC_PRECISE_ROOTS=1) +$(call parallel_whippet_variants,stack_conservative_,-DGC_CONSERVATIVE_ROOTS=1) +$(call 
parallel_whippet_variants,heap_conservative_,-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1) +endef -$(eval $(call whippet_variant,parallel_generational_whippet,\ - -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1)) -$(eval $(call whippet_variant,stack_conservative_parallel_generational_whippet,\ - -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1)) -$(eval $(call whippet_variant,heap_conservative_parallel_generational_whippet,\ - -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1)) +$(eval $(call trace_whippet_variants)) # $(1) is the benchmark, $(2) is the collector configuration # gc_stem for bdw: bdw -make_gc_var=$$($(1)$(subst -,_,$(2))) -gc_impl=$(call make_gc_var,GC_STEM_,$(1)).c -gc_attrs=$(call make_gc_var,GC_STEM_,$(1))-attrs.h -gc_cflags=$(call make_gc_var,GC_CFLAGS_,$(1)) -gc_impl_cflags=$(call make_gc_var,GC_IMPL_CFLAGS_,$(1)) -gc_libs=$(call make_gc_var,GC_LIBS_,$(1)) +make_gc_var = $$($(1)$(subst -,_,$(2))) +gc_impl = $(call make_gc_var,GC_STEM_,$(1)).c +gc_attrs = $(call make_gc_var,GC_STEM_,$(1))-attrs.h +gc_cflags = $(call make_gc_var,GC_CFLAGS_,$(1)) +gc_impl_cflags = $(call make_gc_var,GC_IMPL_CFLAGS_,$(1)) +gc_libs = $(call make_gc_var,GC_LIBS_,$(1)) define benchmark_template $(1).$(2).gc.o: src/$(call gc_impl,$(2)) $$(COMPILE) $(call gc_cflags,$(2)) $(call gc_impl_cflags,$(2)) -include benchmarks/$(1)-embedder.h -c $$< @@ -124,3 +113,8 @@ $(foreach BENCHMARK,$(TESTS),\ clean: rm -f $(ALL_TESTS) $(OBJS) $(DEPS) + +.SUFFIXES: +.SECONDARY: +%.c:; +Makefile:; From 19f66fab086874fd9aaede3e9eb759be55892e7e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 9 Aug 2023 21:42:57 +0200 Subject: [PATCH 185/403] Bin in bin, obj in obj; prettify makefile --- Makefile | 78 +++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/Makefile b/Makefile index fcd6c85d4..70d351d68 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ -TESTS=quads mt-gcbench ephemerons # MT_GCBench MT_GCBench2 -COLLECTORS= \ +TESTS = quads mt-gcbench ephemerons # MT_GCBench MT_GCBench2 +COLLECTORS = \ bdw \ semi \ \ @@ -19,55 +19,50 @@ COLLECTORS= \ stack-conservative-parallel-generational-whippet \ heap-conservative-parallel-generational-whippet -DEFAULT_BUILD:=opt +DEFAULT_BUILD := opt -BUILD_CFLAGS_opt=-O2 -g -DNDEBUG -BUILD_CFLAGS_optdebug=-Og -g -DGC_DEBUG=1 -BUILD_CFLAGS_debug=-O0 -g -DGC_DEBUG=1 +BUILD_CFLAGS_opt = -O2 -g -DNDEBUG +BUILD_CFLAGS_optdebug = -Og -g -DGC_DEBUG=1 +BUILD_CFLAGS_debug = -O0 -g -DGC_DEBUG=1 -BUILD_CFLAGS=$(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) +BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) -CC=gcc -CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) -CPPFLAGS=-Iapi -LDFLAGS=-lpthread -flto -DEPFLAGS=-MMD -MP -MF $(@:%.o=.deps/%.d) -COMPILE=$(CC) $(CFLAGS) $(CPPFLAGS) $(DEPFLAGS) -o $@ -LINK=$(CC) $(LDFLAGS) -o $@ -PLATFORM=gnu-linux +CC = gcc +CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) +CPPFLAGS = -Iapi +LDFLAGS = -lpthread -flto +DEPFLAGS = -MMD -MP -MF $(@:obj/%.o=.deps/%.d) +COMPILE = $(CC) $(CFLAGS) $(CPPFLAGS) $(DEPFLAGS) -o $@ +LINK = $(CC) $(LDFLAGS) -o $@ +PLATFORM = gnu-linux -ALL_TESTS=$(foreach COLLECTOR,$(COLLECTORS),$(addsuffix .$(COLLECTOR),$(TESTS))) +ALL_TESTS = $(foreach COLLECTOR,$(COLLECTORS),$(addsuffix .$(COLLECTOR),$(TESTS))) -all: $(ALL_TESTS) +all: $(ALL_TESTS:%=bin/%) +.deps obj bin: ; mkdir -p $@ -OBJS=gc-platform.o 
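# To see how the nested definitions above recover the variant table they
# replace, trace one leaf by hand: trace_whippet_variants calls
# parallel_whippet_variants with prefix `stack_conservative_' and
# -DGC_CONSERVATIVE_ROOTS=1; that calls generational_whippet_variants with
# prefix `stack_conservative_parallel_', adding -DGC_PARALLEL=1; which calls
# whippet_variant once more, adding -DGC_GENERATIONAL=1 -- yielding, flag
# order aside, exactly the pair of assignments being deleted:
GC_STEM_stack_conservative_parallel_generational_whippet=whippet
GC_CFLAGS_stack_conservative_parallel_generational_whippet=-DGC_CONSERVATIVE_ROOTS=1 -DGC_PARALLEL=1 -DGC_GENERATIONAL=1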
gc-stack.o gc-options.o -OBJS+=$(foreach TEST,$(TESTS),$(TEST).gc-ephemeron.o) -OBJS+=$(foreach TEST,$(ALL_TESTS),$(TEST).gc.o $(TEST).o) -DEPS=$(OBJS:%.o=.deps/%.d) -$(OBJS): | .deps -.deps: ; mkdir -p .deps -include $(wildcard $(DEPS)) +include $(wildcard .deps/*) -gc-platform.o: src/gc-platform-$(PLATFORM).c +obj/gc-platform.o: src/gc-platform-$(PLATFORM).c | .deps obj $(COMPILE) -c $< -gc-stack.o: src/gc-stack.c +obj/gc-stack.o: src/gc-stack.c | .deps obj $(COMPILE) -c $< -gc-options.o: src/gc-options.c +obj/gc-options.o: src/gc-options.c | .deps obj $(COMPILE) -c $< -%.gc-ephemeron.o: src/gc-ephemeron.c +obj/%.gc-ephemeron.o: src/gc-ephemeron.c | .deps obj $(COMPILE) -include benchmarks/$*-embedder.h -c $< -GC_STEM_bdw=bdw -GC_CFLAGS_bdw=-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -GC_IMPL_CFLAGS_bdw=`pkg-config --cflags bdw-gc` -GC_LIBS_bdw=`pkg-config --libs bdw-gc` +GC_STEM_bdw = bdw +GC_CFLAGS_bdw = -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 +GC_IMPL_CFLAGS_bdw = `pkg-config --cflags bdw-gc` +GC_LIBS_bdw = `pkg-config --libs bdw-gc` -GC_STEM_semi=semi -GC_CFLAGS_semi=-DGC_PRECISE_ROOTS=1 +GC_STEM_semi = semi +GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 define whippet_variant -GC_STEM_$(1)=whippet -GC_CFLAGS_$(1)=$(2) +GC_STEM_$(1) = whippet +GC_CFLAGS_$(1) = $(2) endef define generational_whippet_variants @@ -89,7 +84,6 @@ endef $(eval $(call trace_whippet_variants)) # $(1) is the benchmark, $(2) is the collector configuration -# gc_stem for bdw: bdw make_gc_var = $$($(1)$(subst -,_,$(2))) gc_impl = $(call make_gc_var,GC_STEM_,$(1)).c gc_attrs = $(call make_gc_var,GC_STEM_,$(1))-attrs.h @@ -97,11 +91,11 @@ gc_cflags = $(call make_gc_var,GC_CFLAGS_,$(1)) gc_impl_cflags = $(call make_gc_var,GC_IMPL_CFLAGS_,$(1)) gc_libs = $(call make_gc_var,GC_LIBS_,$(1)) define benchmark_template -$(1).$(2).gc.o: src/$(call gc_impl,$(2)) +obj/$(1).$(2).gc.o: src/$(call gc_impl,$(2)) | .deps obj $$(COMPILE) $(call gc_cflags,$(2)) $(call gc_impl_cflags,$(2)) -include benchmarks/$(1)-embedder.h -c $$< -$(1).$(2).o: benchmarks/$(1).c +obj/$(1).$(2).o: benchmarks/$(1).c | .deps obj $$(COMPILE) $(call gc_cflags,$(2)) -include api/$(call gc_attrs,$(2)) -c $$< -$(1).$(2): $(1).$(2).gc.o $(1).$(2).o gc-stack.o gc-options.o gc-platform.o $(1).gc-ephemeron.o +bin/$(1).$(2): obj/$(1).$(2).gc.o obj/$(1).$(2).o obj/gc-stack.o obj/gc-options.o obj/gc-platform.o obj/$(1).gc-ephemeron.o | bin $$(LINK) $(call gc_libs,$(2)) $$^ endef @@ -112,8 +106,10 @@ $(foreach BENCHMARK,$(TESTS),\ .PRECIOUS: $(ALL_TESTS) $(OBJS) clean: - rm -f $(ALL_TESTS) $(OBJS) $(DEPS) + rm -f $(ALL_TESTS) + rm -rf .deps obj bin +# Clear some of the default rules. 
.SUFFIXES: .SECONDARY: %.c:; From e4191f44c6722fa45dc4cd04e40c9291351dfdc5 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 9 Aug 2023 22:16:03 +0200 Subject: [PATCH 186/403] First stab at an embedder's makefile snippet --- embed.mk | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 embed.mk diff --git a/embed.mk b/embed.mk new file mode 100644 index 000000000..8ce8055ce --- /dev/null +++ b/embed.mk @@ -0,0 +1,78 @@ +GC_COLLECTOR ?= semi + +DEFAULT_BUILD := opt + +BUILD_CFLAGS_opt = -O2 -g -DNDEBUG +BUILD_CFLAGS_optdebug = -Og -g -DGC_DEBUG=1 +BUILD_CFLAGS_debug = -O0 -g -DGC_DEBUG=1 + +GC_BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(GC_BUILD),$(DEFAULT_BUILD))) + +GC_CC = gcc +GC_CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(GC_BUILD_CFLAGS) +GC_CPPFLAGS = -I$(here)/api +GC_LDFLAGS = -lpthread -flto +GC_DEPFLAGS = -MMD -MP -MF $(@:obj/%.o=.deps/%.d) +GC_COMPILE = $(GC_CC) $(GC_CFLAGS) $(GC_CPPFLAGS) $(GC_DEPFLAGS) -o $@ +GC_LINK = $(CC) $(LDFLAGS) -o $@ +GC_PLATFORM = gnu-linux +GC_OBJDIR = + +$(GC_OBJDIR)gc-platform.o: src/gc-platform-$(PLATFORM).c + $(GC_COMPILE) -c $< +$(GC_OBJDIR)gc-stack.o: src/gc-stack.c + $(GC_COMPILE) -c $< +$(GC_OBJDIR)gc-options.o: src/gc-options.c + $(GC_COMPILE) -c $< +$(GC_OBJDIR)gc-ephemeron.o: src/gc-ephemeron.c + $(GC_COMPILE) -include $(GC_EMBEDDER_H) -c $< + +GC_STEM_bdw = bdw +GC_CFLAGS_bdw = -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 +GC_IMPL_CFLAGS_bdw = `pkg-config --cflags bdw-gc` +GC_LIBS_bdw = `pkg-config --libs bdw-gc` + +GC_STEM_semi = semi +GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 + +define whippet_variant +GC_STEM_$(1) = whippet +GC_CFLAGS_$(1) = $(2) +endef + +define generational_whippet_variants +$(call whippet_variant,$(1)whippet,$(2)) +$(call whippet_variant,$(1)generational_whippet,$(2) -DGC_GENERATIONAL=1) +endef + +define parallel_whippet_variants +$(call generational_whippet_variants,$(1),$(2)) +$(call generational_whippet_variants,$(1)parallel_,$(2) -DGC_PARALLEL=1) +endef + +define trace_whippet_variants +$(call parallel_whippet_variants,,-DGC_PRECISE_ROOTS=1) +$(call parallel_whippet_variants,stack_conservative_,-DGC_CONSERVATIVE_ROOTS=1) +$(call parallel_whippet_variants,heap_conservative_,-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1) +endef + +$(eval $(call trace_whippet_variants)) + +gc_var = $($(1)$(subst -,_,$(2))) +gc_impl = $(call gc_var,GC_STEM_,$(1)).c +gc_attrs = $(call gc_var,GC_STEM_,$(1))-attrs.h +gc_cflags = $(call gc_var,GC_CFLAGS_,$(1)) +gc_impl_cflags = $(call gc_var,GC_IMPL_CFLAGS_,$(1)) +gc_libs = $(call gc_var,GC_LIBS_,$(1)) + +GC_IMPL = $(call gc_impl,$(GC_COLLECTOR)) +GC_CFLAGS += $(call gc_cflags,$(GC_COLLECTOR)) +GC_IMPL_CFLAGS = $(call gc_impl_cflags,$(GC_COLLECTOR)) +GC_EMBEDDER_CFLAGS = -include $(here)api/$(GC_IMPL)-attrs.h +GC_ATTRS = $(call gc_attrs,$(GC_COLLECTOR)) +GC_LIBS = $(call gc_libs,$(GC_COLLECTOR)) + +$(GC_OBJDIR)gc-impl.o: src/$(call gc_impl,$(GC_COLLECTOR)) + $(GC_COMPILE) $(GC_IMPL_CFLAGS) -include $(GC_EMBEDDER_H) -c $< + +GC_OBJS=$(foreach O,gc-platform.o gc-stack.o gc-options.o gc-ephemeron.o gc-impl.o,$(GC_OBJDIR)/$(O)) From 9c1a7649b4378480e3a762c501673e18857a8112 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Aug 2023 21:49:55 +0200 Subject: [PATCH 187/403] Update embed.mk --- embed.mk | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/embed.mk b/embed.mk index 8ce8055ce..ce6410651 100644 --- a/embed.mk +++ b/embed.mk @@ -10,22 
+10,22 @@ GC_BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(GC_BUILD),$(DEFAULT_BUILD))) GC_CC = gcc GC_CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(GC_BUILD_CFLAGS) -GC_CPPFLAGS = -I$(here)/api +GC_CPPFLAGS = -I$(WHIPPET)api GC_LDFLAGS = -lpthread -flto -GC_DEPFLAGS = -MMD -MP -MF $(@:obj/%.o=.deps/%.d) +GC_DEPFLAGS = GC_COMPILE = $(GC_CC) $(GC_CFLAGS) $(GC_CPPFLAGS) $(GC_DEPFLAGS) -o $@ -GC_LINK = $(CC) $(LDFLAGS) -o $@ +GC_LINK = $(GC_CC) $(GC_LDFLAGS) -o $@ GC_PLATFORM = gnu-linux GC_OBJDIR = -$(GC_OBJDIR)gc-platform.o: src/gc-platform-$(PLATFORM).c +$(GC_OBJDIR)gc-platform.o: $(WHIPPET)src/gc-platform-$(GC_PLATFORM).c $(GC_COMPILE) -c $< -$(GC_OBJDIR)gc-stack.o: src/gc-stack.c +$(GC_OBJDIR)gc-stack.o: $(WHIPPET)src/gc-stack.c $(GC_COMPILE) -c $< -$(GC_OBJDIR)gc-options.o: src/gc-options.c - $(GC_COMPILE) -c $< -$(GC_OBJDIR)gc-ephemeron.o: src/gc-ephemeron.c - $(GC_COMPILE) -include $(GC_EMBEDDER_H) -c $< +$(GC_OBJDIR)gc-options.o: $(WHIPPET)src/gc-options.c + $(GC_COMPILE) -c $(WHIPPET)$< +$(GC_OBJDIR)gc-ephemeron.o: $(WHIPPET)src/gc-ephemeron.c + $(GC_COMPILE) $(EMBEDDER_TO_GC_CFLAGS) -c $< GC_STEM_bdw = bdw GC_CFLAGS_bdw = -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 @@ -68,11 +68,11 @@ gc_libs = $(call gc_var,GC_LIBS_,$(1)) GC_IMPL = $(call gc_impl,$(GC_COLLECTOR)) GC_CFLAGS += $(call gc_cflags,$(GC_COLLECTOR)) GC_IMPL_CFLAGS = $(call gc_impl_cflags,$(GC_COLLECTOR)) -GC_EMBEDDER_CFLAGS = -include $(here)api/$(GC_IMPL)-attrs.h -GC_ATTRS = $(call gc_attrs,$(GC_COLLECTOR)) +GC_ATTRS = $(WHIPPET)api/$(call gc_attrs,$(GC_COLLECTOR)) +GC_TO_EMBEDDER_CFLAGS = -include $(GC_ATTRS) GC_LIBS = $(call gc_libs,$(GC_COLLECTOR)) -$(GC_OBJDIR)gc-impl.o: src/$(call gc_impl,$(GC_COLLECTOR)) - $(GC_COMPILE) $(GC_IMPL_CFLAGS) -include $(GC_EMBEDDER_H) -c $< +$(GC_OBJDIR)gc-impl.o: $(WHIPPET)src/$(call gc_impl,$(GC_COLLECTOR)) + $(GC_COMPILE) $(GC_IMPL_CFLAGS) $(EMBEDDER_TO_GC_CFLAGS) -c $< -GC_OBJS=$(foreach O,gc-platform.o gc-stack.o gc-options.o gc-ephemeron.o gc-impl.o,$(GC_OBJDIR)/$(O)) +GC_OBJS=$(foreach O,gc-platform.o gc-stack.o gc-options.o gc-ephemeron.o gc-impl.o,$(GC_OBJDIR)$(O)) From a3019c961a06b06000a355c3694b2be5b9da4e00 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 11 Aug 2023 23:17:59 +0200 Subject: [PATCH 188/403] embed.mk fix --- embed.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/embed.mk b/embed.mk index ce6410651..c189151c1 100644 --- a/embed.mk +++ b/embed.mk @@ -23,7 +23,7 @@ $(GC_OBJDIR)gc-platform.o: $(WHIPPET)src/gc-platform-$(GC_PLATFORM).c $(GC_OBJDIR)gc-stack.o: $(WHIPPET)src/gc-stack.c $(GC_COMPILE) -c $< $(GC_OBJDIR)gc-options.o: $(WHIPPET)src/gc-options.c - $(GC_COMPILE) -c $(WHIPPET)$< + $(GC_COMPILE) -c $< $(GC_OBJDIR)gc-ephemeron.o: $(WHIPPET)src/gc-ephemeron.c $(GC_COMPILE) $(EMBEDDER_TO_GC_CFLAGS) -c $< From 3f92f36947393d0e6dc7f4545346f3083ad4d095 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 12 Aug 2023 10:23:29 +0200 Subject: [PATCH 189/403] embed.mk: silent by default --- embed.mk | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/embed.mk b/embed.mk index c189151c1..27a9be6e8 100644 --- a/embed.mk +++ b/embed.mk @@ -8,13 +8,17 @@ BUILD_CFLAGS_debug = -O0 -g -DGC_DEBUG=1 GC_BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(GC_BUILD),$(DEFAULT_BUILD))) +v_0 = @ +v_1 = + +GC_V = $(v_$(V)) GC_CC = gcc GC_CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(GC_BUILD_CFLAGS) GC_CPPFLAGS = -I$(WHIPPET)api GC_LDFLAGS = -lpthread -flto GC_DEPFLAGS = -GC_COMPILE = $(GC_CC) 
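# A sketch of how an embedding project might consume embed.mk as it stands at
# this point. The program (main) and the embedder's header (my-embedder.h)
# are hypothetical; the GC_* variables are the ones embed.mk defines or
# consumes in the patches above. Note that WHIPPET needs a trailing slash.
WHIPPET = whippet/
GC_COLLECTOR = parallel-whippet
GC_BUILD = opt
# The embedder's implementation of the gc-embedder-api.h hooks, seen by the
# collector when compiling gc-impl.o and gc-ephemeron.o:
EMBEDDER_TO_GC_CFLAGS = -include my-embedder.h

all: main
include $(WHIPPET)embed.mk

# Embedder objects get the collector's inline-allocation attributes via
# GC_TO_EMBEDDER_CFLAGS; the link adds the GC objects plus any
# collector-specific libraries (GC_LIBS, e.g. bdw-gc when GC_COLLECTOR=bdw).
main.o: main.c
	$(GC_COMPILE) $(GC_TO_EMBEDDER_CFLAGS) -c $<
main: main.o $(GC_OBJS)
	$(GC_LINK) $^ $(GC_LIBS)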
$(GC_CFLAGS) $(GC_CPPFLAGS) $(GC_DEPFLAGS) -o $@ -GC_LINK = $(GC_CC) $(GC_LDFLAGS) -o $@ +GC_COMPILE = $(GC_V)$(GC_CC) $(GC_CFLAGS) $(GC_CPPFLAGS) $(GC_DEPFLAGS) -o $@ +GC_LINK = $(GC_V)$(GC_CC) $(GC_LDFLAGS) -o $@ GC_PLATFORM = gnu-linux GC_OBJDIR = From da5a4633dfb865434e8c061663c0c816fd0e7db5 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 12 Aug 2023 10:26:12 +0200 Subject: [PATCH 190/403] embed.mk: silent by default, bis --- embed.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/embed.mk b/embed.mk index 27a9be6e8..e2765e1af 100644 --- a/embed.mk +++ b/embed.mk @@ -8,6 +8,7 @@ BUILD_CFLAGS_debug = -O0 -g -DGC_DEBUG=1 GC_BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(GC_BUILD),$(DEFAULT_BUILD))) +V ?= 1 v_0 = @ v_1 = From fbe49598f583ae4f861528e54c0efeb0368cbbfd Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 15 Aug 2023 11:34:29 +0200 Subject: [PATCH 191/403] Add "extern space" This is mostly for static data. --- api/gc-api.h | 4 ++++ api/gc-embedder-api.h | 8 ++++++++ benchmarks/simple-gc-embedder.h | 12 ++++++++++++ doc/manual.md | 13 +++++++++++++ src/bdw.c | 3 +++ src/semi.c | 27 +++++++++++++++++++++++++-- src/whippet.c | 13 ++++++++++++- 7 files changed, 77 insertions(+), 3 deletions(-) diff --git a/api/gc-api.h b/api/gc-api.h index 6cf783703..821891bca 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -34,6 +34,10 @@ struct gc_heap_roots; GC_API_ void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots); +struct gc_extern_space; +GC_API_ void gc_heap_set_extern_space(struct gc_heap *heap, + struct gc_extern_space *space); + GC_API_ struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *base, struct gc_heap *heap); GC_API_ void gc_finish_for_thread(struct gc_mutator *mut); diff --git a/api/gc-embedder-api.h b/api/gc-embedder-api.h index 6e39f05ea..31793316f 100644 --- a/api/gc-embedder-api.h +++ b/api/gc-embedder-api.h @@ -17,9 +17,17 @@ struct gc_heap_roots; struct gc_atomic_forward; struct gc_heap; struct gc_ephemeron; +struct gc_extern_space; GC_EMBEDDER_API inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement); +GC_EMBEDDER_API inline int gc_extern_space_mark(struct gc_extern_space *space, + struct gc_ref ref) GC_ALWAYS_INLINE; +GC_EMBEDDER_API inline void gc_extern_space_start_gc(struct gc_extern_space *space, + int is_minor_gc); +GC_EMBEDDER_API inline void gc_extern_space_finish_gc(struct gc_extern_space *space, + int is_minor_gc); + GC_EMBEDDER_API inline void gc_trace_object(struct gc_ref ref, void (*visit)(struct gc_edge edge, struct gc_heap *heap, diff --git a/benchmarks/simple-gc-embedder.h b/benchmarks/simple-gc-embedder.h index 14fb142e7..d4276192f 100644 --- a/benchmarks/simple-gc-embedder.h +++ b/benchmarks/simple-gc-embedder.h @@ -18,6 +18,18 @@ gc_is_valid_conservative_ref_displacement(uintptr_t displacement) { #endif } +// No external objects in simple benchmarks. 
+static inline int gc_extern_space_mark(struct gc_extern_space *space, + struct gc_ref ref) { + GC_CRASH(); +} +static inline void gc_extern_space_start_gc(struct gc_extern_space *space, + int is_minor_gc) { +} +static inline void gc_extern_space_finish_gc(struct gc_extern_space *space, + int is_minor_gc) { +} + static inline void gc_trace_object(struct gc_ref ref, void (*trace_edge)(struct gc_edge edge, struct gc_heap *heap, diff --git a/doc/manual.md b/doc/manual.md index 41ff83d91..e94095635 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -163,6 +163,19 @@ embedder should return 1 only if the displacement is 0, but if the program allows low-bit tagged pointers, then it should also return 1 for those pointer tags. +### External objects + +Sometimes a system will allocate objects outside the GC, for example on +the stack or in static data sections. To support this use case, Whippet +allows the embedder to provide a `struct gc_extern_space` +implementation. Whippet will call `gc_extern_space_start_gc` at the +start of each collection, and `gc_extern_space_finish_gc` at the end. +External objects will be visited by `gc_extern_space_mark`, which should +return nonzero if the object hasn't been seen before and needs to be +traced via `gc_trace_object` (coloring the object grey). Note, +`gc_extern_space_mark` may be called concurrently from many threads; be +prepared! + ## Configuration, compilation, and linking To the user, Whippet presents an abstract API that does not encode the diff --git a/src/bdw.c b/src/bdw.c index cf17f19e7..3b01e9dcb 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -325,6 +325,9 @@ void gc_mutator_set_roots(struct gc_mutator *mut, } void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { } +void gc_heap_set_extern_space(struct gc_heap *heap, + struct gc_extern_space *space) { +} void gc_print_stats(struct gc_heap *heap) { printf("Completed %ld collections\n", (long)GC_get_gc_no()); diff --git a/src/semi.c b/src/semi.c index 11b74ec5f..151bafb83 100644 --- a/src/semi.c +++ b/src/semi.c @@ -37,12 +37,14 @@ struct gc_heap { struct semi_space semi_space; struct large_object_space large_object_space; struct gc_pending_ephemerons *pending_ephemerons; + struct gc_extern_space *extern_space; double pending_ephemerons_size_factor; double pending_ephemerons_size_slop; size_t size; long count; int check_pending_ephemerons; const struct gc_options *options; + struct gc_heap_roots *roots; }; // One mutator per space, can just store the heap in the mutator. 
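// An illustrative sketch of the embedder side of the heap-roots hook that
// this patch wires into semi and bdw: the embedder picks the layout of
// struct gc_heap_roots and provides gc_trace_heap_roots (its signature is
// inferred here from how the collectors in this patch call it), and the
// collector then visits every root edge at each collection. Everything
// below is an assumption-laden example, not part of the patch.
struct gc_heap_roots {
  struct gc_ref *roots;   // embedder-managed array of global root slots
  size_t count;
};

static inline void
gc_trace_heap_roots(struct gc_heap_roots *roots,
                    void (*trace_edge)(struct gc_edge edge,
                                       struct gc_heap *heap,
                                       void *trace_data),
                    struct gc_heap *heap, void *trace_data) {
  for (size_t i = 0; i < roots->count; i++)
    trace_edge(gc_edge(&roots->roots[i]), heap, trace_data);
}

// At startup, after creating the heap (NROOTS is illustrative):
//   static struct gc_ref global_roots[NROOTS];
//   static struct gc_heap_roots heap_roots = { global_roots, NROOTS };
//   gc_heap_set_roots(heap, &heap_roots);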
struct gc_mutator { @@ -195,6 +197,17 @@ static int semi_space_contains(struct semi_space *space, struct gc_ref ref) { return region_contains(&space->from_space, addr); } +static void visit_external_object(struct gc_heap *heap, + struct gc_extern_space *space, + struct gc_ref ref) { + if (gc_extern_space_mark(space, ref)) { + if (GC_UNLIKELY(heap->check_pending_ephemerons)) + gc_resolve_pending_ephemerons(ref, heap); + + gc_trace_object(ref, trace, heap, NULL, NULL); + } +} + static void visit(struct gc_edge edge, struct gc_heap *heap) { struct gc_ref ref = gc_edge_ref(edge); if (!gc_ref_is_heap_object(ref)) @@ -204,7 +217,7 @@ static void visit(struct gc_edge edge, struct gc_heap *heap) { else if (large_object_space_contains(heap_large_object_space(heap), ref)) visit_large_object_space(heap, heap_large_object_space(heap), ref); else - GC_CRASH(); + visit_external_object(heap, heap->extern_space, ref); } struct gc_pending_ephemerons * @@ -329,10 +342,13 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { struct large_object_space *large = heap_large_object_space(heap); // fprintf(stderr, "start collect #%ld:\n", space->count); large_object_space_start_gc(large, 0); + gc_extern_space_start_gc(heap->extern_space, 0); flip(semi); heap->count++; heap->check_pending_ephemerons = 0; uintptr_t grey = semi->hp; + if (heap->roots) + gc_trace_heap_roots(heap->roots, trace, heap, NULL); if (mut->roots) gc_trace_mutator_roots(mut->roots, trace, heap, NULL); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); @@ -344,6 +360,7 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { while(grey < semi->hp) grey = scan(heap, gc_ref(grey)); large_object_space_finish_gc(large, 0); + gc_extern_space_finish_gc(heap->extern_space, 0); semi_space_finish_gc(semi, large->live_pages_at_last_collection); gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); adjust_heap_size_and_limits(heap, for_alloc); @@ -485,11 +502,13 @@ unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { } static int heap_init(struct gc_heap *heap, const struct gc_options *options) { + heap->extern_space = NULL; heap->pending_ephemerons_size_factor = 0.01; heap->pending_ephemerons_size_slop = 0.5; heap->count = 0; heap->options = options; heap->size = options->common.heap_size; + heap->roots = NULL; return heap_prepare_pending_ephemerons(heap); } @@ -559,7 +578,11 @@ void gc_mutator_set_roots(struct gc_mutator *mut, mut->roots = roots; } void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { - GC_CRASH(); + heap->roots = roots; +} +void gc_heap_set_extern_space(struct gc_heap *heap, + struct gc_extern_space *space) { + heap->extern_space = space; } struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *base, diff --git a/src/whippet.c b/src/whippet.c index 4771e37e9..ae247482b 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -300,6 +300,7 @@ enum gc_kind { struct gc_heap { struct mark_space mark_space; struct large_object_space large_object_space; + struct gc_extern_space *extern_space; size_t large_object_pages; pthread_mutex_t lock; pthread_cond_t collector_cond; @@ -360,6 +361,9 @@ static inline struct mark_space* heap_mark_space(struct gc_heap *heap) { static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { return &heap->large_object_space; } +static inline struct gc_extern_space* heap_extern_space(struct gc_heap *heap) { + return heap->extern_space; +} static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) 
{ return mutator->heap; } @@ -667,7 +671,7 @@ static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, return large_object_space_mark_object(heap_large_object_space(heap), ref); else - GC_CRASH(); + return gc_extern_space_mark(heap_extern_space(heap), ref); } static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { @@ -1078,6 +1082,10 @@ void gc_mutator_set_roots(struct gc_mutator *mut, void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { heap->roots = roots; } +void gc_heap_set_extern_space(struct gc_heap *heap, + struct gc_extern_space *space) { + heap->extern_space = space; +} static void trace_and_enqueue_locally(struct gc_edge edge, struct gc_heap *heap, @@ -1803,6 +1811,7 @@ static void collect(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); + struct gc_extern_space *exspace = heap_extern_space(heap); if (maybe_grow_heap(heap)) { DEBUG("grew heap instead of collecting #%ld:\n", heap->count); return; @@ -1811,6 +1820,7 @@ static void collect(struct gc_mutator *mut) { enum gc_kind gc_kind = determine_collection_kind(heap); update_mark_patterns(space, !(gc_kind & GC_KIND_FLAG_MINOR)); large_object_space_start_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); + gc_extern_space_start_gc(exspace, gc_kind & GC_KIND_FLAG_MINOR); resolve_ephemerons_lazily(heap); tracer_prepare(heap); request_mutators_to_stop(heap); @@ -1832,6 +1842,7 @@ static void collect(struct gc_mutator *mut) { tracer_release(heap); mark_space_finish_gc(space, gc_kind); large_object_space_finish_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); + gc_extern_space_finish_gc(exspace, gc_kind & GC_KIND_FLAG_MINOR); heap->count++; heap->last_collection_was_minor = gc_kind & GC_KIND_FLAG_MINOR; if (heap->last_collection_was_minor) From db36c48efd8af953745081c42cd8bfa2531d3ef0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 16 Aug 2023 11:08:12 +0200 Subject: [PATCH 192/403] Update extern space API to allow for evacuation --- api/gc-embedder-api.h | 5 +++-- benchmarks/simple-gc-embedder.h | 5 +++-- src/semi.c | 11 ++++++----- src/whippet.c | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/api/gc-embedder-api.h b/api/gc-embedder-api.h index 31793316f..ad33bc170 100644 --- a/api/gc-embedder-api.h +++ b/api/gc-embedder-api.h @@ -21,8 +21,9 @@ struct gc_extern_space; GC_EMBEDDER_API inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement); -GC_EMBEDDER_API inline int gc_extern_space_mark(struct gc_extern_space *space, - struct gc_ref ref) GC_ALWAYS_INLINE; +GC_EMBEDDER_API inline int gc_extern_space_visit(struct gc_extern_space *space, + struct gc_edge edge, + struct gc_ref ref) GC_ALWAYS_INLINE; GC_EMBEDDER_API inline void gc_extern_space_start_gc(struct gc_extern_space *space, int is_minor_gc); GC_EMBEDDER_API inline void gc_extern_space_finish_gc(struct gc_extern_space *space, diff --git a/benchmarks/simple-gc-embedder.h b/benchmarks/simple-gc-embedder.h index d4276192f..2c599655b 100644 --- a/benchmarks/simple-gc-embedder.h +++ b/benchmarks/simple-gc-embedder.h @@ -19,8 +19,9 @@ gc_is_valid_conservative_ref_displacement(uintptr_t displacement) { } // No external objects in simple benchmarks. 
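// A sketch of a minimal, non-moving extern space for statically allocated
// objects, written against the gc_extern_space_visit signature above. The
// layout of struct gc_extern_space is the embedder's choice; here it is
// assumed to be a flat table of object addresses with one mark byte each.
// Returning nonzero the first time an object is seen in a cycle tells the
// collector to trace it with gc_trace_object; because nothing moves here,
// the edge is left as-is (an evacuating extern space would update it).
// Illustrative only, with names invented for the example.
#include <stdint.h>
#include <string.h>

struct gc_extern_space {
  uintptr_t *objects;   // addresses of the static objects
  uint8_t *marks;       // one mark byte per object, reset each major GC
  size_t count;
};

static inline int gc_extern_space_visit(struct gc_extern_space *space,
                                        struct gc_edge edge,
                                        struct gc_ref ref) {
  for (size_t i = 0; i < space->count; i++) {
    if (space->objects[i] == gc_ref_value(ref)) {
      if (space->marks[i])
        return 0;               // already visited this cycle
      space->marks[i] = 1;      // racy: a real version would use an atomic CAS
      return 1;                 // newly marked; the collector will trace it
    }
  }
  GC_CRASH();                   // not actually an extern object
}

static inline void gc_extern_space_start_gc(struct gc_extern_space *space,
                                            int is_minor_gc) {
  if (!is_minor_gc)
    memset(space->marks, 0, space->count);  // forget last cycle's marks
}

static inline void gc_extern_space_finish_gc(struct gc_extern_space *space,
                                             int is_minor_gc) {
  // Static objects are never reclaimed in this sketch; nothing to sweep.
}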
-static inline int gc_extern_space_mark(struct gc_extern_space *space, - struct gc_ref ref) { +static inline int gc_extern_space_visit(struct gc_extern_space *space, + struct gc_edge edge, + struct gc_ref ref) { GC_CRASH(); } static inline void gc_extern_space_start_gc(struct gc_extern_space *space, diff --git a/src/semi.c b/src/semi.c index 151bafb83..182a68a3d 100644 --- a/src/semi.c +++ b/src/semi.c @@ -199,12 +199,13 @@ static int semi_space_contains(struct semi_space *space, struct gc_ref ref) { static void visit_external_object(struct gc_heap *heap, struct gc_extern_space *space, - struct gc_ref ref) { - if (gc_extern_space_mark(space, ref)) { + struct gc_edge edge, + struct gc_ref old_ref) { + if (gc_extern_space_visit(space, edge, old_ref)) { if (GC_UNLIKELY(heap->check_pending_ephemerons)) - gc_resolve_pending_ephemerons(ref, heap); + gc_resolve_pending_ephemerons(old_ref, heap); - gc_trace_object(ref, trace, heap, NULL, NULL); + gc_trace_object(gc_edge_ref(edge), trace, heap, NULL, NULL); } } @@ -217,7 +218,7 @@ static void visit(struct gc_edge edge, struct gc_heap *heap) { else if (large_object_space_contains(heap_large_object_space(heap), ref)) visit_large_object_space(heap, heap_large_object_space(heap), ref); else - visit_external_object(heap, heap->extern_space, ref); + visit_external_object(heap, heap->extern_space, edge, ref); } struct gc_pending_ephemerons * diff --git a/src/whippet.c b/src/whippet.c index ae247482b..d32868aa7 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -671,7 +671,7 @@ static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, return large_object_space_mark_object(heap_large_object_space(heap), ref); else - return gc_extern_space_mark(heap_extern_space(heap), ref); + return gc_extern_space_visit(heap_extern_space(heap), edge, ref); } static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { From dc013cfb585f258ea03d7cca388316a410d5938f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 11 Sep 2023 11:48:32 +0200 Subject: [PATCH 193/403] Change gc_allocate_ephemeron to return struct gc_ephemeron --- api/gc-ephemeron.h | 2 +- benchmarks/ephemerons.c | 6 +++--- src/bdw.c | 5 ++--- src/semi.c | 4 ++-- src/whippet.c | 4 ++-- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/api/gc-ephemeron.h b/api/gc-ephemeron.h index d5159dff3..1d9e59b55 100644 --- a/api/gc-ephemeron.h +++ b/api/gc-ephemeron.h @@ -18,7 +18,7 @@ struct gc_mutator; struct gc_ephemeron; GC_API_ size_t gc_ephemeron_size(void); -GC_API_ struct gc_ref gc_allocate_ephemeron(struct gc_mutator *mut); +GC_API_ struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut); GC_API_ void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, struct gc_ref key, struct gc_ref value); diff --git a/benchmarks/ephemerons.c b/benchmarks/ephemerons.c index c11fa755c..698aa1708 100644 --- a/benchmarks/ephemerons.c +++ b/benchmarks/ephemerons.c @@ -25,9 +25,9 @@ static Box* allocate_box(struct gc_mutator *mut) { } static struct gc_ephemeron* allocate_ephemeron(struct gc_mutator *mut) { - struct gc_ref ret = gc_allocate_ephemeron(mut); - *tag_word(ret) = tag_live(ALLOC_KIND_EPHEMERON); - return gc_ref_heap_object(ret); + struct gc_ephemeron *ret = gc_allocate_ephemeron(mut); + *tag_word(gc_ref_from_heap_object(ret)) = tag_live(ALLOC_KIND_EPHEMERON); + return ret; } /* Get the current time in microseconds */ diff --git a/src/bdw.c b/src/bdw.c index 3b01e9dcb..aaa89e684 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -131,9 +131,8 @@ void 
gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, static int ephemeron_gc_kind; -struct gc_ref gc_allocate_ephemeron(struct gc_mutator *mut) { - void *ret = GC_generic_malloc(gc_ephemeron_size(), ephemeron_gc_kind); - return gc_ref_from_heap_object(ret); +struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { + return GC_generic_malloc(gc_ephemeron_size(), ephemeron_gc_kind); } unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { diff --git a/src/semi.c b/src/semi.c index 182a68a3d..fe181eb94 100644 --- a/src/semi.c +++ b/src/semi.c @@ -445,8 +445,8 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } -struct gc_ref gc_allocate_ephemeron(struct gc_mutator *mut) { - return gc_ref_from_heap_object(gc_allocate(mut, gc_ephemeron_size())); +struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { + return gc_allocate(mut, gc_ephemeron_size()); } void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, diff --git a/src/whippet.c b/src/whippet.c index d32868aa7..6f4596678 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -2200,14 +2200,14 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } -struct gc_ref gc_allocate_ephemeron(struct gc_mutator *mut) { +struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { struct gc_ref ret = gc_ref_from_heap_object(gc_allocate(mut, gc_ephemeron_size())); if (gc_has_conservative_intraheap_edges()) { uint8_t *metadata = metadata_byte_for_addr(gc_ref_value(ret)); *metadata |= METADATA_BYTE_EPHEMERON; } - return ret; + return gc_ref_heap_object(ret); } void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, From 296e5e845810fdac137c6b188aa8cdfb66c3782b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 21 Sep 2023 10:08:23 +0200 Subject: [PATCH 194/403] BDW collector marks mutator/heap roots Needed if a mutator has off-heap (mmap) storage. --- src/bdw.c | 129 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 100 insertions(+), 29 deletions(-) diff --git a/src/bdw.c b/src/bdw.c index aaa89e684..016ff5501 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -50,13 +50,16 @@ #define GC_INLINE_FREELIST_COUNT (256U / GC_INLINE_GRANULE_BYTES) struct gc_heap { + struct gc_heap *freelist; // see mark_heap pthread_mutex_t lock; int multithreaded; + struct gc_heap_roots *roots; }; struct gc_mutator { void *freelists[GC_INLINE_FREELIST_COUNT]; struct gc_heap *heap; + struct gc_mutator_roots *roots; }; static inline size_t gc_inline_bytes_to_freelist_index(size_t bytes) { @@ -119,6 +122,25 @@ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { } +struct bdw_mark_state { + struct GC_ms_entry *mark_stack_ptr; + struct GC_ms_entry *mark_stack_limit; +}; + +static void bdw_mark_edge(struct gc_edge edge, struct gc_heap *heap, + void *visit_data) { + struct bdw_mark_state *state = visit_data; + uintptr_t addr = gc_ref_value(gc_edge_ref(edge)); + state->mark_stack_ptr = GC_MARK_AND_PUSH ((void *) addr, + state->mark_stack_ptr, + state->mark_stack_limit, + NULL); +} + +static int heap_gc_kind; +static int mutator_gc_kind; +static int ephemeron_gc_kind; + // In BDW-GC, we can't hook into the mark phase to call // gc_trace_ephemerons_for_object, so the advertised ephemeron strategy // doesn't really work. 
The primitives that we have are mark functions, @@ -129,8 +151,6 @@ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, // ephemerons using these primitives. Instead fall back to weak-key // tables. -static int ephemeron_gc_kind; - struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { return GC_generic_malloc(gc_ephemeron_size(), ephemeron_gc_kind); } @@ -148,31 +168,17 @@ void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, } } -struct ephemeron_mark_state { - struct GC_ms_entry *mark_stack_ptr; - struct GC_ms_entry *mark_stack_limit; -}; - int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { // Pretend the key is traced, to avoid adding this ephemeron to the // global table. return 1; } -static void trace_ephemeron_edge(struct gc_edge edge, struct gc_heap *heap, - void *visit_data) { - struct ephemeron_mark_state *state = visit_data; - uintptr_t addr = gc_ref_value(gc_edge_ref(edge)); - state->mark_stack_ptr = GC_MARK_AND_PUSH ((void *) addr, - state->mark_stack_ptr, - state->mark_stack_limit, - NULL); -} static struct GC_ms_entry * mark_ephemeron(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, struct GC_ms_entry *mark_stack_limit, GC_word env) { - struct ephemeron_mark_state state = { + struct bdw_mark_state state = { mark_stack_ptr, mark_stack_limit, }; @@ -182,7 +188,7 @@ mark_ephemeron(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, // If this ephemeron is on a freelist, its first word will be a // freelist link and everything else will be NULL. if (!gc_ref_value(gc_edge_ref(gc_ephemeron_value_edge(ephemeron)))) { - trace_ephemeron_edge(gc_edge(addr), NULL, &state); + bdw_mark_edge(gc_edge(addr), NULL, &state); return state.mark_stack_ptr; } @@ -192,21 +198,76 @@ mark_ephemeron(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, gc_ephemeron_mark_dead(ephemeron); } - gc_trace_ephemeron(ephemeron, trace_ephemeron_edge, NULL, &state); + gc_trace_ephemeron(ephemeron, bdw_mark_edge, NULL, &state); + + return state.mark_stack_ptr; +} + +static struct GC_ms_entry * +mark_heap(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, + struct GC_ms_entry *mark_stack_limit, GC_word env) { + struct bdw_mark_state state = { + mark_stack_ptr, + mark_stack_limit, + }; + + struct gc_heap *heap = (struct gc_heap*) addr; + + // If this heap is on a freelist... well probably we are screwed, BDW + // isn't really made to do multiple heaps in a process. But still, in + // this case, the first word is the freelist and the rest are null. + if (heap->freelist) { + bdw_mark_edge(gc_edge(addr), NULL, &state); + return state.mark_stack_ptr; + } + + if (heap->roots) + gc_trace_heap_roots(heap->roots, bdw_mark_edge, heap, &state); + + return state.mark_stack_ptr; +} + +static struct GC_ms_entry * +mark_mutator(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, + struct GC_ms_entry *mark_stack_limit, GC_word env) { + struct bdw_mark_state state = { + mark_stack_ptr, + mark_stack_limit, + }; + + struct gc_mutator *mut = (struct gc_mutator*) addr; + + // If this mutator is on a freelist, its first word will be a + // freelist link and everything else will be NULL. 
+ if (!mut->heap) { + bdw_mark_edge(gc_edge(addr), NULL, &state); + return state.mark_stack_ptr; + } + + for (int i; i < GC_INLINE_FREELIST_COUNT; i++) + state.mark_stack_ptr = GC_MARK_AND_PUSH (mut->freelists[i], + state.mark_stack_ptr, + state.mark_stack_limit, + NULL); + + state.mark_stack_ptr = GC_MARK_AND_PUSH (mut->heap, + state.mark_stack_ptr, + state.mark_stack_limit, + NULL); + + if (mut->roots) + gc_trace_mutator_roots(mut->roots, bdw_mark_edge, mut->heap, &state); return state.mark_stack_ptr; } static inline struct gc_mutator *add_mutator(struct gc_heap *heap) { - struct gc_mutator *ret = GC_malloc(sizeof(struct gc_mutator)); + struct gc_mutator *ret = + GC_generic_malloc(sizeof(struct gc_mutator), mutator_gc_kind); ret->heap = heap; return ret; } -static inline struct gc_heap *mutator_heap(struct gc_mutator *mutator) { - return mutator->heap; -} - struct gc_options { struct gc_common_options common; }; @@ -281,18 +342,26 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (options->common.heap_size > current_heap_size) GC_expand_hp(options->common.heap_size - current_heap_size); GC_allow_register_threads(); - *heap = GC_malloc(sizeof(struct gc_heap)); - pthread_mutex_init(&(*heap)->lock, NULL); - *mutator = add_mutator(*heap); { - GC_word descriptor = GC_MAKE_PROC(GC_new_proc(mark_ephemeron), 0); int add_size_to_descriptor = 0; int clear_memory = 1; - ephemeron_gc_kind = GC_new_kind(GC_new_free_list(), descriptor, + + heap_gc_kind = GC_new_kind(GC_new_free_list(), + GC_MAKE_PROC(GC_new_proc(mark_heap), 0), + add_size_to_descriptor, clear_memory); + mutator_gc_kind = GC_new_kind(GC_new_free_list(), + GC_MAKE_PROC(GC_new_proc(mark_mutator), 0), + add_size_to_descriptor, clear_memory); + ephemeron_gc_kind = GC_new_kind(GC_new_free_list(), + GC_MAKE_PROC(GC_new_proc(mark_ephemeron), 0), add_size_to_descriptor, clear_memory); } + *heap = GC_generic_malloc(sizeof(struct gc_heap), heap_gc_kind); + pthread_mutex_init(&(*heap)->lock, NULL); + *mutator = add_mutator(*heap); + return 1; } @@ -321,8 +390,10 @@ void* gc_call_without_gc(struct gc_mutator *mut, void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots) { + mut->roots = roots; } void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { + heap->roots = roots; } void gc_heap_set_extern_space(struct gc_heap *heap, struct gc_extern_space *space) { From 120cd91b0236c7b5209df183e871372cc1f0c8d0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 22 Sep 2023 14:44:21 +0200 Subject: [PATCH 195/403] Use -flto=auto to do LTO in parallel --- Makefile | 2 +- embed.mk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 70d351d68..58091d55e 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,7 @@ BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) CC = gcc CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) CPPFLAGS = -Iapi -LDFLAGS = -lpthread -flto +LDFLAGS = -lpthread -flto=auto DEPFLAGS = -MMD -MP -MF $(@:obj/%.o=.deps/%.d) COMPILE = $(CC) $(CFLAGS) $(CPPFLAGS) $(DEPFLAGS) -o $@ LINK = $(CC) $(LDFLAGS) -o $@ diff --git a/embed.mk b/embed.mk index e2765e1af..1c7822806 100644 --- a/embed.mk +++ b/embed.mk @@ -16,7 +16,7 @@ GC_V = $(v_$(V)) GC_CC = gcc GC_CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(GC_BUILD_CFLAGS) GC_CPPFLAGS = -I$(WHIPPET)api -GC_LDFLAGS = -lpthread -flto +GC_LDFLAGS = -lpthread -flto=auto GC_DEPFLAGS = GC_COMPILE = 
$(GC_V)$(GC_CC) $(GC_CFLAGS) $(GC_CPPFLAGS) $(GC_DEPFLAGS) -o $@ GC_LINK = $(GC_V)$(GC_CC) $(GC_LDFLAGS) -o $@ From 4d1358219bd73f99208cf0b23aa2aa6a7c5548fc Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 22 Sep 2023 15:19:55 +0200 Subject: [PATCH 196/403] Fix amazing error in bdw.c --- src/bdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bdw.c b/src/bdw.c index 016ff5501..cc47de9a1 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -244,7 +244,7 @@ mark_mutator(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, return state.mark_stack_ptr; } - for (int i; i < GC_INLINE_FREELIST_COUNT; i++) + for (int i = 0; i < GC_INLINE_FREELIST_COUNT; i++) state.mark_stack_ptr = GC_MARK_AND_PUSH (mut->freelists[i], state.mark_stack_ptr, state.mark_stack_limit, From 41591d8722c21a3218ee0f0c90c7ef3e322249e4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 30 Sep 2023 21:58:46 +0200 Subject: [PATCH 197/403] bdw: Ensure heap and mutators are live Before, we were relying on the heap and mutators being reachable from roots. This is no longer the case. --- src/bdw.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/src/bdw.c b/src/bdw.c index cc47de9a1..90688e84a 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -52,14 +52,16 @@ struct gc_heap { struct gc_heap *freelist; // see mark_heap pthread_mutex_t lock; - int multithreaded; struct gc_heap_roots *roots; + struct gc_mutator *mutators; }; struct gc_mutator { void *freelists[GC_INLINE_FREELIST_COUNT]; struct gc_heap *heap; struct gc_mutator_roots *roots; + struct gc_mutator *next; // with heap lock + struct gc_mutator **prev; // with heap lock }; static inline size_t gc_inline_bytes_to_freelist_index(size_t bytes) { @@ -224,6 +226,11 @@ mark_heap(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, if (heap->roots) gc_trace_heap_roots(heap->roots, bdw_mark_edge, heap, &state); + state.mark_stack_ptr = GC_MARK_AND_PUSH (heap->mutators, + state.mark_stack_ptr, + state.mark_stack_limit, + NULL); + return state.mark_stack_ptr; } @@ -258,6 +265,11 @@ mark_mutator(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, if (mut->roots) gc_trace_mutator_roots(mut->roots, bdw_mark_edge, mut->heap, &state); + state.mark_stack_ptr = GC_MARK_AND_PUSH (mut->next, + state.mark_stack_ptr, + state.mark_stack_limit, + NULL); + return state.mark_stack_ptr; } @@ -265,6 +277,15 @@ static inline struct gc_mutator *add_mutator(struct gc_heap *heap) { struct gc_mutator *ret = GC_generic_malloc(sizeof(struct gc_mutator), mutator_gc_kind); ret->heap = heap; + + pthread_mutex_lock(&heap->lock); + ret->next = heap->mutators; + ret->prev = &heap->mutators; + if (ret->next) + ret->next->prev = &ret->next; + heap->mutators = ret; + pthread_mutex_unlock(&heap->lock); + return ret; } @@ -295,12 +316,23 @@ int gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } +struct gc_pending_ephemerons * +gc_heap_pending_ephemerons(struct gc_heap *heap) { + GC_CRASH(); + return NULL; +} + int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mutator) { + // Root the heap, which will also cause all mutators to be marked. 
+ static struct gc_heap *the_heap; + GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES); GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_INLINE_FREELIST_COUNT * GC_INLINE_GRANULE_BYTES); + GC_ASSERT_EQ(the_heap, NULL); + if (!options) options = gc_allocate_options(); // Ignore stack base for main thread. @@ -362,23 +394,24 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, pthread_mutex_init(&(*heap)->lock, NULL); *mutator = add_mutator(*heap); + the_heap = *heap; + return 1; } struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, struct gc_heap *heap) { - pthread_mutex_lock(&heap->lock); - if (!heap->multithreaded) { - GC_allow_register_threads(); - heap->multithreaded = 1; - } - pthread_mutex_unlock(&heap->lock); - struct GC_stack_base base = { stack_base }; GC_register_my_thread(&base); return add_mutator(heap); } void gc_finish_for_thread(struct gc_mutator *mut) { + pthread_mutex_lock(&mut->heap->lock); + *mut->prev = mut->next; + if (mut->next) + mut->next->prev = mut->prev; + pthread_mutex_unlock(&mut->heap->lock); + GC_unregister_my_thread(); } From d56356fec7c7a0f3e38dbb7b52b5dceb4d45583d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 2 Oct 2023 22:47:35 +0200 Subject: [PATCH 198/403] Add gc_safepoint_mechanism gc attr --- api/bdw-attrs.h | 4 ++++ api/gc-attrs.h | 6 ++++++ api/semi-attrs.h | 4 ++++ api/whippet-attrs.h | 4 ++++ 4 files changed, 18 insertions(+) diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index e7a08100d..e190c2cee 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -50,4 +50,8 @@ static inline size_t gc_write_barrier_card_size(void) { GC_CRASH(); } +static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { + return GC_SAFEPOINT_MECHANISM_SIGNAL; +} + #endif // BDW_ATTRS_H diff --git a/api/gc-attrs.h b/api/gc-attrs.h index 60d8e3351..c08330eaa 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -37,4 +37,10 @@ static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) static inline size_t gc_write_barrier_card_table_alignment(void) GC_ALWAYS_INLINE; static inline size_t gc_write_barrier_card_size(void) GC_ALWAYS_INLINE; +enum gc_safepoint_mechanism { + GC_SAFEPOINT_MECHANISM_COOPERATIVE, + GC_SAFEPOINT_MECHANISM_SIGNAL, +}; +static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) GC_ALWAYS_INLINE; + #endif // GC_ATTRS_H diff --git a/api/semi-attrs.h b/api/semi-attrs.h index 3bf9584b8..be906768f 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -52,4 +52,8 @@ static inline size_t gc_write_barrier_card_size(void) { GC_CRASH(); } +static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { + return GC_SAFEPOINT_MECHANISM_COOPERATIVE; +} + #endif // SEMI_ATTRS_H diff --git a/api/whippet-attrs.h b/api/whippet-attrs.h index b26d79ad3..e6e5b22b9 100644 --- a/api/whippet-attrs.h +++ b/api/whippet-attrs.h @@ -57,4 +57,8 @@ static inline size_t gc_write_barrier_card_size(void) { return 256; } +static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { + return GC_SAFEPOINT_MECHANISM_COOPERATIVE; +} + #endif // WHIPPET_ATTRS_H From 9936d98f70c36ce0bf3fe2168403c42f7ce1662c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 2 Oct 2023 22:48:59 +0200 Subject: [PATCH 199/403] Rework heap marking --- src/bdw.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/bdw.c b/src/bdw.c index 90688e84a..2344bba48 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -257,11 +257,6 @@ 
mark_mutator(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, state.mark_stack_limit, NULL); - state.mark_stack_ptr = GC_MARK_AND_PUSH (mut->heap, - state.mark_stack_ptr, - state.mark_stack_limit, - NULL); - if (mut->roots) gc_trace_mutator_roots(mut->roots, bdw_mark_edge, mut->heap, &state); @@ -322,16 +317,16 @@ gc_heap_pending_ephemerons(struct gc_heap *heap) { return NULL; } +struct gc_heap *__the_bdw_gc_heap; + int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mutator) { // Root the heap, which will also cause all mutators to be marked. - static struct gc_heap *the_heap; - GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES); GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_INLINE_FREELIST_COUNT * GC_INLINE_GRANULE_BYTES); - GC_ASSERT_EQ(the_heap, NULL); + GC_ASSERT_EQ(__the_bdw_gc_heap, NULL); if (!options) options = gc_allocate_options(); @@ -394,7 +389,11 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, pthread_mutex_init(&(*heap)->lock, NULL); *mutator = add_mutator(*heap); - the_heap = *heap; + __the_bdw_gc_heap = *heap; + + // Sanity check. + if (!GC_is_visible (&__the_bdw_gc_heap)) + abort (); return 1; } From 3c63de9b9d9080eab7e4ce74a1b1408b90bfa231 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 7 Oct 2023 23:09:10 +0200 Subject: [PATCH 200/403] bdw: Turn off all_interior_pointers Quoth gc_inline.h, which makes freelists: /* Note that for these routines, it is the clients responsibility to */ /* add the extra byte at the end to deal with one-past-the-end pointers.*/ /* In the standard collector configuration, the collector assumes that */ /* such a byte has been added, and hence does not trace the last word */ /* in the resulting object. */ /* This is not an issue if the collector is compiled with */ /* DONT_ADD_BYTE_AT_END, or if GC_all_interior_pointers is not set. */ --- src/bdw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/bdw.c b/src/bdw.c index 2344bba48..d6b873f4b 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -359,6 +359,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, return 0; } + GC_set_all_interior_pointers (0); + // Not part of 7.3, sigh. Have to set an env var. // GC_set_markers_count(options->common.parallelism); char markers[21] = {0,}; // 21 bytes enough for 2**64 in decimal + NUL. 
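For readers following the bdw.c changes in patches 194-199 above: the heap, mutator, and ephemeron objects each get their own bdwgc "kind" with a custom mark procedure, so the collector traces exactly the fields the embedder chooses. The standalone sketch below shows that pattern in isolation. It is not part of the patch series: the `widget` type, `mark_widget`, and `init_widget_kind` names are hypothetical, and the header paths may differ between bdwgc installations. A real mark procedure (see mark_ephemeron and mark_mutator above) must also tolerate objects sitting on the kind's free list.

/*
 * Sketch of the bdwgc custom-kind pattern used by the patches above,
 * assuming a hypothetical `struct widget`.  Requires GC_INIT() to have
 * run before init_widget_kind().
 */
#include <gc/gc.h>
#include <gc/gc_mark.h>

struct widget {
  struct widget *next;   /* traced by the custom mark procedure */
  void *untraced_blob;   /* deliberately hidden from the collector */
};

static int widget_kind;

static struct GC_ms_entry *
mark_widget(GC_word *addr, struct GC_ms_entry *mark_stack_ptr,
            struct GC_ms_entry *mark_stack_limit, GC_word env) {
  struct widget *w = (struct widget *) addr;
  /* Push only the fields we want traced; pointers outside the heap
     (including NULL) are ignored by GC_MARK_AND_PUSH. */
  return GC_MARK_AND_PUSH(w->next, mark_stack_ptr, mark_stack_limit, NULL);
}

/* Register the mark procedure and create the kind; call once. */
static void init_widget_kind(void) {
  int add_size_to_descriptor = 0;
  int clear_memory = 1;
  widget_kind = GC_new_kind(GC_new_free_list(),
                            GC_MAKE_PROC(GC_new_proc(mark_widget), 0),
                            add_size_to_descriptor, clear_memory);
}

/* Allocate a widget of the custom kind; only `next` will be traced. */
static struct widget *make_widget(void) {
  return GC_generic_malloc(sizeof(struct widget), widget_kind);
}

This mirrors how heap_gc_kind, mutator_gc_kind, and ephemeron_gc_kind are registered in gc_init in the bdw.c diffs above, which is what lets mark_heap and mark_mutator run the embedder's root-tracing callbacks during a BDW collection.
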
From c7499740c9a657402ba8f146feb5725652ace5d0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 7 Oct 2023 23:10:33 +0200 Subject: [PATCH 201/403] Fix GCC detection Doh --- src/gc-stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc-stack.c b/src/gc-stack.c index 54c6fdb0c..318f5757f 100644 --- a/src/gc-stack.c +++ b/src/gc-stack.c @@ -15,7 +15,7 @@ #include "gc-stack.h" static uintptr_t current_thread_hot_stack_addr(void) { -#ifdef __GCC__ +#ifdef __GNUC__ return (uintptr_t)__builtin_frame_address(0); #else uintptr_t local; From 5130380ae52ef7c89350a4bd0c604d26a228ad18 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Oct 2023 12:13:08 +0200 Subject: [PATCH 202/403] Rework stats collection to use listener interface --- api/gc-api.h | 6 +- api/gc-basic-stats.h | 130 +++++++++++++++++++++++++++++++++++ api/gc-event-listener.h | 26 +++++++ api/gc-null-event-listener.h | 49 +++++++++++++ benchmarks/ephemerons.c | 12 ++-- benchmarks/mt-gcbench.c | 11 +-- benchmarks/quads.c | 10 +-- src/bdw.c | 75 +++++++++++++++++--- src/semi.c | 41 +++++++++-- src/whippet.c | 51 ++++++++++---- 10 files changed, 368 insertions(+), 43 deletions(-) create mode 100644 api/gc-basic-stats.h create mode 100644 api/gc-event-listener.h create mode 100644 api/gc-null-event-listener.h diff --git a/api/gc-api.h b/api/gc-api.h index 821891bca..f4d053c47 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -5,6 +5,7 @@ #include "gc-assert.h" #include "gc-attrs.h" #include "gc-edge.h" +#include "gc-event-listener.h" #include "gc-inline.h" #include "gc-options.h" #include "gc-ref.h" @@ -24,7 +25,9 @@ GC_API_ void* gc_call_with_stack_addr(void* (*f)(struct gc_stack_addr *, GC_API_ int gc_init(const struct gc_options *options, struct gc_stack_addr *base, struct gc_heap **heap, - struct gc_mutator **mutator); + struct gc_mutator **mutator, + struct gc_event_listener event_listener, + void *event_listener_data); struct gc_mutator_roots; GC_API_ void gc_mutator_set_roots(struct gc_mutator *mut, @@ -43,7 +46,6 @@ GC_API_ struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *base, GC_API_ void gc_finish_for_thread(struct gc_mutator *mut); GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; -GC_API_ void gc_print_stats(struct gc_heap *heap); GC_API_ void gc_collect(struct gc_mutator *mut); diff --git a/api/gc-basic-stats.h b/api/gc-basic-stats.h new file mode 100644 index 000000000..8e57e40f1 --- /dev/null +++ b/api/gc-basic-stats.h @@ -0,0 +1,130 @@ +#ifndef GC_BASIC_STATS_H +#define GC_BASIC_STATS_H + +#include "gc-event-listener.h" + +#include +#include +#include +#include +#include + +struct gc_basic_stats { + uint64_t major_collection_count; + uint64_t minor_collection_count; + uint64_t last_time_usec; + uint64_t elapsed_mutator_usec; + uint64_t elapsed_collector_usec; + size_t heap_size; + size_t max_heap_size; + size_t max_live_data_size; +}; + +static inline uint64_t gc_basic_stats_now(void) { + struct timeval tv; + if (gettimeofday(&tv, NULL) != 0) GC_CRASH(); + uint64_t ret = tv.tv_sec; + ret *= 1000 * 1000; + ret += tv.tv_usec; + return ret; +} + +static inline void gc_basic_stats_init(void *data, size_t heap_size) { + struct gc_basic_stats *stats = data; + memset(stats, 0, sizeof(*stats)); + stats->last_time_usec = gc_basic_stats_now(); + stats->heap_size = stats->max_heap_size = heap_size; +} + +static inline void gc_basic_stats_prepare_gc(void *data, + int is_minor, + int is_compacting) { + struct gc_basic_stats *stats = data; + 
if (is_minor) + stats->minor_collection_count++; + else + stats->major_collection_count++; + uint64_t now = gc_basic_stats_now(); + stats->elapsed_mutator_usec += now - stats->last_time_usec; + stats->last_time_usec = now; +} + +static inline void gc_basic_stats_requesting_stop(void *data) {} +static inline void gc_basic_stats_waiting_for_stop(void *data) {} +static inline void gc_basic_stats_mutators_stopped(void *data) {} +static inline void gc_basic_stats_roots_traced(void *data) {} +static inline void gc_basic_stats_heap_traced(void *data) {} +static inline void gc_basic_stats_ephemerons_traced(void *data) {} + +static inline void gc_basic_stats_restarting_mutators(void *data) { + struct gc_basic_stats *stats = data; + uint64_t now = gc_basic_stats_now(); + stats->elapsed_collector_usec += now - stats->last_time_usec; + stats->last_time_usec = now; +} + +static inline void* gc_basic_stats_mutator_added(void *data) { + return NULL; +} +static inline void gc_basic_stats_mutator_cause_gc(void *mutator_data) {} +static inline void gc_basic_stats_mutator_stopping(void *mutator_data) {} +static inline void gc_basic_stats_mutator_stopped(void *mutator_data) {} +static inline void gc_basic_stats_mutator_restarted(void *mutator_data) {} +static inline void gc_basic_stats_mutator_removed(void *mutator_data) {} + +static inline void gc_basic_stats_heap_resized(void *data, size_t size) { + struct gc_basic_stats *stats = data; + stats->heap_size = size; + if (size > stats->max_heap_size) + stats->max_heap_size = size; +} + +static inline void gc_basic_stats_live_data_size(void *data, size_t size) { + struct gc_basic_stats *stats = data; + if (size > stats->max_live_data_size) + stats->max_live_data_size = size; +} + +#define GC_BASIC_STATS \ + ((struct gc_event_listener) { \ + gc_basic_stats_init, \ + gc_basic_stats_prepare_gc, \ + gc_basic_stats_requesting_stop, \ + gc_basic_stats_waiting_for_stop, \ + gc_basic_stats_mutators_stopped, \ + gc_basic_stats_roots_traced, \ + gc_basic_stats_heap_traced, \ + gc_basic_stats_ephemerons_traced, \ + gc_basic_stats_restarting_mutators, \ + gc_basic_stats_mutator_added, \ + gc_basic_stats_mutator_cause_gc, \ + gc_basic_stats_mutator_stopping, \ + gc_basic_stats_mutator_stopped, \ + gc_basic_stats_mutator_restarted, \ + gc_basic_stats_mutator_removed, \ + gc_basic_stats_heap_resized, \ + gc_basic_stats_live_data_size, \ + }) + +static inline void gc_basic_stats_finish(struct gc_basic_stats *stats) { + uint64_t now = gc_basic_stats_now(); + stats->elapsed_mutator_usec += stats->last_time_usec - now; + stats->last_time_usec = now; +} + +static inline void gc_basic_stats_print(struct gc_basic_stats *stats, FILE *f) { + fprintf(f, "Completed %" PRIu64 " major collections (%" PRIu64 " minor).\n", + stats->major_collection_count, stats->minor_collection_count); + uint64_t stopped = stats->elapsed_collector_usec; + uint64_t elapsed = stats->elapsed_mutator_usec + stopped; + uint64_t ms = 1000; // per usec + fprintf(f, "%" PRIu64 ".%.3" PRIu64 " ms total time " + "(%" PRIu64 ".%.3" PRIu64 " stopped).\n", + elapsed / ms, elapsed % ms, stopped / ms, stopped % ms); + double MB = 1e6; + fprintf(f, "Heap size is %.3f MB (max %.3f MB); peak live data %.3f MB.\n", + stats->heap_size / MB, stats->max_heap_size / MB, + stats->max_live_data_size / MB); +} + +#endif // GC_BASIC_STATS_H_ diff --git a/api/gc-event-listener.h b/api/gc-event-listener.h new file mode 100644 index 000000000..57df09719 --- /dev/null +++ b/api/gc-event-listener.h @@ -0,0 +1,26 @@ +#ifndef 
GC_EVENT_LISTENER_H +#define GC_EVENT_LISTENER_H + +struct gc_event_listener { + void (*init)(void *data, size_t heap_size); + void (*prepare_gc)(void *data, int is_minor, int is_compacting); + void (*requesting_stop)(void *data); + void (*waiting_for_stop)(void *data); + void (*mutators_stopped)(void *data); + void (*roots_traced)(void *data); + void (*heap_traced)(void *data); + void (*ephemerons_traced)(void *data); + void (*restarting_mutators)(void *data); + + void* (*mutator_added)(void *data); + void (*mutator_cause_gc)(void *mutator_data); + void (*mutator_stopping)(void *mutator_data); + void (*mutator_stopped)(void *mutator_data); + void (*mutator_restarted)(void *mutator_data); + void (*mutator_removed)(void *mutator_data); + + void (*heap_resized)(void *data, size_t size); + void (*live_data_size)(void *data, size_t size); +}; + +#endif // GC_EVENT_LISTENER_H diff --git a/api/gc-null-event-listener.h b/api/gc-null-event-listener.h new file mode 100644 index 000000000..7060bd729 --- /dev/null +++ b/api/gc-null-event-listener.h @@ -0,0 +1,49 @@ +#ifndef GC_NULL_EVENT_LISTENER_H +#define GC_NULL_EVENT_LISTENER_H + +#include "gc-event-listener.h" + +static inline void gc_null_event_listener_init(void *data, size_t size) {} +static inline void gc_null_event_listener_prepare_gc(void *data, + int is_minor, + int is_compacting) {} +static inline void gc_null_event_listener_requesting_stop(void *data) {} +static inline void gc_null_event_listener_waiting_for_stop(void *data) {} +static inline void gc_null_event_listener_mutators_stopped(void *data) {} +static inline void gc_null_event_listener_roots_traced(void *data) {} +static inline void gc_null_event_listener_heap_traced(void *data) {} +static inline void gc_null_event_listener_ephemerons_traced(void *data) {} +static inline void gc_null_event_listener_restarting_mutators(void *data) {} + +static inline void* gc_null_event_listener_mutator_added(void *data) {} +static inline void gc_null_event_listener_mutator_cause_gc(void *mutator_data) {} +static inline void gc_null_event_listener_mutator_stopping(void *mutator_data) {} +static inline void gc_null_event_listener_mutator_stopped(void *mutator_data) {} +static inline void gc_null_event_listener_mutator_restarted(void *mutator_data) {} +static inline void gc_null_event_listener_mutator_removed(void *mutator_data) {} + +static inline void gc_null_event_listener_heap_resized(void *, size_t) {} +static inline void gc_null_event_listener_live_data_size(void *, size_t) {} + +#define GC_NULL_EVENT_LISTENER \ + ((struct gc_event_listener) { \ + gc_null_event_listener_init, \ + gc_null_event_listener_prepare_gc, \ + gc_null_event_listener_requesting_stop, \ + gc_null_event_listener_waiting_for_stop, \ + gc_null_event_listener_mutators_stopped, \ + gc_null_event_listener_roots_traced, \ + gc_null_event_listener_heap_traced, \ + gc_null_event_listener_ephemerons_traced, \ + gc_null_event_listener_restarting_mutators, \ + gc_null_event_listener_mutator_added, \ + gc_null_event_listener_mutator_cause_gc, \ + gc_null_event_listener_mutator_stopping, \ + gc_null_event_listener_mutator_stopped, \ + gc_null_event_listener_mutator_restarted, \ + gc_null_event_listener_mutator_removed, \ + gc_null_event_listener_heap_resized, \ + gc_null_event_listener_live_data_size, \ + }) + +#endif // GC_NULL_EVENT_LISTENER_H_ diff --git a/benchmarks/ephemerons.c b/benchmarks/ephemerons.c index 698aa1708..2193f1fe0 100644 --- a/benchmarks/ephemerons.c +++ b/benchmarks/ephemerons.c @@ -7,6 +7,7 @@ #include 
"assert.h" #include "gc-api.h" +#include "gc-basic-stats.h" #include "gc-ephemeron.h" #include "simple-roots-api.h" #include "ephemerons-types.h" @@ -231,7 +232,8 @@ int main(int argc, char *argv[]) { struct gc_heap *heap; struct gc_mutator *mut; - if (!gc_init(options, NULL, &heap, &mut)) { + struct gc_basic_stats stats; + if (!gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", (size_t)heap_size); return 1; @@ -239,8 +241,6 @@ int main(int argc, char *argv[]) { struct thread main_thread = { mut, }; gc_mutator_set_roots(mut, &main_thread.roots); - unsigned long test_start = current_time(); - pthread_t threads[MAX_THREAD_COUNT]; // Run one of the threads in the main thread. for (size_t i = 1; i < nthreads; i++) { @@ -262,9 +262,9 @@ int main(int argc, char *argv[]) { } } - print_elapsed("test", test_start); - - gc_print_stats(heap); + gc_basic_stats_finish(&stats); + fputs("\n", stdout); + gc_basic_stats_print(&stats, stdout); return 0; } diff --git a/benchmarks/mt-gcbench.c b/benchmarks/mt-gcbench.c index e7e7d8a58..9d149c431 100644 --- a/benchmarks/mt-gcbench.c +++ b/benchmarks/mt-gcbench.c @@ -46,6 +46,7 @@ #include "assert.h" #include "gc-api.h" +#include "gc-basic-stats.h" #include "mt-gcbench-types.h" #include "simple-roots-api.h" #include "simple-allocator.h" @@ -362,7 +363,8 @@ int main(int argc, char *argv[]) { struct gc_heap *heap; struct gc_mutator *mut; - if (!gc_init(options, NULL, &heap, &mut)) { + struct gc_basic_stats stats; + if (!gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); return 1; @@ -373,8 +375,6 @@ int main(int argc, char *argv[]) { printf("Garbage Collector Test\n"); printf(" Live storage will peak at %zd bytes.\n\n", heap_max_live); - unsigned long start = current_time(); - pthread_t threads[MAX_THREAD_COUNT]; // Run one of the threads in the main thread. 
for (size_t i = 1; i < nthreads; i++) { @@ -396,6 +396,7 @@ int main(int argc, char *argv[]) { } } - printf("Completed in %.3f msec\n", elapsed_millis(start)); - gc_print_stats(heap); + gc_basic_stats_finish(&stats); + fputs("\n", stdout); + gc_basic_stats_print(&stats, stdout); } diff --git a/benchmarks/quads.c b/benchmarks/quads.c index 11d8e5e1f..6fa19f452 100644 --- a/benchmarks/quads.c +++ b/benchmarks/quads.c @@ -5,6 +5,7 @@ #include "assert.h" #include "gc-api.h" +#include "gc-basic-stats.h" #include "simple-roots-api.h" #include "quads-types.h" #include "simple-allocator.h" @@ -118,7 +119,6 @@ int main(int argc, char *argv[]) { size_t tree_bytes = nquads * sizeof(Quad); size_t heap_size = tree_bytes * multiplier; - unsigned long gc_start = current_time(); printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", heap_size / 1e9, multiplier); @@ -134,7 +134,8 @@ int main(int argc, char *argv[]) { struct gc_heap *heap; struct gc_mutator *mut; - if (!gc_init(options, NULL, &heap, &mut)) { + struct gc_basic_stats stats; + if (!gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats)) { fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", heap_size); return 1; @@ -169,9 +170,10 @@ int main(int argc, char *argv[]) { validate_tree(HANDLE_REF(quad), depth); } print_elapsed("allocation loop", garbage_start); - print_elapsed("quads test", gc_start); - gc_print_stats(heap); + gc_basic_stats_finish(&stats); + fputs("\n", stdout); + gc_basic_stats_print(&stats, stdout); POP_HANDLE(&t); return 0; diff --git a/src/bdw.c b/src/bdw.c index d6b873f4b..809ad8808 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -54,6 +54,8 @@ struct gc_heap { pthread_mutex_t lock; struct gc_heap_roots *roots; struct gc_mutator *mutators; + struct gc_event_listener event_listener; + void *event_listener_data; }; struct gc_mutator { @@ -62,8 +64,15 @@ struct gc_mutator { struct gc_mutator_roots *roots; struct gc_mutator *next; // with heap lock struct gc_mutator **prev; // with heap lock + void *event_listener_data; }; +struct gc_heap *__the_bdw_gc_heap; +#define HEAP_EVENT(event, ...) \ + __the_bdw_gc_heap->event_listener.event(__the_bdw_gc_heap->event_listener_data, ##__VA_ARGS__) +#define MUTATOR_EVENT(mut, event, ...) \ + __the_bdw_gc_heap->event_listener.event(mut->event_listener_data, ##__VA_ARGS__) + static inline size_t gc_inline_bytes_to_freelist_index(size_t bytes) { return (bytes - 1U) / GC_INLINE_GRANULE_BYTES; } @@ -272,6 +281,7 @@ static inline struct gc_mutator *add_mutator(struct gc_heap *heap) { struct gc_mutator *ret = GC_generic_malloc(sizeof(struct gc_mutator), mutator_gc_kind); ret->heap = heap; + ret->event_listener_data = HEAP_EVENT(mutator_added); pthread_mutex_lock(&heap->lock); ret->next = heap->mutators; @@ -317,10 +327,56 @@ gc_heap_pending_ephemerons(struct gc_heap *heap) { return NULL; } -struct gc_heap *__the_bdw_gc_heap; +static void on_collection_event(GC_EventType event) { + switch (event) { + case GC_EVENT_START: { + int is_minor = 0; + int is_compacting = 0; + HEAP_EVENT(prepare_gc, is_minor, is_compacting); + HEAP_EVENT(requesting_stop); + HEAP_EVENT(waiting_for_stop); + break; + } + case GC_EVENT_MARK_START: + HEAP_EVENT(mutators_stopped); + break; + case GC_EVENT_MARK_END: + HEAP_EVENT(roots_traced); + HEAP_EVENT(heap_traced); + break; + case GC_EVENT_RECLAIM_START: + break; + case GC_EVENT_RECLAIM_END: + // Sloppily attribute finalizers and eager reclamation to + // ephemerons. 
+ HEAP_EVENT(ephemerons_traced); + HEAP_EVENT(live_data_size, GC_get_heap_size() - GC_get_free_bytes()); + break; + case GC_EVENT_END: + HEAP_EVENT(restarting_mutators); + break; + case GC_EVENT_PRE_START_WORLD: + case GC_EVENT_POST_STOP_WORLD: + // Can't rely on these, as they are only fired when threads are + // enabled. + break; + case GC_EVENT_THREAD_SUSPENDED: + case GC_EVENT_THREAD_UNSUSPENDED: + // No nice way to map back to the mutator. + break; + default: + break; + } +} + +static void on_heap_resize(GC_word size) { + HEAP_EVENT(heap_resized, size); +} int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, - struct gc_heap **heap, struct gc_mutator **mutator) { + struct gc_heap **heap, struct gc_mutator **mutator, + struct gc_event_listener event_listener, + void *event_listener_data) { // Root the heap, which will also cause all mutators to be marked. GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_INLINE_GRANULE_BYTES); GC_ASSERT_EQ(gc_allocator_large_threshold(), @@ -389,9 +445,16 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, *heap = GC_generic_malloc(sizeof(struct gc_heap), heap_gc_kind); pthread_mutex_init(&(*heap)->lock, NULL); - *mutator = add_mutator(*heap); + + (*heap)->event_listener = event_listener; + (*heap)->event_listener_data = event_listener_data; __the_bdw_gc_heap = *heap; + HEAP_EVENT(init, GC_get_heap_size()); + GC_set_on_collection_event(on_collection_event); + GC_set_on_heap_resize(on_heap_resize); + + *mutator = add_mutator(*heap); // Sanity check. if (!GC_is_visible (&__the_bdw_gc_heap)) @@ -408,6 +471,7 @@ struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, } void gc_finish_for_thread(struct gc_mutator *mut) { pthread_mutex_lock(&mut->heap->lock); + MUTATOR_EVENT(mut, mutator_removed); *mut->prev = mut->next; if (mut->next) mut->next->prev = mut->prev; @@ -432,8 +496,3 @@ void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { void gc_heap_set_extern_space(struct gc_heap *heap, struct gc_extern_space *space) { } - -void gc_print_stats(struct gc_heap *heap) { - printf("Completed %ld collections\n", (long)GC_get_gc_no()); - printf("Heap size is %ld\n", (long)GC_get_heap_size()); -} diff --git a/src/semi.c b/src/semi.c index fe181eb94..247734788 100644 --- a/src/semi.c +++ b/src/semi.c @@ -45,13 +45,21 @@ struct gc_heap { int check_pending_ephemerons; const struct gc_options *options; struct gc_heap_roots *roots; + struct gc_event_listener event_listener; + void *event_listener_data; }; // One mutator per space, can just store the heap in the mutator. struct gc_mutator { struct gc_heap heap; struct gc_mutator_roots *roots; + void *event_listener_data; }; +#define HEAP_EVENT(heap, event, ...) \ + (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__) +#define MUTATOR_EVENT(mut, event, ...) 
\ + (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) + static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } @@ -284,6 +292,8 @@ static size_t compute_new_heap_size(struct gc_heap *heap, size_t for_alloc) { live_bytes += large->live_pages_at_last_collection * semi->page_size; live_bytes += for_alloc; + HEAP_EVENT(heap, live_data_size, live_bytes); + size_t new_heap_size = heap->size; switch (heap->options->common.heap_size_policy) { case GC_HEAP_SIZE_FIXED: @@ -324,7 +334,10 @@ static void adjust_heap_size_and_limits(struct gc_heap *heap, new_region_size = min_size(new_region_size, min_size(semi->to_space.mapped_size, semi->from_space.mapped_size)); + size_t old_heap_size = heap->size; heap->size = new_region_size * 2; + if (heap->size != old_heap_size) + HEAP_EVENT(heap, heap_resized, heap->size); size_t stolen = align_up(semi->stolen_pages, 2) * semi->page_size; GC_ASSERT(new_region_size > stolen/2); size_t new_active_region_size = new_region_size - stolen/2; @@ -339,6 +352,14 @@ static void adjust_heap_size_and_limits(struct gc_heap *heap, static void collect(struct gc_mutator *mut, size_t for_alloc) { struct gc_heap *heap = mutator_heap(mut); + int is_minor = 0; + int is_compacting = 1; + HEAP_EVENT(heap, prepare_gc, is_minor, is_compacting); + + HEAP_EVENT(heap, requesting_stop); + HEAP_EVENT(heap, waiting_for_stop); + HEAP_EVENT(heap, mutators_stopped); + struct semi_space *semi = heap_semi_space(heap); struct large_object_space *large = heap_large_object_space(heap); // fprintf(stderr, "start collect #%ld:\n", space->count); @@ -352,20 +373,24 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { gc_trace_heap_roots(heap->roots, trace, heap, NULL); if (mut->roots) gc_trace_mutator_roots(mut->roots, trace, heap, NULL); + HEAP_EVENT(heap, roots_traced); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); while(grey < semi->hp) grey = scan(heap, gc_ref(grey)); + HEAP_EVENT(heap, heap_traced); gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); heap->check_pending_ephemerons = 1; while (gc_pop_resolved_ephemerons(heap, trace, NULL)) while(grey < semi->hp) grey = scan(heap, gc_ref(grey)); + HEAP_EVENT(heap, ephemerons_traced); large_object_space_finish_gc(large, 0); gc_extern_space_finish_gc(heap->extern_space, 0); semi_space_finish_gc(semi, large->live_pages_at_last_collection); gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); adjust_heap_size_and_limits(heap, for_alloc); + HEAP_EVENT(heap, restarting_mutators); // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); } @@ -539,7 +564,9 @@ int gc_options_parse_and_set(struct gc_options *options, int option, } int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, - struct gc_heap **heap, struct gc_mutator **mut) { + struct gc_heap **heap, struct gc_mutator **mut, + struct gc_event_listener event_listener, + void *event_listener_data) { GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), offsetof(struct semi_space, hp)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), @@ -563,6 +590,10 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!heap_init(*heap, options)) return 0; + (*heap)->event_listener = event_listener; + (*heap)->event_listener_data = event_listener_data; + HEAP_EVENT(*heap, init, (*heap)->size); + if (!semi_space_init(heap_semi_space(*heap), *heap)) return 0; if 
(!large_object_space_init(heap_large_object_space(*heap), *heap)) @@ -571,6 +602,9 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, // Ignore stack base, as we are precise. (*mut)->roots = NULL; + (*mut)->event_listener_data = + event_listener.mutator_added(event_listener_data); + return 1; } @@ -600,8 +634,3 @@ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), // Can't be threads, then there won't be collection. return f(data); } - -void gc_print_stats(struct gc_heap *heap) { - printf("Completed %ld collections\n", heap->count); - printf("Heap size is %zd\n", heap->size); -} diff --git a/src/whippet.c b/src/whippet.c index 6f4596678..7b4b90f41 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -328,8 +328,15 @@ struct gc_heap { double minimum_major_gc_yield_threshold; double pending_ephemerons_size_factor; double pending_ephemerons_size_slop; + struct gc_event_listener event_listener; + void *event_listener_data; }; +#define HEAP_EVENT(heap, event, ...) \ + (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__) +#define MUTATOR_EVENT(mut, event, ...) \ + (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) + struct gc_mutator_mark_buf { size_t size; size_t capacity; @@ -345,6 +352,7 @@ struct gc_mutator { struct gc_stack stack; struct gc_mutator_roots *roots; struct gc_mutator_mark_buf mark_buf; + void *event_listener_data; // Three uses for this in-object linked-list pointer: // - inactive (blocked in syscall) mutators // - grey objects when stopping active mutators for mark-in-place @@ -855,6 +863,8 @@ static inline void heap_unlock(struct gc_heap *heap) { static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = heap; + mut->event_listener_data = + heap->event_listener.mutator_added(heap->event_listener_data); heap_lock(heap); // We have no roots. If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. 
@@ -868,6 +878,7 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { } static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; heap_lock(heap); heap->active_mutator_count--; @@ -1416,10 +1427,13 @@ static void trace_generational_roots(struct gc_heap *heap) { } } -static void pause_mutator_for_collection(struct gc_heap *heap) GC_NEVER_INLINE; -static void pause_mutator_for_collection(struct gc_heap *heap) { +static void pause_mutator_for_collection(struct gc_heap *heap, + struct gc_mutator *mut) GC_NEVER_INLINE; +static void pause_mutator_for_collection(struct gc_heap *heap, + struct gc_mutator *mut) { GC_ASSERT(mutators_are_stopping(heap)); GC_ASSERT(heap->active_mutator_count); + MUTATOR_EVENT(mut, mutator_stopped); heap->active_mutator_count--; if (heap->active_mutator_count == 0) pthread_cond_signal(&heap->collector_cond); @@ -1436,6 +1450,7 @@ static void pause_mutator_for_collection(struct gc_heap *heap) { pthread_cond_wait(&heap->mutator_cond, &heap->lock); while (mutators_are_stopping(heap) && heap->count == epoch); + MUTATOR_EVENT(mut, mutator_restarted); heap->active_mutator_count++; } @@ -1443,6 +1458,7 @@ static void pause_mutator_for_collection_with_lock(struct gc_mutator *mut) GC_NE static void pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); + MUTATOR_EVENT(mut, mutator_stopping); finish_sweeping_in_block(mut); gc_stack_capture_hot(&mut->stack); if (mutator_should_mark_while_stopping(mut)) @@ -1450,20 +1466,21 @@ static void pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { trace_mutator_roots_with_lock(mut); else enqueue_mutator_for_tracing(mut); - pause_mutator_for_collection(heap); + pause_mutator_for_collection(heap, mut); } static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) GC_NEVER_INLINE; static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); + MUTATOR_EVENT(mut, mutator_stopping); finish_sweeping(mut); gc_stack_capture_hot(&mut->stack); if (mutator_should_mark_while_stopping(mut)) trace_stopping_mutator_roots(mut); enqueue_mutator_for_tracing(mut); heap_lock(heap); - pause_mutator_for_collection(heap); + pause_mutator_for_collection(heap, mut); heap_unlock(heap); release_stopping_mutator_roots(mut); } @@ -1816,28 +1833,38 @@ static void collect(struct gc_mutator *mut) { DEBUG("grew heap instead of collecting #%ld:\n", heap->count); return; } + MUTATOR_EVENT(mut, mutator_cause_gc); DEBUG("start collect #%ld:\n", heap->count); enum gc_kind gc_kind = determine_collection_kind(heap); + HEAP_EVENT(heap, prepare_gc, gc_kind & GC_KIND_FLAG_MINOR, + gc_kind & GC_KIND_FLAG_EVACUATING); update_mark_patterns(space, !(gc_kind & GC_KIND_FLAG_MINOR)); large_object_space_start_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); gc_extern_space_start_gc(exspace, gc_kind & GC_KIND_FLAG_MINOR); resolve_ephemerons_lazily(heap); tracer_prepare(heap); + HEAP_EVENT(heap, requesting_stop); request_mutators_to_stop(heap); trace_mutator_roots_with_lock_before_stop(mut); finish_sweeping(mut); + HEAP_EVENT(heap, waiting_for_stop); wait_for_mutators_to_stop(heap); + HEAP_EVENT(heap, mutators_stopped); double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); + HEAP_EVENT(heap, live_data_size, heap->size * (1 - yield)); 
fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); detect_out_of_memory(heap); trace_pinned_roots_after_stop(heap); prepare_for_evacuation(heap); trace_roots_after_stop(heap); + HEAP_EVENT(heap, roots_traced); tracer_trace(heap); + HEAP_EVENT(heap, heap_traced); resolve_ephemerons_eagerly(heap); while (enqueue_resolved_ephemerons(heap)) tracer_trace(heap); + HEAP_EVENT(heap, ephemerons_traced); sweep_ephemerons(heap); tracer_release(heap); mark_space_finish_gc(space, gc_kind); @@ -1848,6 +1875,7 @@ static void collect(struct gc_mutator *mut) { if (heap->last_collection_was_minor) heap->minor_count++; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); + HEAP_EVENT(heap, restarting_mutators); allow_mutators_to_continue(heap); DEBUG("collect done\n"); } @@ -2345,7 +2373,9 @@ static int mark_space_init(struct mark_space *space, struct gc_heap *heap) { } int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, - struct gc_heap **heap, struct gc_mutator **mut) { + struct gc_heap **heap, struct gc_mutator **mut, + struct gc_event_listener event_listener, + void *event_listener_data) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GRANULE_SIZE); GC_ASSERT_EQ(gc_allocator_large_threshold(), LARGE_OBJECT_THRESHOLD); GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), @@ -2372,6 +2402,10 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!heap_init(*heap, options)) GC_CRASH(); + (*heap)->event_listener = event_listener; + (*heap)->event_listener_data = event_listener_data; + HEAP_EVENT(*heap, init, (*heap)->size); + struct mark_space *space = heap_mark_space(*heap); if (!mark_space_init(space, *heap)) { free(*heap); @@ -2439,10 +2473,3 @@ void* gc_call_without_gc(struct gc_mutator *mut, reactivate_mutator(heap, mut); return ret; } - -void gc_print_stats(struct gc_heap *heap) { - printf("Completed %ld collections (%ld major)\n", - heap->count, heap->count - heap->minor_count); - printf("Heap size with overhead is %zd (%zu slabs)\n", - heap->size, heap_mark_space(heap)->nslabs); -} From 4d3e8c30d7112c2723d6cfa47ff899931ffd52f3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Oct 2023 12:43:09 +0200 Subject: [PATCH 203/403] Update manual --- doc/manual.md | 116 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 7 deletions(-) diff --git a/doc/manual.md b/doc/manual.md index e94095635..64be5a6c9 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -401,10 +401,12 @@ just one heap. A heap has one or more associated *mutators*. A mutator is a thread-specific handle on the heap. Allocating objects requires a mutator. -The initial heap and mutator are created via `gc_init`, which takes two -input parameters: the *options*, and a stack base address. The options -specify the initial heap size and so on. `gc_init` returns the new heap -as an out parameter, and also returns a mutator for the current thread. +The initial heap and mutator are created via `gc_init`, which takes +three logical input parameters: the *options*, a stack base address, and +an *event listener*. The options specify the initial heap size and so +on. The event listener is mostly for gathering statistics; see below +for more. `gc_init` returns the new heap as an out parameter, and also +returns a mutator for the current thread. To make a new mutator for a new thread, use `gc_init_for_thread`. When a thread is finished with its mutator, call `gc_finish_for_thread`. 
@@ -522,9 +524,109 @@ BDW. Sometimes a program would like some information from the GC: how many bytes and objects have been allocated? How much time has been spent in the GC? How many times has GC run, and how many of those were minor -collections? What's the maximum pause time? Stuff like that. Whippet -doesn't collect very much info right now, and this should probably -change. For the moment, all you get is `gc_print_stats`. +collections? What's the maximum pause time? Stuff like that. + +Instead of collecting a fixed set of information, Whippet emits +callbacks when the collector reaches specific states. The embedder +provides a *listener* for these events when initializing the collector. + +The listener interface is defined in +[`gc-event-listener.h`](../api/gc-event-listener.h). Whippet ships with +two listener implementations, `GC_NULL_EVENT_LISTENER`, and +`GC_BASIC_STATS`. Most embedders will want their own listener, but +starting with the basic stats listener is not a bad option: + +``` +#include "gc-api.h" +#include "gc-basic-stats.h" +#include + +int main() { + struct gc_options *options = NULL; + struct gc_heap *heap; + struct gc_mutator *mut; + struct gc_basic_stats stats; + gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats); + // ... + gc_basic_stats_finish(&stats); + gc_basic_stats_print(&stats, stdout); +} +``` + +As you can see, `GC_BASIC_STATS` expands to a `struct gc_event_listener` +definition. We pass an associated pointer to a `struct gc_basic_stats` +instance which will be passed to the listener at every event. + +The output of this program might be something like: + +``` +Completed 19 major collections (0 minor). +654.597 ms total time (385.235 stopped). +Heap size is 167.772 MB (max 167.772 MB); peak live data 55.925 MB. +``` + +There are currently three different sorts of events: heap events to +track heap growth, collector events to time different parts of +collection, and mutator events to indicate when specific mutators are +stopped. + +There are three heap events: + + * `init(void* data, size_t heap_size)`: Called during `gc_init`, to + allow the listener to initialize its associated state. + * `heap_resized(void* data, size_t new_size)`: Called if the heap grows + or shrinks. + * `live_data_size(void* data, size_t size)`: Called periodically when + the collector learns about live data size. + +The collection events form a kind of state machine, and are called in +this order: + + * `prepare_gc(void* data, int is_minor, int is_compacting)`: Called at + the beginning of GC. Some mutators may still be active. + * `requesting_stop(void* data)`: Called when the collector asks + mutators to stop. + * `waiting_for_stop(void* data)`: Called when the collector has done + all the pre-stop work that it is able to and is just waiting on + mutators to stop. + * `mutators_stopped(void* data)`: Called when all mutators have + stopped; the trace phase follows. + * `roots_traced(void* data)`: Called when roots have been visited. + * `heap_traced(void* data)`: Called when the whole heap has been + traced. + * `ephemerons_traced(void* data)`: Called when the [ephemeron + fixpoint](https://wingolog.org/archives/2023/01/24/parallel-ephemeron-tracing) + has been reached. + * `restarting_mutators(void* data)`: Called right before the collector + restarts mutators. 
+ +The collectors in Whippet will call all of these event handlers, but it +may be that they are called conservatively: for example, the +single-mutator, single-collector semi-space collector will never have to +wait for mutators to stop. It will still call the functions, though! + +Finally, there are the mutator events: + * `mutator_added(void* data) -> void*`: The only event handler that + returns a value, called when a new mutator is added. The parameter + is the overall event listener data, and the result is + mutator-specific data. The rest of the mutator events pass this + mutator-specific data instead. + * `mutator_cause_gc(void* mutator_data)`: Called when a mutator causes + GC, either via allocation or an explicit `gc_collect` call. + * `mutator_stopping(void* mutator_data)`: Called when a mutator has + received the signal to stop. It may perform some marking work before + it stops. + * `mutator_stopped(void* mutator_data)`: Called when a mutator parks + itself. + * `mutator_restarted(void* mutator_data)`: Called when a mutator + restarts. + * `mutator_removed(void* mutator_data)`: Called when a mutator goes + away. + +Note that these events handlers shouldn't really do much. In +particular, they shouldn't call into the Whippet API, and they shouldn't +even access GC-managed objects. Event listeners are really about +statistics and profiling and aren't a place to mutate the object graph. ### Ephemerons From 44a4e1c1310e6eba36ff824bb9e4787f8d901bb0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Oct 2023 12:45:13 +0200 Subject: [PATCH 204/403] Add manual links --- doc/manual.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/manual.md b/doc/manual.md index 64be5a6c9..4156defdf 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -532,9 +532,11 @@ provides a *listener* for these events when initializing the collector. The listener interface is defined in [`gc-event-listener.h`](../api/gc-event-listener.h). Whippet ships with -two listener implementations, `GC_NULL_EVENT_LISTENER`, and -`GC_BASIC_STATS`. Most embedders will want their own listener, but -starting with the basic stats listener is not a bad option: +two listener implementations, +[`GC_NULL_EVENT_LISTENER`](../api/gc-null-event-listener.h), and +[`GC_BASIC_STATS`](../api/gc-basic-stats.h). Most embedders will want +their own listener, but starting with the basic stats listener is not a +bad option: ``` #include "gc-api.h" From a6e34c3594c5abb615764131798885c8715bf377 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Oct 2023 21:36:27 +0200 Subject: [PATCH 205/403] Don't abort semi if parallelism > 1 Re-interpret parallelism option as a maximum. 
--- src/semi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/semi.c b/src/semi.c index 247734788..7097b2d49 100644 --- a/src/semi.c +++ b/src/semi.c @@ -578,10 +578,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, fprintf(stderr, "adaptive heap size is currently unimplemented\n"); return 0; } - if (options->common.parallelism != 1) { - fprintf(stderr, "parallelism unimplemented in semispace copying collector\n"); - return 0; - } + if (options->common.parallelism != 1) + fprintf(stderr, "warning: parallelism unimplemented in semispace copying collector\n"); *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); From 9ce8ee2921ac8c093c66690835605fbe33f02828 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 17 Oct 2023 21:45:06 +0200 Subject: [PATCH 206/403] Fix unused API, outdated assertion --- api/gc-api.h | 6 ------ src/whippet.c | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/api/gc-api.h b/api/gc-api.h index f4d053c47..c66959321 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -179,12 +179,6 @@ static inline void* gc_allocate(struct gc_mutator *mut, size_t size) { // FIXME: remove :P GC_API_ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t bytes); -static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge, - struct gc_ref new_val) GC_ALWAYS_INLINE; -static inline void gc_small_write_barrier(struct gc_ref obj, struct gc_edge edge, - struct gc_ref new_val) { -} - GC_API_ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) GC_NEVER_INLINE; diff --git a/src/whippet.c b/src/whippet.c index 7b4b90f41..4146eb88e 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -2386,8 +2386,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(), METADATA_BYTE_YOUNG); GC_ASSERT_EQ(gc_allocator_alloc_table_end_pattern(), METADATA_BYTE_END); if (GC_GENERATIONAL) { - GC_ASSERT_EQ(gc_small_write_barrier_card_table_alignment(), SLAB_SIZE); - GC_ASSERT_EQ(gc_small_write_barrier_card_size(), + GC_ASSERT_EQ(gc_write_barrier_card_table_alignment(), SLAB_SIZE); + GC_ASSERT_EQ(gc_write_barrier_card_size(), BLOCK_SIZE / REMSET_BYTES_PER_BLOCK); } From 8aa2036331ce398848e6bb6897bcc1f05c3a6954 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 18 Oct 2023 15:13:27 +0200 Subject: [PATCH 207/403] Fix optdebug warnings --- benchmarks/mt-gcbench.c | 12 ++++++------ src/whippet.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/mt-gcbench.c b/benchmarks/mt-gcbench.c index 9d149c431..05ae887d0 100644 --- a/benchmarks/mt-gcbench.c +++ b/benchmarks/mt-gcbench.c @@ -205,14 +205,14 @@ static Node* make_tree(struct thread *t, int depth) { static void validate_tree(Node *tree, int depth) { #ifndef NDEBUG - ASSERT_EQ(tree->i, 0); - ASSERT_EQ(tree->j, depth); + GC_ASSERT_EQ(tree->i, 0); + GC_ASSERT_EQ(tree->j, depth); if (depth == 0) { - ASSERT(!tree->left); - ASSERT(!tree->right); + GC_ASSERT(!tree->left); + GC_ASSERT(!tree->right); } else { - ASSERT(tree->left); - ASSERT(tree->right); + GC_ASSERT(tree->left); + GC_ASSERT(tree->right); validate_tree(tree->left, depth - 1); validate_tree(tree->right, depth - 1); } diff --git a/src/whippet.c b/src/whippet.c index 4146eb88e..1ece0d3ff 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -1410,7 +1410,7 @@ static void mark_space_clear_remembered_set(struct mark_space *space) { void 
gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { - GC_ASSERT(size > gc_allocator_large_threshold()); + GC_ASSERT(obj_size > gc_allocator_large_threshold()); gc_object_set_remembered(obj); } From 514dcc702e74674716f442c9b1bfd9319059a119 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 18 Oct 2023 10:20:55 +0200 Subject: [PATCH 208/403] Allow embedder to request a major GC --- api/gc-api.h | 4 +- api/gc-basic-stats.h | 5 +- api/gc-collection-kind.h | 11 ++++ api/gc-event-listener.h | 4 +- api/gc-null-event-listener.h | 3 +- benchmarks/ephemerons.c | 3 +- src/bdw.c | 22 +++++-- src/semi.c | 6 +- src/whippet.c | 116 ++++++++++++++++++----------------- 9 files changed, 104 insertions(+), 70 deletions(-) create mode 100644 api/gc-collection-kind.h diff --git a/api/gc-api.h b/api/gc-api.h index c66959321..4831500fe 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -4,6 +4,7 @@ #include "gc-config.h" #include "gc-assert.h" #include "gc-attrs.h" +#include "gc-collection-kind.h" #include "gc-edge.h" #include "gc-event-listener.h" #include "gc-inline.h" @@ -47,7 +48,8 @@ GC_API_ void gc_finish_for_thread(struct gc_mutator *mut); GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; -GC_API_ void gc_collect(struct gc_mutator *mut); +GC_API_ void gc_collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind); static inline void gc_clear_fresh_allocation(struct gc_ref obj, size_t size) GC_ALWAYS_INLINE; diff --git a/api/gc-basic-stats.h b/api/gc-basic-stats.h index 8e57e40f1..0cf927b95 100644 --- a/api/gc-basic-stats.h +++ b/api/gc-basic-stats.h @@ -37,10 +37,9 @@ static inline void gc_basic_stats_init(void *data, size_t heap_size) { } static inline void gc_basic_stats_prepare_gc(void *data, - int is_minor, - int is_compacting) { + enum gc_collection_kind kind) { struct gc_basic_stats *stats = data; - if (is_minor) + if (kind == GC_COLLECTION_MINOR) stats->minor_collection_count++; else stats->major_collection_count++; diff --git a/api/gc-collection-kind.h b/api/gc-collection-kind.h new file mode 100644 index 000000000..11cfc276a --- /dev/null +++ b/api/gc-collection-kind.h @@ -0,0 +1,11 @@ +#ifndef GC_COLLECTION_KIND_H +#define GC_COLLECTION_KIND_H + +enum gc_collection_kind { + GC_COLLECTION_ANY, + GC_COLLECTION_MINOR, + GC_COLLECTION_MAJOR, + GC_COLLECTION_COMPACTING, +}; + +#endif // GC_COLLECTION_KIND_H diff --git a/api/gc-event-listener.h b/api/gc-event-listener.h index 57df09719..25558838b 100644 --- a/api/gc-event-listener.h +++ b/api/gc-event-listener.h @@ -1,9 +1,11 @@ #ifndef GC_EVENT_LISTENER_H #define GC_EVENT_LISTENER_H +#include "gc-collection-kind.h" + struct gc_event_listener { void (*init)(void *data, size_t heap_size); - void (*prepare_gc)(void *data, int is_minor, int is_compacting); + void (*prepare_gc)(void *data, enum gc_collection_kind kind); void (*requesting_stop)(void *data); void (*waiting_for_stop)(void *data); void (*mutators_stopped)(void *data); diff --git a/api/gc-null-event-listener.h b/api/gc-null-event-listener.h index 7060bd729..5ca17975e 100644 --- a/api/gc-null-event-listener.h +++ b/api/gc-null-event-listener.h @@ -5,8 +5,7 @@ static inline void gc_null_event_listener_init(void *data, size_t size) {} static inline void gc_null_event_listener_prepare_gc(void *data, - int is_minor, - int is_compacting) {} + enum gc_collection_kind) {} static inline void gc_null_event_listener_requesting_stop(void *data) {} static inline void 
gc_null_event_listener_waiting_for_stop(void *data) {} static inline void gc_null_event_listener_mutators_stopped(void *data) {} diff --git a/benchmarks/ephemerons.c b/benchmarks/ephemerons.c index 2193f1fe0..2262bd5c9 100644 --- a/benchmarks/ephemerons.c +++ b/benchmarks/ephemerons.c @@ -105,7 +105,8 @@ static double heap_multiplier; static size_t nthreads; static void cause_gc(struct gc_mutator *mut) { - gc_collect(mut); + // Doing a full collection lets us reason precisely about liveness. + gc_collect(mut, GC_COLLECTION_MAJOR); } static void make_ephemeron_chain(struct thread *t, EphemeronHandle *head, diff --git a/src/bdw.c b/src/bdw.c index 809ad8808..186b50a8c 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -125,8 +125,22 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, return GC_malloc_atomic(size); } -void gc_collect(struct gc_mutator *mut) { - GC_gcollect(); +void gc_collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { + switch (requested_kind) { + case GC_COLLECTION_MINOR: + GC_collect_a_little(); + break; + case GC_COLLECTION_ANY: + case GC_COLLECTION_MAJOR: + GC_gcollect(); + break; + case GC_COLLECTION_COMPACTING: + GC_gcollect_and_unmap(); + break; + default: + GC_CRASH(); + } } void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, @@ -330,9 +344,7 @@ gc_heap_pending_ephemerons(struct gc_heap *heap) { static void on_collection_event(GC_EventType event) { switch (event) { case GC_EVENT_START: { - int is_minor = 0; - int is_compacting = 0; - HEAP_EVENT(prepare_gc, is_minor, is_compacting); + HEAP_EVENT(prepare_gc, GC_COLLECTION_MAJOR); HEAP_EVENT(requesting_stop); HEAP_EVENT(waiting_for_stop); break; diff --git a/src/semi.c b/src/semi.c index 7097b2d49..fdcd03792 100644 --- a/src/semi.c +++ b/src/semi.c @@ -354,7 +354,7 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { struct gc_heap *heap = mutator_heap(mut); int is_minor = 0; int is_compacting = 1; - HEAP_EVENT(heap, prepare_gc, is_minor, is_compacting); + HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); HEAP_EVENT(heap, requesting_stop); HEAP_EVENT(heap, waiting_for_stop); @@ -414,7 +414,9 @@ static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) { GC_CRASH(); } -void gc_collect(struct gc_mutator *mut) { +void gc_collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { + // Ignore requested kind, because we always compact. 
collect(mut, 0); } diff --git a/src/whippet.c b/src/whippet.c index 1ece0d3ff..4c58db275 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -288,15 +288,6 @@ struct mark_space { uintptr_t fragmentation_granules_since_last_collection; // atomically }; -enum gc_kind { - GC_KIND_FLAG_MINOR = GC_GENERATIONAL, // 0 or 1 - GC_KIND_FLAG_EVACUATING = 0x2, - GC_KIND_MINOR_IN_PLACE = GC_KIND_FLAG_MINOR, - GC_KIND_MINOR_EVACUATING = GC_KIND_FLAG_MINOR | GC_KIND_FLAG_EVACUATING, - GC_KIND_MAJOR_IN_PLACE = 0, - GC_KIND_MAJOR_EVACUATING = GC_KIND_FLAG_EVACUATING, -}; - struct gc_heap { struct mark_space mark_space; struct large_object_space large_object_space; @@ -310,7 +301,7 @@ struct gc_heap { int mark_while_stopping; int check_pending_ephemerons; struct gc_pending_ephemerons *pending_ephemerons; - enum gc_kind gc_kind; + enum gc_collection_kind gc_kind; int multithreaded; size_t active_mutator_count; size_t mutator_count; @@ -318,7 +309,7 @@ struct gc_heap { struct gc_mutator *mutator_trace_list; long count; long minor_count; - uint8_t last_collection_was_minor; + enum gc_collection_kind last_collection_kind; struct gc_mutator *deactivated_mutators; struct tracer tracer; double fragmentation_low_threshold; @@ -380,7 +371,8 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; +static void collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) GC_NEVER_INLINE; static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { GC_ASSERT(((uintptr_t)mark & 7) == 0); @@ -1416,7 +1408,7 @@ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, static void trace_generational_roots(struct gc_heap *heap) { // TODO: Add lospace nursery. - if (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_MINOR) { + if (atomic_load(&heap->gc_kind) == GC_COLLECTION_MINOR) { mark_space_trace_remembered_set(heap_mark_space(heap), heap); large_object_space_trace_remembered_set(heap_large_object_space(heap), enqueue_generational_root, @@ -1580,10 +1572,12 @@ static double clamp_major_gc_yield_threshold(struct gc_heap *heap, return threshold; } -static enum gc_kind determine_collection_kind(struct gc_heap *heap) { +static enum gc_collection_kind +determine_collection_kind(struct gc_heap *heap, + enum gc_collection_kind requested) { struct mark_space *mark_space = heap_mark_space(heap); - enum gc_kind previous_gc_kind = atomic_load(&heap->gc_kind); - enum gc_kind gc_kind; + enum gc_collection_kind previous_gc_kind = atomic_load(&heap->gc_kind); + enum gc_collection_kind gc_kind; int mark_while_stopping = 1; double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); @@ -1592,13 +1586,16 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { if (heap->count == 0) { DEBUG("first collection is always major\n"); - gc_kind = GC_KIND_MAJOR_IN_PLACE; + gc_kind = GC_COLLECTION_MAJOR; + } else if (requested != GC_COLLECTION_ANY) { + DEBUG("user specifically requested collection kind %d\n", (int)requested); + gc_kind = requested; } else if (pending > 0) { DEBUG("evacuating due to need to reclaim %zd bytes\n", pending); // During the last cycle, a large allocation could not find enough // free blocks, and we decided not to expand the heap. Let's do an // evacuating major collection to maximize the free block yield. 
- gc_kind = GC_KIND_MAJOR_EVACUATING; + gc_kind = GC_COLLECTION_COMPACTING; // Generally speaking, we allow mutators to mark their own stacks // before pausing. This is a limited form of concurrent marking, as @@ -1610,7 +1607,7 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { // marking. Of course if the mutator has conservative roots we will // have pinning anyway and might as well allow ragged stops. mark_while_stopping = gc_has_conservative_roots(); - } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING + } else if (previous_gc_kind == GC_COLLECTION_COMPACTING && fragmentation >= heap->fragmentation_low_threshold) { DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", fragmentation * 100., @@ -1618,46 +1615,50 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { // For some reason, we already decided to compact in the past, // and fragmentation hasn't yet fallen below a low-water-mark. // Keep going. - gc_kind = GC_KIND_MAJOR_EVACUATING; + gc_kind = GC_COLLECTION_COMPACTING; } else if (fragmentation > heap->fragmentation_high_threshold) { // Switch to evacuation mode if the heap is too fragmented. DEBUG("triggering compaction due to fragmentation %.2f%% > %.2f%%\n", fragmentation * 100., heap->fragmentation_high_threshold * 100.); - gc_kind = GC_KIND_MAJOR_EVACUATING; - } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING) { + gc_kind = GC_COLLECTION_COMPACTING; + } else if (previous_gc_kind == GC_COLLECTION_COMPACTING) { // We were evacuating, but we're good now. Go back to minor // collections. DEBUG("returning to in-place collection, fragmentation %.2f%% < %.2f%%\n", fragmentation * 100., heap->fragmentation_low_threshold * 100.); - gc_kind = GC_KIND_MINOR_IN_PLACE; - } else if (previous_gc_kind != GC_KIND_MINOR_IN_PLACE) { + gc_kind = GC_GENERATIONAL ? GC_COLLECTION_MINOR : GC_COLLECTION_MAJOR; + } else if (!GC_GENERATIONAL) { + DEBUG("keeping on with major in-place GC\n"); + GC_ASSERT(previous_gc_kind == GC_COLLECTION_MAJOR); + gc_kind = GC_COLLECTION_MAJOR; + } else if (previous_gc_kind != GC_COLLECTION_MINOR) { DEBUG("returning to minor collection after major collection\n"); // Go back to minor collections. - gc_kind = GC_KIND_MINOR_IN_PLACE; + gc_kind = GC_COLLECTION_MINOR; } else if (yield < heap->major_gc_yield_threshold) { DEBUG("collection yield too low, triggering major collection\n"); // Nursery is getting tight; trigger a major GC. - gc_kind = GC_KIND_MAJOR_IN_PLACE; + gc_kind = GC_COLLECTION_MAJOR; } else { DEBUG("keeping on with minor GC\n"); // Nursery has adequate space; keep trucking with minor GCs. - GC_ASSERT(previous_gc_kind == GC_KIND_MINOR_IN_PLACE); - gc_kind = GC_KIND_MINOR_IN_PLACE; + GC_ASSERT(previous_gc_kind == GC_COLLECTION_MINOR); + gc_kind = GC_COLLECTION_MINOR; } if (gc_has_conservative_intraheap_edges() && - (gc_kind & GC_KIND_FLAG_EVACUATING)) { + gc_kind == GC_COLLECTION_COMPACTING) { DEBUG("welp. conservative heap scanning, no evacuation for you\n"); - gc_kind = GC_KIND_MAJOR_IN_PLACE; + gc_kind = GC_COLLECTION_MAJOR; mark_while_stopping = 1; } // If this is the first in a series of minor collections, reset the // threshold at which we should do a major GC. 
- if ((gc_kind & GC_KIND_FLAG_MINOR) && - (previous_gc_kind & GC_KIND_FLAG_MINOR) != GC_KIND_FLAG_MINOR) { + if (gc_kind == GC_COLLECTION_MINOR && + previous_gc_kind != GC_COLLECTION_MINOR) { double yield = heap_last_gc_yield(heap); double threshold = yield * heap->minor_gc_yield_threshold; double clamped = clamp_major_gc_yield_threshold(heap, threshold); @@ -1687,7 +1688,7 @@ static void release_evacuation_target_blocks(struct mark_space *space) { static void prepare_for_evacuation(struct gc_heap *heap) { struct mark_space *space = heap_mark_space(heap); - if ((heap->gc_kind & GC_KIND_FLAG_EVACUATING) == 0) { + if (heap->gc_kind != GC_COLLECTION_COMPACTING) { space->evacuating = 0; space->evacuation_reserve = space->evacuation_minimum_reserve; return; @@ -1796,7 +1797,7 @@ static void trace_roots_after_stop(struct gc_heap *heap) { } static void mark_space_finish_gc(struct mark_space *space, - enum gc_kind gc_kind) { + enum gc_collection_kind gc_kind) { space->evacuating = 0; reset_sweeper(space); update_mark_patterns(space, 0); @@ -1824,7 +1825,8 @@ static void sweep_ephemerons(struct gc_heap *heap) { return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); } -static void collect(struct gc_mutator *mut) { +static void collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); @@ -1835,12 +1837,12 @@ static void collect(struct gc_mutator *mut) { } MUTATOR_EVENT(mut, mutator_cause_gc); DEBUG("start collect #%ld:\n", heap->count); - enum gc_kind gc_kind = determine_collection_kind(heap); - HEAP_EVENT(heap, prepare_gc, gc_kind & GC_KIND_FLAG_MINOR, - gc_kind & GC_KIND_FLAG_EVACUATING); - update_mark_patterns(space, !(gc_kind & GC_KIND_FLAG_MINOR)); - large_object_space_start_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); - gc_extern_space_start_gc(exspace, gc_kind & GC_KIND_FLAG_MINOR); + enum gc_collection_kind gc_kind = + determine_collection_kind(heap, requested_kind); + HEAP_EVENT(heap, prepare_gc, gc_kind); + update_mark_patterns(space, gc_kind != GC_COLLECTION_MINOR); + large_object_space_start_gc(lospace, gc_kind == GC_COLLECTION_MINOR); + gc_extern_space_start_gc(exspace, gc_kind == GC_COLLECTION_MINOR); resolve_ephemerons_lazily(heap); tracer_prepare(heap); HEAP_EVENT(heap, requesting_stop); @@ -1868,11 +1870,11 @@ static void collect(struct gc_mutator *mut) { sweep_ephemerons(heap); tracer_release(heap); mark_space_finish_gc(space, gc_kind); - large_object_space_finish_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); - gc_extern_space_finish_gc(exspace, gc_kind & GC_KIND_FLAG_MINOR); + large_object_space_finish_gc(lospace, gc_kind == GC_COLLECTION_MINOR); + gc_extern_space_finish_gc(exspace, gc_kind == GC_COLLECTION_MINOR); heap->count++; - heap->last_collection_was_minor = gc_kind & GC_KIND_FLAG_MINOR; - if (heap->last_collection_was_minor) + heap->last_collection_kind = gc_kind; + if (gc_kind == GC_COLLECTION_MINOR) heap->minor_count++; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); HEAP_EVENT(heap, restarting_mutators); @@ -2079,7 +2081,7 @@ static size_t next_hole(struct gc_mutator *mut) { // mark bytes didn't rotate, so we have no cleanup to do; and // we shouldn't try to allocate into them as it's not worth // it. Any wasted space is measured as fragmentation. 
- if (mutator_heap(mut)->last_collection_was_minor) + if (mutator_heap(mut)->last_collection_kind == GC_COLLECTION_MINOR) continue; else block_summary_clear_flag(summary, BLOCK_VENERABLE); @@ -2092,7 +2094,7 @@ static size_t next_hole(struct gc_mutator *mut) { // mostly old data. Sweep any garbage, commit the mark as // venerable, and avoid allocating into it. block_summary_clear_flag(summary, BLOCK_VENERABLE_AFTER_SWEEP); - if (mutator_heap(mut)->last_collection_was_minor) { + if (mutator_heap(mut)->last_collection_kind == GC_COLLECTION_MINOR) { finish_sweeping_in_block(mut); block_summary_set_flag(summary, BLOCK_VENERABLE); continue; @@ -2153,18 +2155,22 @@ static void finish_sweeping(struct gc_mutator *mut) { finish_hole(mut); } -static void trigger_collection(struct gc_mutator *mut) { +static void trigger_collection(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); + enum gc_collection_kind last_collection_kind = GC_COLLECTION_ANY; heap_lock(heap); - if (mutators_are_stopping(heap)) + while (mutators_are_stopping(heap)) { pause_mutator_for_collection_with_lock(mut); - else - collect(mut); + last_collection_kind = heap->last_collection_kind; + } + if (last_collection_kind < requested_kind) + collect(mut, requested_kind); heap_unlock(heap); } -void gc_collect(struct gc_mutator *mut) { - trigger_collection(mut); +void gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { + trigger_collection(mut, kind); } static void* allocate_large(struct gc_mutator *mut, size_t size) { @@ -2177,7 +2183,7 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { npages << space->page_size_log2); while (!sweep_until_memory_released(mut)) - trigger_collection(mut); + trigger_collection(mut, GC_COLLECTION_COMPACTING); atomic_fetch_add(&heap->large_object_pages, npages); void *ret = large_object_space_alloc(space, npages); @@ -2215,7 +2221,7 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { break; } if (!hole) - trigger_collection(mut); + trigger_collection(mut, GC_COLLECTION_ANY); } ret = gc_ref(mut->alloc); mut->alloc += size; From 0c7bdacc5140ce0d05409015d13aa7a188011f89 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 18 Oct 2023 10:28:46 +0200 Subject: [PATCH 209/403] Revert "Allow embedder to request a major GC" This reverts commit f39e6ee69f0aa2e14227a019a5332ba129418977. 
--- api/gc-api.h | 4 +- api/gc-basic-stats.h | 5 +- api/gc-collection-kind.h | 11 ---- api/gc-event-listener.h | 4 +- api/gc-null-event-listener.h | 3 +- benchmarks/ephemerons.c | 3 +- src/bdw.c | 22 ++----- src/semi.c | 6 +- src/whippet.c | 116 +++++++++++++++++------------------ 9 files changed, 70 insertions(+), 104 deletions(-) delete mode 100644 api/gc-collection-kind.h diff --git a/api/gc-api.h b/api/gc-api.h index 4831500fe..c66959321 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -4,7 +4,6 @@ #include "gc-config.h" #include "gc-assert.h" #include "gc-attrs.h" -#include "gc-collection-kind.h" #include "gc-edge.h" #include "gc-event-listener.h" #include "gc-inline.h" @@ -48,8 +47,7 @@ GC_API_ void gc_finish_for_thread(struct gc_mutator *mut); GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; -GC_API_ void gc_collect(struct gc_mutator *mut, - enum gc_collection_kind requested_kind); +GC_API_ void gc_collect(struct gc_mutator *mut); static inline void gc_clear_fresh_allocation(struct gc_ref obj, size_t size) GC_ALWAYS_INLINE; diff --git a/api/gc-basic-stats.h b/api/gc-basic-stats.h index 0cf927b95..8e57e40f1 100644 --- a/api/gc-basic-stats.h +++ b/api/gc-basic-stats.h @@ -37,9 +37,10 @@ static inline void gc_basic_stats_init(void *data, size_t heap_size) { } static inline void gc_basic_stats_prepare_gc(void *data, - enum gc_collection_kind kind) { + int is_minor, + int is_compacting) { struct gc_basic_stats *stats = data; - if (kind == GC_COLLECTION_MINOR) + if (is_minor) stats->minor_collection_count++; else stats->major_collection_count++; diff --git a/api/gc-collection-kind.h b/api/gc-collection-kind.h deleted file mode 100644 index 11cfc276a..000000000 --- a/api/gc-collection-kind.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef GC_COLLECTION_KIND_H -#define GC_COLLECTION_KIND_H - -enum gc_collection_kind { - GC_COLLECTION_ANY, - GC_COLLECTION_MINOR, - GC_COLLECTION_MAJOR, - GC_COLLECTION_COMPACTING, -}; - -#endif // GC_COLLECTION_KIND_H diff --git a/api/gc-event-listener.h b/api/gc-event-listener.h index 25558838b..57df09719 100644 --- a/api/gc-event-listener.h +++ b/api/gc-event-listener.h @@ -1,11 +1,9 @@ #ifndef GC_EVENT_LISTENER_H #define GC_EVENT_LISTENER_H -#include "gc-collection-kind.h" - struct gc_event_listener { void (*init)(void *data, size_t heap_size); - void (*prepare_gc)(void *data, enum gc_collection_kind kind); + void (*prepare_gc)(void *data, int is_minor, int is_compacting); void (*requesting_stop)(void *data); void (*waiting_for_stop)(void *data); void (*mutators_stopped)(void *data); diff --git a/api/gc-null-event-listener.h b/api/gc-null-event-listener.h index 5ca17975e..7060bd729 100644 --- a/api/gc-null-event-listener.h +++ b/api/gc-null-event-listener.h @@ -5,7 +5,8 @@ static inline void gc_null_event_listener_init(void *data, size_t size) {} static inline void gc_null_event_listener_prepare_gc(void *data, - enum gc_collection_kind) {} + int is_minor, + int is_compacting) {} static inline void gc_null_event_listener_requesting_stop(void *data) {} static inline void gc_null_event_listener_waiting_for_stop(void *data) {} static inline void gc_null_event_listener_mutators_stopped(void *data) {} diff --git a/benchmarks/ephemerons.c b/benchmarks/ephemerons.c index 2262bd5c9..2193f1fe0 100644 --- a/benchmarks/ephemerons.c +++ b/benchmarks/ephemerons.c @@ -105,8 +105,7 @@ static double heap_multiplier; static size_t nthreads; static void cause_gc(struct gc_mutator *mut) { - // Doing a full collection lets us 
reason precisely about liveness. - gc_collect(mut, GC_COLLECTION_MAJOR); + gc_collect(mut); } static void make_ephemeron_chain(struct thread *t, EphemeronHandle *head, diff --git a/src/bdw.c b/src/bdw.c index 186b50a8c..809ad8808 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -125,22 +125,8 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, return GC_malloc_atomic(size); } -void gc_collect(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) { - switch (requested_kind) { - case GC_COLLECTION_MINOR: - GC_collect_a_little(); - break; - case GC_COLLECTION_ANY: - case GC_COLLECTION_MAJOR: - GC_gcollect(); - break; - case GC_COLLECTION_COMPACTING: - GC_gcollect_and_unmap(); - break; - default: - GC_CRASH(); - } +void gc_collect(struct gc_mutator *mut) { + GC_gcollect(); } void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, @@ -344,7 +330,9 @@ gc_heap_pending_ephemerons(struct gc_heap *heap) { static void on_collection_event(GC_EventType event) { switch (event) { case GC_EVENT_START: { - HEAP_EVENT(prepare_gc, GC_COLLECTION_MAJOR); + int is_minor = 0; + int is_compacting = 0; + HEAP_EVENT(prepare_gc, is_minor, is_compacting); HEAP_EVENT(requesting_stop); HEAP_EVENT(waiting_for_stop); break; diff --git a/src/semi.c b/src/semi.c index fdcd03792..7097b2d49 100644 --- a/src/semi.c +++ b/src/semi.c @@ -354,7 +354,7 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { struct gc_heap *heap = mutator_heap(mut); int is_minor = 0; int is_compacting = 1; - HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); + HEAP_EVENT(heap, prepare_gc, is_minor, is_compacting); HEAP_EVENT(heap, requesting_stop); HEAP_EVENT(heap, waiting_for_stop); @@ -414,9 +414,7 @@ static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) { GC_CRASH(); } -void gc_collect(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) { - // Ignore requested kind, because we always compact. 
+void gc_collect(struct gc_mutator *mut) { collect(mut, 0); } diff --git a/src/whippet.c b/src/whippet.c index 4c58db275..1ece0d3ff 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -288,6 +288,15 @@ struct mark_space { uintptr_t fragmentation_granules_since_last_collection; // atomically }; +enum gc_kind { + GC_KIND_FLAG_MINOR = GC_GENERATIONAL, // 0 or 1 + GC_KIND_FLAG_EVACUATING = 0x2, + GC_KIND_MINOR_IN_PLACE = GC_KIND_FLAG_MINOR, + GC_KIND_MINOR_EVACUATING = GC_KIND_FLAG_MINOR | GC_KIND_FLAG_EVACUATING, + GC_KIND_MAJOR_IN_PLACE = 0, + GC_KIND_MAJOR_EVACUATING = GC_KIND_FLAG_EVACUATING, +}; + struct gc_heap { struct mark_space mark_space; struct large_object_space large_object_space; @@ -301,7 +310,7 @@ struct gc_heap { int mark_while_stopping; int check_pending_ephemerons; struct gc_pending_ephemerons *pending_ephemerons; - enum gc_collection_kind gc_kind; + enum gc_kind gc_kind; int multithreaded; size_t active_mutator_count; size_t mutator_count; @@ -309,7 +318,7 @@ struct gc_heap { struct gc_mutator *mutator_trace_list; long count; long minor_count; - enum gc_collection_kind last_collection_kind; + uint8_t last_collection_was_minor; struct gc_mutator *deactivated_mutators; struct tracer tracer; double fragmentation_low_threshold; @@ -371,8 +380,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) GC_NEVER_INLINE; +static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { GC_ASSERT(((uintptr_t)mark & 7) == 0); @@ -1408,7 +1416,7 @@ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, static void trace_generational_roots(struct gc_heap *heap) { // TODO: Add lospace nursery. - if (atomic_load(&heap->gc_kind) == GC_COLLECTION_MINOR) { + if (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_MINOR) { mark_space_trace_remembered_set(heap_mark_space(heap), heap); large_object_space_trace_remembered_set(heap_large_object_space(heap), enqueue_generational_root, @@ -1572,12 +1580,10 @@ static double clamp_major_gc_yield_threshold(struct gc_heap *heap, return threshold; } -static enum gc_collection_kind -determine_collection_kind(struct gc_heap *heap, - enum gc_collection_kind requested) { +static enum gc_kind determine_collection_kind(struct gc_heap *heap) { struct mark_space *mark_space = heap_mark_space(heap); - enum gc_collection_kind previous_gc_kind = atomic_load(&heap->gc_kind); - enum gc_collection_kind gc_kind; + enum gc_kind previous_gc_kind = atomic_load(&heap->gc_kind); + enum gc_kind gc_kind; int mark_while_stopping = 1; double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); @@ -1586,16 +1592,13 @@ determine_collection_kind(struct gc_heap *heap, if (heap->count == 0) { DEBUG("first collection is always major\n"); - gc_kind = GC_COLLECTION_MAJOR; - } else if (requested != GC_COLLECTION_ANY) { - DEBUG("user specifically requested collection kind %d\n", (int)requested); - gc_kind = requested; + gc_kind = GC_KIND_MAJOR_IN_PLACE; } else if (pending > 0) { DEBUG("evacuating due to need to reclaim %zd bytes\n", pending); // During the last cycle, a large allocation could not find enough // free blocks, and we decided not to expand the heap. Let's do an // evacuating major collection to maximize the free block yield. 
- gc_kind = GC_COLLECTION_COMPACTING; + gc_kind = GC_KIND_MAJOR_EVACUATING; // Generally speaking, we allow mutators to mark their own stacks // before pausing. This is a limited form of concurrent marking, as @@ -1607,7 +1610,7 @@ determine_collection_kind(struct gc_heap *heap, // marking. Of course if the mutator has conservative roots we will // have pinning anyway and might as well allow ragged stops. mark_while_stopping = gc_has_conservative_roots(); - } else if (previous_gc_kind == GC_COLLECTION_COMPACTING + } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING && fragmentation >= heap->fragmentation_low_threshold) { DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", fragmentation * 100., @@ -1615,50 +1618,46 @@ determine_collection_kind(struct gc_heap *heap, // For some reason, we already decided to compact in the past, // and fragmentation hasn't yet fallen below a low-water-mark. // Keep going. - gc_kind = GC_COLLECTION_COMPACTING; + gc_kind = GC_KIND_MAJOR_EVACUATING; } else if (fragmentation > heap->fragmentation_high_threshold) { // Switch to evacuation mode if the heap is too fragmented. DEBUG("triggering compaction due to fragmentation %.2f%% > %.2f%%\n", fragmentation * 100., heap->fragmentation_high_threshold * 100.); - gc_kind = GC_COLLECTION_COMPACTING; - } else if (previous_gc_kind == GC_COLLECTION_COMPACTING) { + gc_kind = GC_KIND_MAJOR_EVACUATING; + } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING) { // We were evacuating, but we're good now. Go back to minor // collections. DEBUG("returning to in-place collection, fragmentation %.2f%% < %.2f%%\n", fragmentation * 100., heap->fragmentation_low_threshold * 100.); - gc_kind = GC_GENERATIONAL ? GC_COLLECTION_MINOR : GC_COLLECTION_MAJOR; - } else if (!GC_GENERATIONAL) { - DEBUG("keeping on with major in-place GC\n"); - GC_ASSERT(previous_gc_kind == GC_COLLECTION_MAJOR); - gc_kind = GC_COLLECTION_MAJOR; - } else if (previous_gc_kind != GC_COLLECTION_MINOR) { + gc_kind = GC_KIND_MINOR_IN_PLACE; + } else if (previous_gc_kind != GC_KIND_MINOR_IN_PLACE) { DEBUG("returning to minor collection after major collection\n"); // Go back to minor collections. - gc_kind = GC_COLLECTION_MINOR; + gc_kind = GC_KIND_MINOR_IN_PLACE; } else if (yield < heap->major_gc_yield_threshold) { DEBUG("collection yield too low, triggering major collection\n"); // Nursery is getting tight; trigger a major GC. - gc_kind = GC_COLLECTION_MAJOR; + gc_kind = GC_KIND_MAJOR_IN_PLACE; } else { DEBUG("keeping on with minor GC\n"); // Nursery has adequate space; keep trucking with minor GCs. - GC_ASSERT(previous_gc_kind == GC_COLLECTION_MINOR); - gc_kind = GC_COLLECTION_MINOR; + GC_ASSERT(previous_gc_kind == GC_KIND_MINOR_IN_PLACE); + gc_kind = GC_KIND_MINOR_IN_PLACE; } if (gc_has_conservative_intraheap_edges() && - gc_kind == GC_COLLECTION_COMPACTING) { + (gc_kind & GC_KIND_FLAG_EVACUATING)) { DEBUG("welp. conservative heap scanning, no evacuation for you\n"); - gc_kind = GC_COLLECTION_MAJOR; + gc_kind = GC_KIND_MAJOR_IN_PLACE; mark_while_stopping = 1; } // If this is the first in a series of minor collections, reset the // threshold at which we should do a major GC. 
- if (gc_kind == GC_COLLECTION_MINOR && - previous_gc_kind != GC_COLLECTION_MINOR) { + if ((gc_kind & GC_KIND_FLAG_MINOR) && + (previous_gc_kind & GC_KIND_FLAG_MINOR) != GC_KIND_FLAG_MINOR) { double yield = heap_last_gc_yield(heap); double threshold = yield * heap->minor_gc_yield_threshold; double clamped = clamp_major_gc_yield_threshold(heap, threshold); @@ -1688,7 +1687,7 @@ static void release_evacuation_target_blocks(struct mark_space *space) { static void prepare_for_evacuation(struct gc_heap *heap) { struct mark_space *space = heap_mark_space(heap); - if (heap->gc_kind != GC_COLLECTION_COMPACTING) { + if ((heap->gc_kind & GC_KIND_FLAG_EVACUATING) == 0) { space->evacuating = 0; space->evacuation_reserve = space->evacuation_minimum_reserve; return; @@ -1797,7 +1796,7 @@ static void trace_roots_after_stop(struct gc_heap *heap) { } static void mark_space_finish_gc(struct mark_space *space, - enum gc_collection_kind gc_kind) { + enum gc_kind gc_kind) { space->evacuating = 0; reset_sweeper(space); update_mark_patterns(space, 0); @@ -1825,8 +1824,7 @@ static void sweep_ephemerons(struct gc_heap *heap) { return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); } -static void collect(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) { +static void collect(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); @@ -1837,12 +1835,12 @@ static void collect(struct gc_mutator *mut, } MUTATOR_EVENT(mut, mutator_cause_gc); DEBUG("start collect #%ld:\n", heap->count); - enum gc_collection_kind gc_kind = - determine_collection_kind(heap, requested_kind); - HEAP_EVENT(heap, prepare_gc, gc_kind); - update_mark_patterns(space, gc_kind != GC_COLLECTION_MINOR); - large_object_space_start_gc(lospace, gc_kind == GC_COLLECTION_MINOR); - gc_extern_space_start_gc(exspace, gc_kind == GC_COLLECTION_MINOR); + enum gc_kind gc_kind = determine_collection_kind(heap); + HEAP_EVENT(heap, prepare_gc, gc_kind & GC_KIND_FLAG_MINOR, + gc_kind & GC_KIND_FLAG_EVACUATING); + update_mark_patterns(space, !(gc_kind & GC_KIND_FLAG_MINOR)); + large_object_space_start_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); + gc_extern_space_start_gc(exspace, gc_kind & GC_KIND_FLAG_MINOR); resolve_ephemerons_lazily(heap); tracer_prepare(heap); HEAP_EVENT(heap, requesting_stop); @@ -1870,11 +1868,11 @@ static void collect(struct gc_mutator *mut, sweep_ephemerons(heap); tracer_release(heap); mark_space_finish_gc(space, gc_kind); - large_object_space_finish_gc(lospace, gc_kind == GC_COLLECTION_MINOR); - gc_extern_space_finish_gc(exspace, gc_kind == GC_COLLECTION_MINOR); + large_object_space_finish_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); + gc_extern_space_finish_gc(exspace, gc_kind & GC_KIND_FLAG_MINOR); heap->count++; - heap->last_collection_kind = gc_kind; - if (gc_kind == GC_COLLECTION_MINOR) + heap->last_collection_was_minor = gc_kind & GC_KIND_FLAG_MINOR; + if (heap->last_collection_was_minor) heap->minor_count++; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); HEAP_EVENT(heap, restarting_mutators); @@ -2081,7 +2079,7 @@ static size_t next_hole(struct gc_mutator *mut) { // mark bytes didn't rotate, so we have no cleanup to do; and // we shouldn't try to allocate into them as it's not worth // it. Any wasted space is measured as fragmentation. 
- if (mutator_heap(mut)->last_collection_kind == GC_COLLECTION_MINOR) + if (mutator_heap(mut)->last_collection_was_minor) continue; else block_summary_clear_flag(summary, BLOCK_VENERABLE); @@ -2094,7 +2092,7 @@ static size_t next_hole(struct gc_mutator *mut) { // mostly old data. Sweep any garbage, commit the mark as // venerable, and avoid allocating into it. block_summary_clear_flag(summary, BLOCK_VENERABLE_AFTER_SWEEP); - if (mutator_heap(mut)->last_collection_kind == GC_COLLECTION_MINOR) { + if (mutator_heap(mut)->last_collection_was_minor) { finish_sweeping_in_block(mut); block_summary_set_flag(summary, BLOCK_VENERABLE); continue; @@ -2155,22 +2153,18 @@ static void finish_sweeping(struct gc_mutator *mut) { finish_hole(mut); } -static void trigger_collection(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) { +static void trigger_collection(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); - enum gc_collection_kind last_collection_kind = GC_COLLECTION_ANY; heap_lock(heap); - while (mutators_are_stopping(heap)) { + if (mutators_are_stopping(heap)) pause_mutator_for_collection_with_lock(mut); - last_collection_kind = heap->last_collection_kind; - } - if (last_collection_kind < requested_kind) - collect(mut, requested_kind); + else + collect(mut); heap_unlock(heap); } -void gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { - trigger_collection(mut, kind); +void gc_collect(struct gc_mutator *mut) { + trigger_collection(mut); } static void* allocate_large(struct gc_mutator *mut, size_t size) { @@ -2183,7 +2177,7 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { npages << space->page_size_log2); while (!sweep_until_memory_released(mut)) - trigger_collection(mut, GC_COLLECTION_COMPACTING); + trigger_collection(mut); atomic_fetch_add(&heap->large_object_pages, npages); void *ret = large_object_space_alloc(space, npages); @@ -2221,7 +2215,7 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { break; } if (!hole) - trigger_collection(mut, GC_COLLECTION_ANY); + trigger_collection(mut); } ret = gc_ref(mut->alloc); mut->alloc += size; From cfc8c8a9b8472fc843648385ed8a75f0ddbb59ac Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 18 Oct 2023 15:18:35 +0200 Subject: [PATCH 210/403] Allow embedder to request a major GC --- api/gc-api.h | 4 +- api/gc-basic-stats.h | 5 +- api/gc-collection-kind.h | 11 +++ api/gc-event-listener.h | 4 +- api/gc-null-event-listener.h | 3 +- benchmarks/ephemerons.c | 3 +- src/bdw.c | 22 ++++-- src/semi.c | 6 +- src/whippet.c | 135 +++++++++++++++++++---------------- 9 files changed, 115 insertions(+), 78 deletions(-) create mode 100644 api/gc-collection-kind.h diff --git a/api/gc-api.h b/api/gc-api.h index c66959321..4831500fe 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -4,6 +4,7 @@ #include "gc-config.h" #include "gc-assert.h" #include "gc-attrs.h" +#include "gc-collection-kind.h" #include "gc-edge.h" #include "gc-event-listener.h" #include "gc-inline.h" @@ -47,7 +48,8 @@ GC_API_ void gc_finish_for_thread(struct gc_mutator *mut); GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) GC_NEVER_INLINE; -GC_API_ void gc_collect(struct gc_mutator *mut); +GC_API_ void gc_collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind); static inline void gc_clear_fresh_allocation(struct gc_ref obj, size_t size) GC_ALWAYS_INLINE; diff --git a/api/gc-basic-stats.h b/api/gc-basic-stats.h index 8e57e40f1..0cf927b95 100644 --- a/api/gc-basic-stats.h 
+++ b/api/gc-basic-stats.h @@ -37,10 +37,9 @@ static inline void gc_basic_stats_init(void *data, size_t heap_size) { } static inline void gc_basic_stats_prepare_gc(void *data, - int is_minor, - int is_compacting) { + enum gc_collection_kind kind) { struct gc_basic_stats *stats = data; - if (is_minor) + if (kind == GC_COLLECTION_MINOR) stats->minor_collection_count++; else stats->major_collection_count++; diff --git a/api/gc-collection-kind.h b/api/gc-collection-kind.h new file mode 100644 index 000000000..11cfc276a --- /dev/null +++ b/api/gc-collection-kind.h @@ -0,0 +1,11 @@ +#ifndef GC_COLLECTION_KIND_H +#define GC_COLLECTION_KIND_H + +enum gc_collection_kind { + GC_COLLECTION_ANY, + GC_COLLECTION_MINOR, + GC_COLLECTION_MAJOR, + GC_COLLECTION_COMPACTING, +}; + +#endif // GC_COLLECTION_KIND_H diff --git a/api/gc-event-listener.h b/api/gc-event-listener.h index 57df09719..25558838b 100644 --- a/api/gc-event-listener.h +++ b/api/gc-event-listener.h @@ -1,9 +1,11 @@ #ifndef GC_EVENT_LISTENER_H #define GC_EVENT_LISTENER_H +#include "gc-collection-kind.h" + struct gc_event_listener { void (*init)(void *data, size_t heap_size); - void (*prepare_gc)(void *data, int is_minor, int is_compacting); + void (*prepare_gc)(void *data, enum gc_collection_kind kind); void (*requesting_stop)(void *data); void (*waiting_for_stop)(void *data); void (*mutators_stopped)(void *data); diff --git a/api/gc-null-event-listener.h b/api/gc-null-event-listener.h index 7060bd729..5ca17975e 100644 --- a/api/gc-null-event-listener.h +++ b/api/gc-null-event-listener.h @@ -5,8 +5,7 @@ static inline void gc_null_event_listener_init(void *data, size_t size) {} static inline void gc_null_event_listener_prepare_gc(void *data, - int is_minor, - int is_compacting) {} + enum gc_collection_kind) {} static inline void gc_null_event_listener_requesting_stop(void *data) {} static inline void gc_null_event_listener_waiting_for_stop(void *data) {} static inline void gc_null_event_listener_mutators_stopped(void *data) {} diff --git a/benchmarks/ephemerons.c b/benchmarks/ephemerons.c index 2193f1fe0..2262bd5c9 100644 --- a/benchmarks/ephemerons.c +++ b/benchmarks/ephemerons.c @@ -105,7 +105,8 @@ static double heap_multiplier; static size_t nthreads; static void cause_gc(struct gc_mutator *mut) { - gc_collect(mut); + // Doing a full collection lets us reason precisely about liveness. 
+ gc_collect(mut, GC_COLLECTION_MAJOR); } static void make_ephemeron_chain(struct thread *t, EphemeronHandle *head, diff --git a/src/bdw.c b/src/bdw.c index 809ad8808..186b50a8c 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -125,8 +125,22 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, return GC_malloc_atomic(size); } -void gc_collect(struct gc_mutator *mut) { - GC_gcollect(); +void gc_collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { + switch (requested_kind) { + case GC_COLLECTION_MINOR: + GC_collect_a_little(); + break; + case GC_COLLECTION_ANY: + case GC_COLLECTION_MAJOR: + GC_gcollect(); + break; + case GC_COLLECTION_COMPACTING: + GC_gcollect_and_unmap(); + break; + default: + GC_CRASH(); + } } void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, @@ -330,9 +344,7 @@ gc_heap_pending_ephemerons(struct gc_heap *heap) { static void on_collection_event(GC_EventType event) { switch (event) { case GC_EVENT_START: { - int is_minor = 0; - int is_compacting = 0; - HEAP_EVENT(prepare_gc, is_minor, is_compacting); + HEAP_EVENT(prepare_gc, GC_COLLECTION_MAJOR); HEAP_EVENT(requesting_stop); HEAP_EVENT(waiting_for_stop); break; diff --git a/src/semi.c b/src/semi.c index 7097b2d49..fdcd03792 100644 --- a/src/semi.c +++ b/src/semi.c @@ -354,7 +354,7 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { struct gc_heap *heap = mutator_heap(mut); int is_minor = 0; int is_compacting = 1; - HEAP_EVENT(heap, prepare_gc, is_minor, is_compacting); + HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); HEAP_EVENT(heap, requesting_stop); HEAP_EVENT(heap, waiting_for_stop); @@ -414,7 +414,9 @@ static void collect_for_alloc(struct gc_mutator *mut, size_t bytes) { GC_CRASH(); } -void gc_collect(struct gc_mutator *mut) { +void gc_collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { + // Ignore requested kind, because we always compact. 
collect(mut, 0); } diff --git a/src/whippet.c b/src/whippet.c index 1ece0d3ff..5deba1a64 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -288,15 +288,6 @@ struct mark_space { uintptr_t fragmentation_granules_since_last_collection; // atomically }; -enum gc_kind { - GC_KIND_FLAG_MINOR = GC_GENERATIONAL, // 0 or 1 - GC_KIND_FLAG_EVACUATING = 0x2, - GC_KIND_MINOR_IN_PLACE = GC_KIND_FLAG_MINOR, - GC_KIND_MINOR_EVACUATING = GC_KIND_FLAG_MINOR | GC_KIND_FLAG_EVACUATING, - GC_KIND_MAJOR_IN_PLACE = 0, - GC_KIND_MAJOR_EVACUATING = GC_KIND_FLAG_EVACUATING, -}; - struct gc_heap { struct mark_space mark_space; struct large_object_space large_object_space; @@ -310,14 +301,13 @@ struct gc_heap { int mark_while_stopping; int check_pending_ephemerons; struct gc_pending_ephemerons *pending_ephemerons; - enum gc_kind gc_kind; + enum gc_collection_kind gc_kind; int multithreaded; size_t active_mutator_count; size_t mutator_count; struct gc_heap_roots *roots; struct gc_mutator *mutator_trace_list; long count; - long minor_count; uint8_t last_collection_was_minor; struct gc_mutator *deactivated_mutators; struct tracer tracer; @@ -380,7 +370,8 @@ static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); } -static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; +static void collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) GC_NEVER_INLINE; static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { GC_ASSERT(((uintptr_t)mark & 7) == 0); @@ -1416,7 +1407,7 @@ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, static void trace_generational_roots(struct gc_heap *heap) { // TODO: Add lospace nursery. - if (atomic_load(&heap->gc_kind) & GC_KIND_FLAG_MINOR) { + if (atomic_load(&heap->gc_kind) == GC_COLLECTION_MINOR) { mark_space_trace_remembered_set(heap_mark_space(heap), heap); large_object_space_trace_remembered_set(heap_large_object_space(heap), enqueue_generational_root, @@ -1427,14 +1418,16 @@ static void trace_generational_roots(struct gc_heap *heap) { } } -static void pause_mutator_for_collection(struct gc_heap *heap, - struct gc_mutator *mut) GC_NEVER_INLINE; -static void pause_mutator_for_collection(struct gc_heap *heap, - struct gc_mutator *mut) { +static enum gc_collection_kind +pause_mutator_for_collection(struct gc_heap *heap, + struct gc_mutator *mut) GC_NEVER_INLINE; +static enum gc_collection_kind +pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mutators_are_stopping(heap)); GC_ASSERT(heap->active_mutator_count); MUTATOR_EVENT(mut, mutator_stopped); heap->active_mutator_count--; + enum gc_collection_kind collection_kind = heap->gc_kind; if (heap->active_mutator_count == 0) pthread_cond_signal(&heap->collector_cond); @@ -1452,10 +1445,13 @@ static void pause_mutator_for_collection(struct gc_heap *heap, MUTATOR_EVENT(mut, mutator_restarted); heap->active_mutator_count++; + return collection_kind; } -static void pause_mutator_for_collection_with_lock(struct gc_mutator *mut) GC_NEVER_INLINE; -static void pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { +static enum gc_collection_kind +pause_mutator_for_collection_with_lock(struct gc_mutator *mut) GC_NEVER_INLINE; +static enum gc_collection_kind +pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); MUTATOR_EVENT(mut, mutator_stopping); @@ -1466,7 +1462,7 @@ static void pause_mutator_for_collection_with_lock(struct gc_mutator 
*mut) { trace_mutator_roots_with_lock(mut); else enqueue_mutator_for_tracing(mut); - pause_mutator_for_collection(heap, mut); + return pause_mutator_for_collection(heap, mut); } static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) GC_NEVER_INLINE; @@ -1580,25 +1576,32 @@ static double clamp_major_gc_yield_threshold(struct gc_heap *heap, return threshold; } -static enum gc_kind determine_collection_kind(struct gc_heap *heap) { +static enum gc_collection_kind +determine_collection_kind(struct gc_heap *heap, + enum gc_collection_kind requested) { struct mark_space *mark_space = heap_mark_space(heap); - enum gc_kind previous_gc_kind = atomic_load(&heap->gc_kind); - enum gc_kind gc_kind; + enum gc_collection_kind previous_gc_kind = atomic_load(&heap->gc_kind); + enum gc_collection_kind gc_kind; int mark_while_stopping = 1; double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); ssize_t pending = atomic_load_explicit(&mark_space->pending_unavailable_bytes, memory_order_acquire); + DEBUG("hiiiiii\n"); + if (heap->count == 0) { DEBUG("first collection is always major\n"); - gc_kind = GC_KIND_MAJOR_IN_PLACE; + gc_kind = GC_COLLECTION_MAJOR; + } else if (requested != GC_COLLECTION_ANY) { + DEBUG("user specifically requested collection kind %d\n", (int)requested); + gc_kind = requested; } else if (pending > 0) { DEBUG("evacuating due to need to reclaim %zd bytes\n", pending); // During the last cycle, a large allocation could not find enough // free blocks, and we decided not to expand the heap. Let's do an // evacuating major collection to maximize the free block yield. - gc_kind = GC_KIND_MAJOR_EVACUATING; + gc_kind = GC_COLLECTION_COMPACTING; // Generally speaking, we allow mutators to mark their own stacks // before pausing. This is a limited form of concurrent marking, as @@ -1610,7 +1613,7 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { // marking. Of course if the mutator has conservative roots we will // have pinning anyway and might as well allow ragged stops. mark_while_stopping = gc_has_conservative_roots(); - } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING + } else if (previous_gc_kind == GC_COLLECTION_COMPACTING && fragmentation >= heap->fragmentation_low_threshold) { DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", fragmentation * 100., @@ -1618,46 +1621,50 @@ static enum gc_kind determine_collection_kind(struct gc_heap *heap) { // For some reason, we already decided to compact in the past, // and fragmentation hasn't yet fallen below a low-water-mark. // Keep going. - gc_kind = GC_KIND_MAJOR_EVACUATING; + gc_kind = GC_COLLECTION_COMPACTING; } else if (fragmentation > heap->fragmentation_high_threshold) { // Switch to evacuation mode if the heap is too fragmented. DEBUG("triggering compaction due to fragmentation %.2f%% > %.2f%%\n", fragmentation * 100., heap->fragmentation_high_threshold * 100.); - gc_kind = GC_KIND_MAJOR_EVACUATING; - } else if (previous_gc_kind == GC_KIND_MAJOR_EVACUATING) { + gc_kind = GC_COLLECTION_COMPACTING; + } else if (previous_gc_kind == GC_COLLECTION_COMPACTING) { // We were evacuating, but we're good now. Go back to minor // collections. 
DEBUG("returning to in-place collection, fragmentation %.2f%% < %.2f%%\n", fragmentation * 100., heap->fragmentation_low_threshold * 100.); - gc_kind = GC_KIND_MINOR_IN_PLACE; - } else if (previous_gc_kind != GC_KIND_MINOR_IN_PLACE) { - DEBUG("returning to minor collection after major collection\n"); + gc_kind = GC_GENERATIONAL ? GC_COLLECTION_MINOR : GC_COLLECTION_MAJOR; + } else if (!GC_GENERATIONAL) { + DEBUG("keeping on with major in-place GC\n"); + GC_ASSERT(previous_gc_kind == GC_COLLECTION_MAJOR); + gc_kind = GC_COLLECTION_MAJOR; + } else if (previous_gc_kind != GC_COLLECTION_MINOR) { + DEBUG("returning to minor collection\n"); // Go back to minor collections. - gc_kind = GC_KIND_MINOR_IN_PLACE; + gc_kind = GC_COLLECTION_MINOR; } else if (yield < heap->major_gc_yield_threshold) { DEBUG("collection yield too low, triggering major collection\n"); // Nursery is getting tight; trigger a major GC. - gc_kind = GC_KIND_MAJOR_IN_PLACE; + gc_kind = GC_COLLECTION_MAJOR; } else { DEBUG("keeping on with minor GC\n"); // Nursery has adequate space; keep trucking with minor GCs. - GC_ASSERT(previous_gc_kind == GC_KIND_MINOR_IN_PLACE); - gc_kind = GC_KIND_MINOR_IN_PLACE; + GC_ASSERT(previous_gc_kind == GC_COLLECTION_MINOR); + gc_kind = GC_COLLECTION_MINOR; } if (gc_has_conservative_intraheap_edges() && - (gc_kind & GC_KIND_FLAG_EVACUATING)) { + gc_kind == GC_COLLECTION_COMPACTING) { DEBUG("welp. conservative heap scanning, no evacuation for you\n"); - gc_kind = GC_KIND_MAJOR_IN_PLACE; + gc_kind = GC_COLLECTION_MAJOR; mark_while_stopping = 1; } // If this is the first in a series of minor collections, reset the // threshold at which we should do a major GC. - if ((gc_kind & GC_KIND_FLAG_MINOR) && - (previous_gc_kind & GC_KIND_FLAG_MINOR) != GC_KIND_FLAG_MINOR) { + if (gc_kind == GC_COLLECTION_MINOR && + previous_gc_kind != GC_COLLECTION_MINOR) { double yield = heap_last_gc_yield(heap); double threshold = yield * heap->minor_gc_yield_threshold; double clamped = clamp_major_gc_yield_threshold(heap, threshold); @@ -1687,7 +1694,7 @@ static void release_evacuation_target_blocks(struct mark_space *space) { static void prepare_for_evacuation(struct gc_heap *heap) { struct mark_space *space = heap_mark_space(heap); - if ((heap->gc_kind & GC_KIND_FLAG_EVACUATING) == 0) { + if (heap->gc_kind != GC_COLLECTION_COMPACTING) { space->evacuating = 0; space->evacuation_reserve = space->evacuation_minimum_reserve; return; @@ -1796,7 +1803,7 @@ static void trace_roots_after_stop(struct gc_heap *heap) { } static void mark_space_finish_gc(struct mark_space *space, - enum gc_kind gc_kind) { + enum gc_collection_kind gc_kind) { space->evacuating = 0; reset_sweeper(space); update_mark_patterns(space, 0); @@ -1824,7 +1831,8 @@ static void sweep_ephemerons(struct gc_heap *heap) { return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); } -static void collect(struct gc_mutator *mut) { +static void collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); struct mark_space *space = heap_mark_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); @@ -1835,12 +1843,13 @@ static void collect(struct gc_mutator *mut) { } MUTATOR_EVENT(mut, mutator_cause_gc); DEBUG("start collect #%ld:\n", heap->count); - enum gc_kind gc_kind = determine_collection_kind(heap); - HEAP_EVENT(heap, prepare_gc, gc_kind & GC_KIND_FLAG_MINOR, - gc_kind & GC_KIND_FLAG_EVACUATING); - update_mark_patterns(space, !(gc_kind & GC_KIND_FLAG_MINOR)); - 
large_object_space_start_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); - gc_extern_space_start_gc(exspace, gc_kind & GC_KIND_FLAG_MINOR); + enum gc_collection_kind gc_kind = + determine_collection_kind(heap, requested_kind); + int is_minor = gc_kind == GC_COLLECTION_MINOR; + HEAP_EVENT(heap, prepare_gc, gc_kind); + update_mark_patterns(space, !is_minor); + large_object_space_start_gc(lospace, is_minor); + gc_extern_space_start_gc(exspace, is_minor); resolve_ephemerons_lazily(heap); tracer_prepare(heap); HEAP_EVENT(heap, requesting_stop); @@ -1868,12 +1877,10 @@ static void collect(struct gc_mutator *mut) { sweep_ephemerons(heap); tracer_release(heap); mark_space_finish_gc(space, gc_kind); - large_object_space_finish_gc(lospace, gc_kind & GC_KIND_FLAG_MINOR); - gc_extern_space_finish_gc(exspace, gc_kind & GC_KIND_FLAG_MINOR); + large_object_space_finish_gc(lospace, is_minor); + gc_extern_space_finish_gc(exspace, is_minor); heap->count++; - heap->last_collection_was_minor = gc_kind & GC_KIND_FLAG_MINOR; - if (heap->last_collection_was_minor) - heap->minor_count++; + heap->last_collection_was_minor = is_minor; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); HEAP_EVENT(heap, restarting_mutators); allow_mutators_to_continue(heap); @@ -2153,18 +2160,20 @@ static void finish_sweeping(struct gc_mutator *mut) { finish_hole(mut); } -static void trigger_collection(struct gc_mutator *mut) { +static void trigger_collection(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); + int prev_kind = -1; heap_lock(heap); - if (mutators_are_stopping(heap)) - pause_mutator_for_collection_with_lock(mut); - else - collect(mut); + while (mutators_are_stopping(heap)) + prev_kind = pause_mutator_for_collection_with_lock(mut); + if (prev_kind < (int)requested_kind) + collect(mut, requested_kind); heap_unlock(heap); } -void gc_collect(struct gc_mutator *mut) { - trigger_collection(mut); +void gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { + trigger_collection(mut, kind); } static void* allocate_large(struct gc_mutator *mut, size_t size) { @@ -2177,7 +2186,7 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { npages << space->page_size_log2); while (!sweep_until_memory_released(mut)) - trigger_collection(mut); + trigger_collection(mut, GC_COLLECTION_COMPACTING); atomic_fetch_add(&heap->large_object_pages, npages); void *ret = large_object_space_alloc(space, npages); @@ -2215,7 +2224,7 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { break; } if (!hole) - trigger_collection(mut); + trigger_collection(mut, GC_COLLECTION_ANY); } ret = gc_ref(mut->alloc); mut->alloc += size; From 9176aa650f7c4c7906887d9e7db62dcaa3c89e07 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 20 Oct 2023 11:43:40 +0200 Subject: [PATCH 211/403] Fix BDW ephemeron implementation * src/gc-ephemeron.c: Use key null-ness as dead ephemeron indicator; works better with BDW-GC's disappearing link. * src/bdw.c (gc_heap_ephemeron_trace_epoch): Fix to actually define the epoch. Whoops! 
--- src/bdw.c | 2 +- src/gc-ephemeron.c | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/bdw.c b/src/bdw.c index 186b50a8c..f429b43c2 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -181,7 +181,7 @@ struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { } unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { - return 0; + return GC_get_gc_no(); } void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, diff --git a/src/gc-ephemeron.c b/src/gc-ephemeron.c index a13c4bb98..8a42c1a84 100644 --- a/src/gc-ephemeron.c +++ b/src/gc-ephemeron.c @@ -238,7 +238,6 @@ enum { struct gc_ephemeron { GC_EMBEDDER_EPHEMERON_HEADER uint8_t state; - uint8_t is_dead; unsigned epoch; struct gc_ephemeron *chain; struct gc_ephemeron *pending; @@ -264,7 +263,7 @@ static struct gc_ephemeron** ephemeron_chain(struct gc_ephemeron *e) { return &e->chain; } static int ephemeron_is_dead(struct gc_ephemeron *e) { - return atomic_load_explicit(&e->is_dead, memory_order_acquire); + return !atomic_load_explicit(&e->key.value, memory_order_acquire); } static int ephemeron_is_not_dead(struct gc_ephemeron *e) { return !ephemeron_is_dead(e); @@ -284,7 +283,7 @@ struct gc_ephemeron* gc_ephemeron_chain_next(struct gc_ephemeron *e) { return follow_chain(ephemeron_chain(e)); } void gc_ephemeron_mark_dead(struct gc_ephemeron *e) { - atomic_store_explicit(&e->is_dead, 1, memory_order_release); + atomic_store_explicit(&e->key.value, 0, memory_order_release); } //////////////////////////////////////////////////////////////////////// @@ -327,7 +326,7 @@ static struct gc_ephemeron* pop_resolved(struct gc_ephemeron **loc) { //////////////////////////////////////////////////////////////////////// struct gc_ref gc_ephemeron_key(struct gc_ephemeron *e) { - return ephemeron_is_dead(e) ? gc_ref_null() : e->key; + return gc_ref(atomic_load_explicit(&e->key.value, memory_order_acquire)); } struct gc_ref gc_ephemeron_value(struct gc_ephemeron *e) { @@ -471,7 +470,7 @@ void gc_trace_ephemeron(struct gc_ephemeron *e, // as dead and here; the consequence would be that we treat an // ephemeron as live when it's not, but only for this cycle. No big // deal. - if (atomic_load_explicit(&e->is_dead, memory_order_acquire)) { + if (ephemeron_is_dead(e)) { // CLAIMED[epoch] -> TRACED[epoch]. atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED, memory_order_release); @@ -553,7 +552,7 @@ gc_sweep_pending_ephemerons(struct gc_pending_ephemerons *state, e; e = follow_pending(&e->pending)) { // PENDING -> TRACED, but dead. - atomic_store_explicit(&e->is_dead, 1, memory_order_release); + atomic_store_explicit(&e->key.value, 0, memory_order_release); atomic_store_explicit(&e->state, EPHEMERON_STATE_TRACED, memory_order_release); } @@ -572,7 +571,6 @@ void gc_ephemeron_init_internal(struct gc_heap *heap, // assumption is that the ephemeron is younger than the key and the // value. 
ephemeron->state = EPHEMERON_STATE_TRACED; - ephemeron->is_dead = 0; ephemeron->epoch = gc_heap_ephemeron_trace_epoch(heap) - 1; ephemeron->chain = NULL; ephemeron->pending = NULL; From 2cab5269637a84455869fc0066bee74b4ef948b5 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 20 Oct 2023 14:14:29 +0200 Subject: [PATCH 212/403] Fix gc_basic_stats_finish --- api/gc-basic-stats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/gc-basic-stats.h b/api/gc-basic-stats.h index 0cf927b95..14cac492a 100644 --- a/api/gc-basic-stats.h +++ b/api/gc-basic-stats.h @@ -107,7 +107,7 @@ static inline void gc_basic_stats_live_data_size(void *data, size_t size) { static inline void gc_basic_stats_finish(struct gc_basic_stats *stats) { uint64_t now = gc_basic_stats_now(); - stats->elapsed_mutator_usec += stats->last_time_usec - now; + stats->elapsed_mutator_usec += now - stats->last_time_usec; stats->last_time_usec = now; } From 9cc12916a9cbee132fc27778ff223bace385dc47 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 23 Oct 2023 11:17:28 +0200 Subject: [PATCH 213/403] parallel marker: speed up sharing from local to published queue --- src/parallel-tracer.h | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 9711ed03a..09b636f3f 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -232,13 +232,18 @@ trace_deque_steal(struct trace_deque *q) { } } -static int -trace_deque_can_steal(struct trace_deque *q) { +static ssize_t +trace_deque_size(struct trace_deque *q) { size_t t = LOAD_ACQUIRE(&q->top); atomic_thread_fence(memory_order_seq_cst); size_t b = LOAD_ACQUIRE(&q->bottom); ssize_t size = b - t; - return size > 0; + return size; +} + +static int +trace_deque_can_steal(struct trace_deque *q) { + return trace_deque_size(q) > 0; } #undef LOAD_RELAXED @@ -285,6 +290,19 @@ local_trace_queue_pop(struct local_trace_queue *q) { return q->data[q->read++ & LOCAL_TRACE_QUEUE_MASK]; } +static inline size_t +local_trace_queue_pop_many(struct local_trace_queue *q, struct gc_ref **objv, + size_t limit) { + size_t avail = local_trace_queue_size(q); + size_t read = q->read & LOCAL_TRACE_QUEUE_MASK; + size_t contig = LOCAL_TRACE_QUEUE_SIZE - read; + if (contig < avail) avail = contig; + if (limit < avail) avail = limit; + *objv = q->data + read; + q->read += avail; + return avail; +} + enum trace_worker_state { TRACE_WORKER_STOPPED, TRACE_WORKER_IDLE, @@ -460,8 +478,13 @@ static inline int trace_edge(struct gc_heap *heap, static inline void tracer_share(struct local_tracer *trace) { DEBUG("tracer #%zu: sharing\n", trace->worker->id); - for (size_t i = 0; i < LOCAL_TRACE_QUEUE_SHARE_AMOUNT; i++) - trace_deque_push(trace->share_deque, local_trace_queue_pop(&trace->local)); + size_t to_share = LOCAL_TRACE_QUEUE_SHARE_AMOUNT; + while (to_share) { + struct gc_ref *objv; + size_t count = local_trace_queue_pop_many(&trace->local, &objv, to_share); + trace_deque_push_many(trace->share_deque, objv, count); + to_share -= count; + } } static inline void From 3d2a12c684e8b69b224ff4012a324f55322b5c8e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 23 Oct 2023 11:26:33 +0200 Subject: [PATCH 214/403] Rework parallel tracing state machine Instead of sending a message to each worker, we pthread_cond_broadcast at the start. Instead of having N worker threads, we have N-1 threads and the main thread also does work. 
Instead of termination being detected by the worker threads, let the main thread detect it. Avoid parallelism if the mark stack is small enough, which can be the case for ephemeron chains. Let aux threads exit when they have no work instead of spinning: sharing will start them up again. --- src/parallel-tracer.h | 216 ++++++++++++++++++------------------------ 1 file changed, 91 insertions(+), 125 deletions(-) diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 09b636f3f..b88f1e792 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -319,7 +319,6 @@ struct trace_worker { pthread_t thread; enum trace_worker_state state; pthread_mutex_t lock; - pthread_cond_t cond; struct trace_deque deque; }; @@ -328,8 +327,7 @@ struct trace_worker { struct tracer { atomic_size_t active_tracers; size_t worker_count; - atomic_size_t running_tracers; - long count; + long epoch; pthread_mutex_t lock; pthread_cond_t cond; struct trace_worker workers[TRACE_WORKERS_MAX_COUNT]; @@ -351,9 +349,7 @@ trace_worker_init(struct trace_worker *worker, struct gc_heap *heap, worker->id = id; worker->steal_id = 0; worker->thread = 0; - worker->state = TRACE_WORKER_STOPPED; pthread_mutex_init(&worker->lock, NULL); - pthread_cond_init(&worker->cond, NULL); return trace_deque_init(&worker->deque); } @@ -362,89 +358,46 @@ static void trace_worker_trace(struct trace_worker *worker); static void* trace_worker_thread(void *data) { struct trace_worker *worker = data; + struct tracer *tracer = heap_tracer(worker->heap); + long trace_epoch = 0; pthread_mutex_lock(&worker->lock); while (1) { - switch (worker->state) { - case TRACE_WORKER_IDLE: - pthread_cond_wait(&worker->cond, &worker->lock); - break; - case TRACE_WORKER_TRACING: + long epoch = atomic_load_explicit(&tracer->epoch, memory_order_acquire); + if (trace_epoch != epoch) { + trace_epoch = epoch; trace_worker_trace(worker); - worker->state = TRACE_WORKER_IDLE; - break; - case TRACE_WORKER_STOPPING: - worker->state = TRACE_WORKER_DEAD; - pthread_mutex_unlock(&worker->lock); - return NULL; - default: - GC_CRASH(); } + pthread_cond_wait(&tracer->cond, &worker->lock); } + return NULL; } static int trace_worker_spawn(struct trace_worker *worker) { - pthread_mutex_lock(&worker->lock); - ASSERT(worker->state == TRACE_WORKER_STOPPED); - worker->state = TRACE_WORKER_IDLE; - pthread_mutex_unlock(&worker->lock); - if (pthread_create(&worker->thread, NULL, trace_worker_thread, worker)) { perror("spawning tracer thread failed"); - worker->state = TRACE_WORKER_STOPPED; return 0; } return 1; } -static void -trace_worker_request_trace(struct trace_worker *worker) { - struct tracer *tracer = heap_tracer(worker->heap); - - pthread_mutex_lock(&worker->lock); - ASSERT(worker->state == TRACE_WORKER_IDLE); - worker->state = TRACE_WORKER_TRACING; - pthread_cond_signal(&worker->cond); - pthread_mutex_unlock(&worker->lock); -} - -static void -trace_worker_finished_tracing(struct trace_worker *worker) { - // Signal controller that we are done with tracing. 
- struct tracer *tracer = heap_tracer(worker->heap); - - if (atomic_fetch_sub(&tracer->running_tracers, 1) == 1) { - pthread_mutex_lock(&tracer->lock); - tracer->count++; - pthread_cond_signal(&tracer->cond); - pthread_mutex_unlock(&tracer->lock); - } -} - -static void -trace_worker_request_stop(struct trace_worker *worker) { - pthread_mutex_lock(&worker->lock); - ASSERT(worker->state == TRACE_WORKER_IDLE); - worker->state = TRACE_WORKER_STOPPING; - pthread_cond_signal(&worker->cond); - pthread_mutex_unlock(&worker->lock); -} - static int tracer_init(struct gc_heap *heap, size_t parallelism) { struct tracer *tracer = heap_tracer(heap); atomic_init(&tracer->active_tracers, 0); - atomic_init(&tracer->running_tracers, 0); - tracer->count = 0; + tracer->epoch = 0; pthread_mutex_init(&tracer->lock, NULL); pthread_cond_init(&tracer->cond, NULL); size_t desired_worker_count = parallelism; ASSERT(desired_worker_count); if (desired_worker_count > TRACE_WORKERS_MAX_COUNT) desired_worker_count = TRACE_WORKERS_MAX_COUNT; - for (size_t i = 0; i < desired_worker_count; i++) { + if (!trace_worker_init(&tracer->workers[0], heap, tracer, 0)) + return 0; + tracer->worker_count++; + for (size_t i = 1; i < desired_worker_count; i++) { if (!trace_worker_init(&tracer->workers[i], heap, tracer, i)) break; if (trace_worker_spawn(&tracer->workers[i])) @@ -452,7 +405,7 @@ tracer_init(struct gc_heap *heap, size_t parallelism) { else break; } - return tracer->worker_count > 0; + return 1; } static void tracer_prepare(struct gc_heap *heap) { @@ -466,6 +419,24 @@ static void tracer_release(struct gc_heap *heap) { trace_deque_release(&tracer->workers[i].deque); } +static inline void +tracer_unpark_all_workers(struct tracer *tracer) { + long old_epoch = + atomic_fetch_add_explicit(&tracer->epoch, 1, memory_order_acq_rel); + long epoch = old_epoch + 1; + DEBUG("starting trace; %zu workers; epoch=%ld\n", tracer->worker_count, + epoch); + pthread_cond_broadcast(&tracer->cond); +} + +static inline void +tracer_maybe_unpark_workers(struct tracer *tracer) { + size_t active = + atomic_load_explicit(&tracer->active_tracers, memory_order_acquire); + if (active < tracer->worker_count) + tracer_unpark_all_workers(tracer); +} + static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, @@ -485,6 +456,7 @@ tracer_share(struct local_tracer *trace) { trace_deque_push_many(trace->share_deque, objv, count); to_share -= count; } + tracer_maybe_unpark_workers(heap_tracer(trace->worker->heap)); } static inline void @@ -501,16 +473,6 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { tracer_enqueue(gc_edge_ref(edge), heap, trace_data); } -static inline void -tracer_visit_(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { - if (trace_edge(heap, edge)) { - struct local_tracer *trace = trace_data; - if (local_trace_queue_full(&trace->local)) - tracer_share(trace); - local_trace_queue_push(&trace->local, gc_edge_ref(edge)); - } -} - static struct gc_ref tracer_steal_from_worker(struct tracer *tracer, size_t id) { ASSERT(id < tracer->worker_count); @@ -556,28 +518,35 @@ trace_worker_can_steal_from_any(struct trace_worker *worker, struct tracer *trac } static int -trace_worker_check_termination(struct trace_worker *worker, - struct tracer *tracer) { - // We went around all workers and nothing. Enter termination phase. 
- if (atomic_fetch_sub_explicit(&tracer->active_tracers, 1, - memory_order_relaxed) == 1) { - DEBUG(" ->> tracer #%zu: DONE (no spinning) <<-\n", worker->id); - return 1; - } +trace_worker_should_continue(struct trace_worker *worker) { + // Helper workers should park themselves immediately if they have no work. + if (worker->id != 0) + return 0; + + struct tracer *tracer = heap_tracer(worker->heap); for (size_t spin_count = 0;; spin_count++) { - if (trace_worker_can_steal_from_any(worker, tracer)) { - atomic_fetch_add_explicit(&tracer->active_tracers, 1, - memory_order_relaxed); - return 0; - } if (atomic_load_explicit(&tracer->active_tracers, - memory_order_relaxed) == 0) { - DEBUG(" ->> tracer #%zu: DONE <<-\n", worker->id); - return 1; + memory_order_acquire) == 1) { + // All trace workers have exited except us, the main worker. We are + // probably done, but we need to synchronize to be sure that there is no + // work pending, for example if a worker had a spurious wakeup. Skip + // worker 0 (the main worker). + size_t locked = 1; + while (locked < tracer->worker_count) { + if (pthread_mutex_trylock(&tracer->workers[locked].lock) == 0) + locked++; + else + break; + } + int done = (locked == tracer->worker_count) && + !trace_worker_can_steal_from_any(worker, tracer); + while (locked > 1) + pthread_mutex_unlock(&tracer->workers[--locked].lock); + return !done; } // spin - DEBUG("tracer #%zu: spinning #%zu\n", worker->id, spin_count); + DEBUG("checking for termination: spinning #%zu\n", spin_count); yield_for_spin(spin_count); } } @@ -597,42 +566,46 @@ trace_worker_steal(struct local_tracer *trace) { return obj; } - while (1) { - DEBUG("tracer #%zu: trying to steal\n", worker->id); - struct gc_ref obj = trace_worker_steal_from_any(worker, tracer); - if (gc_ref_is_heap_object(obj)) - return obj; + DEBUG("tracer #%zu: trying to steal\n", worker->id); + struct gc_ref obj = trace_worker_steal_from_any(worker, tracer); + if (gc_ref_is_heap_object(obj)) + return obj; - if (trace_worker_check_termination(worker, tracer)) - return gc_ref_null(); - } + return gc_ref_null(); } static void trace_worker_trace(struct trace_worker *worker) { + struct gc_heap *heap = worker->heap; + struct tracer *tracer = heap_tracer(heap); + atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); + struct local_tracer trace; trace.worker = worker; trace.share_deque = &worker->deque; - struct gc_heap *heap = worker->heap; local_trace_queue_init(&trace.local); size_t n = 0; DEBUG("tracer #%zu: running trace loop\n", worker->id); - while (1) { - struct gc_ref ref; - if (!local_trace_queue_empty(&trace.local)) { - ref = local_trace_queue_pop(&trace.local); - } else { - ref = trace_worker_steal(&trace); - if (!gc_ref_is_heap_object(ref)) - break; + + do { + while (1) { + struct gc_ref ref; + if (!local_trace_queue_empty(&trace.local)) { + ref = local_trace_queue_pop(&trace.local); + } else { + ref = trace_worker_steal(&trace); + if (!gc_ref_is_heap_object(ref)) + break; + } + trace_one(ref, heap, &trace); + n++; } - trace_one(ref, heap, &trace); - n++; - } + } while (trace_worker_should_continue(worker)); + DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); - trace_worker_finished_tracing(worker); + atomic_fetch_sub_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); } static inline void @@ -652,25 +625,18 @@ static inline void tracer_trace(struct gc_heap *heap) { struct tracer *tracer = heap_tracer(heap); - pthread_mutex_lock(&tracer->lock); - long trace_count = 
tracer->count; - pthread_mutex_unlock(&tracer->lock); - DEBUG("starting trace; %zu workers\n", tracer->worker_count); - DEBUG("waking workers\n"); - atomic_store_explicit(&tracer->active_tracers, tracer->worker_count, - memory_order_release); - atomic_store_explicit(&tracer->running_tracers, tracer->worker_count, - memory_order_release); - for (size_t i = 0; i < tracer->worker_count; i++) - trace_worker_request_trace(&tracer->workers[i]); - DEBUG("waiting on tracers\n"); + ssize_t parallel_threshold = + LOCAL_TRACE_QUEUE_SIZE - LOCAL_TRACE_QUEUE_SHARE_AMOUNT; + if (trace_deque_size(&tracer->workers[0].deque) >= parallel_threshold) { + DEBUG("waking workers\n"); + tracer_unpark_all_workers(tracer); + } else { + DEBUG("starting in local-only mode\n"); + } - pthread_mutex_lock(&tracer->lock); - while (tracer->count <= trace_count) - pthread_cond_wait(&tracer->cond, &tracer->lock); - pthread_mutex_unlock(&tracer->lock); + trace_worker_trace(&tracer->workers[0]); DEBUG("trace finished\n"); } From 3a1a5e0368291e8d05f1e047de55c2c6d5dcb6e6 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 25 Oct 2023 11:33:24 +0200 Subject: [PATCH 215/403] Fix linking against static libgc.a Thanks to Thorsten Ball. Fixes #1, #2. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 58091d55e..22df61e4f 100644 --- a/Makefile +++ b/Makefile @@ -96,7 +96,7 @@ obj/$(1).$(2).gc.o: src/$(call gc_impl,$(2)) | .deps obj obj/$(1).$(2).o: benchmarks/$(1).c | .deps obj $$(COMPILE) $(call gc_cflags,$(2)) -include api/$(call gc_attrs,$(2)) -c $$< bin/$(1).$(2): obj/$(1).$(2).gc.o obj/$(1).$(2).o obj/gc-stack.o obj/gc-options.o obj/gc-platform.o obj/$(1).gc-ephemeron.o | bin - $$(LINK) $(call gc_libs,$(2)) $$^ + $$(LINK) $$^ $(call gc_libs,$(2)) endef $(foreach BENCHMARK,$(TESTS),\ From c4396a42149284afee4d7f5b476c1ea274f54582 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 5 Nov 2023 10:26:43 +0100 Subject: [PATCH 216/403] Remove a release-mode debugging printout in whippet --- src/whippet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/whippet.c b/src/whippet.c index 5deba1a64..6501a751f 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -1862,7 +1862,7 @@ static void collect(struct gc_mutator *mut, double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); HEAP_EVENT(heap, live_data_size, heap->size * (1 - yield)); - fprintf(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); + DEBUG(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); detect_out_of_memory(heap); trace_pinned_roots_after_stop(heap); prepare_for_evacuation(heap); From 5ad83e49e1cbf97a6254b1d13fc17bfc0a477022 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 6 Nov 2023 22:06:51 +0100 Subject: [PATCH 217/403] Fix the debug fix --- src/whippet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/whippet.c b/src/whippet.c index 6501a751f..24225e5ae 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -1862,7 +1862,7 @@ static void collect(struct gc_mutator *mut, double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); HEAP_EVENT(heap, live_data_size, heap->size * (1 - yield)); - DEBUG(stderr, "last gc yield: %f; fragmentation: %f\n", yield, fragmentation); + DEBUG("last gc yield: %f; fragmentation: %f\n", yield, fragmentation); detect_out_of_memory(heap); trace_pinned_roots_after_stop(heap); prepare_for_evacuation(heap); From 
361d880277041dbab589a16c05eb04e4ee92e6bd Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 6 Nov 2023 22:12:34 +0100 Subject: [PATCH 218/403] optdebug build uses -O2 instead of -Og --- embed.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/embed.mk b/embed.mk index 1c7822806..6f2c4c6f2 100644 --- a/embed.mk +++ b/embed.mk @@ -3,7 +3,7 @@ GC_COLLECTOR ?= semi DEFAULT_BUILD := opt BUILD_CFLAGS_opt = -O2 -g -DNDEBUG -BUILD_CFLAGS_optdebug = -Og -g -DGC_DEBUG=1 +BUILD_CFLAGS_optdebug = -O2 -g -DGC_DEBUG=1 BUILD_CFLAGS_debug = -O0 -g -DGC_DEBUG=1 GC_BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(GC_BUILD),$(DEFAULT_BUILD))) From 40be1a03cb260b198eaa77f77b295c9998bb7397 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Nov 2023 10:45:43 +0100 Subject: [PATCH 219/403] Add debug-mode heap verification to Whippet --- src/whippet.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/whippet.c b/src/whippet.c index 24225e5ae..465f3570d 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -1802,6 +1802,40 @@ static void trace_roots_after_stop(struct gc_heap *heap) { trace_generational_roots(heap); } +static void verify_mark_space_before_restart(struct mark_space *space) { + // Iterate objects in each block, verifying that the END bytes correspond to + // the measured object size. + for (size_t slab = 0; slab < space->nslabs; slab++) { + for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { + struct block_summary *summary = &space->slabs[slab].summaries[block]; + if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) + continue; + + uintptr_t addr = (uintptr_t)space->slabs[slab].blocks[block].data; + uintptr_t limit = addr + BLOCK_SIZE; + uint8_t *meta = metadata_byte_for_addr(addr); + while (addr < limit) { + if (meta[0] & space->live_mask) { + struct gc_ref obj = gc_ref(addr); + size_t obj_bytes = 0; + gc_trace_object(gc_ref(addr), NULL, NULL, NULL, &obj_bytes); + size_t granules = size_to_granules(obj_bytes); + GC_ASSERT(granules); + for (size_t granule = 0; granule < granules - 1; granule++) + GC_ASSERT(!(meta[granule] & METADATA_BYTE_END)); + GC_ASSERT(meta[granules - 1] & METADATA_BYTE_END); + meta += granules; + addr += granules * GRANULE_SIZE; + } else { + meta++; + addr += GRANULE_SIZE; + } + } + GC_ASSERT(addr == limit); + } + } +} + static void mark_space_finish_gc(struct mark_space *space, enum gc_collection_kind gc_kind) { space->evacuating = 0; @@ -1809,6 +1843,8 @@ static void mark_space_finish_gc(struct mark_space *space, update_mark_patterns(space, 0); reset_statistics(space); release_evacuation_target_blocks(space); + if (GC_DEBUG) + verify_mark_space_before_restart(space); } static void resolve_ephemerons_lazily(struct gc_heap *heap) { From adaffab3da7ad6dfa6ee486fdd308758338e6560 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Nov 2023 15:07:45 +0100 Subject: [PATCH 220/403] Fix a case where we might miss some sweeping Unlike next_hole, next_hole_in_block doesn't finish_hole, so it doesn't clear metadata bits. Fix to always finish_hole when finish_sweeping_in_block. 
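To see why the loop shape matters, here is a standalone toy version using stand-in names, not the collector's real code (the actual change is in the diff below): a "while (next(...)) finish(...)" loop only finishes holes it advances into, so the hole the mutator is already sitting on keeps its stale metadata bytes, whereas "do { finish(...); } while (next(...))" always finishes the current hole first.

    #include <stdio.h>

    #define NUM_HOLES 4
    static int swept[NUM_HOLES];  /* swept[i] set once hole i is finished */
    static int current;           /* index of the hole the mutator is on */

    static void finish_hole_sketch(void) { swept[current] = 1; }

    /* Like next_hole_in_block: moves on to the next hole, but does not
       finish the hole we were on. */
    static int next_hole_in_block_sketch(void) {
      if (current + 1 >= NUM_HOLES) return 0;
      current++;
      return 1;
    }

    int main(void) {
      /* Old shape: the hole we start on (hole 0) is never finished. */
      while (next_hole_in_block_sketch())
        finish_hole_sketch();
      printf("hole 0 finished? %d\n", swept[0]);   /* prints 0 */

      /* New shape: finish the current hole before each advance. */
      current = 0;
      for (int i = 0; i < NUM_HOLES; i++) swept[i] = 0;
      do { finish_hole_sketch(); } while (next_hole_in_block_sketch());
      printf("hole 0 finished? %d\n", swept[0]);   /* prints 1 */
      return 0;
    }
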
--- src/whippet.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/whippet.c b/src/whippet.c index 465f3570d..28f296764 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -2185,15 +2185,13 @@ static size_t next_hole(struct gc_mutator *mut) { } static void finish_sweeping_in_block(struct gc_mutator *mut) { - while (next_hole_in_block(mut)) - finish_hole(mut); + do { finish_hole(mut); } while (next_hole_in_block(mut)); } // Another thread is triggering GC. Before we stop, finish clearing the // dead mark bytes for the mutator's block, and release the block. static void finish_sweeping(struct gc_mutator *mut) { - while (next_hole(mut)) - finish_hole(mut); + while (next_hole(mut)) {} } static void trigger_collection(struct gc_mutator *mut, From 3ce75b2bad1a86d6fd374f46a1a067a593891679 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Nov 2023 15:09:10 +0100 Subject: [PATCH 221/403] Fix bug in which we could forget to mark stopping mutators Separately track total mutator count, paused mutators, and inactive mutators. Paused mutators need to mark their roots before stopping. We had a bug whereby a paused mutator would not wake up before the next collection, which resulted in that mutator's roots not being marked. Fix by resetting paused mutator count to 0 after collection, requiring those mutators to sync up again. --- src/whippet.c | 53 ++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/src/whippet.c b/src/whippet.c index 28f296764..98828f11d 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -303,13 +303,14 @@ struct gc_heap { struct gc_pending_ephemerons *pending_ephemerons; enum gc_collection_kind gc_kind; int multithreaded; - size_t active_mutator_count; size_t mutator_count; + size_t paused_mutator_count; + size_t inactive_mutator_count; struct gc_heap_roots *roots; struct gc_mutator *mutator_trace_list; long count; uint8_t last_collection_was_minor; - struct gc_mutator *deactivated_mutators; + struct gc_mutator *inactive_mutators; struct tracer tracer; double fragmentation_low_threshold; double fragmentation_high_threshold; @@ -852,6 +853,12 @@ static inline void heap_unlock(struct gc_heap *heap) { pthread_mutex_unlock(&heap->lock); } +// with heap lock +static inline int all_mutators_stopped(struct gc_heap *heap) { + return heap->mutator_count == + heap->paused_mutator_count + heap->inactive_mutator_count; +} + static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = heap; mut->event_listener_data = @@ -863,7 +870,6 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { pthread_cond_wait(&heap->mutator_cond, &heap->lock); if (heap->mutator_count == 1) heap->multithreaded = 1; - heap->active_mutator_count++; heap->mutator_count++; heap_unlock(heap); } @@ -872,11 +878,10 @@ static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; heap_lock(heap); - heap->active_mutator_count--; heap->mutator_count--; // We have no roots. If there is a GC stop currently in progress, // maybe tell the controller it can continue. 
- if (mutators_are_stopping(heap) && heap->active_mutator_count == 0) + if (mutators_are_stopping(heap) && all_mutators_stopped(heap)) pthread_cond_signal(&heap->collector_cond); heap_unlock(heap); } @@ -888,8 +893,8 @@ static void request_mutators_to_stop(struct gc_heap *heap) { static void allow_mutators_to_continue(struct gc_heap *heap) { GC_ASSERT(mutators_are_stopping(heap)); - GC_ASSERT(heap->active_mutator_count == 0); - heap->active_mutator_count++; + GC_ASSERT(all_mutators_stopped(heap)); + heap->paused_mutator_count = 0; atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); GC_ASSERT(!mutators_are_stopping(heap)); pthread_cond_broadcast(&heap->mutator_cond); @@ -921,7 +926,7 @@ static uintptr_t pop_empty_block(struct mark_space *space) { static int maybe_push_evacuation_target(struct mark_space *space, uintptr_t block, double reserve) { GC_ASSERT(!block_summary_has_flag(block_summary_for_addr(block), - BLOCK_NEEDS_SWEEP)); + BLOCK_NEEDS_SWEEP)); size_t targets = atomic_load_explicit(&space->evacuation_targets.count, memory_order_acquire); size_t total = space->nslabs * NONMETA_BLOCKS_PER_SLAB; @@ -948,7 +953,7 @@ static int push_evacuation_target_if_possible(struct mark_space *space, static void push_empty_block(struct mark_space *space, uintptr_t block) { GC_ASSERT(!block_summary_has_flag(block_summary_for_addr(block), - BLOCK_NEEDS_SWEEP)); + BLOCK_NEEDS_SWEEP)); push_block(&space->empty, block); } @@ -1291,8 +1296,8 @@ static void release_stopping_mutator_roots(struct gc_mutator *mut) { } static void wait_for_mutators_to_stop(struct gc_heap *heap) { - heap->active_mutator_count--; - while (heap->active_mutator_count) + heap->paused_mutator_count++; + while (!all_mutators_stopped(heap)) pthread_cond_wait(&heap->collector_cond, &heap->lock); } @@ -1307,7 +1312,7 @@ static void trace_mutator_conservative_roots_after_stop(struct gc_heap *heap) { mut = mut->next) trace_mutator_conservative_roots_with_lock(mut); - for (struct gc_mutator *mut = heap->deactivated_mutators; + for (struct gc_mutator *mut = heap->inactive_mutators; mut; mut = mut->next) trace_mutator_conservative_roots_with_lock(mut); @@ -1331,7 +1336,7 @@ static void trace_mutator_roots_after_stop(struct gc_heap *heap) { } atomic_store(&heap->mutator_trace_list, NULL); - for (struct gc_mutator *mut = heap->deactivated_mutators; mut; mut = mut->next) { + for (struct gc_mutator *mut = heap->inactive_mutators; mut; mut = mut->next) { finish_sweeping_in_block(mut); trace_mutator_roots_with_lock(mut); } @@ -1424,11 +1429,11 @@ pause_mutator_for_collection(struct gc_heap *heap, static enum gc_collection_kind pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mutators_are_stopping(heap)); - GC_ASSERT(heap->active_mutator_count); + GC_ASSERT(!all_mutators_stopped(heap)); MUTATOR_EVENT(mut, mutator_stopped); - heap->active_mutator_count--; + heap->paused_mutator_count++; enum gc_collection_kind collection_kind = heap->gc_kind; - if (heap->active_mutator_count == 0) + if (all_mutators_stopped(heap)) pthread_cond_signal(&heap->collector_cond); // Go to sleep and wake up when the collector is done. 
Note, @@ -1444,7 +1449,6 @@ pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { while (mutators_are_stopping(heap) && heap->count == epoch); MUTATOR_EVENT(mut, mutator_restarted); - heap->active_mutator_count++; return collection_kind; } @@ -1588,8 +1592,6 @@ determine_collection_kind(struct gc_heap *heap, ssize_t pending = atomic_load_explicit(&mark_space->pending_unavailable_bytes, memory_order_acquire); - DEBUG("hiiiiii\n"); - if (heap->count == 0) { DEBUG("first collection is always major\n"); gc_kind = GC_COLLECTION_MAJOR; @@ -1920,7 +1922,6 @@ static void collect(struct gc_mutator *mut, heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); HEAP_EVENT(heap, restarting_mutators); allow_mutators_to_continue(heap); - DEBUG("collect done\n"); } static int sweep_byte(uint8_t *loc, uintptr_t sweep_mask) { @@ -2485,11 +2486,11 @@ void gc_finish_for_thread(struct gc_mutator *mut) { static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); heap_lock(heap); - mut->next = heap->deactivated_mutators; - heap->deactivated_mutators = mut; - heap->active_mutator_count--; + mut->next = heap->inactive_mutators; + heap->inactive_mutators = mut; + heap->inactive_mutator_count++; gc_stack_capture_hot(&mut->stack); - if (!heap->active_mutator_count && mutators_are_stopping(heap)) + if (all_mutators_stopped(heap)) pthread_cond_signal(&heap->collector_cond); heap_unlock(heap); } @@ -2498,12 +2499,12 @@ static void reactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { heap_lock(heap); while (mutators_are_stopping(heap)) pthread_cond_wait(&heap->mutator_cond, &heap->lock); - struct gc_mutator **prev = &heap->deactivated_mutators; + struct gc_mutator **prev = &heap->inactive_mutators; while (*prev != mut) prev = &(*prev)->next; *prev = mut->next; mut->next = NULL; - heap->active_mutator_count++; + heap->inactive_mutator_count--; heap_unlock(heap); } From 4b51376447d9f422f39775c47e33e64bd8744d09 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 6 Dec 2023 21:48:39 +0100 Subject: [PATCH 222/403] Add mechanism to compose event listeners --- api/gc-event-listener-chain.h | 140 ++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 api/gc-event-listener-chain.h diff --git a/api/gc-event-listener-chain.h b/api/gc-event-listener-chain.h new file mode 100644 index 000000000..3bebf3531 --- /dev/null +++ b/api/gc-event-listener-chain.h @@ -0,0 +1,140 @@ +#ifndef GC_EVENT_LISTENER_CHAIN_H +#define GC_EVENT_LISTENER_CHAIN_H + +#include "gc-event-listener.h" + +struct gc_event_listener_chain { + struct gc_event_listener head; void *head_data; + struct gc_event_listener tail; void *tail_data; +}; + +struct gc_event_listener_chain_mutator { + struct gc_event_listener_chain *chain; + void *head_mutator_data; + void *tail_mutator_data; +}; + +static inline void gc_event_listener_chain_init(void *data, size_t heap_size) { + struct gc_event_listener_chain *chain = data; + chain->head.init(chain->head_data, heap_size); + chain->tail.init(chain->tail_data, heap_size); +} + +static inline void gc_event_listener_chain_prepare_gc(void *data, + enum gc_collection_kind kind) { + struct gc_event_listener_chain *chain = data; + chain->head.prepare_gc(chain->head_data, kind); + chain->tail.prepare_gc(chain->tail_data, kind); +} + +static inline void gc_event_listener_chain_requesting_stop(void *data) { + struct gc_event_listener_chain *chain = data; + 
chain->head.requesting_stop(chain->head_data); + chain->tail.requesting_stop(chain->tail_data); +} +static inline void gc_event_listener_chain_waiting_for_stop(void *data) { + struct gc_event_listener_chain *chain = data; + chain->head.waiting_for_stop(chain->head_data); + chain->tail.waiting_for_stop(chain->tail_data); +} +static inline void gc_event_listener_chain_mutators_stopped(void *data) { + struct gc_event_listener_chain *chain = data; + chain->head.mutators_stopped(chain->head_data); + chain->tail.mutators_stopped(chain->tail_data); +} +static inline void gc_event_listener_chain_roots_traced(void *data) { + struct gc_event_listener_chain *chain = data; + chain->head.roots_traced(chain->head_data); + chain->tail.roots_traced(chain->tail_data); +} +static inline void gc_event_listener_chain_heap_traced(void *data) { + struct gc_event_listener_chain *chain = data; + chain->head.heap_traced(chain->head_data); + chain->tail.heap_traced(chain->tail_data); +} +static inline void gc_event_listener_chain_ephemerons_traced(void *data) { + struct gc_event_listener_chain *chain = data; + chain->head.ephemerons_traced(chain->head_data); + chain->tail.ephemerons_traced(chain->tail_data); +} + +static inline void gc_event_listener_chain_restarting_mutators(void *data) { + struct gc_event_listener_chain *chain = data; + chain->head.restarting_mutators(chain->head_data); + chain->tail.restarting_mutators(chain->tail_data); +} + +static inline void* gc_event_listener_chain_mutator_added(void *data) { + struct gc_event_listener_chain *chain = data; + struct gc_event_listener_chain_mutator *mutator = malloc(sizeof(*mutator));; + if (!mutator) abort(); + mutator->chain = chain; + mutator->head_mutator_data = chain->head.mutator_added(chain->head_data); + mutator->tail_mutator_data = chain->tail.mutator_added(chain->tail_data); + return mutator; +} + +static inline void gc_event_listener_chain_mutator_cause_gc(void *mutator_data) { + struct gc_event_listener_chain_mutator *mutator = mutator_data; + mutator->chain->head.restarting_mutators(mutator->head_data); + mutator->chain->tail.restarting_mutators(mutator->tail_data); +} +static inline void gc_event_listener_chain_mutator_stopping(void *mutator_data) { + struct gc_event_listener_chain_mutator *mutator = mutator_data; + mutator->chain->head.mutator_stopping(mutator->head_data); + mutator->chain->tail.mutator_stopping(mutator->tail_data); +} +static inline void gc_event_listener_chain_mutator_stopped(void *mutator_data) { + struct gc_event_listener_chain_mutator *mutator = mutator_data; + mutator->chain->head.mutator_stopped(mutator->head_data); + mutator->chain->tail.mutator_stopped(mutator->tail_data); +} +static inline void gc_event_listener_chain_mutator_restarted(void *mutator_data) { + struct gc_event_listener_chain_mutator *mutator = mutator_data; + mutator->chain->head.mutator_restarted(mutator->head_data); + mutator->chain->tail.mutator_restarted(mutator->tail_data); +} +static inline void gc_event_listener_chain_mutator_removed(void *mutator_data) { + struct gc_event_listener_chain_mutator *mutator = mutator_data; + mutator->chain->head.mutator_removed(mutator->head_data); + mutator->chain->tail.mutator_removed(mutator->tail_data); + free(mutator); +} + +static inline void gc_event_listener_chain_heap_resized(void *data, size_t size) { + struct gc_event_listener_chain *chain = data; + chain->head.heap_resized(chain->head_data, size); + chain->tail.heap_resized(chain->tail_data, size); +} + +static inline void 
gc_event_listener_chain_live_data_size(void *data, size_t size) { + struct gc_event_listener_chain *chain = data; + chain->head.live_data_size(chain->head_data, size); + chain->tail.live_data_size(chain->tail_data, size); +} + +#define GC_EVENT_LISTENER_CHAIN \ + ((struct gc_event_listener) { \ + gc_event_listener_chain_init, \ + gc_event_listener_chain_prepare_gc, \ + gc_event_listener_chain_requesting_stop, \ + gc_event_listener_chain_waiting_for_stop, \ + gc_event_listener_chain_mutators_stopped, \ + gc_event_listener_chain_roots_traced, \ + gc_event_listener_chain_heap_traced, \ + gc_event_listener_chain_ephemerons_traced, \ + gc_event_listener_chain_restarting_mutators, \ + gc_event_listener_chain_mutator_added, \ + gc_event_listener_chain_mutator_cause_gc, \ + gc_event_listener_chain_mutator_stopping, \ + gc_event_listener_chain_mutator_stopped, \ + gc_event_listener_chain_mutator_restarted, \ + gc_event_listener_chain_mutator_removed, \ + gc_event_listener_chain_heap_resized, \ + gc_event_listener_chain_live_data_size, \ + }) + +#define GC_EVENT_LISTENER_CHAIN_DATA(head, head_data, tail, tail_data) \ + ((struct gc_event_listener_chain_data){head, head_data, tail, tail_data}) + +#endif // GC_EVENT_LISTENER_CHAIN_H From 1267f77de3584392b5a5a52ec1d77873fbde87b8 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 10 Dec 2023 21:23:44 +0100 Subject: [PATCH 223/403] Basic stats records latency readings in a histogram, prints p95 --- api/gc-basic-stats.h | 16 +++++++- api/gc-histogram.h | 89 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 api/gc-histogram.h diff --git a/api/gc-basic-stats.h b/api/gc-basic-stats.h index 14cac492a..af8cd4243 100644 --- a/api/gc-basic-stats.h +++ b/api/gc-basic-stats.h @@ -2,6 +2,7 @@ #define GC_BASIC_STATS_H #include "gc-event-listener.h" +#include "gc-histogram.h" #include #include @@ -9,6 +10,8 @@ #include #include +GC_DEFINE_HISTOGRAM(gc_latency, 25, 4); + struct gc_basic_stats { uint64_t major_collection_count; uint64_t minor_collection_count; @@ -18,6 +21,7 @@ struct gc_basic_stats { size_t heap_size; size_t max_heap_size; size_t max_live_data_size; + struct gc_latency pause_times; }; static inline uint64_t gc_basic_stats_now(void) { @@ -58,7 +62,9 @@ static inline void gc_basic_stats_ephemerons_traced(void *data) {} static inline void gc_basic_stats_restarting_mutators(void *data) { struct gc_basic_stats *stats = data; uint64_t now = gc_basic_stats_now(); - stats->elapsed_collector_usec += now - stats->last_time_usec; + uint64_t pause_time = now - stats->last_time_usec; + stats->elapsed_collector_usec += pause_time; + gc_latency_record(&stats->pause_times, pause_time); stats->last_time_usec = now; } @@ -120,6 +126,14 @@ static inline void gc_basic_stats_print(struct gc_basic_stats *stats, FILE *f) { fprintf(f, "%" PRIu64 ".%.3" PRIu64 " ms total time " "(%" PRIu64 ".%.3" PRIu64 " stopped).\n", elapsed / ms, elapsed % ms, stopped / ms, stopped % ms); + uint64_t pause_median = gc_latency_median(&stats->pause_times); + uint64_t pause_p95 = gc_latency_percentile(&stats->pause_times, 0.95); + uint64_t pause_max = gc_latency_max(&stats->pause_times); + fprintf(f, "%" PRIu64 ".%.3" PRIu64 " ms median pause time, " + "%" PRIu64 ".%.3" PRIu64 " p95, " + "%" PRIu64 ".%.3" PRIu64 " max.\n", + pause_median / ms, pause_median % ms, pause_p95 / ms, pause_p95 % ms, + pause_max / ms, pause_max % ms); double MB = 1e6; fprintf(f, "Heap size is %.3f MB (max %.3f MB); peak live data %.3f MB.\n", 
stats->heap_size / MB, stats->max_heap_size / MB, diff --git a/api/gc-histogram.h b/api/gc-histogram.h new file mode 100644 index 000000000..be5702e5c --- /dev/null +++ b/api/gc-histogram.h @@ -0,0 +1,89 @@ +#ifndef GC_HISTOGRAM_H +#define GC_HISTOGRAM_H + +#include "gc-assert.h" + +#include + +static inline size_t gc_histogram_bucket(uint64_t max_value_bits, + uint64_t precision, + uint64_t val) { + uint64_t major = val < (1ULL << precision) + ? 0ULL + : 64ULL - __builtin_clzl(val) - precision; + uint64_t minor = val < (1 << precision) + ? val + : (val >> (major - 1ULL)) & ((1ULL << precision) - 1ULL); + uint64_t idx = (major << precision) | minor; + if (idx >= (max_value_bits << precision)) + idx = max_value_bits << precision; + return idx; +} + +static inline uint64_t gc_histogram_bucket_min_val(uint64_t precision, + size_t idx) { + uint64_t major = idx >> precision; + uint64_t minor = idx & ((1ULL << precision) - 1ULL); + uint64_t min_val = major + ? ((1ULL << precision) | minor) << (major - 1ULL) + : minor; + return min_val; +} + +static inline void gc_histogram_record(uint32_t *buckets, + uint64_t max_value_bits, + uint64_t precision, + uint64_t val) { + buckets[gc_histogram_bucket(max_value_bits, precision, val)]++; +} + +#define GC_DEFINE_HISTOGRAM(name, max_value_bits, precision) \ + struct name { uint32_t buckets[((max_value_bits) << (precision)) + 1]; }; \ + static inline size_t name##_size(void) { \ + return ((max_value_bits) << (precision)) + 1; \ + } \ + static inline uint64_t name##_bucket_min_val(size_t idx) { \ + GC_ASSERT(idx < name##_size()); \ + return gc_histogram_bucket_min_val((precision), idx); \ + } \ + static inline struct name make_##name(void) { \ + return (struct name) { { 0, }}; \ + } \ + static inline void name##_record(struct name *h, uint64_t val) { \ + h->buckets[gc_histogram_bucket((max_value_bits), (precision), val)]++; \ + } \ + static inline uint64_t name##_ref(struct name *h, size_t idx) { \ + GC_ASSERT(idx < name##_size()); \ + return h->buckets[idx]; \ + } \ + static inline uint64_t name##_min(struct name *h) { \ + for (size_t bucket = 0; bucket < name##_size(); bucket++) \ + if (h->buckets[bucket]) return name##_bucket_min_val(bucket); \ + return -1; \ + } \ + static inline uint64_t name##_max(struct name *h) { \ + if (h->buckets[name##_size()-1]) return -1LL; \ + for (ssize_t bucket = name##_size() - 1; bucket >= 0; bucket--) \ + if (h->buckets[bucket]) return name##_bucket_min_val(bucket+1); \ + return 0; \ + } \ + static inline uint64_t name##_count(struct name *h) { \ + uint64_t sum = 0; \ + for (size_t bucket = 0; bucket < name##_size(); bucket++) \ + sum += h->buckets[bucket]; \ + return sum; \ + } \ + static inline uint64_t name##_percentile(struct name *h, double p) { \ + uint64_t n = name##_count(h) * p; \ + uint64_t sum = 0; \ + for (size_t bucket = 0; bucket + 1 < name##_size(); bucket++) { \ + sum += h->buckets[bucket]; \ + if (sum >= n) return name##_bucket_min_val(bucket+1); \ + } \ + return -1ULL; \ + } \ + static inline uint64_t name##_median(struct name *h) { \ + return name##_percentile(h, 0.5); \ + } + +#endif // GC_HISTOGRAM_H From 5ba0aec869b2a162b4197325ee03110efe02ba98 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 10 Dec 2023 22:08:05 +0100 Subject: [PATCH 224/403] Remove unused function --- api/gc-histogram.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/api/gc-histogram.h b/api/gc-histogram.h index be5702e5c..0761a630f 100644 --- a/api/gc-histogram.h +++ b/api/gc-histogram.h @@ -30,13 +30,6 @@ static inline 
uint64_t gc_histogram_bucket_min_val(uint64_t precision, return min_val; } -static inline void gc_histogram_record(uint32_t *buckets, - uint64_t max_value_bits, - uint64_t precision, - uint64_t val) { - buckets[gc_histogram_bucket(max_value_bits, precision, val)]++; -} - #define GC_DEFINE_HISTOGRAM(name, max_value_bits, precision) \ struct name { uint32_t buckets[((max_value_bits) << (precision)) + 1]; }; \ static inline size_t name##_size(void) { \ From b4bf949df695db38cf9a2ca0d6a52d80f3d45f05 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 3 Jul 2024 10:38:55 +0200 Subject: [PATCH 225/403] Add Guix manifest.scm --- manifest.scm | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 manifest.scm diff --git a/manifest.scm b/manifest.scm new file mode 100644 index 000000000..fbb5d428c --- /dev/null +++ b/manifest.scm @@ -0,0 +1,10 @@ +(use-modules (guix packages)) + +(specifications->manifest + '("bash" + "coreutils" + "gcc-toolchain" + "glibc" + "libgc" + "make" + "pkg-config")) From dd3953ef1a8aaa6207c572d152f2faf876b82646 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Jul 2024 10:42:58 +0200 Subject: [PATCH 226/403] Factor trace deque out to shared-worklist.h Also increase alignment to account for cache line prefetcher. --- src/gc-align.h | 5 + src/parallel-tracer.h | 271 +++--------------------------------------- src/shared-worklist.h | 259 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 280 insertions(+), 255 deletions(-) create mode 100644 src/shared-worklist.h diff --git a/src/gc-align.h b/src/gc-align.h index 117d1cb47..c0758b1e0 100644 --- a/src/gc-align.h +++ b/src/gc-align.h @@ -14,4 +14,9 @@ static inline uintptr_t align_up(uintptr_t addr, size_t align) { return align_down(addr + align - 1, align); } +// Poor man's equivalent of std::hardware_destructive_interference_size. +#define AVOID_FALSE_SHARING 128 +#define ALIGNED_TO_AVOID_FALSE_SHARING \ + __attribute__((aligned(AVOID_FALSE_SHARING))) + #endif // GC_ALIGN_H diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index b88f1e792..3b4a8d0f1 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -9,249 +9,9 @@ #include "assert.h" #include "debug.h" #include "gc-inline.h" +#include "shared-worklist.h" #include "spin.h" -// The Chase-Lev work-stealing deque, as initially described in "Dynamic -// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05) -// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf) -// and improved with C11 atomics in "Correct and Efficient Work-Stealing -// for Weak Memory Models" (Lê et al, PPoPP'13) -// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf). - -struct trace_buf { - unsigned log_size; - size_t size; - uintptr_t *data; -}; - -// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit. -#define trace_buf_min_log_size ((unsigned) 10) -// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit. 
-#define trace_buf_max_log_size ((unsigned) 28) - -static int -trace_buf_init(struct trace_buf *buf, unsigned log_size) { - ASSERT(log_size >= trace_buf_min_log_size); - ASSERT(log_size <= trace_buf_max_log_size); - size_t size = (1 << log_size) * sizeof(uintptr_t); - void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("Failed to grow work-stealing dequeue"); - DEBUG("Failed to allocate %zu bytes", size); - return 0; - } - buf->log_size = log_size; - buf->size = 1 << log_size; - buf->data = mem; - return 1; -} - -static inline size_t -trace_buf_size(struct trace_buf *buf) { - return buf->size; -} - -static inline size_t -trace_buf_byte_size(struct trace_buf *buf) { - return trace_buf_size(buf) * sizeof(uintptr_t); -} - -static void -trace_buf_release(struct trace_buf *buf) { - if (buf->data) - madvise(buf->data, trace_buf_byte_size(buf), MADV_DONTNEED); -} - -static void -trace_buf_destroy(struct trace_buf *buf) { - if (buf->data) { - munmap(buf->data, trace_buf_byte_size(buf)); - buf->data = NULL; - buf->log_size = 0; - buf->size = 0; - } -} - -static inline struct gc_ref -trace_buf_get(struct trace_buf *buf, size_t i) { - return gc_ref(atomic_load_explicit(&buf->data[i & (buf->size - 1)], - memory_order_relaxed)); -} - -static inline void -trace_buf_put(struct trace_buf *buf, size_t i, struct gc_ref ref) { - return atomic_store_explicit(&buf->data[i & (buf->size - 1)], - gc_ref_value(ref), - memory_order_relaxed); -} - -static inline int -trace_buf_grow(struct trace_buf *from, struct trace_buf *to, - size_t b, size_t t) { - if (from->log_size == trace_buf_max_log_size) - return 0; - if (!trace_buf_init (to, from->log_size + 1)) - return 0; - for (size_t i=t; ibufs[0], trace_buf_min_log_size); - // Note, this fence isn't in the paper, I added it out of caution. - atomic_thread_fence(memory_order_release); - return ret; -} - -static void -trace_deque_release(struct trace_deque *q) { - for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) - trace_buf_release(&q->bufs[i]); -} - -static void -trace_deque_destroy(struct trace_deque *q) { - for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) - trace_buf_destroy(&q->bufs[i]); -} - -static int -trace_deque_grow(struct trace_deque *q, int cur, size_t b, size_t t) { - if (!trace_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) { - fprintf(stderr, "failed to grow deque!!\n"); - GC_CRASH(); - } - - cur++; - STORE_RELAXED(&q->active, cur); - return cur; -} - -static void -trace_deque_push(struct trace_deque *q, struct gc_ref x) { - size_t b = LOAD_RELAXED(&q->bottom); - size_t t = LOAD_ACQUIRE(&q->top); - int active = LOAD_RELAXED(&q->active); - - ssize_t size = b - t; - if (size > trace_buf_size(&q->bufs[active]) - 1) /* Full queue. */ - active = trace_deque_grow(q, active, b, t); - - trace_buf_put(&q->bufs[active], b, x); - atomic_thread_fence(memory_order_release); - STORE_RELAXED(&q->bottom, b + 1); -} - -static void -trace_deque_push_many(struct trace_deque *q, struct gc_ref *objv, size_t count) { - size_t b = LOAD_RELAXED(&q->bottom); - size_t t = LOAD_ACQUIRE(&q->top); - int active = LOAD_RELAXED(&q->active); - - ssize_t size = b - t; - while (size > trace_buf_size(&q->bufs[active]) - count) /* Full queue. 
*/ - active = trace_deque_grow(q, active, b, t); - - for (size_t i = 0; i < count; i++) - trace_buf_put(&q->bufs[active], b + i, objv[i]); - atomic_thread_fence(memory_order_release); - STORE_RELAXED(&q->bottom, b + count); -} - -static struct gc_ref -trace_deque_try_pop(struct trace_deque *q) { - size_t b = LOAD_RELAXED(&q->bottom); - int active = LOAD_RELAXED(&q->active); - STORE_RELAXED(&q->bottom, b - 1); - atomic_thread_fence(memory_order_seq_cst); - size_t t = LOAD_RELAXED(&q->top); - struct gc_ref x; - ssize_t size = b - t; - if (size > 0) { // Non-empty queue. - x = trace_buf_get(&q->bufs[active], b - 1); - if (size == 1) { // Single last element in queue. - if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, - memory_order_seq_cst, - memory_order_relaxed)) - // Failed race. - x = gc_ref_null(); - STORE_RELAXED(&q->bottom, b); - } - } else { // Empty queue. - x = gc_ref_null(); - STORE_RELAXED(&q->bottom, b); - } - return x; -} - -static struct gc_ref -trace_deque_steal(struct trace_deque *q) { - while (1) { - size_t t = LOAD_ACQUIRE(&q->top); - atomic_thread_fence(memory_order_seq_cst); - size_t b = LOAD_ACQUIRE(&q->bottom); - ssize_t size = b - t; - if (size <= 0) - return gc_ref_null(); - int active = LOAD_CONSUME(&q->active); - struct gc_ref ref = trace_buf_get(&q->bufs[active], t); - if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, - memory_order_seq_cst, - memory_order_relaxed)) - // Failed race. - continue; - return ref; - } -} - -static ssize_t -trace_deque_size(struct trace_deque *q) { - size_t t = LOAD_ACQUIRE(&q->top); - atomic_thread_fence(memory_order_seq_cst); - size_t b = LOAD_ACQUIRE(&q->bottom); - ssize_t size = b - t; - return size; -} - -static int -trace_deque_can_steal(struct trace_deque *q) { - return trace_deque_size(q) > 0; -} - -#undef LOAD_RELAXED -#undef STORE_RELAXED -#undef LOAD_ACQUIRE -#undef STORE_RELEASE -#undef LOAD_CONSUME - #define LOCAL_TRACE_QUEUE_SIZE 1024 #define LOCAL_TRACE_QUEUE_MASK (LOCAL_TRACE_QUEUE_SIZE - 1) #define LOCAL_TRACE_QUEUE_SHARE_AMOUNT (LOCAL_TRACE_QUEUE_SIZE * 3 / 4) @@ -319,7 +79,7 @@ struct trace_worker { pthread_t thread; enum trace_worker_state state; pthread_mutex_t lock; - struct trace_deque deque; + struct shared_worklist deque; }; #define TRACE_WORKERS_MAX_COUNT 8 @@ -335,7 +95,7 @@ struct tracer { struct local_tracer { struct trace_worker *worker; - struct trace_deque *share_deque; + struct shared_worklist *share_deque; struct local_trace_queue local; }; @@ -350,7 +110,7 @@ trace_worker_init(struct trace_worker *worker, struct gc_heap *heap, worker->steal_id = 0; worker->thread = 0; pthread_mutex_init(&worker->lock, NULL); - return trace_deque_init(&worker->deque); + return shared_worklist_init(&worker->deque); } static void trace_worker_trace(struct trace_worker *worker); @@ -416,7 +176,7 @@ static void tracer_prepare(struct gc_heap *heap) { static void tracer_release(struct gc_heap *heap) { struct tracer *tracer = heap_tracer(heap); for (size_t i = 0; i < tracer->worker_count; i++) - trace_deque_release(&tracer->workers[i].deque); + shared_worklist_release(&tracer->workers[i].deque); } static inline void @@ -453,7 +213,7 @@ tracer_share(struct local_tracer *trace) { while (to_share) { struct gc_ref *objv; size_t count = local_trace_queue_pop_many(&trace->local, &objv, to_share); - trace_deque_push_many(trace->share_deque, objv, count); + shared_worklist_push_many(trace->share_deque, objv, count); to_share -= count; } tracer_maybe_unpark_workers(heap_tracer(trace->worker->heap)); @@ 
-476,13 +236,13 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { static struct gc_ref tracer_steal_from_worker(struct tracer *tracer, size_t id) { ASSERT(id < tracer->worker_count); - return trace_deque_steal(&tracer->workers[id].deque); + return shared_worklist_steal(&tracer->workers[id].deque); } static int tracer_can_steal_from_worker(struct tracer *tracer, size_t id) { ASSERT(id < tracer->worker_count); - return trace_deque_can_steal(&tracer->workers[id].deque); + return shared_worklist_can_steal(&tracer->workers[id].deque); } static struct gc_ref @@ -502,7 +262,8 @@ trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer) } static int -trace_worker_can_steal_from_any(struct trace_worker *worker, struct tracer *tracer) { +trace_worker_can_steal_from_any(struct trace_worker *worker, + struct tracer *tracer) { DEBUG("tracer #%zu: checking if any worker has tasks\n", worker->id); for (size_t i = 0; i < tracer->worker_count; i++) { int res = tracer_can_steal_from_worker(tracer, worker->steal_id); @@ -561,7 +322,7 @@ trace_worker_steal(struct local_tracer *trace) { // something from the worker's own queue. { DEBUG("tracer #%zu: trying to pop worker's own deque\n", worker->id); - struct gc_ref obj = trace_deque_try_pop(&worker->deque); + struct gc_ref obj = shared_worklist_try_pop(&worker->deque); if (gc_ref_is_heap_object(obj)) return obj; } @@ -610,15 +371,15 @@ trace_worker_trace(struct trace_worker *worker) { static inline void tracer_enqueue_root(struct tracer *tracer, struct gc_ref ref) { - struct trace_deque *worker0_deque = &tracer->workers[0].deque; - trace_deque_push(worker0_deque, ref); + struct shared_worklist *worker0_deque = &tracer->workers[0].deque; + shared_worklist_push(worker0_deque, ref); } static inline void tracer_enqueue_roots(struct tracer *tracer, struct gc_ref *objv, size_t count) { - struct trace_deque *worker0_deque = &tracer->workers[0].deque; - trace_deque_push_many(worker0_deque, objv, count); + struct shared_worklist *worker0_deque = &tracer->workers[0].deque; + shared_worklist_push_many(worker0_deque, objv, count); } static inline void @@ -629,7 +390,7 @@ tracer_trace(struct gc_heap *heap) { ssize_t parallel_threshold = LOCAL_TRACE_QUEUE_SIZE - LOCAL_TRACE_QUEUE_SHARE_AMOUNT; - if (trace_deque_size(&tracer->workers[0].deque) >= parallel_threshold) { + if (shared_worklist_size(&tracer->workers[0].deque) >= parallel_threshold) { DEBUG("waking workers\n"); tracer_unpark_all_workers(tracer); } else { diff --git a/src/shared-worklist.h b/src/shared-worklist.h new file mode 100644 index 000000000..6d5ed3315 --- /dev/null +++ b/src/shared-worklist.h @@ -0,0 +1,259 @@ +#ifndef SHARED_WORKLIST_H +#define SHARED_WORKLIST_H + +#include +#include +#include + +#include "assert.h" +#include "debug.h" +#include "gc-align.h" +#include "gc-inline.h" +#include "spin.h" + +// The Chase-Lev work-stealing deque, as initially described in "Dynamic +// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05) +// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf) +// and improved with C11 atomics in "Correct and Efficient Work-Stealing +// for Weak Memory Models" (Lê et al, PPoPP'13) +// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf). + +struct shared_worklist_buf { + unsigned log_size; + size_t size; + uintptr_t *data; +}; + +// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit. +#define shared_worklist_buf_min_log_size ((unsigned) 10) +// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit. 
+#define shared_worklist_buf_max_log_size ((unsigned) 28) + +static int +shared_worklist_buf_init(struct shared_worklist_buf *buf, unsigned log_size) { + ASSERT(log_size >= shared_worklist_buf_min_log_size); + ASSERT(log_size <= shared_worklist_buf_max_log_size); + size_t size = (1 << log_size) * sizeof(uintptr_t); + void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("Failed to grow work-stealing dequeue"); + DEBUG("Failed to allocate %zu bytes", size); + return 0; + } + buf->log_size = log_size; + buf->size = 1 << log_size; + buf->data = mem; + return 1; +} + +static inline size_t +shared_worklist_buf_size(struct shared_worklist_buf *buf) { + return buf->size; +} + +static inline size_t +shared_worklist_buf_byte_size(struct shared_worklist_buf *buf) { + return shared_worklist_buf_size(buf) * sizeof(uintptr_t); +} + +static void +shared_worklist_buf_release(struct shared_worklist_buf *buf) { + if (buf->data) + madvise(buf->data, shared_worklist_buf_byte_size(buf), MADV_DONTNEED); +} + +static void +shared_worklist_buf_destroy(struct shared_worklist_buf *buf) { + if (buf->data) { + munmap(buf->data, shared_worklist_buf_byte_size(buf)); + buf->data = NULL; + buf->log_size = 0; + buf->size = 0; + } +} + +static inline struct gc_ref +shared_worklist_buf_get(struct shared_worklist_buf *buf, size_t i) { + return gc_ref(atomic_load_explicit(&buf->data[i & (buf->size - 1)], + memory_order_relaxed)); +} + +static inline void +shared_worklist_buf_put(struct shared_worklist_buf *buf, size_t i, + struct gc_ref ref) { + return atomic_store_explicit(&buf->data[i & (buf->size - 1)], + gc_ref_value(ref), + memory_order_relaxed); +} + +static inline int +shared_worklist_buf_grow(struct shared_worklist_buf *from, + struct shared_worklist_buf *to, size_t b, size_t t) { + if (from->log_size == shared_worklist_buf_max_log_size) + return 0; + if (!shared_worklist_buf_init (to, from->log_size + 1)) + return 0; + for (size_t i=t; ibufs[0], + shared_worklist_buf_min_log_size); + // Note, this fence isn't in the paper, I added it out of caution. + atomic_thread_fence(memory_order_release); + return ret; +} + +static void +shared_worklist_release(struct shared_worklist *q) { + for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) + shared_worklist_buf_release(&q->bufs[i]); +} + +static void +shared_worklist_destroy(struct shared_worklist *q) { + for (int i = LOAD_RELAXED(&q->active); i >= 0; i--) + shared_worklist_buf_destroy(&q->bufs[i]); +} + +static int +shared_worklist_grow(struct shared_worklist *q, int cur, size_t b, size_t t) { + if (!shared_worklist_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) { + fprintf(stderr, "failed to grow deque!!\n"); + GC_CRASH(); + } + + cur++; + STORE_RELAXED(&q->active, cur); + return cur; +} + +static void +shared_worklist_push(struct shared_worklist *q, struct gc_ref x) { + size_t b = LOAD_RELAXED(&q->bottom); + size_t t = LOAD_ACQUIRE(&q->top); + int active = LOAD_RELAXED(&q->active); + + ssize_t size = b - t; + if (size > shared_worklist_buf_size(&q->bufs[active]) - 1) + active = shared_worklist_grow(q, active, b, t); /* Full queue; grow. 
*/ + + shared_worklist_buf_put(&q->bufs[active], b, x); + atomic_thread_fence(memory_order_release); + STORE_RELAXED(&q->bottom, b + 1); +} + +static void +shared_worklist_push_many(struct shared_worklist *q, struct gc_ref *objv, + size_t count) { + size_t b = LOAD_RELAXED(&q->bottom); + size_t t = LOAD_ACQUIRE(&q->top); + int active = LOAD_RELAXED(&q->active); + + ssize_t size = b - t; + while (size > shared_worklist_buf_size(&q->bufs[active]) - count) + active = shared_worklist_grow(q, active, b, t); /* Full queue; grow. */ + + for (size_t i = 0; i < count; i++) + shared_worklist_buf_put(&q->bufs[active], b + i, objv[i]); + atomic_thread_fence(memory_order_release); + STORE_RELAXED(&q->bottom, b + count); +} + +static struct gc_ref +shared_worklist_try_pop(struct shared_worklist *q) { + size_t b = LOAD_RELAXED(&q->bottom); + int active = LOAD_RELAXED(&q->active); + STORE_RELAXED(&q->bottom, b - 1); + atomic_thread_fence(memory_order_seq_cst); + size_t t = LOAD_RELAXED(&q->top); + struct gc_ref x; + ssize_t size = b - t; + if (size > 0) { // Non-empty queue. + x = shared_worklist_buf_get(&q->bufs[active], b - 1); + if (size == 1) { // Single last element in queue. + if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, + memory_order_seq_cst, + memory_order_relaxed)) + // Failed race. + x = gc_ref_null(); + STORE_RELAXED(&q->bottom, b); + } + } else { // Empty queue. + x = gc_ref_null(); + STORE_RELAXED(&q->bottom, b); + } + return x; +} + +static struct gc_ref +shared_worklist_steal(struct shared_worklist *q) { + while (1) { + size_t t = LOAD_ACQUIRE(&q->top); + atomic_thread_fence(memory_order_seq_cst); + size_t b = LOAD_ACQUIRE(&q->bottom); + ssize_t size = b - t; + if (size <= 0) + return gc_ref_null(); + int active = LOAD_CONSUME(&q->active); + struct gc_ref ref = shared_worklist_buf_get(&q->bufs[active], t); + if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1, + memory_order_seq_cst, + memory_order_relaxed)) + // Failed race. 
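+      // (Another thief, or the owner claiming the last element in try_pop,
+      // already advanced q->top past this entry; retry with a fresh top.)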
+ continue; + return ref; + } +} + +static ssize_t +shared_worklist_size(struct shared_worklist *q) { + size_t t = LOAD_ACQUIRE(&q->top); + atomic_thread_fence(memory_order_seq_cst); + size_t b = LOAD_ACQUIRE(&q->bottom); + ssize_t size = b - t; + return size; +} + +static int +shared_worklist_can_steal(struct shared_worklist *q) { + return shared_worklist_size(q) > 0; +} + +#undef LOAD_RELAXED +#undef STORE_RELAXED +#undef LOAD_ACQUIRE +#undef STORE_RELEASE +#undef LOAD_CONSUME + +#endif // SHARED_WORKLIST_H From 4c6f1b6cef1c9149e79c0c65bbb681e1fd8bf192 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Jul 2024 10:49:46 +0200 Subject: [PATCH 227/403] Break local worklist out to its own file --- src/local-worklist.h | 59 ++++++++++++++++++++++++++++++++++++ src/parallel-tracer.h | 70 +++++++------------------------------------ 2 files changed, 69 insertions(+), 60 deletions(-) create mode 100644 src/local-worklist.h diff --git a/src/local-worklist.h b/src/local-worklist.h new file mode 100644 index 000000000..8dcd3e20d --- /dev/null +++ b/src/local-worklist.h @@ -0,0 +1,59 @@ +#ifndef LOCAL_WORKLIST_H +#define LOCAL_WORKLIST_H + +#include "assert.h" + +#define LOCAL_WORKLIST_SIZE 1024 +#define LOCAL_WORKLIST_MASK (LOCAL_WORKLIST_SIZE - 1) +#define LOCAL_WORKLIST_SHARE_AMOUNT (LOCAL_WORKLIST_SIZE * 3 / 4) +struct local_worklist { + size_t read; + size_t write; + struct gc_ref data[LOCAL_WORKLIST_SIZE]; +}; + +static inline void +local_worklist_init(struct local_worklist *q) { + q->read = q->write = 0; +} +static inline void +local_worklist_poison(struct local_worklist *q) { + q->read = 0; q->write = LOCAL_WORKLIST_SIZE; +} +static inline size_t +local_worklist_size(struct local_worklist *q) { + return q->write - q->read; +} +static inline int +local_worklist_empty(struct local_worklist *q) { + return local_worklist_size(q) == 0; +} +static inline int +local_worklist_full(struct local_worklist *q) { + return local_worklist_size(q) >= LOCAL_WORKLIST_SIZE; +} +static inline void +local_worklist_push(struct local_worklist *q, struct gc_ref v) { + ASSERT(!local_worklist_full(q)); + q->data[q->write++ & LOCAL_WORKLIST_MASK] = v; +} +static inline struct gc_ref +local_worklist_pop(struct local_worklist *q) { + ASSERT(!local_worklist_empty(q)); + return q->data[q->read++ & LOCAL_WORKLIST_MASK]; +} + +static inline size_t +local_worklist_pop_many(struct local_worklist *q, struct gc_ref **objv, + size_t limit) { + size_t avail = local_worklist_size(q); + size_t read = q->read & LOCAL_WORKLIST_MASK; + size_t contig = LOCAL_WORKLIST_SIZE - read; + if (contig < avail) avail = contig; + if (limit < avail) avail = limit; + *objv = q->data + read; + q->read += avail; + return avail; +} + +#endif // LOCAL_WORKLIST_H diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 3b4a8d0f1..18fd03899 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -9,60 +9,10 @@ #include "assert.h" #include "debug.h" #include "gc-inline.h" +#include "local-worklist.h" #include "shared-worklist.h" #include "spin.h" -#define LOCAL_TRACE_QUEUE_SIZE 1024 -#define LOCAL_TRACE_QUEUE_MASK (LOCAL_TRACE_QUEUE_SIZE - 1) -#define LOCAL_TRACE_QUEUE_SHARE_AMOUNT (LOCAL_TRACE_QUEUE_SIZE * 3 / 4) -struct local_trace_queue { - size_t read; - size_t write; - struct gc_ref data[LOCAL_TRACE_QUEUE_SIZE]; -}; - -static inline void -local_trace_queue_init(struct local_trace_queue *q) { - q->read = q->write = 0; -} -static inline void -local_trace_queue_poison(struct local_trace_queue *q) { - q->read = 0; q->write = 
LOCAL_TRACE_QUEUE_SIZE; -} -static inline size_t -local_trace_queue_size(struct local_trace_queue *q) { - return q->write - q->read; -} -static inline int -local_trace_queue_empty(struct local_trace_queue *q) { - return local_trace_queue_size(q) == 0; -} -static inline int -local_trace_queue_full(struct local_trace_queue *q) { - return local_trace_queue_size(q) >= LOCAL_TRACE_QUEUE_SIZE; -} -static inline void -local_trace_queue_push(struct local_trace_queue *q, struct gc_ref v) { - q->data[q->write++ & LOCAL_TRACE_QUEUE_MASK] = v; -} -static inline struct gc_ref -local_trace_queue_pop(struct local_trace_queue *q) { - return q->data[q->read++ & LOCAL_TRACE_QUEUE_MASK]; -} - -static inline size_t -local_trace_queue_pop_many(struct local_trace_queue *q, struct gc_ref **objv, - size_t limit) { - size_t avail = local_trace_queue_size(q); - size_t read = q->read & LOCAL_TRACE_QUEUE_MASK; - size_t contig = LOCAL_TRACE_QUEUE_SIZE - read; - if (contig < avail) avail = contig; - if (limit < avail) avail = limit; - *objv = q->data + read; - q->read += avail; - return avail; -} - enum trace_worker_state { TRACE_WORKER_STOPPED, TRACE_WORKER_IDLE, @@ -96,7 +46,7 @@ struct tracer { struct local_tracer { struct trace_worker *worker; struct shared_worklist *share_deque; - struct local_trace_queue local; + struct local_worklist local; }; struct context; @@ -209,10 +159,10 @@ static inline int trace_edge(struct gc_heap *heap, static inline void tracer_share(struct local_tracer *trace) { DEBUG("tracer #%zu: sharing\n", trace->worker->id); - size_t to_share = LOCAL_TRACE_QUEUE_SHARE_AMOUNT; + size_t to_share = LOCAL_WORKLIST_SHARE_AMOUNT; while (to_share) { struct gc_ref *objv; - size_t count = local_trace_queue_pop_many(&trace->local, &objv, to_share); + size_t count = local_worklist_pop_many(&trace->local, &objv, to_share); shared_worklist_push_many(trace->share_deque, objv, count); to_share -= count; } @@ -222,9 +172,9 @@ tracer_share(struct local_tracer *trace) { static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { struct local_tracer *trace = trace_data; - if (local_trace_queue_full(&trace->local)) + if (local_worklist_full(&trace->local)) tracer_share(trace); - local_trace_queue_push(&trace->local, ref); + local_worklist_push(&trace->local, ref); } static inline void @@ -344,7 +294,7 @@ trace_worker_trace(struct trace_worker *worker) { struct local_tracer trace; trace.worker = worker; trace.share_deque = &worker->deque; - local_trace_queue_init(&trace.local); + local_worklist_init(&trace.local); size_t n = 0; DEBUG("tracer #%zu: running trace loop\n", worker->id); @@ -352,8 +302,8 @@ trace_worker_trace(struct trace_worker *worker) { do { while (1) { struct gc_ref ref; - if (!local_trace_queue_empty(&trace.local)) { - ref = local_trace_queue_pop(&trace.local); + if (!local_worklist_empty(&trace.local)) { + ref = local_worklist_pop(&trace.local); } else { ref = trace_worker_steal(&trace); if (!gc_ref_is_heap_object(ref)) @@ -389,7 +339,7 @@ tracer_trace(struct gc_heap *heap) { DEBUG("starting trace; %zu workers\n", tracer->worker_count); ssize_t parallel_threshold = - LOCAL_TRACE_QUEUE_SIZE - LOCAL_TRACE_QUEUE_SHARE_AMOUNT; + LOCAL_WORKLIST_SIZE - LOCAL_WORKLIST_SHARE_AMOUNT; if (shared_worklist_size(&tracer->workers[0].deque) >= parallel_threshold) { DEBUG("waking workers\n"); tracer_unpark_all_workers(tracer); From 82afee869386f5906589068760131f281ecb6229 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Jul 2024 11:49:47 +0200 Subject: [PATCH 228/403] 
Break simple serial worklist out to its own file --- src/serial-tracer.h | 125 +++--------------------------------------- src/simple-worklist.h | 124 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 118 deletions(-) create mode 100644 src/simple-worklist.h diff --git a/src/serial-tracer.h b/src/serial-tracer.h index d189b1c7c..2246eb360 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -6,121 +6,10 @@ #include "assert.h" #include "debug.h" -#include "gc-api.h" - -struct trace_queue { - size_t size; - size_t read; - size_t write; - struct gc_ref *buf; -}; - -static const size_t trace_queue_max_size = - (1ULL << (sizeof(struct gc_ref) * 8 - 1)) / sizeof(struct gc_ref); -static const size_t trace_queue_release_byte_threshold = 1 * 1024 * 1024; - -static struct gc_ref * -trace_queue_alloc(size_t size) { - void *mem = mmap(NULL, size * sizeof(struct gc_ref), PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("Failed to grow trace queue"); - DEBUG("Failed to allocate %zu bytes", size); - return NULL; - } - return mem; -} - -static int -trace_queue_init(struct trace_queue *q) { - q->size = getpagesize() / sizeof(struct gc_ref); - q->read = 0; - q->write = 0; - q->buf = trace_queue_alloc(q->size); - return !!q->buf; -} - -static inline struct gc_ref -trace_queue_get(struct trace_queue *q, size_t idx) { - return q->buf[idx & (q->size - 1)]; -} - -static inline void -trace_queue_put(struct trace_queue *q, size_t idx, struct gc_ref x) { - q->buf[idx & (q->size - 1)] = x; -} - -static int trace_queue_grow(struct trace_queue *q) GC_NEVER_INLINE; - -static int -trace_queue_grow(struct trace_queue *q) { - size_t old_size = q->size; - struct gc_ref *old_buf = q->buf; - if (old_size >= trace_queue_max_size) { - DEBUG("trace queue already at max size of %zu bytes", old_size); - return 0; - } - - size_t new_size = old_size * 2; - struct gc_ref *new_buf = trace_queue_alloc(new_size); - if (!new_buf) - return 0; - - size_t old_mask = old_size - 1; - size_t new_mask = new_size - 1; - - for (size_t i = q->read; i < q->write; i++) - new_buf[i & new_mask] = old_buf[i & old_mask]; - - munmap(old_buf, old_size * sizeof(struct gc_ref)); - - q->size = new_size; - q->buf = new_buf; - return 1; -} - -static inline void -trace_queue_push(struct trace_queue *q, struct gc_ref p) { - if (UNLIKELY(q->write - q->read == q->size)) { - if (!trace_queue_grow(q)) - GC_CRASH(); - } - trace_queue_put(q, q->write++, p); -} - -static inline void -trace_queue_push_many(struct trace_queue *q, struct gc_ref *pv, size_t count) { - while (q->size - (q->write - q->read) < count) { - if (!trace_queue_grow(q)) - GC_CRASH(); - } - for (size_t i = 0; i < count; i++) - trace_queue_put(q, q->write++, pv[i]); -} - -static inline struct gc_ref -trace_queue_pop(struct trace_queue *q) { - if (UNLIKELY(q->read == q->write)) - return gc_ref_null(); - return trace_queue_get(q, q->read++); -} - -static void -trace_queue_release(struct trace_queue *q) { - size_t byte_size = q->size * sizeof(struct gc_ref); - if (byte_size >= trace_queue_release_byte_threshold) - madvise(q->buf, byte_size, MADV_DONTNEED); - q->read = q->write = 0; -} - -static void -trace_queue_destroy(struct trace_queue *q) { - size_t byte_size = q->size * sizeof(struct gc_ref); - munmap(q->buf, byte_size); -} +#include "simple-worklist.h" struct tracer { - struct trace_queue queue; + struct simple_worklist worklist; }; struct gc_heap; @@ -128,11 +17,11 @@ static inline struct tracer* heap_tracer(struct 
gc_heap *heap); static int tracer_init(struct gc_heap *heap, size_t parallelism) { - return trace_queue_init(&heap_tracer(heap)->queue); + return simple_worklist_init(&heap_tracer(heap)->worklist); } static void tracer_prepare(struct gc_heap *heap) {} static void tracer_release(struct gc_heap *heap) { - trace_queue_release(&heap_tracer(heap)->queue); + simple_worklist_release(&heap_tracer(heap)->worklist); } static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, @@ -146,12 +35,12 @@ static inline int trace_edge(struct gc_heap *heap, static inline void tracer_enqueue_root(struct tracer *tracer, struct gc_ref obj) { - trace_queue_push(&tracer->queue, obj); + simple_worklist_push(&tracer->worklist, obj); } static inline void tracer_enqueue_roots(struct tracer *tracer, struct gc_ref *objs, size_t count) { - trace_queue_push_many(&tracer->queue, objs, count); + simple_worklist_push_many(&tracer->worklist, objs, count); } static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { @@ -165,7 +54,7 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { static inline void tracer_trace(struct gc_heap *heap) { do { - struct gc_ref obj = trace_queue_pop(&heap_tracer(heap)->queue); + struct gc_ref obj = simple_worklist_pop(&heap_tracer(heap)->worklist); if (!gc_ref_is_heap_object(obj)) break; trace_one(obj, heap, NULL); diff --git a/src/simple-worklist.h b/src/simple-worklist.h new file mode 100644 index 000000000..bae33b470 --- /dev/null +++ b/src/simple-worklist.h @@ -0,0 +1,124 @@ +#ifndef SIMPLE_WORKLIST_H +#define SIMPLE_WORKLIST_H + +#include +#include + +#include "assert.h" +#include "debug.h" +#include "gc-inline.h" +#include "gc-ref.h" + +struct simple_worklist { + size_t size; + size_t read; + size_t write; + struct gc_ref *buf; +}; + +static const size_t simple_worklist_max_size = + (1ULL << (sizeof(struct gc_ref) * 8 - 1)) / sizeof(struct gc_ref); +static const size_t simple_worklist_release_byte_threshold = 1 * 1024 * 1024; + +static struct gc_ref * +simple_worklist_alloc(size_t size) { + void *mem = mmap(NULL, size * sizeof(struct gc_ref), PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("Failed to grow trace queue"); + DEBUG("Failed to allocate %zu bytes", size); + return NULL; + } + return mem; +} + +static int +simple_worklist_init(struct simple_worklist *q) { + q->size = getpagesize() / sizeof(struct gc_ref); + q->read = 0; + q->write = 0; + q->buf = simple_worklist_alloc(q->size); + return !!q->buf; +} + +static inline struct gc_ref +simple_worklist_get(struct simple_worklist *q, size_t idx) { + return q->buf[idx & (q->size - 1)]; +} + +static inline void +simple_worklist_put(struct simple_worklist *q, size_t idx, struct gc_ref x) { + q->buf[idx & (q->size - 1)] = x; +} + +static int simple_worklist_grow(struct simple_worklist *q) GC_NEVER_INLINE; + +static int +simple_worklist_grow(struct simple_worklist *q) { + size_t old_size = q->size; + struct gc_ref *old_buf = q->buf; + if (old_size >= simple_worklist_max_size) { + DEBUG("trace queue already at max size of %zu bytes", old_size); + return 0; + } + + size_t new_size = old_size * 2; + struct gc_ref *new_buf = simple_worklist_alloc(new_size); + if (!new_buf) + return 0; + + size_t old_mask = old_size - 1; + size_t new_mask = new_size - 1; + + for (size_t i = q->read; i < q->write; i++) + new_buf[i & new_mask] = old_buf[i & old_mask]; + + munmap(old_buf, old_size * sizeof(struct gc_ref)); + + q->size = 
new_size; + q->buf = new_buf; + return 1; +} + +static inline void +simple_worklist_push(struct simple_worklist *q, struct gc_ref p) { + if (UNLIKELY(q->write - q->read == q->size)) { + if (!simple_worklist_grow(q)) + GC_CRASH(); + } + simple_worklist_put(q, q->write++, p); +} + +static inline void +simple_worklist_push_many(struct simple_worklist *q, struct gc_ref *pv, + size_t count) { + while (q->size - (q->write - q->read) < count) { + if (!simple_worklist_grow(q)) + GC_CRASH(); + } + for (size_t i = 0; i < count; i++) + simple_worklist_put(q, q->write++, pv[i]); +} + +static inline struct gc_ref +simple_worklist_pop(struct simple_worklist *q) { + if (UNLIKELY(q->read == q->write)) + return gc_ref_null(); + return simple_worklist_get(q, q->read++); +} + +static void +simple_worklist_release(struct simple_worklist *q) { + size_t byte_size = q->size * sizeof(struct gc_ref); + if (byte_size >= simple_worklist_release_byte_threshold) + madvise(q->buf, byte_size, MADV_DONTNEED); + q->read = q->write = 0; +} + +static void +simple_worklist_destroy(struct simple_worklist *q) { + size_t byte_size = q->size * sizeof(struct gc_ref); + munmap(q->buf, byte_size); +} + +#endif // SIMPLE_WORKLIST_H From b4543ad6418ac779c3c4f8d53db1ef05dd46240c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Jul 2024 14:38:15 +0200 Subject: [PATCH 229/403] Factor out tracer interface to own file --- src/parallel-tracer.h | 19 +----------- src/serial-tracer.h | 18 +----------- src/tracer.h | 68 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 35 deletions(-) create mode 100644 src/tracer.h diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 18fd03899..9d04f264b 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -12,6 +12,7 @@ #include "local-worklist.h" #include "shared-worklist.h" #include "spin.h" +#include "tracer.h" enum trace_worker_state { TRACE_WORKER_STOPPED, @@ -49,9 +50,6 @@ struct local_tracer { struct local_worklist local; }; -struct context; -static inline struct tracer* heap_tracer(struct gc_heap *heap); - static int trace_worker_init(struct trace_worker *worker, struct gc_heap *heap, struct tracer *tracer, size_t id) { @@ -147,15 +145,6 @@ tracer_maybe_unpark_workers(struct tracer *tracer) { tracer_unpark_all_workers(tracer); } -static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; -static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; -static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; -static inline int trace_edge(struct gc_heap *heap, - struct gc_edge edge) GC_ALWAYS_INLINE; - static inline void tracer_share(struct local_tracer *trace) { DEBUG("tracer #%zu: sharing\n", trace->worker->id); @@ -177,12 +166,6 @@ tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { local_worklist_push(&trace->local, ref); } -static inline void -tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { - if (trace_edge(heap, edge)) - tracer_enqueue(gc_edge_ref(edge), heap, trace_data); -} - static struct gc_ref tracer_steal_from_worker(struct tracer *tracer, size_t id) { ASSERT(id < tracer->worker_count); diff --git a/src/serial-tracer.h b/src/serial-tracer.h index 2246eb360..13aae60ce 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -7,14 +7,12 @@ #include "assert.h" #include "debug.h" #include "simple-worklist.h" +#include 
"tracer.h" struct tracer { struct simple_worklist worklist; }; -struct gc_heap; -static inline struct tracer* heap_tracer(struct gc_heap *heap); - static int tracer_init(struct gc_heap *heap, size_t parallelism) { return simple_worklist_init(&heap_tracer(heap)->worklist); @@ -24,15 +22,6 @@ static void tracer_release(struct gc_heap *heap) { simple_worklist_release(&heap_tracer(heap)->worklist); } -static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; -static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; -static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; -static inline int trace_edge(struct gc_heap *heap, - struct gc_edge edge) GC_ALWAYS_INLINE; - static inline void tracer_enqueue_root(struct tracer *tracer, struct gc_ref obj) { simple_worklist_push(&tracer->worklist, obj); @@ -47,11 +36,6 @@ tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { tracer_enqueue_root(heap_tracer(heap), ref); } static inline void -tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { - if (trace_edge(heap, edge)) - tracer_enqueue(gc_edge_ref(edge), heap, trace_data); -} -static inline void tracer_trace(struct gc_heap *heap) { do { struct gc_ref obj = simple_worklist_pop(&heap_tracer(heap)->worklist); diff --git a/src/tracer.h b/src/tracer.h new file mode 100644 index 000000000..15b37e0d4 --- /dev/null +++ b/src/tracer.h @@ -0,0 +1,68 @@ +#ifndef TRACER_H +#define TRACER_H + +#include "gc-ref.h" +#include "gc-edge.h" + +struct gc_heap; + +//////////////////////////////////////////////////////////////////////// +/// To be implemented by collector. +//////////////////////////////////////////////////////////////////////// + +// Initialize the tracer when the heap is created. +static inline struct tracer* heap_tracer(struct gc_heap *heap); + +// Visit all fields in an object. +static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; + +// Visit one edge. Return nonzero if this call shaded the object grey. +static inline int trace_edge(struct gc_heap *heap, + struct gc_edge edge) GC_ALWAYS_INLINE; + +//////////////////////////////////////////////////////////////////////// +/// To be implemented by tracer. +//////////////////////////////////////////////////////////////////////// + +// The tracer struct itself should be defined in the implementation. +struct tracer; + +// Initialize the tracer when the heap is created. +static int tracer_init(struct gc_heap *heap, size_t parallelism); + +// Initialize the tracer for a new GC cycle. +static void tracer_prepare(struct gc_heap *heap); + +// Release any resources allocated during the trace. +static void tracer_release(struct gc_heap *heap); + +// Add root objects to the trace. Call before tracer_trace. +static inline void tracer_enqueue_root(struct tracer *tracer, + struct gc_ref obj); +static inline void tracer_enqueue_roots(struct tracer *tracer, + struct gc_ref *objs, + size_t count); + +// Given that an object has been shaded grey, enqueue for tracing. +static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; + +// Run the full trace. +static inline void tracer_trace(struct gc_heap *heap); + +//////////////////////////////////////////////////////////////////////// +/// Procedures that work with any tracer. 
+//////////////////////////////////////////////////////////////////////// + +// Visit one edge. If we shade the edge target grey, enqueue it for +// tracing. +static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; +static inline void +tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { + if (trace_edge(heap, edge)) + tracer_enqueue(gc_edge_ref(edge), heap, trace_data); +} + +#endif // TRACER_H From 921c012b518321ce046221f420570a5a89431893 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Jul 2024 17:15:00 +0200 Subject: [PATCH 230/403] Add gc_ prefix to tracer API --- src/parallel-tracer.h | 48 +++++++++++++++++++++---------------------- src/serial-tracer.h | 20 +++++++++--------- src/tracer.h | 31 ++++++++++++++-------------- src/whippet.c | 26 +++++++++++------------ 4 files changed, 63 insertions(+), 62 deletions(-) diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 9d04f264b..10bb207c5 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -35,7 +35,7 @@ struct trace_worker { #define TRACE_WORKERS_MAX_COUNT 8 -struct tracer { +struct gc_tracer { atomic_size_t active_tracers; size_t worker_count; long epoch; @@ -52,7 +52,7 @@ struct local_tracer { static int trace_worker_init(struct trace_worker *worker, struct gc_heap *heap, - struct tracer *tracer, size_t id) { + struct gc_tracer *tracer, size_t id) { worker->heap = heap; worker->id = id; worker->steal_id = 0; @@ -66,7 +66,7 @@ static void trace_worker_trace(struct trace_worker *worker); static void* trace_worker_thread(void *data) { struct trace_worker *worker = data; - struct tracer *tracer = heap_tracer(worker->heap); + struct gc_tracer *tracer = heap_tracer(worker->heap); long trace_epoch = 0; pthread_mutex_lock(&worker->lock); @@ -92,8 +92,8 @@ trace_worker_spawn(struct trace_worker *worker) { } static int -tracer_init(struct gc_heap *heap, size_t parallelism) { - struct tracer *tracer = heap_tracer(heap); +gc_tracer_init(struct gc_heap *heap, size_t parallelism) { + struct gc_tracer *tracer = heap_tracer(heap); atomic_init(&tracer->active_tracers, 0); tracer->epoch = 0; pthread_mutex_init(&tracer->lock, NULL); @@ -116,19 +116,19 @@ tracer_init(struct gc_heap *heap, size_t parallelism) { return 1; } -static void tracer_prepare(struct gc_heap *heap) { - struct tracer *tracer = heap_tracer(heap); +static void gc_tracer_prepare(struct gc_heap *heap) { + struct gc_tracer *tracer = heap_tracer(heap); for (size_t i = 0; i < tracer->worker_count; i++) tracer->workers[i].steal_id = 0; } -static void tracer_release(struct gc_heap *heap) { - struct tracer *tracer = heap_tracer(heap); +static void gc_tracer_release(struct gc_heap *heap) { + struct gc_tracer *tracer = heap_tracer(heap); for (size_t i = 0; i < tracer->worker_count; i++) shared_worklist_release(&tracer->workers[i].deque); } static inline void -tracer_unpark_all_workers(struct tracer *tracer) { +tracer_unpark_all_workers(struct gc_tracer *tracer) { long old_epoch = atomic_fetch_add_explicit(&tracer->epoch, 1, memory_order_acq_rel); long epoch = old_epoch + 1; @@ -138,7 +138,7 @@ tracer_unpark_all_workers(struct tracer *tracer) { } static inline void -tracer_maybe_unpark_workers(struct tracer *tracer) { +tracer_maybe_unpark_workers(struct gc_tracer *tracer) { size_t active = atomic_load_explicit(&tracer->active_tracers, memory_order_acquire); if (active < tracer->worker_count) @@ -159,7 +159,7 @@ tracer_share(struct local_tracer *trace) { } static inline void 
-tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { +gc_tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { struct local_tracer *trace = trace_data; if (local_worklist_full(&trace->local)) tracer_share(trace); @@ -167,19 +167,19 @@ tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { } static struct gc_ref -tracer_steal_from_worker(struct tracer *tracer, size_t id) { +tracer_steal_from_worker(struct gc_tracer *tracer, size_t id) { ASSERT(id < tracer->worker_count); return shared_worklist_steal(&tracer->workers[id].deque); } static int -tracer_can_steal_from_worker(struct tracer *tracer, size_t id) { +tracer_can_steal_from_worker(struct gc_tracer *tracer, size_t id) { ASSERT(id < tracer->worker_count); return shared_worklist_can_steal(&tracer->workers[id].deque); } static struct gc_ref -trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer) { +trace_worker_steal_from_any(struct trace_worker *worker, struct gc_tracer *tracer) { for (size_t i = 0; i < tracer->worker_count; i++) { DEBUG("tracer #%zu: stealing from #%zu\n", worker->id, worker->steal_id); struct gc_ref obj = tracer_steal_from_worker(tracer, worker->steal_id); @@ -196,7 +196,7 @@ trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer) static int trace_worker_can_steal_from_any(struct trace_worker *worker, - struct tracer *tracer) { + struct gc_tracer *tracer) { DEBUG("tracer #%zu: checking if any worker has tasks\n", worker->id); for (size_t i = 0; i < tracer->worker_count; i++) { int res = tracer_can_steal_from_worker(tracer, worker->steal_id); @@ -217,7 +217,7 @@ trace_worker_should_continue(struct trace_worker *worker) { if (worker->id != 0) return 0; - struct tracer *tracer = heap_tracer(worker->heap); + struct gc_tracer *tracer = heap_tracer(worker->heap); for (size_t spin_count = 0;; spin_count++) { if (atomic_load_explicit(&tracer->active_tracers, @@ -248,7 +248,7 @@ trace_worker_should_continue(struct trace_worker *worker) { static struct gc_ref trace_worker_steal(struct local_tracer *trace) { struct trace_worker *worker = trace->worker; - struct tracer *tracer = heap_tracer(worker->heap); + struct gc_tracer *tracer = heap_tracer(worker->heap); // It could be that the worker's local trace queue has simply // overflowed. 
In that case avoid contention by trying to pop @@ -271,7 +271,7 @@ trace_worker_steal(struct local_tracer *trace) { static void trace_worker_trace(struct trace_worker *worker) { struct gc_heap *heap = worker->heap; - struct tracer *tracer = heap_tracer(heap); + struct gc_tracer *tracer = heap_tracer(heap); atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); struct local_tracer trace; @@ -303,21 +303,21 @@ trace_worker_trace(struct trace_worker *worker) { } static inline void -tracer_enqueue_root(struct tracer *tracer, struct gc_ref ref) { +gc_tracer_enqueue_root(struct gc_tracer *tracer, struct gc_ref ref) { struct shared_worklist *worker0_deque = &tracer->workers[0].deque; shared_worklist_push(worker0_deque, ref); } static inline void -tracer_enqueue_roots(struct tracer *tracer, struct gc_ref *objv, - size_t count) { +gc_tracer_enqueue_roots(struct gc_tracer *tracer, struct gc_ref *objv, + size_t count) { struct shared_worklist *worker0_deque = &tracer->workers[0].deque; shared_worklist_push_many(worker0_deque, objv, count); } static inline void -tracer_trace(struct gc_heap *heap) { - struct tracer *tracer = heap_tracer(heap); +gc_tracer_trace(struct gc_heap *heap) { + struct gc_tracer *tracer = heap_tracer(heap); DEBUG("starting trace; %zu workers\n", tracer->worker_count); diff --git a/src/serial-tracer.h b/src/serial-tracer.h index 13aae60ce..d353162d9 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -9,34 +9,34 @@ #include "simple-worklist.h" #include "tracer.h" -struct tracer { +struct gc_tracer { struct simple_worklist worklist; }; static int -tracer_init(struct gc_heap *heap, size_t parallelism) { +gc_tracer_init(struct gc_heap *heap, size_t parallelism) { return simple_worklist_init(&heap_tracer(heap)->worklist); } -static void tracer_prepare(struct gc_heap *heap) {} -static void tracer_release(struct gc_heap *heap) { +static void gc_tracer_prepare(struct gc_heap *heap) {} +static void gc_tracer_release(struct gc_heap *heap) { simple_worklist_release(&heap_tracer(heap)->worklist); } static inline void -tracer_enqueue_root(struct tracer *tracer, struct gc_ref obj) { +gc_tracer_enqueue_root(struct gc_tracer *tracer, struct gc_ref obj) { simple_worklist_push(&tracer->worklist, obj); } static inline void -tracer_enqueue_roots(struct tracer *tracer, struct gc_ref *objs, - size_t count) { +gc_tracer_enqueue_roots(struct gc_tracer *tracer, struct gc_ref *objs, + size_t count) { simple_worklist_push_many(&tracer->worklist, objs, count); } static inline void -tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { - tracer_enqueue_root(heap_tracer(heap), ref); +gc_tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { + gc_tracer_enqueue_root(heap_tracer(heap), ref); } static inline void -tracer_trace(struct gc_heap *heap) { +gc_tracer_trace(struct gc_heap *heap) { do { struct gc_ref obj = simple_worklist_pop(&heap_tracer(heap)->worklist); if (!gc_ref_is_heap_object(obj)) diff --git a/src/tracer.h b/src/tracer.h index 15b37e0d4..d0f99938d 100644 --- a/src/tracer.h +++ b/src/tracer.h @@ -11,7 +11,7 @@ struct gc_heap; //////////////////////////////////////////////////////////////////////// // Initialize the tracer when the heap is created. -static inline struct tracer* heap_tracer(struct gc_heap *heap); +static inline struct gc_tracer* heap_tracer(struct gc_heap *heap); // Visit all fields in an object. 
static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, @@ -25,31 +25,32 @@ static inline int trace_edge(struct gc_heap *heap, /// To be implemented by tracer. //////////////////////////////////////////////////////////////////////// -// The tracer struct itself should be defined in the implementation. -struct tracer; +// The tracer struct itself should be defined by the tracer +// implementation. +struct gc_tracer; // Initialize the tracer when the heap is created. -static int tracer_init(struct gc_heap *heap, size_t parallelism); +static int gc_tracer_init(struct gc_heap *heap, size_t parallelism); // Initialize the tracer for a new GC cycle. -static void tracer_prepare(struct gc_heap *heap); +static void gc_tracer_prepare(struct gc_heap *heap); // Release any resources allocated during the trace. -static void tracer_release(struct gc_heap *heap); +static void gc_tracer_release(struct gc_heap *heap); // Add root objects to the trace. Call before tracer_trace. -static inline void tracer_enqueue_root(struct tracer *tracer, - struct gc_ref obj); -static inline void tracer_enqueue_roots(struct tracer *tracer, - struct gc_ref *objs, - size_t count); +static inline void gc_tracer_enqueue_root(struct gc_tracer *tracer, + struct gc_ref obj); +static inline void gc_tracer_enqueue_roots(struct gc_tracer *tracer, + struct gc_ref *objs, + size_t count); // Given that an object has been shaded grey, enqueue for tracing. -static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; +static inline void gc_tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; // Run the full trace. -static inline void tracer_trace(struct gc_heap *heap); +static inline void gc_tracer_trace(struct gc_heap *heap); //////////////////////////////////////////////////////////////////////// /// Procedures that work with any tracer. 
@@ -62,7 +63,7 @@ static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { if (trace_edge(heap, edge)) - tracer_enqueue(gc_edge_ref(edge), heap, trace_data); + gc_tracer_enqueue(gc_edge_ref(edge), heap, trace_data); } #endif // TRACER_H diff --git a/src/whippet.c b/src/whippet.c index 98828f11d..f111dd8fd 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -311,7 +311,7 @@ struct gc_heap { long count; uint8_t last_collection_was_minor; struct gc_mutator *inactive_mutators; - struct tracer tracer; + struct gc_tracer tracer; double fragmentation_low_threshold; double fragmentation_high_threshold; double minor_gc_yield_threshold; @@ -351,7 +351,7 @@ struct gc_mutator { struct gc_mutator *next; }; -static inline struct tracer* heap_tracer(struct gc_heap *heap) { +static inline struct gc_tracer* heap_tracer(struct gc_heap *heap) { return &heap->tracer; } static inline struct mark_space* heap_mark_space(struct gc_heap *heap) { @@ -1126,7 +1126,7 @@ static void trace_and_enqueue_globally(struct gc_edge edge, struct gc_heap *heap, void *unused) { if (trace_edge(heap, edge)) - tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge)); + gc_tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge)); } static inline void do_trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, @@ -1135,7 +1135,7 @@ static inline void do_trace_conservative_ref_and_enqueue_globally(struct gc_cons int possibly_interior) { struct gc_ref object = trace_conservative_ref(heap, ref, possibly_interior); if (gc_ref_is_heap_object(object)) - tracer_enqueue_root(&heap->tracer, object); + gc_tracer_enqueue_root(&heap->tracer, object); } static void trace_possibly_interior_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, @@ -1177,7 +1177,7 @@ static inline void tracer_trace_conservative_ref(struct gc_conservative_ref ref, int possibly_interior = 0; struct gc_ref resolved = trace_conservative_ref(heap, ref, possibly_interior); if (gc_ref_is_heap_object(resolved)) - tracer_enqueue(resolved, heap, data); + gc_tracer_enqueue(resolved, heap, data); } static inline void trace_one_conservatively(struct gc_ref ref, @@ -1325,8 +1325,8 @@ static void trace_mutator_roots_after_stop(struct gc_heap *heap) { // Also collect any already-marked grey objects and put them on the // global trace queue. if (active_mutators_already_marked) - tracer_enqueue_roots(&heap->tracer, mut->mark_buf.objects, - mut->mark_buf.size); + gc_tracer_enqueue_roots(&heap->tracer, mut->mark_buf.objects, + mut->mark_buf.size); else trace_mutator_roots_with_lock(mut); // Also unlink mutator_trace_list chain. 
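Taken together, the renamed entry points run in a fixed order over a collection cycle; the collect() hunk below shows the real sequence, and a condensed sketch (same identifiers as in this patch, with stopping-the-world and root discovery elided; objv/count stand for whatever root array the caller has gathered) looks like:

    /* At heap creation: */
    if (!gc_tracer_init(heap, parallelism))
      GC_CRASH();

    /* Per collection: */
    gc_tracer_prepare(heap);
    /* ... stop mutators and discover roots ... */
    gc_tracer_enqueue_roots(&heap->tracer, objv, count);
    gc_tracer_trace(heap);     /* trace to a fixpoint */
    gc_tracer_release(heap);   /* drop worklist pages until the next cycle */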
@@ -1349,7 +1349,7 @@ static void trace_global_conservative_roots(struct gc_heap *heap) { } static void enqueue_generational_root(struct gc_ref ref, struct gc_heap *heap) { - tracer_enqueue_root(&heap->tracer, ref); + gc_tracer_enqueue_root(&heap->tracer, ref); } // Note that it's quite possible (and even likely) that any given remset @@ -1889,7 +1889,7 @@ static void collect(struct gc_mutator *mut, large_object_space_start_gc(lospace, is_minor); gc_extern_space_start_gc(exspace, is_minor); resolve_ephemerons_lazily(heap); - tracer_prepare(heap); + gc_tracer_prepare(heap); HEAP_EVENT(heap, requesting_stop); request_mutators_to_stop(heap); trace_mutator_roots_with_lock_before_stop(mut); @@ -1906,14 +1906,14 @@ static void collect(struct gc_mutator *mut, prepare_for_evacuation(heap); trace_roots_after_stop(heap); HEAP_EVENT(heap, roots_traced); - tracer_trace(heap); + gc_tracer_trace(heap); HEAP_EVENT(heap, heap_traced); resolve_ephemerons_eagerly(heap); while (enqueue_resolved_ephemerons(heap)) - tracer_trace(heap); + gc_tracer_trace(heap); HEAP_EVENT(heap, ephemerons_traced); sweep_ephemerons(heap); - tracer_release(heap); + gc_tracer_release(heap); mark_space_finish_gc(space, gc_kind); large_object_space_finish_gc(lospace, is_minor); gc_extern_space_finish_gc(exspace, is_minor); @@ -2366,7 +2366,7 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { pthread_cond_init(&heap->collector_cond, NULL); heap->size = options->common.heap_size; - if (!tracer_init(heap, options->common.parallelism)) + if (!gc_tracer_init(heap, options->common.parallelism)) GC_CRASH(); heap->pending_ephemerons_size_factor = 0.005; From ba9459ce567e69d1a8fc5c21f90480a02b9237e0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Jul 2024 17:29:24 +0200 Subject: [PATCH 231/403] Rework tracer API to pass tracer to all functions --- src/parallel-tracer.h | 19 +++++++++---------- src/serial-tracer.h | 22 +++++++++++++--------- src/tracer.h | 30 +++++++----------------------- src/whippet.c | 23 +++++++++++++++++------ 4 files changed, 46 insertions(+), 48 deletions(-) diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 10bb207c5..80638603a 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -36,6 +36,7 @@ struct trace_worker { #define TRACE_WORKERS_MAX_COUNT 8 struct gc_tracer { + struct gc_heap *heap; atomic_size_t active_tracers; size_t worker_count; long epoch; @@ -92,8 +93,9 @@ trace_worker_spawn(struct trace_worker *worker) { } static int -gc_tracer_init(struct gc_heap *heap, size_t parallelism) { - struct gc_tracer *tracer = heap_tracer(heap); +gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, + size_t parallelism) { + tracer->heap = heap; atomic_init(&tracer->active_tracers, 0); tracer->epoch = 0; pthread_mutex_init(&tracer->lock, NULL); @@ -116,13 +118,11 @@ gc_tracer_init(struct gc_heap *heap, size_t parallelism) { return 1; } -static void gc_tracer_prepare(struct gc_heap *heap) { - struct gc_tracer *tracer = heap_tracer(heap); +static void gc_tracer_prepare(struct gc_tracer *tracer) { for (size_t i = 0; i < tracer->worker_count; i++) tracer->workers[i].steal_id = 0; } -static void gc_tracer_release(struct gc_heap *heap) { - struct gc_tracer *tracer = heap_tracer(heap); +static void gc_tracer_release(struct gc_tracer *tracer) { for (size_t i = 0; i < tracer->worker_count; i++) shared_worklist_release(&tracer->workers[i].deque); } @@ -159,7 +159,8 @@ tracer_share(struct local_tracer *trace) { } static inline void -gc_tracer_enqueue(struct 
gc_ref ref, struct gc_heap *heap, void *trace_data) { +gc_tracer_enqueue(struct gc_tracer *tracer, struct gc_ref ref, + void *trace_data) { struct local_tracer *trace = trace_data; if (local_worklist_full(&trace->local)) tracer_share(trace); @@ -316,9 +317,7 @@ gc_tracer_enqueue_roots(struct gc_tracer *tracer, struct gc_ref *objv, } static inline void -gc_tracer_trace(struct gc_heap *heap) { - struct gc_tracer *tracer = heap_tracer(heap); - +gc_tracer_trace(struct gc_tracer *tracer) { DEBUG("starting trace; %zu workers\n", tracer->worker_count); ssize_t parallel_threshold = diff --git a/src/serial-tracer.h b/src/serial-tracer.h index d353162d9..c208e10e5 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -10,16 +10,19 @@ #include "tracer.h" struct gc_tracer { + struct gc_heap *heap; struct simple_worklist worklist; }; static int -gc_tracer_init(struct gc_heap *heap, size_t parallelism) { +gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, + size_t parallelism) { + tracer->heap = heap; return simple_worklist_init(&heap_tracer(heap)->worklist); } -static void gc_tracer_prepare(struct gc_heap *heap) {} -static void gc_tracer_release(struct gc_heap *heap) { - simple_worklist_release(&heap_tracer(heap)->worklist); +static void gc_tracer_prepare(struct gc_tracer *tracer) {} +static void gc_tracer_release(struct gc_tracer *tracer) { + simple_worklist_release(&tracer->worklist); } static inline void @@ -32,16 +35,17 @@ gc_tracer_enqueue_roots(struct gc_tracer *tracer, struct gc_ref *objs, simple_worklist_push_many(&tracer->worklist, objs, count); } static inline void -gc_tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, void *trace_data) { - gc_tracer_enqueue_root(heap_tracer(heap), ref); +gc_tracer_enqueue(struct gc_tracer *tracer, struct gc_ref ref, + void *trace_data) { + gc_tracer_enqueue_root(tracer, ref); } static inline void -gc_tracer_trace(struct gc_heap *heap) { +gc_tracer_trace(struct gc_tracer *tracer) { do { - struct gc_ref obj = simple_worklist_pop(&heap_tracer(heap)->worklist); + struct gc_ref obj = simple_worklist_pop(&tracer->worklist); if (!gc_ref_is_heap_object(obj)) break; - trace_one(obj, heap, NULL); + trace_one(obj, tracer->heap, NULL); } while (1); } diff --git a/src/tracer.h b/src/tracer.h index d0f99938d..a49d25887 100644 --- a/src/tracer.h +++ b/src/tracer.h @@ -17,10 +17,6 @@ static inline struct gc_tracer* heap_tracer(struct gc_heap *heap); static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; -// Visit one edge. Return nonzero if this call shaded the object grey. -static inline int trace_edge(struct gc_heap *heap, - struct gc_edge edge) GC_ALWAYS_INLINE; - //////////////////////////////////////////////////////////////////////// /// To be implemented by tracer. //////////////////////////////////////////////////////////////////////// @@ -30,13 +26,14 @@ static inline int trace_edge(struct gc_heap *heap, struct gc_tracer; // Initialize the tracer when the heap is created. -static int gc_tracer_init(struct gc_heap *heap, size_t parallelism); +static int gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, + size_t parallelism); // Initialize the tracer for a new GC cycle. -static void gc_tracer_prepare(struct gc_heap *heap); +static void gc_tracer_prepare(struct gc_tracer *tracer); // Release any resources allocated during the trace. -static void gc_tracer_release(struct gc_heap *heap); +static void gc_tracer_release(struct gc_tracer *tracer); // Add root objects to the trace. 
Call before tracer_trace. static inline void gc_tracer_enqueue_root(struct gc_tracer *tracer, @@ -46,24 +43,11 @@ static inline void gc_tracer_enqueue_roots(struct gc_tracer *tracer, size_t count); // Given that an object has been shaded grey, enqueue for tracing. -static inline void gc_tracer_enqueue(struct gc_ref ref, struct gc_heap *heap, +static inline void gc_tracer_enqueue(struct gc_tracer *tracer, + struct gc_ref ref, void *trace_data) GC_ALWAYS_INLINE; // Run the full trace. -static inline void gc_tracer_trace(struct gc_heap *heap); - -//////////////////////////////////////////////////////////////////////// -/// Procedures that work with any tracer. -//////////////////////////////////////////////////////////////////////// - -// Visit one edge. If we shade the edge target grey, enqueue it for -// tracing. -static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; -static inline void -tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { - if (trace_edge(heap, edge)) - gc_tracer_enqueue(gc_edge_ref(edge), heap, trace_data); -} +static inline void gc_tracer_trace(struct gc_tracer *tracer); #endif // TRACER_H diff --git a/src/whippet.c b/src/whippet.c index f111dd8fd..cf1f6d6e9 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -674,6 +674,9 @@ static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, return gc_extern_space_visit(heap_extern_space(heap), edge, ref); } +static inline int trace_edge(struct gc_heap *heap, + struct gc_edge edge) GC_ALWAYS_INLINE; + static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { struct gc_ref ref = gc_edge_ref(edge); int is_new = do_trace(heap, edge, ref); @@ -1094,6 +1097,14 @@ void gc_heap_set_extern_space(struct gc_heap *heap, heap->extern_space = space; } +static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; +static inline void +tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { + if (trace_edge(heap, edge)) + gc_tracer_enqueue(&heap->tracer, gc_edge_ref(edge), trace_data); +} + static void trace_and_enqueue_locally(struct gc_edge edge, struct gc_heap *heap, void *data) { @@ -1177,7 +1188,7 @@ static inline void tracer_trace_conservative_ref(struct gc_conservative_ref ref, int possibly_interior = 0; struct gc_ref resolved = trace_conservative_ref(heap, ref, possibly_interior); if (gc_ref_is_heap_object(resolved)) - gc_tracer_enqueue(resolved, heap, data); + gc_tracer_enqueue(&heap->tracer, resolved, data); } static inline void trace_one_conservatively(struct gc_ref ref, @@ -1889,7 +1900,7 @@ static void collect(struct gc_mutator *mut, large_object_space_start_gc(lospace, is_minor); gc_extern_space_start_gc(exspace, is_minor); resolve_ephemerons_lazily(heap); - gc_tracer_prepare(heap); + gc_tracer_prepare(&heap->tracer); HEAP_EVENT(heap, requesting_stop); request_mutators_to_stop(heap); trace_mutator_roots_with_lock_before_stop(mut); @@ -1906,14 +1917,14 @@ static void collect(struct gc_mutator *mut, prepare_for_evacuation(heap); trace_roots_after_stop(heap); HEAP_EVENT(heap, roots_traced); - gc_tracer_trace(heap); + gc_tracer_trace(&heap->tracer); HEAP_EVENT(heap, heap_traced); resolve_ephemerons_eagerly(heap); while (enqueue_resolved_ephemerons(heap)) - gc_tracer_trace(heap); + gc_tracer_trace(&heap->tracer); HEAP_EVENT(heap, ephemerons_traced); sweep_ephemerons(heap); - gc_tracer_release(heap); + gc_tracer_release(&heap->tracer); 
mark_space_finish_gc(space, gc_kind); large_object_space_finish_gc(lospace, is_minor); gc_extern_space_finish_gc(exspace, is_minor); @@ -2366,7 +2377,7 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { pthread_cond_init(&heap->collector_cond, NULL); heap->size = options->common.heap_size; - if (!gc_tracer_init(heap, options->common.parallelism)) + if (!gc_tracer_init(&heap->tracer, heap, options->common.parallelism)) GC_CRASH(); heap->pending_ephemerons_size_factor = 0.005; From ac5d54648183a37130b807d0431defd3667c3fd1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Jul 2024 17:35:58 +0200 Subject: [PATCH 232/403] Remove heap_tracer; get to heap from tracer directly --- src/parallel-tracer.h | 12 +++++++----- src/serial-tracer.h | 2 +- src/tracer.h | 3 --- src/whippet.c | 3 --- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 80638603a..6b25ea1a8 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -25,6 +25,7 @@ enum trace_worker_state { struct gc_heap; struct trace_worker { struct gc_heap *heap; + struct gc_tracer *tracer; size_t id; size_t steal_id; pthread_t thread; @@ -55,6 +56,7 @@ static int trace_worker_init(struct trace_worker *worker, struct gc_heap *heap, struct gc_tracer *tracer, size_t id) { worker->heap = heap; + worker->tracer = tracer; worker->id = id; worker->steal_id = 0; worker->thread = 0; @@ -67,7 +69,7 @@ static void trace_worker_trace(struct trace_worker *worker); static void* trace_worker_thread(void *data) { struct trace_worker *worker = data; - struct gc_tracer *tracer = heap_tracer(worker->heap); + struct gc_tracer *tracer = worker->tracer; long trace_epoch = 0; pthread_mutex_lock(&worker->lock); @@ -155,7 +157,7 @@ tracer_share(struct local_tracer *trace) { shared_worklist_push_many(trace->share_deque, objv, count); to_share -= count; } - tracer_maybe_unpark_workers(heap_tracer(trace->worker->heap)); + tracer_maybe_unpark_workers(trace->worker->tracer); } static inline void @@ -218,7 +220,7 @@ trace_worker_should_continue(struct trace_worker *worker) { if (worker->id != 0) return 0; - struct gc_tracer *tracer = heap_tracer(worker->heap); + struct gc_tracer *tracer = worker->tracer; for (size_t spin_count = 0;; spin_count++) { if (atomic_load_explicit(&tracer->active_tracers, @@ -249,7 +251,7 @@ trace_worker_should_continue(struct trace_worker *worker) { static struct gc_ref trace_worker_steal(struct local_tracer *trace) { struct trace_worker *worker = trace->worker; - struct gc_tracer *tracer = heap_tracer(worker->heap); + struct gc_tracer *tracer = worker->tracer; // It could be that the worker's local trace queue has simply // overflowed. 
In that case avoid contention by trying to pop @@ -272,7 +274,7 @@ trace_worker_steal(struct local_tracer *trace) { static void trace_worker_trace(struct trace_worker *worker) { struct gc_heap *heap = worker->heap; - struct gc_tracer *tracer = heap_tracer(heap); + struct gc_tracer *tracer = worker->tracer; atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); struct local_tracer trace; diff --git a/src/serial-tracer.h b/src/serial-tracer.h index c208e10e5..fffa133dd 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -18,7 +18,7 @@ static int gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, size_t parallelism) { tracer->heap = heap; - return simple_worklist_init(&heap_tracer(heap)->worklist); + return simple_worklist_init(&tracer->worklist); } static void gc_tracer_prepare(struct gc_tracer *tracer) {} static void gc_tracer_release(struct gc_tracer *tracer) { diff --git a/src/tracer.h b/src/tracer.h index a49d25887..0208f6aa8 100644 --- a/src/tracer.h +++ b/src/tracer.h @@ -10,9 +10,6 @@ struct gc_heap; /// To be implemented by collector. //////////////////////////////////////////////////////////////////////// -// Initialize the tracer when the heap is created. -static inline struct gc_tracer* heap_tracer(struct gc_heap *heap); - // Visit all fields in an object. static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; diff --git a/src/whippet.c b/src/whippet.c index cf1f6d6e9..d56476ffc 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -351,9 +351,6 @@ struct gc_mutator { struct gc_mutator *next; }; -static inline struct gc_tracer* heap_tracer(struct gc_heap *heap) { - return &heap->tracer; -} static inline struct mark_space* heap_mark_space(struct gc_heap *heap) { return &heap->mark_space; } From 5ff78f01c88e77c73bb17f36ad85ea2b75c4f218 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Jul 2024 18:09:34 +0200 Subject: [PATCH 233/403] Add gc_trace_worker_call_with_data Goal is to pass thread-local evacuation buffers. 
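The hook lets the collector wrap each worker's trace loop and hand it per-thread state. A minimal sketch of a collector-side definition, assuming (hypothetically) that the worker data will eventually carry an evacuation buffer; the whippet.c change in this patch still just passes NULL:

    struct gc_trace_worker_data {
      /* Hypothetical per-worker evacuation cursor; not defined by this patch. */
      uintptr_t evacuation_cursor;
      uintptr_t evacuation_limit;
    };

    static void
    gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer,
                                             struct gc_heap *heap,
                                             struct gc_trace_worker_data *worker_data,
                                             void *data),
                                   struct gc_tracer *tracer,
                                   struct gc_heap *heap,
                                   void *data) {
      /* Stack-allocate the per-worker data for the duration of the trace. */
      struct gc_trace_worker_data worker_data = { 0, 0 };
      f(tracer, heap, &worker_data, data);
      /* Any buffered per-worker state would be flushed back to the heap here. */
    }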
--- src/parallel-tracer.h | 19 ++++++++++++++++--- src/serial-tracer.h | 11 +++++++++-- src/tracer.h | 15 +++++++++++---- src/whippet.c | 11 +++++++++++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 6b25ea1a8..4bfa0490f 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -32,6 +32,7 @@ struct trace_worker { enum trace_worker_state state; pthread_mutex_t lock; struct shared_worklist deque; + struct gc_trace_worker_data *data; }; #define TRACE_WORKERS_MAX_COUNT 8 @@ -60,7 +61,9 @@ trace_worker_init(struct trace_worker *worker, struct gc_heap *heap, worker->id = id; worker->steal_id = 0; worker->thread = 0; + worker->state = TRACE_WORKER_STOPPED; pthread_mutex_init(&worker->lock, NULL); + worker->data = NULL; return shared_worklist_init(&worker->deque); } @@ -272,10 +275,13 @@ trace_worker_steal(struct local_tracer *trace) { } static void -trace_worker_trace(struct trace_worker *worker) { - struct gc_heap *heap = worker->heap; - struct gc_tracer *tracer = worker->tracer; +trace_with_data(struct gc_tracer *tracer, + struct gc_heap *heap, + struct gc_trace_worker_data *worker_data, + void *data) { + struct trace_worker *worker = data; atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); + worker->data = worker_data; struct local_tracer trace; trace.worker = worker; @@ -302,9 +308,16 @@ trace_worker_trace(struct trace_worker *worker) { DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); + worker->data = NULL; atomic_fetch_sub_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); } +static void +trace_worker_trace(struct trace_worker *worker) { + gc_trace_worker_call_with_data(trace_with_data, worker->tracer, + worker->heap, worker); +} + static inline void gc_tracer_enqueue_root(struct gc_tracer *tracer, struct gc_ref ref) { struct shared_worklist *worker0_deque = &tracer->workers[0].deque; diff --git a/src/serial-tracer.h b/src/serial-tracer.h index fffa133dd..c75425bde 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -40,13 +40,20 @@ gc_tracer_enqueue(struct gc_tracer *tracer, struct gc_ref ref, gc_tracer_enqueue_root(tracer, ref); } static inline void -gc_tracer_trace(struct gc_tracer *tracer) { +tracer_trace_with_data(struct gc_tracer *tracer, struct gc_heap *heap, + struct gc_trace_worker_data *worker_data, + void *data) { do { struct gc_ref obj = simple_worklist_pop(&tracer->worklist); if (!gc_ref_is_heap_object(obj)) break; - trace_one(obj, tracer->heap, NULL); + trace_one(obj, heap, NULL); } while (1); } +static inline void +gc_tracer_trace(struct gc_tracer *tracer) { + gc_trace_worker_call_with_data(tracer_trace_with_data, tracer, tracer->heap, + NULL); +} #endif // SERIAL_TRACER_H diff --git a/src/tracer.h b/src/tracer.h index 0208f6aa8..f0524406a 100644 --- a/src/tracer.h +++ b/src/tracer.h @@ -10,18 +10,25 @@ struct gc_heap; /// To be implemented by collector. //////////////////////////////////////////////////////////////////////// +struct gc_tracer; +struct gc_trace_worker_data; // Visit all fields in an object. 
static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; +static void +gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, + struct gc_heap *heap, + struct gc_trace_worker_data *worker_data, + void *data), + struct gc_tracer *tracer, + struct gc_heap *heap, + void *data); + //////////////////////////////////////////////////////////////////////// /// To be implemented by tracer. //////////////////////////////////////////////////////////////////////// -// The tracer struct itself should be defined by the tracer -// implementation. -struct gc_tracer; - // Initialize the tracer when the heap is created. static int gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, size_t parallelism); diff --git a/src/whippet.c b/src/whippet.c index d56476ffc..63dd1775e 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -1094,6 +1094,17 @@ void gc_heap_set_extern_space(struct gc_heap *heap, heap->extern_space = space; } +static void +gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, + struct gc_heap *heap, + struct gc_trace_worker_data *worker_data, + void *data), + struct gc_tracer *tracer, + struct gc_heap *heap, + void *data) { + f(tracer, heap, NULL, data); +} + static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; static inline void From 247f9432a4ed625fd7d96663588627699eaba8fb Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 8 Jul 2024 18:44:24 +0200 Subject: [PATCH 234/403] Tighten up typing of trace workers and trace worker data --- src/parallel-tracer.h | 90 +++++++++++++++++++------------------------ src/serial-tracer.h | 20 ++++++---- src/tracer.h | 21 +++++----- src/whippet.c | 26 +++++++------ 4 files changed, 78 insertions(+), 79 deletions(-) diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 4bfa0490f..0e93ca0dd 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -23,7 +23,7 @@ enum trace_worker_state { }; struct gc_heap; -struct trace_worker { +struct gc_trace_worker { struct gc_heap *heap; struct gc_tracer *tracer; size_t id; @@ -31,7 +31,8 @@ struct trace_worker { pthread_t thread; enum trace_worker_state state; pthread_mutex_t lock; - struct shared_worklist deque; + struct shared_worklist shared; + struct local_worklist local; struct gc_trace_worker_data *data; }; @@ -44,17 +45,11 @@ struct gc_tracer { long epoch; pthread_mutex_t lock; pthread_cond_t cond; - struct trace_worker workers[TRACE_WORKERS_MAX_COUNT]; -}; - -struct local_tracer { - struct trace_worker *worker; - struct shared_worklist *share_deque; - struct local_worklist local; + struct gc_trace_worker workers[TRACE_WORKERS_MAX_COUNT]; }; static int -trace_worker_init(struct trace_worker *worker, struct gc_heap *heap, +trace_worker_init(struct gc_trace_worker *worker, struct gc_heap *heap, struct gc_tracer *tracer, size_t id) { worker->heap = heap; worker->tracer = tracer; @@ -64,14 +59,15 @@ trace_worker_init(struct trace_worker *worker, struct gc_heap *heap, worker->state = TRACE_WORKER_STOPPED; pthread_mutex_init(&worker->lock, NULL); worker->data = NULL; - return shared_worklist_init(&worker->deque); + local_worklist_init(&worker->local); + return shared_worklist_init(&worker->shared); } -static void trace_worker_trace(struct trace_worker *worker); +static void trace_worker_trace(struct gc_trace_worker *worker); static void* trace_worker_thread(void *data) { - struct trace_worker *worker = data; + struct gc_trace_worker *worker = data; struct 
gc_tracer *tracer = worker->tracer; long trace_epoch = 0; @@ -88,7 +84,7 @@ trace_worker_thread(void *data) { } static int -trace_worker_spawn(struct trace_worker *worker) { +trace_worker_spawn(struct gc_trace_worker *worker) { if (pthread_create(&worker->thread, NULL, trace_worker_thread, worker)) { perror("spawning tracer thread failed"); return 0; @@ -129,7 +125,7 @@ static void gc_tracer_prepare(struct gc_tracer *tracer) { } static void gc_tracer_release(struct gc_tracer *tracer) { for (size_t i = 0; i < tracer->worker_count; i++) - shared_worklist_release(&tracer->workers[i].deque); + shared_worklist_release(&tracer->workers[i].shared); } static inline void @@ -151,41 +147,40 @@ tracer_maybe_unpark_workers(struct gc_tracer *tracer) { } static inline void -tracer_share(struct local_tracer *trace) { - DEBUG("tracer #%zu: sharing\n", trace->worker->id); +tracer_share(struct gc_trace_worker *worker) { + DEBUG("tracer #%zu: sharing\n", worker->id); size_t to_share = LOCAL_WORKLIST_SHARE_AMOUNT; while (to_share) { struct gc_ref *objv; - size_t count = local_worklist_pop_many(&trace->local, &objv, to_share); - shared_worklist_push_many(trace->share_deque, objv, count); + size_t count = local_worklist_pop_many(&worker->local, &objv, to_share); + shared_worklist_push_many(&worker->shared, objv, count); to_share -= count; } - tracer_maybe_unpark_workers(trace->worker->tracer); + tracer_maybe_unpark_workers(worker->tracer); } static inline void -gc_tracer_enqueue(struct gc_tracer *tracer, struct gc_ref ref, - void *trace_data) { - struct local_tracer *trace = trace_data; - if (local_worklist_full(&trace->local)) - tracer_share(trace); - local_worklist_push(&trace->local, ref); +gc_trace_worker_enqueue(struct gc_trace_worker *worker, struct gc_ref ref) { + if (local_worklist_full(&worker->local)) + tracer_share(worker); + local_worklist_push(&worker->local, ref); } static struct gc_ref tracer_steal_from_worker(struct gc_tracer *tracer, size_t id) { ASSERT(id < tracer->worker_count); - return shared_worklist_steal(&tracer->workers[id].deque); + return shared_worklist_steal(&tracer->workers[id].shared); } static int tracer_can_steal_from_worker(struct gc_tracer *tracer, size_t id) { ASSERT(id < tracer->worker_count); - return shared_worklist_can_steal(&tracer->workers[id].deque); + return shared_worklist_can_steal(&tracer->workers[id].shared); } static struct gc_ref -trace_worker_steal_from_any(struct trace_worker *worker, struct gc_tracer *tracer) { +trace_worker_steal_from_any(struct gc_trace_worker *worker, + struct gc_tracer *tracer) { for (size_t i = 0; i < tracer->worker_count; i++) { DEBUG("tracer #%zu: stealing from #%zu\n", worker->id, worker->steal_id); struct gc_ref obj = tracer_steal_from_worker(tracer, worker->steal_id); @@ -201,7 +196,7 @@ trace_worker_steal_from_any(struct trace_worker *worker, struct gc_tracer *trace } static int -trace_worker_can_steal_from_any(struct trace_worker *worker, +trace_worker_can_steal_from_any(struct gc_trace_worker *worker, struct gc_tracer *tracer) { DEBUG("tracer #%zu: checking if any worker has tasks\n", worker->id); for (size_t i = 0; i < tracer->worker_count; i++) { @@ -218,7 +213,7 @@ trace_worker_can_steal_from_any(struct trace_worker *worker, } static int -trace_worker_should_continue(struct trace_worker *worker) { +trace_worker_should_continue(struct gc_trace_worker *worker) { // Helper workers should park themselves immediately if they have no work. 
if (worker->id != 0) return 0; @@ -252,8 +247,7 @@ trace_worker_should_continue(struct trace_worker *worker) { } static struct gc_ref -trace_worker_steal(struct local_tracer *trace) { - struct trace_worker *worker = trace->worker; +trace_worker_steal(struct gc_trace_worker *worker) { struct gc_tracer *tracer = worker->tracer; // It could be that the worker's local trace queue has simply @@ -261,7 +255,7 @@ trace_worker_steal(struct local_tracer *trace) { // something from the worker's own queue. { DEBUG("tracer #%zu: trying to pop worker's own deque\n", worker->id); - struct gc_ref obj = shared_worklist_try_pop(&worker->deque); + struct gc_ref obj = shared_worklist_try_pop(&worker->shared); if (gc_ref_is_heap_object(obj)) return obj; } @@ -277,16 +271,10 @@ trace_worker_steal(struct local_tracer *trace) { static void trace_with_data(struct gc_tracer *tracer, struct gc_heap *heap, - struct gc_trace_worker_data *worker_data, - void *data) { - struct trace_worker *worker = data; + struct gc_trace_worker *worker, + struct gc_trace_worker_data *data) { atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); - worker->data = worker_data; - - struct local_tracer trace; - trace.worker = worker; - trace.share_deque = &worker->deque; - local_worklist_init(&trace.local); + worker->data = data; size_t n = 0; DEBUG("tracer #%zu: running trace loop\n", worker->id); @@ -294,14 +282,14 @@ trace_with_data(struct gc_tracer *tracer, do { while (1) { struct gc_ref ref; - if (!local_worklist_empty(&trace.local)) { - ref = local_worklist_pop(&trace.local); + if (!local_worklist_empty(&worker->local)) { + ref = local_worklist_pop(&worker->local); } else { - ref = trace_worker_steal(&trace); + ref = trace_worker_steal(worker); if (!gc_ref_is_heap_object(ref)) break; } - trace_one(ref, heap, &trace); + trace_one(ref, heap, worker); n++; } } while (trace_worker_should_continue(worker)); @@ -313,21 +301,21 @@ trace_with_data(struct gc_tracer *tracer, } static void -trace_worker_trace(struct trace_worker *worker) { +trace_worker_trace(struct gc_trace_worker *worker) { gc_trace_worker_call_with_data(trace_with_data, worker->tracer, worker->heap, worker); } static inline void gc_tracer_enqueue_root(struct gc_tracer *tracer, struct gc_ref ref) { - struct shared_worklist *worker0_deque = &tracer->workers[0].deque; + struct shared_worklist *worker0_deque = &tracer->workers[0].shared; shared_worklist_push(worker0_deque, ref); } static inline void gc_tracer_enqueue_roots(struct gc_tracer *tracer, struct gc_ref *objv, size_t count) { - struct shared_worklist *worker0_deque = &tracer->workers[0].deque; + struct shared_worklist *worker0_deque = &tracer->workers[0].shared; shared_worklist_push_many(worker0_deque, objv, count); } @@ -337,7 +325,7 @@ gc_tracer_trace(struct gc_tracer *tracer) { ssize_t parallel_threshold = LOCAL_WORKLIST_SIZE - LOCAL_WORKLIST_SHARE_AMOUNT; - if (shared_worklist_size(&tracer->workers[0].deque) >= parallel_threshold) { + if (shared_worklist_size(&tracer->workers[0].shared) >= parallel_threshold) { DEBUG("waking workers\n"); tracer_unpark_all_workers(tracer); } else { diff --git a/src/serial-tracer.h b/src/serial-tracer.h index c75425bde..9e128be91 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -14,6 +14,11 @@ struct gc_tracer { struct simple_worklist worklist; }; +struct gc_trace_worker { + struct gc_tracer *tracer; + struct gc_trace_worker_data *data; +}; + static int gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, size_t parallelism) { @@ -35,25 
+40,26 @@ gc_tracer_enqueue_roots(struct gc_tracer *tracer, struct gc_ref *objs, simple_worklist_push_many(&tracer->worklist, objs, count); } static inline void -gc_tracer_enqueue(struct gc_tracer *tracer, struct gc_ref ref, - void *trace_data) { - gc_tracer_enqueue_root(tracer, ref); +gc_trace_worker_enqueue(struct gc_trace_worker *worker, struct gc_ref ref) { + gc_tracer_enqueue_root(worker->tracer, ref); } static inline void tracer_trace_with_data(struct gc_tracer *tracer, struct gc_heap *heap, - struct gc_trace_worker_data *worker_data, - void *data) { + struct gc_trace_worker *worker, + struct gc_trace_worker_data *data) { + worker->data = data; do { struct gc_ref obj = simple_worklist_pop(&tracer->worklist); if (!gc_ref_is_heap_object(obj)) break; - trace_one(obj, heap, NULL); + trace_one(obj, heap, worker); } while (1); } static inline void gc_tracer_trace(struct gc_tracer *tracer) { + struct gc_trace_worker worker = { tracer }; gc_trace_worker_call_with_data(tracer_trace_with_data, tracer, tracer->heap, - NULL); + &worker); } #endif // SERIAL_TRACER_H diff --git a/src/tracer.h b/src/tracer.h index f0524406a..64f6dcdc6 100644 --- a/src/tracer.h +++ b/src/tracer.h @@ -6,24 +6,28 @@ struct gc_heap; +// Data types to be implemented by tracer. +struct gc_tracer; +struct gc_trace_worker; +// Data types to be implemented by collector. +struct gc_trace_worker_data; + //////////////////////////////////////////////////////////////////////// /// To be implemented by collector. //////////////////////////////////////////////////////////////////////// -struct gc_tracer; -struct gc_trace_worker_data; // Visit all fields in an object. static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; + struct gc_trace_worker *worker) GC_ALWAYS_INLINE; static void gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_heap *heap, - struct gc_trace_worker_data *worker_data, - void *data), + struct gc_trace_worker *worker, + struct gc_trace_worker_data *data), struct gc_tracer *tracer, struct gc_heap *heap, - void *data); + struct gc_trace_worker *worker); //////////////////////////////////////////////////////////////////////// /// To be implemented by tracer. @@ -47,9 +51,8 @@ static inline void gc_tracer_enqueue_roots(struct gc_tracer *tracer, size_t count); // Given that an object has been shaded grey, enqueue for tracing. -static inline void gc_tracer_enqueue(struct gc_tracer *tracer, - struct gc_ref ref, - void *trace_data) GC_ALWAYS_INLINE; +static inline void gc_trace_worker_enqueue(struct gc_trace_worker *worker, + struct gc_ref ref) GC_ALWAYS_INLINE; // Run the full trace. 
static inline void gc_tracer_trace(struct gc_tracer *tracer); diff --git a/src/whippet.c b/src/whippet.c index 63dd1775e..0b4dd61d6 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -1097,20 +1097,21 @@ void gc_heap_set_extern_space(struct gc_heap *heap, static void gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_heap *heap, - struct gc_trace_worker_data *worker_data, - void *data), + struct gc_trace_worker *worker, + struct gc_trace_worker_data *data), struct gc_tracer *tracer, struct gc_heap *heap, - void *data) { - f(tracer, heap, NULL, data); + struct gc_trace_worker *worker) { + f(tracer, heap, worker, NULL); } static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { + struct gc_trace_worker *worker = trace_data; if (trace_edge(heap, edge)) - gc_tracer_enqueue(&heap->tracer, gc_edge_ref(edge), trace_data); + gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); } static void trace_and_enqueue_locally(struct gc_edge edge, @@ -1193,15 +1194,16 @@ trace_conservative_edges(uintptr_t low, static inline void tracer_trace_conservative_ref(struct gc_conservative_ref ref, struct gc_heap *heap, void *data) { + struct gc_trace_worker *worker = data; int possibly_interior = 0; struct gc_ref resolved = trace_conservative_ref(heap, ref, possibly_interior); if (gc_ref_is_heap_object(resolved)) - gc_tracer_enqueue(&heap->tracer, resolved, data); + gc_trace_worker_enqueue(worker, resolved); } static inline void trace_one_conservatively(struct gc_ref ref, struct gc_heap *heap, - void *mark_data) { + struct gc_trace_worker *worker) { size_t bytes; if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { // Generally speaking we trace conservatively and don't allow much @@ -1211,7 +1213,7 @@ static inline void trace_one_conservatively(struct gc_ref ref, uint8_t meta = *metadata_byte_for_addr(gc_ref_value(ref)); if (GC_UNLIKELY(meta & METADATA_BYTE_EPHEMERON)) { gc_trace_ephemeron(gc_ref_heap_object(ref), tracer_visit, heap, - mark_data); + worker); return; } bytes = mark_space_object_size(heap_mark_space(heap), ref); @@ -1221,15 +1223,15 @@ static inline void trace_one_conservatively(struct gc_ref ref, trace_conservative_edges(gc_ref_value(ref), gc_ref_value(ref) + bytes, tracer_trace_conservative_ref, heap, - mark_data); + worker); } static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, - void *mark_data) { + struct gc_trace_worker *worker) { if (gc_has_conservative_intraheap_edges()) - trace_one_conservatively(ref, heap, mark_data); + trace_one_conservatively(ref, heap, worker); else - gc_trace_object(ref, tracer_visit, heap, mark_data, NULL); + gc_trace_object(ref, tracer_visit, heap, worker, NULL); } static void From d50455ed1be21888b0b04db56140ddc9c29c3160 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 9 Jul 2024 09:57:28 +0200 Subject: [PATCH 235/403] Add new gc_atomic_forward_object_size API Also remove an unused function in whippet.c --- api/gc-embedder-api.h | 1 + benchmarks/simple-gc-embedder.h | 14 ++++++++++++++ doc/manual.md | 5 ++++- src/whippet.c | 7 ------- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/api/gc-embedder-api.h b/api/gc-embedder-api.h index ad33bc170..30ba62946 100644 --- a/api/gc-embedder-api.h +++ b/api/gc-embedder-api.h @@ -66,6 +66,7 @@ GC_EMBEDDER_API inline struct gc_atomic_forward gc_atomic_forward_begin(struct g GC_EMBEDDER_API inline void 
gc_atomic_forward_acquire(struct gc_atomic_forward *); GC_EMBEDDER_API inline int gc_atomic_forward_retry_busy(struct gc_atomic_forward *); GC_EMBEDDER_API inline void gc_atomic_forward_abort(struct gc_atomic_forward *); +GC_EMBEDDER_API inline size_t gc_atomic_forward_object_size(struct gc_atomic_forward *); GC_EMBEDDER_API inline void gc_atomic_forward_commit(struct gc_atomic_forward *, struct gc_ref new_ref); GC_EMBEDDER_API inline uintptr_t gc_atomic_forward_address(struct gc_atomic_forward *); diff --git a/benchmarks/simple-gc-embedder.h b/benchmarks/simple-gc-embedder.h index 2c599655b..5f77a7cc9 100644 --- a/benchmarks/simple-gc-embedder.h +++ b/benchmarks/simple-gc-embedder.h @@ -168,6 +168,20 @@ gc_atomic_forward_abort(struct gc_atomic_forward *fwd) { fwd->state = GC_FORWARDING_STATE_ABORTED; } +static inline size_t +gc_atomic_forward_object_size(struct gc_atomic_forward *fwd) { + GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED); + switch (tag_live_alloc_kind(fwd->data)) { +#define SCAN_OBJECT(name, Name, NAME) \ + case ALLOC_KIND_##NAME: \ + return name##_size(gc_ref_heap_object(ref)); \ + FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) +#undef SCAN_OBJECT + default: + GC_CRASH(); + } +} + static inline void gc_atomic_forward_commit(struct gc_atomic_forward *fwd, struct gc_ref new_ref) { GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED); diff --git a/doc/manual.md b/doc/manual.md index 4156defdf..ab0656db0 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -140,7 +140,10 @@ acquired and completed the forwarding attempt. An `ACQUIRED` object can then be forwarded via `gc_atomic_forward_commit`, or the forwarding attempt can be aborted via -`gc_atomic_forward_abort`. +`gc_atomic_forward_abort`. Also, when an object is acquired, the +collector may call `gc_atomic_forward_object_size` to compute how many +bytes to copy. (The collector may choose instead to record object sizes +in a different way.) All of these `gc_atomic_forward` functions are to be implemented by the embedder. 
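[Editor's note, not part of the patch: a condensed sketch of how a copying collector can drive this embedder-implemented protocol from the tracing side, mirroring what the pcc collector introduced later in this series does. The names `forward_edge` and `evacuate` are hypothetical placeholders for the collector's own code.]

    // Try to forward the object behind `edge`, copying it if we win the race.
    static void forward_edge(struct gc_edge edge, struct gc_ref old_ref) {
      struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref);
      if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED)
        gc_atomic_forward_acquire(&fwd);
      switch (fwd.state) {
      case GC_FORWARDING_STATE_ACQUIRED: {
        // We claimed the object: ask the embedder for its size, copy it,
        // then publish the new address so other tracers see it.
        size_t bytes = gc_atomic_forward_object_size(&fwd);
        struct gc_ref new_ref = evacuate(old_ref, bytes);
        gc_atomic_forward_commit(&fwd, new_ref);
        gc_edge_update(edge, new_ref);
        break;
      }
      case GC_FORWARDING_STATE_BUSY:
        // Another tracer claimed the object first; spin until it finishes.
        // (This sketch assumes the winner commits rather than aborts.)
        while (!gc_atomic_forward_retry_busy(&fwd)) {}
        // Fall through: the object is now forwarded.
      case GC_FORWARDING_STATE_FORWARDED:
        gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd)));
        break;
      default:
        GC_CRASH();
      }
    }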
Some programs may allocate a dedicated forwarding word in all diff --git a/src/whippet.c b/src/whippet.c index 0b4dd61d6..af9b3dbd4 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -831,13 +831,6 @@ static inline size_t mark_space_object_size(struct mark_space *space, return granules * GRANULE_SIZE; } -static inline size_t gc_object_allocation_size(struct gc_heap *heap, - struct gc_ref ref) { - if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) - return mark_space_object_size(heap_mark_space(heap), ref); - return large_object_space_object_size(heap_large_object_space(heap), ref); -} - static int heap_has_multiple_mutators(struct gc_heap *heap) { return atomic_load_explicit(&heap->multithreaded, memory_order_relaxed); } From ff1e1b1d443173e3b5c8b1c161c85319396a7cc1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 10 Jul 2024 11:47:15 +0200 Subject: [PATCH 236/403] whippet: avoid an atomic load when visiting already-marked objects --- src/whippet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/whippet.c b/src/whippet.c index af9b3dbd4..9674560c2 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -678,7 +678,8 @@ static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { struct gc_ref ref = gc_edge_ref(edge); int is_new = do_trace(heap, edge, ref); - if (GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, + if (is_new && + GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, memory_order_relaxed))) gc_resolve_pending_ephemerons(ref, heap); From 5084730471a15b6776c548f9e6779e61b9638705 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 10 Jul 2024 15:46:11 +0200 Subject: [PATCH 237/403] Add parallel root-tracing phase --- benchmarks/simple-gc-embedder.h | 8 ++-- src/gc-ephemeron-internal.h | 16 ++++--- src/gc-ephemeron.c | 22 +++++----- src/parallel-tracer.h | 51 +++++++++++++++++----- src/root-worklist.h | 76 +++++++++++++++++++++++++++++++++ src/root.h | 43 +++++++++++++++++++ src/semi.c | 7 ++- src/serial-tracer.h | 20 +++++++++ src/tracer.h | 8 ++++ src/whippet.c | 14 +++++- 10 files changed, 231 insertions(+), 34 deletions(-) create mode 100644 src/root-worklist.h create mode 100644 src/root.h diff --git a/benchmarks/simple-gc-embedder.h b/benchmarks/simple-gc-embedder.h index 5f77a7cc9..23fc54a5d 100644 --- a/benchmarks/simple-gc-embedder.h +++ b/benchmarks/simple-gc-embedder.h @@ -172,11 +172,11 @@ static inline size_t gc_atomic_forward_object_size(struct gc_atomic_forward *fwd) { GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED); switch (tag_live_alloc_kind(fwd->data)) { -#define SCAN_OBJECT(name, Name, NAME) \ +#define OBJECT_SIZE(name, Name, NAME) \ case ALLOC_KIND_##NAME: \ - return name##_size(gc_ref_heap_object(ref)); \ - FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT) -#undef SCAN_OBJECT + return name##_size(gc_ref_heap_object(fwd->ref)); + FOR_EACH_HEAP_OBJECT_KIND(OBJECT_SIZE) +#undef OBJECT_SIZE default: GC_CRASH(); } diff --git a/src/gc-ephemeron-internal.h b/src/gc-ephemeron-internal.h index 8894bbd8f..3d34cf188 100644 --- a/src/gc-ephemeron-internal.h +++ b/src/gc-ephemeron-internal.h @@ -32,12 +32,16 @@ gc_scan_pending_ephemerons(struct gc_pending_ephemerons *state, struct gc_heap *heap, size_t shard, size_t nshards); -GC_INTERNAL int -gc_pop_resolved_ephemerons(struct gc_heap *heap, - void (*visit)(struct gc_edge edge, - struct gc_heap *heap, - void *visit_data), - void *trace_data); +GC_INTERNAL struct gc_ephemeron* +gc_pop_resolved_ephemerons(struct gc_heap *heap); + +GC_INTERNAL void 
+gc_trace_resolved_ephemerons(struct gc_ephemeron *resolved, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *trace_data); GC_INTERNAL void gc_sweep_pending_ephemerons(struct gc_pending_ephemerons *state, diff --git a/src/gc-ephemeron.c b/src/gc-ephemeron.c index 8a42c1a84..90f82e8ca 100644 --- a/src/gc-ephemeron.c +++ b/src/gc-ephemeron.c @@ -521,23 +521,25 @@ gc_scan_pending_ephemerons(struct gc_pending_ephemerons *state, } } -int -gc_pop_resolved_ephemerons(struct gc_heap *heap, - void (*visit)(struct gc_edge edge, - struct gc_heap *heap, - void *visit_data), - void *trace_data) { +struct gc_ephemeron* +gc_pop_resolved_ephemerons(struct gc_heap *heap) { struct gc_pending_ephemerons *state = gc_heap_pending_ephemerons(heap); - struct gc_ephemeron *resolved = atomic_exchange(&state->resolved, NULL); - if (!resolved) - return 0; + return atomic_exchange(&state->resolved, NULL); +} + +void +gc_trace_resolved_ephemerons(struct gc_ephemeron *resolved, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *trace_data) { for (; resolved; resolved = resolved->resolved) { visit(gc_ephemeron_value_edge(resolved), heap, trace_data); // RESOLVED -> TRACED. atomic_store_explicit(&resolved->state, EPHEMERON_STATE_TRACED, memory_order_release); } - return 1; } void diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 0e93ca0dd..dea87cc9f 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -10,10 +10,17 @@ #include "debug.h" #include "gc-inline.h" #include "local-worklist.h" +#include "root-worklist.h" #include "shared-worklist.h" #include "spin.h" #include "tracer.h" +#ifdef VERBOSE_LOGGING +#define LOG(...) fprintf (stderr, "LOG: " __VA_ARGS__) +#else +#define LOG(...) 
do { } while (0) +#endif + enum trace_worker_state { TRACE_WORKER_STOPPED, TRACE_WORKER_IDLE, @@ -36,6 +43,11 @@ struct gc_trace_worker { struct gc_trace_worker_data *data; }; +static inline struct gc_trace_worker_data* +gc_trace_worker_data(struct gc_trace_worker *worker) { + return worker->data; +} + #define TRACE_WORKERS_MAX_COUNT 8 struct gc_tracer { @@ -45,6 +57,7 @@ struct gc_tracer { long epoch; pthread_mutex_t lock; pthread_cond_t cond; + struct root_worklist roots; struct gc_trace_worker workers[TRACE_WORKERS_MAX_COUNT]; }; @@ -101,6 +114,7 @@ gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, tracer->epoch = 0; pthread_mutex_init(&tracer->lock, NULL); pthread_cond_init(&tracer->cond, NULL); + root_worklist_init(&tracer->roots); size_t desired_worker_count = parallelism; ASSERT(desired_worker_count); if (desired_worker_count > TRACE_WORKERS_MAX_COUNT) @@ -121,13 +135,18 @@ gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, static void gc_tracer_prepare(struct gc_tracer *tracer) { for (size_t i = 0; i < tracer->worker_count; i++) - tracer->workers[i].steal_id = 0; + tracer->workers[i].steal_id = (i + 1) % tracer->worker_count; } static void gc_tracer_release(struct gc_tracer *tracer) { for (size_t i = 0; i < tracer->worker_count; i++) shared_worklist_release(&tracer->workers[i].shared); } +static inline void +gc_tracer_add_root(struct gc_tracer *tracer, struct gc_root root) { + root_worklist_push(&tracer->roots, root); +} + static inline void tracer_unpark_all_workers(struct gc_tracer *tracer) { long old_epoch = @@ -161,6 +180,7 @@ tracer_share(struct gc_trace_worker *worker) { static inline void gc_trace_worker_enqueue(struct gc_trace_worker *worker, struct gc_ref ref) { + ASSERT(gc_ref_is_heap_object(ref)); if (local_worklist_full(&worker->local)) tracer_share(worker); local_worklist_push(&worker->local, ref); @@ -182,33 +202,33 @@ static struct gc_ref trace_worker_steal_from_any(struct gc_trace_worker *worker, struct gc_tracer *tracer) { for (size_t i = 0; i < tracer->worker_count; i++) { - DEBUG("tracer #%zu: stealing from #%zu\n", worker->id, worker->steal_id); + LOG("tracer #%zu: stealing from #%zu\n", worker->id, worker->steal_id); struct gc_ref obj = tracer_steal_from_worker(tracer, worker->steal_id); if (gc_ref_is_heap_object(obj)) { - DEBUG("tracer #%zu: stealing got %p\n", worker->id, + LOG("tracer #%zu: stealing got %p\n", worker->id, gc_ref_heap_object(obj)); return obj; } worker->steal_id = (worker->steal_id + 1) % tracer->worker_count; } - DEBUG("tracer #%zu: failed to steal\n", worker->id); + LOG("tracer #%zu: failed to steal\n", worker->id); return gc_ref_null(); } static int trace_worker_can_steal_from_any(struct gc_trace_worker *worker, struct gc_tracer *tracer) { - DEBUG("tracer #%zu: checking if any worker has tasks\n", worker->id); + LOG("tracer #%zu: checking if any worker has tasks\n", worker->id); for (size_t i = 0; i < tracer->worker_count; i++) { int res = tracer_can_steal_from_worker(tracer, worker->steal_id); if (res) { - DEBUG("tracer #%zu: worker #%zu has tasks!\n", worker->id, + LOG("tracer #%zu: worker #%zu has tasks!\n", worker->id, worker->steal_id); return 1; } worker->steal_id = (worker->steal_id + 1) % tracer->worker_count; } - DEBUG("tracer #%zu: nothing to steal\n", worker->id); + LOG("tracer #%zu: nothing to steal\n", worker->id); return 0; } @@ -241,7 +261,7 @@ trace_worker_should_continue(struct gc_trace_worker *worker) { return !done; } // spin - DEBUG("checking for termination: spinning #%zu\n", spin_count); + 
LOG("checking for termination: spinning #%zu\n", spin_count); yield_for_spin(spin_count); } } @@ -254,13 +274,13 @@ trace_worker_steal(struct gc_trace_worker *worker) { // overflowed. In that case avoid contention by trying to pop // something from the worker's own queue. { - DEBUG("tracer #%zu: trying to pop worker's own deque\n", worker->id); + LOG("tracer #%zu: trying to pop worker's own deque\n", worker->id); struct gc_ref obj = shared_worklist_try_pop(&worker->shared); if (gc_ref_is_heap_object(obj)) return obj; } - DEBUG("tracer #%zu: trying to steal\n", worker->id); + LOG("tracer #%zu: trying to steal\n", worker->id); struct gc_ref obj = trace_worker_steal_from_any(worker, tracer); if (gc_ref_is_heap_object(obj)) return obj; @@ -279,6 +299,13 @@ trace_with_data(struct gc_tracer *tracer, size_t n = 0; DEBUG("tracer #%zu: running trace loop\n", worker->id); + do { + struct gc_root root = root_worklist_pop(&tracer->roots); + if (root.kind == GC_ROOT_KIND_NONE) + break; + trace_root(root, heap, worker); + } while (1); + do { while (1) { struct gc_ref ref; @@ -325,7 +352,8 @@ gc_tracer_trace(struct gc_tracer *tracer) { ssize_t parallel_threshold = LOCAL_WORKLIST_SIZE - LOCAL_WORKLIST_SHARE_AMOUNT; - if (shared_worklist_size(&tracer->workers[0].shared) >= parallel_threshold) { + if (root_worklist_size(&tracer->roots) > 1 || + shared_worklist_size(&tracer->workers[0].shared) >= parallel_threshold) { DEBUG("waking workers\n"); tracer_unpark_all_workers(tracer); } else { @@ -333,6 +361,7 @@ gc_tracer_trace(struct gc_tracer *tracer) { } trace_worker_trace(&tracer->workers[0]); + root_worklist_reset(&tracer->roots); DEBUG("trace finished\n"); } diff --git a/src/root-worklist.h b/src/root-worklist.h new file mode 100644 index 000000000..45ede8595 --- /dev/null +++ b/src/root-worklist.h @@ -0,0 +1,76 @@ +#ifndef ROOT_WORKLIST_H +#define ROOT_WORKLIST_H + +#include +#include +#include + +#include "assert.h" +#include "debug.h" +#include "gc-inline.h" +#include "gc-ref.h" +#include "root.h" + +// A single-producer, multiple-consumer worklist that has two phases: +// one in which roots are added by the producer, then one in which roots +// are consumed from the worklist. Roots are never added once the +// consumer phase starts. +struct root_worklist { + size_t size; + size_t read; + size_t write; + struct gc_root *buf; +}; + +void +root_worklist_alloc(struct root_worklist *q) { + q->buf = realloc(q->buf, q->size * sizeof(struct gc_root)); + if (!q->buf) { + perror("Failed to grow root worklist"); + GC_CRASH(); + } +} + +static void +root_worklist_init(struct root_worklist *q) { + q->size = 16; + q->read = 0; + q->write = 0; + q->buf = NULL; + root_worklist_alloc(q); +} + +static inline void +root_worklist_push(struct root_worklist *q, struct gc_root root) { + if (UNLIKELY(q->write == q->size)) { + q->size *= 2; + root_worklist_alloc(q); + } + q->buf[q->write++] = root; +} + +// Not atomic. 
+static inline size_t +root_worklist_size(struct root_worklist *q) { + return q->write - q->read; +} + +static inline struct gc_root +root_worklist_pop(struct root_worklist *q) { + size_t idx = atomic_fetch_add(&q->read, 1); + if (idx < q->write) + return q->buf[idx]; + return (struct gc_root){ GC_ROOT_KIND_NONE, }; +} + +static void +root_worklist_reset(struct root_worklist *q) { + q->read = q->write = 0; +} + +static void +root_worklist_destroy(struct root_worklist *q) { + free(q->buf); +} + +#endif // ROOT_WORKLIST_H diff --git a/src/root.h b/src/root.h new file mode 100644 index 000000000..a6a91f987 --- /dev/null +++ b/src/root.h @@ -0,0 +1,43 @@ +#ifndef ROOT_H +#define ROOT_H + +struct gc_ephemeron; +struct gc_heap; +struct gc_mutator; + +enum gc_root_kind { + GC_ROOT_KIND_NONE, + GC_ROOT_KIND_HEAP, + GC_ROOT_KIND_MUTATOR, + GC_ROOT_KIND_RESOLVED_EPHEMERONS +}; + +struct gc_root { + enum gc_root_kind kind; + union { + struct gc_heap *heap; + struct gc_mutator *mutator; + struct gc_ephemeron *resolved_ephemerons; + }; +}; + +static inline struct gc_root gc_root_heap(struct gc_heap* heap) { + struct gc_root ret = { GC_ROOT_KIND_HEAP }; + ret.heap = heap; + return ret; +} + +static inline struct gc_root gc_root_mutator(struct gc_mutator* mutator) { + struct gc_root ret = { GC_ROOT_KIND_MUTATOR }; + ret.mutator = mutator; + return ret; +} + +static inline struct gc_root +gc_root_resolved_ephemerons(struct gc_ephemeron* resolved) { + struct gc_root ret = { GC_ROOT_KIND_RESOLVED_EPHEMERONS }; + ret.resolved_ephemerons = resolved; + return ret; +} + +#endif // ROOT_H diff --git a/src/semi.c b/src/semi.c index fdcd03792..af9134bd7 100644 --- a/src/semi.c +++ b/src/semi.c @@ -380,9 +380,14 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { HEAP_EVENT(heap, heap_traced); gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); heap->check_pending_ephemerons = 1; - while (gc_pop_resolved_ephemerons(heap, trace, NULL)) + do { + struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); + if (!resolved) + break; + gc_trace_resolved_ephemerons(resolved, trace, heap, NULL); while(grey < semi->hp) grey = scan(heap, gc_ref(grey)); + } while (1); HEAP_EVENT(heap, ephemerons_traced); large_object_space_finish_gc(large, 0); gc_extern_space_finish_gc(heap->extern_space, 0); diff --git a/src/serial-tracer.h b/src/serial-tracer.h index 9e128be91..96ab7e563 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -7,10 +7,12 @@ #include "assert.h" #include "debug.h" #include "simple-worklist.h" +#include "root-worklist.h" #include "tracer.h" struct gc_tracer { struct gc_heap *heap; + struct root_worklist roots; struct simple_worklist worklist; }; @@ -19,10 +21,16 @@ struct gc_trace_worker { struct gc_trace_worker_data *data; }; +static inline struct gc_trace_worker_data* +gc_trace_worker_data(struct gc_trace_worker *worker) { + return worker->data; +} + static int gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, size_t parallelism) { tracer->heap = heap; + root_worklist_init(&tracer->roots); return simple_worklist_init(&tracer->worklist); } static void gc_tracer_prepare(struct gc_tracer *tracer) {} @@ -30,6 +38,11 @@ static void gc_tracer_release(struct gc_tracer *tracer) { simple_worklist_release(&tracer->worklist); } +static inline void +gc_tracer_add_root(struct gc_tracer *tracer, struct gc_root root) { + root_worklist_push(&tracer->roots, root); +} + static inline void gc_tracer_enqueue_root(struct gc_tracer *tracer, struct gc_ref obj) { 
simple_worklist_push(&tracer->worklist, obj); @@ -48,6 +61,13 @@ tracer_trace_with_data(struct gc_tracer *tracer, struct gc_heap *heap, struct gc_trace_worker *worker, struct gc_trace_worker_data *data) { worker->data = data; + do { + struct gc_root root = root_worklist_pop(&tracer->roots); + if (root.kind == GC_ROOT_KIND_NONE) + break; + trace_root(root, heap, worker); + } while (1); + root_worklist_reset(&tracer->roots); do { struct gc_ref obj = simple_worklist_pop(&tracer->worklist); if (!gc_ref_is_heap_object(obj)) diff --git a/src/tracer.h b/src/tracer.h index 64f6dcdc6..ec6a140b1 100644 --- a/src/tracer.h +++ b/src/tracer.h @@ -3,6 +3,7 @@ #include "gc-ref.h" #include "gc-edge.h" +#include "root.h" struct gc_heap; @@ -19,6 +20,8 @@ struct gc_trace_worker_data; // Visit all fields in an object. static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, struct gc_trace_worker *worker) GC_ALWAYS_INLINE; +static inline void trace_root(struct gc_root root, struct gc_heap *heap, + struct gc_trace_worker *worker) GC_ALWAYS_INLINE; static void gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, @@ -53,6 +56,11 @@ static inline void gc_tracer_enqueue_roots(struct gc_tracer *tracer, // Given that an object has been shaded grey, enqueue for tracing. static inline void gc_trace_worker_enqueue(struct gc_trace_worker *worker, struct gc_ref ref) GC_ALWAYS_INLINE; +static inline struct gc_trace_worker_data* +gc_trace_worker_data(struct gc_trace_worker *worker) GC_ALWAYS_INLINE; + +static inline void gc_tracer_add_root(struct gc_tracer *tracer, + struct gc_root root); // Run the full trace. static inline void gc_tracer_trace(struct gc_tracer *tracer); diff --git a/src/whippet.c b/src/whippet.c index 9674560c2..3e17f5422 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -1228,6 +1228,13 @@ static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, gc_trace_object(ref, tracer_visit, heap, worker, NULL); } +static inline void trace_root(struct gc_root root, + struct gc_heap *heap, + struct gc_trace_worker *worker) { + // We don't use parallel root tracing yet. 
+ GC_CRASH(); +} + static void mark_and_globally_enqueue_mutator_conservative_roots(uintptr_t low, uintptr_t high, @@ -1876,8 +1883,11 @@ static void resolve_ephemerons_eagerly(struct gc_heap *heap) { } static int enqueue_resolved_ephemerons(struct gc_heap *heap) { - return gc_pop_resolved_ephemerons(heap, trace_and_enqueue_globally, - NULL); + struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); + if (!resolved) + return 0; + gc_trace_resolved_ephemerons(resolved, trace_and_enqueue_globally, heap, NULL); + return 1; } static void sweep_ephemerons(struct gc_heap *heap) { From c226570a816ec84cbefca87921be3914746e3aba Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 11 Jul 2024 15:52:57 +0200 Subject: [PATCH 238/403] Fix parallel tracer to force workers to be stopped during pauses --- src/parallel-tracer.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index dea87cc9f..888c0fad6 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -125,6 +125,7 @@ gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, for (size_t i = 1; i < desired_worker_count; i++) { if (!trace_worker_init(&tracer->workers[i], heap, tracer, i)) break; + pthread_mutex_lock(&tracer->workers[i].lock); if (trace_worker_spawn(&tracer->workers[i])) tracer->worker_count++; else @@ -256,9 +257,11 @@ trace_worker_should_continue(struct gc_trace_worker *worker) { } int done = (locked == tracer->worker_count) && !trace_worker_can_steal_from_any(worker, tracer); + if (done) + return 0; while (locked > 1) pthread_mutex_unlock(&tracer->workers[--locked].lock); - return !done; + return 1; } // spin LOG("checking for termination: spinning #%zu\n", spin_count); @@ -350,6 +353,9 @@ static inline void gc_tracer_trace(struct gc_tracer *tracer) { DEBUG("starting trace; %zu workers\n", tracer->worker_count); + for (int i = 1; i < tracer->worker_count; i++) + pthread_mutex_unlock(&tracer->workers[i].lock); + ssize_t parallel_threshold = LOCAL_WORKLIST_SIZE - LOCAL_WORKLIST_SHARE_AMOUNT; if (root_worklist_size(&tracer->roots) > 1 || From d5ef140dfe909a3cdd8c69feef26ad5d5f9cd510 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 10 Jul 2024 11:53:39 +0200 Subject: [PATCH 239/403] Add parallel copying collector --- Makefile | 4 + api/pcc-attrs.h | 60 +++ embed.mk | 3 + src/large-object-space.h | 5 + src/pcc.c | 1043 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 1115 insertions(+) create mode 100644 api/pcc-attrs.h create mode 100644 src/pcc.c diff --git a/Makefile b/Makefile index 22df61e4f..0ea60d022 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ TESTS = quads mt-gcbench ephemerons # MT_GCBench MT_GCBench2 COLLECTORS = \ bdw \ semi \ + pcc \ \ whippet \ stack-conservative-whippet \ @@ -60,6 +61,9 @@ GC_LIBS_bdw = `pkg-config --libs bdw-gc` GC_STEM_semi = semi GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 +GC_STEM_pcc = pcc +GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 + define whippet_variant GC_STEM_$(1) = whippet GC_CFLAGS_$(1) = $(2) diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h new file mode 100644 index 000000000..7d589115f --- /dev/null +++ b/api/pcc-attrs.h @@ -0,0 +1,60 @@ +#ifndef PCC_ATTRS_H +#define PCC_ATTRS_H + +#include "gc-config.h" +#include "gc-assert.h" +#include "gc-attrs.h" + +static const uintptr_t GC_ALIGNMENT = 8; +static const size_t GC_LARGE_OBJECT_THRESHOLD = 8192; + +static inline enum gc_allocator_kind gc_allocator_kind(void) { + return GC_ALLOCATOR_INLINE_BUMP_POINTER; +} +static inline 
size_t gc_allocator_small_granule_size(void) { + return GC_ALIGNMENT; +} +static inline size_t gc_allocator_large_threshold(void) { + return GC_LARGE_OBJECT_THRESHOLD; +} + +static inline size_t gc_allocator_allocation_pointer_offset(void) { + return sizeof(uintptr_t) * 0; +} +static inline size_t gc_allocator_allocation_limit_offset(void) { + return sizeof(uintptr_t) * 1; +} + +static inline size_t gc_allocator_freelist_offset(size_t size) { + GC_CRASH(); +} + +static inline size_t gc_allocator_alloc_table_alignment(void) { + return 0; +} +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { + GC_CRASH(); +} +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { + GC_CRASH(); +} + +static inline int gc_allocator_needs_clear(void) { + return 1; +} + +static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { + return GC_WRITE_BARRIER_NONE; +} +static inline size_t gc_write_barrier_card_table_alignment(void) { + GC_CRASH(); +} +static inline size_t gc_write_barrier_card_size(void) { + GC_CRASH(); +} + +static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { + return GC_SAFEPOINT_MECHANISM_COOPERATIVE; +} + +#endif // PCC_ATTRS_H diff --git a/embed.mk b/embed.mk index 6f2c4c6f2..49a7b7347 100644 --- a/embed.mk +++ b/embed.mk @@ -40,6 +40,9 @@ GC_LIBS_bdw = `pkg-config --libs bdw-gc` GC_STEM_semi = semi GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 +GC_STEM_pcc = pcc +GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 + define whippet_variant GC_STEM_$(1) = whippet GC_CFLAGS_$(1) = $(2) diff --git a/src/large-object-space.h b/src/large-object-space.h index 9d8d0d06a..3d07cf8ad 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -58,6 +58,11 @@ static size_t large_object_space_npages(struct large_object_space *space, return (bytes + space->page_size - 1) >> space->page_size_log2; } +static size_t +large_object_space_size_at_last_collection(struct large_object_space *space) { + return space->live_pages_at_last_collection << space->page_size_log2; +} + static void large_object_space_clear_one_remembered(uintptr_t addr, void *unused) { struct gc_ref ref = gc_ref(addr); diff --git a/src/pcc.c b/src/pcc.c new file mode 100644 index 000000000..95bcb67ae --- /dev/null +++ b/src/pcc.c @@ -0,0 +1,1043 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gc-api.h" + +#define GC_IMPL 1 +#include "gc-internal.h" + +#include "debug.h" +#include "gc-align.h" +#include "gc-inline.h" +#include "gc-trace.h" +#include "large-object-space.h" +#include "parallel-tracer.h" +#include "spin.h" +#include "pcc-attrs.h" + +#define SLAB_SIZE (64 * 1024 * 1024) +#define REGION_SIZE (64 * 1024) +#define BLOCK_SIZE (2 * REGION_SIZE) +#define BLOCKS_PER_SLAB (SLAB_SIZE / BLOCK_SIZE) +#define HEADER_BYTES_PER_BLOCK (BLOCK_SIZE / BLOCKS_PER_SLAB) +#define HEADER_BLOCKS_PER_SLAB 1 +#define NONHEADER_BLOCKS_PER_SLAB (BLOCKS_PER_SLAB - HEADER_BLOCKS_PER_SLAB) +#define HEADER_BYTES_PER_SLAB (HEADER_BYTES_PER_BLOCK * HEADER_BLOCKS_PER_SLAB) + +struct pcc_slab; +struct pcc_block; + +struct pcc_slab_header { + union { + struct { + struct pcc_slab *next; + struct pcc_slab *prev; + unsigned incore_block_count; + }; + uint8_t padding[HEADER_BYTES_PER_SLAB]; + }; +}; +STATIC_ASSERT_EQ(sizeof(struct pcc_slab_header), + HEADER_BYTES_PER_SLAB); + +// Really just the block header. 
+struct pcc_block { + union { + struct { + struct pcc_block *next; + uint8_t in_core; + }; + uint8_t padding[HEADER_BYTES_PER_BLOCK]; + }; +}; +STATIC_ASSERT_EQ(sizeof(struct pcc_block), + HEADER_BYTES_PER_BLOCK); + +struct pcc_region { + char data[REGION_SIZE]; +}; + +struct pcc_block_payload { + struct pcc_region regions[2]; +}; + +struct pcc_slab { + struct pcc_slab_header header; + struct pcc_block headers[NONHEADER_BLOCKS_PER_SLAB]; + struct pcc_block_payload blocks[NONHEADER_BLOCKS_PER_SLAB]; +}; +STATIC_ASSERT_EQ(sizeof(struct pcc_slab), SLAB_SIZE); + +static struct pcc_block *block_header(struct pcc_block_payload *payload) { + uintptr_t addr = (uintptr_t) payload; + uintptr_t base = align_down(addr, SLAB_SIZE); + struct pcc_slab *slab = (struct pcc_slab*) base; + uintptr_t block_idx = (addr / BLOCK_SIZE) % BLOCKS_PER_SLAB; + return &slab->headers[block_idx - HEADER_BLOCKS_PER_SLAB]; +} + +static struct pcc_block_payload *block_payload(struct pcc_block *block) { + uintptr_t addr = (uintptr_t) block; + uintptr_t base = align_down(addr, SLAB_SIZE); + struct pcc_slab *slab = (struct pcc_slab*) base; + uintptr_t block_idx = (addr / HEADER_BYTES_PER_BLOCK) % BLOCKS_PER_SLAB; + return &slab->blocks[block_idx - HEADER_BLOCKS_PER_SLAB]; +} + +static uint8_t pcc_object_region(struct gc_ref obj) { + return (gc_ref_value(obj) / REGION_SIZE) & 1; +} + +struct pcc_extent { + uintptr_t low_addr; + uintptr_t high_addr; +}; + +struct pcc_space { + struct pcc_block *available; + struct pcc_block *allocated ALIGNED_TO_AVOID_FALSE_SHARING; + size_t allocated_block_count; + struct pcc_block *paged_out ALIGNED_TO_AVOID_FALSE_SHARING; + size_t fragmentation ALIGNED_TO_AVOID_FALSE_SHARING; + ssize_t bytes_to_page_out ALIGNED_TO_AVOID_FALSE_SHARING; + // The rest of these members are only changed rarely and with the heap + // lock. + uint8_t active_region ALIGNED_TO_AVOID_FALSE_SHARING; + size_t live_bytes_at_last_gc; + size_t fragmentation_at_last_gc; + struct pcc_extent *extents; + size_t nextents; + struct pcc_slab *slabs; + size_t nslabs; +}; + +struct gc_heap { + struct pcc_space pcc_space; + struct large_object_space large_object_space; + struct gc_extern_space *extern_space; + size_t large_object_pages; + pthread_mutex_t lock; + pthread_cond_t collector_cond; + pthread_cond_t mutator_cond; + size_t size; + int collecting; + int check_pending_ephemerons; + struct gc_pending_ephemerons *pending_ephemerons; + size_t mutator_count; + size_t paused_mutator_count; + size_t inactive_mutator_count; + struct gc_heap_roots *roots; + struct gc_mutator *mutators; + long count; + struct gc_tracer tracer; + double pending_ephemerons_size_factor; + double pending_ephemerons_size_slop; + struct gc_event_listener event_listener; + void *event_listener_data; +}; + +#define HEAP_EVENT(heap, event, ...) \ + (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__) +#define MUTATOR_EVENT(mut, event, ...) 
\ + (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) + +struct gc_mutator { + uintptr_t hp; + uintptr_t limit; + struct pcc_block *block; + struct gc_heap *heap; + struct gc_mutator_roots *roots; + void *event_listener_data; + struct gc_mutator *next; + struct gc_mutator *prev; +}; + +struct gc_trace_worker_data { + uintptr_t hp; + uintptr_t limit; + struct pcc_block *block; +}; + +static inline struct pcc_space* heap_pcc_space(struct gc_heap *heap) { + return &heap->pcc_space; +} +static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { + return &heap->large_object_space; +} +static inline struct gc_extern_space* heap_extern_space(struct gc_heap *heap) { + return heap->extern_space; +} +static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) { + return mutator->heap; +} + +static inline void pcc_space_compute_region(struct pcc_space *space, + struct pcc_block *block, + uintptr_t *hp, uintptr_t *limit) { + struct pcc_block_payload *payload = block_payload(block); + struct pcc_region *region = &payload->regions[space->active_region]; + *hp = (uintptr_t)&region[0]; + *limit = (uintptr_t)&region[1]; +} + +static void push_block(struct pcc_block **list, + struct pcc_block *block) { + struct pcc_block *next = atomic_load_explicit(list, memory_order_acquire); + do { + block->next = next; + } while (!atomic_compare_exchange_weak(list, &next, block)); +} + +static struct pcc_block* pop_block(struct pcc_block **list) { + struct pcc_block *head = atomic_load_explicit(list, memory_order_acquire); + struct pcc_block *next; + do { + if (!head) + return NULL; + } while (!atomic_compare_exchange_weak(list, &head, head->next)); + head->next = NULL; + return head; +} + +static struct pcc_block* pop_available_block(struct pcc_space *space) { + return pop_block(&space->available); +} +static void push_available_block(struct pcc_space *space, + struct pcc_block *block) { + push_block(&space->available, block); +} + +static struct pcc_block* pop_allocated_block(struct pcc_space *space) { + return pop_block(&space->allocated); +} +static void push_allocated_block(struct pcc_space *space, + struct pcc_block *block) { + push_block(&space->allocated, block); + atomic_fetch_add_explicit(&space->allocated_block_count, 1, + memory_order_relaxed); +} + +static struct pcc_block* pop_paged_out_block(struct pcc_space *space) { + return pop_block(&space->paged_out); +} +static void push_paged_out_block(struct pcc_space *space, + struct pcc_block *block) { + push_block(&space->paged_out, block); +} + +static void page_out_block(struct pcc_space *space, + struct pcc_block *block) { + block->in_core = 0; + madvise(block_payload(block), BLOCK_SIZE, MADV_DONTNEED); + push_paged_out_block(space, block); +} + +static struct pcc_block* page_in_block(struct pcc_space *space) { + struct pcc_block* block = pop_paged_out_block(space); + if (block) block->in_core = 1; + return block; +} + +static void record_fragmentation(struct pcc_space *space, + size_t bytes) { + atomic_fetch_add_explicit(&space->fragmentation, bytes, + memory_order_relaxed); +} + +static ssize_t pcc_space_request_release_memory(struct pcc_space *space, + size_t bytes) { + return atomic_fetch_add(&space->bytes_to_page_out, bytes) + bytes; +} + +static int +pcc_space_page_out_blocks_until_memory_released(struct pcc_space *space) { + ssize_t pending = atomic_load(&space->bytes_to_page_out); + while (pending > 0) { + struct pcc_block *block = pop_available_block(space); + if (!block) return 0; + 
page_out_block(space, block); + pending = + atomic_fetch_sub(&space->bytes_to_page_out, BLOCK_SIZE) - BLOCK_SIZE; + } + return 1; +} + +static void pcc_space_reacquire_memory(struct pcc_space *space, + size_t bytes) { + ssize_t pending = + atomic_fetch_sub(&space->bytes_to_page_out, bytes) - bytes; + while (pending + BLOCK_SIZE <= 0) { + struct pcc_block *block = page_in_block(space); + GC_ASSERT(block); + push_available_block(space, block); + pending = + atomic_fetch_add(&space->bytes_to_page_out, BLOCK_SIZE) + BLOCK_SIZE; + } +} + +static void +gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, + struct gc_heap *heap, + struct gc_trace_worker *worker, + struct gc_trace_worker_data *data), + struct gc_tracer *tracer, + struct gc_heap *heap, + struct gc_trace_worker *worker) { + struct gc_trace_worker_data data = {0,0,NULL}; + f(tracer, heap, worker, &data); + if (data.block) { + record_fragmentation(heap_pcc_space(heap), data.limit - data.hp); + push_allocated_block(heap_pcc_space(heap), data.block); + } + // FIXME: Add (data.limit - data.hp) to fragmentation. +} + +static void clear_mutator_allocation_buffers(struct gc_heap *heap) { + for (struct gc_mutator *mut = heap->mutators; mut; mut = mut->next) { + if (mut->block) { + record_fragmentation(heap_pcc_space(heap), mut->limit - mut->hp); + push_allocated_block(heap_pcc_space(heap), mut->block); + mut->block = NULL; + } + mut->hp = mut->limit = 0; + } +} + +static void pcc_space_flip(struct pcc_space *space) { + // Mutators stopped, can access nonatomically. + struct pcc_block *available = space->available; + struct pcc_block *allocated = space->allocated; + if (available) { + struct pcc_block *tail = available; + while (tail->next) + tail = tail->next; + tail->next = allocated; + allocated = available; + } + space->available = allocated; + space->allocated = NULL; + space->allocated_block_count = 0; + space->fragmentation = 0; + space->active_region ^= 1; +} + +static void pcc_space_finish_gc(struct pcc_space *space) { + // Mutators stopped, can access nonatomically. + space->live_bytes_at_last_gc = space->allocated_block_count * REGION_SIZE; + space->fragmentation_at_last_gc = space->fragmentation; +} + +static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; + +static struct gc_ref evacuation_allocate(struct pcc_space *space, + struct gc_trace_worker_data *data, + size_t size) { + GC_ASSERT(size > 0); + GC_ASSERT(size <= gc_allocator_large_threshold()); + size = align_up(size, GC_ALIGNMENT); + + uintptr_t hp = data->hp; + uintptr_t limit = data->limit; + uintptr_t new_hp = hp + size; + struct gc_ref ret; + if (new_hp <= limit) { + data->hp = new_hp; + return gc_ref(hp); + } + + if (data->block) { + record_fragmentation(space, limit - hp); + push_allocated_block(space, data->block); + } + data->block = pop_available_block(space); + if (!data->block) { + // Can happen if space is really tight and reordering of objects + // during evacuation resulted in more end-of-block fragmentation and + // thus block use than before collection started. A dire situation. + fprintf(stderr, "Out of memory\n"); + GC_CRASH(); + } + pcc_space_compute_region(space, data->block, &hp, &data->limit); + // The region is empty and is therefore large enough for a small + // allocation. 
+ data->hp = hp + size; + return gc_ref(hp); +} + +static inline int pcc_space_forward(struct pcc_space *space, + struct gc_edge edge, + struct gc_ref old_ref, + struct gc_trace_worker_data *data) { + GC_ASSERT(pcc_object_region(old_ref) != space->active_region); + struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); + + if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) + gc_atomic_forward_acquire(&fwd); + + switch (fwd.state) { + case GC_FORWARDING_STATE_NOT_FORWARDED: + case GC_FORWARDING_STATE_ABORTED: + default: + // Impossible. + GC_CRASH(); + case GC_FORWARDING_STATE_ACQUIRED: { + // We claimed the object successfully; evacuating is up to us. + size_t bytes = gc_atomic_forward_object_size(&fwd); + struct gc_ref new_ref = evacuation_allocate(space, data, bytes); + // Copy object contents before committing, as we don't know what + // part of the object (if any) will be overwritten by the + // commit. + memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), bytes); + gc_atomic_forward_commit(&fwd, new_ref); + gc_edge_update(edge, new_ref); + return 1; + } + case GC_FORWARDING_STATE_BUSY: + // Someone else claimed this object first. Spin until new address + // known, or evacuation aborts. + for (size_t spin_count = 0;; spin_count++) { + if (gc_atomic_forward_retry_busy(&fwd)) + break; + yield_for_spin(spin_count); + } + GC_ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); + // Fall through. + case GC_FORWARDING_STATE_FORWARDED: + // The object has been evacuated already. Update the edge; + // whoever forwarded the object will make sure it's eventually + // traced. + gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); + return 0; + } +} + +static inline int pcc_space_contains(struct pcc_space *space, + struct gc_ref ref) { + + for (size_t i = 0; i < space->nextents; i++) + if (space->extents[i].low_addr <= gc_ref_value(ref) && + gc_ref_value(ref) < space->extents[i].high_addr) + return 1; + return 0; +} + +static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, + struct gc_ref ref, + struct gc_trace_worker_data *data) { + if (!gc_ref_is_heap_object(ref)) + return 0; + if (GC_LIKELY(pcc_space_contains(heap_pcc_space(heap), ref))) + return pcc_space_forward(heap_pcc_space(heap), edge, ref, data); + else if (large_object_space_contains(heap_large_object_space(heap), ref)) + return large_object_space_mark_object(heap_large_object_space(heap), ref); + else + return gc_extern_space_visit(heap_extern_space(heap), edge, ref); +} + +static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge, + struct gc_trace_worker *worker) { + struct gc_ref ref = gc_edge_ref(edge); + struct gc_trace_worker_data *data = gc_trace_worker_data(worker); + int is_new = do_trace(heap, edge, ref, data); + + if (is_new && + GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, + memory_order_relaxed))) + gc_resolve_pending_ephemerons(ref, heap); + + return is_new; +} + +int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { + struct gc_ref ref = gc_edge_ref(edge); + if (!gc_ref_is_heap_object(ref)) + return 0; + if (GC_LIKELY(pcc_space_contains(heap_pcc_space(heap), ref))) { + struct gc_atomic_forward fwd = gc_atomic_forward_begin(ref); + switch (fwd.state) { + case GC_FORWARDING_STATE_NOT_FORWARDED: + return 0; + case GC_FORWARDING_STATE_BUSY: + // Someone else claimed this object first. Spin until new address + // known. 
+ for (size_t spin_count = 0;; spin_count++) { + if (gc_atomic_forward_retry_busy(&fwd)) + break; + yield_for_spin(spin_count); + } + GC_ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); + // Fall through. + case GC_FORWARDING_STATE_FORWARDED: + gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); + return 1; + default: + GC_CRASH(); + } + } else if (large_object_space_contains(heap_large_object_space(heap), ref)) { + return large_object_space_is_copied(heap_large_object_space(heap), ref); + } + GC_CRASH(); +} + +static int mutators_are_stopping(struct gc_heap *heap) { + return atomic_load_explicit(&heap->collecting, memory_order_relaxed); +} + +static inline void heap_lock(struct gc_heap *heap) { + pthread_mutex_lock(&heap->lock); +} +static inline void heap_unlock(struct gc_heap *heap) { + pthread_mutex_unlock(&heap->lock); +} + +// with heap lock +static inline int all_mutators_stopped(struct gc_heap *heap) { + return heap->mutator_count == + heap->paused_mutator_count + heap->inactive_mutator_count; +} + +static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + mut->heap = heap; + mut->event_listener_data = + heap->event_listener.mutator_added(heap->event_listener_data); + heap_lock(heap); + // We have no roots. If there is a GC currently in progress, we have + // nothing to add. Just wait until it's done. + while (mutators_are_stopping(heap)) + pthread_cond_wait(&heap->mutator_cond, &heap->lock); + mut->next = mut->prev = NULL; + struct gc_mutator *tail = heap->mutators; + if (tail) { + mut->next = tail; + tail->prev = mut; + } + heap->mutators = mut; + heap->mutator_count++; + heap_unlock(heap); +} + +static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + MUTATOR_EVENT(mut, mutator_removed); + mut->heap = NULL; + heap_lock(heap); + heap->mutator_count--; + // We have no roots. If there is a GC stop currently in progress, + // maybe tell the controller it can continue. 
+ if (mutators_are_stopping(heap) && all_mutators_stopped(heap)) + pthread_cond_signal(&heap->collector_cond); + if (mut->next) + mut->next->prev = mut->prev; + if (mut->prev) + mut->prev->next = mut->next; + else + heap->mutators = mut->next; + heap_unlock(heap); +} + +static void request_mutators_to_stop(struct gc_heap *heap) { + GC_ASSERT(!mutators_are_stopping(heap)); + atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); +} + +static void allow_mutators_to_continue(struct gc_heap *heap) { + GC_ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(all_mutators_stopped(heap)); + heap->paused_mutator_count = 0; + atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); + GC_ASSERT(!mutators_are_stopping(heap)); + pthread_cond_broadcast(&heap->mutator_cond); +} + +static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { + size_t previous = heap->large_object_pages; + heap->large_object_pages = npages; + GC_ASSERT(npages <= previous); + size_t bytes = (previous - npages) << + heap_large_object_space(heap)->page_size_log2; + pcc_space_reacquire_memory(heap_pcc_space(heap), bytes); +} + +void gc_mutator_set_roots(struct gc_mutator *mut, + struct gc_mutator_roots *roots) { + mut->roots = roots; +} +void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { + heap->roots = roots; +} +void gc_heap_set_extern_space(struct gc_heap *heap, + struct gc_extern_space *space) { + heap->extern_space = space; +} + +static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; +static inline void +tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { + struct gc_trace_worker *worker = trace_data; + if (trace_edge(heap, edge, worker)) + gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); +} + +static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, + struct gc_trace_worker *worker) { +#ifdef DEBUG + if (pcc_space_contains(heap_pcc_space(heap), ref)) + GC_ASSERT(pcc_object_region(ref) == heap_pcc_space(heap)->active_region); +#endif + gc_trace_object(ref, tracer_visit, heap, worker, NULL); +} + +static inline void trace_root(struct gc_root root, struct gc_heap *heap, + struct gc_trace_worker *worker) { + switch (root.kind) { + case GC_ROOT_KIND_HEAP: + gc_trace_heap_roots(root.heap->roots, tracer_visit, heap, worker); + break; + case GC_ROOT_KIND_MUTATOR: + gc_trace_mutator_roots(root.mutator->roots, tracer_visit, heap, worker); + break; + case GC_ROOT_KIND_RESOLVED_EPHEMERONS: + gc_trace_resolved_ephemerons(root.resolved_ephemerons, tracer_visit, + heap, worker); + break; + default: + GC_CRASH(); + } +} + +static void wait_for_mutators_to_stop(struct gc_heap *heap) { + heap->paused_mutator_count++; + while (!all_mutators_stopped(heap)) + pthread_cond_wait(&heap->collector_cond, &heap->lock); +} + +void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) { +} + +static void +pause_mutator_for_collection(struct gc_heap *heap, + struct gc_mutator *mut) GC_NEVER_INLINE; +static void +pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { + GC_ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(!all_mutators_stopped(heap)); + MUTATOR_EVENT(mut, mutator_stopped); + heap->paused_mutator_count++; + if (all_mutators_stopped(heap)) + pthread_cond_signal(&heap->collector_cond); + + do { + pthread_cond_wait(&heap->mutator_cond, &heap->lock); + } while (mutators_are_stopping(heap)); + + 
MUTATOR_EVENT(mut, mutator_restarted); +} + +static void +pause_mutator_for_collection_with_lock(struct gc_mutator *mut) GC_NEVER_INLINE; +static void +pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + GC_ASSERT(mutators_are_stopping(heap)); + MUTATOR_EVENT(mut, mutator_stopping); + pause_mutator_for_collection(heap, mut); +} + +static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) GC_NEVER_INLINE; +static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + GC_ASSERT(mutators_are_stopping(heap)); + MUTATOR_EVENT(mut, mutator_stopping); + heap_lock(heap); + pause_mutator_for_collection(heap, mut); + heap_unlock(heap); +} + +static inline void maybe_pause_mutator_for_collection(struct gc_mutator *mut) { + while (mutators_are_stopping(mutator_heap(mut))) + pause_mutator_for_collection_without_lock(mut); +} + +static int maybe_grow_heap(struct gc_heap *heap) { + return 0; +} + +static void add_roots(struct gc_heap *heap) { + for (struct gc_mutator *mut = heap->mutators; mut; mut = mut->next) + gc_tracer_add_root(&heap->tracer, gc_root_mutator(mut)); + gc_tracer_add_root(&heap->tracer, gc_root_heap(heap)); +} + +static void resolve_ephemerons_lazily(struct gc_heap *heap) { + atomic_store_explicit(&heap->check_pending_ephemerons, 0, + memory_order_release); +} + +static void resolve_ephemerons_eagerly(struct gc_heap *heap) { + atomic_store_explicit(&heap->check_pending_ephemerons, 1, + memory_order_release); + gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); +} + +static int enqueue_resolved_ephemerons(struct gc_heap *heap) { + struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); + if (!resolved) + return 0; + gc_tracer_add_root(&heap->tracer, gc_root_resolved_ephemerons(resolved)); + return 1; +} + +static void sweep_ephemerons(struct gc_heap *heap) { + return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); +} + +static void collect(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + struct pcc_space *cspace = heap_pcc_space(heap); + struct large_object_space *lospace = heap_large_object_space(heap); + struct gc_extern_space *exspace = heap_extern_space(heap); + MUTATOR_EVENT(mut, mutator_cause_gc); + DEBUG("start collect #%ld:\n", heap->count); + HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); + large_object_space_start_gc(lospace, 0); + gc_extern_space_start_gc(exspace, 0); + resolve_ephemerons_lazily(heap); + HEAP_EVENT(heap, requesting_stop); + request_mutators_to_stop(heap); + HEAP_EVENT(heap, waiting_for_stop); + wait_for_mutators_to_stop(heap); + HEAP_EVENT(heap, mutators_stopped); + clear_mutator_allocation_buffers(heap); + pcc_space_flip(cspace); + gc_tracer_prepare(&heap->tracer); + add_roots(heap); + HEAP_EVENT(heap, roots_traced); + gc_tracer_trace(&heap->tracer); + HEAP_EVENT(heap, heap_traced); + resolve_ephemerons_eagerly(heap); + while (enqueue_resolved_ephemerons(heap)) + gc_tracer_trace(&heap->tracer); + HEAP_EVENT(heap, ephemerons_traced); + sweep_ephemerons(heap); + gc_tracer_release(&heap->tracer); + pcc_space_finish_gc(cspace); + large_object_space_finish_gc(lospace, 0); + gc_extern_space_finish_gc(exspace, 0); + heap->count++; + heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); + size_t live_size = (cspace->live_bytes_at_last_gc + + large_object_space_size_at_last_collection(lospace)); + HEAP_EVENT(heap, live_data_size, 
live_size); + maybe_grow_heap(heap); + if (!pcc_space_page_out_blocks_until_memory_released(cspace)) { + fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", + heap->size, cspace->nslabs); + GC_CRASH(); + } + HEAP_EVENT(heap, restarting_mutators); + allow_mutators_to_continue(heap); +} + +static void trigger_collection(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + heap_lock(heap); + long epoch = heap->count; + while (mutators_are_stopping(heap)) + pause_mutator_for_collection_with_lock(mut); + if (epoch == heap->count) + collect(mut); + heap_unlock(heap); +} + +void gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { + trigger_collection(mut); +} + +static void* allocate_large(struct gc_mutator *mut, size_t size) { + struct gc_heap *heap = mutator_heap(mut); + struct large_object_space *space = heap_large_object_space(heap); + + size_t npages = large_object_space_npages(space, size); + + pcc_space_request_release_memory(heap_pcc_space(heap), + npages << space->page_size_log2); + while (!pcc_space_page_out_blocks_until_memory_released(heap_pcc_space(heap))) + trigger_collection(mut); + atomic_fetch_add(&heap->large_object_pages, npages); + + void *ret = large_object_space_alloc(space, npages); + if (!ret) + ret = large_object_space_obtain_and_alloc(space, npages); + + if (!ret) { + perror("weird: we have the space but mmap didn't work"); + GC_CRASH(); + } + + return ret; +} + +void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { + GC_ASSERT(size > 0); // allocating 0 bytes would be silly + + if (size > gc_allocator_large_threshold()) + return allocate_large(mut, size); + + size = align_up(size, GC_ALIGNMENT); + uintptr_t hp = mut->hp; + uintptr_t limit = mut->limit; + uintptr_t new_hp = hp + size; + struct gc_ref ret; + if (new_hp <= limit) { + mut->hp = new_hp; + gc_clear_fresh_allocation(gc_ref(hp), size); + return gc_ref_heap_object(gc_ref(hp)); + } + + struct pcc_space *space = heap_pcc_space(mutator_heap(mut)); + if (mut->block) { + record_fragmentation(space, limit - hp); + push_allocated_block(space, mut->block); + } + mut->block = pop_available_block(space); + while (!mut->block) { + trigger_collection(mut); + mut->block = pop_available_block(space); + } + pcc_space_compute_region(space, mut->block, &hp, &mut->limit); + // The region is empty and is therefore large enough for a small + // allocation. 
+ mut->hp = hp + size; + gc_clear_fresh_allocation(gc_ref(hp), size); + return gc_ref_heap_object(gc_ref(hp)); +} + +void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { + return gc_allocate(mut, size); +} + +struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { + return gc_allocate(mut, gc_ephemeron_size()); +} + +void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, + struct gc_ref key, struct gc_ref value) { + gc_ephemeron_init_internal(mutator_heap(mut), ephemeron, key, value); +} + +struct gc_pending_ephemerons *gc_heap_pending_ephemerons(struct gc_heap *heap) { + return heap->pending_ephemerons; +} + +unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { + return heap->count; +} + +static struct pcc_slab* allocate_slabs(size_t nslabs) { + size_t size = nslabs * SLAB_SIZE; + size_t extent = size + SLAB_SIZE; + + char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("mmap failed"); + return NULL; + } + + uintptr_t base = (uintptr_t) mem; + uintptr_t end = base + extent; + uintptr_t aligned_base = align_up(base, SLAB_SIZE); + uintptr_t aligned_end = aligned_base + size; + + if (aligned_base - base) + munmap((void*)base, aligned_base - base); + if (end - aligned_end) + munmap((void*)aligned_end, end - aligned_end); + + return (struct pcc_slab*) aligned_base; +} + +static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { + struct gc_pending_ephemerons *cur = heap->pending_ephemerons; + size_t target = heap->size * heap->pending_ephemerons_size_factor; + double slop = heap->pending_ephemerons_size_slop; + + heap->pending_ephemerons = gc_prepare_pending_ephemerons(cur, target, slop); + + return !!heap->pending_ephemerons; +} + +struct gc_options { + struct gc_common_options common; +}; +int gc_option_from_string(const char *str) { + return gc_common_option_from_string(str); +} +struct gc_options* gc_allocate_options(void) { + struct gc_options *ret = malloc(sizeof(struct gc_options)); + gc_init_common_options(&ret->common); + return ret; +} +int gc_options_set_int(struct gc_options *options, int option, int value) { + return gc_common_options_set_int(&options->common, option, value); +} +int gc_options_set_size(struct gc_options *options, int option, + size_t value) { + return gc_common_options_set_size(&options->common, option, value); +} +int gc_options_set_double(struct gc_options *options, int option, + double value) { + return gc_common_options_set_double(&options->common, option, value); +} +int gc_options_parse_and_set(struct gc_options *options, int option, + const char *value) { + return gc_common_options_parse_and_set(&options->common, option, value); +} + +static int heap_init(struct gc_heap *heap, const struct gc_options *options) { + // *heap is already initialized to 0. 
+ + pthread_mutex_init(&heap->lock, NULL); + pthread_cond_init(&heap->mutator_cond, NULL); + pthread_cond_init(&heap->collector_cond, NULL); + heap->size = options->common.heap_size; + + if (!gc_tracer_init(&heap->tracer, heap, options->common.parallelism)) + GC_CRASH(); + + heap->pending_ephemerons_size_factor = 0.005; + heap->pending_ephemerons_size_slop = 0.5; + + if (!heap_prepare_pending_ephemerons(heap)) + GC_CRASH(); + + return 1; +} + +static int pcc_space_init(struct pcc_space *space, struct gc_heap *heap) { + size_t size = align_up(heap->size, SLAB_SIZE); + size_t nslabs = size / SLAB_SIZE; + struct pcc_slab *slabs = allocate_slabs(nslabs); + if (!slabs) + return 0; + + space->available = NULL; + space->allocated = NULL; + space->allocated_block_count = 0; + space->paged_out = NULL; + space->fragmentation = 0; + space->bytes_to_page_out = 0; + space->active_region = 0; + space->live_bytes_at_last_gc = 0; + space->fragmentation_at_last_gc = 0; + space->extents = calloc(1, sizeof(struct pcc_extent)); + space->extents[0].low_addr = (uintptr_t) slabs; + space->extents[0].high_addr = space->extents[0].low_addr + size; + space->nextents = 1; + space->slabs = slabs; + space->nslabs = nslabs; + for (size_t slab = 0; slab < nslabs; slab++) { + for (size_t idx = 0; idx < NONHEADER_BLOCKS_PER_SLAB; idx++) { + struct pcc_block *block = &slabs[slab].headers[idx]; + if (size > heap->size) { + block->in_core = 0; + push_paged_out_block(space, block); + size -= BLOCK_SIZE; + } else { + block->in_core = 1; + push_available_block(space, block); + } + } + } + return 1; +} + +int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, + struct gc_heap **heap, struct gc_mutator **mut, + struct gc_event_listener event_listener, + void *event_listener_data) { + GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_ALIGNMENT); + GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_LARGE_OBJECT_THRESHOLD); + GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), + offsetof(struct gc_mutator, hp)); + GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), + offsetof(struct gc_mutator, limit)); + + if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { + fprintf(stderr, "fixed heap size is currently required\n"); + return 0; + } + + *heap = calloc(1, sizeof(struct gc_heap)); + if (!*heap) GC_CRASH(); + + if (!heap_init(*heap, options)) + GC_CRASH(); + + (*heap)->event_listener = event_listener; + (*heap)->event_listener_data = event_listener_data; + HEAP_EVENT(*heap, init, (*heap)->size); + + struct pcc_space *space = heap_pcc_space(*heap); + if (!pcc_space_init(space, *heap)) { + free(*heap); + *heap = NULL; + return 0; + } + + if (!large_object_space_init(heap_large_object_space(*heap), *heap)) + GC_CRASH(); + + *mut = calloc(1, sizeof(struct gc_mutator)); + if (!*mut) GC_CRASH(); + add_mutator(*heap, *mut); + return 1; +} + +struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, + struct gc_heap *heap) { + struct gc_mutator *ret = calloc(1, sizeof(struct gc_mutator)); + if (!ret) + GC_CRASH(); + add_mutator(heap, ret); + return ret; +} + +void gc_finish_for_thread(struct gc_mutator *mut) { + remove_mutator(mutator_heap(mut), mut); + free(mut); +} + +static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + GC_ASSERT(mut->next == NULL); + heap_lock(heap); + heap->inactive_mutator_count++; + if (all_mutators_stopped(heap)) + pthread_cond_signal(&heap->collector_cond); + heap_unlock(heap); +} + +static void reactivate_mutator(struct gc_heap *heap, 
struct gc_mutator *mut) { + heap_lock(heap); + while (mutators_are_stopping(heap)) + pthread_cond_wait(&heap->mutator_cond, &heap->lock); + heap->inactive_mutator_count--; + heap_unlock(heap); +} + +void* gc_call_without_gc(struct gc_mutator *mut, + void* (*f)(void*), + void *data) { + struct gc_heap *heap = mutator_heap(mut); + deactivate_mutator(heap, mut); + void *ret = f(data); + reactivate_mutator(heap, mut); + return ret; +} From c556dedb56c0a4b0a37a73aec33372940ad725c8 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 11 Jul 2024 21:07:26 +0200 Subject: [PATCH 240/403] Add pcc collector to docs --- doc/manual.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/manual.md b/doc/manual.md index ab0656db0..73aa537eb 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -182,8 +182,9 @@ prepared! ## Configuration, compilation, and linking To the user, Whippet presents an abstract API that does not encode the -specificities of any given collector. Whippet currently includes three +specificities of any given collector. Whippet currently includes four implementations of that API: `semi`, a simple semi-space collector; +`pcc`, a parallel copying collector (like semi but multithreaded); `bdw`, an implementation via the third-party [Boehm-Demers-Weiser](https://github.com/ivmai/bdwgc) conservative collector; and `whippet`, an Immix-like collector. From 64c7d73fa23900346114120ab2193e92ccdce75c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 11 Jul 2024 22:15:59 +0200 Subject: [PATCH 241/403] Add partially allocated block list. Stopping a mutator or evacuator adds to this list. --- src/pcc.c | 110 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 32 deletions(-) diff --git a/src/pcc.c b/src/pcc.c index 95bcb67ae..d56c9b0e0 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -52,6 +52,7 @@ struct pcc_block { struct { struct pcc_block *next; uint8_t in_core; + size_t allocated; // For partially-allocated blocks. }; uint8_t padding[HEADER_BYTES_PER_BLOCK]; }; @@ -101,6 +102,7 @@ struct pcc_extent { struct pcc_space { struct pcc_block *available; + struct pcc_block *partially_allocated; struct pcc_block *allocated ALIGNED_TO_AVOID_FALSE_SHARING; size_t allocated_block_count; struct pcc_block *paged_out ALIGNED_TO_AVOID_FALSE_SHARING; @@ -223,6 +225,17 @@ static void push_allocated_block(struct pcc_space *space, memory_order_relaxed); } +static struct pcc_block* pop_partially_allocated_block(struct pcc_space *space) { + return pop_block(&space->partially_allocated); +} +static void push_partially_allocated_block(struct pcc_space *space, + struct pcc_block *block, + uintptr_t hp) { + block->allocated = hp & (REGION_SIZE - 1); + GC_ASSERT(block->allocated); + push_block(&space->partially_allocated, block); +} + static struct pcc_block* pop_paged_out_block(struct pcc_space *space) { return pop_block(&space->paged_out); } @@ -291,10 +304,9 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_trace_worker *worker) { struct gc_trace_worker_data data = {0,0,NULL}; f(tracer, heap, worker, &data); - if (data.block) { - record_fragmentation(heap_pcc_space(heap), data.limit - data.hp); - push_allocated_block(heap_pcc_space(heap), data.block); - } + if (data.block) + push_partially_allocated_block(heap_pcc_space(heap), data.block, + data.hp); // FIXME: Add (data.limit - data.hp) to fragmentation. 
} @@ -309,18 +321,26 @@ static void clear_mutator_allocation_buffers(struct gc_heap *heap) { } } +static struct pcc_block* +append_block_lists(struct pcc_block *head, struct pcc_block *tail) { + if (!head) return tail; + if (tail) { + struct pcc_block *walk = head; + while (walk->next) + walk = walk->next; + walk->next = tail; + } + return head; +} + static void pcc_space_flip(struct pcc_space *space) { // Mutators stopped, can access nonatomically. struct pcc_block *available = space->available; + struct pcc_block *partially_allocated = space->partially_allocated; struct pcc_block *allocated = space->allocated; - if (available) { - struct pcc_block *tail = available; - while (tail->next) - tail = tail->next; - tail->next = allocated; - allocated = available; - } - space->available = allocated; + allocated = append_block_lists(partially_allocated, allocated); + space->available = append_block_lists(available, allocated); + space->partially_allocated = NULL; space->allocated = NULL; space->allocated_block_count = 0; space->fragmentation = 0; @@ -345,16 +365,24 @@ static struct gc_ref evacuation_allocate(struct pcc_space *space, uintptr_t hp = data->hp; uintptr_t limit = data->limit; uintptr_t new_hp = hp + size; - struct gc_ref ret; - if (new_hp <= limit) { - data->hp = new_hp; - return gc_ref(hp); - } + if (new_hp <= limit) + goto done; if (data->block) { record_fragmentation(space, limit - hp); push_allocated_block(space, data->block); } + while ((data->block = pop_partially_allocated_block(space))) { + pcc_space_compute_region(space, data->block, &hp, &limit); + hp += data->block->allocated; + new_hp = hp + size; + if (new_hp <= limit) { + data->limit = limit; + goto done; + } + record_fragmentation(space, limit - hp); + push_allocated_block(space, data->block); + } data->block = pop_available_block(space); if (!data->block) { // Can happen if space is really tight and reordering of objects @@ -364,9 +392,12 @@ static struct gc_ref evacuation_allocate(struct pcc_space *space, GC_CRASH(); } pcc_space_compute_region(space, data->block, &hp, &data->limit); + new_hp = hp + size; // The region is empty and is therefore large enough for a small // allocation. - data->hp = hp + size; + +done: + data->hp = new_hp; return gc_ref(hp); } @@ -507,6 +538,7 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->event_listener_data = heap->event_listener.mutator_added(heap->event_listener_data); heap_lock(heap); + mut->block = NULL; // We have no roots. If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. while (mutators_are_stopping(heap)) @@ -525,18 +557,23 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; + if (mut->block) { + push_partially_allocated_block(heap_pcc_space(heap), mut->block, + mut->hp); + mut->block = NULL; + } heap_lock(heap); heap->mutator_count--; - // We have no roots. If there is a GC stop currently in progress, - // maybe tell the controller it can continue. - if (mutators_are_stopping(heap) && all_mutators_stopped(heap)) - pthread_cond_signal(&heap->collector_cond); if (mut->next) mut->next->prev = mut->prev; if (mut->prev) mut->prev->next = mut->next; else heap->mutators = mut->next; + // We have no roots. If there is a GC stop currently in progress, + // maybe tell the controller it can continue. 
+ if (mutators_are_stopping(heap) && all_mutators_stopped(heap)) + pthread_cond_signal(&heap->collector_cond); heap_unlock(heap); } @@ -794,27 +831,35 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { uintptr_t hp = mut->hp; uintptr_t limit = mut->limit; uintptr_t new_hp = hp + size; - struct gc_ref ret; - if (new_hp <= limit) { - mut->hp = new_hp; - gc_clear_fresh_allocation(gc_ref(hp), size); - return gc_ref_heap_object(gc_ref(hp)); - } + if (new_hp <= limit) + goto done; struct pcc_space *space = heap_pcc_space(mutator_heap(mut)); if (mut->block) { record_fragmentation(space, limit - hp); push_allocated_block(space, mut->block); } - mut->block = pop_available_block(space); - while (!mut->block) { + while ((mut->block = pop_partially_allocated_block(space))) { + pcc_space_compute_region(space, mut->block, &hp, &limit); + hp += mut->block->allocated; + new_hp = hp + size; + if (new_hp <= limit) { + mut->limit = limit; + goto done; + } + record_fragmentation(space, limit - hp); + push_allocated_block(space, mut->block); + } + while (!(mut->block = pop_available_block(space))) { trigger_collection(mut); - mut->block = pop_available_block(space); } pcc_space_compute_region(space, mut->block, &hp, &mut->limit); + new_hp = hp + size; // The region is empty and is therefore large enough for a small // allocation. - mut->hp = hp + size; + +done: + mut->hp = new_hp; gc_clear_fresh_allocation(gc_ref(hp), size); return gc_ref_heap_object(gc_ref(hp)); } @@ -929,6 +974,7 @@ static int pcc_space_init(struct pcc_space *space, struct gc_heap *heap) { return 0; space->available = NULL; + space->partially_allocated = NULL; space->allocated = NULL; space->allocated_block_count = 0; space->paged_out = NULL; From 9167dbb5f6b80556b9c0b16ab01bd4474bbf6c1c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 12 Jul 2024 09:06:26 +0200 Subject: [PATCH 242/403] Fix returning partially allocated blocks that are actually full --- src/pcc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/pcc.c b/src/pcc.c index d56c9b0e0..c71f2d04e 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -231,9 +231,15 @@ static struct pcc_block* pop_partially_allocated_block(struct pcc_space *space) static void push_partially_allocated_block(struct pcc_space *space, struct pcc_block *block, uintptr_t hp) { - block->allocated = hp & (REGION_SIZE - 1); - GC_ASSERT(block->allocated); - push_block(&space->partially_allocated, block); + size_t allocated = hp & (REGION_SIZE - 1); + if (allocated) { + block->allocated = allocated; + push_block(&space->partially_allocated, block); + } else { + // Could be hp was bumped all the way to the limit, in which case + // allocated wraps to 0; in any case the block is full. 
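+    // (Concretely: a region bumped all the way to its end has
+    // hp % REGION_SIZE == 0, the same offset an empty region would
+    // report, so the full case has to be handled here.)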
+ push_allocated_block(space, block); + } } static struct pcc_block* pop_paged_out_block(struct pcc_space *space) { From f6057184e1490e6ba6ad983f363350db4dc2de4b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 23 Jul 2024 22:32:57 +0200 Subject: [PATCH 243/403] Add finalizers --- Makefile | 4 +- README.md | 2 +- api/gc-basic-stats.h | 2 + api/gc-embedder-api.h | 1 - api/gc-event-listener-chain.h | 6 + api/gc-event-listener.h | 1 + api/gc-finalizer.h | 81 +++++++++ api/gc-null-event-listener.h | 2 + benchmarks/simple-gc-embedder.h | 3 + embed.mk | 4 +- src/bdw.c | 72 ++++++++ src/gc-finalizer-internal.h | 65 +++++++ src/gc-finalizer.c | 307 ++++++++++++++++++++++++++++++++ src/gc-internal.h | 1 + src/pcc.c | 67 ++++++- src/root.h | 13 +- src/semi.c | 74 ++++++-- src/whippet.c | 79 +++++++- 18 files changed, 756 insertions(+), 28 deletions(-) create mode 100644 api/gc-finalizer.h create mode 100644 src/gc-finalizer-internal.h create mode 100644 src/gc-finalizer.c diff --git a/Makefile b/Makefile index 0ea60d022..c1ba15f43 100644 --- a/Makefile +++ b/Makefile @@ -52,6 +52,8 @@ obj/gc-options.o: src/gc-options.c | .deps obj $(COMPILE) -c $< obj/%.gc-ephemeron.o: src/gc-ephemeron.c | .deps obj $(COMPILE) -include benchmarks/$*-embedder.h -c $< +obj/%.gc-finalizer.o: src/gc-finalizer.c | .deps obj + $(COMPILE) -include benchmarks/$*-embedder.h -c $< GC_STEM_bdw = bdw GC_CFLAGS_bdw = -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 @@ -99,7 +101,7 @@ obj/$(1).$(2).gc.o: src/$(call gc_impl,$(2)) | .deps obj $$(COMPILE) $(call gc_cflags,$(2)) $(call gc_impl_cflags,$(2)) -include benchmarks/$(1)-embedder.h -c $$< obj/$(1).$(2).o: benchmarks/$(1).c | .deps obj $$(COMPILE) $(call gc_cflags,$(2)) -include api/$(call gc_attrs,$(2)) -c $$< -bin/$(1).$(2): obj/$(1).$(2).gc.o obj/$(1).$(2).o obj/gc-stack.o obj/gc-options.o obj/gc-platform.o obj/$(1).gc-ephemeron.o | bin +bin/$(1).$(2): obj/$(1).$(2).gc.o obj/$(1).$(2).o obj/gc-stack.o obj/gc-options.o obj/gc-platform.o obj/$(1).gc-ephemeron.o obj/$(1).gc-finalizer.o | bin $$(LINK) $$^ $(call gc_libs,$(2)) endef diff --git a/README.md b/README.md index e1ac66150..e62b0661a 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ a talk given at FOSDEM 2023. 
- [X] Conservative data segments - [ ] Heap growth/shrinking - [ ] Debugging/tracing - - [ ] Finalizers + - [X] Finalizers - [X] Weak references / weak maps ### Features that would improve Whippet performance diff --git a/api/gc-basic-stats.h b/api/gc-basic-stats.h index af8cd4243..40cdbcb3e 100644 --- a/api/gc-basic-stats.h +++ b/api/gc-basic-stats.h @@ -58,6 +58,7 @@ static inline void gc_basic_stats_mutators_stopped(void *data) {} static inline void gc_basic_stats_roots_traced(void *data) {} static inline void gc_basic_stats_heap_traced(void *data) {} static inline void gc_basic_stats_ephemerons_traced(void *data) {} +static inline void gc_basic_stats_finalizers_traced(void *data) {} static inline void gc_basic_stats_restarting_mutators(void *data) { struct gc_basic_stats *stats = data; @@ -100,6 +101,7 @@ static inline void gc_basic_stats_live_data_size(void *data, size_t size) { gc_basic_stats_roots_traced, \ gc_basic_stats_heap_traced, \ gc_basic_stats_ephemerons_traced, \ + gc_basic_stats_finalizers_traced, \ gc_basic_stats_restarting_mutators, \ gc_basic_stats_mutator_added, \ gc_basic_stats_mutator_cause_gc, \ diff --git a/api/gc-embedder-api.h b/api/gc-embedder-api.h index 30ba62946..bb091caeb 100644 --- a/api/gc-embedder-api.h +++ b/api/gc-embedder-api.h @@ -16,7 +16,6 @@ struct gc_mutator_roots; struct gc_heap_roots; struct gc_atomic_forward; struct gc_heap; -struct gc_ephemeron; struct gc_extern_space; GC_EMBEDDER_API inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement); diff --git a/api/gc-event-listener-chain.h b/api/gc-event-listener-chain.h index 3bebf3531..96a7356a8 100644 --- a/api/gc-event-listener-chain.h +++ b/api/gc-event-listener-chain.h @@ -57,6 +57,11 @@ static inline void gc_event_listener_chain_ephemerons_traced(void *data) { chain->head.ephemerons_traced(chain->head_data); chain->tail.ephemerons_traced(chain->tail_data); } +static inline void gc_event_listener_chain_finalizers_traced(void *data) { + struct gc_event_listener_chain *chain = data; + chain->head.finalizers_traced(chain->head_data); + chain->tail.finalizers_traced(chain->tail_data); +} static inline void gc_event_listener_chain_restarting_mutators(void *data) { struct gc_event_listener_chain *chain = data; @@ -123,6 +128,7 @@ static inline void gc_event_listener_chain_live_data_size(void *data, size_t siz gc_event_listener_chain_roots_traced, \ gc_event_listener_chain_heap_traced, \ gc_event_listener_chain_ephemerons_traced, \ + gc_event_listener_chain_finalizers_traced, \ gc_event_listener_chain_restarting_mutators, \ gc_event_listener_chain_mutator_added, \ gc_event_listener_chain_mutator_cause_gc, \ diff --git a/api/gc-event-listener.h b/api/gc-event-listener.h index 25558838b..f5d8180f6 100644 --- a/api/gc-event-listener.h +++ b/api/gc-event-listener.h @@ -12,6 +12,7 @@ struct gc_event_listener { void (*roots_traced)(void *data); void (*heap_traced)(void *data); void (*ephemerons_traced)(void *data); + void (*finalizers_traced)(void *data); void (*restarting_mutators)(void *data); void* (*mutator_added)(void *data); diff --git a/api/gc-finalizer.h b/api/gc-finalizer.h new file mode 100644 index 000000000..1dcb0fb2f --- /dev/null +++ b/api/gc-finalizer.h @@ -0,0 +1,81 @@ +#ifndef GC_FINALIZER_H_ +#define GC_FINALIZER_H_ + +#include "gc-edge.h" +#include "gc-ref.h" +#include "gc-visibility.h" + +// A finalizer allows the embedder to be notified when an object becomes +// unreachable. +// +// A finalizer has a priority. 
When the heap is created, the embedder +// should declare how many priorities there are. Lower-numbered +// priorities take precedence; if an object has a priority-0 finalizer +// outstanding, that will prevent any finalizer at level 1 (or 2, ...) +// from firing until no priority-0 finalizer remains. +// +// Call gc_attach_finalizer to attach a finalizer to an object. +// +// A finalizer also references an associated GC-managed closure object. +// A finalizer's reference to the closure object is strong: if a +// finalizer's closure closure references its finalizable object, +// directly or indirectly, the finalizer will never fire. +// +// When an object with a finalizer becomes unreachable, it is added to a +// queue. The embedder can call gc_pop_finalizable to get the next +// finalizable object and its associated closure. At that point the +// embedder can do anything with the object, including keeping it alive. +// Ephemeron associations will still be present while the finalizable +// object is live. Note however that any objects referenced by the +// finalizable object may themselves be already finalized; finalizers +// are enqueued for objects when they become unreachable, which can +// concern whole subgraphs of objects at once. +// +// The usual way for an embedder to know when the queue of finalizable +// object is non-empty is to call gc_set_finalizer_callback to +// provide a function that will be invoked when there are pending +// finalizers. +// +// Arranging to call gc_pop_finalizable and doing something with the +// finalizable object and closure is the responsibility of the embedder. +// The embedder's finalization action can end up invoking arbitrary +// code, so unless the embedder imposes some kind of restriction on what +// finalizers can do, generally speaking finalizers should be run in a +// dedicated thread instead of recursively from within whatever mutator +// thread caused GC. Setting up such a thread is the responsibility of +// the mutator. gc_pop_finalizable is thread-safe, allowing multiple +// finalization threads if that is appropriate. +// +// gc_allocate_finalizer returns a finalizer, which is a fresh +// GC-managed heap object. The mutator should then directly attach it +// to an object using gc_finalizer_attach. When the finalizer is fired, +// it becomes available to the mutator via gc_pop_finalizable. 
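
A rough sketch of how an embedder might put these pieces together, using only the declarations below; `enqueue_finalizer_work` and `handle_finalized` are hypothetical embedder-side helpers, and the priority count is assumed to be at least 1:

// Invoked by the collector when fired finalizers are pending.
static void on_finalizers_pending(struct gc_heap *heap, size_t count) {
  // Typically: wake a dedicated finalization thread rather than running
  // finalizers from whichever mutator happened to cause the GC.
  enqueue_finalizer_work(count);
}

// Attach a priority-0 (highest-precedence) finalizer to an object.
static void attach_finalizer(struct gc_mutator *mut,
                             struct gc_ref object, struct gc_ref closure) {
  struct gc_finalizer *f = gc_allocate_finalizer(mut);
  gc_finalizer_attach(mut, f, 0, object, closure);
}

// Run from the finalization thread: pop fired finalizers and act on them.
static void drain_fired_finalizers(struct gc_mutator *mut) {
  struct gc_finalizer *f;
  while ((f = gc_pop_finalizable(mut)))
    handle_finalized(gc_finalizer_object(f), gc_finalizer_closure(f));
}

// Registered once at startup:
//   gc_set_finalizer_callback(heap, on_finalizers_pending);
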
+ +struct gc_heap; +struct gc_mutator; +struct gc_finalizer; + +GC_API_ size_t gc_finalizer_size(void); +GC_API_ struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut); +GC_API_ void gc_finalizer_attach(struct gc_mutator *mut, + struct gc_finalizer *finalizer, + unsigned priority, + struct gc_ref object, struct gc_ref closure); + +GC_API_ struct gc_ref gc_finalizer_object(struct gc_finalizer *finalizer); +GC_API_ struct gc_ref gc_finalizer_closure(struct gc_finalizer *finalizer); + +GC_API_ struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut); + +typedef void (*gc_finalizer_callback)(struct gc_heap *heap, size_t count); +GC_API_ void gc_set_finalizer_callback(struct gc_heap *heap, + gc_finalizer_callback callback); + +GC_API_ void gc_trace_finalizer(struct gc_finalizer *finalizer, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *trace_data); + +#endif // GC_FINALIZER_H_ diff --git a/api/gc-null-event-listener.h b/api/gc-null-event-listener.h index 5ca17975e..7563c3a46 100644 --- a/api/gc-null-event-listener.h +++ b/api/gc-null-event-listener.h @@ -12,6 +12,7 @@ static inline void gc_null_event_listener_mutators_stopped(void *data) {} static inline void gc_null_event_listener_roots_traced(void *data) {} static inline void gc_null_event_listener_heap_traced(void *data) {} static inline void gc_null_event_listener_ephemerons_traced(void *data) {} +static inline void gc_null_event_listener_finalizers_traced(void *data) {} static inline void gc_null_event_listener_restarting_mutators(void *data) {} static inline void* gc_null_event_listener_mutator_added(void *data) {} @@ -34,6 +35,7 @@ static inline void gc_null_event_listener_live_data_size(void *, size_t) {} gc_null_event_listener_roots_traced, \ gc_null_event_listener_heap_traced, \ gc_null_event_listener_ephemerons_traced, \ + gc_null_event_listener_finalizers_traced, \ gc_null_event_listener_restarting_mutators, \ gc_null_event_listener_mutator_added, \ gc_null_event_listener_mutator_cause_gc, \ diff --git a/benchmarks/simple-gc-embedder.h b/benchmarks/simple-gc-embedder.h index 23fc54a5d..683cc15ca 100644 --- a/benchmarks/simple-gc-embedder.h +++ b/benchmarks/simple-gc-embedder.h @@ -6,6 +6,9 @@ #include "gc-embedder-api.h" #define GC_EMBEDDER_EPHEMERON_HEADER struct gc_header header; +#define GC_EMBEDDER_FINALIZER_HEADER struct gc_header header; + +static inline size_t gc_finalizer_priority_count(void) { return 2; } static inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement) { diff --git a/embed.mk b/embed.mk index 49a7b7347..9284781e0 100644 --- a/embed.mk +++ b/embed.mk @@ -31,6 +31,8 @@ $(GC_OBJDIR)gc-options.o: $(WHIPPET)src/gc-options.c $(GC_COMPILE) -c $< $(GC_OBJDIR)gc-ephemeron.o: $(WHIPPET)src/gc-ephemeron.c $(GC_COMPILE) $(EMBEDDER_TO_GC_CFLAGS) -c $< +$(GC_OBJDIR)gc-finalizer.o: $(WHIPPET)src/gc-finalizer.c + $(GC_COMPILE) $(EMBEDDER_TO_GC_CFLAGS) -c $< GC_STEM_bdw = bdw GC_CFLAGS_bdw = -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 @@ -83,4 +85,4 @@ GC_LIBS = $(call gc_libs,$(GC_COLLECTOR)) $(GC_OBJDIR)gc-impl.o: $(WHIPPET)src/$(call gc_impl,$(GC_COLLECTOR)) $(GC_COMPILE) $(GC_IMPL_CFLAGS) $(EMBEDDER_TO_GC_CFLAGS) -c $< -GC_OBJS=$(foreach O,gc-platform.o gc-stack.o gc-options.o gc-ephemeron.o gc-impl.o,$(GC_OBJDIR)$(O)) +GC_OBJS=$(foreach O,gc-platform.o gc-stack.o gc-options.o gc-ephemeron.o gc-finalizer.o gc-impl.o,$(GC_OBJDIR)$(O)) diff --git a/src/bdw.c b/src/bdw.c index f429b43c2..e9b7cb9f5 100644 --- 
a/src/bdw.c +++ b/src/bdw.c @@ -55,6 +55,8 @@ struct gc_heap { struct gc_heap_roots *roots; struct gc_mutator *mutators; struct gc_event_listener event_listener; + struct gc_finalizer_state *finalizer_state; + gc_finalizer_callback have_finalizers; void *event_listener_data; }; @@ -165,6 +167,7 @@ static void bdw_mark_edge(struct gc_edge edge, struct gc_heap *heap, static int heap_gc_kind; static int mutator_gc_kind; static int ephemeron_gc_kind; +static int finalizer_gc_kind; // In BDW-GC, we can't hook into the mark phase to call // gc_trace_ephemerons_for_object, so the advertised ephemeron strategy @@ -199,6 +202,46 @@ int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { return 1; } +struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { + return GC_generic_malloc(gc_finalizer_size(), finalizer_gc_kind); +} + +static void finalize_object(void *obj, void *data) { + struct gc_finalizer *f = data; + gc_finalizer_externally_fired(__the_bdw_gc_heap->finalizer_state, f); +} + +void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, + unsigned priority, struct gc_ref object, + struct gc_ref closure) { + // Don't bother much about the actual finalizer; just delegate to BDW-GC. + GC_finalization_proc prev = NULL; + void *prev_data = NULL; + gc_finalizer_init_internal(finalizer, object, closure); + gc_finalizer_externally_activated(finalizer); + GC_REGISTER_FINALIZER_NO_ORDER (gc_ref_heap_object(object), finalize_object, + finalizer, &prev, &prev_data); + // FIXME: Allow multiple finalizers per object. + GC_ASSERT(prev == NULL); + GC_ASSERT(prev_data == NULL); +} + +struct gc_finalizer* gc_finalizer_pop(struct gc_mutator *mut) { + GC_invoke_finalizers(); + return gc_finalizer_state_pop(mut->heap->finalizer_state); +} + +void gc_set_finalizer_callback(struct gc_heap *heap, + gc_finalizer_callback callback) { + heap->have_finalizers = callback; +} + +static void have_finalizers(void) { + struct gc_heap *heap = __the_bdw_gc_heap; + if (heap->have_finalizers) + heap->have_finalizers(heap, 1); +} + static struct GC_ms_entry * mark_ephemeron(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, struct GC_ms_entry *mark_stack_limit, GC_word env) { @@ -228,6 +271,29 @@ mark_ephemeron(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, return state.mark_stack_ptr; } +static struct GC_ms_entry * +mark_finalizer(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, + struct GC_ms_entry *mark_stack_limit, GC_word env) { + + struct bdw_mark_state state = { + mark_stack_ptr, + mark_stack_limit, + }; + + struct gc_finalizer *finalizer = (struct gc_finalizer*) addr; + + // If this ephemeron is on a freelist, its first word will be a + // freelist link and everything else will be NULL. + if (!gc_ref_value(gc_finalizer_object(finalizer))) { + bdw_mark_edge(gc_edge(addr), NULL, &state); + return state.mark_stack_ptr; + } + + gc_trace_finalizer(finalizer, bdw_mark_edge, NULL, &state); + + return state.mark_stack_ptr; +} + static struct GC_ms_entry * mark_heap(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, struct GC_ms_entry *mark_stack_limit, GC_word env) { @@ -428,6 +494,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, } GC_set_all_interior_pointers (0); + GC_set_finalize_on_demand (1); + GC_set_finalizer_notifier(have_finalizers); // Not part of 7.3, sigh. Have to set an env var. 
// GC_set_markers_count(options->common.parallelism); @@ -453,6 +521,9 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, ephemeron_gc_kind = GC_new_kind(GC_new_free_list(), GC_MAKE_PROC(GC_new_proc(mark_ephemeron), 0), add_size_to_descriptor, clear_memory); + finalizer_gc_kind = GC_new_kind(GC_new_free_list(), + GC_MAKE_PROC(GC_new_proc(mark_finalizer), 0), + add_size_to_descriptor, clear_memory); } *heap = GC_generic_malloc(sizeof(struct gc_heap), heap_gc_kind); @@ -460,6 +531,7 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, (*heap)->event_listener = event_listener; (*heap)->event_listener_data = event_listener_data; + (*heap)->finalizer_state = gc_make_finalizer_state(); __the_bdw_gc_heap = *heap; HEAP_EVENT(init, GC_get_heap_size()); diff --git a/src/gc-finalizer-internal.h b/src/gc-finalizer-internal.h new file mode 100644 index 000000000..529a087ee --- /dev/null +++ b/src/gc-finalizer-internal.h @@ -0,0 +1,65 @@ +#ifndef GC_FINALIZER_INTERNAL_H +#define GC_FINALIZER_INTERNAL_H + +#ifndef GC_IMPL +#error internal header file, not part of API +#endif + +#include "gc-finalizer.h" +#include "root.h" + +struct gc_finalizer_state; + +GC_INTERNAL +struct gc_finalizer_state* gc_make_finalizer_state(void); + +GC_INTERNAL +void gc_finalizer_init_internal(struct gc_finalizer *f, + struct gc_ref object, + struct gc_ref closure); + +GC_INTERNAL +void gc_finalizer_attach_internal(struct gc_finalizer_state *state, + struct gc_finalizer *f, + unsigned priority); + +GC_INTERNAL +void gc_finalizer_externally_activated(struct gc_finalizer *f); + +GC_INTERNAL +void gc_finalizer_externally_fired(struct gc_finalizer_state *state, + struct gc_finalizer *finalizer); + +GC_INTERNAL +struct gc_finalizer* gc_finalizer_state_pop(struct gc_finalizer_state *state); + +GC_INTERNAL +void gc_finalizer_fire(struct gc_finalizer **fired_list_loc, + struct gc_finalizer *finalizer); + +GC_INTERNAL +void gc_finalizer_state_set_callback(struct gc_finalizer_state *state, + gc_finalizer_callback callback); + +GC_INTERNAL +size_t gc_visit_finalizer_roots(struct gc_finalizer_state *state, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *visit_data); + +GC_INTERNAL +size_t gc_resolve_finalizers(struct gc_finalizer_state *state, + size_t priority, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *visit_data); + +GC_INTERNAL +void gc_notify_finalizers(struct gc_finalizer_state *state, + struct gc_heap *heap); + +#endif // GC_FINALIZER_INTERNAL_H diff --git a/src/gc-finalizer.c b/src/gc-finalizer.c new file mode 100644 index 000000000..ed9c39173 --- /dev/null +++ b/src/gc-finalizer.c @@ -0,0 +1,307 @@ +#include +#include +#include +#include + +#define GC_IMPL 1 + +#include "debug.h" +#include "gc-embedder-api.h" +#include "gc-ephemeron-internal.h" // for gc_visit_ephemeron_key +#include "gc-finalizer-internal.h" + +// # Overview +// +// See gc-finalizer.h for a overview of finalizers from the user and +// embedder point of view. +// +// ## Tracing +// +// From the perspecive of the collector implementation, finalizers are +// GC-managed objects, allowing their size to be accounted for within +// the heap size. They get traced during collection, allowing for +// relocation of their object references, and allowing the finalizer +// object itself to be evacuated if appropriate. 
+// +// The collector holds on to outstanding finalizers in a *finalizer +// state*, which holds one *finalizer table* for each priority. We +// don't need to look up finalizers by object, so we could just hold +// them in a big list, but to facilitate parallelism we slice them +// across some number of shards, where the "next" pointer is part of the +// finalizer object. +// +// There are a number of ways you could imagine integrating finalizers +// into a system. The way Whippet does it goes like this. See +// https://wingolog.org/archives/2022/10/31/ephemerons-and-finalizers +// and +// https://wingolog.org/archives/2024/07/22/finalizers-guardians-phantom-references-et-cetera +// for some further discussion. +// +// 1. The collector should begin a cycle by adding all shards from all +// priorities to the root set. When the embedder comes across a +// finalizer (as it will, because we added them to the root set), +// it traces it via gc_trace_finalizer(), which will visit the +// finalizer's closure and its "next" pointer. +// +// 2. After the full trace, and then the fix-point on pending +// ephemerons, for each priority from 0 upwards: +// +// i. Visit each finalizable object in the table. If the object +// was as-yet unvisited, then it is unreachable and thus +// finalizable; the finalizer is added to the global "fired" +// list, and changes state from "attached" to "fired". +// Otherwise it is re-added to the finalizer table. +// +// ii. If any finalizer was added to the fired list, then those +// objects were also added to the grey worklist; run tracing +// again until the grey set is empty, including ephemerons. +// +// 3. Finally, call the finalizer callback if the list of fired finalizers is +// nonempty. +// +// ## Concurrency +// +// The finalizer table is wait-free. It keeps a count of active finalizers, and +// chooses a bucket based on the count modulo the number of buckets. Adding a +// finalizer to the table is an atomic push on a linked list. The table is +// completely rebuilt during the GC pause, redistributing survivor entries +// across the buckets, and pushing all finalizable entries onto the single +// "fired" linked list. +// +// The fired list is also wait-free. As noted above, it is built +// during the pause, and mutators pop items off of it atomically. +// +// ## Generations +// +// It would be ideal if a young generation had its own finalizer table. +// Promoting an object would require promoting its finalizer to the old +// finalizer table. Not yet implemented (but would be nice). + +#ifndef GC_EMBEDDER_FINALIZER_HEADER +#error Embedder should define GC_EMBEDDER_FINALIZER_HEADER +#endif + +enum finalizer_state { + FINALIZER_STATE_INIT = 0, // Finalizer is newborn. + FINALIZER_STATE_ACTIVE, // Finalizer is ours and in the finalizer table. + FINALIZER_STATE_FIRED, // Finalizer is handed back to mutator. +}; + +struct gc_finalizer { + GC_EMBEDDER_FINALIZER_HEADER + enum finalizer_state state; + struct gc_ref object; + struct gc_ref closure; + struct gc_finalizer *next; +}; + +// Enough buckets to parallelize closure marking. No need to look up a +// finalizer for a given object. 
+#define BUCKET_COUNT 32 + +struct gc_finalizer_table { + size_t finalizer_count; + struct gc_finalizer* buckets[BUCKET_COUNT]; +}; + +struct gc_finalizer_state { + gc_finalizer_callback have_finalizers; + struct gc_finalizer *fired; + size_t fired_this_cycle; + size_t table_count; + struct gc_finalizer_table tables[0]; +}; + +// public +size_t gc_finalizer_size(void) { return sizeof(struct gc_finalizer); } +struct gc_ref gc_finalizer_object(struct gc_finalizer *f) { return f->object; } +struct gc_ref gc_finalizer_closure(struct gc_finalizer *f) { return f->closure; } + +// internal +struct gc_finalizer_state* gc_make_finalizer_state(void) { + size_t ntables = gc_finalizer_priority_count(); + size_t size = (sizeof(struct gc_finalizer_state) + + sizeof(struct gc_finalizer_table) * ntables); + struct gc_finalizer_state *ret = malloc(size); + if (!ret) + return NULL; + memset(ret, 0, size); + ret->table_count = ntables; + return ret; +} + +static void finalizer_list_push(struct gc_finalizer **loc, + struct gc_finalizer *head) { + struct gc_finalizer *tail = atomic_load_explicit(loc, memory_order_acquire); + do { + head->next = tail; + } while (!atomic_compare_exchange_weak(loc, &tail, head)); +} + +static struct gc_finalizer* finalizer_list_pop(struct gc_finalizer **loc) { + struct gc_finalizer *head = atomic_load_explicit(loc, memory_order_acquire); + do { + if (!head) return NULL; + } while (!atomic_compare_exchange_weak(loc, &head, head->next)); + head->next = NULL; + return head; +} + +static void add_finalizer_to_table(struct gc_finalizer_table *table, + struct gc_finalizer *f) { + size_t count = atomic_fetch_add_explicit(&table->finalizer_count, 1, + memory_order_relaxed); + struct gc_finalizer **loc = &table->buckets[count % BUCKET_COUNT]; + finalizer_list_push(loc, f); +} + +// internal +void gc_finalizer_init_internal(struct gc_finalizer *f, + struct gc_ref object, + struct gc_ref closure) { + // Caller responsible for any write barrier, though really the + // assumption is that the finalizer is younger than the key and the + // value. + if (f->state != FINALIZER_STATE_INIT) + GC_CRASH(); + if (gc_ref_is_heap_object(f->object)) + GC_CRASH(); + f->object = object; + f->closure = closure; +} + +// internal +void gc_finalizer_attach_internal(struct gc_finalizer_state *state, + struct gc_finalizer *f, + unsigned priority) { + // Caller responsible for any write barrier, though really the + // assumption is that the finalizer is younger than the key and the + // value. 
+ if (f->state != FINALIZER_STATE_INIT) + GC_CRASH(); + if (!gc_ref_is_heap_object(f->object)) + GC_CRASH(); + + f->state = FINALIZER_STATE_ACTIVE; + + GC_ASSERT(priority < state->table_count); + add_finalizer_to_table(&state->tables[priority], f); +} + +// internal +struct gc_finalizer* gc_finalizer_state_pop(struct gc_finalizer_state *state) { + return finalizer_list_pop(&state->fired); +} + +static void +add_fired_finalizer(struct gc_finalizer_state *state, + struct gc_finalizer *f) { + if (f->state != FINALIZER_STATE_ACTIVE) + GC_CRASH(); + f->state = FINALIZER_STATE_FIRED; + finalizer_list_push(&state->fired, f); +} + +// internal +void +gc_finalizer_externally_activated(struct gc_finalizer *f) { + if (f->state != FINALIZER_STATE_INIT) + GC_CRASH(); + f->state = FINALIZER_STATE_ACTIVE; +} + +// internal +void +gc_finalizer_externally_fired(struct gc_finalizer_state *state, + struct gc_finalizer *f) { + add_fired_finalizer(state, f); +} + +// internal +size_t gc_visit_finalizer_roots(struct gc_finalizer_state *state, + void (*visit)(struct gc_edge, + struct gc_heap*, + void *), + struct gc_heap *heap, + void *visit_data) { + size_t count; + for (size_t tidx = 0; tidx < state->table_count; tidx++) { + struct gc_finalizer_table *table = &state->tables[tidx]; + if (table->finalizer_count) { + count += table->finalizer_count; + for (size_t bidx = 0; bidx < BUCKET_COUNT; bidx++) + visit(gc_edge(&table->buckets[bidx]), heap, visit_data); + } + } + return count; +} + +// public +void gc_trace_finalizer(struct gc_finalizer *f, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *trace_data) { + if (f->state != FINALIZER_STATE_ACTIVE) + visit(gc_edge(&f->object), heap, trace_data); + visit(gc_edge(&f->closure), heap, trace_data); + visit(gc_edge(&f->next), heap, trace_data); +} + +// Sweeping is currently serial. It could run in parallel but we want to +// resolve all finalizers before shading any additional node. Perhaps we should +// relax this restriction though; if the user attaches two finalizers to the +// same object, it's probably OK to only have one finalizer fire per cycle. + +// internal +size_t gc_resolve_finalizers(struct gc_finalizer_state *state, + size_t priority, + void (*visit)(struct gc_edge edge, + struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *visit_data) { + GC_ASSERT(priority < state->table_count); + struct gc_finalizer_table *table = &state->tables[priority]; + size_t finalizers_fired = 0; + // Visit each finalizer in the table. If its object was already visited, + // re-add the finalizer to the table. Otherwise enqueue its object edge for + // tracing and mark the finalizer as fired. 
+ if (table->finalizer_count) { + struct gc_finalizer_table scratch = { 0, }; + for (size_t bidx = 0; bidx < BUCKET_COUNT; bidx++) { + struct gc_finalizer *next; + for (struct gc_finalizer *f = table->buckets[bidx]; f; f = next) { + next = f->next; + f->next = NULL; + struct gc_edge edge = gc_edge(&f->object); + if (gc_visit_ephemeron_key(edge, heap)) { + add_finalizer_to_table(&scratch, f); + } else { + finalizers_fired++; + visit(edge, heap, visit_data); + add_fired_finalizer(state, f); + } + } + } + memcpy(table, &scratch, sizeof(*table)); + } + state->fired_this_cycle += finalizers_fired; + return finalizers_fired; +} + +// internal +void gc_notify_finalizers(struct gc_finalizer_state *state, + struct gc_heap *heap) { + if (state->fired_this_cycle && state->have_finalizers) { + state->have_finalizers(heap, state->fired_this_cycle); + state->fired_this_cycle = 0; + } +} + +// internal +void gc_finalizer_state_set_callback(struct gc_finalizer_state *state, + gc_finalizer_callback callback) { + state->have_finalizers = callback; +} diff --git a/src/gc-internal.h b/src/gc-internal.h index abc9bd83a..7cbb79f58 100644 --- a/src/gc-internal.h +++ b/src/gc-internal.h @@ -6,6 +6,7 @@ #endif #include "gc-ephemeron-internal.h" +#include "gc-finalizer-internal.h" #include "gc-options-internal.h" #endif // GC_INTERNAL_H diff --git a/src/pcc.c b/src/pcc.c index c71f2d04e..fa7342e4d 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -131,6 +131,7 @@ struct gc_heap { int collecting; int check_pending_ephemerons; struct gc_pending_ephemerons *pending_ephemerons; + struct gc_finalizer_state *finalizer_state; size_t mutator_count; size_t paused_mutator_count; size_t inactive_mutator_count; @@ -649,6 +650,9 @@ static inline void trace_root(struct gc_root root, struct gc_heap *heap, gc_trace_resolved_ephemerons(root.resolved_ephemerons, tracer_visit, heap, worker); break; + case GC_ROOT_KIND_EDGE: + tracer_visit(root.edge, heap, worker); + break; default: GC_CRASH(); } @@ -712,10 +716,16 @@ static int maybe_grow_heap(struct gc_heap *heap) { return 0; } +static void visit_root_edge(struct gc_edge edge, struct gc_heap *heap, + void *unused) { + gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); +} + static void add_roots(struct gc_heap *heap) { for (struct gc_mutator *mut = heap->mutators; mut; mut = mut->next) gc_tracer_add_root(&heap->tracer, gc_root_mutator(mut)); gc_tracer_add_root(&heap->tracer, gc_root_heap(heap)); + gc_visit_finalizer_roots(heap->finalizer_state, visit_root_edge, heap, NULL); } static void resolve_ephemerons_lazily(struct gc_heap *heap) { @@ -729,12 +739,26 @@ static void resolve_ephemerons_eagerly(struct gc_heap *heap) { gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); } -static int enqueue_resolved_ephemerons(struct gc_heap *heap) { - struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); - if (!resolved) - return 0; - gc_tracer_add_root(&heap->tracer, gc_root_resolved_ephemerons(resolved)); - return 1; +static void trace_resolved_ephemerons(struct gc_heap *heap) { + for (struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); + resolved; + resolved = gc_pop_resolved_ephemerons(heap)) { + gc_tracer_add_root(&heap->tracer, gc_root_resolved_ephemerons(resolved)); + gc_tracer_trace(&heap->tracer); + } +} + +static void resolve_finalizers(struct gc_heap *heap) { + for (size_t priority = 0; + priority < gc_finalizer_priority_count(); + priority++) { + if (gc_resolve_finalizers(heap->finalizer_state, priority, + visit_root_edge, heap, NULL)) { + 
gc_tracer_trace(&heap->tracer); + trace_resolved_ephemerons(heap); + } + } + gc_notify_finalizers(heap->finalizer_state, heap); } static void sweep_ephemerons(struct gc_heap *heap) { @@ -765,9 +789,10 @@ static void collect(struct gc_mutator *mut) { gc_tracer_trace(&heap->tracer); HEAP_EVENT(heap, heap_traced); resolve_ephemerons_eagerly(heap); - while (enqueue_resolved_ephemerons(heap)) - gc_tracer_trace(&heap->tracer); + trace_resolved_ephemerons(heap); HEAP_EVENT(heap, ephemerons_traced); + resolve_finalizers(heap); + HEAP_EVENT(heap, finalizers_traced); sweep_ephemerons(heap); gc_tracer_release(&heap->tracer); pcc_space_finish_gc(cspace); @@ -891,6 +916,28 @@ unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { return heap->count; } +struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { + return gc_allocate(mut, gc_finalizer_size()); +} + +void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, + unsigned priority, struct gc_ref object, + struct gc_ref closure) { + gc_finalizer_init_internal(finalizer, object, closure); + gc_finalizer_attach_internal(mutator_heap(mut)->finalizer_state, + finalizer, priority); + // No write barrier. +} + +struct gc_finalizer* gc_finalizer_pop(struct gc_mutator *mut) { + return gc_finalizer_state_pop(mutator_heap(mut)->finalizer_state); +} + +void gc_set_finalizer_callback(struct gc_heap *heap, + gc_finalizer_callback callback) { + gc_finalizer_state_set_callback(heap->finalizer_state, callback); +} + static struct pcc_slab* allocate_slabs(size_t nslabs) { size_t size = nslabs * SLAB_SIZE; size_t extent = size + SLAB_SIZE; @@ -969,6 +1016,10 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { if (!heap_prepare_pending_ephemerons(heap)) GC_CRASH(); + heap->finalizer_state = gc_make_finalizer_state(); + if (!heap->finalizer_state) + GC_CRASH(); + return 1; } diff --git a/src/root.h b/src/root.h index a6a91f987..5228dcb4f 100644 --- a/src/root.h +++ b/src/root.h @@ -1,6 +1,8 @@ #ifndef ROOT_H #define ROOT_H +#include "gc-edge.h" + struct gc_ephemeron; struct gc_heap; struct gc_mutator; @@ -9,7 +11,8 @@ enum gc_root_kind { GC_ROOT_KIND_NONE, GC_ROOT_KIND_HEAP, GC_ROOT_KIND_MUTATOR, - GC_ROOT_KIND_RESOLVED_EPHEMERONS + GC_ROOT_KIND_RESOLVED_EPHEMERONS, + GC_ROOT_KIND_EDGE, }; struct gc_root { @@ -18,6 +21,7 @@ struct gc_root { struct gc_heap *heap; struct gc_mutator *mutator; struct gc_ephemeron *resolved_ephemerons; + struct gc_edge edge; }; }; @@ -40,4 +44,11 @@ gc_root_resolved_ephemerons(struct gc_ephemeron* resolved) { return ret; } +static inline struct gc_root +gc_root_edge(struct gc_edge edge) { + struct gc_root ret = { GC_ROOT_KIND_EDGE }; + ret.edge = edge; + return ret; +} + #endif // ROOT_H diff --git a/src/semi.c b/src/semi.c index af9134bd7..739a21d75 100644 --- a/src/semi.c +++ b/src/semi.c @@ -37,6 +37,7 @@ struct gc_heap { struct semi_space semi_space; struct large_object_space large_object_space; struct gc_pending_ephemerons *pending_ephemerons; + struct gc_finalizer_state *finalizer_state; struct gc_extern_space *extern_space; double pending_ephemerons_size_factor; double pending_ephemerons_size_slop; @@ -350,6 +351,37 @@ static void adjust_heap_size_and_limits(struct gc_heap *heap, semi->limit = new_limit; } +static uintptr_t trace_closure(struct gc_heap *heap, struct semi_space *semi, + uintptr_t grey) { + while(grey < semi->hp) + grey = scan(heap, gc_ref(grey)); + return grey; +} + +static uintptr_t resolve_ephemerons(struct gc_heap *heap, uintptr_t grey) { + 
for (struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); + resolved; + resolved = gc_pop_resolved_ephemerons(heap)) { + gc_trace_resolved_ephemerons(resolved, trace, heap, NULL); + grey = trace_closure(heap, heap_semi_space(heap), grey); + } + return grey; +} + +static uintptr_t resolve_finalizers(struct gc_heap *heap, uintptr_t grey) { + for (size_t priority = 0; + priority < gc_finalizer_priority_count(); + priority++) { + if (gc_resolve_finalizers(heap->finalizer_state, priority, + trace, heap, NULL)) { + grey = trace_closure(heap, heap_semi_space(heap), grey); + grey = resolve_ephemerons(heap, grey); + } + } + gc_notify_finalizers(heap->finalizer_state, heap); + return grey; +} + static void collect(struct gc_mutator *mut, size_t for_alloc) { struct gc_heap *heap = mutator_heap(mut); int is_minor = 0; @@ -373,22 +405,17 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { gc_trace_heap_roots(heap->roots, trace, heap, NULL); if (mut->roots) gc_trace_mutator_roots(mut->roots, trace, heap, NULL); + gc_visit_finalizer_roots(heap->finalizer_state, trace, heap, NULL); HEAP_EVENT(heap, roots_traced); // fprintf(stderr, "pushed %zd bytes in roots\n", space->hp - grey); - while(grey < semi->hp) - grey = scan(heap, gc_ref(grey)); + grey = trace_closure(heap, semi, grey); HEAP_EVENT(heap, heap_traced); gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); heap->check_pending_ephemerons = 1; - do { - struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); - if (!resolved) - break; - gc_trace_resolved_ephemerons(resolved, trace, heap, NULL); - while(grey < semi->hp) - grey = scan(heap, gc_ref(grey)); - } while (1); + grey = resolve_ephemerons(heap, grey); HEAP_EVENT(heap, ephemerons_traced); + grey = resolve_finalizers(heap, grey); + HEAP_EVENT(heap, finalizers_traced); large_object_space_finish_gc(large, 0); gc_extern_space_finish_gc(heap->extern_space, 0); semi_space_finish_gc(semi, large->live_pages_at_last_collection); @@ -486,6 +513,28 @@ void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, gc_ephemeron_init_internal(mutator_heap(mut), ephemeron, key, value); } +struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { + return gc_allocate(mut, gc_finalizer_size()); +} + +void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, + unsigned priority, struct gc_ref object, + struct gc_ref closure) { + gc_finalizer_init_internal(finalizer, object, closure); + gc_finalizer_attach_internal(mutator_heap(mut)->finalizer_state, + finalizer, priority); + // No write barrier. 
+} + +struct gc_finalizer* gc_finalizer_pop(struct gc_mutator *mut) { + return gc_finalizer_state_pop(mutator_heap(mut)->finalizer_state); +} + +void gc_set_finalizer_callback(struct gc_heap *heap, + gc_finalizer_callback callback) { + gc_finalizer_state_set_callback(heap->finalizer_state, callback); +} + static int region_init(struct region *region, size_t size) { region->base = 0; region->active_size = 0; @@ -542,8 +591,11 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { heap->options = options; heap->size = options->common.heap_size; heap->roots = NULL; + heap->finalizer_state = gc_make_finalizer_state(); + if (!heap->finalizer_state) + GC_CRASH(); - return heap_prepare_pending_ephemerons(heap); +return heap_prepare_pending_ephemerons(heap); } int gc_option_from_string(const char *str) { diff --git a/src/whippet.c b/src/whippet.c index 3e17f5422..3771babde 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -301,6 +301,7 @@ struct gc_heap { int mark_while_stopping; int check_pending_ephemerons; struct gc_pending_ephemerons *pending_ephemerons; + struct gc_finalizer_state *finalizer_state; enum gc_collection_kind gc_kind; int multithreaded; size_t mutator_count; @@ -1231,8 +1232,28 @@ static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, static inline void trace_root(struct gc_root root, struct gc_heap *heap, struct gc_trace_worker *worker) { - // We don't use parallel root tracing yet. - GC_CRASH(); + switch (root.kind) { + case GC_ROOT_KIND_HEAP: + gc_trace_heap_roots(root.heap->roots, tracer_visit, heap, worker); + break; + case GC_ROOT_KIND_MUTATOR: + gc_trace_mutator_roots(root.mutator->roots, tracer_visit, heap, worker); + break; + case GC_ROOT_KIND_RESOLVED_EPHEMERONS: + gc_trace_resolved_ephemerons(root.resolved_ephemerons, tracer_visit, + heap, worker); + break; + case GC_ROOT_KIND_EDGE: + tracer_visit(root.edge, heap, worker); + break; + default: + GC_CRASH(); + } +} + +static void visit_root_edge(struct gc_edge edge, struct gc_heap *heap, + void *unused) { + gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); } static void @@ -1823,6 +1844,7 @@ static void trace_pinned_roots_after_stop(struct gc_heap *heap) { static void trace_roots_after_stop(struct gc_heap *heap) { trace_mutator_roots_after_stop(heap); gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap, NULL); + gc_visit_finalizer_roots(heap->finalizer_state, visit_root_edge, heap, NULL); trace_generational_roots(heap); } @@ -1890,6 +1912,28 @@ static int enqueue_resolved_ephemerons(struct gc_heap *heap) { return 1; } +static void trace_resolved_ephemerons(struct gc_heap *heap) { + for (struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); + resolved; + resolved = gc_pop_resolved_ephemerons(heap)) { + gc_tracer_add_root(&heap->tracer, gc_root_resolved_ephemerons(resolved)); + gc_tracer_trace(&heap->tracer); + } +} + +static void resolve_finalizers(struct gc_heap *heap) { + for (size_t priority = 0; + priority < gc_finalizer_priority_count(); + priority++) { + if (gc_resolve_finalizers(heap->finalizer_state, priority, + visit_root_edge, heap, NULL)) { + gc_tracer_trace(&heap->tracer); + trace_resolved_ephemerons(heap); + } + } + gc_notify_finalizers(heap->finalizer_state, heap); +} + static void sweep_ephemerons(struct gc_heap *heap) { return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); } @@ -1934,9 +1978,10 @@ static void collect(struct gc_mutator *mut, gc_tracer_trace(&heap->tracer); HEAP_EVENT(heap, heap_traced); 
resolve_ephemerons_eagerly(heap); - while (enqueue_resolved_ephemerons(heap)) - gc_tracer_trace(&heap->tracer); + trace_resolved_ephemerons(heap); HEAP_EVENT(heap, ephemerons_traced); + resolve_finalizers(heap); + HEAP_EVENT(heap, finalizers_traced); sweep_ephemerons(heap); gc_tracer_release(&heap->tracer); mark_space_finish_gc(space, gc_kind); @@ -2322,6 +2367,28 @@ unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { return heap->count; } +struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { + return gc_allocate(mut, gc_finalizer_size()); +} + +void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, + unsigned priority, struct gc_ref object, + struct gc_ref closure) { + gc_finalizer_init_internal(finalizer, object, closure); + gc_finalizer_attach_internal(mutator_heap(mut)->finalizer_state, + finalizer, priority); + // No write barrier. +} + +struct gc_finalizer* gc_finalizer_pop(struct gc_mutator *mut) { + return gc_finalizer_state_pop(mutator_heap(mut)->finalizer_state); +} + +void gc_set_finalizer_callback(struct gc_heap *heap, + gc_finalizer_callback callback) { + gc_finalizer_state_set_callback(heap->finalizer_state, callback); +} + static struct slab* allocate_slabs(size_t nslabs) { size_t size = nslabs * SLAB_SIZE; size_t extent = size + SLAB_SIZE; @@ -2406,6 +2473,10 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { if (!heap_prepare_pending_ephemerons(heap)) GC_CRASH(); + heap->finalizer_state = gc_make_finalizer_state(); + if (!heap->finalizer_state) + GC_CRASH(); + return 1; } From 068e0e5cdf9179d368514138b61219a639b4067a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 24 Jul 2024 09:51:56 +0200 Subject: [PATCH 244/403] Add benchmark for finalizers Doesn't quite work on BDW yet. 
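As orientation for the benchmark below, here is a sketch (not part of this
patch) of the basic attach-then-drain pattern it exercises, using the
finalizer API added in the previous commits.  `allocate_embedder_object` is
a hypothetical stand-in for an embedder's allocator; a real embedder, like
the benchmark, also tags the finalizer object for its own tracing scheme.

#include "gc-api.h"
#include "gc-finalizer.h"

/* Hypothetical embedder allocation helper, for illustration only. */
extern void* allocate_embedder_object(struct gc_mutator *mut);

static void attach_and_drain(struct gc_mutator *mut) {
  void *obj = allocate_embedder_object(mut);
  void *closure = allocate_embedder_object(mut);

  /* Attach: roughly, once `obj` is no longer otherwise reachable, the
     finalizer becomes fireable, with `closure` kept live. */
  struct gc_finalizer *f = gc_allocate_finalizer(mut);
  gc_finalizer_attach(mut, f, /* priority */ 0,
                      gc_ref_from_heap_object(obj),
                      gc_ref_from_heap_object(closure));

  /* ... later, after a collection has run: drain fired finalizers. */
  for (struct gc_finalizer *fired = gc_pop_finalizable(mut);
       fired;
       fired = gc_pop_finalizable(mut)) {
    void *dead = gc_ref_heap_object(gc_finalizer_object(fired));
    void *data = gc_ref_heap_object(gc_finalizer_closure(fired));
    /* Run embedder-specific cleanup on `dead` using `data`. */
    (void) dead; (void) data;
  }
}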
--- Makefile | 2 +- benchmarks/finalizers-embedder.h | 55 +++++++ benchmarks/finalizers-types.h | 22 +++ benchmarks/finalizers.c | 266 +++++++++++++++++++++++++++++++ src/bdw.c | 6 +- src/pcc.c | 5 +- src/semi.c | 2 +- src/whippet.c | 2 +- 8 files changed, 352 insertions(+), 8 deletions(-) create mode 100644 benchmarks/finalizers-embedder.h create mode 100644 benchmarks/finalizers-types.h create mode 100644 benchmarks/finalizers.c diff --git a/Makefile b/Makefile index c1ba15f43..db5f1a7c2 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -TESTS = quads mt-gcbench ephemerons # MT_GCBench MT_GCBench2 +TESTS = quads mt-gcbench ephemerons finalizers COLLECTORS = \ bdw \ semi \ diff --git a/benchmarks/finalizers-embedder.h b/benchmarks/finalizers-embedder.h new file mode 100644 index 000000000..0dde1ae29 --- /dev/null +++ b/benchmarks/finalizers-embedder.h @@ -0,0 +1,55 @@ +#ifndef FINALIZERS_EMBEDDER_H +#define FINALIZERS_EMBEDDER_H + +#include + +#include "finalizers-types.h" +#include "gc-finalizer.h" + +struct gc_heap; + +#define DEFINE_METHODS(name, Name, NAME) \ + static inline size_t name##_size(Name *obj) GC_ALWAYS_INLINE; \ + static inline void visit_##name##_fields(Name *obj,\ + void (*visit)(struct gc_edge edge, \ + struct gc_heap *heap, \ + void *visit_data), \ + struct gc_heap *heap, \ + void *visit_data) GC_ALWAYS_INLINE; +FOR_EACH_HEAP_OBJECT_KIND(DEFINE_METHODS) +#undef DEFINE_METHODS + +static inline size_t small_object_size(SmallObject *obj) { return sizeof(*obj); } +static inline size_t finalizer_size(Finalizer *obj) { return gc_finalizer_size(); } +static inline size_t pair_size(Pair *obj) { return sizeof(*obj); } + +static inline void +visit_small_object_fields(SmallObject *obj, + void (*visit)(struct gc_edge edge, struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *visit_data) {} + +static inline void +visit_finalizer_fields(Finalizer *finalizer, + void (*visit)(struct gc_edge edge, struct gc_heap *heap, + void *visit_data), + + struct gc_heap *heap, + void *visit_data) { + gc_trace_finalizer((struct gc_finalizer*)finalizer, visit, heap, visit_data); +} + +static inline void +visit_pair_fields(Pair *pair, + void (*visit)(struct gc_edge edge, struct gc_heap *heap, + void *visit_data), + struct gc_heap *heap, + void *visit_data) { + visit(gc_edge(&pair->car), heap, visit_data); + visit(gc_edge(&pair->cdr), heap, visit_data); +} + +#include "simple-gc-embedder.h" + +#endif // FINALIZERS_EMBEDDER_H diff --git a/benchmarks/finalizers-types.h b/benchmarks/finalizers-types.h new file mode 100644 index 000000000..3597ad5d7 --- /dev/null +++ b/benchmarks/finalizers-types.h @@ -0,0 +1,22 @@ +#ifndef FINALIZERS_TYPES_H +#define FINALIZERS_TYPES_H + +#define FOR_EACH_HEAP_OBJECT_KIND(M) \ + M(pair, Pair, PAIR) \ + M(finalizer, Finalizer, FINALIZER) \ + M(small_object, SmallObject, SMALL_OBJECT) + +#include "heap-objects.h" +#include "simple-tagging-scheme.h" + +struct SmallObject { + struct gc_header header; +}; + +struct Pair { + struct gc_header header; + void *car; + void *cdr; +}; + +#endif // FINALIZERS_TYPES_H diff --git a/benchmarks/finalizers.c b/benchmarks/finalizers.c new file mode 100644 index 000000000..434283a53 --- /dev/null +++ b/benchmarks/finalizers.c @@ -0,0 +1,266 @@ +#include +#include +#include +#include +#include +#include + +#include "assert.h" +#include "gc-api.h" +#include "gc-basic-stats.h" +#include "gc-finalizer.h" +#include "simple-roots-api.h" +#include "finalizers-types.h" +#include "simple-allocator.h" + +typedef 
HANDLE_TO(SmallObject) SmallObjectHandle; +typedef HANDLE_TO(struct gc_finalizer) FinalizerHandle; +typedef HANDLE_TO(Pair) PairHandle; + +static SmallObject* allocate_small_object(struct gc_mutator *mut) { + return gc_allocate_with_kind(mut, ALLOC_KIND_SMALL_OBJECT, sizeof(SmallObject)); +} + +static Pair* allocate_pair(struct gc_mutator *mut) { + return gc_allocate_with_kind(mut, ALLOC_KIND_PAIR, sizeof(Pair)); +} + +static struct gc_finalizer* allocate_finalizer(struct gc_mutator *mut) { + struct gc_finalizer *ret = gc_allocate_finalizer(mut); + *tag_word(gc_ref_from_heap_object(ret)) = tag_live(ALLOC_KIND_FINALIZER); + return ret; +} + +/* Get the current time in microseconds */ +static unsigned long current_time(void) +{ + struct timeval t; + if (gettimeofday(&t, NULL) == -1) + return 0; + return t.tv_sec * 1000 * 1000 + t.tv_usec; +} + +struct thread { + struct gc_mutator *mut; + struct gc_mutator_roots roots; +}; + +static void print_elapsed(const char *what, unsigned long start) { + unsigned long end = current_time(); + unsigned long msec = (end - start) / 1000; + unsigned long usec = (end - start) % 1000; + printf("Completed %s in %lu.%.3lu msec\n", what, msec, usec); +} + +struct call_with_gc_data { + void* (*f)(struct thread *); + struct gc_heap *heap; +}; +static void* call_with_gc_inner(struct gc_stack_addr *addr, void *arg) { + struct call_with_gc_data *data = arg; + struct gc_mutator *mut = gc_init_for_thread(addr, data->heap); + struct thread t = { mut, }; + gc_mutator_set_roots(mut, &t.roots); + void *ret = data->f(&t); + gc_finish_for_thread(mut); + return ret; +} +static void* call_with_gc(void* (*f)(struct thread *), + struct gc_heap *heap) { + struct call_with_gc_data data = { f, heap }; + return gc_call_with_stack_addr(call_with_gc_inner, &data); +} + +#define CHECK(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "%s:%d: check failed: %s\n", __FILE__, __LINE__, #x); \ + exit(1); \ + } \ + } while (0) + +#define CHECK_EQ(x, y) CHECK((x) == (y)) +#define CHECK_NE(x, y) CHECK((x) != (y)) +#define CHECK_NULL(x) CHECK_EQ(x, NULL) +#define CHECK_NOT_NULL(x) CHECK_NE(x, NULL) + +static double heap_size; +static double heap_multiplier; +static size_t nthreads; + +static void cause_gc(struct gc_mutator *mut) { + // Doing a full collection lets us reason precisely about liveness. + gc_collect(mut, GC_COLLECTION_MAJOR); +} + +static Pair* make_finalizer_chain(struct thread *t, size_t length) { + PairHandle head = { NULL }; + PairHandle tail = { NULL }; + PUSH_HANDLE(t, head); + PUSH_HANDLE(t, tail); + + for (size_t i = 0; i < length; i++) { + HANDLE_SET(tail, HANDLE_REF(head)); + HANDLE_SET(head, allocate_pair(t->mut)); + HANDLE_REF(head)->car = allocate_small_object(t->mut); + HANDLE_REF(head)->cdr = HANDLE_REF(tail); + struct gc_finalizer *finalizer = allocate_finalizer(t->mut); + gc_finalizer_attach(t->mut, finalizer, 0, + gc_ref_from_heap_object(HANDLE_REF(head)), + gc_ref_from_heap_object(HANDLE_REF(head)->car)); + } + + Pair *ret = HANDLE_REF(head); + POP_HANDLE(t); + POP_HANDLE(t); + return ret; +} + +static void* run_one_test(struct thread *t) { + size_t unit_size = gc_finalizer_size() + sizeof(Pair); + size_t list_length = heap_size / nthreads / heap_multiplier / unit_size; + ssize_t outstanding = list_length; + + printf("Allocating list %zu nodes long. 
Total size %.3fGB.\n", + list_length, list_length * unit_size / 1e9); + + unsigned long thread_start = current_time(); + + PairHandle chain = { NULL }; + PUSH_HANDLE(t, chain); + + HANDLE_SET(chain, make_finalizer_chain(t, list_length)); + cause_gc(t->mut); + + size_t finalized = 0; + for (struct gc_finalizer *f = gc_pop_finalizable(t->mut); + f; + f = gc_pop_finalizable(t->mut)) { + Pair* p = gc_ref_heap_object(gc_finalizer_object(f)); + SmallObject* o = gc_ref_heap_object(gc_finalizer_closure(f)); + CHECK_EQ(p->car, o); + finalized++; + } + printf("thread %p: GC before clear finalized %zu nodes.\n", t, finalized); + outstanding -= finalized; + + HANDLE_SET(chain, NULL); + cause_gc(t->mut); + + finalized = 0; + for (struct gc_finalizer *f = gc_pop_finalizable(t->mut); + f; + f = gc_pop_finalizable(t->mut)) { + Pair* p = gc_ref_heap_object(gc_finalizer_object(f)); + SmallObject* o = gc_ref_heap_object(gc_finalizer_closure(f)); + CHECK_EQ(p->car, o); + finalized++; + } + printf("thread %p: GC after clear finalized %zu nodes.\n", t, finalized); + outstanding -= finalized; + + print_elapsed("thread", thread_start); + + POP_HANDLE(t); + + return (void*)outstanding; +} + +static void* run_one_test_in_thread(void *arg) { + struct gc_heap *heap = arg; + return call_with_gc(run_one_test, heap); +} + +struct join_data { int status; pthread_t thread; }; +static void *join_thread(void *data) { + struct join_data *join_data = data; + void *ret; + join_data->status = pthread_join(join_data->thread, &ret); + return ret; +} + +#define MAX_THREAD_COUNT 256 + +int main(int argc, char *argv[]) { + if (argc < 4 || 5 < argc) { + fprintf(stderr, "usage: %s HEAP_SIZE MULTIPLIER NTHREADS [GC-OPTIONS]\n", argv[0]); + return 1; + } + + heap_size = atof(argv[1]); + heap_multiplier = atof(argv[2]); + nthreads = atol(argv[3]); + + if (heap_size < 8192) { + fprintf(stderr, + "Heap size should probably be at least 8192, right? '%s'\n", + argv[1]); + return 1; + } + if (!(1.0 < heap_multiplier && heap_multiplier < 100)) { + fprintf(stderr, "Failed to parse heap multiplier '%s'\n", argv[2]); + return 1; + } + if (nthreads < 1 || nthreads > MAX_THREAD_COUNT) { + fprintf(stderr, "Expected integer between 1 and %d for thread count, got '%s'\n", + (int)MAX_THREAD_COUNT, argv[2]); + return 1; + } + + printf("Allocating heap of %.3fGB (%.2f multiplier of live data).\n", + heap_size / 1e9, heap_multiplier); + + struct gc_options *options = gc_allocate_options(); + gc_options_set_int(options, GC_OPTION_HEAP_SIZE_POLICY, GC_HEAP_SIZE_FIXED); + gc_options_set_size(options, GC_OPTION_HEAP_SIZE, heap_size); + if (argc == 5) { + if (!gc_options_parse_and_set_many(options, argv[4])) { + fprintf(stderr, "Failed to set GC options: '%s'\n", argv[4]); + return 1; + } + } + + struct gc_heap *heap; + struct gc_mutator *mut; + struct gc_basic_stats stats; + if (!gc_init(options, NULL, &heap, &mut, GC_BASIC_STATS, &stats)) { + fprintf(stderr, "Failed to initialize GC with heap size %zu bytes\n", + (size_t)heap_size); + return 1; + } + struct thread main_thread = { mut, }; + gc_mutator_set_roots(mut, &main_thread.roots); + + pthread_t threads[MAX_THREAD_COUNT]; + // Run one of the threads in the main thread. 
+ for (size_t i = 1; i < nthreads; i++) { + int status = pthread_create(&threads[i], NULL, run_one_test_in_thread, heap); + if (status) { + errno = status; + perror("Failed to create thread"); + return 1; + } + } + ssize_t outstanding = (size_t)run_one_test(&main_thread); + for (size_t i = 1; i < nthreads; i++) { + struct join_data data = { 0, threads[i] }; + void *ret = gc_call_without_gc(mut, join_thread, &data); + if (data.status) { + errno = data.status; + perror("Failed to join thread"); + return 1; + } + ssize_t thread_outstanding = (ssize_t)ret; + outstanding += thread_outstanding; + } + + if (outstanding) + printf("\n\nWARNING: %zd nodes outstanding!!!\n\n", outstanding); + + gc_basic_stats_finish(&stats); + fputs("\n", stdout); + gc_basic_stats_print(&stats, stdout); + + return 0; +} + diff --git a/src/bdw.c b/src/bdw.c index e9b7cb9f5..8eae3b4ee 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -219,14 +219,14 @@ void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, void *prev_data = NULL; gc_finalizer_init_internal(finalizer, object, closure); gc_finalizer_externally_activated(finalizer); - GC_REGISTER_FINALIZER_NO_ORDER (gc_ref_heap_object(object), finalize_object, - finalizer, &prev, &prev_data); + GC_REGISTER_FINALIZER_NO_ORDER(gc_ref_heap_object(object), finalize_object, + finalizer, &prev, &prev_data); // FIXME: Allow multiple finalizers per object. GC_ASSERT(prev == NULL); GC_ASSERT(prev_data == NULL); } -struct gc_finalizer* gc_finalizer_pop(struct gc_mutator *mut) { +struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut) { GC_invoke_finalizers(); return gc_finalizer_state_pop(mut->heap->finalizer_state); } diff --git a/src/pcc.c b/src/pcc.c index fa7342e4d..abee8ce49 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -592,7 +592,7 @@ static void request_mutators_to_stop(struct gc_heap *heap) { static void allow_mutators_to_continue(struct gc_heap *heap) { GC_ASSERT(mutators_are_stopping(heap)); GC_ASSERT(all_mutators_stopped(heap)); - heap->paused_mutator_count = 0; + heap->paused_mutator_count--; atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); GC_ASSERT(!mutators_are_stopping(heap)); pthread_cond_broadcast(&heap->mutator_cond); @@ -683,6 +683,7 @@ pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { do { pthread_cond_wait(&heap->mutator_cond, &heap->lock); } while (mutators_are_stopping(heap)); + heap->paused_mutator_count--; MUTATOR_EVENT(mut, mutator_restarted); } @@ -929,7 +930,7 @@ void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, // No write barrier. } -struct gc_finalizer* gc_finalizer_pop(struct gc_mutator *mut) { +struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut) { return gc_finalizer_state_pop(mutator_heap(mut)->finalizer_state); } diff --git a/src/semi.c b/src/semi.c index 739a21d75..0ed954727 100644 --- a/src/semi.c +++ b/src/semi.c @@ -526,7 +526,7 @@ void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, // No write barrier. } -struct gc_finalizer* gc_finalizer_pop(struct gc_mutator *mut) { +struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut) { return gc_finalizer_state_pop(mutator_heap(mut)->finalizer_state); } diff --git a/src/whippet.c b/src/whippet.c index 3771babde..1f3edda2b 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -2380,7 +2380,7 @@ void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, // No write barrier. 
} -struct gc_finalizer* gc_finalizer_pop(struct gc_mutator *mut) { +struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut) { return gc_finalizer_state_pop(mutator_heap(mut)->finalizer_state); } From c7309a9657792d424631b745580bc99e056b6adc Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 24 Jul 2024 09:54:15 +0200 Subject: [PATCH 245/403] Ensure fired finalizers are traced --- src/gc-finalizer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gc-finalizer.c b/src/gc-finalizer.c index ed9c39173..c0e5831bf 100644 --- a/src/gc-finalizer.c +++ b/src/gc-finalizer.c @@ -233,6 +233,7 @@ size_t gc_visit_finalizer_roots(struct gc_finalizer_state *state, visit(gc_edge(&table->buckets[bidx]), heap, visit_data); } } + visit(gc_edge(&state->fired), heap, visit_data); return count; } From 3d4d4d047c8edd73960d49239ff269792d7ada7d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 24 Jul 2024 10:01:16 +0200 Subject: [PATCH 246/403] Fix BDW finalization; ensure pending set is marked --- src/bdw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bdw.c b/src/bdw.c index 8eae3b4ee..a69901f7f 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -219,7 +219,7 @@ void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, void *prev_data = NULL; gc_finalizer_init_internal(finalizer, object, closure); gc_finalizer_externally_activated(finalizer); - GC_REGISTER_FINALIZER_NO_ORDER(gc_ref_heap_object(object), finalize_object, + GC_register_finalizer_no_order(gc_ref_heap_object(object), finalize_object, finalizer, &prev, &prev_data); // FIXME: Allow multiple finalizers per object. GC_ASSERT(prev == NULL); @@ -315,6 +315,8 @@ mark_heap(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, if (heap->roots) gc_trace_heap_roots(heap->roots, bdw_mark_edge, heap, &state); + gc_visit_finalizer_roots(heap->finalizer_state, bdw_mark_edge, heap, &state); + state.mark_stack_ptr = GC_MARK_AND_PUSH (heap->mutators, state.mark_stack_ptr, state.mark_stack_limit, From ab5071f97a55b83d14bc9cfdfe7487a1b49dc1b2 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 27 Jul 2024 22:26:24 +0200 Subject: [PATCH 247/403] Rework docs --- README.md | 12 +-- doc/README.md | 24 ++++++ doc/collector-bdw.md | 20 +++++ doc/collector-pcc.md | 69 +++++++++++++++++ doc/collector-semi.md | 23 ++++++ doc/collector-whippet.md | 158 +++++++++++++++++++++++++++++++++++++++ doc/collectors.md | 41 ++++++++++ doc/design.md | 64 ---------------- 8 files changed, 336 insertions(+), 75 deletions(-) create mode 100644 doc/README.md create mode 100644 doc/collector-bdw.md create mode 100644 doc/collector-pcc.md create mode 100644 doc/collector-semi.md create mode 100644 doc/collector-whippet.md create mode 100644 doc/collectors.md delete mode 100644 doc/design.md diff --git a/README.md b/README.md index e62b0661a..c922af64d 100644 --- a/README.md +++ b/README.md @@ -8,19 +8,9 @@ Whippet is an embed-only C library, designed to be copied into a program's source tree. It exposes an abstract C API for managed memory allocation, and provides a number of implementations of that API. -One of the implementations is also called "whippet", and is the -motivation for creating this library. For a detailed introduction, see -[Whippet: Towards a new local -maximum](https://wingolog.org/archives/2023/02/07/whippet-towards-a-new-local-maximum), -a talk given at FOSDEM 2023. - ## Documentation - * [Design](./doc/design.md): What's the general idea? - * [Manual](./doc/manual.md): How do you get your program to use - Whippet? 
What is the API? - * [Guile](./doc/guile.md): Some notes on a potential rebase of Guile on - top of Whippet. +See the [documentation](./doc/README.md). ## Source repository structure diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 000000000..fc5348ddb --- /dev/null +++ b/doc/README.md @@ -0,0 +1,24 @@ +# Whippet documentation + + * [Manual](./manual.md): How do you get your program to use + Whippet? What is the API? + + * [Collector implementations](./collectors.md): There are a number of + implementations of the Whippet API with differing performance + characteristics and which impose different requirements on the + embedder. + - [Semi-space collector (semi)](./collector-semi.md): For + single-threaded embedders who are not too tight on memory. + - [Parallel copying collector (pcc)](./collector-pcc.md): Like semi, + but with support for multiple mutator threads. Faster than semi if + multiple cores are available at collection-time. + - [Whippet collector (whippet)](./collector-whippet.md): + Immix-inspired collector. Optionally parallel, conservative (stack + and/or heap), and/or generational. + - [Boehm-Demers-Weiser collector (bdw)](./collector-bdw.md): + Conservative mark-sweep collector, implemented by + Boehm-Demers-Weiser library. + + * [Guile](./doc/guile.md): Some notes on a potential rebase of Guile on + top of Whippet. + diff --git a/doc/collector-bdw.md b/doc/collector-bdw.md new file mode 100644 index 000000000..b86a9d3b1 --- /dev/null +++ b/doc/collector-bdw.md @@ -0,0 +1,20 @@ +# Boehm-Demers-Weiser collector + +Whippet's `bdw` collector is backed by a third-party garbage collector, +the [Boehm-Demers-Weiser collector](https://github.com/ivmai/bdwgc). + +BDW-GC is a mark-sweep collector with conservative root-finding, +conservative heap tracing, and parallel tracing. + +Whereas the other Whippet collectors which rely on mutators to +[periodically check if they need to +stop](https://github.com/wingo/whippet/blob/main/doc/manual.md#safepoints), +`bdw` will stop mutators with a POSIX signal. Also, it doesn't really +support ephemerons (the Whippet `bdw` collector simulates them using +finalizers), and both ephemerons and finalizers only approximate the +Whippet behavior, because they are implemented in terms of what BDW-GC +provides. + +It's a bit of an oddball from a Whippet perspective, but useful as a +migration path if you have an embedder that is already using BDW-GC. +And, it is a useful performance comparison. diff --git a/doc/collector-pcc.md b/doc/collector-pcc.md new file mode 100644 index 000000000..a55b0fa4f --- /dev/null +++ b/doc/collector-pcc.md @@ -0,0 +1,69 @@ +# Parallel copying collector + +Whippet's `pcc` collector is a copying collector, like the more simple +[`semi`](./collector-semi.md), but supporting multiple mutator threads +and multiple tracing threads. + +Like `semi`, `pcc` traces by evacuation: it moves all live objects on +every collection. (Exception: objects larger than 8192 bytes are +placed into a partitioned space traces by marking in place instead of +copying.) Evacuation requires precise roots, so if your embedder does +not support precise roots, `pcc` is not for you. + +Again like `semi`, `pcc` generally requires a heap size at least twice +as large as the maximum live heap size, and performs best with ample +heap sizes; between 3× and 5× is best. + +## Implementation notes + +Unlike `semi` which has a single global bump-pointer allocation region, +`pcc` structures the heap into 64-kB blocks. 
In this way it supports +multiple mutator threads: mutators do local bump-pointer allocation into +their own block, and when their block is full, they fetch another from +the global store. + +The block size is 64 kB, but really it's 128 kB, because each block has +two halves: the active region and the copy reserve. Dividing each block +in two allows the collector to easily grow and shrink the heap while +ensuring there is always enough reserve space. + +Blocks are allocated in 64-MB aligned slabs, so there are 512 blocks in +a slab. The first block in a slab is used by the collector itself, to +keep metadata for the rest of the blocks, for example a chain pointer +allowing blocks to be collected in lists, a saved allocation pointer for +partially-filled blocks, whether the block is paged in or out, and so +on. + +`pcc` supports tracing in parallel. This mechanism works somewhat like +allocation, in which multiple trace workers compete to evacuate objects +into their local allocation buffers; when an allocation buffer is full, +the trace worker grabs another, just like mutators do. + +However, unlike the simple semi-space collector which uses a Cheney grey +worklist, `pcc` uses the [fine-grained work-stealing parallel +tracer](../src/parallel-tracer.h) originally developed for [Whippet's +Immix-like collector](./collector-whippet.md). Each trace worker +maintains a [local queue of objects that need +tracing](../src/local-worklist.h), which currently has 1024 entries. If +the local queue becomes full, the worker will publish 3/4 of those +entries to the worker's [shared worklist](../src/shared-worklist.h). +When a worker runs out of local work, it will first try to remove work +from its own shared worklist, then will try to steal from other workers. + +Because threads compete to evacuate objects, `pcc` uses [atomic +compare-and-swap instead of simple forwarding pointer +updates](./manual.md#forwarding-objects), which imposes around a ~30% +performance penalty. `pcc` generally starts to outperform `semi` when +it can trace with 2 threads, and gets better with each additional +thread. + +I sometimes wonder whether the worklist should contain grey edges or +grey objects. [MMTk](https://www.mmtk.io/) seems to do the former, and bundles edges into work +packets, which are the unit of work sharing. I don't know yet what is +best and look forward to experimenting once I have better benchmarks. + +The memory used for the external worklist is dynamically allocated from +the OS and is not currently counted as contributing to the heap size. +If you are targetting a microcontroller or something, probably you need +to choose a different kind of collector that never dynamically +allocates, such as `semi`. diff --git a/doc/collector-semi.md b/doc/collector-semi.md new file mode 100644 index 000000000..1900d2d27 --- /dev/null +++ b/doc/collector-semi.md @@ -0,0 +1,23 @@ +# Semi-space collector + +The `semi` collector is simple. It is mostly useful as a first +collector to try out, to make sure that a mutator correctly records all +roots: because `semi` moves every live object on every collection, it is +very effective at shaking out mutator bugs. + +If your embedder chooses to not precisely record roots, for example +instead choosing to conservatively scan the stack, then the semi-space +collector is not for you: `semi` requires precise roots. + +For more on semi-space collectors, see +https://wingolog.org/archives/2022/12/10/a-simple-semi-space-collector. 
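For readers who do not want to follow the link, the heart of such a
collector fits in a few lines.  The following sketch is not part of the
patch and `trace_roots`/`scan_fields` are hypothetical stand-ins, but the
loop mirrors the `trace_closure` invariant in `src/semi.c` above.

#include <stdint.h>

struct semispace { uintptr_t hp; uintptr_t limit; };

/* Hypothetical stand-ins for the collector internals. */
extern void trace_roots(struct semispace *to);          /* evacuates root referents, bumps hp */
extern uintptr_t scan_fields(struct semispace *to,
                             uintptr_t obj);            /* evacuates referents, returns end of obj */

static void cheney_trace(struct semispace *to) {
  uintptr_t grey = to->hp;   /* boundary between scanned and unscanned copies */
  trace_roots(to);           /* copying roots advances to->hp */
  /* Scanning a copied object may evacuate more objects, bumping hp, so the
     grey pointer chases the allocation pointer until the two meet. */
  while (grey < to->hp)
    grey = scan_fields(to, grey);
}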
+ +Whippet's `semi` collector incorporates a large-object space, which +marks objects in place instead of moving. Otherwise, `semi` generally +requires a heap size at least twice as large as the maximum live heap +size, and performs best with ample heap sizes; between 3× and 5× is +best. + +The semi-space collector doesn't support multiple mutator threads. If +you want a whole-heap copying collector for a multi-threaded mutator, +look at [pcc](./collector-pcc.md). diff --git a/doc/collector-whippet.md b/doc/collector-whippet.md new file mode 100644 index 000000000..2975035d5 --- /dev/null +++ b/doc/collector-whippet.md @@ -0,0 +1,158 @@ +# Whippet collector + +One collector implementation in the Whippet garbage collection library +is also called Whippet. Naming-wise this is a somewhat confusing +situation; perhaps it will change. + +Anyway, the `whippet` collector is mainly a mark-region collector, +inspired by +[Immix](http://users.cecs.anu.edu.au/~steveb/pubs/papers/immix-pldi-2008.pdf). +To a first approximation, Whippet is a whole-heap Immix collector with a +large object space on the side. + +When tracing, `whippet` mostly marks objects in place. If the heap is +too fragmented, it can compact the heap by choosing to evacuate +sparsely-populated heap blocks instead of marking in place. However +evacuation is strictly optional, which means that `whippet` is also +compatible with conservative root-finding, making it a good replacement +for embedders that currently use the [Boehm-Demers-Weiser +collector](./collector-bdw.md). + +## Differences from Immix + +The original Immix divides the heap into 32kB blocks, and then divides +those blocks into 128B lines. An Immix allocation can span lines but +not blocks; allocations larger than 8kB go into a separate large object +space. Mutators request blocks from the global store and allocate into +those blocks using bump-pointer allocation. When all blocks are +consumed, Immix stops the world and traces the object graph, marking +objects but also the lines that objects are on. After marking, blocks +contain some lines with live objects and others that are completely +free. Spans of free lines are called holes. When a mutator gets a +recycled block from the global block store, it allocates into those +holes. For an exposition of Immix, see the lovely detailed [Rust +implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf). + +The essential difference of `whippet` from Immix stems from a simple +observation: Immix needs a side table of line mark bytes and also a mark +bit or bits in each object (or in a side table). But if instead you +choose to store mark bytes instead of bits (for concurrency reasons) in +a side table, with one mark byte per granule (unit of allocation, +perhaps 16 bytes), then you effectively have a line mark table where the +granule size is the line size. You can bump-pointer allocate into holes +in the mark byte table. + +You might think this is a bad tradeoff, and perhaps it is: I don't know +yet. If your granule size is two pointers, then one mark byte per +granule is 6.25% overhead on 64-bit, or 12.5% on 32-bit. Especially on +32-bit, it's a lot! On the other hand, instead of the worst case of one +survivor object wasting a line (or two, in the case of conservative line +marking), granule-size-is-line-size instead wastes nothing. 
Also, you +don't need GC bits in the object itself, and you can use the mark byte +array to record the object end, so that finding holes in a block can +just read the mark table and can avoid looking at object memory. + +## Optional features + +The `whippet` collector has a few feature flags that can be turned on or +off. If you use the [standard embedder makefile include](../embed.mk), +then there is a name for each combination of features: `whippet` has no +additional features, `parallel-whippet` enables parallel marking, +`parallel-generational-whippet` enables parallelism, +`stack-conservative-parallel-generational-whippet` uses conservative +root-finding, and `heap-conservative-parallel-generational-whippet` +additionally traces the heap conservatively. You can leave off +components of the name to get a collector without those features. +Underneath this corresponds to some pre-processor definitions passed to +the compiler on the command line. + +### Generations + +Whippet supports generational tracing via the [sticky mark-bit +algorithm](https://wingolog.org/archives/2022/10/22/the-sticky-mark-bit-algorithm). +This requires that the embedder emit [write +barriers](https://github.com/wingo/whippet/blob/main/doc/manual.md#write-barriers); +if your embedder cannot ensure write barriers are always invoked, then +generational collection is not for you. (We could perhaps relax this a +bit, following what [Ruby developers +did](http://rvm.jp/~ko1/activities/rgengc_ismm.pdf).) + +The write barrier is currently a card-marking barrier emitted on stores, +with one card byte per 256 object bytes, where the card location can be +computed from the object address because blocks are allocated in +two-megabyte aligned slabs. + +### Parallel tracing + +You almost certainly want this on! `parallel-whippet` uses a the +[fine-grained work-stealing parallel tracer](../src/parallel-tracer.h). +Each trace worker maintains a [local queue of objects that need +tracing](../src/local-worklist.h), which currently has a capacity of +1024 entries. If the local queue becomes full, the worker will publish +3/4 of those entries to the worker's [shared +worklist](../src/shared-worklist.h). When a worker runs out of local +work, it will first try to remove work from its own shared worklist, +then will try to steal from other workers. + +The memory used for the external worklist is dynamically allocated from +the OS and is not currently counted as contributing to the heap size. +If you absolutely need to avoid dynamic allocation during GC, `whippet` +(even serial whippet) would need some work for your use case, to +allocate a fixed-size space for a marking queue and to gracefully handle +mark queue overflow. + +### Conservative stack scanning + +With `semi` and `pcc`, embedders must precisely enumerate the set of +*roots*: the edges into the heap from outside. Commonly, roots include +global variables, as well as working variables from each mutator's +stack. Whippet can optionally mark mutator stacks *conservatively*: +treating each word on the stack as if it may be an object reference, and +marking any object at that address. + +After all these years, *whether* to mark stacks conservatively or not is +still an open research question. Conservative stack scanning retain too +much data if an integer is confused for an object reference and removes +a layer of correctness-by-construction from a system. Sometimes it is +required, for example if your embedder cannot enumerate roots precisely. 
+But there are reasons to consider it even if you can do precise roots: +it removes the need for the compiler to produce a stack map to store the +precise root enumeration at every safepoint; it removes the need to look +up a stack map when tracing; and it allows C or C++ support code to +avoid having to place roots in traceable locations published to the +garbage collector. And the [performance question is still +open](https://dl.acm.org/doi/10.1145/2660193.2660198). + +Anyway. Whippet can scan roots conservatively. Those roots are pinned +for the collection; even if the collection will compact via evacuation, +referents of conservative roots won't be moved. Objects not directly +referenced by roots can be evacuated, however. + +### Conservative heap scanning + +In addition to stack and global references, the Boehm-Demers-Weiser +collector scans heap objects conservatively as well, treating each word +of each heap object as if it were a reference. Whippet can do that, if +the embedder is unable to provide a `gc_trace_object` implementation. +However this is generally a performance lose, and it prevents +evacuation. + +## Other implementation tidbits + +`whippet` does lazy sweeping: as a mutator grabs a fresh block, it +reclaims memory that was unmarked in the previous collection before +making the memory available for allocation. This makes sweeping +naturally cache-friendly and parallel. + +The mark byte array facilitates conservative collection by being an +oracle for "does this address start an object". + +There is some support for concurrent marking by having three mark bit +states (dead, survivor, marked) that rotate at each collection; some +collector configurations can have mutators mark before waiting for other +mutators to stop. True concurrent marking and associated barriers +are not yet implemented. + +For a detailed introduction, see [Whippet: Towards a new local +maximum](https://wingolog.org/archives/2023/02/07/whippet-towards-a-new-local-maximum), +a talk given at FOSDEM 2023. diff --git a/doc/collectors.md b/doc/collectors.md new file mode 100644 index 000000000..77e4206ec --- /dev/null +++ b/doc/collectors.md @@ -0,0 +1,41 @@ +# Whippet collectors + +Whippet has four collectors currently: + - [Semi-space collector (semi)](./collector-semi.md): For + single-threaded embedders who are not too tight on memory. + - [Parallel copying collector (pcc)](./collector-pcc.md): Like semi, + but with support for multiple mutator threads. Faster than semi if + multiple cores are available at collection-time. + - [Whippet collector (whippet)](./collector-whippet.md): + Immix-inspired collector. Optionally parallel, conservative (stack + and/or heap), and/or generational. + - [Boehm-Demers-Weiser collector (bdw)](./collector-bdw.md): + Conservative mark-sweep collector, implemented by + Boehm-Demers-Weiser library. + +## How to choose? + +If you are migrating an embedder off BDW-GC, then it could be reasonable +to first go to `bdw`, then `stack-conservative-parallel-whippet`. + +If you have an embedder with precise roots, use `semi` if +single-threaded, or `pcc` if multi-threaded. That will shake out +mutator/embedder bugs. Then if memory is tight, switch to +`parallel-whippet`, possibly `parallel-generational-whippet`. + +If you are writing a new project, you have a choice as to whether to pay +the development cost of precise roots or not. If choose to not have +precise roots, then go for `stack-conservative-parallel-whippet` +directly. 
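One practical consequence of choosing a generational configuration is the
write-barrier obligation described in the whippet collector document above.
The sketch below (not the library's actual barrier; the card-table lookup
and all names are assumptions for illustration) shows the kind of address
arithmetic implied by a card-marking barrier with one card byte per 256
object bytes and 2 MB-aligned slabs.

#include <stddef.h>
#include <stdint.h>

#define SLAB_ALIGNMENT (2 * 1024 * 1024)  /* slabs are 2 MB aligned, per the text */
#define CARD_SIZE 256                     /* one card byte per 256 object bytes */

/* Hypothetical lookup: the real collector keeps card bytes in slab
   metadata; exactly where they live is not specified here. */
extern uint8_t* card_table_for_slab(uintptr_t slab_base);

static inline void remember_write(void *obj, size_t field_offset) {
  uintptr_t addr = (uintptr_t)obj + field_offset;
  uintptr_t slab = addr & ~(uintptr_t)(SLAB_ALIGNMENT - 1);  /* round down to slab base */
  size_t card = (addr - slab) / CARD_SIZE;
  card_table_for_slab(slab)[card] = 1;  /* dirty: this card is rescanned at minor GC */
}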
+ +## More collectors + +It would be nice to have a classic generational GC, perhaps using +parallel-whippet for the old generation but a pcc-style copying nursery. + +Support for concurrent marking in `whippet` would be good as well, +perhaps with a SATB barrier. (Or, if you are the sort of person to bet +on conservative stack scanning, perhaps a retreating-wavefront barrier +would be more appropriate.) + +Contributions are welcome, provided they have no more dependencies! diff --git a/doc/design.md b/doc/design.md deleted file mode 100644 index 1a1c69bee..000000000 --- a/doc/design.md +++ /dev/null @@ -1,64 +0,0 @@ -# Design - -Whippet is mainly a mark-region collector, like -[Immix](http://users.cecs.anu.edu.au/~steveb/pubs/papers/immix-pldi-2008.pdf). -See also the lovely detailed [Rust -implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf). - -To a first approximation, Whippet is a whole-heap Immix collector with a -large object space on the side. See the Immix paper for full details, -but basically Immix divides the heap into 32kB blocks, and then divides -those blocks into 128B lines. An Immix allocation never spans blocks; -allocations larger than 8kB go into a separate large object space. -Mutators request blocks from the global store and allocate into those -blocks using bump-pointer allocation. When all blocks are consumed, -Immix stops the world and traces the object graph, marking objects but -also the lines that objects are on. After marking, blocks contain some -lines with live objects and others that are completely free. Spans of -free lines are called holes. When a mutator gets a recycled block from -the global block store, it allocates into those holes. Also, sometimes -Immix can choose to evacuate rather than mark. Bump-pointer-into-holes -allocation is quite compatible with conservative roots, so it's an -interesting option for Guile, which has a lot of legacy C API users. - -The essential difference of Whippet from Immix stems from a simple -observation: Immix needs a side table of line mark bytes and also a mark -bit or bits in each object (or in a side table). But if instead you -choose to store mark bytes instead of bits (for concurrency reasons) in -a side table, with one mark byte per granule (unit of allocation, -perhaps 16 bytes), then you effectively have a line mark table where the -granule size is the line size. You can bump-pointer allocate into holes -in the mark byte table. - -You might think this is a bad tradeoff, and perhaps it is: I don't know -yet. If your granule size is two pointers, then one mark byte per -granule is 6.25% overhead on 64-bit, or 12.5% on 32-bit. Especially on -32-bit, it's a lot! On the other hand, instead of the worst case of one -survivor object wasting a line (or two, in the case of conservative line -marking), granule-size-is-line-size instead wastes nothing. Also, you -don't need GC bits in the object itself, and you can use the mark byte -array to record the object end, so that finding holes in a block can -just read the mark table and can avoid looking at object memory. 
- -Other ideas in Whippet: - - * Minimize stop-the-world phase via parallel marking and punting all - sweeping to mutators - - * Enable mutator parallelism via lock-free block acquisition and lazy - statistics collation - - * Allocate block space using aligned 4 MB slabs, with embedded metadata - to allow metadata bytes, slab headers, and block metadata to be - located via address arithmetic - - * Facilitate conservative collection via mark byte array, oracle for - "does this address start an object" - - * Enable in-place generational collection via card table with one entry - per 256B or so - - * Enable concurrent marking by having three mark bit states (dead, - survivor, marked) that rotate at each collection, and sweeping a - block clears metadata for dead objects; but concurrent marking and - associated SATB barrier not yet implemented From cc6b1c1fb590454f271550fb1ddb92cf0de6cb20 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 27 Jul 2024 22:28:55 +0200 Subject: [PATCH 248/403] typos --- doc/collector-pcc.md | 6 +++--- doc/collectors.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/collector-pcc.md b/doc/collector-pcc.md index a55b0fa4f..61af6c4a5 100644 --- a/doc/collector-pcc.md +++ b/doc/collector-pcc.md @@ -6,9 +6,9 @@ and multiple tracing threads. Like `semi`, `pcc` traces by evacuation: it moves all live objects on every collection. (Exception: objects larger than 8192 bytes are -placed into a partitioned space traces by marking in place instead of -copying.) Evacuation requires precise roots, so if your embedder does -not support precise roots, `pcc` is not for you. +placed into a partitioned space which traces by marking in place instead +of copying.) Evacuation requires precise roots, so if your embedder +does not support precise roots, `pcc` is not for you. Again like `semi`, `pcc` generally requires a heap size at least twice as large as the maximum live heap size, and performs best with ample diff --git a/doc/collectors.md b/doc/collectors.md index 77e4206ec..81af46e59 100644 --- a/doc/collectors.md +++ b/doc/collectors.md @@ -24,7 +24,7 @@ mutator/embedder bugs. Then if memory is tight, switch to `parallel-whippet`, possibly `parallel-generational-whippet`. If you are writing a new project, you have a choice as to whether to pay -the development cost of precise roots or not. If choose to not have +the development cost of precise roots or not. If you choose to not have precise roots, then go for `stack-conservative-parallel-whippet` directly. From 9b2677a57bf6986f05163debe7c85e19b5183d20 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 27 Jul 2024 22:30:42 +0200 Subject: [PATCH 249/403] Docs docs --- doc/collector-pcc.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/doc/collector-pcc.md b/doc/collector-pcc.md index 61af6c4a5..a20e58e64 100644 --- a/doc/collector-pcc.md +++ b/doc/collector-pcc.md @@ -57,11 +57,6 @@ performance penalty. `pcc` generally starts to outperform `semi` when it can trace with 2 threads, and gets better with each additional thread. -I sometimes wonder whether the worklist should contain grey edges or -grey objects. [MMTk](https://www.mmtk.io/) seems to do the former, and bundles edges into work -packets, which are the unit of work sharing. I don't know yet what is -best and look forward to experimenting once I have better benchmarks. - The memory used for the external worklist is dynamically allocated from the OS and is not currently counted as contributing to the heap size. 
If you are targetting a microcontroller or something, probably you need From 3ce089972973e88c863fd9a6ce13102bdd4ea835 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 27 Jul 2024 22:32:08 +0200 Subject: [PATCH 250/403] Docs docs --- doc/collector-whippet.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/collector-whippet.md b/doc/collector-whippet.md index 2975035d5..acd09abd7 100644 --- a/doc/collector-whippet.md +++ b/doc/collector-whippet.md @@ -58,7 +58,7 @@ The `whippet` collector has a few feature flags that can be turned on or off. If you use the [standard embedder makefile include](../embed.mk), then there is a name for each combination of features: `whippet` has no additional features, `parallel-whippet` enables parallel marking, -`parallel-generational-whippet` enables parallelism, +`parallel-generational-whippet` enables generations, `stack-conservative-parallel-generational-whippet` uses conservative root-finding, and `heap-conservative-parallel-generational-whippet` additionally traces the heap conservatively. You can leave off From 50e90a026a48ab5a1abc17587a4ea8f7caab4d99 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 27 Jul 2024 22:33:56 +0200 Subject: [PATCH 251/403] Docs docs --- doc/collector-whippet.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/collector-whippet.md b/doc/collector-whippet.md index acd09abd7..2f0c17466 100644 --- a/doc/collector-whippet.md +++ b/doc/collector-whippet.md @@ -111,17 +111,17 @@ treating each word on the stack as if it may be an object reference, and marking any object at that address. After all these years, *whether* to mark stacks conservatively or not is -still an open research question. Conservative stack scanning retain too -much data if an integer is confused for an object reference and removes -a layer of correctness-by-construction from a system. Sometimes it is -required, for example if your embedder cannot enumerate roots precisely. -But there are reasons to consider it even if you can do precise roots: -it removes the need for the compiler to produce a stack map to store the -precise root enumeration at every safepoint; it removes the need to look -up a stack map when tracing; and it allows C or C++ support code to -avoid having to place roots in traceable locations published to the -garbage collector. And the [performance question is still -open](https://dl.acm.org/doi/10.1145/2660193.2660198). +still an open research question. Conservative stack scanning can retain +too much data if an integer is confused for an object reference and +removes a layer of correctness-by-construction from a system. Sometimes +it is required, for example if your embedder cannot enumerate roots +precisely. But there are reasons to consider it even if you can do +precise roots: it removes the need for the compiler to produce a stack +map to store the precise root enumeration at every safepoint; it removes +the need to look up a stack map when tracing; and it allows C or C++ +support code to avoid having to place roots in traceable locations +published to the garbage collector. And the [performance question is +still open](https://dl.acm.org/doi/10.1145/2660193.2660198). Anyway. Whippet can scan roots conservatively. 
Those roots are pinned for the collection; even if the collection will compact via evacuation, From 22f5b443309300d0d76ade07433cc5286c91dc5e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 27 Jul 2024 22:34:43 +0200 Subject: [PATCH 252/403] Docs docs --- doc/collector-whippet.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/collector-whippet.md b/doc/collector-whippet.md index 2f0c17466..23fb2a1bb 100644 --- a/doc/collector-whippet.md +++ b/doc/collector-whippet.md @@ -114,14 +114,15 @@ After all these years, *whether* to mark stacks conservatively or not is still an open research question. Conservative stack scanning can retain too much data if an integer is confused for an object reference and removes a layer of correctness-by-construction from a system. Sometimes -it is required, for example if your embedder cannot enumerate roots -precisely. But there are reasons to consider it even if you can do -precise roots: it removes the need for the compiler to produce a stack -map to store the precise root enumeration at every safepoint; it removes -the need to look up a stack map when tracing; and it allows C or C++ -support code to avoid having to place roots in traceable locations -published to the garbage collector. And the [performance question is -still open](https://dl.acm.org/doi/10.1145/2660193.2660198). +conservative stack-scanning is required, for example if your embedder +cannot enumerate roots precisely. But there are reasons to consider it +even if you can do precise roots: conservative scanning removes the need +for the compiler to produce a stack map to store the precise root +enumeration at every safepoint; it removes the need to look up a stack +map when tracing; and it allows C or C++ support code to avoid having to +place roots in traceable locations published to the garbage collector. +And the [performance question is still +open](https://dl.acm.org/doi/10.1145/2660193.2660198). Anyway. Whippet can scan roots conservatively. Those roots are pinned for the collection; even if the collection will compact via evacuation, From 6c5cdd73c93b4b47ad2d81cbe709c22f56f030e4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 29 Jul 2024 11:21:17 +0200 Subject: [PATCH 253/403] refactor pcc. no functional change --- src/pcc.c | 337 +++++++++++++++++++++++++++--------------------------- 1 file changed, 170 insertions(+), 167 deletions(-) diff --git a/src/pcc.c b/src/pcc.c index abee8ce49..1638abf85 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -52,7 +52,7 @@ struct pcc_block { struct { struct pcc_block *next; uint8_t in_core; - size_t allocated; // For partially-allocated blocks. + size_t allocated; // For partly-empty blocks. }; uint8_t padding[HEADER_BYTES_PER_BLOCK]; }; @@ -101,10 +101,10 @@ struct pcc_extent { }; struct pcc_space { - struct pcc_block *available; - struct pcc_block *partially_allocated; - struct pcc_block *allocated ALIGNED_TO_AVOID_FALSE_SHARING; - size_t allocated_block_count; + struct pcc_block *empty; + struct pcc_block *partly_full; + struct pcc_block *full ALIGNED_TO_AVOID_FALSE_SHARING; + size_t full_block_count; struct pcc_block *paged_out ALIGNED_TO_AVOID_FALSE_SHARING; size_t fragmentation ALIGNED_TO_AVOID_FALSE_SHARING; ssize_t bytes_to_page_out ALIGNED_TO_AVOID_FALSE_SHARING; @@ -150,10 +150,14 @@ struct gc_heap { #define MUTATOR_EVENT(mut, event, ...) 
\ (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) -struct gc_mutator { +struct gc_allocator { uintptr_t hp; uintptr_t limit; struct pcc_block *block; +}; + +struct gc_mutator { + struct gc_allocator allocator; struct gc_heap *heap; struct gc_mutator_roots *roots; void *event_listener_data; @@ -162,9 +166,7 @@ struct gc_mutator { }; struct gc_trace_worker_data { - uintptr_t hp; - uintptr_t limit; - struct pcc_block *block; + struct gc_allocator allocator; }; static inline struct pcc_space* heap_pcc_space(struct gc_heap *heap) { @@ -180,15 +182,6 @@ static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } -static inline void pcc_space_compute_region(struct pcc_space *space, - struct pcc_block *block, - uintptr_t *hp, uintptr_t *limit) { - struct pcc_block_payload *payload = block_payload(block); - struct pcc_region *region = &payload->regions[space->active_region]; - *hp = (uintptr_t)®ion[0]; - *limit = (uintptr_t)®ion[1]; -} - static void push_block(struct pcc_block **list, struct pcc_block *block) { struct pcc_block *next = atomic_load_explicit(list, memory_order_acquire); @@ -208,39 +201,33 @@ static struct pcc_block* pop_block(struct pcc_block **list) { return head; } -static struct pcc_block* pop_available_block(struct pcc_space *space) { - return pop_block(&space->available); +static struct pcc_block* pop_empty_block(struct pcc_space *space) { + return pop_block(&space->empty); } -static void push_available_block(struct pcc_space *space, - struct pcc_block *block) { - push_block(&space->available, block); +static void push_empty_block(struct pcc_space *space, + struct pcc_block *block) { + push_block(&space->empty, block); } -static struct pcc_block* pop_allocated_block(struct pcc_space *space) { - return pop_block(&space->allocated); +static struct pcc_block* pop_full_block(struct pcc_space *space) { + return pop_block(&space->full); } -static void push_allocated_block(struct pcc_space *space, - struct pcc_block *block) { - push_block(&space->allocated, block); - atomic_fetch_add_explicit(&space->allocated_block_count, 1, +static void push_full_block(struct pcc_space *space, + struct pcc_block *block) { + push_block(&space->full, block); + atomic_fetch_add_explicit(&space->full_block_count, 1, memory_order_relaxed); } -static struct pcc_block* pop_partially_allocated_block(struct pcc_space *space) { - return pop_block(&space->partially_allocated); +static struct pcc_block* pop_partly_full_block(struct pcc_space *space) { + return pop_block(&space->partly_full); } -static void push_partially_allocated_block(struct pcc_space *space, - struct pcc_block *block, - uintptr_t hp) { - size_t allocated = hp & (REGION_SIZE - 1); - if (allocated) { - block->allocated = allocated; - push_block(&space->partially_allocated, block); - } else { - // Could be hp was bumped all the way to the limit, in which case - // allocated wraps to 0; in any case the block is full. 
- push_allocated_block(space, block); - } +static void push_partly_full_block(struct pcc_space *space, + struct pcc_block *block, + size_t allocated_bytes) { + GC_ASSERT(allocated_bytes); + block->allocated = allocated_bytes; + push_block(&space->partly_full, block); } static struct pcc_block* pop_paged_out_block(struct pcc_space *space) { @@ -271,7 +258,7 @@ static void record_fragmentation(struct pcc_space *space, } static ssize_t pcc_space_request_release_memory(struct pcc_space *space, - size_t bytes) { + size_t bytes) { return atomic_fetch_add(&space->bytes_to_page_out, bytes) + bytes; } @@ -279,7 +266,7 @@ static int pcc_space_page_out_blocks_until_memory_released(struct pcc_space *space) { ssize_t pending = atomic_load(&space->bytes_to_page_out); while (pending > 0) { - struct pcc_block *block = pop_available_block(space); + struct pcc_block *block = pop_empty_block(space); if (!block) return 0; page_out_block(space, block); pending = @@ -295,12 +282,102 @@ static void pcc_space_reacquire_memory(struct pcc_space *space, while (pending + BLOCK_SIZE <= 0) { struct pcc_block *block = page_in_block(space); GC_ASSERT(block); - push_available_block(space, block); + push_empty_block(space, block); pending = atomic_fetch_add(&space->bytes_to_page_out, BLOCK_SIZE) + BLOCK_SIZE; } } +static inline void allocator_set_block(struct gc_allocator *alloc, + struct pcc_block *block, + int active_region) { + struct pcc_block_payload *payload = block_payload(block); + struct pcc_region *region = &payload->regions[active_region]; + alloc->block = block; + alloc->hp = (uintptr_t)®ion[0]; + alloc->limit = (uintptr_t)®ion[1]; +} + +static inline int allocator_acquire_block(struct gc_allocator *alloc, + struct pcc_block *block, + int active_region) { + if (block) { + allocator_set_block(alloc, block, active_region); + return 1; + } + return 0; +} + +static int +allocator_acquire_empty_block(struct gc_allocator *alloc, + struct pcc_space *space) { + return allocator_acquire_block(alloc, pop_empty_block(space), + space->active_region); +} + +static int +allocator_acquire_partly_full_block(struct gc_allocator *alloc, + struct pcc_space *space) { + if (allocator_acquire_block(alloc, pop_partly_full_block(space), + space->active_region)) { + alloc->hp += alloc->block->allocated; + return 1; + } + return 0; +} + +static void allocator_release_full_block(struct gc_allocator *alloc, + struct pcc_space *space) { + record_fragmentation(space, alloc->limit - alloc->hp); + push_full_block(space, alloc->block); + alloc->hp = alloc->limit = 0; + alloc->block = NULL; +} + +static void allocator_release_partly_full_block(struct gc_allocator *alloc, + struct pcc_space *space) { + size_t allocated = alloc->hp & (REGION_SIZE - 1); + if (allocated) { + push_partly_full_block(space, alloc->block, allocated); + } else { + // Could be hp was bumped all the way to the limit, in which case + // allocated wraps to 0; in any case the block is full. 
+ push_full_block(space, alloc->block); + } + alloc->hp = alloc->limit = 0; + alloc->block = NULL; +} + +static inline struct gc_ref allocate(struct gc_allocator *alloc, + struct pcc_space *space, + size_t size, + void (*get_more_empty_blocks)(void *data), + void *data) { + GC_ASSERT(size > 0); + GC_ASSERT(size <= gc_allocator_large_threshold()); + size = align_up(size, GC_ALIGNMENT); + + if (alloc->hp + size <= alloc->limit) + goto done; + + if (alloc->block) + allocator_release_full_block(alloc, space); + while (allocator_acquire_partly_full_block(alloc, space)) { + if (alloc->hp + size <= alloc->limit) + goto done; + allocator_release_full_block(alloc, space); + } + while (!allocator_acquire_empty_block(alloc, space)) + get_more_empty_blocks(data); + // The newly acquired block is empty and is therefore large enough for + // a small allocation. + +done: + struct gc_ref ret = gc_ref(alloc->hp); + alloc->hp += size; + return ret; +} + static void gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_heap *heap, @@ -309,23 +386,10 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_tracer *tracer, struct gc_heap *heap, struct gc_trace_worker *worker) { - struct gc_trace_worker_data data = {0,0,NULL}; + struct gc_trace_worker_data data = {{0,0,NULL},}; f(tracer, heap, worker, &data); - if (data.block) - push_partially_allocated_block(heap_pcc_space(heap), data.block, - data.hp); - // FIXME: Add (data.limit - data.hp) to fragmentation. -} - -static void clear_mutator_allocation_buffers(struct gc_heap *heap) { - for (struct gc_mutator *mut = heap->mutators; mut; mut = mut->next) { - if (mut->block) { - record_fragmentation(heap_pcc_space(heap), mut->limit - mut->hp); - push_allocated_block(heap_pcc_space(heap), mut->block); - mut->block = NULL; - } - mut->hp = mut->limit = 0; - } + if (data.allocator.block) + allocator_release_partly_full_block(&data.allocator, heap_pcc_space(heap)); } static struct pcc_block* @@ -342,70 +406,31 @@ append_block_lists(struct pcc_block *head, struct pcc_block *tail) { static void pcc_space_flip(struct pcc_space *space) { // Mutators stopped, can access nonatomically. - struct pcc_block *available = space->available; - struct pcc_block *partially_allocated = space->partially_allocated; - struct pcc_block *allocated = space->allocated; - allocated = append_block_lists(partially_allocated, allocated); - space->available = append_block_lists(available, allocated); - space->partially_allocated = NULL; - space->allocated = NULL; - space->allocated_block_count = 0; + space->empty = + append_block_lists(space->empty, + append_block_lists(space->partly_full, space->full)); + space->partly_full = NULL; + space->full = NULL; + space->full_block_count = 0; space->fragmentation = 0; space->active_region ^= 1; } static void pcc_space_finish_gc(struct pcc_space *space) { // Mutators stopped, can access nonatomically. 
- space->live_bytes_at_last_gc = space->allocated_block_count * REGION_SIZE; + space->live_bytes_at_last_gc = space->full_block_count * REGION_SIZE; space->fragmentation_at_last_gc = space->fragmentation; } -static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; - -static struct gc_ref evacuation_allocate(struct pcc_space *space, - struct gc_trace_worker_data *data, - size_t size) { - GC_ASSERT(size > 0); - GC_ASSERT(size <= gc_allocator_large_threshold()); - size = align_up(size, GC_ALIGNMENT); - - uintptr_t hp = data->hp; - uintptr_t limit = data->limit; - uintptr_t new_hp = hp + size; - if (new_hp <= limit) - goto done; - - if (data->block) { - record_fragmentation(space, limit - hp); - push_allocated_block(space, data->block); - } - while ((data->block = pop_partially_allocated_block(space))) { - pcc_space_compute_region(space, data->block, &hp, &limit); - hp += data->block->allocated; - new_hp = hp + size; - if (new_hp <= limit) { - data->limit = limit; - goto done; - } - record_fragmentation(space, limit - hp); - push_allocated_block(space, data->block); - } - data->block = pop_available_block(space); - if (!data->block) { - // Can happen if space is really tight and reordering of objects - // during evacuation resulted in more end-of-block fragmentation and - // thus block use than before collection started. A dire situation. - fprintf(stderr, "Out of memory\n"); - GC_CRASH(); - } - pcc_space_compute_region(space, data->block, &hp, &data->limit); - new_hp = hp + size; - // The region is empty and is therefore large enough for a small - // allocation. - -done: - data->hp = new_hp; - return gc_ref(hp); +static void get_more_empty_blocks_during_evacuation(void *data) { + // If space is really tight and reordering of objects during + // evacuation resulted in more end-of-block fragmentation and thus + // block use than before collection started, we can actually run out + // of memory while collecting. We should probably attempt to expand + // the heap here, at least by a single block; it's better than the + // alternatives. + fprintf(stderr, "Out of memory\n"); + GC_CRASH(); } static inline int pcc_space_forward(struct pcc_space *space, @@ -427,7 +452,9 @@ static inline int pcc_space_forward(struct pcc_space *space, case GC_FORWARDING_STATE_ACQUIRED: { // We claimed the object successfully; evacuating is up to us. size_t bytes = gc_atomic_forward_object_size(&fwd); - struct gc_ref new_ref = evacuation_allocate(space, data, bytes); + struct gc_ref new_ref = allocate(&data->allocator, space, bytes, + get_more_empty_blocks_during_evacuation, + NULL); // Copy object contents before committing, as we don't know what // part of the object (if any) will be overwritten by the // commit. @@ -545,7 +572,6 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->event_listener_data = heap->event_listener.mutator_added(heap->event_listener_data); heap_lock(heap); - mut->block = NULL; // We have no roots. If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. 
while (mutators_are_stopping(heap)) @@ -564,11 +590,8 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; - if (mut->block) { - push_partially_allocated_block(heap_pcc_space(heap), mut->block, - mut->hp); - mut->block = NULL; - } + if (mut->allocator.block) + allocator_release_partly_full_block(&mut->allocator, heap_pcc_space(heap)); heap_lock(heap); heap->mutator_count--; if (mut->next) @@ -703,6 +726,8 @@ static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); MUTATOR_EVENT(mut, mutator_stopping); + if (mut->allocator.block) + allocator_release_full_block(&mut->allocator, heap_pcc_space(heap)); heap_lock(heap); pause_mutator_for_collection(heap, mut); heap_unlock(heap); @@ -766,6 +791,7 @@ static void sweep_ephemerons(struct gc_heap *heap) { return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); } +static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; static void collect(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); struct pcc_space *cspace = heap_pcc_space(heap); @@ -782,7 +808,6 @@ static void collect(struct gc_mutator *mut) { HEAP_EVENT(heap, waiting_for_stop); wait_for_mutators_to_stop(heap); HEAP_EVENT(heap, mutators_stopped); - clear_mutator_allocation_buffers(heap); pcc_space_flip(cspace); gc_tracer_prepare(&heap->tracer); add_roots(heap); @@ -816,6 +841,8 @@ static void collect(struct gc_mutator *mut) { static void trigger_collection(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); + if (mut->allocator.block) + allocator_release_full_block(&mut->allocator, heap_pcc_space(heap)); heap_lock(heap); long epoch = heap->count; while (mutators_are_stopping(heap)) @@ -853,47 +880,20 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { return ret; } +static void get_more_empty_blocks_for_mutator(void *mut) { + trigger_collection(mut); +} + void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { GC_ASSERT(size > 0); // allocating 0 bytes would be silly if (size > gc_allocator_large_threshold()) return allocate_large(mut, size); - size = align_up(size, GC_ALIGNMENT); - uintptr_t hp = mut->hp; - uintptr_t limit = mut->limit; - uintptr_t new_hp = hp + size; - if (new_hp <= limit) - goto done; - - struct pcc_space *space = heap_pcc_space(mutator_heap(mut)); - if (mut->block) { - record_fragmentation(space, limit - hp); - push_allocated_block(space, mut->block); - } - while ((mut->block = pop_partially_allocated_block(space))) { - pcc_space_compute_region(space, mut->block, &hp, &limit); - hp += mut->block->allocated; - new_hp = hp + size; - if (new_hp <= limit) { - mut->limit = limit; - goto done; - } - record_fragmentation(space, limit - hp); - push_allocated_block(space, mut->block); - } - while (!(mut->block = pop_available_block(space))) { - trigger_collection(mut); - } - pcc_space_compute_region(space, mut->block, &hp, &mut->limit); - new_hp = hp + size; - // The region is empty and is therefore large enough for a small - // allocation. 
- -done: - mut->hp = new_hp; - gc_clear_fresh_allocation(gc_ref(hp), size); - return gc_ref_heap_object(gc_ref(hp)); + return gc_ref_heap_object(allocate(&mut->allocator, + heap_pcc_space(mutator_heap(mut)), + size, get_more_empty_blocks_for_mutator, + mut)); } void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { @@ -1031,10 +1031,10 @@ static int pcc_space_init(struct pcc_space *space, struct gc_heap *heap) { if (!slabs) return 0; - space->available = NULL; - space->partially_allocated = NULL; - space->allocated = NULL; - space->allocated_block_count = 0; + space->empty = NULL; + space->partly_full = NULL; + space->full = NULL; + space->full_block_count = 0; space->paged_out = NULL; space->fragmentation = 0; space->bytes_to_page_out = 0; @@ -1056,7 +1056,7 @@ static int pcc_space_init(struct pcc_space *space, struct gc_heap *heap) { size -= BLOCK_SIZE; } else { block->in_core = 1; - push_available_block(space, block); + push_empty_block(space, block); } } } @@ -1069,10 +1069,11 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, void *event_listener_data) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_ALIGNMENT); GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_LARGE_OBJECT_THRESHOLD); + GC_ASSERT_EQ(0, offsetof(struct gc_mutator, allocator)); GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), - offsetof(struct gc_mutator, hp)); + offsetof(struct gc_allocator, hp)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), - offsetof(struct gc_mutator, limit)); + offsetof(struct gc_allocator, limit)); if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { fprintf(stderr, "fixed heap size is currently required\n"); @@ -1121,6 +1122,8 @@ void gc_finish_for_thread(struct gc_mutator *mut) { static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); + if (mut->allocator.block) + allocator_release_partly_full_block(&mut->allocator, heap_pcc_space(heap)); heap_lock(heap); heap->inactive_mutator_count++; if (all_mutators_stopped(heap)) From 4c6889b751df153c4c9026af2de08120c9bcd10b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 5 Aug 2024 08:56:01 +0200 Subject: [PATCH 254/403] Factor copy space out of pcc --- src/copy-space.h | 566 +++++++++++++++++++++++++++++++++++++++++++++++ src/pcc.c | 565 ++++------------------------------------------ 2 files changed, 608 insertions(+), 523 deletions(-) create mode 100644 src/copy-space.h diff --git a/src/copy-space.h b/src/copy-space.h new file mode 100644 index 000000000..6f10a3f7b --- /dev/null +++ b/src/copy-space.h @@ -0,0 +1,566 @@ +#ifndef COPY_SPACE_H +#define COPY_SPACE_H + +#include + +#include "gc-api.h" + +#define GC_IMPL 1 +#include "gc-internal.h" + +#include "assert.h" +#include "debug.h" +#include "gc-align.h" +#include "gc-attrs.h" +#include "gc-inline.h" +#include "spin.h" + +// A copy space: a block-structured space that traces via evacuation. 
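As a quick sanity check on the constants defined just below (illustrative `EXAMPLE_*` names only, not the real `COPY_SPACE_*` macros), the slab geometry works out as follows: a 64-MB slab of 128-kB blocks (each block being two 64-kB regions) holds 512 blocks, and reserving the first block for metadata leaves 511 payload blocks with 256 bytes of header each, so the slab header plus the 511 block headers exactly fill the one header block.

```c
/* Illustration of the arithmetic only; not part of the patch. */
#define EXAMPLE_SLAB_SIZE  (64 * 1024 * 1024)   /* 64 MB */
#define EXAMPLE_BLOCK_SIZE (2 * 64 * 1024)      /* two 64-kB regions */
_Static_assert(EXAMPLE_SLAB_SIZE / EXAMPLE_BLOCK_SIZE == 512,
               "512 blocks per slab");
_Static_assert(EXAMPLE_BLOCK_SIZE / 512 == 256,
               "256 header bytes per block");
_Static_assert(256 + 511 * 256 == EXAMPLE_BLOCK_SIZE,
               "slab header plus 511 block headers fill one block");
```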
+ +#define COPY_SPACE_SLAB_SIZE (64 * 1024 * 1024) +#define COPY_SPACE_REGION_SIZE (64 * 1024) +#define COPY_SPACE_BLOCK_SIZE (2 * COPY_SPACE_REGION_SIZE) +#define COPY_SPACE_BLOCKS_PER_SLAB \ + (COPY_SPACE_SLAB_SIZE / COPY_SPACE_BLOCK_SIZE) +#define COPY_SPACE_HEADER_BYTES_PER_BLOCK \ + (COPY_SPACE_BLOCK_SIZE / COPY_SPACE_BLOCKS_PER_SLAB) +#define COPY_SPACE_HEADER_BLOCKS_PER_SLAB 1 +#define COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB \ + (COPY_SPACE_BLOCKS_PER_SLAB - COPY_SPACE_HEADER_BLOCKS_PER_SLAB) +#define COPY_SPACE_HEADER_BYTES_PER_SLAB \ + (COPY_SPACE_HEADER_BYTES_PER_BLOCK * COPY_SPACE_HEADER_BLOCKS_PER_SLAB) + +struct copy_space_slab; + +struct copy_space_slab_header { + union { + struct { + struct copy_space_slab *next; + struct copy_space_slab *prev; + unsigned incore_block_count; + }; + uint8_t padding[COPY_SPACE_HEADER_BYTES_PER_SLAB]; + }; +}; +STATIC_ASSERT_EQ(sizeof(struct copy_space_slab_header), + COPY_SPACE_HEADER_BYTES_PER_SLAB); + +// Really just the block header. +struct copy_space_block { + union { + struct { + struct copy_space_block *next; + uint8_t in_core; + size_t allocated; // For partly-empty blocks. + }; + uint8_t padding[COPY_SPACE_HEADER_BYTES_PER_BLOCK]; + }; +}; +STATIC_ASSERT_EQ(sizeof(struct copy_space_block), + COPY_SPACE_HEADER_BYTES_PER_BLOCK); + +struct copy_space_region { + char data[COPY_SPACE_REGION_SIZE]; +}; + +struct copy_space_block_payload { + struct copy_space_region regions[2]; +}; + +struct copy_space_slab { + struct copy_space_slab_header header; + struct copy_space_block headers[COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB]; + struct copy_space_block_payload blocks[COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB]; +}; +STATIC_ASSERT_EQ(sizeof(struct copy_space_slab), COPY_SPACE_SLAB_SIZE); + +static inline struct copy_space_block* +copy_space_block_header(struct copy_space_block_payload *payload) { + uintptr_t addr = (uintptr_t) payload; + uintptr_t base = align_down(addr, COPY_SPACE_SLAB_SIZE); + struct copy_space_slab *slab = (struct copy_space_slab*) base; + uintptr_t block_idx = + (addr / COPY_SPACE_BLOCK_SIZE) % COPY_SPACE_BLOCKS_PER_SLAB; + return &slab->headers[block_idx - COPY_SPACE_HEADER_BLOCKS_PER_SLAB]; +} + +static inline struct copy_space_block_payload* +copy_space_block_payload(struct copy_space_block *block) { + uintptr_t addr = (uintptr_t) block; + uintptr_t base = align_down(addr, COPY_SPACE_SLAB_SIZE); + struct copy_space_slab *slab = (struct copy_space_slab*) base; + uintptr_t block_idx = + (addr / COPY_SPACE_HEADER_BYTES_PER_BLOCK) % COPY_SPACE_BLOCKS_PER_SLAB; + return &slab->blocks[block_idx - COPY_SPACE_HEADER_BLOCKS_PER_SLAB]; +} + +static uint8_t +copy_space_object_region(struct gc_ref obj) { + return (gc_ref_value(obj) / COPY_SPACE_REGION_SIZE) & 1; +} + +struct copy_space_extent { + uintptr_t low_addr; + uintptr_t high_addr; +}; + +struct copy_space { + struct copy_space_block *empty; + struct copy_space_block *partly_full; + struct copy_space_block *full ALIGNED_TO_AVOID_FALSE_SHARING; + size_t allocated_bytes; + size_t fragmentation; + struct copy_space_block *paged_out ALIGNED_TO_AVOID_FALSE_SHARING; + ssize_t bytes_to_page_out ALIGNED_TO_AVOID_FALSE_SHARING; + // The rest of these members are only changed rarely and with the heap + // lock. 
+ uint8_t active_region ALIGNED_TO_AVOID_FALSE_SHARING; + size_t allocated_bytes_at_last_gc; + size_t fragmentation_at_last_gc; + struct copy_space_extent *extents; + size_t nextents; + struct copy_space_slab *slabs; + size_t nslabs; +}; + +struct copy_space_allocator { + uintptr_t hp; + uintptr_t limit; + struct copy_space_block *block; +}; + +static void +copy_space_push_block(struct copy_space_block **list, + struct copy_space_block *block) { + struct copy_space_block *next = + atomic_load_explicit(list, memory_order_acquire); + do { + block->next = next; + } while (!atomic_compare_exchange_weak(list, &next, block)); +} + +static struct copy_space_block* +copy_space_pop_block(struct copy_space_block **list) { + struct copy_space_block *head = + atomic_load_explicit(list, memory_order_acquire); + struct copy_space_block *next; + do { + if (!head) + return NULL; + } while (!atomic_compare_exchange_weak(list, &head, head->next)); + head->next = NULL; + return head; +} + +static struct copy_space_block* +copy_space_pop_empty_block(struct copy_space *space) { + struct copy_space_block *ret = copy_space_pop_block(&space->empty); + if (ret) + ret->allocated = 0; + return ret; +} + +static void +copy_space_push_empty_block(struct copy_space *space, + struct copy_space_block *block) { + copy_space_push_block(&space->empty, block); +} + +static struct copy_space_block* +copy_space_pop_full_block(struct copy_space *space) { + return copy_space_pop_block(&space->full); +} + +static void +copy_space_push_full_block(struct copy_space *space, + struct copy_space_block *block) { + copy_space_push_block(&space->full, block); +} + +static struct copy_space_block* +copy_space_pop_partly_full_block(struct copy_space *space) { + return copy_space_pop_block(&space->partly_full); +} + +static void +copy_space_push_partly_full_block(struct copy_space *space, + struct copy_space_block *block) { + copy_space_push_block(&space->partly_full, block); +} + +static struct copy_space_block* +copy_space_pop_paged_out_block(struct copy_space *space) { + return copy_space_pop_block(&space->paged_out); +} + +static void +copy_space_push_paged_out_block(struct copy_space *space, + struct copy_space_block *block) { + copy_space_push_block(&space->paged_out, block); +} + +static void +copy_space_page_out_block(struct copy_space *space, + struct copy_space_block *block) { + block->in_core = 0; + madvise(copy_space_block_payload(block), COPY_SPACE_BLOCK_SIZE, MADV_DONTNEED); + copy_space_push_paged_out_block(space, block); +} + +static struct copy_space_block* +copy_space_page_in_block(struct copy_space *space) { + struct copy_space_block* block = copy_space_pop_paged_out_block(space); + if (block) block->in_core = 1; + return block; +} + +static ssize_t +copy_space_request_release_memory(struct copy_space *space, size_t bytes) { + return atomic_fetch_add(&space->bytes_to_page_out, bytes) + bytes; +} + +static int +copy_space_page_out_blocks_until_memory_released(struct copy_space *space) { + ssize_t pending = atomic_load(&space->bytes_to_page_out); + while (pending > 0) { + struct copy_space_block *block = copy_space_pop_empty_block(space); + if (!block) return 0; + copy_space_page_out_block(space, block); + pending = (atomic_fetch_sub(&space->bytes_to_page_out, COPY_SPACE_BLOCK_SIZE) + - COPY_SPACE_BLOCK_SIZE); + } + return 1; +} + +static void +copy_space_reacquire_memory(struct copy_space *space, size_t bytes) { + ssize_t pending = + atomic_fetch_sub(&space->bytes_to_page_out, bytes) - bytes; + while (pending + 
COPY_SPACE_BLOCK_SIZE <= 0) { + struct copy_space_block *block = copy_space_page_in_block(space); + GC_ASSERT(block); + copy_space_push_empty_block(space, block); + pending = (atomic_fetch_add(&space->bytes_to_page_out, COPY_SPACE_BLOCK_SIZE) + + COPY_SPACE_BLOCK_SIZE); + } +} + +static inline void +copy_space_allocator_set_block(struct copy_space_allocator *alloc, + struct copy_space_block *block, + int active_region) { + struct copy_space_block_payload *payload = copy_space_block_payload(block); + struct copy_space_region *region = &payload->regions[active_region]; + alloc->block = block; + alloc->hp = (uintptr_t)®ion[0]; + alloc->limit = (uintptr_t)®ion[1]; +} + +static inline int +copy_space_allocator_acquire_block(struct copy_space_allocator *alloc, + struct copy_space_block *block, + int active_region) { + if (block) { + copy_space_allocator_set_block(alloc, block, active_region); + return 1; + } + return 0; +} + +static int +copy_space_allocator_acquire_empty_block(struct copy_space_allocator *alloc, + struct copy_space *space) { + return copy_space_allocator_acquire_block(alloc, + copy_space_pop_empty_block(space), + space->active_region); +} + +static int +copy_space_allocator_acquire_partly_full_block(struct copy_space_allocator *alloc, + struct copy_space *space) { + if (copy_space_allocator_acquire_block(alloc, + copy_space_pop_partly_full_block(space), + space->active_region)) { + alloc->hp += alloc->block->allocated; + return 1; + } + return 0; +} + +static void +copy_space_allocator_release_full_block(struct copy_space_allocator *alloc, + struct copy_space *space) { + size_t fragmentation = alloc->limit - alloc->hp; + size_t allocated = COPY_SPACE_REGION_SIZE - alloc->block->allocated; + atomic_fetch_add_explicit(&space->allocated_bytes, allocated, + memory_order_relaxed); + if (fragmentation) + atomic_fetch_add_explicit(&space->fragmentation, fragmentation, + memory_order_relaxed); + copy_space_push_full_block(space, alloc->block); + alloc->hp = alloc->limit = 0; + alloc->block = NULL; +} + +static void +copy_space_allocator_release_partly_full_block(struct copy_space_allocator *alloc, + struct copy_space *space) { + size_t allocated = alloc->hp & (COPY_SPACE_REGION_SIZE - 1); + if (allocated) { + atomic_fetch_add_explicit(&space->allocated_bytes, + allocated - alloc->block->allocated, + memory_order_relaxed); + alloc->block->allocated = allocated; + copy_space_push_partly_full_block(space, alloc->block); + } else { + // In this case, hp was bumped all the way to the limit, in which + // case allocated wraps to 0; the block is full. 
+ atomic_fetch_add_explicit(&space->allocated_bytes, + COPY_SPACE_REGION_SIZE - alloc->block->allocated, + memory_order_relaxed); + copy_space_push_full_block(space, alloc->block); + } + alloc->hp = alloc->limit = 0; + alloc->block = NULL; +} + +static inline struct gc_ref +copy_space_allocate(struct copy_space_allocator *alloc, + struct copy_space *space, + size_t size, + void (*get_more_empty_blocks)(void *data), + void *data) { + GC_ASSERT(size > 0); + GC_ASSERT(size <= gc_allocator_large_threshold()); + size = align_up(size, gc_allocator_small_granule_size()); + + if (alloc->hp + size <= alloc->limit) + goto done; + + if (alloc->block) + copy_space_allocator_release_full_block(alloc, space); + while (copy_space_allocator_acquire_partly_full_block(alloc, space)) { + if (alloc->hp + size <= alloc->limit) + goto done; + copy_space_allocator_release_full_block(alloc, space); + } + while (!copy_space_allocator_acquire_empty_block(alloc, space)) + get_more_empty_blocks(data); + // The newly acquired block is empty and is therefore large enough for + // a small allocation. + +done: + struct gc_ref ret = gc_ref(alloc->hp); + alloc->hp += size; + return ret; +} + +static struct copy_space_block* +copy_space_append_block_lists(struct copy_space_block *head, + struct copy_space_block *tail) { + if (!head) return tail; + if (tail) { + struct copy_space_block *walk = head; + while (walk->next) + walk = walk->next; + walk->next = tail; + } + return head; +} + +static void +copy_space_flip(struct copy_space *space) { + // Mutators stopped, can access nonatomically. + struct copy_space_block *flip = space->full; + flip = copy_space_append_block_lists(space->partly_full, flip); + flip = copy_space_append_block_lists(space->empty, flip); + space->empty = flip; + space->partly_full = NULL; + space->full = NULL; + space->allocated_bytes = 0; + space->fragmentation = 0; + space->active_region ^= 1; +} + +static void +copy_space_finish_gc(struct copy_space *space) { + // Mutators stopped, can access nonatomically. + space->allocated_bytes_at_last_gc = space->allocated_bytes; + space->fragmentation_at_last_gc = space->fragmentation; +} + +static void +copy_space_gc_during_evacuation(void *data) { + // If space is really tight and reordering of objects during + // evacuation resulted in more end-of-block fragmentation and thus + // block use than before collection started, we can actually run out + // of memory while collecting. We should probably attempt to expand + // the heap here, at least by a single block; it's better than the + // alternatives. + fprintf(stderr, "Out of memory\n"); + GC_CRASH(); +} + +static inline int +copy_space_forward(struct copy_space *space, struct gc_edge edge, + struct gc_ref old_ref, struct copy_space_allocator *alloc) { + GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); + struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); + + if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) + gc_atomic_forward_acquire(&fwd); + + switch (fwd.state) { + case GC_FORWARDING_STATE_NOT_FORWARDED: + case GC_FORWARDING_STATE_ABORTED: + default: + // Impossible. + GC_CRASH(); + case GC_FORWARDING_STATE_ACQUIRED: { + // We claimed the object successfully; evacuating is up to us. 
+ size_t bytes = gc_atomic_forward_object_size(&fwd); + struct gc_ref new_ref = + copy_space_allocate(alloc, space, bytes, + copy_space_gc_during_evacuation, NULL); + // Copy object contents before committing, as we don't know what + // part of the object (if any) will be overwritten by the + // commit. + memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), bytes); + gc_atomic_forward_commit(&fwd, new_ref); + gc_edge_update(edge, new_ref); + return 1; + } + case GC_FORWARDING_STATE_BUSY: + // Someone else claimed this object first. Spin until new address + // known, or evacuation aborts. + for (size_t spin_count = 0;; spin_count++) { + if (gc_atomic_forward_retry_busy(&fwd)) + break; + yield_for_spin(spin_count); + } + GC_ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); + // Fall through. + case GC_FORWARDING_STATE_FORWARDED: + // The object has been evacuated already. Update the edge; + // whoever forwarded the object will make sure it's eventually + // traced. + gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); + return 0; + } +} + +static int +copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge, + struct gc_ref old_ref) { + GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); + struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); + switch (fwd.state) { + case GC_FORWARDING_STATE_NOT_FORWARDED: + return 0; + case GC_FORWARDING_STATE_BUSY: + // Someone else claimed this object first. Spin until new address + // known. + for (size_t spin_count = 0;; spin_count++) { + if (gc_atomic_forward_retry_busy(&fwd)) + break; + yield_for_spin(spin_count); + } + GC_ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); + // Fall through. + case GC_FORWARDING_STATE_FORWARDED: + gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); + return 1; + default: + GC_CRASH(); + } +} + +static inline int +copy_space_contains(struct copy_space *space, struct gc_ref ref) { + for (size_t i = 0; i < space->nextents; i++) + if (space->extents[i].low_addr <= gc_ref_value(ref) && + gc_ref_value(ref) < space->extents[i].high_addr) + return 1; + return 0; +} + +static inline void +copy_space_allocator_init(struct copy_space_allocator *alloc, + struct copy_space *space) { + memset(alloc, 0, sizeof(*alloc)); +} + +static inline void +copy_space_allocator_finish(struct copy_space_allocator *alloc, + struct copy_space *space) { + if (alloc->block) + copy_space_allocator_release_partly_full_block(alloc, space); +} + +static struct copy_space_slab* +copy_space_allocate_slabs(size_t nslabs) { + size_t size = nslabs * COPY_SPACE_SLAB_SIZE; + size_t extent = size + COPY_SPACE_SLAB_SIZE; + + char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("mmap failed"); + return NULL; + } + + uintptr_t base = (uintptr_t) mem; + uintptr_t end = base + extent; + uintptr_t aligned_base = align_up(base, COPY_SPACE_SLAB_SIZE); + uintptr_t aligned_end = aligned_base + size; + + if (aligned_base - base) + munmap((void*)base, aligned_base - base); + if (end - aligned_end) + munmap((void*)aligned_end, end - aligned_end); + + return (struct copy_space_slab*) aligned_base; +} + +static int +copy_space_init(struct copy_space *space, size_t size) { + size = align_up(size, COPY_SPACE_BLOCK_SIZE); + size_t reserved = align_up(size, COPY_SPACE_SLAB_SIZE); + size_t nslabs = reserved / COPY_SPACE_SLAB_SIZE; + struct copy_space_slab *slabs = copy_space_allocate_slabs(nslabs); + if (!slabs) + return 0; + 
+ space->empty = NULL; + space->partly_full = NULL; + space->full = NULL; + space->paged_out = NULL; + space->allocated_bytes = 0; + space->fragmentation = 0; + space->bytes_to_page_out = 0; + space->active_region = 0; + space->allocated_bytes_at_last_gc = 0; + space->fragmentation_at_last_gc = 0; + space->extents = calloc(1, sizeof(struct copy_space_extent)); + space->extents[0].low_addr = (uintptr_t) slabs; + space->extents[0].high_addr = space->extents[0].low_addr + reserved; + space->nextents = 1; + space->slabs = slabs; + space->nslabs = nslabs; + for (size_t slab = 0; slab < nslabs; slab++) { + for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { + struct copy_space_block *block = &slabs[slab].headers[idx]; + if (reserved > size) { + block->in_core = 0; + copy_space_push_paged_out_block(space, block); + reserved -= COPY_SPACE_BLOCK_SIZE; + } else { + block->in_core = 1; + copy_space_push_empty_block(space, block); + } + } + } + return 1; +} + +#endif // COPY_SPACE_H diff --git a/src/pcc.c b/src/pcc.c index 1638abf85..b0aeddda0 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -12,6 +12,7 @@ #define GC_IMPL 1 #include "gc-internal.h" +#include "copy-space.h" #include "debug.h" #include "gc-align.h" #include "gc-inline.h" @@ -21,106 +22,8 @@ #include "spin.h" #include "pcc-attrs.h" -#define SLAB_SIZE (64 * 1024 * 1024) -#define REGION_SIZE (64 * 1024) -#define BLOCK_SIZE (2 * REGION_SIZE) -#define BLOCKS_PER_SLAB (SLAB_SIZE / BLOCK_SIZE) -#define HEADER_BYTES_PER_BLOCK (BLOCK_SIZE / BLOCKS_PER_SLAB) -#define HEADER_BLOCKS_PER_SLAB 1 -#define NONHEADER_BLOCKS_PER_SLAB (BLOCKS_PER_SLAB - HEADER_BLOCKS_PER_SLAB) -#define HEADER_BYTES_PER_SLAB (HEADER_BYTES_PER_BLOCK * HEADER_BLOCKS_PER_SLAB) - -struct pcc_slab; -struct pcc_block; - -struct pcc_slab_header { - union { - struct { - struct pcc_slab *next; - struct pcc_slab *prev; - unsigned incore_block_count; - }; - uint8_t padding[HEADER_BYTES_PER_SLAB]; - }; -}; -STATIC_ASSERT_EQ(sizeof(struct pcc_slab_header), - HEADER_BYTES_PER_SLAB); - -// Really just the block header. -struct pcc_block { - union { - struct { - struct pcc_block *next; - uint8_t in_core; - size_t allocated; // For partly-empty blocks. 
- }; - uint8_t padding[HEADER_BYTES_PER_BLOCK]; - }; -}; -STATIC_ASSERT_EQ(sizeof(struct pcc_block), - HEADER_BYTES_PER_BLOCK); - -struct pcc_region { - char data[REGION_SIZE]; -}; - -struct pcc_block_payload { - struct pcc_region regions[2]; -}; - -struct pcc_slab { - struct pcc_slab_header header; - struct pcc_block headers[NONHEADER_BLOCKS_PER_SLAB]; - struct pcc_block_payload blocks[NONHEADER_BLOCKS_PER_SLAB]; -}; -STATIC_ASSERT_EQ(sizeof(struct pcc_slab), SLAB_SIZE); - -static struct pcc_block *block_header(struct pcc_block_payload *payload) { - uintptr_t addr = (uintptr_t) payload; - uintptr_t base = align_down(addr, SLAB_SIZE); - struct pcc_slab *slab = (struct pcc_slab*) base; - uintptr_t block_idx = (addr / BLOCK_SIZE) % BLOCKS_PER_SLAB; - return &slab->headers[block_idx - HEADER_BLOCKS_PER_SLAB]; -} - -static struct pcc_block_payload *block_payload(struct pcc_block *block) { - uintptr_t addr = (uintptr_t) block; - uintptr_t base = align_down(addr, SLAB_SIZE); - struct pcc_slab *slab = (struct pcc_slab*) base; - uintptr_t block_idx = (addr / HEADER_BYTES_PER_BLOCK) % BLOCKS_PER_SLAB; - return &slab->blocks[block_idx - HEADER_BLOCKS_PER_SLAB]; -} - -static uint8_t pcc_object_region(struct gc_ref obj) { - return (gc_ref_value(obj) / REGION_SIZE) & 1; -} - -struct pcc_extent { - uintptr_t low_addr; - uintptr_t high_addr; -}; - -struct pcc_space { - struct pcc_block *empty; - struct pcc_block *partly_full; - struct pcc_block *full ALIGNED_TO_AVOID_FALSE_SHARING; - size_t full_block_count; - struct pcc_block *paged_out ALIGNED_TO_AVOID_FALSE_SHARING; - size_t fragmentation ALIGNED_TO_AVOID_FALSE_SHARING; - ssize_t bytes_to_page_out ALIGNED_TO_AVOID_FALSE_SHARING; - // The rest of these members are only changed rarely and with the heap - // lock. - uint8_t active_region ALIGNED_TO_AVOID_FALSE_SHARING; - size_t live_bytes_at_last_gc; - size_t fragmentation_at_last_gc; - struct pcc_extent *extents; - size_t nextents; - struct pcc_slab *slabs; - size_t nslabs; -}; - struct gc_heap { - struct pcc_space pcc_space; + struct copy_space copy_space; struct large_object_space large_object_space; struct gc_extern_space *extern_space; size_t large_object_pages; @@ -150,14 +53,8 @@ struct gc_heap { #define MUTATOR_EVENT(mut, event, ...) 
\ (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) -struct gc_allocator { - uintptr_t hp; - uintptr_t limit; - struct pcc_block *block; -}; - struct gc_mutator { - struct gc_allocator allocator; + struct copy_space_allocator allocator; struct gc_heap *heap; struct gc_mutator_roots *roots; void *event_listener_data; @@ -166,11 +63,11 @@ struct gc_mutator { }; struct gc_trace_worker_data { - struct gc_allocator allocator; + struct copy_space_allocator allocator; }; -static inline struct pcc_space* heap_pcc_space(struct gc_heap *heap) { - return &heap->pcc_space; +static inline struct copy_space* heap_copy_space(struct gc_heap *heap) { + return &heap->copy_space; } static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { return &heap->large_object_space; @@ -182,202 +79,6 @@ static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } -static void push_block(struct pcc_block **list, - struct pcc_block *block) { - struct pcc_block *next = atomic_load_explicit(list, memory_order_acquire); - do { - block->next = next; - } while (!atomic_compare_exchange_weak(list, &next, block)); -} - -static struct pcc_block* pop_block(struct pcc_block **list) { - struct pcc_block *head = atomic_load_explicit(list, memory_order_acquire); - struct pcc_block *next; - do { - if (!head) - return NULL; - } while (!atomic_compare_exchange_weak(list, &head, head->next)); - head->next = NULL; - return head; -} - -static struct pcc_block* pop_empty_block(struct pcc_space *space) { - return pop_block(&space->empty); -} -static void push_empty_block(struct pcc_space *space, - struct pcc_block *block) { - push_block(&space->empty, block); -} - -static struct pcc_block* pop_full_block(struct pcc_space *space) { - return pop_block(&space->full); -} -static void push_full_block(struct pcc_space *space, - struct pcc_block *block) { - push_block(&space->full, block); - atomic_fetch_add_explicit(&space->full_block_count, 1, - memory_order_relaxed); -} - -static struct pcc_block* pop_partly_full_block(struct pcc_space *space) { - return pop_block(&space->partly_full); -} -static void push_partly_full_block(struct pcc_space *space, - struct pcc_block *block, - size_t allocated_bytes) { - GC_ASSERT(allocated_bytes); - block->allocated = allocated_bytes; - push_block(&space->partly_full, block); -} - -static struct pcc_block* pop_paged_out_block(struct pcc_space *space) { - return pop_block(&space->paged_out); -} -static void push_paged_out_block(struct pcc_space *space, - struct pcc_block *block) { - push_block(&space->paged_out, block); -} - -static void page_out_block(struct pcc_space *space, - struct pcc_block *block) { - block->in_core = 0; - madvise(block_payload(block), BLOCK_SIZE, MADV_DONTNEED); - push_paged_out_block(space, block); -} - -static struct pcc_block* page_in_block(struct pcc_space *space) { - struct pcc_block* block = pop_paged_out_block(space); - if (block) block->in_core = 1; - return block; -} - -static void record_fragmentation(struct pcc_space *space, - size_t bytes) { - atomic_fetch_add_explicit(&space->fragmentation, bytes, - memory_order_relaxed); -} - -static ssize_t pcc_space_request_release_memory(struct pcc_space *space, - size_t bytes) { - return atomic_fetch_add(&space->bytes_to_page_out, bytes) + bytes; -} - -static int -pcc_space_page_out_blocks_until_memory_released(struct pcc_space *space) { - ssize_t pending = atomic_load(&space->bytes_to_page_out); - while (pending > 0) { - struct pcc_block *block 
= pop_empty_block(space); - if (!block) return 0; - page_out_block(space, block); - pending = - atomic_fetch_sub(&space->bytes_to_page_out, BLOCK_SIZE) - BLOCK_SIZE; - } - return 1; -} - -static void pcc_space_reacquire_memory(struct pcc_space *space, - size_t bytes) { - ssize_t pending = - atomic_fetch_sub(&space->bytes_to_page_out, bytes) - bytes; - while (pending + BLOCK_SIZE <= 0) { - struct pcc_block *block = page_in_block(space); - GC_ASSERT(block); - push_empty_block(space, block); - pending = - atomic_fetch_add(&space->bytes_to_page_out, BLOCK_SIZE) + BLOCK_SIZE; - } -} - -static inline void allocator_set_block(struct gc_allocator *alloc, - struct pcc_block *block, - int active_region) { - struct pcc_block_payload *payload = block_payload(block); - struct pcc_region *region = &payload->regions[active_region]; - alloc->block = block; - alloc->hp = (uintptr_t)®ion[0]; - alloc->limit = (uintptr_t)®ion[1]; -} - -static inline int allocator_acquire_block(struct gc_allocator *alloc, - struct pcc_block *block, - int active_region) { - if (block) { - allocator_set_block(alloc, block, active_region); - return 1; - } - return 0; -} - -static int -allocator_acquire_empty_block(struct gc_allocator *alloc, - struct pcc_space *space) { - return allocator_acquire_block(alloc, pop_empty_block(space), - space->active_region); -} - -static int -allocator_acquire_partly_full_block(struct gc_allocator *alloc, - struct pcc_space *space) { - if (allocator_acquire_block(alloc, pop_partly_full_block(space), - space->active_region)) { - alloc->hp += alloc->block->allocated; - return 1; - } - return 0; -} - -static void allocator_release_full_block(struct gc_allocator *alloc, - struct pcc_space *space) { - record_fragmentation(space, alloc->limit - alloc->hp); - push_full_block(space, alloc->block); - alloc->hp = alloc->limit = 0; - alloc->block = NULL; -} - -static void allocator_release_partly_full_block(struct gc_allocator *alloc, - struct pcc_space *space) { - size_t allocated = alloc->hp & (REGION_SIZE - 1); - if (allocated) { - push_partly_full_block(space, alloc->block, allocated); - } else { - // Could be hp was bumped all the way to the limit, in which case - // allocated wraps to 0; in any case the block is full. - push_full_block(space, alloc->block); - } - alloc->hp = alloc->limit = 0; - alloc->block = NULL; -} - -static inline struct gc_ref allocate(struct gc_allocator *alloc, - struct pcc_space *space, - size_t size, - void (*get_more_empty_blocks)(void *data), - void *data) { - GC_ASSERT(size > 0); - GC_ASSERT(size <= gc_allocator_large_threshold()); - size = align_up(size, GC_ALIGNMENT); - - if (alloc->hp + size <= alloc->limit) - goto done; - - if (alloc->block) - allocator_release_full_block(alloc, space); - while (allocator_acquire_partly_full_block(alloc, space)) { - if (alloc->hp + size <= alloc->limit) - goto done; - allocator_release_full_block(alloc, space); - } - while (!allocator_acquire_empty_block(alloc, space)) - get_more_empty_blocks(data); - // The newly acquired block is empty and is therefore large enough for - // a small allocation. 
- -done: - struct gc_ref ret = gc_ref(alloc->hp); - alloc->hp += size; - return ret; -} - static void gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_heap *heap, @@ -386,110 +87,10 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_tracer *tracer, struct gc_heap *heap, struct gc_trace_worker *worker) { - struct gc_trace_worker_data data = {{0,0,NULL},}; + struct gc_trace_worker_data data; + copy_space_allocator_init(&data.allocator, heap_copy_space(heap)); f(tracer, heap, worker, &data); - if (data.allocator.block) - allocator_release_partly_full_block(&data.allocator, heap_pcc_space(heap)); -} - -static struct pcc_block* -append_block_lists(struct pcc_block *head, struct pcc_block *tail) { - if (!head) return tail; - if (tail) { - struct pcc_block *walk = head; - while (walk->next) - walk = walk->next; - walk->next = tail; - } - return head; -} - -static void pcc_space_flip(struct pcc_space *space) { - // Mutators stopped, can access nonatomically. - space->empty = - append_block_lists(space->empty, - append_block_lists(space->partly_full, space->full)); - space->partly_full = NULL; - space->full = NULL; - space->full_block_count = 0; - space->fragmentation = 0; - space->active_region ^= 1; -} - -static void pcc_space_finish_gc(struct pcc_space *space) { - // Mutators stopped, can access nonatomically. - space->live_bytes_at_last_gc = space->full_block_count * REGION_SIZE; - space->fragmentation_at_last_gc = space->fragmentation; -} - -static void get_more_empty_blocks_during_evacuation(void *data) { - // If space is really tight and reordering of objects during - // evacuation resulted in more end-of-block fragmentation and thus - // block use than before collection started, we can actually run out - // of memory while collecting. We should probably attempt to expand - // the heap here, at least by a single block; it's better than the - // alternatives. - fprintf(stderr, "Out of memory\n"); - GC_CRASH(); -} - -static inline int pcc_space_forward(struct pcc_space *space, - struct gc_edge edge, - struct gc_ref old_ref, - struct gc_trace_worker_data *data) { - GC_ASSERT(pcc_object_region(old_ref) != space->active_region); - struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); - - if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) - gc_atomic_forward_acquire(&fwd); - - switch (fwd.state) { - case GC_FORWARDING_STATE_NOT_FORWARDED: - case GC_FORWARDING_STATE_ABORTED: - default: - // Impossible. - GC_CRASH(); - case GC_FORWARDING_STATE_ACQUIRED: { - // We claimed the object successfully; evacuating is up to us. - size_t bytes = gc_atomic_forward_object_size(&fwd); - struct gc_ref new_ref = allocate(&data->allocator, space, bytes, - get_more_empty_blocks_during_evacuation, - NULL); - // Copy object contents before committing, as we don't know what - // part of the object (if any) will be overwritten by the - // commit. - memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), bytes); - gc_atomic_forward_commit(&fwd, new_ref); - gc_edge_update(edge, new_ref); - return 1; - } - case GC_FORWARDING_STATE_BUSY: - // Someone else claimed this object first. Spin until new address - // known, or evacuation aborts. - for (size_t spin_count = 0;; spin_count++) { - if (gc_atomic_forward_retry_busy(&fwd)) - break; - yield_for_spin(spin_count); - } - GC_ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); - // Fall through. - case GC_FORWARDING_STATE_FORWARDED: - // The object has been evacuated already. 
Update the edge; - // whoever forwarded the object will make sure it's eventually - // traced. - gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); - return 0; - } -} - -static inline int pcc_space_contains(struct pcc_space *space, - struct gc_ref ref) { - - for (size_t i = 0; i < space->nextents; i++) - if (space->extents[i].low_addr <= gc_ref_value(ref) && - gc_ref_value(ref) < space->extents[i].high_addr) - return 1; - return 0; + copy_space_allocator_finish(&data.allocator, heap_copy_space(heap)); } static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, @@ -497,8 +98,9 @@ static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_trace_worker_data *data) { if (!gc_ref_is_heap_object(ref)) return 0; - if (GC_LIKELY(pcc_space_contains(heap_pcc_space(heap), ref))) - return pcc_space_forward(heap_pcc_space(heap), edge, ref, data); + if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) + return copy_space_forward(heap_copy_space(heap), edge, ref, + &data->allocator); else if (large_object_space_contains(heap_large_object_space(heap), ref)) return large_object_space_mark_object(heap_large_object_space(heap), ref); else @@ -523,30 +125,10 @@ int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { struct gc_ref ref = gc_edge_ref(edge); if (!gc_ref_is_heap_object(ref)) return 0; - if (GC_LIKELY(pcc_space_contains(heap_pcc_space(heap), ref))) { - struct gc_atomic_forward fwd = gc_atomic_forward_begin(ref); - switch (fwd.state) { - case GC_FORWARDING_STATE_NOT_FORWARDED: - return 0; - case GC_FORWARDING_STATE_BUSY: - // Someone else claimed this object first. Spin until new address - // known. - for (size_t spin_count = 0;; spin_count++) { - if (gc_atomic_forward_retry_busy(&fwd)) - break; - yield_for_spin(spin_count); - } - GC_ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); - // Fall through. - case GC_FORWARDING_STATE_FORWARDED: - gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); - return 1; - default: - GC_CRASH(); - } - } else if (large_object_space_contains(heap_large_object_space(heap), ref)) { + if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) + return copy_space_forward_if_traced(heap_copy_space(heap), edge, ref); + if (large_object_space_contains(heap_large_object_space(heap), ref)) return large_object_space_is_copied(heap_large_object_space(heap), ref); - } GC_CRASH(); } @@ -571,6 +153,7 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = heap; mut->event_listener_data = heap->event_listener.mutator_added(heap->event_listener_data); + copy_space_allocator_init(&mut->allocator, heap_copy_space(heap)); heap_lock(heap); // We have no roots. If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. 
@@ -590,8 +173,7 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; - if (mut->allocator.block) - allocator_release_partly_full_block(&mut->allocator, heap_pcc_space(heap)); + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); heap_lock(heap); heap->mutator_count--; if (mut->next) @@ -627,7 +209,7 @@ static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { GC_ASSERT(npages <= previous); size_t bytes = (previous - npages) << heap_large_object_space(heap)->page_size_log2; - pcc_space_reacquire_memory(heap_pcc_space(heap), bytes); + copy_space_reacquire_memory(heap_copy_space(heap), bytes); } void gc_mutator_set_roots(struct gc_mutator *mut, @@ -654,8 +236,8 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, struct gc_trace_worker *worker) { #ifdef DEBUG - if (pcc_space_contains(heap_pcc_space(heap), ref)) - GC_ASSERT(pcc_object_region(ref) == heap_pcc_space(heap)->active_region); + if (copy_space_contains(heap_copy_space(heap), ref)) + GC_ASSERT(copy_space_object_region(ref) == heap_copy_space(heap)->active_region); #endif gc_trace_object(ref, tracer_visit, heap, worker, NULL); } @@ -726,8 +308,7 @@ static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); MUTATOR_EVENT(mut, mutator_stopping); - if (mut->allocator.block) - allocator_release_full_block(&mut->allocator, heap_pcc_space(heap)); + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); heap_lock(heap); pause_mutator_for_collection(heap, mut); heap_unlock(heap); @@ -794,7 +375,7 @@ static void sweep_ephemerons(struct gc_heap *heap) { static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; static void collect(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); - struct pcc_space *cspace = heap_pcc_space(heap); + struct copy_space *copy_space = heap_copy_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); struct gc_extern_space *exspace = heap_extern_space(heap); MUTATOR_EVENT(mut, mutator_cause_gc); @@ -808,7 +389,7 @@ static void collect(struct gc_mutator *mut) { HEAP_EVENT(heap, waiting_for_stop); wait_for_mutators_to_stop(heap); HEAP_EVENT(heap, mutators_stopped); - pcc_space_flip(cspace); + copy_space_flip(copy_space); gc_tracer_prepare(&heap->tracer); add_roots(heap); HEAP_EVENT(heap, roots_traced); @@ -821,18 +402,18 @@ static void collect(struct gc_mutator *mut) { HEAP_EVENT(heap, finalizers_traced); sweep_ephemerons(heap); gc_tracer_release(&heap->tracer); - pcc_space_finish_gc(cspace); + copy_space_finish_gc(copy_space); large_object_space_finish_gc(lospace, 0); gc_extern_space_finish_gc(exspace, 0); heap->count++; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); - size_t live_size = (cspace->live_bytes_at_last_gc + + size_t live_size = (copy_space->allocated_bytes_at_last_gc + large_object_space_size_at_last_collection(lospace)); HEAP_EVENT(heap, live_data_size, live_size); maybe_grow_heap(heap); - if (!pcc_space_page_out_blocks_until_memory_released(cspace)) { + if (!copy_space_page_out_blocks_until_memory_released(copy_space)) { fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", - heap->size, cspace->nslabs); + heap->size, 
copy_space->nslabs); GC_CRASH(); } HEAP_EVENT(heap, restarting_mutators); @@ -841,8 +422,7 @@ static void collect(struct gc_mutator *mut) { static void trigger_collection(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); - if (mut->allocator.block) - allocator_release_full_block(&mut->allocator, heap_pcc_space(heap)); + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); heap_lock(heap); long epoch = heap->count; while (mutators_are_stopping(heap)) @@ -862,9 +442,9 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { size_t npages = large_object_space_npages(space, size); - pcc_space_request_release_memory(heap_pcc_space(heap), + copy_space_request_release_memory(heap_copy_space(heap), npages << space->page_size_log2); - while (!pcc_space_page_out_blocks_until_memory_released(heap_pcc_space(heap))) + while (!copy_space_page_out_blocks_until_memory_released(heap_copy_space(heap))) trigger_collection(mut); atomic_fetch_add(&heap->large_object_pages, npages); @@ -890,10 +470,13 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { if (size > gc_allocator_large_threshold()) return allocate_large(mut, size); - return gc_ref_heap_object(allocate(&mut->allocator, - heap_pcc_space(mutator_heap(mut)), - size, get_more_empty_blocks_for_mutator, - mut)); + struct gc_ref ret = copy_space_allocate(&mut->allocator, + heap_copy_space(mutator_heap(mut)), + size, + get_more_empty_blocks_for_mutator, + mut); + gc_clear_fresh_allocation(ret, size); + return gc_ref_heap_object(ret); } void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { @@ -939,30 +522,6 @@ void gc_set_finalizer_callback(struct gc_heap *heap, gc_finalizer_state_set_callback(heap->finalizer_state, callback); } -static struct pcc_slab* allocate_slabs(size_t nslabs) { - size_t size = nslabs * SLAB_SIZE; - size_t extent = size + SLAB_SIZE; - - char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("mmap failed"); - return NULL; - } - - uintptr_t base = (uintptr_t) mem; - uintptr_t end = base + extent; - uintptr_t aligned_base = align_up(base, SLAB_SIZE); - uintptr_t aligned_end = aligned_base + size; - - if (aligned_base - base) - munmap((void*)base, aligned_base - base); - if (end - aligned_end) - munmap((void*)aligned_end, end - aligned_end); - - return (struct pcc_slab*) aligned_base; -} - static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { struct gc_pending_ephemerons *cur = heap->pending_ephemerons; size_t target = heap->size * heap->pending_ephemerons_size_factor; @@ -1024,45 +583,6 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { return 1; } -static int pcc_space_init(struct pcc_space *space, struct gc_heap *heap) { - size_t size = align_up(heap->size, SLAB_SIZE); - size_t nslabs = size / SLAB_SIZE; - struct pcc_slab *slabs = allocate_slabs(nslabs); - if (!slabs) - return 0; - - space->empty = NULL; - space->partly_full = NULL; - space->full = NULL; - space->full_block_count = 0; - space->paged_out = NULL; - space->fragmentation = 0; - space->bytes_to_page_out = 0; - space->active_region = 0; - space->live_bytes_at_last_gc = 0; - space->fragmentation_at_last_gc = 0; - space->extents = calloc(1, sizeof(struct pcc_extent)); - space->extents[0].low_addr = (uintptr_t) slabs; - space->extents[0].high_addr = space->extents[0].low_addr + size; - space->nextents = 1; - space->slabs = slabs; - space->nslabs = nslabs; - for (size_t slab = 0; slab < 
nslabs; slab++) { - for (size_t idx = 0; idx < NONHEADER_BLOCKS_PER_SLAB; idx++) { - struct pcc_block *block = &slabs[slab].headers[idx]; - if (size > heap->size) { - block->in_core = 0; - push_paged_out_block(space, block); - size -= BLOCK_SIZE; - } else { - block->in_core = 1; - push_empty_block(space, block); - } - } - } - return 1; -} - int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut, struct gc_event_listener event_listener, @@ -1071,9 +591,9 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_LARGE_OBJECT_THRESHOLD); GC_ASSERT_EQ(0, offsetof(struct gc_mutator, allocator)); GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), - offsetof(struct gc_allocator, hp)); + offsetof(struct copy_space_allocator, hp)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), - offsetof(struct gc_allocator, limit)); + offsetof(struct copy_space_allocator, limit)); if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { fprintf(stderr, "fixed heap size is currently required\n"); @@ -1090,8 +610,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, (*heap)->event_listener_data = event_listener_data; HEAP_EVENT(*heap, init, (*heap)->size); - struct pcc_space *space = heap_pcc_space(*heap); - if (!pcc_space_init(space, *heap)) { + struct copy_space *space = heap_copy_space(*heap); + if (!copy_space_init(space, (*heap)->size)) { free(*heap); *heap = NULL; return 0; @@ -1122,8 +642,7 @@ void gc_finish_for_thread(struct gc_mutator *mut) { static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); - if (mut->allocator.block) - allocator_release_partly_full_block(&mut->allocator, heap_pcc_space(heap)); + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); heap_lock(heap); heap->inactive_mutator_count++; if (all_mutators_stopped(heap)) From d3383ad911d7935daa518fa6c6ece6212a4abf39 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 5 Aug 2024 11:48:25 +0200 Subject: [PATCH 255/403] Bulk-zero copy-space blocks --- api/pcc-attrs.h | 2 +- src/copy-space.h | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index 7d589115f..5f80488a0 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -40,7 +40,7 @@ static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { } static inline int gc_allocator_needs_clear(void) { - return 1; + return 0; } static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { diff --git a/src/copy-space.h b/src/copy-space.h index 6f10a3f7b..fd6922b0f 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -51,6 +51,7 @@ struct copy_space_block { struct { struct copy_space_block *next; uint8_t in_core; + uint8_t all_zeroes[2]; size_t allocated; // For partly-empty blocks. 
}; uint8_t padding[COPY_SPACE_HEADER_BYTES_PER_BLOCK]; @@ -203,6 +204,7 @@ static void copy_space_page_out_block(struct copy_space *space, struct copy_space_block *block) { block->in_core = 0; + block->all_zeroes[0] = block->all_zeroes[1] = 1; madvise(copy_space_block_payload(block), COPY_SPACE_BLOCK_SIZE, MADV_DONTNEED); copy_space_push_paged_out_block(space, block); } @@ -270,9 +272,16 @@ copy_space_allocator_acquire_block(struct copy_space_allocator *alloc, static int copy_space_allocator_acquire_empty_block(struct copy_space_allocator *alloc, struct copy_space *space) { - return copy_space_allocator_acquire_block(alloc, - copy_space_pop_empty_block(space), - space->active_region); + if (copy_space_allocator_acquire_block(alloc, + copy_space_pop_empty_block(space), + space->active_region)) { + if (alloc->block->all_zeroes[space->active_region]) + alloc->block->all_zeroes[space->active_region] = 0; + else + memset((char*)alloc->hp, 0, COPY_SPACE_REGION_SIZE); + return 1; + } + return 0; } static int @@ -550,6 +559,7 @@ copy_space_init(struct copy_space *space, size_t size) { for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { struct copy_space_block *block = &slabs[slab].headers[idx]; + block->all_zeroes[0] = block->all_zeroes[1] = 1; if (reserved > size) { block->in_core = 0; copy_space_push_paged_out_block(space, block); From 0ee58abb46e791f9cca6faae053ac09c523a66bd Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 5 Aug 2024 11:56:46 +0200 Subject: [PATCH 256/403] Add gc_finalizer_priority_count to gc-embedder-api.h --- api/gc-embedder-api.h | 1 + 1 file changed, 1 insertion(+) diff --git a/api/gc-embedder-api.h b/api/gc-embedder-api.h index bb091caeb..b176d7bef 100644 --- a/api/gc-embedder-api.h +++ b/api/gc-embedder-api.h @@ -19,6 +19,7 @@ struct gc_heap; struct gc_extern_space; GC_EMBEDDER_API inline int gc_is_valid_conservative_ref_displacement(uintptr_t displacement); +GC_EMBEDDER_API inline size_t gc_finalizer_priority_count(void); GC_EMBEDDER_API inline int gc_extern_space_visit(struct gc_extern_space *space, struct gc_edge edge, From 37e57f8c8defd27a57464966eef68dca082dbb51 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 5 Aug 2024 14:41:04 +0200 Subject: [PATCH 257/403] Add serial copying collector --- Makefile | 4 + doc/collector-pcc.md | 59 ++-- doc/collector-scc.md | 62 ++++ doc/collectors.md | 22 +- embed.mk | 3 + src/copy-space.h | 35 +++ src/scc.c | 669 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 802 insertions(+), 52 deletions(-) create mode 100644 doc/collector-scc.md create mode 100644 src/scc.c diff --git a/Makefile b/Makefile index db5f1a7c2..56c7325c4 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ TESTS = quads mt-gcbench ephemerons finalizers COLLECTORS = \ bdw \ semi \ + scc \ pcc \ \ whippet \ @@ -63,6 +64,9 @@ GC_LIBS_bdw = `pkg-config --libs bdw-gc` GC_STEM_semi = semi GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 +GC_STEM_scc = scc +GC_CFLAGS_scc = -DGC_PRECISE_ROOTS=1 + GC_STEM_pcc = pcc GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 diff --git a/doc/collector-pcc.md b/doc/collector-pcc.md index a20e58e64..c79fb2aea 100644 --- a/doc/collector-pcc.md +++ b/doc/collector-pcc.md @@ -1,49 +1,23 @@ # Parallel copying collector -Whippet's `pcc` collector is a copying collector, like the more simple -[`semi`](./collector-semi.md), but supporting multiple mutator threads -and multiple tracing threads. 
+Whippet's `pcc` collector is a copying collector, exactly like +[`scc`](./collector-scc.md), but supporting multiple tracing threads. +See the discussion of `scc` for a general overview. -Like `semi`, `pcc` traces by evacuation: it moves all live objects on -every collection. (Exception: objects larger than 8192 bytes are -placed into a partitioned space which traces by marking in place instead -of copying.) Evacuation requires precise roots, so if your embedder -does not support precise roots, `pcc` is not for you. - -Again like `semi`, `pcc` generally requires a heap size at least twice -as large as the maximum live heap size, and performs best with ample -heap sizes; between 3× and 5× is best. +Also like `scc` and `semi`, `pcc` is not generational yet. If and when +`pcc` grows a young generation, it would be a great collector. ## Implementation notes -Unlike `semi` which has a single global bump-pointer allocation region, -`pcc` structures the heap into 64-kB blocks. In this way it supports -multiple mutator threads: mutators do local bump-pointer allocation into -their own block, and when their block is full, they fetch another from -the global store. - -The block size is 64 kB, but really it's 128 kB, because each block has -two halves: the active region and the copy reserve. Dividing each block -in two allows the collector to easily grow and shrink the heap while -ensuring there is always enough reserve space. - -Blocks are allocated in 64-MB aligned slabs, so there are 512 blocks in -a slab. The first block in a slab is used by the collector itself, to -keep metadata for the rest of the blocks, for example a chain pointer -allowing blocks to be collected in lists, a saved allocation pointer for -partially-filled blocks, whether the block is paged in or out, and so -on. - `pcc` supports tracing in parallel. This mechanism works somewhat like allocation, in which multiple trace workers compete to evacuate objects into their local allocation buffers; when an allocation buffer is full, the trace worker grabs another, just like mutators do. -However, unlike the simple semi-space collector which uses a Cheney grey -worklist, `pcc` uses the [fine-grained work-stealing parallel -tracer](../src/parallel-tracer.h) originally developed for [Whippet's -Immix-like collector](./collector-whippet.md). Each trace worker -maintains a [local queue of objects that need +To maintain a queue of objects to trace, `pcc` uses the [fine-grained +work-stealing parallel tracer](../src/parallel-tracer.h) originally +developed for [Whippet's Immix-like collector](./collector-whippet.md). +Each trace worker maintains a [local queue of objects that need tracing](../src/local-worklist.h), which currently has 1024 entries. If the local queue becomes full, the worker will publish 3/4 of those entries to the worker's [shared worklist](../src/shared-worklist.h). @@ -53,12 +27,11 @@ from its own shared worklist, then will try to steal from other workers. Because threads compete to evacuate objects, `pcc` uses [atomic compare-and-swap instead of simple forwarding pointer updates](./manual.md#forwarding-objects), which imposes around a ~30% -performance penalty. `pcc` generally starts to outperform `semi` when -it can trace with 2 threads, and gets better with each additional -thread. +performance penalty. `pcc` generally starts to outperform `scc` when it +can trace with 2 threads, and gets better with each additional thread. 
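To make the cost discussed above concrete, here is a minimal sketch of the two forwarding styles. The toy_* names and the zero-means-unforwarded header encoding are assumptions for illustration only; the real collectors use the gc_atomic_forward API that appears later in this series rather than a raw header word.

#include <stdatomic.h>
#include <stdint.h>

struct toy_obj {
  _Atomic uintptr_t header;  /* low bit set => rest is the forwarding address */
};

/* Serial tracing (scc): only one thread ever forwards an object, so an
   ordinary store of the tagged forwarding pointer suffices (a relaxed
   atomic store typically compiles to a plain store). */
static struct toy_obj* toy_forward_nonatomic(struct toy_obj *old,
                                             struct toy_obj *copy) {
  atomic_store_explicit(&old->header, (uintptr_t)copy | 1,
                        memory_order_relaxed);
  return copy;
}

/* Parallel tracing (pcc): workers race to forward the same object, so
   the forwarding word must be installed with compare-and-swap; a loser
   abandons its tentative copy and follows the winner's pointer. */
static struct toy_obj* toy_forward_atomic(struct toy_obj *old,
                                          struct toy_obj *copy) {
  uintptr_t expected = 0;                   /* 0: not yet forwarded */
  if (atomic_compare_exchange_strong(&old->header, &expected,
                                     (uintptr_t)copy | 1))
    return copy;                                       /* we won the race */
  return (struct toy_obj*)(expected & ~(uintptr_t)1);  /* follow the winner */
}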
-The memory used for the external worklist is dynamically allocated from -the OS and is not currently counted as contributing to the heap size. -If you are targetting a microcontroller or something, probably you need -to choose a different kind of collector that never dynamically -allocates, such as `semi`. +As with `scc`, the memory used for the external worklist is dynamically +allocated from the OS and is not currently counted as contributing to +the heap size. If you are targetting a microcontroller or something, +probably you need to choose a different kind of collector that never +dynamically allocates, such as `semi`. diff --git a/doc/collector-scc.md b/doc/collector-scc.md new file mode 100644 index 000000000..2512bb9fd --- /dev/null +++ b/doc/collector-scc.md @@ -0,0 +1,62 @@ +# Serial copying collector + +Whippet's `scc` collector is a copying collector, like the more simple +[`semi`](./collector-semi.md), but supporting multiple mutator threads, +and using an external FIFO worklist instead of a Cheney worklist. + +Like `semi`, `scc` traces by evacuation: it moves all live objects on +every collection. (Exception: objects larger than 8192 bytes are +placed into a partitioned space which traces by marking in place instead +of copying.) Evacuation requires precise roots, so if your embedder +does not support precise roots, `scc` is not for you. + +Again like `semi`, `scc` generally requires a heap size at least twice +as large as the maximum live heap size, and performs best with ample +heap sizes; between 3× and 5× is best. + +Overall, `scc` is most useful for isolating the performance implications +of using a block-structured heap and of using an external worklist +rather than a Cheney worklist as `semi` does. It also supports multiple +mutator threads, so it is generally more useful than `semi`. Also, +compared to `pcc`, we can measure the overhead that `pcc` imposes to +atomically forward objects. + +But given a choice, you probably want `pcc`; though it's slower with +only one tracing thread, once you have more than once tracing thread +it's a win over `scc`. + +## Implementation notes + +Unlike `semi` which has a single global bump-pointer allocation region, +`scc` structures the heap into 64-kB blocks. In this way it supports +multiple mutator threads: mutators do local bump-pointer allocation into +their own block, and when their block is full, they fetch another from +the global store. + +The block size is 64 kB, but really it's 128 kB, because each block has +two halves: the active region and the copy reserve. Dividing each block +in two allows the collector to easily grow and shrink the heap while +ensuring there is always enough reserve space. + +Blocks are allocated in 64-MB aligned slabs, so there are 512 blocks in +a slab. The first block in a slab is used by the collector itself, to +keep metadata for the rest of the blocks, for example a chain pointer +allowing blocks to be collected in lists, a saved allocation pointer for +partially-filled blocks, whether the block is paged in or out, and so +on. + +Unlike the simple semi-space collector which uses a Cheney grey +worklist, `scc` uses a [simple first-in, first-out queue of objects to +be traced](../src/simple-worklist.h) originally developed for [Whippet's +Immix-like collector](./collector-whippet.md). Like a Cheney worklist, +this should result in objects being copied in breadth-first order. 
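For contrast with the external FIFO described above: a Cheney collector needs no auxiliary queue at all, because to-space itself is the FIFO, bounded by a scan pointer that chases the allocation pointer. A toy, self-contained illustration, assuming hypothetical two-field toy_pair objects (bounds checks and real from-space management omitted):

#include <stddef.h>

/* A toy object: two child pointers plus a forwarding slot. */
struct toy_pair { struct toy_pair *fwd, *a, *b; };

#define TOY_HEAP_OBJECTS 1024
static struct toy_pair tospace[TOY_HEAP_OBJECTS];

static struct toy_pair* toy_copy(struct toy_pair *obj, size_t *alloc) {
  if (!obj) return NULL;
  if (obj->fwd) return obj->fwd;            /* already evacuated */
  struct toy_pair *copy = &tospace[(*alloc)++];
  *copy = *obj;
  copy->fwd = NULL;
  obj->fwd = copy;
  return copy;
}

/* Cheney scan: every object between scan and alloc still has untraced
   fields; tracing them advances alloc, so copying is breadth-first. */
static size_t toy_cheney(struct toy_pair **roots, size_t nroots) {
  size_t scan = 0, alloc = 0;
  for (size_t i = 0; i < nroots; i++)
    roots[i] = toy_copy(roots[i], &alloc);
  while (scan < alloc) {
    struct toy_pair *obj = &tospace[scan++];
    obj->a = toy_copy(obj->a, &alloc);
    obj->b = toy_copy(obj->b, &alloc);
  }
  return alloc;                             /* number of live objects */
}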
The +literature would suggest that depth-first is generally better for +locality, but that preserving allocation order is generally best. This +is something to experiment with in the future. + +The memory used for the external worklist is dynamically allocated from +the OS and is not currently counted as contributing to the heap size. +If you are targetting a microcontroller or something, probably you need +to choose a different kind of collector that never dynamically +allocates, such as `semi`. + diff --git a/doc/collectors.md b/doc/collectors.md index 81af46e59..c6fced97f 100644 --- a/doc/collectors.md +++ b/doc/collectors.md @@ -1,15 +1,17 @@ # Whippet collectors -Whippet has four collectors currently: - - [Semi-space collector (semi)](./collector-semi.md): For +Whippet has five collectors currently: + - [Semi-space collector (`semi`)](./collector-semi.md): For single-threaded embedders who are not too tight on memory. - - [Parallel copying collector (pcc)](./collector-pcc.md): Like semi, - but with support for multiple mutator threads. Faster than semi if + - [Serial copying collector (`scc`)](./collector-scc.md): Like `semi`, + but with support for multiple mutator threads. + - [Parallel copying collector (`pcc`)](./collector-pcc.md): Like `scc`, + but with support for multiple tracing threads. Faster than `scc` if multiple cores are available at collection-time. - - [Whippet collector (whippet)](./collector-whippet.md): + - [Whippet collector (`whippet`)](./collector-whippet.md): Immix-inspired collector. Optionally parallel, conservative (stack and/or heap), and/or generational. - - [Boehm-Demers-Weiser collector (bdw)](./collector-bdw.md): + - [Boehm-Demers-Weiser collector (`bdw`)](./collector-bdw.md): Conservative mark-sweep collector, implemented by Boehm-Demers-Weiser library. @@ -18,11 +20,13 @@ Whippet has four collectors currently: If you are migrating an embedder off BDW-GC, then it could be reasonable to first go to `bdw`, then `stack-conservative-parallel-whippet`. -If you have an embedder with precise roots, use `semi` if -single-threaded, or `pcc` if multi-threaded. That will shake out -mutator/embedder bugs. Then if memory is tight, switch to +If you have an embedder with precise roots, use `pcc`. That will shake +out mutator/embedder bugs. Then if memory is tight, switch to `parallel-whippet`, possibly `parallel-generational-whippet`. +If you are aiming for maximum simplicity and minimal code size (ten +kilobytes or so), use `semi`. + If you are writing a new project, you have a choice as to whether to pay the development cost of precise roots or not. 
If you choose to not have precise roots, then go for `stack-conservative-parallel-whippet` diff --git a/embed.mk b/embed.mk index 9284781e0..020cb10d3 100644 --- a/embed.mk +++ b/embed.mk @@ -42,6 +42,9 @@ GC_LIBS_bdw = `pkg-config --libs bdw-gc` GC_STEM_semi = semi GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 +GC_STEM_scc = scc +GC_CFLAGS_scc = -DGC_PRECISE_ROOTS=1 + GC_STEM_pcc = pcc GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 diff --git a/src/copy-space.h b/src/copy-space.h index fd6922b0f..2d6b2f246 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -484,6 +484,41 @@ copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge, } } +static inline int +copy_space_forward_nonatomic(struct copy_space *space, struct gc_edge edge, + struct gc_ref old_ref, struct copy_space_allocator *alloc) { + GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); + + uintptr_t forwarded = gc_object_forwarded_nonatomic(old_ref); + if (forwarded) { + gc_edge_update(edge, gc_ref(forwarded)); + return 0; + } else { + size_t size; + gc_trace_object(old_ref, NULL, NULL, NULL, &size); + struct gc_ref new_ref = + copy_space_allocate(alloc, space, size, + copy_space_gc_during_evacuation, NULL); + memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), size); + gc_object_forward_nonatomic(old_ref, new_ref); + gc_edge_update(edge, new_ref); + return 1; + } +} + +static int +copy_space_forward_if_traced_nonatomic(struct copy_space *space, + struct gc_edge edge, + struct gc_ref old_ref) { + GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); + uintptr_t forwarded = gc_object_forwarded_nonatomic(old_ref); + if (forwarded) { + gc_edge_update(edge, gc_ref(forwarded)); + return 1; + } + return 0; +} + static inline int copy_space_contains(struct copy_space *space, struct gc_ref ref) { for (size_t i = 0; i < space->nextents; i++) diff --git a/src/scc.c b/src/scc.c new file mode 100644 index 000000000..28dcef0d2 --- /dev/null +++ b/src/scc.c @@ -0,0 +1,669 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gc-api.h" + +#define GC_IMPL 1 +#include "gc-internal.h" + +#include "copy-space.h" +#include "debug.h" +#include "gc-align.h" +#include "gc-inline.h" +#include "gc-trace.h" +#include "large-object-space.h" +#include "serial-tracer.h" +#include "spin.h" +#include "scc-attrs.h" + +struct gc_heap { + struct copy_space copy_space; + struct large_object_space large_object_space; + struct gc_extern_space *extern_space; + size_t large_object_pages; + pthread_mutex_t lock; + pthread_cond_t collector_cond; + pthread_cond_t mutator_cond; + size_t size; + int collecting; + int check_pending_ephemerons; + struct gc_pending_ephemerons *pending_ephemerons; + struct gc_finalizer_state *finalizer_state; + size_t mutator_count; + size_t paused_mutator_count; + size_t inactive_mutator_count; + struct gc_heap_roots *roots; + struct gc_mutator *mutators; + long count; + struct gc_tracer tracer; + double pending_ephemerons_size_factor; + double pending_ephemerons_size_slop; + struct gc_event_listener event_listener; + void *event_listener_data; +}; + +#define HEAP_EVENT(heap, event, ...) \ + (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__) +#define MUTATOR_EVENT(mut, event, ...) 
\ + (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) + +struct gc_mutator { + struct copy_space_allocator allocator; + struct gc_heap *heap; + struct gc_mutator_roots *roots; + void *event_listener_data; + struct gc_mutator *next; + struct gc_mutator *prev; +}; + +struct gc_trace_worker_data { + struct copy_space_allocator allocator; +}; + +static inline struct copy_space* heap_copy_space(struct gc_heap *heap) { + return &heap->copy_space; +} +static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { + return &heap->large_object_space; +} +static inline struct gc_extern_space* heap_extern_space(struct gc_heap *heap) { + return heap->extern_space; +} +static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) { + return mutator->heap; +} + +static void +gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, + struct gc_heap *heap, + struct gc_trace_worker *worker, + struct gc_trace_worker_data *data), + struct gc_tracer *tracer, + struct gc_heap *heap, + struct gc_trace_worker *worker) { + struct gc_trace_worker_data data; + copy_space_allocator_init(&data.allocator, heap_copy_space(heap)); + f(tracer, heap, worker, &data); + copy_space_allocator_finish(&data.allocator, heap_copy_space(heap)); +} + +static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, + struct gc_ref ref, + struct gc_trace_worker_data *data) { + if (!gc_ref_is_heap_object(ref)) + return 0; + if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) + return copy_space_forward_nonatomic(heap_copy_space(heap), edge, ref, + &data->allocator); + else if (large_object_space_contains(heap_large_object_space(heap), ref)) + return large_object_space_mark_object(heap_large_object_space(heap), ref); + else + return gc_extern_space_visit(heap_extern_space(heap), edge, ref); +} + +static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge, + struct gc_trace_worker *worker) { + struct gc_ref ref = gc_edge_ref(edge); + struct gc_trace_worker_data *data = gc_trace_worker_data(worker); + int is_new = do_trace(heap, edge, ref, data); + + if (is_new && heap->check_pending_ephemerons) + gc_resolve_pending_ephemerons(ref, heap); + + return is_new; +} + +int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { + struct gc_ref ref = gc_edge_ref(edge); + if (!gc_ref_is_heap_object(ref)) + return 0; + if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) + return copy_space_forward_if_traced_nonatomic(heap_copy_space(heap), edge, + ref); + if (large_object_space_contains(heap_large_object_space(heap), ref)) + return large_object_space_is_copied(heap_large_object_space(heap), ref); + GC_CRASH(); +} + +static int mutators_are_stopping(struct gc_heap *heap) { + return atomic_load_explicit(&heap->collecting, memory_order_relaxed); +} + +static inline void heap_lock(struct gc_heap *heap) { + pthread_mutex_lock(&heap->lock); +} +static inline void heap_unlock(struct gc_heap *heap) { + pthread_mutex_unlock(&heap->lock); +} + +// with heap lock +static inline int all_mutators_stopped(struct gc_heap *heap) { + return heap->mutator_count == + heap->paused_mutator_count + heap->inactive_mutator_count; +} + +static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + mut->heap = heap; + mut->event_listener_data = + heap->event_listener.mutator_added(heap->event_listener_data); + copy_space_allocator_init(&mut->allocator, heap_copy_space(heap)); + heap_lock(heap); + // We have no roots. 
If there is a GC currently in progress, we have + // nothing to add. Just wait until it's done. + while (mutators_are_stopping(heap)) + pthread_cond_wait(&heap->mutator_cond, &heap->lock); + mut->next = mut->prev = NULL; + struct gc_mutator *tail = heap->mutators; + if (tail) { + mut->next = tail; + tail->prev = mut; + } + heap->mutators = mut; + heap->mutator_count++; + heap_unlock(heap); +} + +static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + MUTATOR_EVENT(mut, mutator_removed); + mut->heap = NULL; + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); + heap_lock(heap); + heap->mutator_count--; + if (mut->next) + mut->next->prev = mut->prev; + if (mut->prev) + mut->prev->next = mut->next; + else + heap->mutators = mut->next; + // We have no roots. If there is a GC stop currently in progress, + // maybe tell the controller it can continue. + if (mutators_are_stopping(heap) && all_mutators_stopped(heap)) + pthread_cond_signal(&heap->collector_cond); + heap_unlock(heap); +} + +static void request_mutators_to_stop(struct gc_heap *heap) { + GC_ASSERT(!mutators_are_stopping(heap)); + atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); +} + +static void allow_mutators_to_continue(struct gc_heap *heap) { + GC_ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(all_mutators_stopped(heap)); + heap->paused_mutator_count--; + atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); + GC_ASSERT(!mutators_are_stopping(heap)); + pthread_cond_broadcast(&heap->mutator_cond); +} + +static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { + size_t previous = heap->large_object_pages; + heap->large_object_pages = npages; + GC_ASSERT(npages <= previous); + size_t bytes = (previous - npages) << + heap_large_object_space(heap)->page_size_log2; + copy_space_reacquire_memory(heap_copy_space(heap), bytes); +} + +void gc_mutator_set_roots(struct gc_mutator *mut, + struct gc_mutator_roots *roots) { + mut->roots = roots; +} +void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { + heap->roots = roots; +} +void gc_heap_set_extern_space(struct gc_heap *heap, + struct gc_extern_space *space) { + heap->extern_space = space; +} + +static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, + void *trace_data) GC_ALWAYS_INLINE; +static inline void +tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { + struct gc_trace_worker *worker = trace_data; + if (trace_edge(heap, edge, worker)) + gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); +} + +static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, + struct gc_trace_worker *worker) { +#ifdef DEBUG + if (copy_space_contains(heap_copy_space(heap), ref)) + GC_ASSERT(copy_space_object_region(ref) == heap_copy_space(heap)->active_region); +#endif + gc_trace_object(ref, tracer_visit, heap, worker, NULL); +} + +static inline void trace_root(struct gc_root root, struct gc_heap *heap, + struct gc_trace_worker *worker) { + switch (root.kind) { + case GC_ROOT_KIND_HEAP: + gc_trace_heap_roots(root.heap->roots, tracer_visit, heap, worker); + break; + case GC_ROOT_KIND_MUTATOR: + gc_trace_mutator_roots(root.mutator->roots, tracer_visit, heap, worker); + break; + case GC_ROOT_KIND_RESOLVED_EPHEMERONS: + gc_trace_resolved_ephemerons(root.resolved_ephemerons, tracer_visit, + heap, worker); + break; + case GC_ROOT_KIND_EDGE: + tracer_visit(root.edge, heap, worker); + break; + default: + GC_CRASH(); + } +} + +static void 
wait_for_mutators_to_stop(struct gc_heap *heap) { + heap->paused_mutator_count++; + while (!all_mutators_stopped(heap)) + pthread_cond_wait(&heap->collector_cond, &heap->lock); +} + +void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) { +} + +static void +pause_mutator_for_collection(struct gc_heap *heap, + struct gc_mutator *mut) GC_NEVER_INLINE; +static void +pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { + GC_ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(!all_mutators_stopped(heap)); + MUTATOR_EVENT(mut, mutator_stopped); + heap->paused_mutator_count++; + if (all_mutators_stopped(heap)) + pthread_cond_signal(&heap->collector_cond); + + do { + pthread_cond_wait(&heap->mutator_cond, &heap->lock); + } while (mutators_are_stopping(heap)); + heap->paused_mutator_count--; + + MUTATOR_EVENT(mut, mutator_restarted); +} + +static void +pause_mutator_for_collection_with_lock(struct gc_mutator *mut) GC_NEVER_INLINE; +static void +pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + GC_ASSERT(mutators_are_stopping(heap)); + MUTATOR_EVENT(mut, mutator_stopping); + pause_mutator_for_collection(heap, mut); +} + +static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) GC_NEVER_INLINE; +static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + GC_ASSERT(mutators_are_stopping(heap)); + MUTATOR_EVENT(mut, mutator_stopping); + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); + heap_lock(heap); + pause_mutator_for_collection(heap, mut); + heap_unlock(heap); +} + +static inline void maybe_pause_mutator_for_collection(struct gc_mutator *mut) { + while (mutators_are_stopping(mutator_heap(mut))) + pause_mutator_for_collection_without_lock(mut); +} + +static int maybe_grow_heap(struct gc_heap *heap) { + return 0; +} + +static void visit_root_edge(struct gc_edge edge, struct gc_heap *heap, + void *unused) { + gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); +} + +static void add_roots(struct gc_heap *heap) { + for (struct gc_mutator *mut = heap->mutators; mut; mut = mut->next) + gc_tracer_add_root(&heap->tracer, gc_root_mutator(mut)); + gc_tracer_add_root(&heap->tracer, gc_root_heap(heap)); + gc_visit_finalizer_roots(heap->finalizer_state, visit_root_edge, heap, NULL); +} + +static void resolve_ephemerons_lazily(struct gc_heap *heap) { + heap->check_pending_ephemerons = 0; +} + +static void resolve_ephemerons_eagerly(struct gc_heap *heap) { + heap->check_pending_ephemerons = 1; + gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); +} + +static void trace_resolved_ephemerons(struct gc_heap *heap) { + for (struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); + resolved; + resolved = gc_pop_resolved_ephemerons(heap)) { + gc_tracer_add_root(&heap->tracer, gc_root_resolved_ephemerons(resolved)); + gc_tracer_trace(&heap->tracer); + } +} + +static void resolve_finalizers(struct gc_heap *heap) { + for (size_t priority = 0; + priority < gc_finalizer_priority_count(); + priority++) { + if (gc_resolve_finalizers(heap->finalizer_state, priority, + visit_root_edge, heap, NULL)) { + gc_tracer_trace(&heap->tracer); + trace_resolved_ephemerons(heap); + } + } + gc_notify_finalizers(heap->finalizer_state, heap); +} + +static void sweep_ephemerons(struct gc_heap *heap) { + return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); +} + 
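The stop-the-world handshake implemented by the functions above boils down to one mutex, two condition variables, and a count of parked mutators, with the thread that triggers collection playing the collector role. A self-contained sketch of that protocol with hypothetical toy_* names (inactive mutators, events and allocator teardown are omitted):

#include <pthread.h>
#include <stddef.h>

struct toy_heap {
  pthread_mutex_t lock;
  pthread_cond_t collector_cond, mutator_cond;
  int collecting;
  size_t mutator_count, paused_count;
};

/* Run by the mutator that triggered collection: raise the flag, count
   itself as parked, and wait for the rest before doing the GC work. */
static void toy_collect(struct toy_heap *h) {
  pthread_mutex_lock(&h->lock);
  h->collecting = 1;
  h->paused_count++;
  while (h->paused_count < h->mutator_count)
    pthread_cond_wait(&h->collector_cond, &h->lock);
  /* ... flip spaces, trace roots, sweep ... */
  h->paused_count--;
  h->collecting = 0;
  pthread_cond_broadcast(&h->mutator_cond);
  pthread_mutex_unlock(&h->lock);
}

/* Other mutators land here from their safepoints while the flag is set. */
static void toy_pause_for_collection(struct toy_heap *h) {
  pthread_mutex_lock(&h->lock);
  h->paused_count++;
  if (h->paused_count == h->mutator_count)
    pthread_cond_signal(&h->collector_cond);
  while (h->collecting)
    pthread_cond_wait(&h->mutator_cond, &h->lock);
  h->paused_count--;
  pthread_mutex_unlock(&h->lock);
}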
+static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; +static void collect(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + struct copy_space *copy_space = heap_copy_space(heap); + struct large_object_space *lospace = heap_large_object_space(heap); + struct gc_extern_space *exspace = heap_extern_space(heap); + MUTATOR_EVENT(mut, mutator_cause_gc); + DEBUG("start collect #%ld:\n", heap->count); + HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); + large_object_space_start_gc(lospace, 0); + gc_extern_space_start_gc(exspace, 0); + resolve_ephemerons_lazily(heap); + HEAP_EVENT(heap, requesting_stop); + request_mutators_to_stop(heap); + HEAP_EVENT(heap, waiting_for_stop); + wait_for_mutators_to_stop(heap); + HEAP_EVENT(heap, mutators_stopped); + copy_space_flip(copy_space); + gc_tracer_prepare(&heap->tracer); + add_roots(heap); + HEAP_EVENT(heap, roots_traced); + gc_tracer_trace(&heap->tracer); + HEAP_EVENT(heap, heap_traced); + resolve_ephemerons_eagerly(heap); + trace_resolved_ephemerons(heap); + HEAP_EVENT(heap, ephemerons_traced); + resolve_finalizers(heap); + HEAP_EVENT(heap, finalizers_traced); + sweep_ephemerons(heap); + gc_tracer_release(&heap->tracer); + copy_space_finish_gc(copy_space); + large_object_space_finish_gc(lospace, 0); + gc_extern_space_finish_gc(exspace, 0); + heap->count++; + heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); + size_t live_size = (copy_space->allocated_bytes_at_last_gc + + large_object_space_size_at_last_collection(lospace)); + HEAP_EVENT(heap, live_data_size, live_size); + maybe_grow_heap(heap); + if (!copy_space_page_out_blocks_until_memory_released(copy_space)) { + fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", + heap->size, copy_space->nslabs); + GC_CRASH(); + } + HEAP_EVENT(heap, restarting_mutators); + allow_mutators_to_continue(heap); +} + +static void trigger_collection(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); + heap_lock(heap); + long epoch = heap->count; + while (mutators_are_stopping(heap)) + pause_mutator_for_collection_with_lock(mut); + if (epoch == heap->count) + collect(mut); + heap_unlock(heap); +} + +void gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { + trigger_collection(mut); +} + +static void* allocate_large(struct gc_mutator *mut, size_t size) { + struct gc_heap *heap = mutator_heap(mut); + struct large_object_space *space = heap_large_object_space(heap); + + size_t npages = large_object_space_npages(space, size); + + copy_space_request_release_memory(heap_copy_space(heap), + npages << space->page_size_log2); + while (!copy_space_page_out_blocks_until_memory_released(heap_copy_space(heap))) + trigger_collection(mut); + atomic_fetch_add(&heap->large_object_pages, npages); + + void *ret = large_object_space_alloc(space, npages); + if (!ret) + ret = large_object_space_obtain_and_alloc(space, npages); + + if (!ret) { + perror("weird: we have the space but mmap didn't work"); + GC_CRASH(); + } + + return ret; +} + +static void get_more_empty_blocks_for_mutator(void *mut) { + trigger_collection(mut); +} + +void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { + GC_ASSERT(size > 0); // allocating 0 bytes would be silly + + if (size > gc_allocator_large_threshold()) + return allocate_large(mut, size); + + struct gc_ref ret = copy_space_allocate(&mut->allocator, + heap_copy_space(mutator_heap(mut)), + size, + 
get_more_empty_blocks_for_mutator, + mut); + gc_clear_fresh_allocation(ret, size); + return gc_ref_heap_object(ret); +} + +void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { + return gc_allocate(mut, size); +} + +struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { + return gc_allocate(mut, gc_ephemeron_size()); +} + +void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, + struct gc_ref key, struct gc_ref value) { + gc_ephemeron_init_internal(mutator_heap(mut), ephemeron, key, value); +} + +struct gc_pending_ephemerons *gc_heap_pending_ephemerons(struct gc_heap *heap) { + return heap->pending_ephemerons; +} + +unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { + return heap->count; +} + +struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { + return gc_allocate(mut, gc_finalizer_size()); +} + +void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, + unsigned priority, struct gc_ref object, + struct gc_ref closure) { + gc_finalizer_init_internal(finalizer, object, closure); + gc_finalizer_attach_internal(mutator_heap(mut)->finalizer_state, + finalizer, priority); + // No write barrier. +} + +struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut) { + return gc_finalizer_state_pop(mutator_heap(mut)->finalizer_state); +} + +void gc_set_finalizer_callback(struct gc_heap *heap, + gc_finalizer_callback callback) { + gc_finalizer_state_set_callback(heap->finalizer_state, callback); +} + +static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { + struct gc_pending_ephemerons *cur = heap->pending_ephemerons; + size_t target = heap->size * heap->pending_ephemerons_size_factor; + double slop = heap->pending_ephemerons_size_slop; + + heap->pending_ephemerons = gc_prepare_pending_ephemerons(cur, target, slop); + + return !!heap->pending_ephemerons; +} + +struct gc_options { + struct gc_common_options common; +}; +int gc_option_from_string(const char *str) { + return gc_common_option_from_string(str); +} +struct gc_options* gc_allocate_options(void) { + struct gc_options *ret = malloc(sizeof(struct gc_options)); + gc_init_common_options(&ret->common); + return ret; +} +int gc_options_set_int(struct gc_options *options, int option, int value) { + return gc_common_options_set_int(&options->common, option, value); +} +int gc_options_set_size(struct gc_options *options, int option, + size_t value) { + return gc_common_options_set_size(&options->common, option, value); +} +int gc_options_set_double(struct gc_options *options, int option, + double value) { + return gc_common_options_set_double(&options->common, option, value); +} +int gc_options_parse_and_set(struct gc_options *options, int option, + const char *value) { + return gc_common_options_parse_and_set(&options->common, option, value); +} + +static int heap_init(struct gc_heap *heap, const struct gc_options *options) { + // *heap is already initialized to 0. 
+ + pthread_mutex_init(&heap->lock, NULL); + pthread_cond_init(&heap->mutator_cond, NULL); + pthread_cond_init(&heap->collector_cond, NULL); + heap->size = options->common.heap_size; + + if (options->common.parallelism != 1) + fprintf(stderr, "warning: parallelism unimplemented in semispace copying collector\n"); + + if (!gc_tracer_init(&heap->tracer, heap, 1)) + GC_CRASH(); + + heap->pending_ephemerons_size_factor = 0.005; + heap->pending_ephemerons_size_slop = 0.5; + + if (!heap_prepare_pending_ephemerons(heap)) + GC_CRASH(); + + heap->finalizer_state = gc_make_finalizer_state(); + if (!heap->finalizer_state) + GC_CRASH(); + + return 1; +} + +int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, + struct gc_heap **heap, struct gc_mutator **mut, + struct gc_event_listener event_listener, + void *event_listener_data) { + GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_ALIGNMENT); + GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_LARGE_OBJECT_THRESHOLD); + GC_ASSERT_EQ(0, offsetof(struct gc_mutator, allocator)); + GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), + offsetof(struct copy_space_allocator, hp)); + GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), + offsetof(struct copy_space_allocator, limit)); + + if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { + fprintf(stderr, "fixed heap size is currently required\n"); + return 0; + } + + *heap = calloc(1, sizeof(struct gc_heap)); + if (!*heap) GC_CRASH(); + + if (!heap_init(*heap, options)) + GC_CRASH(); + + (*heap)->event_listener = event_listener; + (*heap)->event_listener_data = event_listener_data; + HEAP_EVENT(*heap, init, (*heap)->size); + + struct copy_space *space = heap_copy_space(*heap); + if (!copy_space_init(space, (*heap)->size)) { + free(*heap); + *heap = NULL; + return 0; + } + + if (!large_object_space_init(heap_large_object_space(*heap), *heap)) + GC_CRASH(); + + *mut = calloc(1, sizeof(struct gc_mutator)); + if (!*mut) GC_CRASH(); + add_mutator(*heap, *mut); + return 1; +} + +struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, + struct gc_heap *heap) { + struct gc_mutator *ret = calloc(1, sizeof(struct gc_mutator)); + if (!ret) + GC_CRASH(); + add_mutator(heap, ret); + return ret; +} + +void gc_finish_for_thread(struct gc_mutator *mut) { + remove_mutator(mutator_heap(mut), mut); + free(mut); +} + +static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + GC_ASSERT(mut->next == NULL); + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); + heap_lock(heap); + heap->inactive_mutator_count++; + if (all_mutators_stopped(heap)) + pthread_cond_signal(&heap->collector_cond); + heap_unlock(heap); +} + +static void reactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + heap_lock(heap); + while (mutators_are_stopping(heap)) + pthread_cond_wait(&heap->mutator_cond, &heap->lock); + heap->inactive_mutator_count--; + heap_unlock(heap); +} + +void* gc_call_without_gc(struct gc_mutator *mut, + void* (*f)(void*), + void *data) { + struct gc_heap *heap = mutator_heap(mut); + deactivate_mutator(heap, mut); + void *ret = f(data); + reactivate_mutator(heap, mut); + return ret; +} From 1925f84d3aa73f352cc2afbc508639736af738d6 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 5 Aug 2024 14:44:34 +0200 Subject: [PATCH 258/403] Add missing file --- api/scc-attrs.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 api/scc-attrs.h diff --git a/api/scc-attrs.h 
b/api/scc-attrs.h new file mode 100644 index 000000000..4db408cad --- /dev/null +++ b/api/scc-attrs.h @@ -0,0 +1,60 @@ +#ifndef SCC_ATTRS_H +#define SCC_ATTRS_H + +#include "gc-config.h" +#include "gc-assert.h" +#include "gc-attrs.h" + +static const uintptr_t GC_ALIGNMENT = 8; +static const size_t GC_LARGE_OBJECT_THRESHOLD = 8192; + +static inline enum gc_allocator_kind gc_allocator_kind(void) { + return GC_ALLOCATOR_INLINE_BUMP_POINTER; +} +static inline size_t gc_allocator_small_granule_size(void) { + return GC_ALIGNMENT; +} +static inline size_t gc_allocator_large_threshold(void) { + return GC_LARGE_OBJECT_THRESHOLD; +} + +static inline size_t gc_allocator_allocation_pointer_offset(void) { + return sizeof(uintptr_t) * 0; +} +static inline size_t gc_allocator_allocation_limit_offset(void) { + return sizeof(uintptr_t) * 1; +} + +static inline size_t gc_allocator_freelist_offset(size_t size) { + GC_CRASH(); +} + +static inline size_t gc_allocator_alloc_table_alignment(void) { + return 0; +} +static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { + GC_CRASH(); +} +static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { + GC_CRASH(); +} + +static inline int gc_allocator_needs_clear(void) { + return 0; +} + +static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { + return GC_WRITE_BARRIER_NONE; +} +static inline size_t gc_write_barrier_card_table_alignment(void) { + GC_CRASH(); +} +static inline size_t gc_write_barrier_card_size(void) { + GC_CRASH(); +} + +static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { + return GC_SAFEPOINT_MECHANISM_COOPERATIVE; +} + +#endif // SCC_ATTRS_H From 12eb0e0c42dc8c689b7bf028f18a011f061d5e60 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 5 Aug 2024 14:47:32 +0200 Subject: [PATCH 259/403] Update documentation --- doc/README.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/doc/README.md b/doc/README.md index fc5348ddb..c14a537fe 100644 --- a/doc/README.md +++ b/doc/README.md @@ -7,17 +7,6 @@ implementations of the Whippet API with differing performance characteristics and which impose different requirements on the embedder. - - [Semi-space collector (semi)](./collector-semi.md): For - single-threaded embedders who are not too tight on memory. - - [Parallel copying collector (pcc)](./collector-pcc.md): Like semi, - but with support for multiple mutator threads. Faster than semi if - multiple cores are available at collection-time. - - [Whippet collector (whippet)](./collector-whippet.md): - Immix-inspired collector. Optionally parallel, conservative (stack - and/or heap), and/or generational. - - [Boehm-Demers-Weiser collector (bdw)](./collector-bdw.md): - Conservative mark-sweep collector, implemented by - Boehm-Demers-Weiser library. * [Guile](./doc/guile.md): Some notes on a potential rebase of Guile on top of Whippet. From e40b224fafa6dd99c65552024249ea1410b783de Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 5 Aug 2024 15:03:24 +0200 Subject: [PATCH 260/403] Attempt to dynamically choose whether to atomically forward --- src/copy-space.h | 34 ++++++++++++++++++++++++++++------ src/pcc.c | 3 ++- src/scc.c | 3 ++- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index 2d6b2f246..a863ca51a 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -116,6 +116,7 @@ struct copy_space { // The rest of these members are only changed rarely and with the heap // lock. 
uint8_t active_region ALIGNED_TO_AVOID_FALSE_SHARING; + uint8_t atomic_forward; size_t allocated_bytes_at_last_gc; size_t fragmentation_at_last_gc; struct copy_space_extent *extents; @@ -411,8 +412,9 @@ copy_space_gc_during_evacuation(void *data) { } static inline int -copy_space_forward(struct copy_space *space, struct gc_edge edge, - struct gc_ref old_ref, struct copy_space_allocator *alloc) { +copy_space_forward_atomic(struct copy_space *space, struct gc_edge edge, + struct gc_ref old_ref, + struct copy_space_allocator *alloc) { GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); @@ -459,8 +461,9 @@ copy_space_forward(struct copy_space *space, struct gc_edge edge, } static int -copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge, - struct gc_ref old_ref) { +copy_space_forward_if_traced_atomic(struct copy_space *space, + struct gc_edge edge, + struct gc_ref old_ref) { GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); switch (fwd.state) { @@ -486,7 +489,8 @@ copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge, static inline int copy_space_forward_nonatomic(struct copy_space *space, struct gc_edge edge, - struct gc_ref old_ref, struct copy_space_allocator *alloc) { + struct gc_ref old_ref, + struct copy_space_allocator *alloc) { GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); uintptr_t forwarded = gc_object_forwarded_nonatomic(old_ref); @@ -519,6 +523,23 @@ copy_space_forward_if_traced_nonatomic(struct copy_space *space, return 0; } +static inline int +copy_space_forward(struct copy_space *space, struct gc_edge edge, + struct gc_ref old_ref, + struct copy_space_allocator *alloc) { + if (space->atomic_forward) + return copy_space_forward_atomic(space, edge, old_ref, alloc); + return copy_space_forward_nonatomic(space, edge, old_ref, alloc); +} + +static inline int +copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge, + struct gc_ref old_ref) { + if (space->atomic_forward) + return copy_space_forward_if_traced_atomic(space, edge, old_ref); + return copy_space_forward_if_traced_nonatomic(space, edge, old_ref); +} + static inline int copy_space_contains(struct copy_space *space, struct gc_ref ref) { for (size_t i = 0; i < space->nextents; i++) @@ -567,7 +588,7 @@ copy_space_allocate_slabs(size_t nslabs) { } static int -copy_space_init(struct copy_space *space, size_t size) { +copy_space_init(struct copy_space *space, size_t size, int atomic) { size = align_up(size, COPY_SPACE_BLOCK_SIZE); size_t reserved = align_up(size, COPY_SPACE_SLAB_SIZE); size_t nslabs = reserved / COPY_SPACE_SLAB_SIZE; @@ -583,6 +604,7 @@ copy_space_init(struct copy_space *space, size_t size) { space->fragmentation = 0; space->bytes_to_page_out = 0; space->active_region = 0; + space->atomic_forward = atomic; space->allocated_bytes_at_last_gc = 0; space->fragmentation_at_last_gc = 0; space->extents = calloc(1, sizeof(struct copy_space_extent)); diff --git a/src/pcc.c b/src/pcc.c index b0aeddda0..3023d465d 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -611,7 +611,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, HEAP_EVENT(*heap, init, (*heap)->size); struct copy_space *space = heap_copy_space(*heap); - if (!copy_space_init(space, (*heap)->size)) { + int atomic_forward = options->common.parallelism > 1; + if (!copy_space_init(space, 
(*heap)->size, atomic_forward)) { free(*heap); *heap = NULL; return 0; diff --git a/src/scc.c b/src/scc.c index 28dcef0d2..33fe8a1b1 100644 --- a/src/scc.c +++ b/src/scc.c @@ -611,7 +611,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, HEAP_EVENT(*heap, init, (*heap)->size); struct copy_space *space = heap_copy_space(*heap); - if (!copy_space_init(space, (*heap)->size)) { + int atomic_forward = 0; + if (!copy_space_init(space, (*heap)->size, atomic_forward)) { free(*heap); *heap = NULL; return 0; From ea15d142fcd6365170516a3799a23be5f695edca Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 5 Aug 2024 15:19:07 +0200 Subject: [PATCH 261/403] Update documentation --- doc/collector-pcc.md | 11 ++++++----- doc/collectors.md | 5 +++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/doc/collector-pcc.md b/doc/collector-pcc.md index c79fb2aea..f2f3ff390 100644 --- a/doc/collector-pcc.md +++ b/doc/collector-pcc.md @@ -24,11 +24,12 @@ entries to the worker's [shared worklist](../src/shared-worklist.h). When a worker runs out of local work, it will first try to remove work from its own shared worklist, then will try to steal from other workers. -Because threads compete to evacuate objects, `pcc` uses [atomic -compare-and-swap instead of simple forwarding pointer -updates](./manual.md#forwarding-objects), which imposes around a ~30% -performance penalty. `pcc` generally starts to outperform `scc` when it -can trace with 2 threads, and gets better with each additional thread. +If only one tracing thread is enabled (`parallelism=1`), `pcc` uses +non-atomic forwarding, but if multiple threads compete to evacuate +objects, `pcc` uses [atomic compare-and-swap instead of simple +forwarding pointer updates](./manual.md#forwarding-objects). This +imposes around a ~30% performance penalty but having multiple tracing +threads is generally worth it, unless the object graph is itself serial. As with `scc`, the memory used for the external worklist is dynamically allocated from the OS and is not currently counted as contributing to diff --git a/doc/collectors.md b/doc/collectors.md index c6fced97f..cdf4dcb8b 100644 --- a/doc/collectors.md +++ b/doc/collectors.md @@ -6,8 +6,7 @@ Whippet has five collectors currently: - [Serial copying collector (`scc`)](./collector-scc.md): Like `semi`, but with support for multiple mutator threads. - [Parallel copying collector (`pcc`)](./collector-pcc.md): Like `scc`, - but with support for multiple tracing threads. Faster than `scc` if - multiple cores are available at collection-time. + but with support for multiple tracing threads. - [Whippet collector (`whippet`)](./collector-whippet.md): Immix-inspired collector. Optionally parallel, conservative (stack and/or heap), and/or generational. @@ -27,6 +26,8 @@ out mutator/embedder bugs. Then if memory is tight, switch to If you are aiming for maximum simplicity and minimal code size (ten kilobytes or so), use `semi`. +Only use `scc` if you are investigating GC internals. + If you are writing a new project, you have a choice as to whether to pay the development cost of precise roots or not. 
If you choose to not have precise roots, then go for `stack-conservative-parallel-whippet` From 1f6f5aeab72a1161b57b1781fe970f3150766dec Mon Sep 17 00:00:00 2001 From: Zheng Junjie Date: Wed, 14 Aug 2024 22:36:38 +0800 Subject: [PATCH 262/403] fix guile.md link --- doc/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/README.md b/doc/README.md index c14a537fe..eee1ad701 100644 --- a/doc/README.md +++ b/doc/README.md @@ -8,6 +8,6 @@ characteristics and which impose different requirements on the embedder. - * [Guile](./doc/guile.md): Some notes on a potential rebase of Guile on + * [Guile](./guile.md): Some notes on a potential rebase of Guile on top of Whippet. From b6e9d3c0bb547011f036e69fce524d5dd19a1531 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 21 Aug 2024 10:16:00 +0200 Subject: [PATCH 263/403] Fix bug in which empty block would not have mark bits cleared --- src/whippet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/whippet.c b/src/whippet.c index 1f3edda2b..2f77b251e 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -2153,6 +2153,8 @@ static size_t next_hole(struct gc_mutator *mut) { if (granules < GRANULES_PER_BLOCK) return granules; struct block_summary *summary = block_summary_for_addr(mut->block); + // Sweep mark bytes for completely empty block. + memset(metadata_byte_for_addr(mut->block), 0, GRANULES_PER_BLOCK); block_summary_clear_flag(summary, BLOCK_NEEDS_SWEEP); // Sweeping found a completely empty block. If we are below the // minimum evacuation reserve, take the block. From 004a3d04110174d81f27c45b5cdb127bc73d1f5e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 18 Aug 2024 19:42:07 +0200 Subject: [PATCH 264/403] Factor nofl-space out of whippet --- api/gc-api.h | 10 +- src/nofl-space.h | 1448 ++++++++++++++++++++++++++++++++++++ src/swar.h | 51 ++ src/whippet.c | 1830 +++++++--------------------------------------- 4 files changed, 1782 insertions(+), 1557 deletions(-) create mode 100644 src/nofl-space.h create mode 100644 src/swar.h diff --git a/api/gc-api.h b/api/gc-api.h index 4831500fe..e60be7579 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -59,11 +59,9 @@ static inline void gc_clear_fresh_allocation(struct gc_ref obj, memset(gc_ref_heap_object(obj), 0, size); } -static inline void gc_update_alloc_table(struct gc_mutator *mut, - struct gc_ref obj, +static inline void gc_update_alloc_table(struct gc_ref obj, size_t size) GC_ALWAYS_INLINE; -static inline void gc_update_alloc_table(struct gc_mutator *mut, - struct gc_ref obj, +static inline void gc_update_alloc_table(struct gc_ref obj, size_t size) { size_t alignment = gc_allocator_alloc_table_alignment(); if (!alignment) return; @@ -117,7 +115,7 @@ static inline void* gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, *hp_loc = new_hp; gc_clear_fresh_allocation(gc_ref(hp), size); - gc_update_alloc_table(mut, gc_ref(hp), size); + gc_update_alloc_table(gc_ref(hp), size); return (void*)hp; } @@ -138,7 +136,7 @@ static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut, size *freelist_loc = *(void**)head; gc_clear_fresh_allocation(gc_ref_from_heap_object(head), size); - gc_update_alloc_table(mut, gc_ref_from_heap_object(head), size); + gc_update_alloc_table(gc_ref_from_heap_object(head), size); return head; } diff --git a/src/nofl-space.h b/src/nofl-space.h new file mode 100644 index 000000000..fd718c962 --- /dev/null +++ b/src/nofl-space.h @@ -0,0 +1,1448 @@ +#ifndef NOFL_SPACE_H +#define NOFL_SPACE_H + +#include +#include +#include 
+#include + +#include "gc-api.h" + +#define GC_IMPL 1 +#include "gc-internal.h" + +#include "assert.h" +#include "debug.h" +#include "gc-align.h" +#include "gc-attrs.h" +#include "gc-inline.h" +#include "spin.h" +#include "swar.h" + +// This is the nofl space! It is a mark space which doesn't use +// free-lists to allocate, and which can evacuate objects if +// fragmentation is too high, inspired by Immix. Nofl stands for "no +// free-list", but also "novel", in the sense that it hasn't been tried +// before. + +#define NOFL_GRANULE_SIZE 16 +#define NOFL_GRANULE_SIZE_LOG_2 4 +#define NOFL_MEDIUM_OBJECT_THRESHOLD 256 +#define NOFL_MEDIUM_OBJECT_GRANULE_THRESHOLD 16 + +STATIC_ASSERT_EQ(NOFL_GRANULE_SIZE, 1 << NOFL_GRANULE_SIZE_LOG_2); +STATIC_ASSERT_EQ(NOFL_MEDIUM_OBJECT_THRESHOLD, + NOFL_MEDIUM_OBJECT_GRANULE_THRESHOLD * NOFL_GRANULE_SIZE); + +// Each granule has one mark byte stored in a side table. A granule's +// mark state is a whole byte instead of a bit to facilitate parallel +// marking. (Parallel markers are allowed to race.) We also use this +// byte to compute object extent, via a bit flag indicating +// end-of-object. +// +// Because we want to allow for conservative roots, we need to know +// whether an address indicates an object or not. That means that when +// an object is allocated, it has to set a bit, somewhere. We use the +// metadata byte for this purpose, setting the "young" bit. +// +// The "young" bit's name might make you think about generational +// collection, and indeed all objects collected in a minor collection +// will have this bit set. However, the nofl space never needs to check +// for the young bit; if it weren't for the need to identify +// conservative roots, we wouldn't need a young bit at all. Perhaps in +// an all-precise system, we would be able to avoid the overhead of +// initializing mark byte upon each fresh allocation. +// +// When an object becomes dead after a GC, it will still have a bit set +// -- maybe the young bit, or maybe a survivor bit. The sweeper has to +// clear these bits before the next collection. But, for concurrent +// marking, we will also be marking "live" objects, updating their mark +// bits. So there are four object states concurrently observable: +// young, dead, survivor, and marked. (If we didn't have concurrent +// marking we would still need the "marked" state, because marking +// mutator roots before stopping is also a form of concurrent marking.) +// Even though these states are mutually exclusive, we use separate bits +// for them because we have the space. After each collection, the dead, +// survivor, and marked states rotate by one bit. 
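The rotation described at the end of the comment above is easiest to see with concrete values: the mask used for "marked this cycle" advances by one bit at each collection, so a byte still holding an older mark bit identifies a survivor or garbage from an earlier cycle, and after three collections the masks wrap around. A small self-contained check mirroring the rotation helper defined just below (toy_rotate is a stand-in name; 2, 4 and 8 are the three mark bits):

#include <assert.h>
#include <stdint.h>

/* Cyclic shift within the three mark bits, as in
   nofl_rotate_dead_survivor_marked below. */
static uint8_t toy_rotate(uint8_t mask) {
  uint8_t all = 2 | 4 | 8;
  return ((mask << 1) | (mask >> 2)) & all;
}

int main(void) {
  /* MARK_0 -> MARK_1 -> MARK_2 -> MARK_0: three collections round-trip. */
  assert(toy_rotate(2) == 4);
  assert(toy_rotate(4) == 8);
  assert(toy_rotate(8) == 2);
  return 0;
}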
+enum nofl_metadata_byte { + NOFL_METADATA_BYTE_NONE = 0, + NOFL_METADATA_BYTE_YOUNG = 1, + NOFL_METADATA_BYTE_MARK_0 = 2, + NOFL_METADATA_BYTE_MARK_1 = 4, + NOFL_METADATA_BYTE_MARK_2 = 8, + NOFL_METADATA_BYTE_END = 16, + NOFL_METADATA_BYTE_EPHEMERON = 32, + NOFL_METADATA_BYTE_PINNED = 64, + NOFL_METADATA_BYTE_UNUSED_1 = 128 +}; + +static uint8_t +nofl_rotate_dead_survivor_marked(uint8_t mask) { + uint8_t all = + NOFL_METADATA_BYTE_MARK_0 | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + return ((mask << 1) | (mask >> 2)) & all; +} + +#define NOFL_SLAB_SIZE (4 * 1024 * 1024) +#define NOFL_BLOCK_SIZE (64 * 1024) +#define NOFL_METADATA_BYTES_PER_BLOCK (NOFL_BLOCK_SIZE / NOFL_GRANULE_SIZE) +#define NOFL_BLOCKS_PER_SLAB (NOFL_SLAB_SIZE / NOFL_BLOCK_SIZE) +#define NOFL_META_BLOCKS_PER_SLAB (NOFL_METADATA_BYTES_PER_BLOCK * NOFL_BLOCKS_PER_SLAB / NOFL_BLOCK_SIZE) +#define NOFL_NONMETA_BLOCKS_PER_SLAB (NOFL_BLOCKS_PER_SLAB - NOFL_META_BLOCKS_PER_SLAB) +#define NOFL_METADATA_BYTES_PER_SLAB (NOFL_NONMETA_BLOCKS_PER_SLAB * NOFL_METADATA_BYTES_PER_BLOCK) +#define NOFL_SLACK_METADATA_BYTES_PER_SLAB (NOFL_META_BLOCKS_PER_SLAB * NOFL_METADATA_BYTES_PER_BLOCK) +#define NOFL_REMSET_BYTES_PER_BLOCK (NOFL_SLACK_METADATA_BYTES_PER_SLAB / NOFL_BLOCKS_PER_SLAB) +#define NOFL_REMSET_BYTES_PER_SLAB (NOFL_REMSET_BYTES_PER_BLOCK * NOFL_NONMETA_BLOCKS_PER_SLAB) +#define NOFL_SLACK_REMSET_BYTES_PER_SLAB (NOFL_REMSET_BYTES_PER_BLOCK * NOFL_META_BLOCKS_PER_SLAB) +#define NOFL_SUMMARY_BYTES_PER_BLOCK (NOFL_SLACK_REMSET_BYTES_PER_SLAB / NOFL_BLOCKS_PER_SLAB) +#define NOFL_SUMMARY_BYTES_PER_SLAB (NOFL_SUMMARY_BYTES_PER_BLOCK * NONMETA_BLOCKS_PER_SLAB) +#define NOFL_SLACK_SUMMARY_BYTES_PER_SLAB (NOFL_SUMMARY_BYTES_PER_BLOCK * NOFL_META_BLOCKS_PER_SLAB) +#define NOFL_HEADER_BYTES_PER_SLAB NOFL_SLACK_SUMMARY_BYTES_PER_SLAB + +struct nofl_slab; + +struct nofl_slab_header { + union { + struct { + struct nofl_slab *next; + struct nofl_slab *prev; + }; + uint8_t padding[NOFL_HEADER_BYTES_PER_SLAB]; + }; +}; +STATIC_ASSERT_EQ(sizeof(struct nofl_slab_header), NOFL_HEADER_BYTES_PER_SLAB); + +// Sometimes we want to put a block on a singly-linked list. For that +// there's a pointer reserved in the block summary. But because the +// pointer is aligned (32kB on 32-bit, 64kB on 64-bit), we can portably +// hide up to 15 flags in the low bits. These flags can be accessed +// non-atomically by the mutator when it owns a block; otherwise they +// need to be accessed atomically. +enum nofl_block_summary_flag { + NOFL_BLOCK_OUT_FOR_THREAD = 0x1, + NOFL_BLOCK_HAS_PIN = 0x2, + NOFL_BLOCK_PAGED_OUT = 0x4, + NOFL_BLOCK_NEEDS_SWEEP = 0x8, + NOFL_BLOCK_UNAVAILABLE = 0x10, + NOFL_BLOCK_EVACUATE = 0x20, + NOFL_BLOCK_VENERABLE = 0x40, + NOFL_BLOCK_VENERABLE_AFTER_SWEEP = 0x80, + NOFL_BLOCK_FLAG_UNUSED_8 = 0x100, + NOFL_BLOCK_FLAG_UNUSED_9 = 0x200, + NOFL_BLOCK_FLAG_UNUSED_10 = 0x400, + NOFL_BLOCK_FLAG_UNUSED_11 = 0x800, + NOFL_BLOCK_FLAG_UNUSED_12 = 0x1000, + NOFL_BLOCK_FLAG_UNUSED_13 = 0x2000, + NOFL_BLOCK_FLAG_UNUSED_14 = 0x4000, +}; + +struct nofl_block_summary { + union { + struct { + // Counters related to previous collection: how many holes there + // were, and how much space they had. + uint16_t hole_count; + uint16_t free_granules; + // Counters related to allocation since previous collection: + // wasted space due to fragmentation. + uint16_t holes_with_fragmentation; + uint16_t fragmentation_granules; + // After a block is swept, if it's empty it goes on the empties + // list. 
Otherwise if it's not immediately used by a mutator (as + // is usually the case), it goes on the swept list. Both of these + // lists use this field. But as the next element in the field is + // block-aligned, we stash flags in the low bits. + uintptr_t next_and_flags; + }; + uint8_t padding[NOFL_SUMMARY_BYTES_PER_BLOCK]; + }; +}; +STATIC_ASSERT_EQ(sizeof(struct nofl_block_summary), + NOFL_SUMMARY_BYTES_PER_BLOCK); + +struct nofl_block { + char data[NOFL_BLOCK_SIZE]; +}; + +struct nofl_slab { + struct nofl_slab_header header; + struct nofl_block_summary summaries[NOFL_NONMETA_BLOCKS_PER_SLAB]; + uint8_t remembered_set[NOFL_REMSET_BYTES_PER_SLAB]; + uint8_t metadata[NOFL_METADATA_BYTES_PER_SLAB]; + struct nofl_block blocks[NOFL_NONMETA_BLOCKS_PER_SLAB]; +}; +STATIC_ASSERT_EQ(sizeof(struct nofl_slab), NOFL_SLAB_SIZE); + +static struct nofl_slab* +nofl_object_slab(void *obj) { + uintptr_t addr = (uintptr_t) obj; + uintptr_t base = align_down(addr, NOFL_SLAB_SIZE); + return (struct nofl_slab*) base; +} + +static uint8_t* +nofl_metadata_byte_for_addr(uintptr_t addr) { + uintptr_t base = align_down(addr, NOFL_SLAB_SIZE); + uintptr_t granule = (addr & (NOFL_SLAB_SIZE - 1)) >> NOFL_GRANULE_SIZE_LOG_2; + return (uint8_t*) (base + granule); +} + +static uint8_t* +nofl_metadata_byte_for_object(struct gc_ref ref) { + return nofl_metadata_byte_for_addr(gc_ref_value(ref)); +} + +#define NOFL_GRANULES_PER_BLOCK (NOFL_BLOCK_SIZE / NOFL_GRANULE_SIZE) +#define NOFL_GRANULES_PER_REMSET_BYTE \ + (NOFL_GRANULES_PER_BLOCK / NOFL_REMSET_BYTES_PER_BLOCK) + +static struct nofl_block_summary* +nofl_block_summary_for_addr(uintptr_t addr) { + uintptr_t base = align_down(addr, NOFL_SLAB_SIZE); + uintptr_t block = (addr & (NOFL_SLAB_SIZE - 1)) / NOFL_BLOCK_SIZE; + return (struct nofl_block_summary*) + (base + block * sizeof(struct nofl_block_summary)); +} + +static uintptr_t +nofl_block_summary_has_flag(struct nofl_block_summary *summary, + enum nofl_block_summary_flag flag) { + return summary->next_and_flags & flag; +} + +static void +nofl_block_summary_set_flag(struct nofl_block_summary *summary, + enum nofl_block_summary_flag flag) { + summary->next_and_flags |= flag; +} + +static void +nofl_block_summary_clear_flag(struct nofl_block_summary *summary, + enum nofl_block_summary_flag flag) { + summary->next_and_flags &= ~(uintptr_t)flag; +} + +static uintptr_t +nofl_block_summary_next(struct nofl_block_summary *summary) { + return align_down(summary->next_and_flags, NOFL_BLOCK_SIZE); +} + +static void +nofl_block_summary_set_next(struct nofl_block_summary *summary, + uintptr_t next) { + GC_ASSERT((next & (NOFL_BLOCK_SIZE - 1)) == 0); + summary->next_and_flags = + (summary->next_and_flags & (NOFL_BLOCK_SIZE - 1)) | next; +} + +// Lock-free block list. 
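One detail worth spelling out before the block-list code that follows: because blocks are block-size-aligned, the next_and_flags field above can hold a block pointer and a set of flag bits in the same word, which is what nofl_block_summary_next and nofl_block_summary_has_flag rely on. A tiny self-contained check of that packing (toy_* names are hypothetical):

#include <assert.h>
#include <stdint.h>

#define TOY_BLOCK_SIZE (64 * 1024)

/* Pack a 64 kB-aligned block address together with some flag bits. */
static uintptr_t toy_pack(uintptr_t next_block, uintptr_t flags) {
  assert((next_block & (TOY_BLOCK_SIZE - 1)) == 0);
  assert(flags < TOY_BLOCK_SIZE);
  return next_block | flags;
}

int main(void) {
  uintptr_t block = 42 * (uintptr_t)TOY_BLOCK_SIZE;  /* some aligned address */
  uintptr_t word = toy_pack(block, 0x8 | 0x1);
  assert((word & ~(uintptr_t)(TOY_BLOCK_SIZE - 1)) == block);  /* the "next" */
  assert((word & 0x8) && (word & 0x1));                        /* the flags */
  return 0;
}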
+struct nofl_block_list { + size_t count; + uintptr_t blocks; +}; + +static void +nofl_push_block(struct nofl_block_list *list, uintptr_t block) { + atomic_fetch_add_explicit(&list->count, 1, memory_order_acq_rel); + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + uintptr_t next = atomic_load_explicit(&list->blocks, memory_order_acquire); + do { + nofl_block_summary_set_next(summary, next); + } while (!atomic_compare_exchange_weak(&list->blocks, &next, block)); +} + +static uintptr_t +nofl_pop_block(struct nofl_block_list *list) { + uintptr_t head = atomic_load_explicit(&list->blocks, memory_order_acquire); + struct nofl_block_summary *summary; + uintptr_t next; + do { + if (!head) + return 0; + summary = nofl_block_summary_for_addr(head); + next = nofl_block_summary_next(summary); + } while (!atomic_compare_exchange_weak(&list->blocks, &head, next)); + nofl_block_summary_set_next(summary, 0); + atomic_fetch_sub_explicit(&list->count, 1, memory_order_acq_rel); + return head; +} + +static inline size_t +nofl_size_to_granules(size_t size) { + return (size + NOFL_GRANULE_SIZE - 1) >> NOFL_GRANULE_SIZE_LOG_2; +} + +struct nofl_evacuation_allocator { + size_t allocated; // atomically + size_t limit; + uintptr_t block_cursor; // atomically +}; + +struct nofl_space { + uint64_t sweep_mask; + uint8_t live_mask; + uint8_t marked_mask; + uint8_t evacuating; + uintptr_t low_addr; + size_t extent; + size_t heap_size; + uint8_t last_collection_was_minor; + uintptr_t next_block; // atomically + struct nofl_block_list empty; + struct nofl_block_list unavailable; + struct nofl_block_list evacuation_targets; + double evacuation_minimum_reserve; + double evacuation_reserve; + double venerable_threshold; + ssize_t pending_unavailable_bytes; // atomically + struct nofl_evacuation_allocator evacuation_allocator; + struct nofl_slab *slabs; + size_t nslabs; + uintptr_t granules_freed_by_last_collection; // atomically + uintptr_t fragmentation_granules_since_last_collection; // atomically +}; + +struct nofl_allocator { + uintptr_t alloc; + uintptr_t sweep; + uintptr_t block; +}; + +static inline void +nofl_clear_memory(uintptr_t addr, size_t size) { + memset((char*)addr, 0, size); +} + +static size_t +nofl_space_live_object_granules(uint8_t *metadata) { + return scan_for_byte(metadata, -1, broadcast_byte(NOFL_METADATA_BYTE_END)) + 1; +} + +static inline int +nofl_space_mark_object(struct nofl_space *space, struct gc_ref ref) { + uint8_t *loc = nofl_metadata_byte_for_object(ref); + uint8_t byte = *loc; + if (byte & space->marked_mask) + return 0; + uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 + | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + *loc = (byte & ~mask) | space->marked_mask; + return 1; +} + +static uintptr_t +nofl_make_evacuation_allocator_cursor(uintptr_t block, size_t allocated) { + GC_ASSERT(allocated < (NOFL_BLOCK_SIZE - 1) * (uint64_t) NOFL_BLOCK_SIZE); + return align_down(block, NOFL_BLOCK_SIZE) | (allocated / NOFL_BLOCK_SIZE); +} + +static void +nofl_prepare_evacuation_allocator(struct nofl_evacuation_allocator *alloc, + struct nofl_block_list *targets) { + uintptr_t first_block = targets->blocks; + atomic_store_explicit(&alloc->allocated, 0, memory_order_release); + alloc->limit = + atomic_load_explicit(&targets->count, memory_order_acquire) * NOFL_BLOCK_SIZE; + atomic_store_explicit(&alloc->block_cursor, + nofl_make_evacuation_allocator_cursor(first_block, 0), + memory_order_release); +} + +static void 
+nofl_clear_remaining_metadata_bytes_in_block(uintptr_t block, + uintptr_t allocated) { + GC_ASSERT((allocated & (NOFL_GRANULE_SIZE - 1)) == 0); + uintptr_t base = block + allocated; + uintptr_t limit = block + NOFL_BLOCK_SIZE; + uintptr_t granules = (limit - base) >> NOFL_GRANULE_SIZE_LOG_2; + GC_ASSERT(granules <= NOFL_GRANULES_PER_BLOCK); + memset(nofl_metadata_byte_for_addr(base), 0, granules); +} + +static void +nofl_finish_evacuation_allocator_block(uintptr_t block, + uintptr_t allocated) { + GC_ASSERT(allocated <= NOFL_BLOCK_SIZE); + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); + size_t fragmentation = (NOFL_BLOCK_SIZE - allocated) >> NOFL_GRANULE_SIZE_LOG_2; + summary->hole_count = 1; + summary->free_granules = NOFL_GRANULES_PER_BLOCK; + summary->holes_with_fragmentation = fragmentation ? 1 : 0; + summary->fragmentation_granules = fragmentation; + if (fragmentation) + nofl_clear_remaining_metadata_bytes_in_block(block, allocated); +} + +static void +nofl_finish_evacuation_allocator(struct nofl_evacuation_allocator *alloc, + struct nofl_block_list *targets, + struct nofl_block_list *empties, + size_t reserve) { + // Blocks that we used for evacuation get returned to the mutator as + // sweepable blocks. Blocks that we didn't get to use go to the + // empties. + size_t allocated = atomic_load_explicit(&alloc->allocated, + memory_order_acquire); + atomic_store_explicit(&alloc->allocated, 0, memory_order_release); + if (allocated > alloc->limit) + allocated = alloc->limit; + while (allocated >= NOFL_BLOCK_SIZE) { + uintptr_t block = nofl_pop_block(targets); + GC_ASSERT(block); + allocated -= NOFL_BLOCK_SIZE; + } + if (allocated) { + // Finish off the last partially-filled block. + uintptr_t block = nofl_pop_block(targets); + GC_ASSERT(block); + nofl_finish_evacuation_allocator_block(block, allocated); + } + size_t remaining = atomic_load_explicit(&targets->count, memory_order_acquire); + while (remaining-- > reserve) + nofl_push_block(empties, nofl_pop_block(targets)); +} + +static struct gc_ref +nofl_evacuation_allocate(struct nofl_space *space, size_t granules) { + // All collector threads compete to allocate from what is logically a + // single bump-pointer arena, which is actually composed of a linked + // list of blocks. + struct nofl_evacuation_allocator *alloc = &space->evacuation_allocator; + uintptr_t cursor = atomic_load_explicit(&alloc->block_cursor, + memory_order_acquire); + size_t bytes = granules * NOFL_GRANULE_SIZE; + size_t prev = atomic_load_explicit(&alloc->allocated, memory_order_acquire); + size_t block_mask = (NOFL_BLOCK_SIZE - 1); + size_t next; + do { + if (prev >= alloc->limit) + // No more space. + return gc_ref_null(); + next = prev + bytes; + if ((prev ^ next) & ~block_mask) + // Allocation straddles a block boundary; advance so it starts a + // fresh block. + next = (next & ~block_mask) + bytes; + } while (!atomic_compare_exchange_weak(&alloc->allocated, &prev, next)); + // OK, we've claimed our memory, starting at next - bytes. Now find + // the node in the linked list of evacuation targets that corresponds + // to this allocation pointer. + uintptr_t block = cursor & ~block_mask; + // This is the SEQ'th block to be allocated into. + uintptr_t seq = cursor & block_mask; + // Therefore this block handles allocations starting at SEQ*BLOCK_SIZE + // and continuing for NOFL_BLOCK_SIZE bytes. 
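  // Aside: a worked example of this cursor encoding, assuming the same
  // 64 kB NOFL_BLOCK_SIZE as the old mark-space code and an illustrative
  // target block at address 0x40020000.  The cursor packs the
  // block-aligned address with the block's logical sequence number in
  // the low bits, which is safe because the allocator limit stays below
  // (NOFL_BLOCK_SIZE - 1) * NOFL_BLOCK_SIZE:
  //
  //   cursor = 0x40020000 | 2 = 0x40020002
  //   block  = cursor & ~block_mask = 0x40020000
  //   seq    = cursor &  block_mask = 2
  //
  // so this block is the third evacuation target and serves logical
  // offsets [2 * 64 kB, 3 * 64 kB) of the shared bump-pointer arena.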
+ uintptr_t base = seq * NOFL_BLOCK_SIZE; + + while ((base ^ next) & ~block_mask) { + GC_ASSERT(base < next); + if (base + NOFL_BLOCK_SIZE > prev) { + // The allocation straddles a block boundary, and the cursor has + // caught up so that we identify the block for the previous + // allocation pointer. Finish the previous block, probably + // leaving a small hole at the end. + nofl_finish_evacuation_allocator_block(block, prev - base); + } + // Cursor lags; advance it. + block = nofl_block_summary_next(nofl_block_summary_for_addr(block)); + base += NOFL_BLOCK_SIZE; + if (base >= alloc->limit) { + // Ran out of blocks! + GC_ASSERT(!block); + return gc_ref_null(); + } + GC_ASSERT(block); + // This store can race with other allocators, but that's OK as long + // as it never advances the cursor beyond the allocation pointer, + // which it won't because we updated the allocation pointer already. + atomic_store_explicit(&alloc->block_cursor, + nofl_make_evacuation_allocator_cursor(block, base), + memory_order_release); + } + + uintptr_t addr = block + (next & block_mask) - bytes; + return gc_ref(addr); +} + +static inline int +nofl_space_evacuate_or_mark_object(struct nofl_space *space, + struct gc_edge edge, + struct gc_ref old_ref) { + uint8_t *metadata = nofl_metadata_byte_for_object(old_ref); + uint8_t byte = *metadata; + if (byte & space->marked_mask) + return 0; + if (space->evacuating && + nofl_block_summary_has_flag(nofl_block_summary_for_addr(gc_ref_value(old_ref)), + NOFL_BLOCK_EVACUATE)) { + // This is an evacuating collection, and we are attempting to + // evacuate this block, and we are tracing this particular object + // for what appears to be the first time. + struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); + + if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) + gc_atomic_forward_acquire(&fwd); + + switch (fwd.state) { + case GC_FORWARDING_STATE_NOT_FORWARDED: + case GC_FORWARDING_STATE_ABORTED: + // Impossible. + GC_CRASH(); + case GC_FORWARDING_STATE_ACQUIRED: { + // We claimed the object successfully; evacuating is up to us. + size_t object_granules = nofl_space_live_object_granules(metadata); + struct gc_ref new_ref = nofl_evacuation_allocate(space, object_granules); + if (gc_ref_is_heap_object(new_ref)) { + // Copy object contents before committing, as we don't know what + // part of the object (if any) will be overwritten by the + // commit. + memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), + object_granules * NOFL_GRANULE_SIZE); + gc_atomic_forward_commit(&fwd, new_ref); + // Now update extent metadata, and indicate to the caller that + // the object's fields need to be traced. + uint8_t *new_metadata = nofl_metadata_byte_for_object(new_ref); + memcpy(new_metadata + 1, metadata + 1, object_granules - 1); + gc_edge_update(edge, new_ref); + metadata = new_metadata; + // Fall through to set mark bits. + } else { + // Well shucks; allocation failed, marking the end of + // opportunistic evacuation. No future evacuation of this + // object will succeed. Mark in place instead. + gc_atomic_forward_abort(&fwd); + } + break; + } + case GC_FORWARDING_STATE_BUSY: + // Someone else claimed this object first. Spin until new address + // known, or evacuation aborts. + for (size_t spin_count = 0;; spin_count++) { + if (gc_atomic_forward_retry_busy(&fwd)) + break; + yield_for_spin(spin_count); + } + if (fwd.state == GC_FORWARDING_STATE_ABORTED) + // Remove evacuation aborted; remote will mark and enqueue. 
+ return 0; + ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); + // Fall through. + case GC_FORWARDING_STATE_FORWARDED: + // The object has been evacuated already. Update the edge; + // whoever forwarded the object will make sure it's eventually + // traced. + gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); + return 0; + } + } + + uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 + | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + *metadata = (byte & ~mask) | space->marked_mask; + return 1; +} + +static inline int +nofl_space_contains_address(struct nofl_space *space, uintptr_t addr) { + return addr - space->low_addr < space->extent; +} + +static inline int +nofl_space_contains_conservative_ref(struct nofl_space *space, + struct gc_conservative_ref ref) { + return nofl_space_contains_address(space, gc_conservative_ref_value(ref)); +} + +static inline int +nofl_space_contains(struct nofl_space *space, struct gc_ref ref) { + return nofl_space_contains_address(space, gc_ref_value(ref)); +} + +static int +nofl_space_forward_or_mark_if_traced(struct nofl_space *space, + struct gc_edge edge, + struct gc_ref ref) { + uint8_t *metadata = nofl_metadata_byte_for_object(ref); + uint8_t byte = *metadata; + if (byte & space->marked_mask) + return 1; + + if (!space->evacuating) + return 0; + if (!nofl_block_summary_has_flag(nofl_block_summary_for_addr(gc_ref_value(ref)), + NOFL_BLOCK_EVACUATE)) + return 0; + + struct gc_atomic_forward fwd = gc_atomic_forward_begin(ref); + switch (fwd.state) { + case GC_FORWARDING_STATE_NOT_FORWARDED: + return 0; + case GC_FORWARDING_STATE_BUSY: + // Someone else claimed this object first. Spin until new address + // known, or evacuation aborts. + for (size_t spin_count = 0;; spin_count++) { + if (gc_atomic_forward_retry_busy(&fwd)) + break; + yield_for_spin(spin_count); + } + if (fwd.state == GC_FORWARDING_STATE_ABORTED) + // Remote evacuation aborted; remote will mark and enqueue. + return 1; + ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); + // Fall through. + case GC_FORWARDING_STATE_FORWARDED: + gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); + return 1; + default: + GC_CRASH(); + } +} + +static inline struct gc_ref +nofl_space_mark_conservative_ref(struct nofl_space *space, + struct gc_conservative_ref ref, + int possibly_interior) { + uintptr_t addr = gc_conservative_ref_value(ref); + + if (possibly_interior) { + addr = align_down(addr, NOFL_GRANULE_SIZE); + } else { + // Addr not an aligned granule? Not an object. + uintptr_t displacement = addr & (NOFL_GRANULE_SIZE - 1); + if (!gc_is_valid_conservative_ref_displacement(displacement)) + return gc_ref_null(); + addr -= displacement; + } + + // Addr in meta block? Not an object. + if ((addr & (NOFL_SLAB_SIZE - 1)) < NOFL_META_BLOCKS_PER_SLAB * NOFL_BLOCK_SIZE) + return gc_ref_null(); + + // Addr in block that has been paged out? Not an object. + struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr); + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + return gc_ref_null(); + + uint8_t *loc = nofl_metadata_byte_for_addr(addr); + uint8_t byte = atomic_load_explicit(loc, memory_order_relaxed); + + // Already marked object? Nothing to do. + if (byte & space->marked_mask) + return gc_ref_null(); + + // Addr is the not start of an unmarked object? Search backwards if + // we have interior pointers, otherwise not an object. 
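  // Aside: a worked example of the backward search below, assuming the
  // side-table conventions described for the old mark space: the first
  // granule's byte carries the young or survivor bits, the last
  // granule's byte carries END, and the bytes in between carry neither.
  // Suppose a possibly-interior pointer lands in the fourth granule of a
  // not-yet-marked five-granule object, so its metadata byte is one of
  // the zero bytes in the middle.  The loop walks loc backwards over
  // those zero bytes, stops at the byte with the start bits set, and
  // rewinds addr to the object's first granule.  Hitting an END byte
  // first would instead mean we had walked into the tail of some other
  // allocation, so there is no object here.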
+ uint8_t object_start_mask = space->live_mask | NOFL_METADATA_BYTE_YOUNG; + if (!(byte & object_start_mask)) { + if (!possibly_interior) + return gc_ref_null(); + + uintptr_t block_base = align_down(addr, NOFL_BLOCK_SIZE); + uint8_t *loc_base = nofl_metadata_byte_for_addr(block_base); + do { + // Searched past block? Not an object. + if (loc-- == loc_base) + return gc_ref_null(); + + byte = atomic_load_explicit(loc, memory_order_relaxed); + + // Ran into the end of some other allocation? Not an object, then. + if (byte & NOFL_METADATA_BYTE_END) + return gc_ref_null(); + + // Continue until we find object start. + } while (!(byte & object_start_mask)); + + // Found object start, and object is unmarked; adjust addr. + addr = block_base + (loc - loc_base) * NOFL_GRANULE_SIZE; + } + + uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 + | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + atomic_store_explicit(loc, (byte & ~mask) | space->marked_mask, + memory_order_relaxed); + + return gc_ref(addr); +} + +static inline size_t +nofl_space_object_size(struct nofl_space *space, struct gc_ref ref) { + uint8_t *loc = nofl_metadata_byte_for_object(ref); + size_t granules = nofl_space_live_object_granules(loc); + return granules * NOFL_GRANULE_SIZE; +} + +static void +nofl_push_unavailable_block(struct nofl_space *space, uintptr_t block) { + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)); + GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)); + nofl_block_summary_set_flag(summary, NOFL_BLOCK_UNAVAILABLE); + madvise((void*)block, NOFL_BLOCK_SIZE, MADV_DONTNEED); + nofl_push_block(&space->unavailable, block); +} + +static uintptr_t +nofl_pop_unavailable_block(struct nofl_space *space) { + uintptr_t block = nofl_pop_block(&space->unavailable); + if (!block) + return 0; + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + GC_ASSERT(nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)); + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_UNAVAILABLE); + return block; +} + +static uintptr_t +nofl_pop_empty_block(struct nofl_space *space) { + return nofl_pop_block(&space->empty); +} + +static int +nofl_maybe_push_evacuation_target(struct nofl_space *space, + uintptr_t block, double reserve) { + GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block), + NOFL_BLOCK_NEEDS_SWEEP)); + size_t targets = atomic_load_explicit(&space->evacuation_targets.count, + memory_order_acquire); + size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; + size_t unavailable = atomic_load_explicit(&space->unavailable.count, + memory_order_acquire); + if (targets >= (total - unavailable) * reserve) + return 0; + + nofl_push_block(&space->evacuation_targets, block); + return 1; +} + +static int +nofl_push_evacuation_target_if_needed(struct nofl_space *space, + uintptr_t block) { + return nofl_maybe_push_evacuation_target(space, block, + space->evacuation_minimum_reserve); +} + +static int +nofl_push_evacuation_target_if_possible(struct nofl_space *space, + uintptr_t block) { + return nofl_maybe_push_evacuation_target(space, block, + space->evacuation_reserve); +} + +static void +nofl_push_empty_block(struct nofl_space *space, uintptr_t block) { + GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block), + NOFL_BLOCK_NEEDS_SWEEP)); + nofl_push_block(&space->empty, block); +} + +static ssize_t 
+nofl_space_request_release_memory(struct nofl_space *space, size_t bytes) { + return atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes; +} + +static void +nofl_space_reacquire_memory(struct nofl_space *space, size_t bytes) { + ssize_t pending = + atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; + while (pending + NOFL_BLOCK_SIZE <= 0) { + uintptr_t block = nofl_pop_unavailable_block(space); + GC_ASSERT(block); + if (nofl_push_evacuation_target_if_needed(space, block)) + continue; + nofl_push_empty_block(space, block); + pending = atomic_fetch_add(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE) + + NOFL_BLOCK_SIZE; + } +} + +static size_t +nofl_allocator_next_hole(struct nofl_allocator *alloc, + struct nofl_space *space); + +static int +nofl_space_sweep_until_memory_released(struct nofl_space *space, + struct nofl_allocator *alloc) { + ssize_t pending = atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire); + // First try to unmap previously-identified empty blocks. If pending + // > 0 and other mutators happen to identify empty blocks, they will + // be unmapped directly and moved to the unavailable list. + while (pending > 0) { + uintptr_t block = nofl_pop_empty_block(space); + if (!block) + break; + // Note that we may have competing uses; if we're evacuating, + // perhaps we should push this block to the evacuation target list. + // That would enable us to reach a fragmentation low water-mark in + // fewer cycles. But maybe evacuation started in order to obtain + // free blocks for large objects; in that case we should just reap + // the fruits of our labor. Probably this second use-case is more + // important. + nofl_push_unavailable_block(space, block); + pending = atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); + pending -= NOFL_BLOCK_SIZE; + } + // Otherwise, sweep, transitioning any empty blocks to unavailable and + // throwing away any non-empty block. A bit wasteful but hastening + // the next collection is a reasonable thing to do here. + while (pending > 0) { + if (!nofl_allocator_next_hole(alloc, space)) + return 0; + pending = atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire); + } + return pending <= 0; +} + +static inline int +nofl_is_ephemeron(struct gc_ref ref) { + uint8_t meta = *nofl_metadata_byte_for_addr(gc_ref_value(ref)); + return meta & NOFL_METADATA_BYTE_EPHEMERON; +} + +static void +nofl_space_set_ephemeron_flag(struct gc_ref ref) { + if (gc_has_conservative_intraheap_edges()) { + uint8_t *metadata = nofl_metadata_byte_for_addr(gc_ref_value(ref)); + *metadata |= NOFL_METADATA_BYTE_EPHEMERON; + } +} + +static void nofl_finish_sweeping(struct nofl_allocator *alloc, + struct nofl_space *space); +static void nofl_finish_sweeping_in_block(struct nofl_allocator *alloc, + struct nofl_space *space); + +// Note that it's quite possible (and even likely) that any given remset +// byte doesn't hold any roots, if all stores were to nursery objects. 
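// Aside: a small standalone sketch of the card geometry these constants
// imply, assuming the same 4 MB slab, 64 kB block, and 16-byte granule
// sizes as the mark-space #defines this patch removes from whippet.c;
// the DEMO_* names are illustrative only.

#include <assert.h>

enum {
  DEMO_SLAB_SIZE = 4 * 1024 * 1024,
  DEMO_BLOCK_SIZE = 64 * 1024,
  DEMO_GRANULE_SIZE = 16,
  DEMO_GRANULES_PER_BLOCK = DEMO_BLOCK_SIZE / DEMO_GRANULE_SIZE,        // 4096
  DEMO_METADATA_BYTES_PER_BLOCK = DEMO_GRANULES_PER_BLOCK,              // one mark byte per granule
  DEMO_BLOCKS_PER_SLAB = DEMO_SLAB_SIZE / DEMO_BLOCK_SIZE,              // 64
  DEMO_META_BLOCKS_PER_SLAB =
    DEMO_METADATA_BYTES_PER_BLOCK * DEMO_BLOCKS_PER_SLAB / DEMO_BLOCK_SIZE,  // 4
  DEMO_REMSET_BYTES_PER_BLOCK =
    DEMO_META_BLOCKS_PER_SLAB * DEMO_METADATA_BYTES_PER_BLOCK / DEMO_BLOCKS_PER_SLAB,  // 256
  DEMO_GRANULES_PER_REMSET_BYTE =
    DEMO_GRANULES_PER_BLOCK / DEMO_REMSET_BYTES_PER_BLOCK,              // 16
  // Each remembered-set byte (card) therefore covers 16 granules, or
  // 256 bytes of block data, and tracing one card scans 16 metadata
  // bytes, i.e. two aligned 8-byte loads; that is what the static
  // assertion just after this aside requires.
  DEMO_CARD_BYTES = DEMO_GRANULES_PER_REMSET_BYTE * DEMO_GRANULE_SIZE   // 256
};

static_assert(DEMO_GRANULES_PER_REMSET_BYTE % 8 == 0,
              "cards are scanned in aligned 8-byte chunks");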
+STATIC_ASSERT_EQ(NOFL_GRANULES_PER_REMSET_BYTE % 8, 0); +static void +nofl_space_trace_card(struct nofl_space *space, struct nofl_slab *slab, + size_t card, + void (*enqueue)(struct gc_ref, struct gc_heap*), + struct gc_heap *heap) { + uintptr_t first_addr_in_slab = (uintptr_t) &slab->blocks[0]; + size_t granule_base = card * NOFL_GRANULES_PER_REMSET_BYTE; + for (size_t granule_in_remset = 0; + granule_in_remset < NOFL_GRANULES_PER_REMSET_BYTE; + granule_in_remset += 8, granule_base += 8) { + uint64_t mark_bytes = load_eight_aligned_bytes(slab->metadata + granule_base); + mark_bytes &= space->sweep_mask; + while (mark_bytes) { + size_t granule_offset = count_zero_bytes(mark_bytes); + mark_bytes &= ~(((uint64_t)0xff) << (granule_offset * 8)); + size_t granule = granule_base + granule_offset; + uintptr_t addr = first_addr_in_slab + granule * NOFL_GRANULE_SIZE; + GC_ASSERT(nofl_metadata_byte_for_addr(addr) == &slab->metadata[granule]); + enqueue(gc_ref(addr), heap); + } + } +} + +static void +nofl_space_trace_remembered_set(struct nofl_space *space, + void (*enqueue)(struct gc_ref, + struct gc_heap*), + struct gc_heap *heap) { + GC_ASSERT(!space->evacuating); + for (size_t s = 0; s < space->nslabs; s++) { + struct nofl_slab *slab = &space->slabs[s]; + uint8_t *remset = slab->remembered_set; + for (size_t card_base = 0; + card_base < NOFL_REMSET_BYTES_PER_SLAB; + card_base += 8) { + uint64_t remset_bytes = load_eight_aligned_bytes(remset + card_base); + if (!remset_bytes) continue; + memset(remset + card_base, 0, 8); + while (remset_bytes) { + size_t card_offset = count_zero_bytes(remset_bytes); + remset_bytes &= ~(((uint64_t)0xff) << (card_offset * 8)); + nofl_space_trace_card(space, slab, card_base + card_offset, + enqueue, heap); + } + } + } +} + +static void +nofl_space_clear_remembered_set(struct nofl_space *space) { + if (!GC_GENERATIONAL) return; + for (size_t slab = 0; slab < space->nslabs; slab++) { + memset(space->slabs[slab].remembered_set, 0, NOFL_REMSET_BYTES_PER_SLAB); + } +} + +static void +nofl_space_reset_sweeper(struct nofl_space *space) { + space->next_block = (uintptr_t) &space->slabs[0].blocks; +} + +static void +nofl_space_update_mark_patterns(struct nofl_space *space, + int advance_mark_mask) { + uint8_t survivor_mask = space->marked_mask; + uint8_t next_marked_mask = nofl_rotate_dead_survivor_marked(survivor_mask); + if (advance_mark_mask) + space->marked_mask = next_marked_mask; + space->live_mask = survivor_mask | next_marked_mask; + space->sweep_mask = broadcast_byte(space->live_mask); +} + +static void +nofl_space_reset_statistics(struct nofl_space *space) { + space->granules_freed_by_last_collection = 0; + space->fragmentation_granules_since_last_collection = 0; +} + +static size_t +nofl_space_yield(struct nofl_space *space) { + return space->granules_freed_by_last_collection * NOFL_GRANULE_SIZE; +} + +static size_t +nofl_space_evacuation_reserve(struct nofl_space *space) { + return atomic_load_explicit(&space->evacuation_targets.count, + memory_order_acquire) * NOFL_BLOCK_SIZE; +} + +static size_t +nofl_space_fragmentation(struct nofl_space *space) { + size_t granules = space->fragmentation_granules_since_last_collection; + return granules * NOFL_GRANULE_SIZE; +} + +static void +nofl_space_release_evacuation_target_blocks(struct nofl_space *space) { + // Move excess evacuation target blocks back to empties. 
+ size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; + size_t unavailable = atomic_load_explicit(&space->unavailable.count, + memory_order_acquire); + size_t reserve = space->evacuation_minimum_reserve * (total - unavailable); + nofl_finish_evacuation_allocator(&space->evacuation_allocator, + &space->evacuation_targets, + &space->empty, + reserve); +} + +static void +nofl_space_prepare_for_evacuation(struct nofl_space *space, + enum gc_collection_kind gc_kind) { + if (gc_kind != GC_COLLECTION_COMPACTING) { + space->evacuating = 0; + space->evacuation_reserve = space->evacuation_minimum_reserve; + return; + } + + // Put the mutator into evacuation mode, collecting up to 50% of free space as + // evacuation blocks. + space->evacuation_reserve = 0.5; + + size_t target_blocks = space->evacuation_targets.count; + DEBUG("evacuation target block count: %zu\n", target_blocks); + + if (target_blocks == 0) { + DEBUG("no evacuation target blocks, disabling evacuation for this round\n"); + space->evacuating = 0; + return; + } + + size_t target_granules = target_blocks * NOFL_GRANULES_PER_BLOCK; + // Compute histogram where domain is the number of granules in a block + // that survived the last collection, aggregated into 33 buckets, and + // range is number of blocks in that bucket. (Bucket 0 is for blocks + // that were found to be completely empty; such blocks may be on the + // evacuation target list.) + const size_t bucket_count = 33; + size_t histogram[33] = {0,}; + size_t bucket_size = NOFL_GRANULES_PER_BLOCK / 32; + size_t empties = 0; + for (size_t slab = 0; slab < space->nslabs; slab++) { + for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { + struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + continue; + if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) { + empties++; + continue; + } + size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; + size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; + histogram[bucket]++; + } + } + + // Blocks which lack the NEEDS_SWEEP flag are empty, either because + // they have been removed from the pool and have the UNAVAILABLE flag + // set, or because they are on the empties or evacuation target + // lists. When evacuation starts, the empties list should be empty. + GC_ASSERT(empties == target_blocks); + + // Now select a number of blocks that is likely to fill the space in + // the target blocks. Prefer candidate blocks with fewer survivors + // from the last GC, to increase expected free block yield. + for (size_t bucket = 0; bucket < bucket_count; bucket++) { + size_t bucket_granules = bucket * bucket_size * histogram[bucket]; + if (bucket_granules <= target_granules) { + target_granules -= bucket_granules; + } else { + histogram[bucket] = target_granules / (bucket_size * bucket); + target_granules = 0; + } + } + + // Having selected the number of blocks, now we set the evacuation + // candidate flag on all blocks. 
+ for (size_t slab = 0; slab < space->nslabs; slab++) { + for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { + struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + continue; + if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) + continue; + size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; + size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; + if (histogram[bucket]) { + nofl_block_summary_set_flag(summary, NOFL_BLOCK_EVACUATE); + histogram[bucket]--; + } else { + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_EVACUATE); + } + } + } + + // We are ready to evacuate! + nofl_prepare_evacuation_allocator(&space->evacuation_allocator, + &space->evacuation_targets); + space->evacuating = 1; +} + +static void +nofl_space_verify_before_restart(struct nofl_space *space) { + // Iterate objects in each block, verifying that the END bytes correspond to + // the measured object size. + for (size_t slab = 0; slab < space->nslabs; slab++) { + for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { + struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + continue; + + uintptr_t addr = (uintptr_t)space->slabs[slab].blocks[block].data; + uintptr_t limit = addr + NOFL_BLOCK_SIZE; + uint8_t *meta = nofl_metadata_byte_for_addr(addr); + while (addr < limit) { + if (meta[0] & space->live_mask) { + struct gc_ref obj = gc_ref(addr); + size_t obj_bytes; + gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); + size_t granules = nofl_size_to_granules(obj_bytes); + GC_ASSERT(granules); + for (size_t granule = 0; granule < granules - 1; granule++) + GC_ASSERT(!(meta[granule] & NOFL_METADATA_BYTE_END)); + GC_ASSERT(meta[granules - 1] & NOFL_METADATA_BYTE_END); + meta += granules; + addr += granules * NOFL_GRANULE_SIZE; + } else { + meta++; + addr += NOFL_GRANULE_SIZE; + } + } + GC_ASSERT(addr == limit); + } + } +} + +static void +nofl_space_finish_gc(struct nofl_space *space, + enum gc_collection_kind gc_kind) { + space->evacuating = 0; + space->last_collection_was_minor = (gc_kind == GC_COLLECTION_MINOR); + nofl_space_reset_sweeper(space); + nofl_space_update_mark_patterns(space, 0); + nofl_space_reset_statistics(space); + nofl_space_release_evacuation_target_blocks(space); + if (GC_DEBUG) + nofl_space_verify_before_restart(space); +} + +static int +nofl_sweep_byte(uint8_t *loc, uintptr_t sweep_mask) { + uint8_t metadata = atomic_load_explicit(loc, memory_order_relaxed); + // If the metadata byte is nonzero, that means either a young, dead, + // survived, or marked object. If it's live (survived or marked), we + // found the next mark. Otherwise it's dead and we clear the byte. + // If we see an END, that means an end of a dead object; clear it. 
+ if (metadata) { + if (metadata & sweep_mask) + return 1; + atomic_store_explicit(loc, 0, memory_order_relaxed); + } + return 0; +} + +static int +nofl_sweep_word(uintptr_t *loc, uintptr_t sweep_mask) { + uintptr_t metadata = atomic_load_explicit(loc, memory_order_relaxed); + if (metadata) { + if (metadata & sweep_mask) + return 1; + atomic_store_explicit(loc, 0, memory_order_relaxed); + } + return 0; +} + +static uintptr_t +nofl_space_next_block_to_sweep(struct nofl_space *space) { + uintptr_t block = atomic_load_explicit(&space->next_block, + memory_order_acquire); + uintptr_t next_block; + do { + if (block == 0) + return 0; + + next_block = block + NOFL_BLOCK_SIZE; + if (next_block % NOFL_SLAB_SIZE == 0) { + uintptr_t hi_addr = space->low_addr + space->extent; + if (next_block == hi_addr) + next_block = 0; + else + next_block += NOFL_META_BLOCKS_PER_SLAB * NOFL_BLOCK_SIZE; + } + } while (!atomic_compare_exchange_weak(&space->next_block, &block, + next_block)); + return block; +} + +static void +nofl_allocator_release_block(struct nofl_allocator *alloc) { + alloc->alloc = alloc->sweep = alloc->block = 0; +} + +static void +nofl_allocator_finish_block(struct nofl_allocator *alloc, + struct nofl_space *space) { + GC_ASSERT(alloc->block); + struct nofl_block_summary *block = nofl_block_summary_for_addr(alloc->block); + atomic_fetch_add(&space->granules_freed_by_last_collection, + block->free_granules); + atomic_fetch_add(&space->fragmentation_granules_since_last_collection, + block->fragmentation_granules); + + // If this block has mostly survivors, we should avoid sweeping it and + // trying to allocate into it for a minor GC. Sweep it next time to + // clear any garbage allocated in this cycle and mark it as + // "venerable" (i.e., old). + GC_ASSERT(!nofl_block_summary_has_flag(block, NOFL_BLOCK_VENERABLE)); + if (!nofl_block_summary_has_flag(block, NOFL_BLOCK_VENERABLE_AFTER_SWEEP) && + block->free_granules < NOFL_GRANULES_PER_BLOCK * space->venerable_threshold) + nofl_block_summary_set_flag(block, NOFL_BLOCK_VENERABLE_AFTER_SWEEP); + + nofl_allocator_release_block(alloc); +} + +// Sweep some heap to reclaim free space, resetting alloc->alloc and +// alloc->sweep. Return the size of the hole in granules. +static size_t +nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, + struct nofl_space *space) { + uintptr_t sweep = alloc->sweep; + if (sweep == 0) + return 0; + uintptr_t limit = alloc->block + NOFL_BLOCK_SIZE; + uintptr_t sweep_mask = space->sweep_mask; + + while (sweep != limit) { + GC_ASSERT((sweep & (NOFL_GRANULE_SIZE - 1)) == 0); + uint8_t* metadata = nofl_metadata_byte_for_addr(sweep); + size_t limit_granules = (limit - sweep) >> NOFL_GRANULE_SIZE_LOG_2; + + // Except for when we first get a block, alloc->sweep is positioned + // right after a hole, which can point to either the end of the + // block or to a live object. Assume that a live object is more + // common. + { + size_t live_granules = 0; + while (limit_granules && (metadata[0] & sweep_mask)) { + // Object survived collection; skip over it and continue sweeping. 
+ size_t object_granules = nofl_space_live_object_granules(metadata); + live_granules += object_granules; + limit_granules -= object_granules; + metadata += object_granules; + } + if (!limit_granules) + break; + sweep += live_granules * NOFL_GRANULE_SIZE; + } + + size_t free_granules = scan_for_byte(metadata, limit_granules, sweep_mask); + GC_ASSERT(free_granules); + GC_ASSERT(free_granules <= limit_granules); + + struct nofl_block_summary *summary = nofl_block_summary_for_addr(sweep); + summary->hole_count++; + GC_ASSERT(free_granules <= NOFL_GRANULES_PER_BLOCK - summary->free_granules); + summary->free_granules += free_granules; + + size_t free_bytes = free_granules * NOFL_GRANULE_SIZE; + alloc->alloc = sweep; + alloc->sweep = sweep + free_bytes; + return free_granules; + } + + nofl_allocator_finish_block(alloc, space); + return 0; +} + +static void +nofl_allocator_finish_hole(struct nofl_allocator *alloc) { + size_t granules = (alloc->sweep - alloc->alloc) / NOFL_GRANULE_SIZE; + if (granules) { + struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); + summary->holes_with_fragmentation++; + summary->fragmentation_granules += granules; + uint8_t *metadata = nofl_metadata_byte_for_addr(alloc->alloc); + memset(metadata, 0, granules); + alloc->alloc = alloc->sweep; + } + // FIXME: add to fragmentation +} + +static int +nofl_maybe_release_swept_empty_block(struct nofl_allocator *alloc, + struct nofl_space *space) { + GC_ASSERT(alloc->block); + uintptr_t block = alloc->block; + if (atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire) <= 0) + return 0; + + nofl_push_unavailable_block(space, block); + atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); + nofl_allocator_release_block(alloc); + return 1; +} + +static size_t +nofl_allocator_next_hole(struct nofl_allocator *alloc, + struct nofl_space *space) { + nofl_allocator_finish_hole(alloc); + // As we sweep if we find that a block is empty, we return it to the + // empties list. Empties are precious. But if we return 10 blocks in + // a row, and still find an 11th empty, go ahead and use it. + size_t empties_countdown = 10; + while (1) { + // Sweep current block for a hole. + size_t granules = nofl_allocator_next_hole_in_block(alloc, space); + if (granules) { + // If the hole spans only part of a block, let the allocator try + // to use it. + if (granules < NOFL_GRANULES_PER_BLOCK) + return granules; + struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); + memset(nofl_metadata_byte_for_addr(alloc->block), 0, NOFL_GRANULES_PER_BLOCK); + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); + // Sweeping found a completely empty block. If we are below the + // minimum evacuation reserve, take the block. + if (nofl_push_evacuation_target_if_needed(space, alloc->block)) { + nofl_allocator_release_block(alloc); + continue; + } + // If we have pending pages to release to the OS, we should unmap + // this block. + if (nofl_maybe_release_swept_empty_block(alloc, space)) + continue; + // Otherwise if we've already returned lots of empty blocks to the + // freelist, let the allocator keep this block. + if (!empties_countdown) { + // After this block is allocated into, it will need to be swept. + nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); + return granules; + } + // Otherwise we push to the empty blocks list. 
+ nofl_push_empty_block(space, alloc->block); + nofl_allocator_release_block(alloc); + empties_countdown--; + } + GC_ASSERT(alloc->block == 0); + while (1) { + uintptr_t block = nofl_space_next_block_to_sweep(space); + if (block) { + // Sweeping found a block. We might take it for allocation, or + // we might send it back. + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + // If it's marked unavailable, it's already on a list of + // unavailable blocks, so skip and get the next block. + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + continue; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE)) { + // Skip venerable blocks after a minor GC -- we don't need to + // sweep as they weren't allocated into last cycle, and the + // mark bytes didn't rotate, so we have no cleanup to do; and + // we shouldn't try to allocate into them as it's not worth + // it. Any wasted space is measured as fragmentation. + if (space->last_collection_was_minor) + continue; + else + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE); + } + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) { + // Prepare to sweep the block for holes. + alloc->alloc = alloc->sweep = alloc->block = block; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP)) { + // In the last cycle we noted that this block consists of + // mostly old data. Sweep any garbage, commit the mark as + // venerable, and avoid allocating into it. + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP); + if (space->last_collection_was_minor) { + nofl_finish_sweeping_in_block(alloc, space); + nofl_block_summary_set_flag(summary, NOFL_BLOCK_VENERABLE); + continue; + } + } + // This block was marked in the last GC and needs sweeping. + // As we sweep we'll want to record how many bytes were live + // at the last collection. As we allocate we'll record how + // many granules were wasted because of fragmentation. + summary->hole_count = 0; + summary->free_granules = 0; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + break; + } else { + // Otherwise this block is completely empty and is on the + // empties list. We take from the empties list only after all + // the NEEDS_SWEEP blocks are processed. + continue; + } + } else { + // We are done sweeping for blocks. Now take from the empties + // list. + block = nofl_pop_empty_block(space); + // No empty block? Return 0 to cause collection. + if (!block) + return 0; + + // Maybe we should use this empty as a target for evacuation. + if (nofl_push_evacuation_target_if_possible(space, block)) + continue; + + // Otherwise give the block to the allocator. + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); + summary->hole_count = 1; + summary->free_granules = NOFL_GRANULES_PER_BLOCK; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + alloc->block = block; + alloc->alloc = block; + alloc->sweep = block + NOFL_BLOCK_SIZE; + return NOFL_GRANULES_PER_BLOCK; + } + } + } +} + +static void +nofl_finish_sweeping_in_block(struct nofl_allocator *alloc, + struct nofl_space *space) { + do { + nofl_allocator_finish_hole(alloc); + } while (nofl_allocator_next_hole_in_block(alloc, space)); +} + +// Another thread is triggering GC. Before we stop, finish clearing the +// dead mark bytes for the mutator's block, and release the block. 
+static void +nofl_finish_sweeping(struct nofl_allocator *alloc, + struct nofl_space *space) { + while (nofl_allocator_next_hole(alloc, space)) {} +} + +static struct gc_ref +nofl_allocate(struct nofl_allocator *alloc, struct nofl_space *space, + size_t size, void (*gc)(void*), void *gc_data) { + GC_ASSERT(size > 0); + GC_ASSERT(size <= gc_allocator_large_threshold()); + size = align_up(size, NOFL_GRANULE_SIZE); + + if (alloc->alloc + size > alloc->sweep) { + size_t granules = size >> NOFL_GRANULE_SIZE_LOG_2; + while (1) { + size_t hole = nofl_allocator_next_hole(alloc, space); + if (hole >= granules) { + nofl_clear_memory(alloc->alloc, hole * NOFL_GRANULE_SIZE); + break; + } + if (!hole) + gc(gc_data); + } + } + + struct gc_ref ret = gc_ref(alloc->alloc); + alloc->alloc += size; + gc_update_alloc_table(ret, size); + return ret; +} + +static struct nofl_slab* +nofl_allocate_slabs(size_t nslabs) { + size_t size = nslabs * NOFL_SLAB_SIZE; + size_t extent = size + NOFL_SLAB_SIZE; + + char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("mmap failed"); + return NULL; + } + + uintptr_t base = (uintptr_t) mem; + uintptr_t end = base + extent; + uintptr_t aligned_base = align_up(base, NOFL_SLAB_SIZE); + uintptr_t aligned_end = aligned_base + size; + + if (aligned_base - base) + munmap((void*)base, aligned_base - base); + if (end - aligned_end) + munmap((void*)aligned_end, end - aligned_end); + + return (struct nofl_slab*) aligned_base; +} + +static int +nofl_space_init(struct nofl_space *space, size_t size, int atomic, + double venerable_threshold) { + size = align_up(size, NOFL_BLOCK_SIZE); + size_t reserved = align_up(size, NOFL_SLAB_SIZE); + size_t nslabs = reserved / NOFL_SLAB_SIZE; + struct nofl_slab *slabs = nofl_allocate_slabs(nslabs); + if (!slabs) + return 0; + + space->marked_mask = NOFL_METADATA_BYTE_MARK_0; + nofl_space_update_mark_patterns(space, 0); + space->slabs = slabs; + space->nslabs = nslabs; + space->low_addr = (uintptr_t) slabs; + space->extent = reserved; + space->next_block = 0; + space->evacuation_minimum_reserve = 0.02; + space->evacuation_reserve = space->evacuation_minimum_reserve; + space->venerable_threshold = venerable_threshold; + for (size_t slab = 0; slab < nslabs; slab++) { + for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { + uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; + if (reserved > size) { + nofl_push_unavailable_block(space, addr); + reserved -= NOFL_BLOCK_SIZE; + } else { + if (!nofl_push_evacuation_target_if_needed(space, addr)) + nofl_push_empty_block(space, addr); + } + } + } + return 1; +} + +#endif // NOFL_SPACE_H diff --git a/src/swar.h b/src/swar.h new file mode 100644 index 000000000..293d99ec2 --- /dev/null +++ b/src/swar.h @@ -0,0 +1,51 @@ +#ifndef SWAR_H +#define SWAR_H + +#include + +static inline size_t +count_zero_bytes(uint64_t bytes) { + return bytes ? 
(__builtin_ctzll(bytes) / 8) : sizeof(bytes); +} + +static uint64_t +broadcast_byte(uint8_t byte) { + uint64_t result = byte; + return result * 0x0101010101010101ULL; +} + +static inline uint64_t +load_eight_aligned_bytes(uint8_t *ptr) { + GC_ASSERT(((uintptr_t)ptr & 7) == 0); + uint8_t * __attribute__((aligned(8))) aligned_ptr = ptr; + uint64_t word; + memcpy(&word, aligned_ptr, 8); +#ifdef WORDS_BIGENDIAN + word = __builtin_bswap64(word); +#endif + return word; +} + +static size_t +scan_for_byte(uint8_t *ptr, size_t limit, uint64_t mask) { + size_t n = 0; + size_t unaligned = ((uintptr_t) ptr) & 7; + if (unaligned) { + uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8); + bytes &= mask; + if (bytes) + return count_zero_bytes(bytes); + n += 8 - unaligned; + } + + for(; n < limit; n += 8) { + uint64_t bytes = load_eight_aligned_bytes(ptr + n); + bytes &= mask; + if (bytes) + return n + count_zero_bytes(bytes); + } + + return limit; +} + +#endif // SWAR_H diff --git a/src/whippet.c b/src/whippet.c index 2f77b251e..76f8f1ed5 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -19,6 +19,7 @@ #include "gc-stack.h" #include "gc-trace.h" #include "large-object-space.h" +#include "nofl-space.h" #if GC_PARALLEL #include "parallel-tracer.h" #else @@ -27,269 +28,10 @@ #include "spin.h" #include "whippet-attrs.h" -#define GRANULE_SIZE 16 -#define GRANULE_SIZE_LOG_2 4 -#define MEDIUM_OBJECT_THRESHOLD 256 -#define MEDIUM_OBJECT_GRANULE_THRESHOLD 16 #define LARGE_OBJECT_THRESHOLD 8192 -#define LARGE_OBJECT_GRANULE_THRESHOLD 512 - -STATIC_ASSERT_EQ(GRANULE_SIZE, 1 << GRANULE_SIZE_LOG_2); -STATIC_ASSERT_EQ(MEDIUM_OBJECT_THRESHOLD, - MEDIUM_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); -STATIC_ASSERT_EQ(LARGE_OBJECT_THRESHOLD, - LARGE_OBJECT_GRANULE_THRESHOLD * GRANULE_SIZE); - -// Each granule has one mark byte stored in a side table. A granule's -// mark state is a whole byte instead of a bit to facilitate parallel -// marking. (Parallel markers are allowed to race.) We also use this -// byte to compute object extent, via a bit flag indicating -// end-of-object. -// -// Because we want to allow for conservative roots, we need to know -// whether an address indicates an object or not. That means that when -// an object is allocated, it has to set a bit, somewhere. We use the -// metadata byte for this purpose, setting the "young" bit. -// -// The "young" bit's name might make you think about generational -// collection, and indeed all objects collected in a minor collection -// will have this bit set. However, whippet never needs to check for -// the young bit; if it weren't for the need to identify conservative -// roots, we wouldn't need a young bit at all. Perhaps in an -// all-precise system, we would be able to avoid the overhead of -// initializing mark byte upon each fresh allocation. -// -// When an object becomes dead after a GC, it will still have a bit set -// -- maybe the young bit, or maybe a survivor bit. The sweeper has to -// clear these bits before the next collection. But, for concurrent -// marking, we will also be marking "live" objects, updating their mark -// bits. So there are four object states concurrently observable: -// young, dead, survivor, and marked. (If we didn't have concurrent -// marking we would still need the "marked" state, because marking -// mutator roots before stopping is also a form of concurrent marking.) -// Even though these states are mutually exclusive, we use separate bits -// for them because we have the space. 
After each collection, the dead, -// survivor, and marked states rotate by one bit. -enum metadata_byte { - METADATA_BYTE_NONE = 0, - METADATA_BYTE_YOUNG = 1, - METADATA_BYTE_MARK_0 = 2, - METADATA_BYTE_MARK_1 = 4, - METADATA_BYTE_MARK_2 = 8, - METADATA_BYTE_END = 16, - METADATA_BYTE_EPHEMERON = 32, - METADATA_BYTE_PINNED = 64, - METADATA_BYTE_UNUSED_1 = 128 -}; - -static uint8_t rotate_dead_survivor_marked(uint8_t mask) { - uint8_t all = - METADATA_BYTE_MARK_0 | METADATA_BYTE_MARK_1 | METADATA_BYTE_MARK_2; - return ((mask << 1) | (mask >> 2)) & all; -} - -#define SLAB_SIZE (4 * 1024 * 1024) -#define BLOCK_SIZE (64 * 1024) -#define METADATA_BYTES_PER_BLOCK (BLOCK_SIZE / GRANULE_SIZE) -#define BLOCKS_PER_SLAB (SLAB_SIZE / BLOCK_SIZE) -#define META_BLOCKS_PER_SLAB (METADATA_BYTES_PER_BLOCK * BLOCKS_PER_SLAB / BLOCK_SIZE) -#define NONMETA_BLOCKS_PER_SLAB (BLOCKS_PER_SLAB - META_BLOCKS_PER_SLAB) -#define METADATA_BYTES_PER_SLAB (NONMETA_BLOCKS_PER_SLAB * METADATA_BYTES_PER_BLOCK) -#define SLACK_METADATA_BYTES_PER_SLAB (META_BLOCKS_PER_SLAB * METADATA_BYTES_PER_BLOCK) -#define REMSET_BYTES_PER_BLOCK (SLACK_METADATA_BYTES_PER_SLAB / BLOCKS_PER_SLAB) -#define REMSET_BYTES_PER_SLAB (REMSET_BYTES_PER_BLOCK * NONMETA_BLOCKS_PER_SLAB) -#define SLACK_REMSET_BYTES_PER_SLAB (REMSET_BYTES_PER_BLOCK * META_BLOCKS_PER_SLAB) -#define SUMMARY_BYTES_PER_BLOCK (SLACK_REMSET_BYTES_PER_SLAB / BLOCKS_PER_SLAB) -#define SUMMARY_BYTES_PER_SLAB (SUMMARY_BYTES_PER_BLOCK * NONMETA_BLOCKS_PER_SLAB) -#define SLACK_SUMMARY_BYTES_PER_SLAB (SUMMARY_BYTES_PER_BLOCK * META_BLOCKS_PER_SLAB) -#define HEADER_BYTES_PER_SLAB SLACK_SUMMARY_BYTES_PER_SLAB - -struct slab; - -struct slab_header { - union { - struct { - struct slab *next; - struct slab *prev; - }; - uint8_t padding[HEADER_BYTES_PER_SLAB]; - }; -}; -STATIC_ASSERT_EQ(sizeof(struct slab_header), HEADER_BYTES_PER_SLAB); - -// Sometimes we want to put a block on a singly-linked list. For that -// there's a pointer reserved in the block summary. But because the -// pointer is aligned (32kB on 32-bit, 64kB on 64-bit), we can portably -// hide up to 15 flags in the low bits. These flags can be accessed -// non-atomically by the mutator when it owns a block; otherwise they -// need to be accessed atomically. -enum block_summary_flag { - BLOCK_OUT_FOR_THREAD = 0x1, - BLOCK_HAS_PIN = 0x2, - BLOCK_PAGED_OUT = 0x4, - BLOCK_NEEDS_SWEEP = 0x8, - BLOCK_UNAVAILABLE = 0x10, - BLOCK_EVACUATE = 0x20, - BLOCK_VENERABLE = 0x40, - BLOCK_VENERABLE_AFTER_SWEEP = 0x80, - BLOCK_FLAG_UNUSED_8 = 0x100, - BLOCK_FLAG_UNUSED_9 = 0x200, - BLOCK_FLAG_UNUSED_10 = 0x400, - BLOCK_FLAG_UNUSED_11 = 0x800, - BLOCK_FLAG_UNUSED_12 = 0x1000, - BLOCK_FLAG_UNUSED_13 = 0x2000, - BLOCK_FLAG_UNUSED_14 = 0x4000, -}; - -struct block_summary { - union { - struct { - //struct block *next; - // Counters related to previous collection: how many holes there - // were, and how much space they had. - uint16_t hole_count; - uint16_t free_granules; - // Counters related to allocation since previous collection: - // wasted space due to fragmentation. - uint16_t holes_with_fragmentation; - uint16_t fragmentation_granules; - // After a block is swept, if it's empty it goes on the empties - // list. Otherwise if it's not immediately used by a mutator (as - // is usually the case), it goes on the swept list. Both of these - // lists use this field. But as the next element in the field is - // block-aligned, we stash flags in the low bits. 
- uintptr_t next_and_flags; - }; - uint8_t padding[SUMMARY_BYTES_PER_BLOCK]; - }; -}; -STATIC_ASSERT_EQ(sizeof(struct block_summary), SUMMARY_BYTES_PER_BLOCK); - -struct block { - char data[BLOCK_SIZE]; -}; - -struct slab { - struct slab_header header; - struct block_summary summaries[NONMETA_BLOCKS_PER_SLAB]; - uint8_t remembered_set[REMSET_BYTES_PER_SLAB]; - uint8_t metadata[METADATA_BYTES_PER_SLAB]; - struct block blocks[NONMETA_BLOCKS_PER_SLAB]; -}; -STATIC_ASSERT_EQ(sizeof(struct slab), SLAB_SIZE); - -static struct slab *object_slab(void *obj) { - uintptr_t addr = (uintptr_t) obj; - uintptr_t base = align_down(addr, SLAB_SIZE); - return (struct slab*) base; -} - -static uint8_t *metadata_byte_for_addr(uintptr_t addr) { - uintptr_t base = align_down(addr, SLAB_SIZE); - uintptr_t granule = (addr & (SLAB_SIZE - 1)) >> GRANULE_SIZE_LOG_2; - return (uint8_t*) (base + granule); -} - -static uint8_t *metadata_byte_for_object(struct gc_ref ref) { - return metadata_byte_for_addr(gc_ref_value(ref)); -} - -#define GRANULES_PER_BLOCK (BLOCK_SIZE / GRANULE_SIZE) -#define GRANULES_PER_REMSET_BYTE (GRANULES_PER_BLOCK / REMSET_BYTES_PER_BLOCK) - -static struct block_summary* block_summary_for_addr(uintptr_t addr) { - uintptr_t base = align_down(addr, SLAB_SIZE); - uintptr_t block = (addr & (SLAB_SIZE - 1)) / BLOCK_SIZE; - return (struct block_summary*) (base + block * sizeof(struct block_summary)); -} - -static uintptr_t block_summary_has_flag(struct block_summary *summary, - enum block_summary_flag flag) { - return summary->next_and_flags & flag; -} -static void block_summary_set_flag(struct block_summary *summary, - enum block_summary_flag flag) { - summary->next_and_flags |= flag; -} -static void block_summary_clear_flag(struct block_summary *summary, - enum block_summary_flag flag) { - summary->next_and_flags &= ~(uintptr_t)flag; -} -static uintptr_t block_summary_next(struct block_summary *summary) { - return align_down(summary->next_and_flags, BLOCK_SIZE); -} -static void block_summary_set_next(struct block_summary *summary, - uintptr_t next) { - GC_ASSERT((next & (BLOCK_SIZE - 1)) == 0); - summary->next_and_flags = - (summary->next_and_flags & (BLOCK_SIZE - 1)) | next; -} - -// Lock-free block list. 
-struct block_list { - size_t count; - uintptr_t blocks; -}; - -static void push_block(struct block_list *list, uintptr_t block) { - atomic_fetch_add_explicit(&list->count, 1, memory_order_acq_rel); - struct block_summary *summary = block_summary_for_addr(block); - uintptr_t next = atomic_load_explicit(&list->blocks, memory_order_acquire); - do { - block_summary_set_next(summary, next); - } while (!atomic_compare_exchange_weak(&list->blocks, &next, block)); -} - -static uintptr_t pop_block(struct block_list *list) { - uintptr_t head = atomic_load_explicit(&list->blocks, memory_order_acquire); - struct block_summary *summary; - uintptr_t next; - do { - if (!head) - return 0; - summary = block_summary_for_addr(head); - next = block_summary_next(summary); - } while (!atomic_compare_exchange_weak(&list->blocks, &head, next)); - block_summary_set_next(summary, 0); - atomic_fetch_sub_explicit(&list->count, 1, memory_order_acq_rel); - return head; -} - -static inline size_t size_to_granules(size_t size) { - return (size + GRANULE_SIZE - 1) >> GRANULE_SIZE_LOG_2; -} - -struct evacuation_allocator { - size_t allocated; // atomically - size_t limit; - uintptr_t block_cursor; // atomically -}; - -struct mark_space { - uint64_t sweep_mask; - uint8_t live_mask; - uint8_t marked_mask; - uint8_t evacuating; - uintptr_t low_addr; - size_t extent; - size_t heap_size; - uintptr_t next_block; // atomically - struct block_list empty; - struct block_list unavailable; - struct block_list evacuation_targets; - double evacuation_minimum_reserve; - double evacuation_reserve; - double venerable_threshold; - ssize_t pending_unavailable_bytes; // atomically - struct evacuation_allocator evacuation_allocator; - struct slab *slabs; - size_t nslabs; - uintptr_t granules_freed_by_last_collection; // atomically - uintptr_t fragmentation_granules_since_last_collection; // atomically -}; struct gc_heap { - struct mark_space mark_space; + struct nofl_space nofl_space; struct large_object_space large_object_space; struct gc_extern_space *extern_space; size_t large_object_pages; @@ -336,10 +78,7 @@ struct gc_mutator_mark_buf { }; struct gc_mutator { - // Bump-pointer allocation into holes. 
- uintptr_t alloc; - uintptr_t sweep; - uintptr_t block; + struct nofl_allocator allocator; struct gc_heap *heap; struct gc_stack stack; struct gc_mutator_roots *roots; @@ -352,319 +91,32 @@ struct gc_mutator { struct gc_mutator *next; }; -static inline struct mark_space* heap_mark_space(struct gc_heap *heap) { - return &heap->mark_space; +static inline struct nofl_space* +heap_nofl_space(struct gc_heap *heap) { + return &heap->nofl_space; } -static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { +static inline struct large_object_space* +heap_large_object_space(struct gc_heap *heap) { return &heap->large_object_space; } -static inline struct gc_extern_space* heap_extern_space(struct gc_heap *heap) { +static inline struct gc_extern_space* +heap_extern_space(struct gc_heap *heap) { return heap->extern_space; } -static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) { +static inline struct gc_heap* +mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } -static inline void clear_memory(uintptr_t addr, size_t size) { - memset((char*)addr, 0, size); -} - static void collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) GC_NEVER_INLINE; -static inline uint64_t load_eight_aligned_bytes(uint8_t *mark) { - GC_ASSERT(((uintptr_t)mark & 7) == 0); - uint8_t * __attribute__((aligned(8))) aligned_mark = mark; - uint64_t word; - memcpy(&word, aligned_mark, 8); -#ifdef WORDS_BIGENDIAN - word = __builtin_bswap64(word); -#endif - return word; -} - -static inline size_t count_zero_bytes(uint64_t bytes) { - return bytes ? (__builtin_ctzll(bytes) / 8) : sizeof(bytes); -} - -static uint64_t broadcast_byte(uint8_t byte) { - uint64_t result = byte; - return result * 0x0101010101010101ULL; -} - -static size_t next_mark(uint8_t *mark, size_t limit, uint64_t sweep_mask) { - size_t n = 0; - // If we have a hole, it is likely to be more that 8 granules long. - // Assuming that it's better to make aligned loads, first we align the - // sweep pointer, then we load aligned mark words. 
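  // Aside: a worked example of the eight-at-a-time scan used here and by
  // scan_for_byte in the new swar.h, with an illustrative sweep mask of
  // broadcast_byte(0x06) = 0x0606060606060606.  If the next eight
  // metadata bytes are, in memory order,
  //   00 00 04 00 00 12 00 00
  // then the (little-endian) word loaded is 0x0000120000040000; masking
  // with the broadcast value leaves 0x0000020000040000, which has 18
  // trailing zero bits, so count_zero_bytes returns 18 / 8 = 2: the
  // first live mark byte sits at offset 2 in this chunk.  On big-endian
  // targets the bswap in load_eight_aligned_bytes produces the same
  // little-endian view.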
- size_t unaligned = ((uintptr_t) mark) & 7; - if (unaligned) { - uint64_t bytes = load_eight_aligned_bytes(mark - unaligned) >> (unaligned * 8); - bytes &= sweep_mask; - if (bytes) - return count_zero_bytes(bytes); - n += 8 - unaligned; - } - - for(; n < limit; n += 8) { - uint64_t bytes = load_eight_aligned_bytes(mark + n); - bytes &= sweep_mask; - if (bytes) - return n + count_zero_bytes(bytes); - } - - return limit; -} - -static size_t mark_space_live_object_granules(uint8_t *metadata) { - return next_mark(metadata, -1, broadcast_byte(METADATA_BYTE_END)) + 1; -} - -static inline int mark_space_mark_object(struct mark_space *space, - struct gc_ref ref) { - uint8_t *loc = metadata_byte_for_object(ref); - uint8_t byte = *loc; - if (byte & space->marked_mask) - return 0; - uint8_t mask = METADATA_BYTE_YOUNG | METADATA_BYTE_MARK_0 - | METADATA_BYTE_MARK_1 | METADATA_BYTE_MARK_2; - *loc = (byte & ~mask) | space->marked_mask; - return 1; -} - -static uintptr_t make_evacuation_allocator_cursor(uintptr_t block, - size_t allocated) { - GC_ASSERT(allocated < (BLOCK_SIZE - 1) * (uint64_t) BLOCK_SIZE); - return align_down(block, BLOCK_SIZE) | (allocated / BLOCK_SIZE); -} - -static void prepare_evacuation_allocator(struct evacuation_allocator *alloc, - struct block_list *targets) { - uintptr_t first_block = targets->blocks; - atomic_store_explicit(&alloc->allocated, 0, memory_order_release); - alloc->limit = - atomic_load_explicit(&targets->count, memory_order_acquire) * BLOCK_SIZE; - atomic_store_explicit(&alloc->block_cursor, - make_evacuation_allocator_cursor(first_block, 0), - memory_order_release); -} - -static void clear_remaining_metadata_bytes_in_block(uintptr_t block, - uintptr_t allocated) { - GC_ASSERT((allocated & (GRANULE_SIZE - 1)) == 0); - uintptr_t base = block + allocated; - uintptr_t limit = block + BLOCK_SIZE; - uintptr_t granules = (limit - base) >> GRANULE_SIZE_LOG_2; - GC_ASSERT(granules <= GRANULES_PER_BLOCK); - memset(metadata_byte_for_addr(base), 0, granules); -} - -static void finish_evacuation_allocator_block(uintptr_t block, - uintptr_t allocated) { - GC_ASSERT(allocated <= BLOCK_SIZE); - struct block_summary *summary = block_summary_for_addr(block); - block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); - size_t fragmentation = (BLOCK_SIZE - allocated) >> GRANULE_SIZE_LOG_2; - summary->hole_count = 1; - summary->free_granules = GRANULES_PER_BLOCK; - summary->holes_with_fragmentation = fragmentation ? 1 : 0; - summary->fragmentation_granules = fragmentation; - if (fragmentation) - clear_remaining_metadata_bytes_in_block(block, allocated); -} - -static void finish_evacuation_allocator(struct evacuation_allocator *alloc, - struct block_list *targets, - struct block_list *empties, - size_t reserve) { - // Blocks that we used for evacuation get returned to the mutator as - // sweepable blocks. Blocks that we didn't get to use go to the - // empties. - size_t allocated = atomic_load_explicit(&alloc->allocated, - memory_order_acquire); - atomic_store_explicit(&alloc->allocated, 0, memory_order_release); - if (allocated > alloc->limit) - allocated = alloc->limit; - while (allocated >= BLOCK_SIZE) { - uintptr_t block = pop_block(targets); - GC_ASSERT(block); - allocated -= BLOCK_SIZE; - } - if (allocated) { - // Finish off the last partially-filled block. 
- uintptr_t block = pop_block(targets); - GC_ASSERT(block); - finish_evacuation_allocator_block(block, allocated); - } - size_t remaining = atomic_load_explicit(&targets->count, memory_order_acquire); - while (remaining-- > reserve) - push_block(empties, pop_block(targets)); -} - -static struct gc_ref evacuation_allocate(struct mark_space *space, - size_t granules) { - // All collector threads compete to allocate from what is logically a - // single bump-pointer arena, which is actually composed of a linked - // list of blocks. - struct evacuation_allocator *alloc = &space->evacuation_allocator; - uintptr_t cursor = atomic_load_explicit(&alloc->block_cursor, - memory_order_acquire); - size_t bytes = granules * GRANULE_SIZE; - size_t prev = atomic_load_explicit(&alloc->allocated, memory_order_acquire); - size_t block_mask = (BLOCK_SIZE - 1); - size_t next; - do { - if (prev >= alloc->limit) - // No more space. - return gc_ref_null(); - next = prev + bytes; - if ((prev ^ next) & ~block_mask) - // Allocation straddles a block boundary; advance so it starts a - // fresh block. - next = (next & ~block_mask) + bytes; - } while (!atomic_compare_exchange_weak(&alloc->allocated, &prev, next)); - // OK, we've claimed our memory, starting at next - bytes. Now find - // the node in the linked list of evacuation targets that corresponds - // to this allocation pointer. - uintptr_t block = cursor & ~block_mask; - // This is the SEQ'th block to be allocated into. - uintptr_t seq = cursor & block_mask; - // Therefore this block handles allocations starting at SEQ*BLOCK_SIZE - // and continuing for BLOCK_SIZE bytes. - uintptr_t base = seq * BLOCK_SIZE; - - while ((base ^ next) & ~block_mask) { - GC_ASSERT(base < next); - if (base + BLOCK_SIZE > prev) { - // The allocation straddles a block boundary, and the cursor has - // caught up so that we identify the block for the previous - // allocation pointer. Finish the previous block, probably - // leaving a small hole at the end. - finish_evacuation_allocator_block(block, prev - base); - } - // Cursor lags; advance it. - block = block_summary_next(block_summary_for_addr(block)); - base += BLOCK_SIZE; - if (base >= alloc->limit) { - // Ran out of blocks! - GC_ASSERT(!block); - return gc_ref_null(); - } - GC_ASSERT(block); - // This store can race with other allocators, but that's OK as long - // as it never advances the cursor beyond the allocation pointer, - // which it won't because we updated the allocation pointer already. - atomic_store_explicit(&alloc->block_cursor, - make_evacuation_allocator_cursor(block, base), - memory_order_release); - } - - uintptr_t addr = block + (next & block_mask) - bytes; - return gc_ref(addr); -} - -static inline int mark_space_evacuate_or_mark_object(struct mark_space *space, - struct gc_edge edge, - struct gc_ref old_ref) { - uint8_t *metadata = metadata_byte_for_object(old_ref); - uint8_t byte = *metadata; - if (byte & space->marked_mask) - return 0; - if (space->evacuating && - block_summary_has_flag(block_summary_for_addr(gc_ref_value(old_ref)), - BLOCK_EVACUATE)) { - // This is an evacuating collection, and we are attempting to - // evacuate this block, and we are tracing this particular object - // for what appears to be the first time. - struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); - - if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) - gc_atomic_forward_acquire(&fwd); - - switch (fwd.state) { - case GC_FORWARDING_STATE_NOT_FORWARDED: - case GC_FORWARDING_STATE_ABORTED: - // Impossible. 
- GC_CRASH(); - case GC_FORWARDING_STATE_ACQUIRED: { - // We claimed the object successfully; evacuating is up to us. - size_t object_granules = mark_space_live_object_granules(metadata); - struct gc_ref new_ref = evacuation_allocate(space, object_granules); - if (gc_ref_is_heap_object(new_ref)) { - // Copy object contents before committing, as we don't know what - // part of the object (if any) will be overwritten by the - // commit. - memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), - object_granules * GRANULE_SIZE); - gc_atomic_forward_commit(&fwd, new_ref); - // Now update extent metadata, and indicate to the caller that - // the object's fields need to be traced. - uint8_t *new_metadata = metadata_byte_for_object(new_ref); - memcpy(new_metadata + 1, metadata + 1, object_granules - 1); - gc_edge_update(edge, new_ref); - metadata = new_metadata; - // Fall through to set mark bits. - } else { - // Well shucks; allocation failed, marking the end of - // opportunistic evacuation. No future evacuation of this - // object will succeed. Mark in place instead. - gc_atomic_forward_abort(&fwd); - } - break; - } - case GC_FORWARDING_STATE_BUSY: - // Someone else claimed this object first. Spin until new address - // known, or evacuation aborts. - for (size_t spin_count = 0;; spin_count++) { - if (gc_atomic_forward_retry_busy(&fwd)) - break; - yield_for_spin(spin_count); - } - if (fwd.state == GC_FORWARDING_STATE_ABORTED) - // Remove evacuation aborted; remote will mark and enqueue. - return 0; - ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); - // Fall through. - case GC_FORWARDING_STATE_FORWARDED: - // The object has been evacuated already. Update the edge; - // whoever forwarded the object will make sure it's eventually - // traced. - gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); - return 0; - } - } - - uint8_t mask = METADATA_BYTE_YOUNG | METADATA_BYTE_MARK_0 - | METADATA_BYTE_MARK_1 | METADATA_BYTE_MARK_2; - *metadata = (byte & ~mask) | space->marked_mask; - return 1; -} - -static inline int mark_space_contains_address(struct mark_space *space, - uintptr_t addr) { - return addr - space->low_addr < space->extent; -} - -static inline int mark_space_contains_conservative_ref(struct mark_space *space, - struct gc_conservative_ref ref) { - return mark_space_contains_address(space, gc_conservative_ref_value(ref)); -} - -static inline int mark_space_contains(struct mark_space *space, - struct gc_ref ref) { - return mark_space_contains_address(space, gc_ref_value(ref)); -} - -static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, - struct gc_ref ref) { +static inline int +do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref) { if (!gc_ref_is_heap_object(ref)) return 0; - if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { - if (heap_mark_space(heap)->evacuating) - return mark_space_evacuate_or_mark_object(heap_mark_space(heap), edge, - ref); - return mark_space_mark_object(heap_mark_space(heap), ref); - } + if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) + return nofl_space_evacuate_or_mark_object(heap_nofl_space(heap), edge, ref); else if (large_object_space_contains(heap_large_object_space(heap), ref)) return large_object_space_mark_object(heap_large_object_space(heap), ref); @@ -675,7 +127,8 @@ static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) GC_ALWAYS_INLINE; -static inline int trace_edge(struct gc_heap 
*heap, struct gc_edge edge) { +static inline int +trace_edge(struct gc_heap *heap, struct gc_edge edge) { struct gc_ref ref = gc_edge_ref(edge); int is_new = do_trace(heap, edge, ref); @@ -687,135 +140,41 @@ static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge) { return is_new; } -int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { +int +gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { struct gc_ref ref = gc_edge_ref(edge); if (!gc_ref_is_heap_object(ref)) return 0; - if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { - struct mark_space *space = heap_mark_space(heap); - uint8_t *metadata = metadata_byte_for_object(ref); - uint8_t byte = *metadata; - if (byte & space->marked_mask) - return 1; - if (!space->evacuating) - return 0; - if (!block_summary_has_flag(block_summary_for_addr(gc_ref_value(ref)), - BLOCK_EVACUATE)) - return 0; + struct nofl_space *nofl_space = heap_nofl_space(heap); + if (GC_LIKELY(nofl_space_contains(nofl_space, ref))) + return nofl_space_forward_or_mark_if_traced(nofl_space, edge, ref); + + struct large_object_space *lospace = heap_large_object_space(heap); + if (large_object_space_contains(lospace, ref)) + return large_object_space_is_copied(lospace, ref); - struct gc_atomic_forward fwd = gc_atomic_forward_begin(ref); - switch (fwd.state) { - case GC_FORWARDING_STATE_NOT_FORWARDED: - return 0; - case GC_FORWARDING_STATE_BUSY: - // Someone else claimed this object first. Spin until new address - // known, or evacuation aborts. - for (size_t spin_count = 0;; spin_count++) { - if (gc_atomic_forward_retry_busy(&fwd)) - break; - yield_for_spin(spin_count); - } - if (fwd.state == GC_FORWARDING_STATE_ABORTED) - // Remote evacuation aborted; remote will mark and enqueue. - return 1; - ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); - // Fall through. - case GC_FORWARDING_STATE_FORWARDED: - gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); - return 1; - default: - GC_CRASH(); - } - } else if (large_object_space_contains(heap_large_object_space(heap), ref)) { - return large_object_space_is_copied(heap_large_object_space(heap), ref); - } GC_CRASH(); } -static inline struct gc_ref mark_space_mark_conservative_ref(struct mark_space *space, - struct gc_conservative_ref ref, - int possibly_interior) { - uintptr_t addr = gc_conservative_ref_value(ref); - - if (possibly_interior) { - addr = align_down(addr, GRANULE_SIZE); - } else { - // Addr not an aligned granule? Not an object. - uintptr_t displacement = addr & (GRANULE_SIZE - 1); - if (!gc_is_valid_conservative_ref_displacement(displacement)) - return gc_ref_null(); - addr -= displacement; - } - - // Addr in meta block? Not an object. - if ((addr & (SLAB_SIZE - 1)) < META_BLOCKS_PER_SLAB * BLOCK_SIZE) - return gc_ref_null(); - - // Addr in block that has been paged out? Not an object. - struct block_summary *summary = block_summary_for_addr(addr); - if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) - return gc_ref_null(); - - uint8_t *loc = metadata_byte_for_addr(addr); - uint8_t byte = atomic_load_explicit(loc, memory_order_relaxed); - - // Already marked object? Nothing to do. - if (byte & space->marked_mask) - return gc_ref_null(); - - // Addr is the not start of an unmarked object? Search backwards if - // we have interior pointers, otherwise not an object. 
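
/* [Editor's note: illustrative sketch, not part of this patch.  The
 * search described in the comment above, and implemented just below,
 * resolves a possibly-interior pointer by walking the per-granule
 * metadata backwards: it stops at a granule carrying an object-start
 * bit, and gives up if it first reaches a granule carrying an
 * end-of-object bit or falls off the front of the block.  A
 * standalone rendering with hypothetical bit assignments follows.] */
#include <stdint.h>
#include <stddef.h>

#define SKETCH_META_START 0x01   /* granule begins a (possibly dead) object */
#define SKETCH_META_END   0x80   /* granule is the last granule of an object */

/* Given one block's metadata and the granule a pointer falls in,
 * return the granule index of the enclosing object's first granule,
 * or (size_t)-1 if the pointer does not point into any object. */
static size_t find_object_start(const uint8_t *meta, size_t granule) {
  if (meta[granule] & SKETCH_META_START)
    return granule;                   /* pointer already hits a header */
  while (granule > 0) {
    granule--;
    if (meta[granule] & SKETCH_META_END)
      return (size_t)-1;              /* walked past another object's tail */
    if (meta[granule] & SKETCH_META_START)
      return granule;                 /* found the enclosing object */
  }
  return (size_t)-1;                  /* fell off the front of the block */
}
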
- uint8_t object_start_mask = space->live_mask | METADATA_BYTE_YOUNG; - if (!(byte & object_start_mask)) { - if (!possibly_interior) - return gc_ref_null(); - - uintptr_t block_base = align_down(addr, BLOCK_SIZE); - uint8_t *loc_base = metadata_byte_for_addr(block_base); - do { - // Searched past block? Not an object. - if (loc-- == loc_base) - return gc_ref_null(); - - byte = atomic_load_explicit(loc, memory_order_relaxed); - - // Ran into the end of some other allocation? Not an object, then. - if (byte & METADATA_BYTE_END) - return gc_ref_null(); - - // Continue until we find object start. - } while (!(byte & object_start_mask)); - - // Found object start, and object is unmarked; adjust addr. - addr = block_base + (loc - loc_base) * GRANULE_SIZE; - } - - uint8_t mask = METADATA_BYTE_YOUNG | METADATA_BYTE_MARK_0 - | METADATA_BYTE_MARK_1 | METADATA_BYTE_MARK_2; - atomic_store_explicit(loc, (byte & ~mask) | space->marked_mask, - memory_order_relaxed); - - return gc_ref(addr); -} - -static inline struct gc_ref do_trace_conservative_ref(struct gc_heap *heap, - struct gc_conservative_ref ref, - int possibly_interior) { +static inline struct gc_ref +do_trace_conservative_ref(struct gc_heap *heap, struct gc_conservative_ref ref, + int possibly_interior) { if (!gc_conservative_ref_might_be_a_heap_object(ref, possibly_interior)) return gc_ref_null(); - if (GC_LIKELY(mark_space_contains_conservative_ref(heap_mark_space(heap), ref))) - return mark_space_mark_conservative_ref(heap_mark_space(heap), ref, - possibly_interior); - else - return large_object_space_mark_conservative_ref(heap_large_object_space(heap), - ref, possibly_interior); + struct nofl_space *nofl_space = heap_nofl_space(heap); + if (GC_LIKELY(nofl_space_contains_conservative_ref(nofl_space, ref))) + return nofl_space_mark_conservative_ref(nofl_space, ref, possibly_interior); + + struct large_object_space *lospace = heap_large_object_space(heap); + return large_object_space_mark_conservative_ref(lospace, ref, + possibly_interior); } -static inline struct gc_ref trace_conservative_ref(struct gc_heap *heap, - struct gc_conservative_ref ref, - int possibly_interior) { +static inline struct gc_ref +trace_conservative_ref(struct gc_heap *heap, struct gc_conservative_ref ref, + int possibly_interior) { struct gc_ref ret = do_trace_conservative_ref(heap, ref, possibly_interior); if (gc_ref_is_heap_object(ret) && @@ -826,35 +185,29 @@ static inline struct gc_ref trace_conservative_ref(struct gc_heap *heap, return ret; } -static inline size_t mark_space_object_size(struct mark_space *space, - struct gc_ref ref) { - uint8_t *loc = metadata_byte_for_object(ref); - size_t granules = mark_space_live_object_granules(loc); - return granules * GRANULE_SIZE; -} - -static int heap_has_multiple_mutators(struct gc_heap *heap) { - return atomic_load_explicit(&heap->multithreaded, memory_order_relaxed); -} - -static int mutators_are_stopping(struct gc_heap *heap) { +static int +mutators_are_stopping(struct gc_heap *heap) { return atomic_load_explicit(&heap->collecting, memory_order_relaxed); } -static inline void heap_lock(struct gc_heap *heap) { +static inline void +heap_lock(struct gc_heap *heap) { pthread_mutex_lock(&heap->lock); } -static inline void heap_unlock(struct gc_heap *heap) { +static inline void +heap_unlock(struct gc_heap *heap) { pthread_mutex_unlock(&heap->lock); } // with heap lock -static inline int all_mutators_stopped(struct gc_heap *heap) { +static inline int +all_mutators_stopped(struct gc_heap *heap) { return heap->mutator_count == 
heap->paused_mutator_count + heap->inactive_mutator_count; } -static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { +static void +add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = heap; mut->event_listener_data = heap->event_listener.mutator_added(heap->event_listener_data); @@ -869,7 +222,8 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { heap_unlock(heap); } -static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { +static void +remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; heap_lock(heap); @@ -881,12 +235,14 @@ static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { heap_unlock(heap); } -static void request_mutators_to_stop(struct gc_heap *heap) { +static void +request_mutators_to_stop(struct gc_heap *heap) { GC_ASSERT(!mutators_are_stopping(heap)); atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); } -static void allow_mutators_to_continue(struct gc_heap *heap) { +static void +allow_mutators_to_continue(struct gc_heap *heap) { GC_ASSERT(mutators_are_stopping(heap)); GC_ASSERT(all_mutators_stopped(heap)); heap->paused_mutator_count = 0; @@ -895,129 +251,18 @@ static void allow_mutators_to_continue(struct gc_heap *heap) { pthread_cond_broadcast(&heap->mutator_cond); } -static void push_unavailable_block(struct mark_space *space, uintptr_t block) { - struct block_summary *summary = block_summary_for_addr(block); - GC_ASSERT(!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)); - GC_ASSERT(!block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); - block_summary_set_flag(summary, BLOCK_UNAVAILABLE); - madvise((void*)block, BLOCK_SIZE, MADV_DONTNEED); - push_block(&space->unavailable, block); -} - -static uintptr_t pop_unavailable_block(struct mark_space *space) { - uintptr_t block = pop_block(&space->unavailable); - if (!block) - return 0; - struct block_summary *summary = block_summary_for_addr(block); - GC_ASSERT(block_summary_has_flag(summary, BLOCK_UNAVAILABLE)); - block_summary_clear_flag(summary, BLOCK_UNAVAILABLE); - return block; -} - -static uintptr_t pop_empty_block(struct mark_space *space) { - return pop_block(&space->empty); -} - -static int maybe_push_evacuation_target(struct mark_space *space, - uintptr_t block, double reserve) { - GC_ASSERT(!block_summary_has_flag(block_summary_for_addr(block), - BLOCK_NEEDS_SWEEP)); - size_t targets = atomic_load_explicit(&space->evacuation_targets.count, - memory_order_acquire); - size_t total = space->nslabs * NONMETA_BLOCKS_PER_SLAB; - size_t unavailable = atomic_load_explicit(&space->unavailable.count, - memory_order_acquire); - if (targets >= (total - unavailable) * reserve) - return 0; - - push_block(&space->evacuation_targets, block); - return 1; -} - -static int push_evacuation_target_if_needed(struct mark_space *space, - uintptr_t block) { - return maybe_push_evacuation_target(space, block, - space->evacuation_minimum_reserve); -} - -static int push_evacuation_target_if_possible(struct mark_space *space, - uintptr_t block) { - return maybe_push_evacuation_target(space, block, - space->evacuation_reserve); -} - -static void push_empty_block(struct mark_space *space, uintptr_t block) { - GC_ASSERT(!block_summary_has_flag(block_summary_for_addr(block), - BLOCK_NEEDS_SWEEP)); - push_block(&space->empty, block); -} - -static ssize_t mark_space_request_release_memory(struct mark_space *space, - size_t bytes) { - return 
atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes; -} - -static void mark_space_reacquire_memory(struct mark_space *space, - size_t bytes) { - ssize_t pending = - atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; - while (pending + BLOCK_SIZE <= 0) { - uintptr_t block = pop_unavailable_block(space); - GC_ASSERT(block); - if (push_evacuation_target_if_needed(space, block)) - continue; - push_empty_block(space, block); - pending = atomic_fetch_add(&space->pending_unavailable_bytes, BLOCK_SIZE) - + BLOCK_SIZE; - } -} - -static size_t next_hole(struct gc_mutator *mut); - -static int sweep_until_memory_released(struct gc_mutator *mut) { - struct mark_space *space = heap_mark_space(mutator_heap(mut)); - ssize_t pending = atomic_load_explicit(&space->pending_unavailable_bytes, - memory_order_acquire); - // First try to unmap previously-identified empty blocks. If pending - // > 0 and other mutators happen to identify empty blocks, they will - // be unmapped directly and moved to the unavailable list. - while (pending > 0) { - uintptr_t block = pop_empty_block(space); - if (!block) - break; - // Note that we may have competing uses; if we're evacuating, - // perhaps we should push this block to the evacuation target list. - // That would enable us to reach a fragmentation low water-mark in - // fewer cycles. But maybe evacuation started in order to obtain - // free blocks for large objects; in that case we should just reap - // the fruits of our labor. Probably this second use-case is more - // important. - push_unavailable_block(space, block); - pending = atomic_fetch_sub(&space->pending_unavailable_bytes, BLOCK_SIZE); - pending -= BLOCK_SIZE; - } - // Otherwise, sweep, transitioning any empty blocks to unavailable and - // throwing away any non-empty block. A bit wasteful but hastening - // the next collection is a reasonable thing to do here. 
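
/* [Editor's note: illustrative sketch, not part of this patch.  The
 * pending_unavailable_bytes protocol used in these functions is a
 * signed atomic balance: a positive value means the block-structured
 * space still owes that many bytes back to the OS (so sweepers should
 * return empty blocks), while a negative value is credit that lets it
 * take unavailable blocks back.  The toy version below shows the
 * three operations on such a balance; names and the block source are
 * hypothetical.] */
#include <stdatomic.h>
#include <stddef.h>

#define SKETCH_BLOCK_SIZE ((ptrdiff_t)64 * 1024)

static _Atomic ptrdiff_t pending_release_bytes;

/* The large-object space wants BYTES of budget: record the debt and
 * return the new balance. */
static ptrdiff_t request_release(size_t bytes) {
  return atomic_fetch_add(&pending_release_bytes, (ptrdiff_t)bytes)
         + (ptrdiff_t)bytes;
}

/* A sweeper found an empty block and returned it to the OS: pay down
 * one block's worth of the debt. */
static ptrdiff_t returned_one_block(void) {
  return atomic_fetch_sub(&pending_release_bytes, SKETCH_BLOCK_SIZE)
         - SKETCH_BLOCK_SIZE;
}

/* The large-object space freed BYTES: credit them back, and report
 * how many whole blocks of credit are now available to reclaim from
 * the unavailable list. */
static size_t reacquire(size_t bytes) {
  ptrdiff_t pending = atomic_fetch_sub(&pending_release_bytes,
                                       (ptrdiff_t)bytes) - (ptrdiff_t)bytes;
  size_t reclaimable = 0;
  while (pending + SKETCH_BLOCK_SIZE <= 0) {
    /* In the patch this pops a block from the unavailable list and
       re-maps it; here we only do the accounting. */
    pending = atomic_fetch_add(&pending_release_bytes, SKETCH_BLOCK_SIZE)
              + SKETCH_BLOCK_SIZE;
    reclaimable++;
  }
  return reclaimable;
}
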
- while (pending > 0) { - if (!next_hole(mut)) - return 0; - pending = atomic_load_explicit(&space->pending_unavailable_bytes, - memory_order_acquire); - } - return pending <= 0; -} - -static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { +static void +heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { size_t previous = heap->large_object_pages; heap->large_object_pages = npages; GC_ASSERT(npages <= previous); size_t bytes = (previous - npages) << heap_large_object_space(heap)->page_size_log2; - mark_space_reacquire_memory(heap_mark_space(heap), bytes); + nofl_space_reacquire_memory(heap_nofl_space(heap), bytes); } -static void mutator_mark_buf_grow(struct gc_mutator_mark_buf *buf) { +static void +mutator_mark_buf_grow(struct gc_mutator_mark_buf *buf) { size_t old_capacity = buf->capacity; size_t old_bytes = old_capacity * sizeof(struct gc_ref); @@ -1038,27 +283,30 @@ static void mutator_mark_buf_grow(struct gc_mutator_mark_buf *buf) { buf->capacity = new_capacity; } -static void mutator_mark_buf_push(struct gc_mutator_mark_buf *buf, - struct gc_ref ref) { +static void +mutator_mark_buf_push(struct gc_mutator_mark_buf *buf, struct gc_ref ref) { if (GC_UNLIKELY(buf->size == buf->capacity)) mutator_mark_buf_grow(buf); buf->objects[buf->size++] = ref; } -static void mutator_mark_buf_release(struct gc_mutator_mark_buf *buf) { +static void +mutator_mark_buf_release(struct gc_mutator_mark_buf *buf) { size_t bytes = buf->size * sizeof(struct gc_ref); if (bytes >= getpagesize()) madvise(buf->objects, align_up(bytes, getpagesize()), MADV_DONTNEED); buf->size = 0; } -static void mutator_mark_buf_destroy(struct gc_mutator_mark_buf *buf) { +static void +mutator_mark_buf_destroy(struct gc_mutator_mark_buf *buf) { size_t bytes = buf->capacity * sizeof(struct gc_ref); if (bytes) munmap(buf->objects, bytes); } -static void enqueue_mutator_for_tracing(struct gc_mutator *mut) { +static void +enqueue_mutator_for_tracing(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mut->next == NULL); struct gc_mutator *next = @@ -1069,23 +317,26 @@ static void enqueue_mutator_for_tracing(struct gc_mutator *mut) { &next, mut)); } -static int heap_should_mark_while_stopping(struct gc_heap *heap) { +static int +heap_should_mark_while_stopping(struct gc_heap *heap) { return atomic_load_explicit(&heap->mark_while_stopping, memory_order_acquire); } -static int mutator_should_mark_while_stopping(struct gc_mutator *mut) { +static int +mutator_should_mark_while_stopping(struct gc_mutator *mut) { return heap_should_mark_while_stopping(mutator_heap(mut)); } -void gc_mutator_set_roots(struct gc_mutator *mut, - struct gc_mutator_roots *roots) { +void +gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots) { mut->roots = roots; } -void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { +void +gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { heap->roots = roots; } -void gc_heap_set_extern_space(struct gc_heap *heap, - struct gc_extern_space *space) { +void +gc_heap_set_extern_space(struct gc_heap *heap, struct gc_extern_space *space) { heap->extern_space = space; } @@ -1109,59 +360,67 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); } -static void trace_and_enqueue_locally(struct gc_edge edge, - struct gc_heap *heap, - void *data) { +static void +trace_and_enqueue_locally(struct gc_edge edge, struct gc_heap *heap, + 
void *data) { struct gc_mutator *mut = data; if (trace_edge(heap, edge)) mutator_mark_buf_push(&mut->mark_buf, gc_edge_ref(edge)); } -static inline void do_trace_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data, - int possibly_interior) { +static inline void +do_trace_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, + struct gc_heap *heap, + void *data, + int possibly_interior) { struct gc_mutator *mut = data; struct gc_ref object = trace_conservative_ref(heap, ref, possibly_interior); if (gc_ref_is_heap_object(object)) mutator_mark_buf_push(&mut->mark_buf, object); } -static void trace_possibly_interior_conservative_ref_and_enqueue_locally - (struct gc_conservative_ref ref, struct gc_heap *heap, void *data) { +static void +trace_possibly_interior_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, + struct gc_heap *heap, + void *data) { return do_trace_conservative_ref_and_enqueue_locally(ref, heap, data, 1); } -static void trace_conservative_ref_and_enqueue_locally - (struct gc_conservative_ref ref, struct gc_heap *heap, void *data) { +static void +trace_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, + struct gc_heap *heap, + void *data) { return do_trace_conservative_ref_and_enqueue_locally(ref, heap, data, 0); } -static void trace_and_enqueue_globally(struct gc_edge edge, - struct gc_heap *heap, - void *unused) { +static void +trace_and_enqueue_globally(struct gc_edge edge, struct gc_heap *heap, + void *unused) { if (trace_edge(heap, edge)) gc_tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge)); } -static inline void do_trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data, - int possibly_interior) { +static inline void +do_trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, + struct gc_heap *heap, + void *data, + int possibly_interior) { struct gc_ref object = trace_conservative_ref(heap, ref, possibly_interior); if (gc_ref_is_heap_object(object)) gc_tracer_enqueue_root(&heap->tracer, object); } -static void trace_possibly_interior_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data) { +static void +trace_possibly_interior_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, + struct gc_heap *heap, + void *data) { return do_trace_conservative_ref_and_enqueue_globally(ref, heap, data, 1); } -static void trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data) { +static void +trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, + struct gc_heap *heap, + void *data) { return do_trace_conservative_ref_and_enqueue_globally(ref, heap, data, 0); } @@ -1186,9 +445,9 @@ trace_conservative_edges(uintptr_t low, trace(load_conservative_ref(addr), heap, data); } -static inline void tracer_trace_conservative_ref(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data) { +static inline void +tracer_trace_conservative_ref(struct gc_conservative_ref ref, + struct gc_heap *heap, void *data) { struct gc_trace_worker *worker = data; int possibly_interior = 0; struct gc_ref resolved = trace_conservative_ref(heap, ref, possibly_interior); @@ -1196,22 +455,21 @@ static inline void tracer_trace_conservative_ref(struct gc_conservative_ref ref, gc_trace_worker_enqueue(worker, resolved); } -static inline void trace_one_conservatively(struct gc_ref ref, 
- struct gc_heap *heap, - struct gc_trace_worker *worker) { +static inline void +trace_one_conservatively(struct gc_ref ref, struct gc_heap *heap, + struct gc_trace_worker *worker) { size_t bytes; - if (GC_LIKELY(mark_space_contains(heap_mark_space(heap), ref))) { + if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) { // Generally speaking we trace conservatively and don't allow much // in the way of incremental precise marking on a // conservative-by-default heap. But, we make an exception for // ephemerons. - uint8_t meta = *metadata_byte_for_addr(gc_ref_value(ref)); - if (GC_UNLIKELY(meta & METADATA_BYTE_EPHEMERON)) { + if (GC_UNLIKELY(nofl_is_ephemeron(ref))) { gc_trace_ephemeron(gc_ref_heap_object(ref), tracer_visit, heap, worker); return; } - bytes = mark_space_object_size(heap_mark_space(heap), ref); + bytes = nofl_space_object_size(heap_nofl_space(heap), ref); } else { bytes = large_object_space_object_size(heap_large_object_space(heap), ref); } @@ -1221,17 +479,18 @@ static inline void trace_one_conservatively(struct gc_ref ref, worker); } -static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, - struct gc_trace_worker *worker) { +static inline void +trace_one(struct gc_ref ref, struct gc_heap *heap, + struct gc_trace_worker *worker) { if (gc_has_conservative_intraheap_edges()) trace_one_conservatively(ref, heap, worker); else gc_trace_object(ref, tracer_visit, heap, worker, NULL); } -static inline void trace_root(struct gc_root root, - struct gc_heap *heap, - struct gc_trace_worker *worker) { +static inline void +trace_root(struct gc_root root, struct gc_heap *heap, + struct gc_trace_worker *worker) { switch (root.kind) { case GC_ROOT_KIND_HEAP: gc_trace_heap_roots(root.heap->roots, tracer_visit, heap, worker); @@ -1251,8 +510,8 @@ static inline void trace_root(struct gc_root root, } } -static void visit_root_edge(struct gc_edge edge, struct gc_heap *heap, - void *unused) { +static void +visit_root_edge(struct gc_edge edge, struct gc_heap *heap, void *unused) { gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); } @@ -1304,7 +563,8 @@ trace_mutator_conservative_roots(struct gc_mutator *mut, // Mark the roots of a mutator that is stopping for GC. We can't // enqueue them directly, so we send them to the controller in a buffer. 
-static void trace_stopping_mutator_roots(struct gc_mutator *mut) { +static void +trace_stopping_mutator_roots(struct gc_mutator *mut) { GC_ASSERT(mutator_should_mark_while_stopping(mut)); struct gc_heap *heap = mutator_heap(mut); trace_mutator_conservative_roots(mut, @@ -1313,20 +573,23 @@ static void trace_stopping_mutator_roots(struct gc_mutator *mut) { gc_trace_mutator_roots(mut->roots, trace_and_enqueue_locally, heap, mut); } -static void trace_mutator_conservative_roots_with_lock(struct gc_mutator *mut) { +static void +trace_mutator_conservative_roots_with_lock(struct gc_mutator *mut) { trace_mutator_conservative_roots(mut, mark_and_globally_enqueue_mutator_conservative_roots, mutator_heap(mut), NULL); } -static void trace_mutator_roots_with_lock(struct gc_mutator *mut) { +static void +trace_mutator_roots_with_lock(struct gc_mutator *mut) { trace_mutator_conservative_roots_with_lock(mut); gc_trace_mutator_roots(mut->roots, trace_and_enqueue_globally, mutator_heap(mut), NULL); } -static void trace_mutator_roots_with_lock_before_stop(struct gc_mutator *mut) { +static void +trace_mutator_roots_with_lock_before_stop(struct gc_mutator *mut) { gc_stack_capture_hot(&mut->stack); if (mutator_should_mark_while_stopping(mut)) trace_mutator_roots_with_lock(mut); @@ -1334,19 +597,18 @@ static void trace_mutator_roots_with_lock_before_stop(struct gc_mutator *mut) { enqueue_mutator_for_tracing(mut); } -static void release_stopping_mutator_roots(struct gc_mutator *mut) { +static void +release_stopping_mutator_roots(struct gc_mutator *mut) { mutator_mark_buf_release(&mut->mark_buf); } -static void wait_for_mutators_to_stop(struct gc_heap *heap) { +static void +wait_for_mutators_to_stop(struct gc_heap *heap) { heap->paused_mutator_count++; while (!all_mutators_stopped(heap)) pthread_cond_wait(&heap->collector_cond, &heap->lock); } -static void finish_sweeping(struct gc_mutator *mut); -static void finish_sweeping_in_block(struct gc_mutator *mut); - static void trace_mutator_conservative_roots_after_stop(struct gc_heap *heap) { int active_mutators_already_marked = heap_should_mark_while_stopping(heap); if (!active_mutators_already_marked) @@ -1361,7 +623,8 @@ static void trace_mutator_conservative_roots_after_stop(struct gc_heap *heap) { trace_mutator_conservative_roots_with_lock(mut); } -static void trace_mutator_roots_after_stop(struct gc_heap *heap) { +static void +trace_mutator_roots_after_stop(struct gc_heap *heap) { struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); int active_mutators_already_marked = heap_should_mark_while_stopping(heap); while (mut) { @@ -1380,88 +643,42 @@ static void trace_mutator_roots_after_stop(struct gc_heap *heap) { atomic_store(&heap->mutator_trace_list, NULL); for (struct gc_mutator *mut = heap->inactive_mutators; mut; mut = mut->next) { - finish_sweeping_in_block(mut); + nofl_finish_sweeping_in_block(&mut->allocator, heap_nofl_space(heap)); trace_mutator_roots_with_lock(mut); } } -static void trace_global_conservative_roots(struct gc_heap *heap) { +static void +trace_global_conservative_roots(struct gc_heap *heap) { if (gc_has_global_conservative_roots()) gc_platform_visit_global_conservative_roots (mark_and_globally_enqueue_heap_conservative_roots, heap, NULL); } -static void enqueue_generational_root(struct gc_ref ref, struct gc_heap *heap) { +static void +enqueue_generational_root(struct gc_ref ref, struct gc_heap *heap) { gc_tracer_enqueue_root(&heap->tracer, ref); } -// Note that it's quite possible (and even likely) that any given remset -// byte 
doesn't hold any roots, if all stores were to nursery objects. -STATIC_ASSERT_EQ(GRANULES_PER_REMSET_BYTE % 8, 0); -static void mark_space_trace_card(struct mark_space *space, - struct gc_heap *heap, struct slab *slab, - size_t card) { - uintptr_t first_addr_in_slab = (uintptr_t) &slab->blocks[0]; - size_t granule_base = card * GRANULES_PER_REMSET_BYTE; - for (size_t granule_in_remset = 0; - granule_in_remset < GRANULES_PER_REMSET_BYTE; - granule_in_remset += 8, granule_base += 8) { - uint64_t mark_bytes = load_eight_aligned_bytes(slab->metadata + granule_base); - mark_bytes &= space->sweep_mask; - while (mark_bytes) { - size_t granule_offset = count_zero_bytes(mark_bytes); - mark_bytes &= ~(((uint64_t)0xff) << (granule_offset * 8)); - size_t granule = granule_base + granule_offset; - uintptr_t addr = first_addr_in_slab + granule * GRANULE_SIZE; - GC_ASSERT(metadata_byte_for_addr(addr) == &slab->metadata[granule]); - enqueue_generational_root(gc_ref(addr), heap); - } - } -} - -static void mark_space_trace_remembered_set(struct mark_space *space, - struct gc_heap *heap) { - GC_ASSERT(!space->evacuating); - for (size_t s = 0; s < space->nslabs; s++) { - struct slab *slab = &space->slabs[s]; - uint8_t *remset = slab->remembered_set; - for (size_t card_base = 0; - card_base < REMSET_BYTES_PER_SLAB; - card_base += 8) { - uint64_t remset_bytes = load_eight_aligned_bytes(remset + card_base); - if (!remset_bytes) continue; - memset(remset + card_base, 0, 8); - while (remset_bytes) { - size_t card_offset = count_zero_bytes(remset_bytes); - remset_bytes &= ~(((uint64_t)0xff) << (card_offset * 8)); - mark_space_trace_card(space, heap, slab, card_base + card_offset); - } - } - } -} - -static void mark_space_clear_remembered_set(struct mark_space *space) { - if (!GC_GENERATIONAL) return; - for (size_t slab = 0; slab < space->nslabs; slab++) { - memset(space->slabs[slab].remembered_set, 0, REMSET_BYTES_PER_SLAB); - } -} - -void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) { +void +gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) { GC_ASSERT(obj_size > gc_allocator_large_threshold()); gc_object_set_remembered(obj); } -static void trace_generational_roots(struct gc_heap *heap) { +static void +trace_generational_roots(struct gc_heap *heap) { // TODO: Add lospace nursery. if (atomic_load(&heap->gc_kind) == GC_COLLECTION_MINOR) { - mark_space_trace_remembered_set(heap_mark_space(heap), heap); + nofl_space_trace_remembered_set(heap_nofl_space(heap), + enqueue_generational_root, + heap); large_object_space_trace_remembered_set(heap_large_object_space(heap), enqueue_generational_root, heap); } else { - mark_space_clear_remembered_set(heap_mark_space(heap)); + nofl_space_clear_remembered_set(heap_nofl_space(heap)); large_object_space_clear_remembered_set(heap_large_object_space(heap)); } } @@ -1502,7 +719,7 @@ pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); MUTATOR_EVENT(mut, mutator_stopping); - finish_sweeping_in_block(mut); + nofl_finish_sweeping_in_block(&mut->allocator, heap_nofl_space(heap)); gc_stack_capture_hot(&mut->stack); if (mutator_should_mark_while_stopping(mut)) // No need to collect results in mark buf; we can enqueue roots directly. 
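
/* [Editor's note: illustrative sketch, not part of this patch.  The
 * mark_space_trace_remembered_set() code removed above walks the
 * remembered-set bytes of each slab eight at a time, clears every
 * dirty group it finds, and then visits each set byte as a card of
 * potential old-to-young edges.  Below is that scan in isolation,
 * with hypothetical names; it assumes a little-endian host, as the
 * real code does.] */
#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* Index (0..7) of the lowest nonzero byte in a little-endian word. */
static size_t lowest_set_byte(uint64_t word) {
  return (size_t)(__builtin_ctzll(word) / 8);
}

/* Visit every dirty card in cards[0..ncards) and clear the table.
 * NCARDS is assumed to be a multiple of 8. */
static void scan_and_clear_cards(uint8_t *cards, size_t ncards,
                                 void (*visit)(size_t card, void *data),
                                 void *data) {
  for (size_t base = 0; base < ncards; base += 8) {
    uint64_t word;
    memcpy(&word, cards + base, 8);
    if (!word)
      continue;                       /* whole group clean; skip it */
    memset(cards + base, 0, 8);       /* clear before visiting, as the patch does */
    while (word) {
      size_t offset = lowest_set_byte(word);
      word &= ~(((uint64_t)0xff) << (offset * 8));
      visit(base + offset, data);
    }
  }
}
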
@@ -1513,11 +730,12 @@ pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { } static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) GC_NEVER_INLINE; -static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { +static void +pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); MUTATOR_EVENT(mut, mutator_stopping); - finish_sweeping(mut); + nofl_finish_sweeping(&mut->allocator, heap_nofl_space(heap)); gc_stack_capture_hot(&mut->stack); if (mutator_should_mark_while_stopping(mut)) trace_stopping_mutator_roots(mut); @@ -1528,66 +746,45 @@ static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { release_stopping_mutator_roots(mut); } -static inline void maybe_pause_mutator_for_collection(struct gc_mutator *mut) { +static inline void +maybe_pause_mutator_for_collection(struct gc_mutator *mut) { while (mutators_are_stopping(mutator_heap(mut))) pause_mutator_for_collection_without_lock(mut); } -static void reset_sweeper(struct mark_space *space) { - space->next_block = (uintptr_t) &space->slabs[0].blocks; -} - -static void update_mark_patterns(struct mark_space *space, - int advance_mark_mask) { - uint8_t survivor_mask = space->marked_mask; - uint8_t next_marked_mask = rotate_dead_survivor_marked(survivor_mask); - if (advance_mark_mask) - space->marked_mask = next_marked_mask; - space->live_mask = survivor_mask | next_marked_mask; - space->sweep_mask = broadcast_byte(space->live_mask); -} - -static void reset_statistics(struct mark_space *space) { - space->granules_freed_by_last_collection = 0; - space->fragmentation_granules_since_last_collection = 0; -} - static int maybe_grow_heap(struct gc_heap *heap) { return 0; } -static double heap_last_gc_yield(struct gc_heap *heap) { - struct mark_space *mark_space = heap_mark_space(heap); - size_t mark_space_yield = mark_space->granules_freed_by_last_collection; - mark_space_yield <<= GRANULE_SIZE_LOG_2; - size_t evacuation_block_yield = - atomic_load_explicit(&mark_space->evacuation_targets.count, - memory_order_acquire) * BLOCK_SIZE; - size_t minimum_evacuation_block_yield = - heap->size * mark_space->evacuation_minimum_reserve; - if (evacuation_block_yield < minimum_evacuation_block_yield) - evacuation_block_yield = 0; - else - evacuation_block_yield -= minimum_evacuation_block_yield; +static double +heap_last_gc_yield(struct gc_heap *heap) { + struct nofl_space *nofl_space = heap_nofl_space(heap); + size_t nofl_yield = nofl_space_yield(nofl_space); + size_t evacuation_reserve = nofl_space_evacuation_reserve(nofl_space); + // FIXME: Size nofl evacuation reserve based on size of nofl space, + // not heap size. 
+ size_t minimum_evacuation_reserve = + heap->size * nofl_space->evacuation_minimum_reserve; + if (evacuation_reserve > minimum_evacuation_reserve) + nofl_yield += evacuation_reserve - minimum_evacuation_reserve; struct large_object_space *lospace = heap_large_object_space(heap); size_t lospace_yield = lospace->pages_freed_by_last_collection; lospace_yield <<= lospace->page_size_log2; - double yield = mark_space_yield + lospace_yield + evacuation_block_yield; + double yield = nofl_yield + lospace_yield; return yield / heap->size; } -static double heap_fragmentation(struct gc_heap *heap) { - struct mark_space *mark_space = heap_mark_space(heap); - size_t fragmentation_granules = - mark_space->fragmentation_granules_since_last_collection; - size_t heap_granules = heap->size >> GRANULE_SIZE_LOG_2; - - return ((double)fragmentation_granules) / heap_granules; +static double +heap_fragmentation(struct gc_heap *heap) { + struct nofl_space *nofl_space = heap_nofl_space(heap); + size_t fragmentation = nofl_space_fragmentation(nofl_space); + return ((double)fragmentation) / heap->size; } -static void detect_out_of_memory(struct gc_heap *heap) { - struct mark_space *mark_space = heap_mark_space(heap); +static void +detect_out_of_memory(struct gc_heap *heap) { + struct nofl_space *nofl_space = heap_nofl_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); if (heap->count == 0) @@ -1596,28 +793,28 @@ static void detect_out_of_memory(struct gc_heap *heap) { double last_yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); - double yield_epsilon = BLOCK_SIZE * 1.0 / heap->size; - double fragmentation_epsilon = LARGE_OBJECT_THRESHOLD * 1.0 / BLOCK_SIZE; + double yield_epsilon = NOFL_BLOCK_SIZE * 1.0 / heap->size; + double fragmentation_epsilon = LARGE_OBJECT_THRESHOLD * 1.0 / NOFL_BLOCK_SIZE; if (last_yield - fragmentation > yield_epsilon) return; if (fragmentation > fragmentation_epsilon - && atomic_load(&mark_space->evacuation_targets.count)) + && atomic_load(&nofl_space->evacuation_targets.count)) return; // No yield in last gc and we do not expect defragmentation to // be able to yield more space: out of memory. 
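
/* [Editor's note: illustrative sketch, not part of this patch.  The
 * check in detect_out_of_memory() amounts to the predicate below: the
 * heap is declared exhausted when the last full cycle yielded no more
 * than about one block beyond its fragmentation, and compaction
 * cannot plausibly recover that fragmentation, either because it is
 * under roughly one large object per block or because no evacuation
 * target blocks remain.  Parameter names here are hypothetical; yield
 * and fragmentation are fractions of the heap.] */
#include <stddef.h>

static int looks_out_of_memory(double yield, double fragmentation,
                               size_t heap_size, size_t block_size,
                               size_t large_object_threshold,
                               size_t evacuation_target_blocks) {
  double yield_epsilon = (double)block_size / (double)heap_size;
  double fragmentation_epsilon =
      (double)large_object_threshold / (double)block_size;
  if (yield - fragmentation > yield_epsilon)
    return 0;                 /* the cycle freed real space; keep going */
  if (fragmentation > fragmentation_epsilon && evacuation_target_blocks)
    return 0;                 /* compaction may still win space back */
  return 1;                   /* no yield and no prospect of one: OOM */
}
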
fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", - heap->size, mark_space->nslabs); + heap->size, nofl_space->nslabs); GC_CRASH(); } -static double clamp_major_gc_yield_threshold(struct gc_heap *heap, - double threshold) { +static double +clamp_major_gc_yield_threshold(struct gc_heap *heap, double threshold) { if (threshold < heap->minimum_major_gc_yield_threshold) threshold = heap->minimum_major_gc_yield_threshold; - double one_block = BLOCK_SIZE * 1.0 / heap->size; + double one_block = NOFL_BLOCK_SIZE * 1.0 / heap->size; if (threshold < one_block) threshold = one_block; return threshold; @@ -1626,13 +823,13 @@ static double clamp_major_gc_yield_threshold(struct gc_heap *heap, static enum gc_collection_kind determine_collection_kind(struct gc_heap *heap, enum gc_collection_kind requested) { - struct mark_space *mark_space = heap_mark_space(heap); + struct nofl_space *nofl_space = heap_nofl_space(heap); enum gc_collection_kind previous_gc_kind = atomic_load(&heap->gc_kind); enum gc_collection_kind gc_kind; int mark_while_stopping = 1; double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); - ssize_t pending = atomic_load_explicit(&mark_space->pending_unavailable_bytes, + ssize_t pending = atomic_load_explicit(&nofl_space->pending_unavailable_bytes, memory_order_acquire); if (heap->count == 0) { @@ -1725,186 +922,44 @@ determine_collection_kind(struct gc_heap *heap, return gc_kind; } -static void release_evacuation_target_blocks(struct mark_space *space) { - // Move excess evacuation target blocks back to empties. - size_t total = space->nslabs * NONMETA_BLOCKS_PER_SLAB; - size_t unavailable = atomic_load_explicit(&space->unavailable.count, - memory_order_acquire); - size_t reserve = space->evacuation_minimum_reserve * (total - unavailable); - finish_evacuation_allocator(&space->evacuation_allocator, - &space->evacuation_targets, &space->empty, - reserve); -} - -static void prepare_for_evacuation(struct gc_heap *heap) { - struct mark_space *space = heap_mark_space(heap); - - if (heap->gc_kind != GC_COLLECTION_COMPACTING) { - space->evacuating = 0; - space->evacuation_reserve = space->evacuation_minimum_reserve; - return; - } - - // Put the mutator into evacuation mode, collecting up to 50% of free space as - // evacuation blocks. - space->evacuation_reserve = 0.5; - - size_t target_blocks = space->evacuation_targets.count; - DEBUG("evacuation target block count: %zu\n", target_blocks); - - if (target_blocks == 0) { - DEBUG("no evacuation target blocks, disabling evacuation for this round\n"); - space->evacuating = 0; - return; - } - - size_t target_granules = target_blocks * GRANULES_PER_BLOCK; - // Compute histogram where domain is the number of granules in a block - // that survived the last collection, aggregated into 33 buckets, and - // range is number of blocks in that bucket. (Bucket 0 is for blocks - // that were found to be completely empty; such blocks may be on the - // evacuation target list.) 
- const size_t bucket_count = 33; - size_t histogram[33] = {0,}; - size_t bucket_size = GRANULES_PER_BLOCK / 32; - size_t empties = 0; - for (size_t slab = 0; slab < space->nslabs; slab++) { - for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { - struct block_summary *summary = &space->slabs[slab].summaries[block]; - if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) - continue; - if (!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) { - empties++; - continue; - } - size_t survivor_granules = GRANULES_PER_BLOCK - summary->free_granules; - size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; - histogram[bucket]++; - } - } - - // Blocks which lack the NEEDS_SWEEP flag are empty, either because - // they have been removed from the pool and have the UNAVAILABLE flag - // set, or because they are on the empties or evacuation target - // lists. When evacuation starts, the empties list should be empty. - GC_ASSERT(empties == target_blocks); - - // Now select a number of blocks that is likely to fill the space in - // the target blocks. Prefer candidate blocks with fewer survivors - // from the last GC, to increase expected free block yield. - for (size_t bucket = 0; bucket < bucket_count; bucket++) { - size_t bucket_granules = bucket * bucket_size * histogram[bucket]; - if (bucket_granules <= target_granules) { - target_granules -= bucket_granules; - } else { - histogram[bucket] = target_granules / (bucket_size * bucket); - target_granules = 0; - } - } - - // Having selected the number of blocks, now we set the evacuation - // candidate flag on all blocks. - for (size_t slab = 0; slab < space->nslabs; slab++) { - for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { - struct block_summary *summary = &space->slabs[slab].summaries[block]; - if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) - continue; - if (!block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) - continue; - size_t survivor_granules = GRANULES_PER_BLOCK - summary->free_granules; - size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; - if (histogram[bucket]) { - block_summary_set_flag(summary, BLOCK_EVACUATE); - histogram[bucket]--; - } else { - block_summary_clear_flag(summary, BLOCK_EVACUATE); - } - } - } - - // We are ready to evacuate! 
- prepare_evacuation_allocator(&space->evacuation_allocator, - &space->evacuation_targets); - space->evacuating = 1; -} - -static void trace_conservative_roots_after_stop(struct gc_heap *heap) { - GC_ASSERT(!heap_mark_space(heap)->evacuating); +static void +trace_conservative_roots_after_stop(struct gc_heap *heap) { + GC_ASSERT(!heap_nofl_space(heap)->evacuating); if (gc_has_mutator_conservative_roots()) trace_mutator_conservative_roots_after_stop(heap); if (gc_has_global_conservative_roots()) trace_global_conservative_roots(heap); } -static void trace_pinned_roots_after_stop(struct gc_heap *heap) { - GC_ASSERT(!heap_mark_space(heap)->evacuating); +static void +trace_pinned_roots_after_stop(struct gc_heap *heap) { + GC_ASSERT(!heap_nofl_space(heap)->evacuating); trace_conservative_roots_after_stop(heap); } -static void trace_roots_after_stop(struct gc_heap *heap) { +static void +trace_roots_after_stop(struct gc_heap *heap) { trace_mutator_roots_after_stop(heap); gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap, NULL); gc_visit_finalizer_roots(heap->finalizer_state, visit_root_edge, heap, NULL); trace_generational_roots(heap); } -static void verify_mark_space_before_restart(struct mark_space *space) { - // Iterate objects in each block, verifying that the END bytes correspond to - // the measured object size. - for (size_t slab = 0; slab < space->nslabs; slab++) { - for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { - struct block_summary *summary = &space->slabs[slab].summaries[block]; - if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) - continue; - - uintptr_t addr = (uintptr_t)space->slabs[slab].blocks[block].data; - uintptr_t limit = addr + BLOCK_SIZE; - uint8_t *meta = metadata_byte_for_addr(addr); - while (addr < limit) { - if (meta[0] & space->live_mask) { - struct gc_ref obj = gc_ref(addr); - size_t obj_bytes = 0; - gc_trace_object(gc_ref(addr), NULL, NULL, NULL, &obj_bytes); - size_t granules = size_to_granules(obj_bytes); - GC_ASSERT(granules); - for (size_t granule = 0; granule < granules - 1; granule++) - GC_ASSERT(!(meta[granule] & METADATA_BYTE_END)); - GC_ASSERT(meta[granules - 1] & METADATA_BYTE_END); - meta += granules; - addr += granules * GRANULE_SIZE; - } else { - meta++; - addr += GRANULE_SIZE; - } - } - GC_ASSERT(addr == limit); - } - } -} - -static void mark_space_finish_gc(struct mark_space *space, - enum gc_collection_kind gc_kind) { - space->evacuating = 0; - reset_sweeper(space); - update_mark_patterns(space, 0); - reset_statistics(space); - release_evacuation_target_blocks(space); - if (GC_DEBUG) - verify_mark_space_before_restart(space); -} - -static void resolve_ephemerons_lazily(struct gc_heap *heap) { +static void +resolve_ephemerons_lazily(struct gc_heap *heap) { atomic_store_explicit(&heap->check_pending_ephemerons, 0, memory_order_release); } -static void resolve_ephemerons_eagerly(struct gc_heap *heap) { +static void +resolve_ephemerons_eagerly(struct gc_heap *heap) { atomic_store_explicit(&heap->check_pending_ephemerons, 1, memory_order_release); gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); } -static int enqueue_resolved_ephemerons(struct gc_heap *heap) { +static int +enqueue_resolved_ephemerons(struct gc_heap *heap) { struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); if (!resolved) return 0; @@ -1912,7 +967,8 @@ static int enqueue_resolved_ephemerons(struct gc_heap *heap) { return 1; } -static void trace_resolved_ephemerons(struct gc_heap *heap) { +static void 
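
/* [Editor's note: illustrative sketch, not part of this patch.  The
 * candidate selection removed above buckets blocks by how many
 * granules survived the previous cycle and then, starting from the
 * emptiest bucket, admits blocks until their expected survivors would
 * fill the evacuation reserve.  The helper below performs just that
 * budgeting step on a histogram, in place; names are hypothetical.] */
#include <stddef.h>

/* histogram[b] holds the number of blocks whose survivor count falls
 * in bucket b; each block in bucket b is assumed to carry roughly
 * b * granules_per_bucket live granules.  On return, histogram[b] is
 * the number of blocks from that bucket to mark for evacuation. */
static void budget_evacuation_candidates(size_t *histogram,
                                         size_t bucket_count,
                                         size_t granules_per_bucket,
                                         size_t budget_granules) {
  for (size_t bucket = 0; bucket < bucket_count; bucket++) {
    size_t per_block = bucket * granules_per_bucket;   /* 0 for bucket 0 */
    size_t bucket_granules = per_block * histogram[bucket];
    if (bucket_granules <= budget_granules) {
      budget_granules -= bucket_granules;  /* take every block in the bucket */
    } else {
      histogram[bucket] = budget_granules / per_block;  /* take only some */
      budget_granules = 0;
    }
  }
}
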
+trace_resolved_ephemerons(struct gc_heap *heap) { for (struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); resolved; resolved = gc_pop_resolved_ephemerons(heap)) { @@ -1921,7 +977,8 @@ static void trace_resolved_ephemerons(struct gc_heap *heap) { } } -static void resolve_finalizers(struct gc_heap *heap) { +static void +resolve_finalizers(struct gc_heap *heap) { for (size_t priority = 0; priority < gc_finalizer_priority_count(); priority++) { @@ -1934,14 +991,15 @@ static void resolve_finalizers(struct gc_heap *heap) { gc_notify_finalizers(heap->finalizer_state, heap); } -static void sweep_ephemerons(struct gc_heap *heap) { +static void +sweep_ephemerons(struct gc_heap *heap) { return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); } -static void collect(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) { +static void +collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); - struct mark_space *space = heap_mark_space(heap); + struct nofl_space *nofl_space = heap_nofl_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); struct gc_extern_space *exspace = heap_extern_space(heap); if (maybe_grow_heap(heap)) { @@ -1954,7 +1012,7 @@ static void collect(struct gc_mutator *mut, determine_collection_kind(heap, requested_kind); int is_minor = gc_kind == GC_COLLECTION_MINOR; HEAP_EVENT(heap, prepare_gc, gc_kind); - update_mark_patterns(space, !is_minor); + nofl_space_update_mark_patterns(nofl_space, !is_minor); large_object_space_start_gc(lospace, is_minor); gc_extern_space_start_gc(exspace, is_minor); resolve_ephemerons_lazily(heap); @@ -1962,7 +1020,7 @@ static void collect(struct gc_mutator *mut, HEAP_EVENT(heap, requesting_stop); request_mutators_to_stop(heap); trace_mutator_roots_with_lock_before_stop(mut); - finish_sweeping(mut); + nofl_finish_sweeping(&mut->allocator, nofl_space); HEAP_EVENT(heap, waiting_for_stop); wait_for_mutators_to_stop(heap); HEAP_EVENT(heap, mutators_stopped); @@ -1972,7 +1030,7 @@ static void collect(struct gc_mutator *mut, DEBUG("last gc yield: %f; fragmentation: %f\n", yield, fragmentation); detect_out_of_memory(heap); trace_pinned_roots_after_stop(heap); - prepare_for_evacuation(heap); + nofl_space_prepare_for_evacuation(nofl_space, gc_kind); trace_roots_after_stop(heap); HEAP_EVENT(heap, roots_traced); gc_tracer_trace(&heap->tracer); @@ -1984,7 +1042,7 @@ static void collect(struct gc_mutator *mut, HEAP_EVENT(heap, finalizers_traced); sweep_ephemerons(heap); gc_tracer_release(&heap->tracer); - mark_space_finish_gc(space, gc_kind); + nofl_space_finish_gc(nofl_space, gc_kind); large_object_space_finish_gc(lospace, is_minor); gc_extern_space_finish_gc(exspace, is_minor); heap->count++; @@ -1994,281 +1052,9 @@ static void collect(struct gc_mutator *mut, allow_mutators_to_continue(heap); } -static int sweep_byte(uint8_t *loc, uintptr_t sweep_mask) { - uint8_t metadata = atomic_load_explicit(loc, memory_order_relaxed); - // If the metadata byte is nonzero, that means either a young, dead, - // survived, or marked object. If it's live (survived or marked), we - // found the next mark. Otherwise it's dead and we clear the byte. - // If we see an END, that means an end of a dead object; clear it. 
- if (metadata) { - if (metadata & sweep_mask) - return 1; - atomic_store_explicit(loc, 0, memory_order_relaxed); - } - return 0; -} - -static int sweep_word(uintptr_t *loc, uintptr_t sweep_mask) { - uintptr_t metadata = atomic_load_explicit(loc, memory_order_relaxed); - if (metadata) { - if (metadata & sweep_mask) - return 1; - atomic_store_explicit(loc, 0, memory_order_relaxed); - } - return 0; -} - -static uintptr_t mark_space_next_block_to_sweep(struct mark_space *space) { - uintptr_t block = atomic_load_explicit(&space->next_block, - memory_order_acquire); - uintptr_t next_block; - do { - if (block == 0) - return 0; - - next_block = block + BLOCK_SIZE; - if (next_block % SLAB_SIZE == 0) { - uintptr_t hi_addr = space->low_addr + space->extent; - if (next_block == hi_addr) - next_block = 0; - else - next_block += META_BLOCKS_PER_SLAB * BLOCK_SIZE; - } - } while (!atomic_compare_exchange_weak(&space->next_block, &block, - next_block)); - return block; -} - -static void finish_block(struct gc_mutator *mut) { - GC_ASSERT(mut->block); - struct block_summary *block = block_summary_for_addr(mut->block); - struct mark_space *space = heap_mark_space(mutator_heap(mut)); - atomic_fetch_add(&space->granules_freed_by_last_collection, - block->free_granules); - atomic_fetch_add(&space->fragmentation_granules_since_last_collection, - block->fragmentation_granules); - - // If this block has mostly survivors, we should avoid sweeping it and - // trying to allocate into it for a minor GC. Sweep it next time to - // clear any garbage allocated in this cycle and mark it as - // "venerable" (i.e., old). - GC_ASSERT(!block_summary_has_flag(block, BLOCK_VENERABLE)); - if (!block_summary_has_flag(block, BLOCK_VENERABLE_AFTER_SWEEP) && - block->free_granules < GRANULES_PER_BLOCK * space->venerable_threshold) - block_summary_set_flag(block, BLOCK_VENERABLE_AFTER_SWEEP); - - mut->block = mut->alloc = mut->sweep = 0; -} - -// Sweep some heap to reclaim free space, resetting mut->alloc and -// mut->sweep. Return the size of the hole in granules. -static size_t next_hole_in_block(struct gc_mutator *mut) { - uintptr_t sweep = mut->sweep; - if (sweep == 0) - return 0; - uintptr_t limit = mut->block + BLOCK_SIZE; - uintptr_t sweep_mask = heap_mark_space(mutator_heap(mut))->sweep_mask; - - while (sweep != limit) { - GC_ASSERT((sweep & (GRANULE_SIZE - 1)) == 0); - uint8_t* metadata = metadata_byte_for_addr(sweep); - size_t limit_granules = (limit - sweep) >> GRANULE_SIZE_LOG_2; - - // Except for when we first get a block, mut->sweep is positioned - // right after a hole, which can point to either the end of the - // block or to a live object. Assume that a live object is more - // common. - { - size_t live_granules = 0; - while (limit_granules && (metadata[0] & sweep_mask)) { - // Object survived collection; skip over it and continue sweeping. 
- size_t object_granules = mark_space_live_object_granules(metadata); - live_granules += object_granules; - limit_granules -= object_granules; - metadata += object_granules; - } - if (!limit_granules) - break; - sweep += live_granules * GRANULE_SIZE; - } - - size_t free_granules = next_mark(metadata, limit_granules, sweep_mask); - GC_ASSERT(free_granules); - GC_ASSERT(free_granules <= limit_granules); - - struct block_summary *summary = block_summary_for_addr(sweep); - summary->hole_count++; - GC_ASSERT(free_granules <= GRANULES_PER_BLOCK - summary->free_granules); - summary->free_granules += free_granules; - - size_t free_bytes = free_granules * GRANULE_SIZE; - mut->alloc = sweep; - mut->sweep = sweep + free_bytes; - return free_granules; - } - - finish_block(mut); - return 0; -} - -static void finish_hole(struct gc_mutator *mut) { - size_t granules = (mut->sweep - mut->alloc) / GRANULE_SIZE; - if (granules) { - struct block_summary *summary = block_summary_for_addr(mut->block); - summary->holes_with_fragmentation++; - summary->fragmentation_granules += granules; - uint8_t *metadata = metadata_byte_for_addr(mut->alloc); - memset(metadata, 0, granules); - mut->alloc = mut->sweep; - } - // FIXME: add to fragmentation -} - -static int maybe_release_swept_empty_block(struct gc_mutator *mut) { - GC_ASSERT(mut->block); - struct mark_space *space = heap_mark_space(mutator_heap(mut)); - uintptr_t block = mut->block; - if (atomic_load_explicit(&space->pending_unavailable_bytes, - memory_order_acquire) <= 0) - return 0; - - push_unavailable_block(space, block); - atomic_fetch_sub(&space->pending_unavailable_bytes, BLOCK_SIZE); - mut->alloc = mut->sweep = mut->block = 0; - return 1; -} - -static size_t next_hole(struct gc_mutator *mut) { - finish_hole(mut); - // As we sweep if we find that a block is empty, we return it to the - // empties list. Empties are precious. But if we return 10 blocks in - // a row, and still find an 11th empty, go ahead and use it. - size_t empties_countdown = 10; - struct mark_space *space = heap_mark_space(mutator_heap(mut)); - while (1) { - // Sweep current block for a hole. - size_t granules = next_hole_in_block(mut); - if (granules) { - // If the hole spans only part of a block, give it to the mutator. - if (granules < GRANULES_PER_BLOCK) - return granules; - struct block_summary *summary = block_summary_for_addr(mut->block); - // Sweep mark bytes for completely empty block. - memset(metadata_byte_for_addr(mut->block), 0, GRANULES_PER_BLOCK); - block_summary_clear_flag(summary, BLOCK_NEEDS_SWEEP); - // Sweeping found a completely empty block. If we are below the - // minimum evacuation reserve, take the block. - if (push_evacuation_target_if_needed(space, mut->block)) { - mut->alloc = mut->sweep = mut->block = 0; - continue; - } - // If we have pending pages to release to the OS, we should unmap - // this block. - if (maybe_release_swept_empty_block(mut)) - continue; - // Otherwise if we've already returned lots of empty blocks to the - // freelist, give this block to the mutator. - if (!empties_countdown) { - // After this block is allocated into, it will need to be swept. - block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); - return granules; - } - // Otherwise we push to the empty blocks list. - push_empty_block(space, mut->block); - mut->alloc = mut->sweep = mut->block = 0; - empties_countdown--; - } - GC_ASSERT(mut->block == 0); - while (1) { - uintptr_t block = mark_space_next_block_to_sweep(space); - if (block) { - // Sweeping found a block. 
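
/* [Editor's note: illustrative sketch, not part of this patch.  The
 * hole search in next_hole_in_block() alternates between skipping
 * runs of granules that belong to live objects and claiming the run
 * of dead or free granules that follows as the next allocation hole.
 * The simplified, byte-at-a-time scan below works on a plain metadata
 * array under a hypothetical bit layout: a mark bit on a live
 * object's first granule, an end bit on its last.] */
#include <stdint.h>
#include <stddef.h>

#define SKETCH_MARK 0x01   /* set on an object's first granule when live */
#define SKETCH_END  0x80   /* set on an object's last granule */

/* Length in granules of the object starting at meta[i], found by
 * scanning forward to its END byte (clamped to the block end). */
static size_t object_length(const uint8_t *meta, size_t n, size_t i) {
  size_t len = 1;
  while (!(meta[i + len - 1] & SKETCH_END)) {
    if (i + len == n)
      break;                          /* malformed metadata; stop at block end */
    len++;
  }
  return len;
}

/* Find the next hole at or after *pos; returns its length in granules
 * (0 if none), stores its start in *hole_start, and leaves *pos just
 * past the hole so the next call resumes there. */
static size_t find_next_hole(const uint8_t *meta, size_t n, size_t *pos,
                             size_t *hole_start) {
  size_t i = *pos;
  /* Skip over live objects (their first granule carries the mark bit). */
  while (i < n && (meta[i] & SKETCH_MARK))
    i += object_length(meta, n, i);
  if (i >= n) {
    *pos = n;
    return 0;                         /* no further hole in this block */
  }
  *hole_start = i;
  /* Claim the dead/free run that follows as the next hole. */
  while (i < n && !(meta[i] & SKETCH_MARK))
    i++;
  *pos = i;
  return i - *hole_start;
}
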
We might take it for allocation, or - // we might send it back. - struct block_summary *summary = block_summary_for_addr(block); - // If it's marked unavailable, it's already on a list of - // unavailable blocks, so skip and get the next block. - if (block_summary_has_flag(summary, BLOCK_UNAVAILABLE)) - continue; - if (block_summary_has_flag(summary, BLOCK_VENERABLE)) { - // Skip venerable blocks after a minor GC -- we don't need to - // sweep as they weren't allocated into last cycle, and the - // mark bytes didn't rotate, so we have no cleanup to do; and - // we shouldn't try to allocate into them as it's not worth - // it. Any wasted space is measured as fragmentation. - if (mutator_heap(mut)->last_collection_was_minor) - continue; - else - block_summary_clear_flag(summary, BLOCK_VENERABLE); - } - if (block_summary_has_flag(summary, BLOCK_NEEDS_SWEEP)) { - // Prepare to sweep the block for holes. - mut->alloc = mut->sweep = mut->block = block; - if (block_summary_has_flag(summary, BLOCK_VENERABLE_AFTER_SWEEP)) { - // In the last cycle we noted that this block consists of - // mostly old data. Sweep any garbage, commit the mark as - // venerable, and avoid allocating into it. - block_summary_clear_flag(summary, BLOCK_VENERABLE_AFTER_SWEEP); - if (mutator_heap(mut)->last_collection_was_minor) { - finish_sweeping_in_block(mut); - block_summary_set_flag(summary, BLOCK_VENERABLE); - continue; - } - } - // This block was marked in the last GC and needs sweeping. - // As we sweep we'll want to record how many bytes were live - // at the last collection. As we allocate we'll record how - // many granules were wasted because of fragmentation. - summary->hole_count = 0; - summary->free_granules = 0; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; - break; - } else { - // Otherwise this block is completely empty and is on the - // empties list. We take from the empties list only after all - // the NEEDS_SWEEP blocks are processed. - continue; - } - } else { - // We are done sweeping for blocks. Now take from the empties - // list. - block = pop_empty_block(space); - // No empty block? Return 0 to cause collection. - if (!block) - return 0; - - // Maybe we should use this empty as a target for evacuation. - if (push_evacuation_target_if_possible(space, block)) - continue; - - // Otherwise return the block to the mutator. - struct block_summary *summary = block_summary_for_addr(block); - block_summary_set_flag(summary, BLOCK_NEEDS_SWEEP); - summary->hole_count = 1; - summary->free_granules = GRANULES_PER_BLOCK; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; - mut->block = block; - mut->alloc = block; - mut->sweep = block + BLOCK_SIZE; - return GRANULES_PER_BLOCK; - } - } - } -} - -static void finish_sweeping_in_block(struct gc_mutator *mut) { - do { finish_hole(mut); } while (next_hole_in_block(mut)); -} - -// Another thread is triggering GC. Before we stop, finish clearing the -// dead mark bytes for the mutator's block, and release the block. 
-static void finish_sweeping(struct gc_mutator *mut) { - while (next_hole(mut)) {} -} - -static void trigger_collection(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) { +static void +trigger_collection(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); int prev_kind = -1; heap_lock(heap); @@ -2279,26 +1065,30 @@ static void trigger_collection(struct gc_mutator *mut, heap_unlock(heap); } -void gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { +void +gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { trigger_collection(mut, kind); } -static void* allocate_large(struct gc_mutator *mut, size_t size) { +static void* +allocate_large(struct gc_mutator *mut, size_t size) { struct gc_heap *heap = mutator_heap(mut); - struct large_object_space *space = heap_large_object_space(heap); + struct nofl_space *nofl_space = heap_nofl_space(heap); + struct large_object_space *lospace = heap_large_object_space(heap); - size_t npages = large_object_space_npages(space, size); + size_t npages = large_object_space_npages(lospace, size); - mark_space_request_release_memory(heap_mark_space(heap), - npages << space->page_size_log2); + nofl_space_request_release_memory(nofl_space, + npages << lospace->page_size_log2); - while (!sweep_until_memory_released(mut)) + while (!nofl_space_sweep_until_memory_released(nofl_space, + &mut->allocator)) trigger_collection(mut, GC_COLLECTION_COMPACTING); atomic_fetch_add(&heap->large_object_pages, npages); - void *ret = large_object_space_alloc(space, npages); + void *ret = large_object_space_alloc(lospace, npages); if (!ret) - ret = large_object_space_obtain_and_alloc(space, npages); + ret = large_object_space_obtain_and_alloc(lospace, npages); if (!ret) { perror("weird: we have the space but mmap didn't work"); @@ -2308,113 +1098,81 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { return ret; } -void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { +static void +collect_for_small_allocation(void *mut) { + trigger_collection(mut, GC_COLLECTION_ANY); +} + +void* +gc_allocate_slow(struct gc_mutator *mut, size_t size) { GC_ASSERT(size > 0); // allocating 0 bytes would be silly if (size > gc_allocator_large_threshold()) return allocate_large(mut, size); - size = align_up(size, GRANULE_SIZE); - uintptr_t alloc = mut->alloc; - uintptr_t sweep = mut->sweep; - uintptr_t new_alloc = alloc + size; - struct gc_ref ret; - if (new_alloc <= sweep) { - mut->alloc = new_alloc; - ret = gc_ref(alloc); - } else { - size_t granules = size >> GRANULE_SIZE_LOG_2; - while (1) { - size_t hole = next_hole(mut); - if (hole >= granules) { - clear_memory(mut->alloc, hole * GRANULE_SIZE); - break; - } - if (!hole) - trigger_collection(mut, GC_COLLECTION_ANY); - } - ret = gc_ref(mut->alloc); - mut->alloc += size; - } - gc_update_alloc_table(mut, ret, size); - return gc_ref_heap_object(ret); + return gc_ref_heap_object(nofl_allocate(&mut->allocator, + heap_nofl_space(mutator_heap(mut)), + size, collect_for_small_allocation, + mut)); } -void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { +void* +gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } -struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { +struct gc_ephemeron* +gc_allocate_ephemeron(struct gc_mutator *mut) { struct gc_ref ret = gc_ref_from_heap_object(gc_allocate(mut, gc_ephemeron_size())); - if (gc_has_conservative_intraheap_edges()) { 
- uint8_t *metadata = metadata_byte_for_addr(gc_ref_value(ret)); - *metadata |= METADATA_BYTE_EPHEMERON; - } + nofl_space_set_ephemeron_flag(ret); return gc_ref_heap_object(ret); } -void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, - struct gc_ref key, struct gc_ref value) { +void +gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, + struct gc_ref key, struct gc_ref value) { gc_ephemeron_init_internal(mutator_heap(mut), ephemeron, key, value); // No write barrier: we require that the ephemeron be newer than the // key or the value. } -struct gc_pending_ephemerons *gc_heap_pending_ephemerons(struct gc_heap *heap) { +struct gc_pending_ephemerons * +gc_heap_pending_ephemerons(struct gc_heap *heap) { return heap->pending_ephemerons; } -unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { +unsigned +gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { return heap->count; } -struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { +struct gc_finalizer* +gc_allocate_finalizer(struct gc_mutator *mut) { return gc_allocate(mut, gc_finalizer_size()); } -void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, - unsigned priority, struct gc_ref object, - struct gc_ref closure) { +void +gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, + unsigned priority, struct gc_ref object, + struct gc_ref closure) { gc_finalizer_init_internal(finalizer, object, closure); gc_finalizer_attach_internal(mutator_heap(mut)->finalizer_state, finalizer, priority); // No write barrier. } -struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut) { +struct gc_finalizer* +gc_pop_finalizable(struct gc_mutator *mut) { return gc_finalizer_state_pop(mutator_heap(mut)->finalizer_state); } -void gc_set_finalizer_callback(struct gc_heap *heap, +void +gc_set_finalizer_callback(struct gc_heap *heap, gc_finalizer_callback callback) { gc_finalizer_state_set_callback(heap->finalizer_state, callback); } -static struct slab* allocate_slabs(size_t nslabs) { - size_t size = nslabs * SLAB_SIZE; - size_t extent = size + SLAB_SIZE; - - char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("mmap failed"); - return NULL; - } - - uintptr_t base = (uintptr_t) mem; - uintptr_t end = base + extent; - uintptr_t aligned_base = align_up(base, SLAB_SIZE); - uintptr_t aligned_end = aligned_base + size; - - if (aligned_base - base) - munmap((void*)base, aligned_base - base); - if (end - aligned_end) - munmap((void*)aligned_end, end - aligned_end); - - return (struct slab*) aligned_base; -} - static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { struct gc_pending_ephemerons *cur = heap->pending_ephemerons; size_t target = heap->size * heap->pending_ephemerons_size_factor; @@ -2482,55 +1240,23 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { return 1; } -static int mark_space_init(struct mark_space *space, struct gc_heap *heap) { - size_t size = align_up(heap->size, SLAB_SIZE); - size_t nslabs = size / SLAB_SIZE; - struct slab *slabs = allocate_slabs(nslabs); - if (!slabs) - return 0; - - space->marked_mask = METADATA_BYTE_MARK_0; - update_mark_patterns(space, 0); - space->slabs = slabs; - space->nslabs = nslabs; - space->low_addr = (uintptr_t) slabs; - space->extent = size; - space->next_block = 0; - space->evacuation_minimum_reserve = 0.02; - space->evacuation_reserve = space->evacuation_minimum_reserve; - 
space->venerable_threshold = heap->fragmentation_low_threshold; - for (size_t slab = 0; slab < nslabs; slab++) { - for (size_t block = 0; block < NONMETA_BLOCKS_PER_SLAB; block++) { - uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; - if (size > heap->size) { - push_unavailable_block(space, addr); - size -= BLOCK_SIZE; - } else { - if (!push_evacuation_target_if_needed(space, addr)) - push_empty_block(space, addr); - } - } - } - return 1; -} - int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut, struct gc_event_listener event_listener, void *event_listener_data) { - GC_ASSERT_EQ(gc_allocator_small_granule_size(), GRANULE_SIZE); + GC_ASSERT_EQ(gc_allocator_small_granule_size(), NOFL_GRANULE_SIZE); GC_ASSERT_EQ(gc_allocator_large_threshold(), LARGE_OBJECT_THRESHOLD); GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), - offsetof(struct gc_mutator, alloc)); + offsetof(struct nofl_allocator, alloc)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), - offsetof(struct gc_mutator, sweep)); - GC_ASSERT_EQ(gc_allocator_alloc_table_alignment(), SLAB_SIZE); - GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(), METADATA_BYTE_YOUNG); - GC_ASSERT_EQ(gc_allocator_alloc_table_end_pattern(), METADATA_BYTE_END); + offsetof(struct nofl_allocator, sweep)); + GC_ASSERT_EQ(gc_allocator_alloc_table_alignment(), NOFL_SLAB_SIZE); + GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(), NOFL_METADATA_BYTE_YOUNG); + GC_ASSERT_EQ(gc_allocator_alloc_table_end_pattern(), NOFL_METADATA_BYTE_END); if (GC_GENERATIONAL) { - GC_ASSERT_EQ(gc_write_barrier_card_table_alignment(), SLAB_SIZE); + GC_ASSERT_EQ(gc_write_barrier_card_table_alignment(), NOFL_SLAB_SIZE); GC_ASSERT_EQ(gc_write_barrier_card_size(), - BLOCK_SIZE / REMSET_BYTES_PER_BLOCK); + NOFL_BLOCK_SIZE / NOFL_REMSET_BYTES_PER_BLOCK); } if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { @@ -2548,8 +1274,10 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, (*heap)->event_listener_data = event_listener_data; HEAP_EVENT(*heap, init, (*heap)->size); - struct mark_space *space = heap_mark_space(*heap); - if (!mark_space_init(space, *heap)) { + struct nofl_space *space = heap_nofl_space(*heap); + if (!nofl_space_init(space, (*heap)->size, + options->common.parallelism != 1, + (*heap)->fragmentation_low_threshold)) { free(*heap); *heap = NULL; return 0; From b663e5878e031ac6c4ba25bdd4779543f2f32075 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 20 Aug 2024 14:33:56 +0200 Subject: [PATCH 265/403] nofl: Refactor evacuation allocation to be thread-local This relaxes a "reliability" requirement, as in https://wingolog.org/archives/2024/07/10/copying-collectors-with-block-structured-heaps-are-unreliable. --- src/nofl-space.h | 1803 ++++++++++++++++++++++------------------------ src/whippet.c | 63 +- 2 files changed, 914 insertions(+), 952 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index fd718c962..eba3cd386 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -34,55 +34,6 @@ STATIC_ASSERT_EQ(NOFL_GRANULE_SIZE, 1 << NOFL_GRANULE_SIZE_LOG_2); STATIC_ASSERT_EQ(NOFL_MEDIUM_OBJECT_THRESHOLD, NOFL_MEDIUM_OBJECT_GRANULE_THRESHOLD * NOFL_GRANULE_SIZE); -// Each granule has one mark byte stored in a side table. A granule's -// mark state is a whole byte instead of a bit to facilitate parallel -// marking. (Parallel markers are allowed to race.) 
We also use this -// byte to compute object extent, via a bit flag indicating -// end-of-object. -// -// Because we want to allow for conservative roots, we need to know -// whether an address indicates an object or not. That means that when -// an object is allocated, it has to set a bit, somewhere. We use the -// metadata byte for this purpose, setting the "young" bit. -// -// The "young" bit's name might make you think about generational -// collection, and indeed all objects collected in a minor collection -// will have this bit set. However, the nofl space never needs to check -// for the young bit; if it weren't for the need to identify -// conservative roots, we wouldn't need a young bit at all. Perhaps in -// an all-precise system, we would be able to avoid the overhead of -// initializing mark byte upon each fresh allocation. -// -// When an object becomes dead after a GC, it will still have a bit set -// -- maybe the young bit, or maybe a survivor bit. The sweeper has to -// clear these bits before the next collection. But, for concurrent -// marking, we will also be marking "live" objects, updating their mark -// bits. So there are four object states concurrently observable: -// young, dead, survivor, and marked. (If we didn't have concurrent -// marking we would still need the "marked" state, because marking -// mutator roots before stopping is also a form of concurrent marking.) -// Even though these states are mutually exclusive, we use separate bits -// for them because we have the space. After each collection, the dead, -// survivor, and marked states rotate by one bit. -enum nofl_metadata_byte { - NOFL_METADATA_BYTE_NONE = 0, - NOFL_METADATA_BYTE_YOUNG = 1, - NOFL_METADATA_BYTE_MARK_0 = 2, - NOFL_METADATA_BYTE_MARK_1 = 4, - NOFL_METADATA_BYTE_MARK_2 = 8, - NOFL_METADATA_BYTE_END = 16, - NOFL_METADATA_BYTE_EPHEMERON = 32, - NOFL_METADATA_BYTE_PINNED = 64, - NOFL_METADATA_BYTE_UNUSED_1 = 128 -}; - -static uint8_t -nofl_rotate_dead_survivor_marked(uint8_t mask) { - uint8_t all = - NOFL_METADATA_BYTE_MARK_0 | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; - return ((mask << 1) | (mask >> 2)) & all; -} - #define NOFL_SLAB_SIZE (4 * 1024 * 1024) #define NOFL_BLOCK_SIZE (64 * 1024) #define NOFL_METADATA_BYTES_PER_BLOCK (NOFL_BLOCK_SIZE / NOFL_GRANULE_SIZE) @@ -144,7 +95,9 @@ struct nofl_block_summary { uint16_t hole_count; uint16_t free_granules; // Counters related to allocation since previous collection: - // wasted space due to fragmentation. + // wasted space due to fragmentation. Also used by blocks on the + // "partly full" list, which have zero holes_with_fragmentation + // but nonzero fragmentation_granules. uint16_t holes_with_fragmentation; uint16_t fragmentation_granules; // After a block is swept, if it's empty it goes on the empties @@ -173,6 +126,91 @@ struct nofl_slab { }; STATIC_ASSERT_EQ(sizeof(struct nofl_slab), NOFL_SLAB_SIZE); +// Lock-free block list. 
+struct nofl_block_list { + size_t count; + uintptr_t blocks; +}; + +struct nofl_space { + uint64_t sweep_mask; + uint8_t live_mask; + uint8_t marked_mask; + uint8_t evacuating; + uintptr_t low_addr; + size_t extent; + size_t heap_size; + uint8_t last_collection_was_minor; + uintptr_t next_block; // atomically + struct nofl_block_list empty; + struct nofl_block_list unavailable; + struct nofl_block_list partly_full; + struct nofl_block_list evacuation_targets; + double evacuation_minimum_reserve; + double evacuation_reserve; + double venerable_threshold; + ssize_t pending_unavailable_bytes; // atomically + struct nofl_slab *slabs; + size_t nslabs; + uintptr_t granules_freed_by_last_collection; // atomically + uintptr_t fragmentation_granules_since_last_collection; // atomically +}; + +struct nofl_allocator { + uintptr_t alloc; + uintptr_t sweep; + uintptr_t block; +}; + +// Each granule has one mark byte stored in a side table. A granule's +// mark state is a whole byte instead of a bit to facilitate parallel +// marking. (Parallel markers are allowed to race.) We also use this +// byte to compute object extent, via a bit flag indicating +// end-of-object. +// +// Because we want to allow for conservative roots, we need to know +// whether an address indicates an object or not. That means that when +// an object is allocated, it has to set a bit, somewhere. We use the +// metadata byte for this purpose, setting the "young" bit. +// +// The "young" bit's name might make you think about generational +// collection, and indeed all objects collected in a minor collection +// will have this bit set. However, the nofl space never needs to check +// for the young bit; if it weren't for the need to identify +// conservative roots, we wouldn't need a young bit at all. Perhaps in +// an all-precise system, we would be able to avoid the overhead of +// initializing mark byte upon each fresh allocation. +// +// When an object becomes dead after a GC, it will still have a bit set +// -- maybe the young bit, or maybe a survivor bit. The sweeper has to +// clear these bits before the next collection. But, for concurrent +// marking, we will also be marking "live" objects, updating their mark +// bits. So there are four object states concurrently observable: +// young, dead, survivor, and marked. (If we didn't have concurrent +// marking we would still need the "marked" state, because marking +// mutator roots before stopping is also a form of concurrent marking.) +// Even though these states are mutually exclusive, we use separate bits +// for them because we have the space. After each collection, the dead, +// survivor, and marked states rotate by one bit. +enum nofl_metadata_byte { + NOFL_METADATA_BYTE_NONE = 0, + NOFL_METADATA_BYTE_YOUNG = 1, + NOFL_METADATA_BYTE_MARK_0 = 2, + NOFL_METADATA_BYTE_MARK_1 = 4, + NOFL_METADATA_BYTE_MARK_2 = 8, + NOFL_METADATA_BYTE_END = 16, + NOFL_METADATA_BYTE_EPHEMERON = 32, + NOFL_METADATA_BYTE_PINNED = 64, + NOFL_METADATA_BYTE_UNUSED_1 = 128 +}; + +static uint8_t +nofl_rotate_dead_survivor_marked(uint8_t mask) { + uint8_t all = + NOFL_METADATA_BYTE_MARK_0 | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + return ((mask << 1) | (mask >> 2)) & all; +} + static struct nofl_slab* nofl_object_slab(void *obj) { uintptr_t addr = (uintptr_t) obj; @@ -235,12 +273,6 @@ nofl_block_summary_set_next(struct nofl_block_summary *summary, (summary->next_and_flags & (NOFL_BLOCK_SIZE - 1)) | next; } -// Lock-free block list. 
-struct nofl_block_list { - size_t count; - uintptr_t blocks; -}; - static void nofl_push_block(struct nofl_block_list *list, uintptr_t block) { atomic_fetch_add_explicit(&list->count, 1, memory_order_acq_rel); @@ -267,46 +299,72 @@ nofl_pop_block(struct nofl_block_list *list) { return head; } -static inline size_t -nofl_size_to_granules(size_t size) { - return (size + NOFL_GRANULE_SIZE - 1) >> NOFL_GRANULE_SIZE_LOG_2; +static size_t +nofl_block_count(struct nofl_block_list *list) { + return atomic_load_explicit(&list->count, memory_order_acquire); } -struct nofl_evacuation_allocator { - size_t allocated; // atomically - size_t limit; - uintptr_t block_cursor; // atomically -}; +static void +nofl_push_unavailable_block(struct nofl_space *space, uintptr_t block) { + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)); + GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)); + nofl_block_summary_set_flag(summary, NOFL_BLOCK_UNAVAILABLE); + madvise((void*)block, NOFL_BLOCK_SIZE, MADV_DONTNEED); + nofl_push_block(&space->unavailable, block); +} -struct nofl_space { - uint64_t sweep_mask; - uint8_t live_mask; - uint8_t marked_mask; - uint8_t evacuating; - uintptr_t low_addr; - size_t extent; - size_t heap_size; - uint8_t last_collection_was_minor; - uintptr_t next_block; // atomically - struct nofl_block_list empty; - struct nofl_block_list unavailable; - struct nofl_block_list evacuation_targets; - double evacuation_minimum_reserve; - double evacuation_reserve; - double venerable_threshold; - ssize_t pending_unavailable_bytes; // atomically - struct nofl_evacuation_allocator evacuation_allocator; - struct nofl_slab *slabs; - size_t nslabs; - uintptr_t granules_freed_by_last_collection; // atomically - uintptr_t fragmentation_granules_since_last_collection; // atomically -}; +static uintptr_t +nofl_pop_unavailable_block(struct nofl_space *space) { + uintptr_t block = nofl_pop_block(&space->unavailable); + if (!block) + return 0; + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + GC_ASSERT(nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)); + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_UNAVAILABLE); + return block; +} -struct nofl_allocator { - uintptr_t alloc; - uintptr_t sweep; - uintptr_t block; -}; +static uintptr_t +nofl_pop_empty_block(struct nofl_space *space) { + return nofl_pop_block(&space->empty); +} + +static int +nofl_maybe_push_evacuation_target(struct nofl_space *space, + uintptr_t block, double reserve) { + GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block), + NOFL_BLOCK_NEEDS_SWEEP)); + size_t targets = nofl_block_count(&space->evacuation_targets); + size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; + size_t unavailable = nofl_block_count(&space->unavailable); + if (targets >= (total - unavailable) * reserve) + return 0; + + nofl_push_block(&space->evacuation_targets, block); + return 1; +} + +static int +nofl_push_evacuation_target_if_needed(struct nofl_space *space, + uintptr_t block) { + return nofl_maybe_push_evacuation_target(space, block, + space->evacuation_minimum_reserve); +} + +static int +nofl_push_evacuation_target_if_possible(struct nofl_space *space, + uintptr_t block) { + return nofl_maybe_push_evacuation_target(space, block, + space->evacuation_reserve); +} + +static void +nofl_push_empty_block(struct nofl_space *space, uintptr_t block) { + 
GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block), + NOFL_BLOCK_NEEDS_SWEEP)); + nofl_push_block(&space->empty, block); +} static inline void nofl_clear_memory(uintptr_t addr, size_t size) { @@ -318,36 +376,6 @@ nofl_space_live_object_granules(uint8_t *metadata) { return scan_for_byte(metadata, -1, broadcast_byte(NOFL_METADATA_BYTE_END)) + 1; } -static inline int -nofl_space_mark_object(struct nofl_space *space, struct gc_ref ref) { - uint8_t *loc = nofl_metadata_byte_for_object(ref); - uint8_t byte = *loc; - if (byte & space->marked_mask) - return 0; - uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 - | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; - *loc = (byte & ~mask) | space->marked_mask; - return 1; -} - -static uintptr_t -nofl_make_evacuation_allocator_cursor(uintptr_t block, size_t allocated) { - GC_ASSERT(allocated < (NOFL_BLOCK_SIZE - 1) * (uint64_t) NOFL_BLOCK_SIZE); - return align_down(block, NOFL_BLOCK_SIZE) | (allocated / NOFL_BLOCK_SIZE); -} - -static void -nofl_prepare_evacuation_allocator(struct nofl_evacuation_allocator *alloc, - struct nofl_block_list *targets) { - uintptr_t first_block = targets->blocks; - atomic_store_explicit(&alloc->allocated, 0, memory_order_release); - alloc->limit = - atomic_load_explicit(&targets->count, memory_order_acquire) * NOFL_BLOCK_SIZE; - atomic_store_explicit(&alloc->block_cursor, - nofl_make_evacuation_allocator_cursor(first_block, 0), - memory_order_release); -} - static void nofl_clear_remaining_metadata_bytes_in_block(uintptr_t block, uintptr_t allocated) { @@ -360,115 +388,747 @@ nofl_clear_remaining_metadata_bytes_in_block(uintptr_t block, } static void -nofl_finish_evacuation_allocator_block(uintptr_t block, - uintptr_t allocated) { - GC_ASSERT(allocated <= NOFL_BLOCK_SIZE); - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); - size_t fragmentation = (NOFL_BLOCK_SIZE - allocated) >> NOFL_GRANULE_SIZE_LOG_2; - summary->hole_count = 1; - summary->free_granules = NOFL_GRANULES_PER_BLOCK; - summary->holes_with_fragmentation = fragmentation ? 1 : 0; - summary->fragmentation_granules = fragmentation; - if (fragmentation) - nofl_clear_remaining_metadata_bytes_in_block(block, allocated); +nofl_allocator_reset(struct nofl_allocator *alloc) { + alloc->alloc = alloc->sweep = alloc->block = 0; } static void -nofl_finish_evacuation_allocator(struct nofl_evacuation_allocator *alloc, - struct nofl_block_list *targets, - struct nofl_block_list *empties, - size_t reserve) { - // Blocks that we used for evacuation get returned to the mutator as - // sweepable blocks. Blocks that we didn't get to use go to the - // empties. 
- size_t allocated = atomic_load_explicit(&alloc->allocated, - memory_order_acquire); - atomic_store_explicit(&alloc->allocated, 0, memory_order_release); - if (allocated > alloc->limit) - allocated = alloc->limit; - while (allocated >= NOFL_BLOCK_SIZE) { - uintptr_t block = nofl_pop_block(targets); - GC_ASSERT(block); - allocated -= NOFL_BLOCK_SIZE; +nofl_allocator_release_full_block(struct nofl_allocator *alloc, + struct nofl_space *space, + struct nofl_block_summary *summary) { + GC_ASSERT(alloc->block); + GC_ASSERT(alloc->alloc == alloc->sweep); + GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE)); + + atomic_fetch_add(&space->granules_freed_by_last_collection, + summary->free_granules); + atomic_fetch_add(&space->fragmentation_granules_since_last_collection, + summary->fragmentation_granules); + + // If this block has mostly survivors, we should avoid sweeping it and + // trying to allocate into it for a minor GC. Sweep it next time to + // clear any garbage allocated in this cycle and mark it as + // "venerable" (i.e., old). + if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP) && + summary->free_granules < NOFL_GRANULES_PER_BLOCK * space->venerable_threshold) + nofl_block_summary_set_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP); + + nofl_allocator_reset(alloc); +} + +static void +nofl_allocator_release_partly_full_block(struct nofl_allocator *alloc, + struct nofl_space *space, + struct nofl_block_summary *summary) { + // A block can go on the partly full list if it has exactly one + // hole, located at the end of the block. + GC_ASSERT(alloc->alloc > alloc->block); + GC_ASSERT(alloc->sweep == alloc->block + NOFL_BLOCK_SIZE); + GC_ASSERT(nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)); + size_t hole_size = alloc->sweep - alloc->alloc; + GC_ASSERT(hole_size); + summary->fragmentation_granules = hole_size >> NOFL_GRANULE_SIZE_LOG_2; + nofl_push_block(&space->partly_full, alloc->block); + nofl_allocator_reset(alloc); +} + +static size_t +nofl_allocator_acquire_partly_full_block(struct nofl_allocator *alloc, + struct nofl_space *space) { + uintptr_t block = nofl_pop_block(&space->partly_full); + if (!block) return 0; + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + GC_ASSERT(summary->holes_with_fragmentation == 0); + alloc->block = block; + alloc->sweep = block + NOFL_BLOCK_SIZE; + size_t hole_granules = summary->fragmentation_granules; + summary->fragmentation_granules = 0; + alloc->alloc = alloc->sweep - (hole_granules << NOFL_GRANULE_SIZE_LOG_2); + return hole_granules; +} + +static size_t +nofl_allocator_acquire_empty_block(struct nofl_allocator *alloc, + struct nofl_space *space) { + uintptr_t block = nofl_pop_empty_block(space); + if (!block) return 0; + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + summary->hole_count = 1; + summary->free_granules = NOFL_GRANULES_PER_BLOCK; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); + alloc->block = alloc->alloc = block; + alloc->sweep = block + NOFL_BLOCK_SIZE; + nofl_clear_memory(block, NOFL_BLOCK_SIZE); + return NOFL_GRANULES_PER_BLOCK; +} + +static void +nofl_allocator_finish_hole(struct nofl_allocator *alloc) { + size_t granules = (alloc->sweep - alloc->alloc) / NOFL_GRANULE_SIZE; + if (granules) { + struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); + summary->holes_with_fragmentation++; + 
summary->fragmentation_granules += granules; + uint8_t *metadata = nofl_metadata_byte_for_addr(alloc->alloc); + memset(metadata, 0, granules); + alloc->alloc = alloc->sweep; } - if (allocated) { - // Finish off the last partially-filled block. - uintptr_t block = nofl_pop_block(targets); - GC_ASSERT(block); - nofl_finish_evacuation_allocator_block(block, allocated); +} + +// Sweep some heap to reclaim free space, advancing alloc->alloc and +// alloc->sweep. Return the size of the hole in granules, or 0 if we +// reached the end of the block. +static size_t +nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, + uintptr_t sweep_mask) { + GC_ASSERT(alloc->block != 0); + GC_ASSERT_EQ(alloc->alloc, alloc->sweep); + uintptr_t sweep = alloc->sweep; + uintptr_t limit = alloc->block + NOFL_BLOCK_SIZE; + + if (sweep == limit) + return 0; + + GC_ASSERT((sweep & (NOFL_GRANULE_SIZE - 1)) == 0); + uint8_t* metadata = nofl_metadata_byte_for_addr(sweep); + size_t limit_granules = (limit - sweep) >> NOFL_GRANULE_SIZE_LOG_2; + + // Except for when we first get a block, alloc->sweep is positioned + // right after a hole, which can point to either the end of the + // block or to a live object. Assume that a live object is more + // common. + while (limit_granules && (metadata[0] & sweep_mask)) { + // Object survived collection; skip over it and continue sweeping. + size_t object_granules = nofl_space_live_object_granules(metadata); + sweep += object_granules * NOFL_GRANULE_SIZE; + limit_granules -= object_granules; + metadata += object_granules; + } + if (!limit_granules) { + GC_ASSERT_EQ(sweep, limit); + alloc->alloc = alloc->sweep = limit; + return 0; + } + + size_t free_granules = scan_for_byte(metadata, limit_granules, sweep_mask); + GC_ASSERT(free_granules); + GC_ASSERT(free_granules <= limit_granules); + + struct nofl_block_summary *summary = nofl_block_summary_for_addr(sweep); + summary->hole_count++; + GC_ASSERT(free_granules <= NOFL_GRANULES_PER_BLOCK - summary->free_granules); + summary->free_granules += free_granules; + + size_t free_bytes = free_granules * NOFL_GRANULE_SIZE; + alloc->alloc = sweep; + alloc->sweep = sweep + free_bytes; + return free_granules; +} + +static void +nofl_allocator_finish_sweeping_in_block(struct nofl_allocator *alloc, + uintptr_t sweep_mask) { + do { + nofl_allocator_finish_hole(alloc); + } while (nofl_allocator_next_hole_in_block(alloc, sweep_mask)); +} + +static void +nofl_allocator_release_block(struct nofl_allocator *alloc, + struct nofl_space *space) { + GC_ASSERT(alloc->block); + struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); + if (alloc->alloc < alloc->sweep && + alloc->sweep == alloc->block + NOFL_BLOCK_SIZE && + summary->holes_with_fragmentation == 0) { + nofl_allocator_release_partly_full_block(alloc, space, summary); + } else { + nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask); + nofl_allocator_release_full_block(alloc, space, summary); + } +} + +static void +nofl_allocator_finish(struct nofl_allocator *alloc, struct nofl_space *space) { + if (alloc->block) + nofl_allocator_release_block(alloc, space); +} + +static uintptr_t +nofl_space_next_block_to_sweep(struct nofl_space *space) { + uintptr_t block = atomic_load_explicit(&space->next_block, + memory_order_acquire); + uintptr_t next_block; + do { + if (block == 0) + return 0; + + next_block = block + NOFL_BLOCK_SIZE; + if (next_block % NOFL_SLAB_SIZE == 0) { + uintptr_t hi_addr = space->low_addr + space->extent; + if (next_block == hi_addr) + 
next_block = 0; + else + next_block += NOFL_META_BLOCKS_PER_SLAB * NOFL_BLOCK_SIZE; + } + } while (!atomic_compare_exchange_weak(&space->next_block, &block, + next_block)); + return block; +} + +static int +nofl_maybe_release_swept_empty_block(struct nofl_allocator *alloc, + struct nofl_space *space) { + GC_ASSERT(alloc->block); + uintptr_t block = alloc->block; + if (atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire) <= 0) + return 0; + + nofl_push_unavailable_block(space, block); + atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); + nofl_allocator_reset(alloc); + return 1; +} + +static size_t +nofl_allocator_next_hole(struct nofl_allocator *alloc, + struct nofl_space *space) { + nofl_allocator_finish_hole(alloc); + // As we sweep if we find that a block is empty, we return it to the + // empties list. Empties are precious. But if we return 10 blocks in + // a row, and still find an 11th empty, go ahead and use it. + size_t empties_countdown = 10; + while (1) { + // Sweep current block for a hole. + if (alloc->block) { + struct nofl_block_summary *summary = + nofl_block_summary_for_addr(alloc->block); + size_t granules = + nofl_allocator_next_hole_in_block(alloc, space->sweep_mask); + if (granules) { + // If the hole spans only part of a block, let the allocator try + // to use it. + if (granules < NOFL_GRANULES_PER_BLOCK) + return granules; + // Otherwise we have an empty block. + nofl_clear_remaining_metadata_bytes_in_block(alloc->block, 0); + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); + // If we need an evacuation reserve block, take it. + if (nofl_push_evacuation_target_if_needed(space, alloc->block)) { + nofl_allocator_reset(alloc); + continue; + } + // If we have pending pages to release to the OS, we should unmap + // this block. + if (nofl_maybe_release_swept_empty_block(alloc, space)) + continue; + // Otherwise if we've already returned lots of empty blocks to the + // freelist, let the allocator keep this block. + if (!empties_countdown) { + // After this block is allocated into, it will need to be swept. + nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); + return granules; + } + // Otherwise we push to the empty blocks list. + nofl_push_empty_block(space, alloc->block); + nofl_allocator_reset(alloc); + empties_countdown--; + } else { + nofl_allocator_release_full_block(alloc, space, summary); + } + } + + GC_ASSERT(alloc->block == 0); + + { + size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space); + if (granules) + return granules; + } + + while (1) { + uintptr_t block = nofl_space_next_block_to_sweep(space); + if (block) { + // Sweeping found a block. We might take it for allocation, or + // we might send it back. + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + // If it's marked unavailable, it's already on a list of + // unavailable blocks, so skip and get the next block. + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + continue; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE)) { + // Skip venerable blocks after a minor GC -- we don't need to + // sweep as they weren't allocated into last cycle, and the + // mark bytes didn't rotate, so we have no cleanup to do; and + // we shouldn't try to allocate into them as it's not worth + // it. Any wasted space is measured as fragmentation. 
+ if (space->last_collection_was_minor) + continue; + else + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE); + } + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) { + // Prepare to sweep the block for holes. + alloc->alloc = alloc->sweep = alloc->block = block; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP)) { + // In the last cycle we noted that this block consists of + // mostly old data. Sweep any garbage, commit the mark as + // venerable, and avoid allocating into it. + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP); + if (space->last_collection_was_minor) { + nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask); + nofl_allocator_release_full_block(alloc, space, summary); + nofl_block_summary_set_flag(summary, NOFL_BLOCK_VENERABLE); + continue; + } + } + // This block was marked in the last GC and needs sweeping. + // As we sweep we'll want to record how many bytes were live + // at the last collection. As we allocate we'll record how + // many granules were wasted because of fragmentation. + summary->hole_count = 0; + summary->free_granules = 0; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + break; + } else { + // Otherwise this block is completely empty and is on the + // empties list. We take from the empties list only after all + // the NEEDS_SWEEP blocks are processed. + continue; + } + } else { + // We are done sweeping for blocks. Now take from the empties + // list. + block = nofl_pop_empty_block(space); + // No empty block? Return 0 to cause collection. + if (!block) + return 0; + + // Maybe we should use this empty as a target for evacuation. + if (nofl_push_evacuation_target_if_possible(space, block)) + continue; + + // Otherwise give the block to the allocator. + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); + summary->hole_count = 1; + summary->free_granules = NOFL_GRANULES_PER_BLOCK; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + alloc->block = block; + alloc->alloc = block; + alloc->sweep = block + NOFL_BLOCK_SIZE; + return NOFL_GRANULES_PER_BLOCK; + } + } } - size_t remaining = atomic_load_explicit(&targets->count, memory_order_acquire); - while (remaining-- > reserve) - nofl_push_block(empties, nofl_pop_block(targets)); } static struct gc_ref -nofl_evacuation_allocate(struct nofl_space *space, size_t granules) { - // All collector threads compete to allocate from what is logically a - // single bump-pointer arena, which is actually composed of a linked - // list of blocks. - struct nofl_evacuation_allocator *alloc = &space->evacuation_allocator; - uintptr_t cursor = atomic_load_explicit(&alloc->block_cursor, - memory_order_acquire); - size_t bytes = granules * NOFL_GRANULE_SIZE; - size_t prev = atomic_load_explicit(&alloc->allocated, memory_order_acquire); - size_t block_mask = (NOFL_BLOCK_SIZE - 1); - size_t next; - do { - if (prev >= alloc->limit) - // No more space. - return gc_ref_null(); - next = prev + bytes; - if ((prev ^ next) & ~block_mask) - // Allocation straddles a block boundary; advance so it starts a - // fresh block. - next = (next & ~block_mask) + bytes; - } while (!atomic_compare_exchange_weak(&alloc->allocated, &prev, next)); - // OK, we've claimed our memory, starting at next - bytes. 
Now find - // the node in the linked list of evacuation targets that corresponds - // to this allocation pointer. - uintptr_t block = cursor & ~block_mask; - // This is the SEQ'th block to be allocated into. - uintptr_t seq = cursor & block_mask; - // Therefore this block handles allocations starting at SEQ*BLOCK_SIZE - // and continuing for NOFL_BLOCK_SIZE bytes. - uintptr_t base = seq * NOFL_BLOCK_SIZE; +nofl_allocate(struct nofl_allocator *alloc, struct nofl_space *space, + size_t size, void (*gc)(void*), void *gc_data) { + GC_ASSERT(size > 0); + GC_ASSERT(size <= gc_allocator_large_threshold()); + size = align_up(size, NOFL_GRANULE_SIZE); - while ((base ^ next) & ~block_mask) { - GC_ASSERT(base < next); - if (base + NOFL_BLOCK_SIZE > prev) { - // The allocation straddles a block boundary, and the cursor has - // caught up so that we identify the block for the previous - // allocation pointer. Finish the previous block, probably - // leaving a small hole at the end. - nofl_finish_evacuation_allocator_block(block, prev - base); + if (alloc->alloc + size > alloc->sweep) { + size_t granules = size >> NOFL_GRANULE_SIZE_LOG_2; + while (1) { + size_t hole = nofl_allocator_next_hole(alloc, space); + if (hole >= granules) { + nofl_clear_memory(alloc->alloc, hole * NOFL_GRANULE_SIZE); + break; + } + if (!hole) + gc(gc_data); } - // Cursor lags; advance it. - block = nofl_block_summary_next(nofl_block_summary_for_addr(block)); - base += NOFL_BLOCK_SIZE; - if (base >= alloc->limit) { - // Ran out of blocks! - GC_ASSERT(!block); - return gc_ref_null(); - } - GC_ASSERT(block); - // This store can race with other allocators, but that's OK as long - // as it never advances the cursor beyond the allocation pointer, - // which it won't because we updated the allocation pointer already. - atomic_store_explicit(&alloc->block_cursor, - nofl_make_evacuation_allocator_cursor(block, base), - memory_order_release); } - uintptr_t addr = block + (next & block_mask) - bytes; - return gc_ref(addr); + struct gc_ref ret = gc_ref(alloc->alloc); + alloc->alloc += size; + gc_update_alloc_table(ret, size); + return ret; +} + +static size_t +nofl_allocator_acquire_evacuation_block(struct nofl_allocator* alloc, + struct nofl_space *space) { + size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space); + if (granules) + return granules; + return nofl_allocator_acquire_empty_block(alloc, space); +} + +static struct gc_ref +nofl_evacuation_allocate(struct nofl_allocator* alloc, struct nofl_space *space, + size_t granules) { + size_t avail = (alloc->sweep - alloc->alloc) >> NOFL_GRANULE_SIZE_LOG_2; + while (avail < granules) { + if (alloc->block) { + nofl_allocator_finish_hole(alloc); + nofl_allocator_release_full_block(alloc, space, + nofl_block_summary_for_addr(alloc->block)); + } + avail = nofl_allocator_acquire_evacuation_block(alloc, space); + if (!avail) + return gc_ref_null(); + } + + struct gc_ref ret = gc_ref(alloc->alloc); + alloc->alloc += granules * NOFL_GRANULE_SIZE; + gc_update_alloc_table(ret, granules * NOFL_GRANULE_SIZE); + return ret; +} + +// Another thread is triggering GC. Before we stop, finish clearing the +// dead mark bytes for the mutator's block, and release the block. 
+static void +nofl_finish_sweeping(struct nofl_allocator *alloc, + struct nofl_space *space) { + while (nofl_allocator_next_hole(alloc, space)) {} +} + +static inline int +nofl_is_ephemeron(struct gc_ref ref) { + uint8_t meta = *nofl_metadata_byte_for_addr(gc_ref_value(ref)); + return meta & NOFL_METADATA_BYTE_EPHEMERON; +} + +static void +nofl_space_set_ephemeron_flag(struct gc_ref ref) { + if (gc_has_conservative_intraheap_edges()) { + uint8_t *metadata = nofl_metadata_byte_for_addr(gc_ref_value(ref)); + *metadata |= NOFL_METADATA_BYTE_EPHEMERON; + } +} + +// Note that it's quite possible (and even likely) that any given remset +// byte doesn't hold any roots, if all stores were to nursery objects. +STATIC_ASSERT_EQ(NOFL_GRANULES_PER_REMSET_BYTE % 8, 0); +static void +nofl_space_trace_card(struct nofl_space *space, struct nofl_slab *slab, + size_t card, + void (*enqueue)(struct gc_ref, struct gc_heap*), + struct gc_heap *heap) { + uintptr_t first_addr_in_slab = (uintptr_t) &slab->blocks[0]; + size_t granule_base = card * NOFL_GRANULES_PER_REMSET_BYTE; + for (size_t granule_in_remset = 0; + granule_in_remset < NOFL_GRANULES_PER_REMSET_BYTE; + granule_in_remset += 8, granule_base += 8) { + uint64_t mark_bytes = load_eight_aligned_bytes(slab->metadata + granule_base); + mark_bytes &= space->sweep_mask; + while (mark_bytes) { + size_t granule_offset = count_zero_bytes(mark_bytes); + mark_bytes &= ~(((uint64_t)0xff) << (granule_offset * 8)); + size_t granule = granule_base + granule_offset; + uintptr_t addr = first_addr_in_slab + granule * NOFL_GRANULE_SIZE; + GC_ASSERT(nofl_metadata_byte_for_addr(addr) == &slab->metadata[granule]); + enqueue(gc_ref(addr), heap); + } + } +} + +static void +nofl_space_trace_remembered_set(struct nofl_space *space, + void (*enqueue)(struct gc_ref, + struct gc_heap*), + struct gc_heap *heap) { + GC_ASSERT(!space->evacuating); + for (size_t s = 0; s < space->nslabs; s++) { + struct nofl_slab *slab = &space->slabs[s]; + uint8_t *remset = slab->remembered_set; + for (size_t card_base = 0; + card_base < NOFL_REMSET_BYTES_PER_SLAB; + card_base += 8) { + uint64_t remset_bytes = load_eight_aligned_bytes(remset + card_base); + if (!remset_bytes) continue; + memset(remset + card_base, 0, 8); + while (remset_bytes) { + size_t card_offset = count_zero_bytes(remset_bytes); + remset_bytes &= ~(((uint64_t)0xff) << (card_offset * 8)); + nofl_space_trace_card(space, slab, card_base + card_offset, + enqueue, heap); + } + } + } +} + +static void +nofl_space_clear_remembered_set(struct nofl_space *space) { + if (!GC_GENERATIONAL) return; + for (size_t slab = 0; slab < space->nslabs; slab++) { + memset(space->slabs[slab].remembered_set, 0, NOFL_REMSET_BYTES_PER_SLAB); + } +} + +static void +nofl_space_reset_sweeper(struct nofl_space *space) { + space->next_block = (uintptr_t) &space->slabs[0].blocks; +} + +static void +nofl_space_update_mark_patterns(struct nofl_space *space, + int advance_mark_mask) { + uint8_t survivor_mask = space->marked_mask; + uint8_t next_marked_mask = nofl_rotate_dead_survivor_marked(survivor_mask); + if (advance_mark_mask) + space->marked_mask = next_marked_mask; + space->live_mask = survivor_mask | next_marked_mask; + space->sweep_mask = broadcast_byte(space->live_mask); +} + +static void +nofl_space_reset_statistics(struct nofl_space *space) { + space->granules_freed_by_last_collection = 0; + space->fragmentation_granules_since_last_collection = 0; +} + +static size_t +nofl_space_yield(struct nofl_space *space) { + return 
space->granules_freed_by_last_collection * NOFL_GRANULE_SIZE; +} + +static size_t +nofl_space_evacuation_reserve_bytes(struct nofl_space *space) { + return nofl_block_count(&space->evacuation_targets) * NOFL_BLOCK_SIZE; +} + +static size_t +nofl_space_fragmentation(struct nofl_space *space) { + size_t granules = space->fragmentation_granules_since_last_collection; + return granules * NOFL_GRANULE_SIZE; +} + +static void +nofl_space_prepare_evacuation(struct nofl_space *space) { + GC_ASSERT(!space->evacuating); + { + uintptr_t block; + while ((block = nofl_pop_block(&space->evacuation_targets))) + nofl_push_empty_block(space, block); + } + size_t target_blocks = nofl_block_count(&space->empty); + DEBUG("evacuation target block count: %zu\n", target_blocks); + + if (target_blocks == 0) { + DEBUG("no evacuation target blocks, not evacuating this round\n"); + return; + } + + // Put the mutator into evacuation mode, collecting up to 50% of free + // space as evacuation blocks. + space->evacuation_reserve = 0.5; + space->evacuating = 1; + + size_t target_granules = target_blocks * NOFL_GRANULES_PER_BLOCK; + // Compute histogram where domain is the number of granules in a block + // that survived the last collection, aggregated into 33 buckets, and + // range is number of blocks in that bucket. (Bucket 0 is for blocks + // that were found to be completely empty; such blocks may be on the + // evacuation target list.) + const size_t bucket_count = 33; + size_t histogram[33] = {0,}; + size_t bucket_size = NOFL_GRANULES_PER_BLOCK / 32; + size_t empties = 0; + for (size_t slab = 0; slab < space->nslabs; slab++) { + for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { + struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + continue; + if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) { + empties++; + continue; + } + size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; + size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; + histogram[bucket]++; + } + } + + // Blocks which lack the NEEDS_SWEEP flag are empty, either because + // they have been removed from the pool and have the UNAVAILABLE flag + // set, or because they are on the empties or evacuation target + // lists. When evacuation starts, the empties list should be empty. + GC_ASSERT(empties == target_blocks); + + // Now select a number of blocks that is likely to fill the space in + // the target blocks. Prefer candidate blocks with fewer survivors + // from the last GC, to increase expected free block yield. + for (size_t bucket = 0; bucket < bucket_count; bucket++) { + size_t bucket_granules = bucket * bucket_size * histogram[bucket]; + if (bucket_granules <= target_granules) { + target_granules -= bucket_granules; + } else { + histogram[bucket] = target_granules / (bucket_size * bucket); + target_granules = 0; + } + } + + // Having selected the number of blocks, now we set the evacuation + // candidate flag on all blocks. 
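  // As a concrete illustration of the selection above (an editor's note,
  // not part of the patch, assuming 16-byte granules):
  // NOFL_GRANULES_PER_BLOCK is 64 kB / 16 B = 4096, so bucket_size is
  // 4096 / 32 = 128 granules.  A block with 100 surviving granules lands
  // in bucket 1 and is among the first candidates taken; a block with
  // 3000 survivors lands in bucket 24 and is taken only if the sparser
  // buckets did not already cover target_granules.  The loop below marks
  // the blocks whose bucket still has a nonzero count.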
+ for (size_t slab = 0; slab < space->nslabs; slab++) { + for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { + struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + continue; + if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) + continue; + size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; + size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; + if (histogram[bucket]) { + nofl_block_summary_set_flag(summary, NOFL_BLOCK_EVACUATE); + histogram[bucket]--; + } else { + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_EVACUATE); + } + } + } +} + +static void +nofl_space_finish_evacuation(struct nofl_space *space) { + // When evacuation began, the evacuation reserve was moved to the + // empties list. Now that evacuation is finished, attempt to + // repopulate the reserve. + GC_ASSERT(space->evacuating); + space->evacuating = 0; + space->evacuation_reserve = space->evacuation_minimum_reserve; + size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; + size_t unavailable = nofl_block_count(&space->unavailable); + size_t reserve = space->evacuation_minimum_reserve * (total - unavailable); + GC_ASSERT(nofl_block_count(&space->evacuation_targets) == 0); + while (reserve--) { + uintptr_t block = nofl_pop_block(&space->empty); + if (!block) break; + nofl_push_block(&space->evacuation_targets, block); + } + { + // FIXME: We should avoid sweeping partly full blocks, but it's too annoying + // to do at the moment given the way sweeping works. + uintptr_t block; + do { + block = nofl_pop_block(&space->partly_full); + } while (block); + } +} + +static inline size_t +nofl_size_to_granules(size_t size) { + return (size + NOFL_GRANULE_SIZE - 1) >> NOFL_GRANULE_SIZE_LOG_2; +} + +static void +nofl_space_verify_before_restart(struct nofl_space *space) { + GC_ASSERT_EQ(nofl_block_count(&space->partly_full), 0); + // Iterate objects in each block, verifying that the END bytes correspond to + // the measured object size. 
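  // For example (an editor's illustration, assuming 16-byte granules): a
  // 40-byte object spans nofl_size_to_granules(40) = 3 granules, so the
  // loop below expects NOFL_METADATA_BYTE_END to be clear in its first
  // two metadata bytes and set in the third; this is the same layout
  // that nofl_space_live_object_granules() scans for.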
+ for (size_t slab = 0; slab < space->nslabs; slab++) { + for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { + struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + continue; + + uintptr_t addr = (uintptr_t)space->slabs[slab].blocks[block].data; + uintptr_t limit = addr + NOFL_BLOCK_SIZE; + uint8_t *meta = nofl_metadata_byte_for_addr(addr); + while (addr < limit) { + if (meta[0] & space->live_mask) { + struct gc_ref obj = gc_ref(addr); + size_t obj_bytes; + gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); + size_t granules = nofl_size_to_granules(obj_bytes); + GC_ASSERT(granules); + for (size_t granule = 0; granule < granules - 1; granule++) + GC_ASSERT(!(meta[granule] & NOFL_METADATA_BYTE_END)); + GC_ASSERT(meta[granules - 1] & NOFL_METADATA_BYTE_END); + meta += granules; + addr += granules * NOFL_GRANULE_SIZE; + } else { + meta++; + addr += NOFL_GRANULE_SIZE; + } + } + GC_ASSERT(addr == limit); + } + } +} + +static void +nofl_space_finish_gc(struct nofl_space *space, + enum gc_collection_kind gc_kind) { + space->last_collection_was_minor = (gc_kind == GC_COLLECTION_MINOR); + if (space->evacuating) + nofl_space_finish_evacuation(space); + nofl_space_reset_sweeper(space); + nofl_space_update_mark_patterns(space, 0); + nofl_space_reset_statistics(space); + if (GC_DEBUG) + nofl_space_verify_before_restart(space); +} + +static ssize_t +nofl_space_request_release_memory(struct nofl_space *space, size_t bytes) { + return atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes; +} + +static void +nofl_space_reacquire_memory(struct nofl_space *space, size_t bytes) { + ssize_t pending = + atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; + while (pending + NOFL_BLOCK_SIZE <= 0) { + uintptr_t block = nofl_pop_unavailable_block(space); + GC_ASSERT(block); + if (!nofl_push_evacuation_target_if_needed(space, block)) + nofl_push_empty_block(space, block); + pending = atomic_fetch_add(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE) + + NOFL_BLOCK_SIZE; + } +} + +static int +nofl_space_sweep_until_memory_released(struct nofl_space *space, + struct nofl_allocator *alloc) { + ssize_t pending = atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire); + // First try to unmap previously-identified empty blocks. If pending + // > 0 and other mutators happen to identify empty blocks, they will + // be unmapped directly and moved to the unavailable list. + while (pending > 0) { + uintptr_t block = nofl_pop_empty_block(space); + if (!block) + break; + // Note that we may have competing uses; if we're evacuating, + // perhaps we should push this block to the evacuation target list. + // That would enable us to reach a fragmentation low water-mark in + // fewer cycles. But maybe evacuation started in order to obtain + // free blocks for large objects; in that case we should just reap + // the fruits of our labor. Probably this second use-case is more + // important. + nofl_push_unavailable_block(space, block); + pending = atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); + pending -= NOFL_BLOCK_SIZE; + } + // Otherwise, sweep, transitioning any empty blocks to unavailable and + // throwing away any non-empty block. A bit wasteful but hastening + // the next collection is a reasonable thing to do here. 
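  // Scale check (an editor's note, not part of the patch): a 2 MB
  // large-object request adds 2 MB to pending_unavailable_bytes, so with
  // 64 kB blocks a total of 32 blocks must end up on the unavailable
  // list (via the loop above, this sweep, or other mutators) before
  // pending drops to zero and the caller can allocate.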
+ while (pending > 0) { + if (!nofl_allocator_next_hole(alloc, space)) + return 0; + pending = atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire); + } + return pending <= 0; } static inline int nofl_space_evacuate_or_mark_object(struct nofl_space *space, struct gc_edge edge, - struct gc_ref old_ref) { + struct gc_ref old_ref, + struct nofl_allocator *evacuate) { uint8_t *metadata = nofl_metadata_byte_for_object(old_ref); uint8_t byte = *metadata; if (byte & space->marked_mask) @@ -492,7 +1152,8 @@ nofl_space_evacuate_or_mark_object(struct nofl_space *space, case GC_FORWARDING_STATE_ACQUIRED: { // We claimed the object successfully; evacuating is up to us. size_t object_granules = nofl_space_live_object_granules(metadata); - struct gc_ref new_ref = nofl_evacuation_allocate(space, object_granules); + struct gc_ref new_ref = nofl_evacuation_allocate(evacuate, space, + object_granules); if (gc_ref_is_heap_object(new_ref)) { // Copy object contents before committing, as we don't know what // part of the object (if any) will be overwritten by the @@ -673,718 +1334,6 @@ nofl_space_object_size(struct nofl_space *space, struct gc_ref ref) { return granules * NOFL_GRANULE_SIZE; } -static void -nofl_push_unavailable_block(struct nofl_space *space, uintptr_t block) { - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)); - GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)); - nofl_block_summary_set_flag(summary, NOFL_BLOCK_UNAVAILABLE); - madvise((void*)block, NOFL_BLOCK_SIZE, MADV_DONTNEED); - nofl_push_block(&space->unavailable, block); -} - -static uintptr_t -nofl_pop_unavailable_block(struct nofl_space *space) { - uintptr_t block = nofl_pop_block(&space->unavailable); - if (!block) - return 0; - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - GC_ASSERT(nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)); - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_UNAVAILABLE); - return block; -} - -static uintptr_t -nofl_pop_empty_block(struct nofl_space *space) { - return nofl_pop_block(&space->empty); -} - -static int -nofl_maybe_push_evacuation_target(struct nofl_space *space, - uintptr_t block, double reserve) { - GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block), - NOFL_BLOCK_NEEDS_SWEEP)); - size_t targets = atomic_load_explicit(&space->evacuation_targets.count, - memory_order_acquire); - size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; - size_t unavailable = atomic_load_explicit(&space->unavailable.count, - memory_order_acquire); - if (targets >= (total - unavailable) * reserve) - return 0; - - nofl_push_block(&space->evacuation_targets, block); - return 1; -} - -static int -nofl_push_evacuation_target_if_needed(struct nofl_space *space, - uintptr_t block) { - return nofl_maybe_push_evacuation_target(space, block, - space->evacuation_minimum_reserve); -} - -static int -nofl_push_evacuation_target_if_possible(struct nofl_space *space, - uintptr_t block) { - return nofl_maybe_push_evacuation_target(space, block, - space->evacuation_reserve); -} - -static void -nofl_push_empty_block(struct nofl_space *space, uintptr_t block) { - GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block), - NOFL_BLOCK_NEEDS_SWEEP)); - nofl_push_block(&space->empty, block); -} - -static ssize_t -nofl_space_request_release_memory(struct nofl_space *space, size_t bytes) { - return 
atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes; -} - -static void -nofl_space_reacquire_memory(struct nofl_space *space, size_t bytes) { - ssize_t pending = - atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; - while (pending + NOFL_BLOCK_SIZE <= 0) { - uintptr_t block = nofl_pop_unavailable_block(space); - GC_ASSERT(block); - if (nofl_push_evacuation_target_if_needed(space, block)) - continue; - nofl_push_empty_block(space, block); - pending = atomic_fetch_add(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE) - + NOFL_BLOCK_SIZE; - } -} - -static size_t -nofl_allocator_next_hole(struct nofl_allocator *alloc, - struct nofl_space *space); - -static int -nofl_space_sweep_until_memory_released(struct nofl_space *space, - struct nofl_allocator *alloc) { - ssize_t pending = atomic_load_explicit(&space->pending_unavailable_bytes, - memory_order_acquire); - // First try to unmap previously-identified empty blocks. If pending - // > 0 and other mutators happen to identify empty blocks, they will - // be unmapped directly and moved to the unavailable list. - while (pending > 0) { - uintptr_t block = nofl_pop_empty_block(space); - if (!block) - break; - // Note that we may have competing uses; if we're evacuating, - // perhaps we should push this block to the evacuation target list. - // That would enable us to reach a fragmentation low water-mark in - // fewer cycles. But maybe evacuation started in order to obtain - // free blocks for large objects; in that case we should just reap - // the fruits of our labor. Probably this second use-case is more - // important. - nofl_push_unavailable_block(space, block); - pending = atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); - pending -= NOFL_BLOCK_SIZE; - } - // Otherwise, sweep, transitioning any empty blocks to unavailable and - // throwing away any non-empty block. A bit wasteful but hastening - // the next collection is a reasonable thing to do here. - while (pending > 0) { - if (!nofl_allocator_next_hole(alloc, space)) - return 0; - pending = atomic_load_explicit(&space->pending_unavailable_bytes, - memory_order_acquire); - } - return pending <= 0; -} - -static inline int -nofl_is_ephemeron(struct gc_ref ref) { - uint8_t meta = *nofl_metadata_byte_for_addr(gc_ref_value(ref)); - return meta & NOFL_METADATA_BYTE_EPHEMERON; -} - -static void -nofl_space_set_ephemeron_flag(struct gc_ref ref) { - if (gc_has_conservative_intraheap_edges()) { - uint8_t *metadata = nofl_metadata_byte_for_addr(gc_ref_value(ref)); - *metadata |= NOFL_METADATA_BYTE_EPHEMERON; - } -} - -static void nofl_finish_sweeping(struct nofl_allocator *alloc, - struct nofl_space *space); -static void nofl_finish_sweeping_in_block(struct nofl_allocator *alloc, - struct nofl_space *space); - -// Note that it's quite possible (and even likely) that any given remset -// byte doesn't hold any roots, if all stores were to nursery objects. 
-STATIC_ASSERT_EQ(NOFL_GRANULES_PER_REMSET_BYTE % 8, 0); -static void -nofl_space_trace_card(struct nofl_space *space, struct nofl_slab *slab, - size_t card, - void (*enqueue)(struct gc_ref, struct gc_heap*), - struct gc_heap *heap) { - uintptr_t first_addr_in_slab = (uintptr_t) &slab->blocks[0]; - size_t granule_base = card * NOFL_GRANULES_PER_REMSET_BYTE; - for (size_t granule_in_remset = 0; - granule_in_remset < NOFL_GRANULES_PER_REMSET_BYTE; - granule_in_remset += 8, granule_base += 8) { - uint64_t mark_bytes = load_eight_aligned_bytes(slab->metadata + granule_base); - mark_bytes &= space->sweep_mask; - while (mark_bytes) { - size_t granule_offset = count_zero_bytes(mark_bytes); - mark_bytes &= ~(((uint64_t)0xff) << (granule_offset * 8)); - size_t granule = granule_base + granule_offset; - uintptr_t addr = first_addr_in_slab + granule * NOFL_GRANULE_SIZE; - GC_ASSERT(nofl_metadata_byte_for_addr(addr) == &slab->metadata[granule]); - enqueue(gc_ref(addr), heap); - } - } -} - -static void -nofl_space_trace_remembered_set(struct nofl_space *space, - void (*enqueue)(struct gc_ref, - struct gc_heap*), - struct gc_heap *heap) { - GC_ASSERT(!space->evacuating); - for (size_t s = 0; s < space->nslabs; s++) { - struct nofl_slab *slab = &space->slabs[s]; - uint8_t *remset = slab->remembered_set; - for (size_t card_base = 0; - card_base < NOFL_REMSET_BYTES_PER_SLAB; - card_base += 8) { - uint64_t remset_bytes = load_eight_aligned_bytes(remset + card_base); - if (!remset_bytes) continue; - memset(remset + card_base, 0, 8); - while (remset_bytes) { - size_t card_offset = count_zero_bytes(remset_bytes); - remset_bytes &= ~(((uint64_t)0xff) << (card_offset * 8)); - nofl_space_trace_card(space, slab, card_base + card_offset, - enqueue, heap); - } - } - } -} - -static void -nofl_space_clear_remembered_set(struct nofl_space *space) { - if (!GC_GENERATIONAL) return; - for (size_t slab = 0; slab < space->nslabs; slab++) { - memset(space->slabs[slab].remembered_set, 0, NOFL_REMSET_BYTES_PER_SLAB); - } -} - -static void -nofl_space_reset_sweeper(struct nofl_space *space) { - space->next_block = (uintptr_t) &space->slabs[0].blocks; -} - -static void -nofl_space_update_mark_patterns(struct nofl_space *space, - int advance_mark_mask) { - uint8_t survivor_mask = space->marked_mask; - uint8_t next_marked_mask = nofl_rotate_dead_survivor_marked(survivor_mask); - if (advance_mark_mask) - space->marked_mask = next_marked_mask; - space->live_mask = survivor_mask | next_marked_mask; - space->sweep_mask = broadcast_byte(space->live_mask); -} - -static void -nofl_space_reset_statistics(struct nofl_space *space) { - space->granules_freed_by_last_collection = 0; - space->fragmentation_granules_since_last_collection = 0; -} - -static size_t -nofl_space_yield(struct nofl_space *space) { - return space->granules_freed_by_last_collection * NOFL_GRANULE_SIZE; -} - -static size_t -nofl_space_evacuation_reserve(struct nofl_space *space) { - return atomic_load_explicit(&space->evacuation_targets.count, - memory_order_acquire) * NOFL_BLOCK_SIZE; -} - -static size_t -nofl_space_fragmentation(struct nofl_space *space) { - size_t granules = space->fragmentation_granules_since_last_collection; - return granules * NOFL_GRANULE_SIZE; -} - -static void -nofl_space_release_evacuation_target_blocks(struct nofl_space *space) { - // Move excess evacuation target blocks back to empties. 
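// A minimal sketch of the reserve sizing done just below, assuming only
// that block counts fit in size_t; the name is a stand-in for illustration.
#include <stddef.h>

// The evacuation reserve is a fraction of the blocks that are actually
// paged in: total non-metadata blocks minus the unavailable ones.
static inline size_t
evacuation_reserve_blocks_sketch(size_t total_blocks,
                                 size_t unavailable_blocks,
                                 double reserve_fraction) {
  return (size_t)((total_blocks - unavailable_blocks) * reserve_fraction);
}

// For example, with a hypothetical 4096 non-metadata blocks of which 1024
// are unavailable, the 2% minimum reserve works out to
// (4096 - 1024) * 0.02 = 61 blocks (truncated).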
- size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; - size_t unavailable = atomic_load_explicit(&space->unavailable.count, - memory_order_acquire); - size_t reserve = space->evacuation_minimum_reserve * (total - unavailable); - nofl_finish_evacuation_allocator(&space->evacuation_allocator, - &space->evacuation_targets, - &space->empty, - reserve); -} - -static void -nofl_space_prepare_for_evacuation(struct nofl_space *space, - enum gc_collection_kind gc_kind) { - if (gc_kind != GC_COLLECTION_COMPACTING) { - space->evacuating = 0; - space->evacuation_reserve = space->evacuation_minimum_reserve; - return; - } - - // Put the mutator into evacuation mode, collecting up to 50% of free space as - // evacuation blocks. - space->evacuation_reserve = 0.5; - - size_t target_blocks = space->evacuation_targets.count; - DEBUG("evacuation target block count: %zu\n", target_blocks); - - if (target_blocks == 0) { - DEBUG("no evacuation target blocks, disabling evacuation for this round\n"); - space->evacuating = 0; - return; - } - - size_t target_granules = target_blocks * NOFL_GRANULES_PER_BLOCK; - // Compute histogram where domain is the number of granules in a block - // that survived the last collection, aggregated into 33 buckets, and - // range is number of blocks in that bucket. (Bucket 0 is for blocks - // that were found to be completely empty; such blocks may be on the - // evacuation target list.) - const size_t bucket_count = 33; - size_t histogram[33] = {0,}; - size_t bucket_size = NOFL_GRANULES_PER_BLOCK / 32; - size_t empties = 0; - for (size_t slab = 0; slab < space->nslabs; slab++) { - for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { - struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) - continue; - if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) { - empties++; - continue; - } - size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; - size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; - histogram[bucket]++; - } - } - - // Blocks which lack the NEEDS_SWEEP flag are empty, either because - // they have been removed from the pool and have the UNAVAILABLE flag - // set, or because they are on the empties or evacuation target - // lists. When evacuation starts, the empties list should be empty. - GC_ASSERT(empties == target_blocks); - - // Now select a number of blocks that is likely to fill the space in - // the target blocks. Prefer candidate blocks with fewer survivors - // from the last GC, to increase expected free block yield. - for (size_t bucket = 0; bucket < bucket_count; bucket++) { - size_t bucket_granules = bucket * bucket_size * histogram[bucket]; - if (bucket_granules <= target_granules) { - target_granules -= bucket_granules; - } else { - histogram[bucket] = target_granules / (bucket_size * bucket); - target_granules = 0; - } - } - - // Having selected the number of blocks, now we set the evacuation - // candidate flag on all blocks. 
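// A minimal sketch of the candidate-selection loop just above, assuming the
// same histogram layout (bucket 0 holds empty blocks); the name is a
// stand-in for illustration.
#include <stddef.h>

static inline void
select_evacuation_candidates_sketch(size_t *histogram, size_t bucket_count,
                                    size_t bucket_size,
                                    size_t target_granules) {
  // Walk buckets from fewest survivors to most, taking whole buckets while
  // the target capacity lasts, then truncating the first bucket that does
  // not fit; later (fuller) buckets end up with zero selected blocks.
  for (size_t bucket = 0; bucket < bucket_count; bucket++) {
    size_t bucket_granules = bucket * bucket_size * histogram[bucket];
    if (bucket_granules <= target_granules)
      target_granules -= bucket_granules;
    else {
      histogram[bucket] = target_granules / (bucket_size * bucket);
      target_granules = 0;
    }
  }
}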
- for (size_t slab = 0; slab < space->nslabs; slab++) { - for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { - struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) - continue; - if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) - continue; - size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; - size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; - if (histogram[bucket]) { - nofl_block_summary_set_flag(summary, NOFL_BLOCK_EVACUATE); - histogram[bucket]--; - } else { - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_EVACUATE); - } - } - } - - // We are ready to evacuate! - nofl_prepare_evacuation_allocator(&space->evacuation_allocator, - &space->evacuation_targets); - space->evacuating = 1; -} - -static void -nofl_space_verify_before_restart(struct nofl_space *space) { - // Iterate objects in each block, verifying that the END bytes correspond to - // the measured object size. - for (size_t slab = 0; slab < space->nslabs; slab++) { - for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { - struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) - continue; - - uintptr_t addr = (uintptr_t)space->slabs[slab].blocks[block].data; - uintptr_t limit = addr + NOFL_BLOCK_SIZE; - uint8_t *meta = nofl_metadata_byte_for_addr(addr); - while (addr < limit) { - if (meta[0] & space->live_mask) { - struct gc_ref obj = gc_ref(addr); - size_t obj_bytes; - gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); - size_t granules = nofl_size_to_granules(obj_bytes); - GC_ASSERT(granules); - for (size_t granule = 0; granule < granules - 1; granule++) - GC_ASSERT(!(meta[granule] & NOFL_METADATA_BYTE_END)); - GC_ASSERT(meta[granules - 1] & NOFL_METADATA_BYTE_END); - meta += granules; - addr += granules * NOFL_GRANULE_SIZE; - } else { - meta++; - addr += NOFL_GRANULE_SIZE; - } - } - GC_ASSERT(addr == limit); - } - } -} - -static void -nofl_space_finish_gc(struct nofl_space *space, - enum gc_collection_kind gc_kind) { - space->evacuating = 0; - space->last_collection_was_minor = (gc_kind == GC_COLLECTION_MINOR); - nofl_space_reset_sweeper(space); - nofl_space_update_mark_patterns(space, 0); - nofl_space_reset_statistics(space); - nofl_space_release_evacuation_target_blocks(space); - if (GC_DEBUG) - nofl_space_verify_before_restart(space); -} - -static int -nofl_sweep_byte(uint8_t *loc, uintptr_t sweep_mask) { - uint8_t metadata = atomic_load_explicit(loc, memory_order_relaxed); - // If the metadata byte is nonzero, that means either a young, dead, - // survived, or marked object. If it's live (survived or marked), we - // found the next mark. Otherwise it's dead and we clear the byte. - // If we see an END, that means an end of a dead object; clear it. 
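// A minimal sketch of the mask used by these byte- and word-granularity
// sweep tests, assuming the usual multiply-by-0x0101... broadcast; the
// "_sketch" name is a stand-in for the real helper.
#include <stdint.h>

// Replicate the one-byte live mask into every byte lane so that eight
// granules' metadata can be tested against it with a single AND.
static inline uint64_t broadcast_byte_sketch(uint8_t b) {
  return (uint64_t)b * 0x0101010101010101ULL;
}

// A nonzero (word & sweep_mask) therefore means at least one of the eight
// granules holds a live (survivor or newly marked) object, and the sweep
// stops at that point.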
- if (metadata) { - if (metadata & sweep_mask) - return 1; - atomic_store_explicit(loc, 0, memory_order_relaxed); - } - return 0; -} - -static int -nofl_sweep_word(uintptr_t *loc, uintptr_t sweep_mask) { - uintptr_t metadata = atomic_load_explicit(loc, memory_order_relaxed); - if (metadata) { - if (metadata & sweep_mask) - return 1; - atomic_store_explicit(loc, 0, memory_order_relaxed); - } - return 0; -} - -static uintptr_t -nofl_space_next_block_to_sweep(struct nofl_space *space) { - uintptr_t block = atomic_load_explicit(&space->next_block, - memory_order_acquire); - uintptr_t next_block; - do { - if (block == 0) - return 0; - - next_block = block + NOFL_BLOCK_SIZE; - if (next_block % NOFL_SLAB_SIZE == 0) { - uintptr_t hi_addr = space->low_addr + space->extent; - if (next_block == hi_addr) - next_block = 0; - else - next_block += NOFL_META_BLOCKS_PER_SLAB * NOFL_BLOCK_SIZE; - } - } while (!atomic_compare_exchange_weak(&space->next_block, &block, - next_block)); - return block; -} - -static void -nofl_allocator_release_block(struct nofl_allocator *alloc) { - alloc->alloc = alloc->sweep = alloc->block = 0; -} - -static void -nofl_allocator_finish_block(struct nofl_allocator *alloc, - struct nofl_space *space) { - GC_ASSERT(alloc->block); - struct nofl_block_summary *block = nofl_block_summary_for_addr(alloc->block); - atomic_fetch_add(&space->granules_freed_by_last_collection, - block->free_granules); - atomic_fetch_add(&space->fragmentation_granules_since_last_collection, - block->fragmentation_granules); - - // If this block has mostly survivors, we should avoid sweeping it and - // trying to allocate into it for a minor GC. Sweep it next time to - // clear any garbage allocated in this cycle and mark it as - // "venerable" (i.e., old). - GC_ASSERT(!nofl_block_summary_has_flag(block, NOFL_BLOCK_VENERABLE)); - if (!nofl_block_summary_has_flag(block, NOFL_BLOCK_VENERABLE_AFTER_SWEEP) && - block->free_granules < NOFL_GRANULES_PER_BLOCK * space->venerable_threshold) - nofl_block_summary_set_flag(block, NOFL_BLOCK_VENERABLE_AFTER_SWEEP); - - nofl_allocator_release_block(alloc); -} - -// Sweep some heap to reclaim free space, resetting alloc->alloc and -// alloc->sweep. Return the size of the hole in granules. -static size_t -nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, - struct nofl_space *space) { - uintptr_t sweep = alloc->sweep; - if (sweep == 0) - return 0; - uintptr_t limit = alloc->block + NOFL_BLOCK_SIZE; - uintptr_t sweep_mask = space->sweep_mask; - - while (sweep != limit) { - GC_ASSERT((sweep & (NOFL_GRANULE_SIZE - 1)) == 0); - uint8_t* metadata = nofl_metadata_byte_for_addr(sweep); - size_t limit_granules = (limit - sweep) >> NOFL_GRANULE_SIZE_LOG_2; - - // Except for when we first get a block, alloc->sweep is positioned - // right after a hole, which can point to either the end of the - // block or to a live object. Assume that a live object is more - // common. - { - size_t live_granules = 0; - while (limit_granules && (metadata[0] & sweep_mask)) { - // Object survived collection; skip over it and continue sweeping. 
- size_t object_granules = nofl_space_live_object_granules(metadata); - live_granules += object_granules; - limit_granules -= object_granules; - metadata += object_granules; - } - if (!limit_granules) - break; - sweep += live_granules * NOFL_GRANULE_SIZE; - } - - size_t free_granules = scan_for_byte(metadata, limit_granules, sweep_mask); - GC_ASSERT(free_granules); - GC_ASSERT(free_granules <= limit_granules); - - struct nofl_block_summary *summary = nofl_block_summary_for_addr(sweep); - summary->hole_count++; - GC_ASSERT(free_granules <= NOFL_GRANULES_PER_BLOCK - summary->free_granules); - summary->free_granules += free_granules; - - size_t free_bytes = free_granules * NOFL_GRANULE_SIZE; - alloc->alloc = sweep; - alloc->sweep = sweep + free_bytes; - return free_granules; - } - - nofl_allocator_finish_block(alloc, space); - return 0; -} - -static void -nofl_allocator_finish_hole(struct nofl_allocator *alloc) { - size_t granules = (alloc->sweep - alloc->alloc) / NOFL_GRANULE_SIZE; - if (granules) { - struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); - summary->holes_with_fragmentation++; - summary->fragmentation_granules += granules; - uint8_t *metadata = nofl_metadata_byte_for_addr(alloc->alloc); - memset(metadata, 0, granules); - alloc->alloc = alloc->sweep; - } - // FIXME: add to fragmentation -} - -static int -nofl_maybe_release_swept_empty_block(struct nofl_allocator *alloc, - struct nofl_space *space) { - GC_ASSERT(alloc->block); - uintptr_t block = alloc->block; - if (atomic_load_explicit(&space->pending_unavailable_bytes, - memory_order_acquire) <= 0) - return 0; - - nofl_push_unavailable_block(space, block); - atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); - nofl_allocator_release_block(alloc); - return 1; -} - -static size_t -nofl_allocator_next_hole(struct nofl_allocator *alloc, - struct nofl_space *space) { - nofl_allocator_finish_hole(alloc); - // As we sweep if we find that a block is empty, we return it to the - // empties list. Empties are precious. But if we return 10 blocks in - // a row, and still find an 11th empty, go ahead and use it. - size_t empties_countdown = 10; - while (1) { - // Sweep current block for a hole. - size_t granules = nofl_allocator_next_hole_in_block(alloc, space); - if (granules) { - // If the hole spans only part of a block, let the allocator try - // to use it. - if (granules < NOFL_GRANULES_PER_BLOCK) - return granules; - struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); - memset(nofl_metadata_byte_for_addr(alloc->block), 0, NOFL_GRANULES_PER_BLOCK); - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); - // Sweeping found a completely empty block. If we are below the - // minimum evacuation reserve, take the block. - if (nofl_push_evacuation_target_if_needed(space, alloc->block)) { - nofl_allocator_release_block(alloc); - continue; - } - // If we have pending pages to release to the OS, we should unmap - // this block. - if (nofl_maybe_release_swept_empty_block(alloc, space)) - continue; - // Otherwise if we've already returned lots of empty blocks to the - // freelist, let the allocator keep this block. - if (!empties_countdown) { - // After this block is allocated into, it will need to be swept. - nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); - return granules; - } - // Otherwise we push to the empty blocks list. 
- nofl_push_empty_block(space, alloc->block); - nofl_allocator_release_block(alloc); - empties_countdown--; - } - GC_ASSERT(alloc->block == 0); - while (1) { - uintptr_t block = nofl_space_next_block_to_sweep(space); - if (block) { - // Sweeping found a block. We might take it for allocation, or - // we might send it back. - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - // If it's marked unavailable, it's already on a list of - // unavailable blocks, so skip and get the next block. - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) - continue; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE)) { - // Skip venerable blocks after a minor GC -- we don't need to - // sweep as they weren't allocated into last cycle, and the - // mark bytes didn't rotate, so we have no cleanup to do; and - // we shouldn't try to allocate into them as it's not worth - // it. Any wasted space is measured as fragmentation. - if (space->last_collection_was_minor) - continue; - else - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE); - } - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) { - // Prepare to sweep the block for holes. - alloc->alloc = alloc->sweep = alloc->block = block; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP)) { - // In the last cycle we noted that this block consists of - // mostly old data. Sweep any garbage, commit the mark as - // venerable, and avoid allocating into it. - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP); - if (space->last_collection_was_minor) { - nofl_finish_sweeping_in_block(alloc, space); - nofl_block_summary_set_flag(summary, NOFL_BLOCK_VENERABLE); - continue; - } - } - // This block was marked in the last GC and needs sweeping. - // As we sweep we'll want to record how many bytes were live - // at the last collection. As we allocate we'll record how - // many granules were wasted because of fragmentation. - summary->hole_count = 0; - summary->free_granules = 0; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; - break; - } else { - // Otherwise this block is completely empty and is on the - // empties list. We take from the empties list only after all - // the NEEDS_SWEEP blocks are processed. - continue; - } - } else { - // We are done sweeping for blocks. Now take from the empties - // list. - block = nofl_pop_empty_block(space); - // No empty block? Return 0 to cause collection. - if (!block) - return 0; - - // Maybe we should use this empty as a target for evacuation. - if (nofl_push_evacuation_target_if_possible(space, block)) - continue; - - // Otherwise give the block to the allocator. - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); - summary->hole_count = 1; - summary->free_granules = NOFL_GRANULES_PER_BLOCK; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; - alloc->block = block; - alloc->alloc = block; - alloc->sweep = block + NOFL_BLOCK_SIZE; - return NOFL_GRANULES_PER_BLOCK; - } - } - } -} - -static void -nofl_finish_sweeping_in_block(struct nofl_allocator *alloc, - struct nofl_space *space) { - do { - nofl_allocator_finish_hole(alloc); - } while (nofl_allocator_next_hole_in_block(alloc, space)); -} - -// Another thread is triggering GC. Before we stop, finish clearing the -// dead mark bytes for the mutator's block, and release the block. 
-static void -nofl_finish_sweeping(struct nofl_allocator *alloc, - struct nofl_space *space) { - while (nofl_allocator_next_hole(alloc, space)) {} -} - -static struct gc_ref -nofl_allocate(struct nofl_allocator *alloc, struct nofl_space *space, - size_t size, void (*gc)(void*), void *gc_data) { - GC_ASSERT(size > 0); - GC_ASSERT(size <= gc_allocator_large_threshold()); - size = align_up(size, NOFL_GRANULE_SIZE); - - if (alloc->alloc + size > alloc->sweep) { - size_t granules = size >> NOFL_GRANULE_SIZE_LOG_2; - while (1) { - size_t hole = nofl_allocator_next_hole(alloc, space); - if (hole >= granules) { - nofl_clear_memory(alloc->alloc, hole * NOFL_GRANULE_SIZE); - break; - } - if (!hole) - gc(gc_data); - } - } - - struct gc_ref ret = gc_ref(alloc->alloc); - alloc->alloc += size; - gc_update_alloc_table(ret, size); - return ret; -} - static struct nofl_slab* nofl_allocate_slabs(size_t nslabs) { size_t size = nslabs * NOFL_SLAB_SIZE; diff --git a/src/whippet.c b/src/whippet.c index 76f8f1ed5..6e942d7da 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -91,6 +91,10 @@ struct gc_mutator { struct gc_mutator *next; }; +struct gc_trace_worker_data { + struct nofl_allocator allocator; +}; + static inline struct nofl_space* heap_nofl_space(struct gc_heap *heap) { return &heap->nofl_space; @@ -108,16 +112,34 @@ mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } +static void +gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, + struct gc_heap *heap, + struct gc_trace_worker *worker, + struct gc_trace_worker_data *data), + struct gc_tracer *tracer, + struct gc_heap *heap, + struct gc_trace_worker *worker) { + struct gc_trace_worker_data data; + nofl_allocator_reset(&data.allocator); + f(tracer, heap, worker, &data); + nofl_allocator_finish(&data.allocator, heap_nofl_space(heap)); +} + static void collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) GC_NEVER_INLINE; static inline int -do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref) { +do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, + struct gc_trace_worker *worker) { if (!gc_ref_is_heap_object(ref)) return 0; - if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) - return nofl_space_evacuate_or_mark_object(heap_nofl_space(heap), edge, ref); - else if (large_object_space_contains(heap_large_object_space(heap), ref)) + if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) { + struct nofl_allocator *alloc = + worker ? 
&gc_trace_worker_data(worker)->allocator : NULL; + return nofl_space_evacuate_or_mark_object(heap_nofl_space(heap), edge, ref, + alloc); + } else if (large_object_space_contains(heap_large_object_space(heap), ref)) return large_object_space_mark_object(heap_large_object_space(heap), ref); else @@ -125,12 +147,13 @@ do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref) { } static inline int trace_edge(struct gc_heap *heap, - struct gc_edge edge) GC_ALWAYS_INLINE; + struct gc_edge edge, + struct gc_trace_worker *worker) GC_ALWAYS_INLINE; static inline int -trace_edge(struct gc_heap *heap, struct gc_edge edge) { +trace_edge(struct gc_heap *heap, struct gc_edge edge, struct gc_trace_worker *worker) { struct gc_ref ref = gc_edge_ref(edge); - int is_new = do_trace(heap, edge, ref); + int is_new = do_trace(heap, edge, ref, worker); if (is_new && GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, @@ -340,23 +363,12 @@ gc_heap_set_extern_space(struct gc_heap *heap, struct gc_extern_space *space) { heap->extern_space = space; } -static void -gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, - struct gc_heap *heap, - struct gc_trace_worker *worker, - struct gc_trace_worker_data *data), - struct gc_tracer *tracer, - struct gc_heap *heap, - struct gc_trace_worker *worker) { - f(tracer, heap, worker, NULL); -} - static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) GC_ALWAYS_INLINE; static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { struct gc_trace_worker *worker = trace_data; - if (trace_edge(heap, edge)) + if (trace_edge(heap, edge, worker)) gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); } @@ -364,7 +376,7 @@ static void trace_and_enqueue_locally(struct gc_edge edge, struct gc_heap *heap, void *data) { struct gc_mutator *mut = data; - if (trace_edge(heap, edge)) + if (trace_edge(heap, edge, NULL)) mutator_mark_buf_push(&mut->mark_buf, gc_edge_ref(edge)); } @@ -396,7 +408,7 @@ trace_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, static void trace_and_enqueue_globally(struct gc_edge edge, struct gc_heap *heap, void *unused) { - if (trace_edge(heap, edge)) + if (trace_edge(heap, edge, NULL)) gc_tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge)); } @@ -643,7 +655,7 @@ trace_mutator_roots_after_stop(struct gc_heap *heap) { atomic_store(&heap->mutator_trace_list, NULL); for (struct gc_mutator *mut = heap->inactive_mutators; mut; mut = mut->next) { - nofl_finish_sweeping_in_block(&mut->allocator, heap_nofl_space(heap)); + nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); trace_mutator_roots_with_lock(mut); } } @@ -719,7 +731,7 @@ pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); MUTATOR_EVENT(mut, mutator_stopping); - nofl_finish_sweeping_in_block(&mut->allocator, heap_nofl_space(heap)); + nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); gc_stack_capture_hot(&mut->stack); if (mutator_should_mark_while_stopping(mut)) // No need to collect results in mark buf; we can enqueue roots directly. 
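// A minimal sketch of the call-with-data pattern introduced in the hunks
// above: the wrapper owns a stack-allocated per-worker datum holding an
// allocator, so evacuation during parallel tracing never contends on a
// shared bump pointer. Types and names here are stand-ins, not the real
// whippet API.
#include <stddef.h>

struct trace_worker_data_sketch {
  size_t allocation_cursor;   // stands in for a struct nofl_allocator
};

static void
call_with_worker_data_sketch(void (*f)(void *ctx,
                                       struct trace_worker_data_sketch *data),
                             void *ctx) {
  struct trace_worker_data_sketch data = { 0 };  // reset, as with nofl_allocator_reset
  f(ctx, &data);             // trace roots/edges; evacuations allocate via data
  // On return, any partially filled block would be handed back to the space,
  // as nofl_allocator_finish does above.
}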
@@ -760,7 +772,7 @@ static double heap_last_gc_yield(struct gc_heap *heap) { struct nofl_space *nofl_space = heap_nofl_space(heap); size_t nofl_yield = nofl_space_yield(nofl_space); - size_t evacuation_reserve = nofl_space_evacuation_reserve(nofl_space); + size_t evacuation_reserve = nofl_space_evacuation_reserve_bytes(nofl_space); // FIXME: Size nofl evacuation reserve based on size of nofl space, // not heap size. size_t minimum_evacuation_reserve = @@ -1030,7 +1042,8 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { DEBUG("last gc yield: %f; fragmentation: %f\n", yield, fragmentation); detect_out_of_memory(heap); trace_pinned_roots_after_stop(heap); - nofl_space_prepare_for_evacuation(nofl_space, gc_kind); + if (gc_kind == GC_COLLECTION_COMPACTING) + nofl_space_prepare_evacuation(nofl_space); trace_roots_after_stop(heap); HEAP_EVENT(heap, roots_traced); gc_tracer_trace(&heap->tracer); From d137e1397c349493804024c56bef308178e74f55 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 22 Aug 2024 18:04:21 +0200 Subject: [PATCH 266/403] Instead of partitioning blocks by flag, put them in separate lists This way you can directly iterate blocks of a certain kind. Also verify these lists more thoroughly, and allow full blocks that are the results of evacuation to skip being swept the next round. Also! Have next_hole_in_block / next_hole_in_block ensure that the object data and the mark bytes are clear. --- src/nofl-space.h | 558 ++++++++++++++++++++++++++--------------------- src/whippet.c | 5 +- 2 files changed, 310 insertions(+), 253 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index eba3cd386..134fbccd8 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -70,14 +70,14 @@ STATIC_ASSERT_EQ(sizeof(struct nofl_slab_header), NOFL_HEADER_BYTES_PER_SLAB); // non-atomically by the mutator when it owns a block; otherwise they // need to be accessed atomically. 
enum nofl_block_summary_flag { - NOFL_BLOCK_OUT_FOR_THREAD = 0x1, - NOFL_BLOCK_HAS_PIN = 0x2, - NOFL_BLOCK_PAGED_OUT = 0x4, - NOFL_BLOCK_NEEDS_SWEEP = 0x8, - NOFL_BLOCK_UNAVAILABLE = 0x10, - NOFL_BLOCK_EVACUATE = 0x20, - NOFL_BLOCK_VENERABLE = 0x40, - NOFL_BLOCK_VENERABLE_AFTER_SWEEP = 0x80, + NOFL_BLOCK_EVACUATE = 0x1, + NOFL_BLOCK_ZERO = 0x2, + NOFL_BLOCK_UNAVAILABLE = 0x4, + NOFL_BLOCK_FLAG_UNUSED_3 = 0x8, + NOFL_BLOCK_FLAG_UNUSED_4 = 0x10, + NOFL_BLOCK_FLAG_UNUSED_5 = 0x20, + NOFL_BLOCK_FLAG_UNUSED_6 = 0x40, + NOFL_BLOCK_FLAG_UNUSED_7 = 0x80, NOFL_BLOCK_FLAG_UNUSED_8 = 0x100, NOFL_BLOCK_FLAG_UNUSED_9 = 0x200, NOFL_BLOCK_FLAG_UNUSED_10 = 0x400, @@ -141,14 +141,17 @@ struct nofl_space { size_t extent; size_t heap_size; uint8_t last_collection_was_minor; - uintptr_t next_block; // atomically struct nofl_block_list empty; struct nofl_block_list unavailable; + struct nofl_block_list to_sweep; struct nofl_block_list partly_full; + struct nofl_block_list full; + struct nofl_block_list promoted; + struct nofl_block_list old; struct nofl_block_list evacuation_targets; double evacuation_minimum_reserve; double evacuation_reserve; - double venerable_threshold; + double promotion_threshold; ssize_t pending_unavailable_bytes; // atomically struct nofl_slab *slabs; size_t nslabs; @@ -277,6 +280,7 @@ static void nofl_push_block(struct nofl_block_list *list, uintptr_t block) { atomic_fetch_add_explicit(&list->count, 1, memory_order_acq_rel); struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + GC_ASSERT_EQ(nofl_block_summary_next(summary), 0); uintptr_t next = atomic_load_explicit(&list->blocks, memory_order_acquire); do { nofl_block_summary_set_next(summary, next); @@ -306,10 +310,8 @@ nofl_block_count(struct nofl_block_list *list) { static void nofl_push_unavailable_block(struct nofl_space *space, uintptr_t block) { - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)); - GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)); - nofl_block_summary_set_flag(summary, NOFL_BLOCK_UNAVAILABLE); + nofl_block_summary_set_flag(nofl_block_summary_for_addr(block), + NOFL_BLOCK_ZERO | NOFL_BLOCK_UNAVAILABLE); madvise((void*)block, NOFL_BLOCK_SIZE, MADV_DONTNEED); nofl_push_block(&space->unavailable, block); } @@ -317,14 +319,17 @@ nofl_push_unavailable_block(struct nofl_space *space, uintptr_t block) { static uintptr_t nofl_pop_unavailable_block(struct nofl_space *space) { uintptr_t block = nofl_pop_block(&space->unavailable); - if (!block) - return 0; - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - GC_ASSERT(nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)); - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_UNAVAILABLE); + if (block) + nofl_block_summary_clear_flag(nofl_block_summary_for_addr(block), + NOFL_BLOCK_UNAVAILABLE); return block; } +static void +nofl_push_empty_block(struct nofl_space *space, uintptr_t block) { + nofl_push_block(&space->empty, block); +} + static uintptr_t nofl_pop_empty_block(struct nofl_space *space) { return nofl_pop_block(&space->empty); @@ -333,8 +338,6 @@ nofl_pop_empty_block(struct nofl_space *space) { static int nofl_maybe_push_evacuation_target(struct nofl_space *space, uintptr_t block, double reserve) { - GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block), - NOFL_BLOCK_NEEDS_SWEEP)); size_t targets = nofl_block_count(&space->evacuation_targets); size_t total = space->nslabs 
* NOFL_NONMETA_BLOCKS_PER_SLAB; size_t unavailable = nofl_block_count(&space->unavailable); @@ -359,13 +362,6 @@ nofl_push_evacuation_target_if_possible(struct nofl_space *space, space->evacuation_reserve); } -static void -nofl_push_empty_block(struct nofl_space *space, uintptr_t block) { - GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block), - NOFL_BLOCK_NEEDS_SWEEP)); - nofl_push_block(&space->empty, block); -} - static inline void nofl_clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); @@ -376,17 +372,6 @@ nofl_space_live_object_granules(uint8_t *metadata) { return scan_for_byte(metadata, -1, broadcast_byte(NOFL_METADATA_BYTE_END)) + 1; } -static void -nofl_clear_remaining_metadata_bytes_in_block(uintptr_t block, - uintptr_t allocated) { - GC_ASSERT((allocated & (NOFL_GRANULE_SIZE - 1)) == 0); - uintptr_t base = block + allocated; - uintptr_t limit = block + NOFL_BLOCK_SIZE; - uintptr_t granules = (limit - base) >> NOFL_GRANULE_SIZE_LOG_2; - GC_ASSERT(granules <= NOFL_GRANULES_PER_BLOCK); - memset(nofl_metadata_byte_for_addr(base), 0, granules); -} - static void nofl_allocator_reset(struct nofl_allocator *alloc) { alloc->alloc = alloc->sweep = alloc->block = 0; @@ -394,12 +379,10 @@ nofl_allocator_reset(struct nofl_allocator *alloc) { static void nofl_allocator_release_full_block(struct nofl_allocator *alloc, - struct nofl_space *space, - struct nofl_block_summary *summary) { + struct nofl_space *space) { GC_ASSERT(alloc->block); GC_ASSERT(alloc->alloc == alloc->sweep); - GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE)); - + struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); atomic_fetch_add(&space->granules_freed_by_last_collection, summary->free_granules); atomic_fetch_add(&space->fragmentation_granules_since_last_collection, @@ -409,24 +392,51 @@ nofl_allocator_release_full_block(struct nofl_allocator *alloc, // trying to allocate into it for a minor GC. Sweep it next time to // clear any garbage allocated in this cycle and mark it as // "venerable" (i.e., old). - if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP) && - summary->free_granules < NOFL_GRANULES_PER_BLOCK * space->venerable_threshold) - nofl_block_summary_set_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP); + if (GC_GENERATIONAL && + summary->free_granules < NOFL_GRANULES_PER_BLOCK * space->promotion_threshold) + nofl_push_block(&space->promoted, alloc->block); + else + nofl_push_block(&space->full, alloc->block); nofl_allocator_reset(alloc); } +static void +nofl_allocator_release_full_evacuation_target(struct nofl_allocator *alloc, + struct nofl_space *space) { + GC_ASSERT(alloc->alloc > alloc->block); + GC_ASSERT(alloc->sweep == alloc->block + NOFL_BLOCK_SIZE); + size_t hole_size = alloc->sweep - alloc->alloc; + struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); + // FIXME: Check how this affects statistics. 
+ GC_ASSERT_EQ(summary->hole_count, 1); + GC_ASSERT_EQ(summary->free_granules, NOFL_GRANULES_PER_BLOCK); + atomic_fetch_add(&space->granules_freed_by_last_collection, + NOFL_GRANULES_PER_BLOCK); + if (hole_size) { + hole_size >>= NOFL_GRANULE_SIZE_LOG_2; + summary->holes_with_fragmentation = 1; + summary->fragmentation_granules = hole_size >> NOFL_GRANULE_SIZE_LOG_2; + atomic_fetch_add(&space->fragmentation_granules_since_last_collection, + summary->fragmentation_granules); + } else { + GC_ASSERT_EQ(summary->fragmentation_granules, 0); + GC_ASSERT_EQ(summary->holes_with_fragmentation, 0); + } + nofl_push_block(&space->old, alloc->block); + nofl_allocator_reset(alloc); +} + static void nofl_allocator_release_partly_full_block(struct nofl_allocator *alloc, - struct nofl_space *space, - struct nofl_block_summary *summary) { + struct nofl_space *space) { // A block can go on the partly full list if it has exactly one // hole, located at the end of the block. GC_ASSERT(alloc->alloc > alloc->block); GC_ASSERT(alloc->sweep == alloc->block + NOFL_BLOCK_SIZE); - GC_ASSERT(nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)); size_t hole_size = alloc->sweep - alloc->alloc; GC_ASSERT(hole_size); + struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); summary->fragmentation_granules = hole_size >> NOFL_GRANULE_SIZE_LOG_2; nofl_push_block(&space->partly_full, alloc->block); nofl_allocator_reset(alloc); @@ -457,13 +467,24 @@ nofl_allocator_acquire_empty_block(struct nofl_allocator *alloc, summary->free_granules = NOFL_GRANULES_PER_BLOCK; summary->holes_with_fragmentation = 0; summary->fragmentation_granules = 0; - nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); alloc->block = alloc->alloc = block; alloc->sweep = block + NOFL_BLOCK_SIZE; - nofl_clear_memory(block, NOFL_BLOCK_SIZE); + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_ZERO)) + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_ZERO); + else + nofl_clear_memory(block, NOFL_BLOCK_SIZE); return NOFL_GRANULES_PER_BLOCK; } +static size_t +nofl_allocator_acquire_evacuation_target(struct nofl_allocator* alloc, + struct nofl_space *space) { + size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space); + if (granules) + return granules; + return nofl_allocator_acquire_empty_block(alloc, space); +} + static void nofl_allocator_finish_hole(struct nofl_allocator *alloc) { size_t granules = (alloc->sweep - alloc->alloc) / NOFL_GRANULE_SIZE; @@ -471,8 +492,6 @@ nofl_allocator_finish_hole(struct nofl_allocator *alloc) { struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); summary->holes_with_fragmentation++; summary->fragmentation_granules += granules; - uint8_t *metadata = nofl_metadata_byte_for_addr(alloc->alloc); - memset(metadata, 0, granules); alloc->alloc = alloc->sweep; } } @@ -513,15 +532,18 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, } size_t free_granules = scan_for_byte(metadata, limit_granules, sweep_mask); + size_t free_bytes = free_granules * NOFL_GRANULE_SIZE; GC_ASSERT(free_granules); GC_ASSERT(free_granules <= limit_granules); + memset(metadata, 0, free_granules); + memset((char*)sweep, 0, free_bytes); + struct nofl_block_summary *summary = nofl_block_summary_for_addr(sweep); summary->hole_count++; GC_ASSERT(free_granules <= NOFL_GRANULES_PER_BLOCK - summary->free_granules); summary->free_granules += free_granules; - size_t free_bytes = free_granules * NOFL_GRANULE_SIZE; alloc->alloc = sweep; alloc->sweep = sweep + 
free_bytes; return free_granules; @@ -539,14 +561,15 @@ static void nofl_allocator_release_block(struct nofl_allocator *alloc, struct nofl_space *space) { GC_ASSERT(alloc->block); - struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); if (alloc->alloc < alloc->sweep && alloc->sweep == alloc->block + NOFL_BLOCK_SIZE && - summary->holes_with_fragmentation == 0) { - nofl_allocator_release_partly_full_block(alloc, space, summary); + nofl_block_summary_for_addr(alloc->block)->holes_with_fragmentation == 0) { + nofl_allocator_release_partly_full_block(alloc, space); + } else if (space->evacuating) { + nofl_allocator_release_full_evacuation_target(alloc, space); } else { nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask); - nofl_allocator_release_full_block(alloc, space, summary); + nofl_allocator_release_full_block(alloc, space); } } @@ -556,28 +579,6 @@ nofl_allocator_finish(struct nofl_allocator *alloc, struct nofl_space *space) { nofl_allocator_release_block(alloc, space); } -static uintptr_t -nofl_space_next_block_to_sweep(struct nofl_space *space) { - uintptr_t block = atomic_load_explicit(&space->next_block, - memory_order_acquire); - uintptr_t next_block; - do { - if (block == 0) - return 0; - - next_block = block + NOFL_BLOCK_SIZE; - if (next_block % NOFL_SLAB_SIZE == 0) { - uintptr_t hi_addr = space->low_addr + space->extent; - if (next_block == hi_addr) - next_block = 0; - else - next_block += NOFL_META_BLOCKS_PER_SLAB * NOFL_BLOCK_SIZE; - } - } while (!atomic_compare_exchange_weak(&space->next_block, &block, - next_block)); - return block; -} - static int nofl_maybe_release_swept_empty_block(struct nofl_allocator *alloc, struct nofl_space *space) { @@ -593,6 +594,17 @@ nofl_maybe_release_swept_empty_block(struct nofl_allocator *alloc, return 1; } +static int +nofl_allocator_acquire_block_to_sweep(struct nofl_allocator *alloc, + struct nofl_space *space) { + uintptr_t block = nofl_pop_block(&space->to_sweep); + if (block) { + alloc->block = alloc->alloc = alloc->sweep = block; + return 1; + } + return 0; +} + static size_t nofl_allocator_next_hole(struct nofl_allocator *alloc, struct nofl_space *space) { @@ -604,8 +616,6 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, while (1) { // Sweep current block for a hole. if (alloc->block) { - struct nofl_block_summary *summary = - nofl_block_summary_for_addr(alloc->block); size_t granules = nofl_allocator_next_hole_in_block(alloc, space->sweep_mask); if (granules) { @@ -613,10 +623,8 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, // to use it. if (granules < NOFL_GRANULES_PER_BLOCK) return granules; - // Otherwise we have an empty block. - nofl_clear_remaining_metadata_bytes_in_block(alloc->block, 0); - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); - // If we need an evacuation reserve block, take it. + // Otherwise we have an empty block. If we need an evacuation reserve + // block, take it. if (nofl_push_evacuation_target_if_needed(space, alloc->block)) { nofl_allocator_reset(alloc); continue; @@ -627,17 +635,14 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, continue; // Otherwise if we've already returned lots of empty blocks to the // freelist, let the allocator keep this block. - if (!empties_countdown) { - // After this block is allocated into, it will need to be swept. - nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); + if (!empties_countdown) return granules; - } // Otherwise we push to the empty blocks list. 
nofl_push_empty_block(space, alloc->block); nofl_allocator_reset(alloc); empties_countdown--; } else { - nofl_allocator_release_full_block(alloc, space, summary); + nofl_allocator_release_full_block(alloc, space); } } @@ -649,72 +654,30 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, return granules; } - while (1) { - uintptr_t block = nofl_space_next_block_to_sweep(space); - if (block) { - // Sweeping found a block. We might take it for allocation, or - // we might send it back. - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - // If it's marked unavailable, it's already on a list of - // unavailable blocks, so skip and get the next block. - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) - continue; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE)) { - // Skip venerable blocks after a minor GC -- we don't need to - // sweep as they weren't allocated into last cycle, and the - // mark bytes didn't rotate, so we have no cleanup to do; and - // we shouldn't try to allocate into them as it's not worth - // it. Any wasted space is measured as fragmentation. - if (space->last_collection_was_minor) - continue; - else - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE); - } - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) { - // Prepare to sweep the block for holes. - alloc->alloc = alloc->sweep = alloc->block = block; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP)) { - // In the last cycle we noted that this block consists of - // mostly old data. Sweep any garbage, commit the mark as - // venerable, and avoid allocating into it. - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP); - if (space->last_collection_was_minor) { - nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask); - nofl_allocator_release_full_block(alloc, space, summary); - nofl_block_summary_set_flag(summary, NOFL_BLOCK_VENERABLE); - continue; - } - } - // This block was marked in the last GC and needs sweeping. - // As we sweep we'll want to record how many bytes were live - // at the last collection. As we allocate we'll record how - // many granules were wasted because of fragmentation. - summary->hole_count = 0; - summary->free_granules = 0; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; - break; - } else { - // Otherwise this block is completely empty and is on the - // empties list. We take from the empties list only after all - // the NEEDS_SWEEP blocks are processed. - continue; - } - } else { - // We are done sweeping for blocks. Now take from the empties - // list. - block = nofl_pop_empty_block(space); - // No empty block? Return 0 to cause collection. - if (!block) - return 0; + if (nofl_allocator_acquire_block_to_sweep(alloc, space)) { + struct nofl_block_summary *summary = + nofl_block_summary_for_addr(alloc->block); + // This block was marked in the last GC and needs sweeping. + // As we sweep we'll want to record how many bytes were live + // at the last collection. As we allocate we'll record how + // many granules were wasted because of fragmentation. + summary->hole_count = 0; + summary->free_granules = 0; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + continue; + } + // We are done sweeping for blocks. Now take from the empties list. + { + uintptr_t block; + while ((block = nofl_pop_empty_block(space))) { // Maybe we should use this empty as a target for evacuation. 
if (nofl_push_evacuation_target_if_possible(space, block)) continue; // Otherwise give the block to the allocator. struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP); summary->hole_count = 1; summary->free_granules = NOFL_GRANULES_PER_BLOCK; summary->holes_with_fragmentation = 0; @@ -725,6 +688,9 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, return NOFL_GRANULES_PER_BLOCK; } } + + // Couldn't acquire another block; return 0 to cause collection. + return 0; } } @@ -740,7 +706,6 @@ nofl_allocate(struct nofl_allocator *alloc, struct nofl_space *space, while (1) { size_t hole = nofl_allocator_next_hole(alloc, space); if (hole >= granules) { - nofl_clear_memory(alloc->alloc, hole * NOFL_GRANULE_SIZE); break; } if (!hole) @@ -754,33 +719,22 @@ nofl_allocate(struct nofl_allocator *alloc, struct nofl_space *space, return ret; } -static size_t -nofl_allocator_acquire_evacuation_block(struct nofl_allocator* alloc, - struct nofl_space *space) { - size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space); - if (granules) - return granules; - return nofl_allocator_acquire_empty_block(alloc, space); -} - static struct gc_ref nofl_evacuation_allocate(struct nofl_allocator* alloc, struct nofl_space *space, size_t granules) { size_t avail = (alloc->sweep - alloc->alloc) >> NOFL_GRANULE_SIZE_LOG_2; while (avail < granules) { - if (alloc->block) { - nofl_allocator_finish_hole(alloc); - nofl_allocator_release_full_block(alloc, space, - nofl_block_summary_for_addr(alloc->block)); - } - avail = nofl_allocator_acquire_evacuation_block(alloc, space); + if (alloc->block) + // No need to finish the hole, these mark bytes are zero. + nofl_allocator_release_full_evacuation_target(alloc, space); + avail = nofl_allocator_acquire_evacuation_target(alloc, space); if (!avail) return gc_ref_null(); } struct gc_ref ret = gc_ref(alloc->alloc); alloc->alloc += granules * NOFL_GRANULE_SIZE; - gc_update_alloc_table(ret, granules * NOFL_GRANULE_SIZE); + // Caller is responsible for updating alloc table. return ret; } @@ -860,27 +814,12 @@ nofl_space_trace_remembered_set(struct nofl_space *space, static void nofl_space_clear_remembered_set(struct nofl_space *space) { if (!GC_GENERATIONAL) return; + // FIXME: Don't assume slabs are contiguous. for (size_t slab = 0; slab < space->nslabs; slab++) { memset(space->slabs[slab].remembered_set, 0, NOFL_REMSET_BYTES_PER_SLAB); } } -static void -nofl_space_reset_sweeper(struct nofl_space *space) { - space->next_block = (uintptr_t) &space->slabs[0].blocks; -} - -static void -nofl_space_update_mark_patterns(struct nofl_space *space, - int advance_mark_mask) { - uint8_t survivor_mask = space->marked_mask; - uint8_t next_marked_mask = nofl_rotate_dead_survivor_marked(survivor_mask); - if (advance_mark_mask) - space->marked_mask = next_marked_mask; - space->live_mask = survivor_mask | next_marked_mask; - space->sweep_mask = broadcast_byte(space->live_mask); -} - static void nofl_space_reset_statistics(struct nofl_space *space) { space->granules_freed_by_last_collection = 0; @@ -911,6 +850,12 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { while ((block = nofl_pop_block(&space->evacuation_targets))) nofl_push_empty_block(space, block); } + // Blocks are either to_sweep, empty, or unavailable. 
+ GC_ASSERT_EQ(nofl_block_count(&space->partly_full), 0); + GC_ASSERT_EQ(nofl_block_count(&space->full), 0); + GC_ASSERT_EQ(nofl_block_count(&space->promoted), 0); + GC_ASSERT_EQ(nofl_block_count(&space->old), 0); + GC_ASSERT_EQ(nofl_block_count(&space->evacuation_targets), 0); size_t target_blocks = nofl_block_count(&space->empty); DEBUG("evacuation target block count: %zu\n", target_blocks); @@ -933,28 +878,17 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { const size_t bucket_count = 33; size_t histogram[33] = {0,}; size_t bucket_size = NOFL_GRANULES_PER_BLOCK / 32; - size_t empties = 0; - for (size_t slab = 0; slab < space->nslabs; slab++) { - for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { - struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) - continue; - if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) { - empties++; - continue; - } + { + uintptr_t block = space->to_sweep.blocks; + while (block) { + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; histogram[bucket]++; + block = nofl_block_summary_next(summary); } } - // Blocks which lack the NEEDS_SWEEP flag are empty, either because - // they have been removed from the pool and have the UNAVAILABLE flag - // set, or because they are on the empties or evacuation target - // lists. When evacuation starts, the empties list should be empty. - GC_ASSERT(empties == target_blocks); - // Now select a number of blocks that is likely to fill the space in // the target blocks. Prefer candidate blocks with fewer survivors // from the last GC, to increase expected free block yield. @@ -969,14 +903,11 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { } // Having selected the number of blocks, now we set the evacuation - // candidate flag on all blocks. - for (size_t slab = 0; slab < space->nslabs; slab++) { - for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { - struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) - continue; - if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) - continue; + // candidate flag on all blocks that have live objects. 
+ { + uintptr_t block = space->to_sweep.blocks; + while (block) { + struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; if (histogram[bucket]) { @@ -985,10 +916,50 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { } else { nofl_block_summary_clear_flag(summary, NOFL_BLOCK_EVACUATE); } + block = nofl_block_summary_next(summary); } } } +static void +nofl_space_update_mark_patterns(struct nofl_space *space, + int advance_mark_mask) { + uint8_t survivor_mask = space->marked_mask; + uint8_t next_marked_mask = nofl_rotate_dead_survivor_marked(survivor_mask); + if (advance_mark_mask) + space->marked_mask = next_marked_mask; + space->live_mask = survivor_mask | next_marked_mask; + space->sweep_mask = broadcast_byte(space->live_mask); +} + +static void +nofl_space_prepare_gc(struct nofl_space *space, enum gc_collection_kind kind) { + nofl_space_update_mark_patterns(space, !(kind == GC_COLLECTION_MINOR)); +} + +static void +nofl_space_start_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { + GC_ASSERT_EQ(nofl_block_count(&space->partly_full), 0); + GC_ASSERT_EQ(nofl_block_count(&space->to_sweep), 0); + + // Any block that was the target of allocation in the last cycle will need to + // be swept next cycle. + uintptr_t block; + while ((block = nofl_pop_block(&space->full))) + nofl_push_block(&space->to_sweep, block); + + if (gc_kind != GC_COLLECTION_MINOR) { + uintptr_t block; + while ((block = nofl_pop_block(&space->promoted))) + nofl_push_block(&space->to_sweep, block); + while ((block = nofl_pop_block(&space->old))) + nofl_push_block(&space->to_sweep, block); + } + + if (gc_kind == GC_COLLECTION_COMPACTING) + nofl_space_prepare_evacuation(space); +} + static void nofl_space_finish_evacuation(struct nofl_space *space) { // When evacuation began, the evacuation reserve was moved to the @@ -996,7 +967,6 @@ nofl_space_finish_evacuation(struct nofl_space *space) { // repopulate the reserve. GC_ASSERT(space->evacuating); space->evacuating = 0; - space->evacuation_reserve = space->evacuation_minimum_reserve; size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; size_t unavailable = nofl_block_count(&space->unavailable); size_t reserve = space->evacuation_minimum_reserve * (total - unavailable); @@ -1006,13 +976,15 @@ nofl_space_finish_evacuation(struct nofl_space *space) { if (!block) break; nofl_push_block(&space->evacuation_targets, block); } - { - // FIXME: We should avoid sweeping partly full blocks, but it's too annoying - // to do at the moment given the way sweeping works. - uintptr_t block; - do { - block = nofl_pop_block(&space->partly_full); - } while (block); +} + +static void +nofl_space_promote_blocks(struct nofl_space *space) { + uintptr_t block; + while ((block = nofl_pop_block(&space->promoted))) { + struct nofl_allocator alloc = { block, block, block }; + nofl_allocator_finish_sweeping_in_block(&alloc, space->sweep_mask); + nofl_push_block(&space->old, block); } } @@ -1022,50 +994,135 @@ nofl_size_to_granules(size_t size) { } static void -nofl_space_verify_before_restart(struct nofl_space *space) { - GC_ASSERT_EQ(nofl_block_count(&space->partly_full), 0); - // Iterate objects in each block, verifying that the END bytes correspond to - // the measured object size. 
- for (size_t slab = 0; slab < space->nslabs; slab++) { - for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { - struct nofl_block_summary *summary = &space->slabs[slab].summaries[block]; - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) - continue; - - uintptr_t addr = (uintptr_t)space->slabs[slab].blocks[block].data; - uintptr_t limit = addr + NOFL_BLOCK_SIZE; - uint8_t *meta = nofl_metadata_byte_for_addr(addr); - while (addr < limit) { - if (meta[0] & space->live_mask) { - struct gc_ref obj = gc_ref(addr); - size_t obj_bytes; - gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); - size_t granules = nofl_size_to_granules(obj_bytes); - GC_ASSERT(granules); - for (size_t granule = 0; granule < granules - 1; granule++) - GC_ASSERT(!(meta[granule] & NOFL_METADATA_BYTE_END)); - GC_ASSERT(meta[granules - 1] & NOFL_METADATA_BYTE_END); - meta += granules; - addr += granules * NOFL_GRANULE_SIZE; - } else { - meta++; - addr += NOFL_GRANULE_SIZE; - } +nofl_space_verify_sweepable_blocks(struct nofl_space *space, + struct nofl_block_list *list) +{ + uintptr_t addr = list->blocks; + while (addr) { + struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr); + // Iterate objects in the block, verifying that the END bytes correspond to + // the measured object size. + uintptr_t limit = addr + NOFL_BLOCK_SIZE; + uint8_t *meta = nofl_metadata_byte_for_addr(addr); + while (addr < limit) { + if (meta[0] & space->live_mask) { + struct gc_ref obj = gc_ref(addr); + size_t obj_bytes; + gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); + size_t granules = nofl_size_to_granules(obj_bytes); + GC_ASSERT(granules); + for (size_t granule = 0; granule < granules - 1; granule++) + GC_ASSERT(!(meta[granule] & NOFL_METADATA_BYTE_END)); + GC_ASSERT(meta[granules - 1] & NOFL_METADATA_BYTE_END); + meta += granules; + addr += granules * NOFL_GRANULE_SIZE; + } else { + meta++; + addr += NOFL_GRANULE_SIZE; } - GC_ASSERT(addr == limit); } + GC_ASSERT(addr == limit); + addr = nofl_block_summary_next(summary); } } +static void +nofl_space_verify_swept_blocks(struct nofl_space *space, + struct nofl_block_list *list) { + uintptr_t addr = list->blocks; + while (addr) { + struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr); + // Iterate objects in the block, verifying that the END bytes correspond to + // the measured object size. 
+ uintptr_t limit = addr + NOFL_BLOCK_SIZE; + uint8_t *meta = nofl_metadata_byte_for_addr(addr); + while (addr < limit) { + if (meta[0]) { + GC_ASSERT(meta[0] & space->marked_mask); + GC_ASSERT_EQ(meta[0] & ~(space->marked_mask | NOFL_METADATA_BYTE_END), 0); + struct gc_ref obj = gc_ref(addr); + size_t obj_bytes; + gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); + size_t granules = nofl_size_to_granules(obj_bytes); + GC_ASSERT(granules); + for (size_t granule = 0; granule < granules - 1; granule++) + GC_ASSERT(!(meta[granule] & NOFL_METADATA_BYTE_END)); + GC_ASSERT(meta[granules - 1] & NOFL_METADATA_BYTE_END); + meta += granules; + addr += granules * NOFL_GRANULE_SIZE; + } else { + meta++; + addr += NOFL_GRANULE_SIZE; + } + } + GC_ASSERT(addr == limit); + addr = nofl_block_summary_next(summary); + } +} + +static void +nofl_space_verify_empty_blocks(struct nofl_space *space, + struct nofl_block_list *list, + int paged_in) { + uintptr_t addr = list->blocks; + while (addr) { + struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr); + // Iterate objects in the block, verifying that the END bytes correspond to + // the measured object size. + uintptr_t limit = addr + NOFL_BLOCK_SIZE; + uint8_t *meta = nofl_metadata_byte_for_addr(addr); + while (addr < limit) { + GC_ASSERT_EQ(*meta, 0); + if (paged_in) { + char zeroes[NOFL_GRANULE_SIZE] = { 0, }; + GC_ASSERT_EQ(memcmp((char*)addr, zeroes, NOFL_GRANULE_SIZE), 0); + } + meta++; + addr += NOFL_GRANULE_SIZE; + } + GC_ASSERT(addr == limit); + addr = nofl_block_summary_next(summary); + } +} + +static void +nofl_space_verify_before_restart(struct nofl_space *space) { + nofl_space_verify_sweepable_blocks(space, &space->to_sweep); + nofl_space_verify_sweepable_blocks(space, &space->promoted); + // If there are full or partly full blocks, they were filled during + // evacuation. + nofl_space_verify_swept_blocks(space, &space->partly_full); + nofl_space_verify_swept_blocks(space, &space->full); + nofl_space_verify_swept_blocks(space, &space->old); + nofl_space_verify_empty_blocks(space, &space->empty, 1); + nofl_space_verify_empty_blocks(space, &space->unavailable, 0); + // GC_ASSERT(space->last_collection_was_minor || !nofl_block_count(&space->old)); +} + static void nofl_space_finish_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { space->last_collection_was_minor = (gc_kind == GC_COLLECTION_MINOR); if (space->evacuating) nofl_space_finish_evacuation(space); - nofl_space_reset_sweeper(space); - nofl_space_update_mark_patterns(space, 0); + else { + space->evacuation_reserve = space->evacuation_minimum_reserve; + // If we were evacuating and preferentially allocated empty blocks + // to the evacuation reserve, return those blocks to the empty set + // for allocation by the mutator. + size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; + size_t unavailable = nofl_block_count(&space->unavailable); + size_t target = space->evacuation_minimum_reserve * (total - unavailable); + size_t reserve = nofl_block_count(&space->evacuation_targets); + while (reserve-- > target) + nofl_push_block(&space->empty, + nofl_pop_block(&space->evacuation_targets)); + } + + // FIXME: Promote concurrently instead of during the pause. 
+ nofl_space_promote_blocks(space); nofl_space_reset_statistics(space); + nofl_space_update_mark_patterns(space, 0); if (GC_DEBUG) nofl_space_verify_before_restart(space); } @@ -1361,7 +1418,7 @@ nofl_allocate_slabs(size_t nslabs) { static int nofl_space_init(struct nofl_space *space, size_t size, int atomic, - double venerable_threshold) { + double promotion_threshold) { size = align_up(size, NOFL_BLOCK_SIZE); size_t reserved = align_up(size, NOFL_SLAB_SIZE); size_t nslabs = reserved / NOFL_SLAB_SIZE; @@ -1375,10 +1432,9 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, space->nslabs = nslabs; space->low_addr = (uintptr_t) slabs; space->extent = reserved; - space->next_block = 0; space->evacuation_minimum_reserve = 0.02; space->evacuation_reserve = space->evacuation_minimum_reserve; - space->venerable_threshold = venerable_threshold; + space->promotion_threshold = promotion_threshold; for (size_t slab = 0; slab < nslabs; slab++) { for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; @@ -1386,6 +1442,8 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, nofl_push_unavailable_block(space, addr); reserved -= NOFL_BLOCK_SIZE; } else { + nofl_block_summary_set_flag(nofl_block_summary_for_addr(addr), + NOFL_BLOCK_ZERO); if (!nofl_push_evacuation_target_if_needed(space, addr)) nofl_push_empty_block(space, addr); } diff --git a/src/whippet.c b/src/whippet.c index 6e942d7da..1a00939b1 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -1024,7 +1024,7 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { determine_collection_kind(heap, requested_kind); int is_minor = gc_kind == GC_COLLECTION_MINOR; HEAP_EVENT(heap, prepare_gc, gc_kind); - nofl_space_update_mark_patterns(nofl_space, !is_minor); + nofl_space_prepare_gc(nofl_space, gc_kind); large_object_space_start_gc(lospace, is_minor); gc_extern_space_start_gc(exspace, is_minor); resolve_ephemerons_lazily(heap); @@ -1042,8 +1042,7 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { DEBUG("last gc yield: %f; fragmentation: %f\n", yield, fragmentation); detect_out_of_memory(heap); trace_pinned_roots_after_stop(heap); - if (gc_kind == GC_COLLECTION_COMPACTING) - nofl_space_prepare_evacuation(nofl_space); + nofl_space_start_gc(nofl_space, gc_kind); trace_roots_after_stop(heap); HEAP_EVENT(heap, roots_traced); gc_tracer_trace(&heap->tracer); From 19fdd481d59e6ef1b49c1f5303f087af6b0dc091 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 23 Aug 2024 21:20:40 +0200 Subject: [PATCH 267/403] Fix some corner cases with hole zeroing of empty blocks --- src/nofl-space.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 134fbccd8..bd9516cab 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -676,8 +676,13 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, if (nofl_push_evacuation_target_if_possible(space, block)) continue; - // Otherwise give the block to the allocator. struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); + if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_ZERO)) + nofl_block_summary_clear_flag(summary, NOFL_BLOCK_ZERO); + else + nofl_clear_memory(block, NOFL_BLOCK_SIZE); + + // Otherwise give the block to the allocator. 
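+        // At this point the block payload is known to be all zeroes: either
+        // it still carried the ZERO flag from being freshly mapped or
+        // madvised away, or we just cleared it explicitly.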
summary->hole_count = 1; summary->free_granules = NOFL_GRANULES_PER_BLOCK; summary->holes_with_fragmentation = 0; @@ -1073,7 +1078,7 @@ nofl_space_verify_empty_blocks(struct nofl_space *space, uint8_t *meta = nofl_metadata_byte_for_addr(addr); while (addr < limit) { GC_ASSERT_EQ(*meta, 0); - if (paged_in) { + if (paged_in && nofl_block_summary_has_flag(summary, NOFL_BLOCK_ZERO)) { char zeroes[NOFL_GRANULE_SIZE] = { 0, }; GC_ASSERT_EQ(memcmp((char*)addr, zeroes, NOFL_GRANULE_SIZE), 0); } From 7db72e7f80a96cafebe8a9c8d4e296ff6e53994d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 24 Aug 2024 09:09:23 +0200 Subject: [PATCH 268/403] whippet: ensure mutators release allocators before start_gc --- src/nofl-space.h | 4 ++-- src/whippet.c | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index bd9516cab..1f2f17765 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -944,17 +944,17 @@ nofl_space_prepare_gc(struct nofl_space *space, enum gc_collection_kind kind) { static void nofl_space_start_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { - GC_ASSERT_EQ(nofl_block_count(&space->partly_full), 0); GC_ASSERT_EQ(nofl_block_count(&space->to_sweep), 0); // Any block that was the target of allocation in the last cycle will need to // be swept next cycle. uintptr_t block; + while ((block = nofl_pop_block(&space->partly_full))) + nofl_push_block(&space->to_sweep, block); while ((block = nofl_pop_block(&space->full))) nofl_push_block(&space->to_sweep, block); if (gc_kind != GC_COLLECTION_MINOR) { - uintptr_t block; while ((block = nofl_pop_block(&space->promoted))) nofl_push_block(&space->to_sweep, block); while ((block = nofl_pop_block(&space->old))) diff --git a/src/whippet.c b/src/whippet.c index 1a00939b1..05597b4f8 100644 --- a/src/whippet.c +++ b/src/whippet.c @@ -247,6 +247,7 @@ add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; heap_lock(heap); @@ -621,7 +622,8 @@ wait_for_mutators_to_stop(struct gc_heap *heap) { pthread_cond_wait(&heap->collector_cond, &heap->lock); } -static void trace_mutator_conservative_roots_after_stop(struct gc_heap *heap) { +static void +trace_mutator_conservative_roots_after_stop(struct gc_heap *heap) { int active_mutators_already_marked = heap_should_mark_while_stopping(heap); if (!active_mutators_already_marked) for (struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); @@ -654,10 +656,8 @@ trace_mutator_roots_after_stop(struct gc_heap *heap) { } atomic_store(&heap->mutator_trace_list, NULL); - for (struct gc_mutator *mut = heap->inactive_mutators; mut; mut = mut->next) { - nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); + for (struct gc_mutator *mut = heap->inactive_mutators; mut; mut = mut->next) trace_mutator_roots_with_lock(mut); - } } static void @@ -1323,6 +1323,7 @@ void gc_finish_for_thread(struct gc_mutator *mut) { static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); + nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); heap_lock(heap); mut->next = heap->inactive_mutators; heap->inactive_mutators = mut; From 010185f729638a6b84a6e3a47df5bd81d893a9dc Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 24 Aug 2024 16:38:11 +0200 Subject: [PATCH 269/403] nofl: Refactor to trace visitor --- 
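The NOFL_BLOCK_ZERO handling added two patches back rests on one invariant: the flag asserts that a block's payload is already all zeroes, so whoever takes the block for allocation may skip the clear, but must drop the flag so a recycled, dirty block is never mistaken for a zeroed one. A stand-alone sketch of that pattern follows; the block layout, flag name and sizes are simplified assumptions, not the nofl structures.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE (64 * 1024)     /* assumed block payload size */
#define FLAG_ZERO  0x1             /* "payload is known to be zero" */

struct block {
  uintptr_t flags;                 /* stand-in for the summary's low flag bits */
  char data[BLOCK_SIZE];
};

/* Hand a block to an allocator: skip the clear only when the ZERO flag
   proves the payload is already zero, and always drop the flag so the
   next reuse of this (now dirty) block clears it again. */
static void acquire_for_allocation(struct block *b) {
  if (b->flags & FLAG_ZERO)
    b->flags &= ~FLAG_ZERO;
  else
    memset(b->data, 0, BLOCK_SIZE);
}

int main(void) {
  static struct block b;           /* static storage: starts out zeroed */
  b.flags = FLAG_ZERO;             /* freshly mapped blocks get the flag */
  acquire_for_allocation(&b);      /* no memset needed */
  b.data[0] = 42;                  /* mutator dirties the block */
  acquire_for_allocation(&b);      /* flag gone, so the payload is cleared */
  assert(b.data[0] == 0);
  return 0;
}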
src/nofl-space.h | 187 ++++++++++++++++++++++++++--------------------- 1 file changed, 105 insertions(+), 82 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 1f2f17765..38c82f357 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1186,6 +1186,88 @@ nofl_space_sweep_until_memory_released(struct nofl_space *space, return pending <= 0; } +static inline int +nofl_space_should_evacuate(struct nofl_space *space, struct gc_ref obj) { + if (!space->evacuating) + return 0; + struct nofl_block_summary *summary = + nofl_block_summary_for_addr(gc_ref_value(obj)); + return nofl_block_summary_has_flag(summary, NOFL_BLOCK_EVACUATE); +} + +static inline int +nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { + uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 + | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + *metadata = (byte & ~mask) | space->marked_mask; + return 1; +} + +static inline int +nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, + struct gc_edge edge, + struct gc_ref old_ref, + struct nofl_allocator *evacuate) { + struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); + + if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) + gc_atomic_forward_acquire(&fwd); + + switch (fwd.state) { + case GC_FORWARDING_STATE_NOT_FORWARDED: + case GC_FORWARDING_STATE_ABORTED: + default: + // Impossible. + GC_CRASH(); + case GC_FORWARDING_STATE_ACQUIRED: { + // We claimed the object successfully; evacuating is up to us. + size_t object_granules = nofl_space_live_object_granules(metadata); + struct gc_ref new_ref = nofl_evacuation_allocate(evacuate, space, + object_granules); + if (gc_ref_is_heap_object(new_ref)) { + // Copy object contents before committing, as we don't know what + // part of the object (if any) will be overwritten by the + // commit. + memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), + object_granules * NOFL_GRANULE_SIZE); + gc_atomic_forward_commit(&fwd, new_ref); + // Now update extent metadata, and indicate to the caller that + // the object's fields need to be traced. + uint8_t *new_metadata = nofl_metadata_byte_for_object(new_ref); + memcpy(new_metadata + 1, metadata + 1, object_granules - 1); + gc_edge_update(edge, new_ref); + return nofl_space_set_mark(space, new_metadata, byte); + } else { + // Well shucks; allocation failed, marking the end of + // opportunistic evacuation. No future evacuation of this + // object will succeed. Mark in place instead. + gc_atomic_forward_abort(&fwd); + return nofl_space_set_mark(space, metadata, byte); + } + break; + } + case GC_FORWARDING_STATE_BUSY: + // Someone else claimed this object first. Spin until new address + // known, or evacuation aborts. + for (size_t spin_count = 0;; spin_count++) { + if (gc_atomic_forward_retry_busy(&fwd)) + break; + yield_for_spin(spin_count); + } + if (fwd.state == GC_FORWARDING_STATE_ABORTED) + // Remove evacuation aborted; remote will mark and enqueue. + return 0; + ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); + // Fall through. + case GC_FORWARDING_STATE_FORWARDED: + // The object has been evacuated already. Update the edge; + // whoever forwarded the object will make sure it's eventually + // traced. 
+ gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); + return 0; + } +} + static inline int nofl_space_evacuate_or_mark_object(struct nofl_space *space, struct gc_edge edge, @@ -1195,75 +1277,12 @@ nofl_space_evacuate_or_mark_object(struct nofl_space *space, uint8_t byte = *metadata; if (byte & space->marked_mask) return 0; - if (space->evacuating && - nofl_block_summary_has_flag(nofl_block_summary_for_addr(gc_ref_value(old_ref)), - NOFL_BLOCK_EVACUATE)) { - // This is an evacuating collection, and we are attempting to - // evacuate this block, and we are tracing this particular object - // for what appears to be the first time. - struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); - if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) - gc_atomic_forward_acquire(&fwd); + if (nofl_space_should_evacuate(space, old_ref)) + return nofl_space_evacuate(space, metadata, byte, edge, old_ref, + evacuate); - switch (fwd.state) { - case GC_FORWARDING_STATE_NOT_FORWARDED: - case GC_FORWARDING_STATE_ABORTED: - // Impossible. - GC_CRASH(); - case GC_FORWARDING_STATE_ACQUIRED: { - // We claimed the object successfully; evacuating is up to us. - size_t object_granules = nofl_space_live_object_granules(metadata); - struct gc_ref new_ref = nofl_evacuation_allocate(evacuate, space, - object_granules); - if (gc_ref_is_heap_object(new_ref)) { - // Copy object contents before committing, as we don't know what - // part of the object (if any) will be overwritten by the - // commit. - memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), - object_granules * NOFL_GRANULE_SIZE); - gc_atomic_forward_commit(&fwd, new_ref); - // Now update extent metadata, and indicate to the caller that - // the object's fields need to be traced. - uint8_t *new_metadata = nofl_metadata_byte_for_object(new_ref); - memcpy(new_metadata + 1, metadata + 1, object_granules - 1); - gc_edge_update(edge, new_ref); - metadata = new_metadata; - // Fall through to set mark bits. - } else { - // Well shucks; allocation failed, marking the end of - // opportunistic evacuation. No future evacuation of this - // object will succeed. Mark in place instead. - gc_atomic_forward_abort(&fwd); - } - break; - } - case GC_FORWARDING_STATE_BUSY: - // Someone else claimed this object first. Spin until new address - // known, or evacuation aborts. - for (size_t spin_count = 0;; spin_count++) { - if (gc_atomic_forward_retry_busy(&fwd)) - break; - yield_for_spin(spin_count); - } - if (fwd.state == GC_FORWARDING_STATE_ABORTED) - // Remove evacuation aborted; remote will mark and enqueue. - return 0; - ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); - // Fall through. - case GC_FORWARDING_STATE_FORWARDED: - // The object has been evacuated already. Update the edge; - // whoever forwarded the object will make sure it's eventually - // traced. 
- gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); - return 0; - } - } - - uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 - | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; - *metadata = (byte & ~mask) | space->marked_mask; - return 1; + return nofl_space_set_mark(space, metadata, byte); } static inline int @@ -1282,21 +1301,10 @@ nofl_space_contains(struct nofl_space *space, struct gc_ref ref) { return nofl_space_contains_address(space, gc_ref_value(ref)); } -static int -nofl_space_forward_or_mark_if_traced(struct nofl_space *space, - struct gc_edge edge, - struct gc_ref ref) { - uint8_t *metadata = nofl_metadata_byte_for_object(ref); - uint8_t byte = *metadata; - if (byte & space->marked_mask) - return 1; - - if (!space->evacuating) - return 0; - if (!nofl_block_summary_has_flag(nofl_block_summary_for_addr(gc_ref_value(ref)), - NOFL_BLOCK_EVACUATE)) - return 0; - +static inline int +nofl_space_forward_if_evacuated(struct nofl_space *space, + struct gc_edge edge, + struct gc_ref ref) { struct gc_atomic_forward fwd = gc_atomic_forward_begin(ref); switch (fwd.state) { case GC_FORWARDING_STATE_NOT_FORWARDED: @@ -1322,6 +1330,21 @@ nofl_space_forward_or_mark_if_traced(struct nofl_space *space, } } +static int +nofl_space_forward_or_mark_if_traced(struct nofl_space *space, + struct gc_edge edge, + struct gc_ref ref) { + uint8_t *metadata = nofl_metadata_byte_for_object(ref); + uint8_t byte = *metadata; + if (byte & space->marked_mask) + return 1; + + if (!nofl_space_should_evacuate(space, ref)) + return 0; + + return nofl_space_forward_if_evacuated(space, edge, ref); +} + static inline struct gc_ref nofl_space_mark_conservative_ref(struct nofl_space *space, struct gc_conservative_ref ref, From 6dcec272b10051bd445ea1ca70a421ad27d19443 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 24 Aug 2024 21:28:20 +0200 Subject: [PATCH 270/403] nofl: eagerly sweep empty blocks --- src/nofl-space.h | 83 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 9 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 38c82f357..56dbff781 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -55,8 +55,7 @@ struct nofl_slab; struct nofl_slab_header { union { struct { - struct nofl_slab *next; - struct nofl_slab *prev; + uint8_t block_marks[NOFL_BLOCKS_PER_SLAB]; }; uint8_t padding[NOFL_HEADER_BYTES_PER_SLAB]; }; @@ -233,6 +232,45 @@ nofl_metadata_byte_for_object(struct gc_ref ref) { return nofl_metadata_byte_for_addr(gc_ref_value(ref)); } +static int +nofl_block_is_marked(uintptr_t addr) { + uintptr_t base = align_down(addr, NOFL_SLAB_SIZE); + struct nofl_slab *slab = (struct nofl_slab *) base; + unsigned block_idx = (addr / NOFL_BLOCK_SIZE) % NOFL_BLOCKS_PER_SLAB; + uint8_t mark_byte = block_idx / 8; + GC_ASSERT(mark_byte < NOFL_HEADER_BYTES_PER_SLAB); + uint8_t mark_mask = 1U << (block_idx % 8); + uint8_t byte = atomic_load_explicit(&slab->header.block_marks[mark_byte], + memory_order_relaxed); + return byte & mark_mask; +} + +static void +nofl_block_set_mark(uintptr_t addr) { + uintptr_t base = align_down(addr, NOFL_SLAB_SIZE); + struct nofl_slab *slab = (struct nofl_slab *) base; + unsigned block_idx = (addr / NOFL_BLOCK_SIZE) % NOFL_BLOCKS_PER_SLAB; + uint8_t mark_byte = block_idx / 8; + GC_ASSERT(mark_byte < NOFL_HEADER_BYTES_PER_SLAB); + uint8_t mark_mask = 1U << (block_idx % 8); + atomic_fetch_or_explicit(&slab->header.block_marks[mark_byte], + mark_mask, + memory_order_relaxed); +} + +static void 
+nofl_block_clear_mark(uintptr_t addr) { + uintptr_t base = align_down(addr, NOFL_SLAB_SIZE); + struct nofl_slab *slab = (struct nofl_slab *) base; + unsigned block_idx = (addr / NOFL_BLOCK_SIZE) % NOFL_BLOCKS_PER_SLAB; + uint8_t mark_byte = block_idx / 8; + GC_ASSERT(mark_byte < NOFL_HEADER_BYTES_PER_SLAB); + uint8_t mark_mask = 1U << (block_idx % 8); + atomic_fetch_and_explicit(&slab->header.block_marks[mark_byte], + ~mark_mask, + memory_order_relaxed); +} + #define NOFL_GRANULES_PER_BLOCK (NOFL_BLOCK_SIZE / NOFL_GRANULE_SIZE) #define NOFL_GRANULES_PER_REMSET_BYTE \ (NOFL_GRANULES_PER_BLOCK / NOFL_REMSET_BYTES_PER_BLOCK) @@ -1124,6 +1162,25 @@ nofl_space_finish_gc(struct nofl_space *space, nofl_pop_block(&space->evacuation_targets)); } + { + struct nofl_block_list to_sweep = {0,}; + uintptr_t block; + while ((block = nofl_pop_block(&space->to_sweep))) { + if (nofl_block_is_marked(block)) { + nofl_block_clear_mark(block); + nofl_push_block(&to_sweep, block); + } else { + // Block is empty. + memset(nofl_metadata_byte_for_addr(block), 0, NOFL_GRANULES_PER_BLOCK); + nofl_push_empty_block(space, block); + } + } + atomic_store_explicit(&space->to_sweep.count, to_sweep.count, + memory_order_release); + atomic_store_explicit(&space->to_sweep.blocks, to_sweep.blocks, + memory_order_release); + } + // FIXME: Promote concurrently instead of during the pause. nofl_space_promote_blocks(space); nofl_space_reset_statistics(space); @@ -1199,7 +1256,18 @@ static inline int nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; - *metadata = (byte & ~mask) | space->marked_mask; + atomic_store_explicit(metadata, + (byte & ~mask) | space->marked_mask, + memory_order_relaxed); + return 1; +} + +static inline int +nofl_space_set_nonempty_mark(struct nofl_space *space, uint8_t *metadata, + uint8_t byte, struct gc_ref ref) { + nofl_space_set_mark(space, metadata, byte); + if (!nofl_block_is_marked(gc_ref_value(ref))) + nofl_block_set_mark(gc_ref_value(ref)); return 1; } @@ -1242,7 +1310,7 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, // opportunistic evacuation. No future evacuation of this // object will succeed. Mark in place instead. gc_atomic_forward_abort(&fwd); - return nofl_space_set_mark(space, metadata, byte); + return nofl_space_set_nonempty_mark(space, metadata, byte, old_ref); } break; } @@ -1282,7 +1350,7 @@ nofl_space_evacuate_or_mark_object(struct nofl_space *space, return nofl_space_evacuate(space, metadata, byte, edge, old_ref, evacuate); - return nofl_space_set_mark(space, metadata, byte); + return nofl_space_set_nonempty_mark(space, metadata, byte, old_ref); } static inline int @@ -1404,10 +1472,7 @@ nofl_space_mark_conservative_ref(struct nofl_space *space, addr = block_base + (loc - loc_base) * NOFL_GRANULE_SIZE; } - uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 - | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; - atomic_store_explicit(loc, (byte & ~mask) | space->marked_mask, - memory_order_relaxed); + nofl_space_set_nonempty_mark(space, loc, byte, gc_ref(addr)); return gc_ref(addr); } From b8c0fa0e90e0dd2159cd61e23c7833c95bd24c65 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 24 Aug 2024 21:43:38 +0200 Subject: [PATCH 271/403] nofl: simplify sweeping No more need to identify empties during sweeping, as that is done eagerly during the pause. 
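Reclaiming empty blocks during the pause is cheap because of the per-block mark bits introduced two patches back: setting the mark on any object also sets one bit for its block in the slab header, so after tracing, a block whose bit is still clear provably holds no live objects. A stand-alone sketch of the bitmap indexing follows; the slab and block sizes and the struct layout are simplified assumptions rather than the nofl definitions.

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

#define SLAB_SIZE       (4 * 1024 * 1024)   /* assumed slab size */
#define BLOCK_SIZE      (64 * 1024)         /* assumed block size */
#define BLOCKS_PER_SLAB (SLAB_SIZE / BLOCK_SIZE)

/* One mark bit per block, kept in a small bitmap in the slab header. */
struct slab {
  _Atomic uint8_t block_marks[BLOCKS_PER_SLAB / 8];
  /* ... block summaries, metadata bytes, block payloads ... */
};

static void block_set_mark(struct slab *slab, uintptr_t addr_in_slab) {
  unsigned idx = (addr_in_slab / BLOCK_SIZE) % BLOCKS_PER_SLAB;
  atomic_fetch_or_explicit(&slab->block_marks[idx / 8],
                           (uint8_t)(1u << (idx % 8)),
                           memory_order_relaxed);
}

static int block_is_marked(struct slab *slab, uintptr_t addr_in_slab) {
  unsigned idx = (addr_in_slab / BLOCK_SIZE) % BLOCKS_PER_SLAB;
  uint8_t byte = atomic_load_explicit(&slab->block_marks[idx / 8],
                                      memory_order_relaxed);
  return (byte >> (idx % 8)) & 1;
}

int main(void) {
  static struct slab s;                     /* zeroed: every block unmarked */
  assert(!block_is_marked(&s, 3 * BLOCK_SIZE));
  block_set_mark(&s, 3 * BLOCK_SIZE);       /* done when an object is marked */
  assert(block_is_marked(&s, 3 * BLOCK_SIZE));
  assert(!block_is_marked(&s, 4 * BLOCK_SIZE));
  return 0;
}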
--- src/nofl-space.h | 130 ++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 87 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 56dbff781..fd05251a7 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -647,94 +647,49 @@ static size_t nofl_allocator_next_hole(struct nofl_allocator *alloc, struct nofl_space *space) { nofl_allocator_finish_hole(alloc); - // As we sweep if we find that a block is empty, we return it to the - // empties list. Empties are precious. But if we return 10 blocks in - // a row, and still find an 11th empty, go ahead and use it. - size_t empties_countdown = 10; - while (1) { - // Sweep current block for a hole. - if (alloc->block) { - size_t granules = - nofl_allocator_next_hole_in_block(alloc, space->sweep_mask); - if (granules) { - // If the hole spans only part of a block, let the allocator try - // to use it. - if (granules < NOFL_GRANULES_PER_BLOCK) - return granules; - // Otherwise we have an empty block. If we need an evacuation reserve - // block, take it. - if (nofl_push_evacuation_target_if_needed(space, alloc->block)) { - nofl_allocator_reset(alloc); - continue; - } - // If we have pending pages to release to the OS, we should unmap - // this block. - if (nofl_maybe_release_swept_empty_block(alloc, space)) - continue; - // Otherwise if we've already returned lots of empty blocks to the - // freelist, let the allocator keep this block. - if (!empties_countdown) - return granules; - // Otherwise we push to the empty blocks list. - nofl_push_empty_block(space, alloc->block); - nofl_allocator_reset(alloc); - empties_countdown--; - } else { - nofl_allocator_release_full_block(alloc, space); - } - } - GC_ASSERT(alloc->block == 0); - - { - size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space); - if (granules) - return granules; - } - - if (nofl_allocator_acquire_block_to_sweep(alloc, space)) { - struct nofl_block_summary *summary = - nofl_block_summary_for_addr(alloc->block); - // This block was marked in the last GC and needs sweeping. - // As we sweep we'll want to record how many bytes were live - // at the last collection. As we allocate we'll record how - // many granules were wasted because of fragmentation. - summary->hole_count = 0; - summary->free_granules = 0; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; - continue; - } - - // We are done sweeping for blocks. Now take from the empties list. - { - uintptr_t block; - while ((block = nofl_pop_empty_block(space))) { - // Maybe we should use this empty as a target for evacuation. - if (nofl_push_evacuation_target_if_possible(space, block)) - continue; - - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_ZERO)) - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_ZERO); - else - nofl_clear_memory(block, NOFL_BLOCK_SIZE); - - // Otherwise give the block to the allocator. - summary->hole_count = 1; - summary->free_granules = NOFL_GRANULES_PER_BLOCK; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; - alloc->block = block; - alloc->alloc = block; - alloc->sweep = block + NOFL_BLOCK_SIZE; - return NOFL_GRANULES_PER_BLOCK; - } - } - - // Couldn't acquire another block; return 0 to cause collection. - return 0; + // Sweep current block for a hole. 
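+  // A hole is a run of granules whose metadata bytes are clear with respect
+  // to the sweep mask; sweeping zeroes those granules and hands them to the
+  // bump-pointer allocator as fresh allocation space.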
+ if (alloc->block) { + size_t granules = + nofl_allocator_next_hole_in_block(alloc, space->sweep_mask); + if (granules) + return granules; + else + nofl_allocator_release_full_block(alloc, space); } + + GC_ASSERT(alloc->block == 0); + + { + size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space); + if (granules) + return granules; + } + + while (nofl_allocator_acquire_block_to_sweep(alloc, space)) { + struct nofl_block_summary *summary = + nofl_block_summary_for_addr(alloc->block); + // This block was marked in the last GC and needs sweeping. + // As we sweep we'll want to record how many bytes were live + // at the last collection. As we allocate we'll record how + // many granules were wasted because of fragmentation. + summary->hole_count = 0; + summary->free_granules = 0; + summary->holes_with_fragmentation = 0; + summary->fragmentation_granules = 0; + size_t granules = + nofl_allocator_next_hole_in_block(alloc, space->sweep_mask); + if (granules) + return granules; + nofl_allocator_release_full_block(alloc, space); + } + + // We are done sweeping for blocks. Now take from the empties list. + if (nofl_allocator_acquire_empty_block(alloc, space)) + return NOFL_GRANULES_PER_BLOCK; + + // Couldn't acquire another block; return 0 to cause collection. + return 0; } static struct gc_ref @@ -1172,7 +1127,8 @@ nofl_space_finish_gc(struct nofl_space *space, } else { // Block is empty. memset(nofl_metadata_byte_for_addr(block), 0, NOFL_GRANULES_PER_BLOCK); - nofl_push_empty_block(space, block); + if (!nofl_push_evacuation_target_if_possible(space, block)) + nofl_push_empty_block(space, block); } } atomic_store_explicit(&space->to_sweep.count, to_sweep.count, From 7b4a56c51aa8249c0910f82b9b8750b9dd26d711 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 25 Aug 2024 08:45:17 +0200 Subject: [PATCH 272/403] nofl: Fix sticky mark bit treatment for block marks --- src/nofl-space.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index fd05251a7..118b4e65b 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -930,9 +930,21 @@ nofl_space_update_mark_patterns(struct nofl_space *space, space->sweep_mask = broadcast_byte(space->live_mask); } +static void +nofl_space_clear_block_marks(struct nofl_space *space) { + for (size_t s = 0; s < space->nslabs; s++) { + struct nofl_slab *slab = &space->slabs[s]; + memset(&slab->header.block_marks, 0, NOFL_BLOCKS_PER_SLAB / 8); + } +} + static void nofl_space_prepare_gc(struct nofl_space *space, enum gc_collection_kind kind) { - nofl_space_update_mark_patterns(space, !(kind == GC_COLLECTION_MINOR)); + int is_minor = kind == GC_COLLECTION_MINOR; + if (!is_minor) { + nofl_space_update_mark_patterns(space, 1); + nofl_space_clear_block_marks(space); + } } static void @@ -1122,7 +1134,6 @@ nofl_space_finish_gc(struct nofl_space *space, uintptr_t block; while ((block = nofl_pop_block(&space->to_sweep))) { if (nofl_block_is_marked(block)) { - nofl_block_clear_mark(block); nofl_push_block(&to_sweep, block); } else { // Block is empty. 
From 59c9f5dff943f3cf13fa0ae60952a65773e5b5d9 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 25 Aug 2024 08:58:55 +0200 Subject: [PATCH 273/403] Mark blocks that are targets of evacuation --- src/nofl-space.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 118b4e65b..2a657f7d7 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1271,7 +1271,8 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, uint8_t *new_metadata = nofl_metadata_byte_for_object(new_ref); memcpy(new_metadata + 1, metadata + 1, object_granules - 1); gc_edge_update(edge, new_ref); - return nofl_space_set_mark(space, new_metadata, byte); + return nofl_space_set_nonempty_mark(space, new_metadata, byte, + new_ref); } else { // Well shucks; allocation failed, marking the end of // opportunistic evacuation. No future evacuation of this From 59b85abbda4d9004e231b429c45e64914bd3f9fd Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 25 Aug 2024 20:46:05 +0200 Subject: [PATCH 274/403] Fix regarding memset of block marks --- src/nofl-space.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 2a657f7d7..989fc562d 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -258,19 +258,6 @@ nofl_block_set_mark(uintptr_t addr) { memory_order_relaxed); } -static void -nofl_block_clear_mark(uintptr_t addr) { - uintptr_t base = align_down(addr, NOFL_SLAB_SIZE); - struct nofl_slab *slab = (struct nofl_slab *) base; - unsigned block_idx = (addr / NOFL_BLOCK_SIZE) % NOFL_BLOCKS_PER_SLAB; - uint8_t mark_byte = block_idx / 8; - GC_ASSERT(mark_byte < NOFL_HEADER_BYTES_PER_SLAB); - uint8_t mark_mask = 1U << (block_idx % 8); - atomic_fetch_and_explicit(&slab->header.block_marks[mark_byte], - ~mark_mask, - memory_order_relaxed); -} - #define NOFL_GRANULES_PER_BLOCK (NOFL_BLOCK_SIZE / NOFL_GRANULE_SIZE) #define NOFL_GRANULES_PER_REMSET_BYTE \ (NOFL_GRANULES_PER_BLOCK / NOFL_REMSET_BYTES_PER_BLOCK) @@ -934,7 +921,7 @@ static void nofl_space_clear_block_marks(struct nofl_space *space) { for (size_t s = 0; s < space->nslabs; s++) { struct nofl_slab *slab = &space->slabs[s]; - memset(&slab->header.block_marks, 0, NOFL_BLOCKS_PER_SLAB / 8); + memset(slab->header.block_marks, 0, NOFL_BLOCKS_PER_SLAB / 8); } } From c949e4e4a965171cc66618c2917db35892a3f0a2 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 30 Aug 2024 14:12:15 +0200 Subject: [PATCH 275/403] Refactor representation of blocks in nofl space --- src/nofl-space.h | 470 +++++++++++++++++++++++++++-------------------- 1 file changed, 266 insertions(+), 204 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 989fc562d..14dfb12e2 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -65,9 +65,11 @@ STATIC_ASSERT_EQ(sizeof(struct nofl_slab_header), NOFL_HEADER_BYTES_PER_SLAB); // Sometimes we want to put a block on a singly-linked list. For that // there's a pointer reserved in the block summary. But because the // pointer is aligned (32kB on 32-bit, 64kB on 64-bit), we can portably -// hide up to 15 flags in the low bits. These flags can be accessed -// non-atomically by the mutator when it owns a block; otherwise they -// need to be accessed atomically. +// hide up to 15 flags in the low bits. 
These flags are accessed +// non-atomically, in two situations: one, when a block is not on a +// list, which guarantees that no other thread can access it; or when no +// pushing or popping is happening, for example during an evacuation +// cycle. enum nofl_block_summary_flag { NOFL_BLOCK_EVACUATE = 0x1, NOFL_BLOCK_ZERO = 0x2, @@ -99,11 +101,8 @@ struct nofl_block_summary { // but nonzero fragmentation_granules. uint16_t holes_with_fragmentation; uint16_t fragmentation_granules; - // After a block is swept, if it's empty it goes on the empties - // list. Otherwise if it's not immediately used by a mutator (as - // is usually the case), it goes on the swept list. Both of these - // lists use this field. But as the next element in the field is - // block-aligned, we stash flags in the low bits. + // Next pointer, and flags in low bits. See comment above + // regarding enum nofl_block_summary_flag. uintptr_t next_and_flags; }; uint8_t padding[NOFL_SUMMARY_BYTES_PER_BLOCK]; @@ -116,6 +115,11 @@ struct nofl_block { char data[NOFL_BLOCK_SIZE]; }; +struct nofl_block_ref { + struct nofl_block_summary *summary; + uintptr_t addr; +}; + struct nofl_slab { struct nofl_slab_header header; struct nofl_block_summary summaries[NOFL_NONMETA_BLOCKS_PER_SLAB]; @@ -161,7 +165,7 @@ struct nofl_space { struct nofl_allocator { uintptr_t alloc; uintptr_t sweep; - uintptr_t block; + struct nofl_block_ref block; }; // Each granule has one mark byte stored in a side table. A granule's @@ -278,7 +282,7 @@ nofl_block_summary_has_flag(struct nofl_block_summary *summary, static void nofl_block_summary_set_flag(struct nofl_block_summary *summary, - enum nofl_block_summary_flag flag) { + enum nofl_block_summary_flag flag) { summary->next_and_flags |= flag; } @@ -301,29 +305,102 @@ nofl_block_summary_set_next(struct nofl_block_summary *summary, (summary->next_and_flags & (NOFL_BLOCK_SIZE - 1)) | next; } -static void -nofl_push_block(struct nofl_block_list *list, uintptr_t block) { - atomic_fetch_add_explicit(&list->count, 1, memory_order_acq_rel); - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - GC_ASSERT_EQ(nofl_block_summary_next(summary), 0); - uintptr_t next = atomic_load_explicit(&list->blocks, memory_order_acquire); - do { - nofl_block_summary_set_next(summary, next); - } while (!atomic_compare_exchange_weak(&list->blocks, &next, block)); +static struct nofl_block_ref +nofl_block_for_addr(uintptr_t addr) { + return (struct nofl_block_ref) { + nofl_block_summary_for_addr(addr), + align_down(addr, NOFL_BLOCK_SIZE) + }; +} + +static struct nofl_block_ref +nofl_block_null(void) { + return (struct nofl_block_ref) { NULL, 0 }; +} + +static int +nofl_block_is_null(struct nofl_block_ref block) { + return block.summary == NULL; } static uintptr_t -nofl_pop_block(struct nofl_block_list *list) { +nofl_block_has_flag(struct nofl_block_ref block, uintptr_t flags) { + GC_ASSERT(!nofl_block_is_null(block)); + return nofl_block_summary_has_flag(block.summary, flags); +} + +static void +nofl_block_set_flag(struct nofl_block_ref block, uintptr_t flags) { + GC_ASSERT(!nofl_block_is_null(block)); + nofl_block_summary_set_flag(block.summary, flags); +} + +static void +nofl_block_clear_flag(struct nofl_block_ref block, uintptr_t flags) { + GC_ASSERT(!nofl_block_is_null(block)); + nofl_block_summary_clear_flag(block.summary, flags); +} + +static struct nofl_block_ref +nofl_block_next(struct nofl_block_ref block) { + GC_ASSERT(!nofl_block_is_null(block)); + return 
nofl_block_for_addr(nofl_block_summary_next(block.summary)); +} + +static void +nofl_block_set_next(struct nofl_block_ref head, struct nofl_block_ref tail) { + GC_ASSERT(!nofl_block_is_null(head)); + nofl_block_summary_set_next(head.summary, tail.addr); +} + +static int +nofl_allocator_has_block(struct nofl_allocator *alloc) { + return !nofl_block_is_null(alloc->block); +} + +static struct nofl_block_ref +nofl_block_head(struct nofl_block_list *list) { uintptr_t head = atomic_load_explicit(&list->blocks, memory_order_acquire); - struct nofl_block_summary *summary; - uintptr_t next; + if (!head) + return nofl_block_null(); + return (struct nofl_block_ref){ nofl_block_summary_for_addr(head), head }; +} + +static int +nofl_block_compare_and_exchange(struct nofl_block_list *list, + struct nofl_block_ref *expected, + struct nofl_block_ref desired) { + if (atomic_compare_exchange_weak_explicit(&list->blocks, + &expected->addr, + desired.addr, + memory_order_acq_rel, + memory_order_acquire)) + return 1; + + expected->summary = nofl_block_summary_for_addr(expected->addr); + return 0; +} + +static void +nofl_push_block(struct nofl_block_list *list, struct nofl_block_ref block) { + atomic_fetch_add_explicit(&list->count, 1, memory_order_acq_rel); + GC_ASSERT(nofl_block_is_null(nofl_block_next(block))); + struct nofl_block_ref next = nofl_block_head(list); do { - if (!head) - return 0; - summary = nofl_block_summary_for_addr(head); - next = nofl_block_summary_next(summary); - } while (!atomic_compare_exchange_weak(&list->blocks, &head, next)); - nofl_block_summary_set_next(summary, 0); + nofl_block_set_next(block, next); + } while (!nofl_block_compare_and_exchange(list, &next, block)); +} + +static struct nofl_block_ref +nofl_pop_block(struct nofl_block_list *list) { + struct nofl_block_ref head = nofl_block_head(list); + struct nofl_block_ref next; + do { + if (nofl_block_is_null(head)) + return nofl_block_null(); + next = nofl_block_next(head); + } while (!nofl_block_compare_and_exchange(list, &head, next)); + nofl_block_set_next(head, nofl_block_null()); atomic_fetch_sub_explicit(&list->count, 1, memory_order_acq_rel); return head; } @@ -334,35 +411,36 @@ nofl_block_count(struct nofl_block_list *list) { } static void -nofl_push_unavailable_block(struct nofl_space *space, uintptr_t block) { - nofl_block_summary_set_flag(nofl_block_summary_for_addr(block), - NOFL_BLOCK_ZERO | NOFL_BLOCK_UNAVAILABLE); - madvise((void*)block, NOFL_BLOCK_SIZE, MADV_DONTNEED); +nofl_push_unavailable_block(struct nofl_space *space, + struct nofl_block_ref block) { + nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_UNAVAILABLE); + madvise((void*)block.addr, NOFL_BLOCK_SIZE, MADV_DONTNEED); nofl_push_block(&space->unavailable, block); } -static uintptr_t +static struct nofl_block_ref nofl_pop_unavailable_block(struct nofl_space *space) { - uintptr_t block = nofl_pop_block(&space->unavailable); - if (block) - nofl_block_summary_clear_flag(nofl_block_summary_for_addr(block), - NOFL_BLOCK_UNAVAILABLE); + struct nofl_block_ref block = nofl_pop_block(&space->unavailable); + if (!nofl_block_is_null(block)) + nofl_block_clear_flag(block, NOFL_BLOCK_UNAVAILABLE); return block; } static void -nofl_push_empty_block(struct nofl_space *space, uintptr_t block) { +nofl_push_empty_block(struct nofl_space *space, + struct nofl_block_ref block) { nofl_push_block(&space->empty, block); } -static uintptr_t +static struct nofl_block_ref nofl_pop_empty_block(struct nofl_space *space) { return nofl_pop_block(&space->empty); } static int 
nofl_maybe_push_evacuation_target(struct nofl_space *space, - uintptr_t block, double reserve) { + struct nofl_block_ref block, + double reserve) { size_t targets = nofl_block_count(&space->evacuation_targets); size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; size_t unavailable = nofl_block_count(&space->unavailable); @@ -375,14 +453,14 @@ nofl_maybe_push_evacuation_target(struct nofl_space *space, static int nofl_push_evacuation_target_if_needed(struct nofl_space *space, - uintptr_t block) { + struct nofl_block_ref block) { return nofl_maybe_push_evacuation_target(space, block, space->evacuation_minimum_reserve); } static int nofl_push_evacuation_target_if_possible(struct nofl_space *space, - uintptr_t block) { + struct nofl_block_ref block) { return nofl_maybe_push_evacuation_target(space, block, space->evacuation_reserve); } @@ -399,29 +477,36 @@ nofl_space_live_object_granules(uint8_t *metadata) { static void nofl_allocator_reset(struct nofl_allocator *alloc) { - alloc->alloc = alloc->sweep = alloc->block = 0; + alloc->alloc = alloc->sweep = 0; + alloc->block = nofl_block_null(); +} + +static int +nofl_should_promote_block(struct nofl_space *space, + struct nofl_block_ref block) { + // If this block has mostly survivors, we can promote it to the old + // generation. Old-generation blocks won't be used for allocation + // until after the next full GC. + if (!GC_GENERATIONAL) return 0; + size_t threshold = NOFL_GRANULES_PER_BLOCK * space->promotion_threshold; + return block.summary->free_granules < threshold; } static void nofl_allocator_release_full_block(struct nofl_allocator *alloc, struct nofl_space *space) { - GC_ASSERT(alloc->block); + GC_ASSERT(nofl_allocator_has_block(alloc)); + struct nofl_block_ref block = alloc->block; GC_ASSERT(alloc->alloc == alloc->sweep); - struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); atomic_fetch_add(&space->granules_freed_by_last_collection, - summary->free_granules); + block.summary->free_granules); atomic_fetch_add(&space->fragmentation_granules_since_last_collection, - summary->fragmentation_granules); + block.summary->fragmentation_granules); - // If this block has mostly survivors, we should avoid sweeping it and - // trying to allocate into it for a minor GC. Sweep it next time to - // clear any garbage allocated in this cycle and mark it as - // "venerable" (i.e., old). - if (GC_GENERATIONAL && - summary->free_granules < NOFL_GRANULES_PER_BLOCK * space->promotion_threshold) - nofl_push_block(&space->promoted, alloc->block); + if (nofl_should_promote_block(space, block)) + nofl_push_block(&space->promoted, block); else - nofl_push_block(&space->full, alloc->block); + nofl_push_block(&space->full, block); nofl_allocator_reset(alloc); } @@ -429,26 +514,27 @@ nofl_allocator_release_full_block(struct nofl_allocator *alloc, static void nofl_allocator_release_full_evacuation_target(struct nofl_allocator *alloc, struct nofl_space *space) { - GC_ASSERT(alloc->alloc > alloc->block); - GC_ASSERT(alloc->sweep == alloc->block + NOFL_BLOCK_SIZE); + GC_ASSERT(nofl_allocator_has_block(alloc)); + struct nofl_block_ref block = alloc->block; + GC_ASSERT(alloc->alloc > block.addr); + GC_ASSERT(alloc->sweep == block.addr + NOFL_BLOCK_SIZE); size_t hole_size = alloc->sweep - alloc->alloc; - struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); // FIXME: Check how this affects statistics. 
- GC_ASSERT_EQ(summary->hole_count, 1); - GC_ASSERT_EQ(summary->free_granules, NOFL_GRANULES_PER_BLOCK); + GC_ASSERT_EQ(block.summary->hole_count, 1); + GC_ASSERT_EQ(block.summary->free_granules, NOFL_GRANULES_PER_BLOCK); atomic_fetch_add(&space->granules_freed_by_last_collection, NOFL_GRANULES_PER_BLOCK); if (hole_size) { hole_size >>= NOFL_GRANULE_SIZE_LOG_2; - summary->holes_with_fragmentation = 1; - summary->fragmentation_granules = hole_size >> NOFL_GRANULE_SIZE_LOG_2; + block.summary->holes_with_fragmentation = 1; + block.summary->fragmentation_granules = hole_size / NOFL_GRANULE_SIZE; atomic_fetch_add(&space->fragmentation_granules_since_last_collection, - summary->fragmentation_granules); + block.summary->fragmentation_granules); } else { - GC_ASSERT_EQ(summary->fragmentation_granules, 0); - GC_ASSERT_EQ(summary->holes_with_fragmentation, 0); + GC_ASSERT_EQ(block.summary->fragmentation_granules, 0); + GC_ASSERT_EQ(block.summary->holes_with_fragmentation, 0); } - nofl_push_block(&space->old, alloc->block); + nofl_push_block(&space->old, block); nofl_allocator_reset(alloc); } @@ -457,27 +543,28 @@ nofl_allocator_release_partly_full_block(struct nofl_allocator *alloc, struct nofl_space *space) { // A block can go on the partly full list if it has exactly one // hole, located at the end of the block. - GC_ASSERT(alloc->alloc > alloc->block); - GC_ASSERT(alloc->sweep == alloc->block + NOFL_BLOCK_SIZE); + GC_ASSERT(nofl_allocator_has_block(alloc)); + struct nofl_block_ref block = alloc->block; + GC_ASSERT(alloc->alloc > block.addr); + GC_ASSERT(alloc->sweep == block.addr + NOFL_BLOCK_SIZE); size_t hole_size = alloc->sweep - alloc->alloc; GC_ASSERT(hole_size); - struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); - summary->fragmentation_granules = hole_size >> NOFL_GRANULE_SIZE_LOG_2; - nofl_push_block(&space->partly_full, alloc->block); + block.summary->fragmentation_granules = hole_size / NOFL_GRANULE_SIZE; + nofl_push_block(&space->partly_full, block); nofl_allocator_reset(alloc); } static size_t nofl_allocator_acquire_partly_full_block(struct nofl_allocator *alloc, struct nofl_space *space) { - uintptr_t block = nofl_pop_block(&space->partly_full); - if (!block) return 0; - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - GC_ASSERT(summary->holes_with_fragmentation == 0); + struct nofl_block_ref block = nofl_pop_block(&space->partly_full); + if (nofl_block_is_null(block)) + return 0; + GC_ASSERT_EQ(block.summary->holes_with_fragmentation, 0); alloc->block = block; - alloc->sweep = block + NOFL_BLOCK_SIZE; - size_t hole_granules = summary->fragmentation_granules; - summary->fragmentation_granules = 0; + alloc->sweep = block.addr + NOFL_BLOCK_SIZE; + size_t hole_granules = block.summary->fragmentation_granules; + block.summary->fragmentation_granules = 0; alloc->alloc = alloc->sweep - (hole_granules << NOFL_GRANULE_SIZE_LOG_2); return hole_granules; } @@ -485,19 +572,20 @@ nofl_allocator_acquire_partly_full_block(struct nofl_allocator *alloc, static size_t nofl_allocator_acquire_empty_block(struct nofl_allocator *alloc, struct nofl_space *space) { - uintptr_t block = nofl_pop_empty_block(space); - if (!block) return 0; - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - summary->hole_count = 1; - summary->free_granules = NOFL_GRANULES_PER_BLOCK; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; - alloc->block = alloc->alloc = block; - alloc->sweep = block + NOFL_BLOCK_SIZE; - if 
(nofl_block_summary_has_flag(summary, NOFL_BLOCK_ZERO)) - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_ZERO); + struct nofl_block_ref block = nofl_pop_empty_block(space); + if (nofl_block_is_null(block)) + return 0; + block.summary->hole_count = 1; + block.summary->free_granules = NOFL_GRANULES_PER_BLOCK; + block.summary->holes_with_fragmentation = 0; + block.summary->fragmentation_granules = 0; + alloc->block = block; + alloc->alloc = block.addr; + alloc->sweep = block.addr + NOFL_BLOCK_SIZE; + if (nofl_block_has_flag(block, NOFL_BLOCK_ZERO)) + nofl_block_clear_flag(block, NOFL_BLOCK_ZERO); else - nofl_clear_memory(block, NOFL_BLOCK_SIZE); + nofl_clear_memory(block.addr, NOFL_BLOCK_SIZE); return NOFL_GRANULES_PER_BLOCK; } @@ -514,9 +602,8 @@ static void nofl_allocator_finish_hole(struct nofl_allocator *alloc) { size_t granules = (alloc->sweep - alloc->alloc) / NOFL_GRANULE_SIZE; if (granules) { - struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block); - summary->holes_with_fragmentation++; - summary->fragmentation_granules += granules; + alloc->block.summary->holes_with_fragmentation++; + alloc->block.summary->fragmentation_granules += granules; alloc->alloc = alloc->sweep; } } @@ -527,10 +614,10 @@ nofl_allocator_finish_hole(struct nofl_allocator *alloc) { static size_t nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, uintptr_t sweep_mask) { - GC_ASSERT(alloc->block != 0); + GC_ASSERT(nofl_allocator_has_block(alloc)); GC_ASSERT_EQ(alloc->alloc, alloc->sweep); uintptr_t sweep = alloc->sweep; - uintptr_t limit = alloc->block + NOFL_BLOCK_SIZE; + uintptr_t limit = alloc->block.addr + NOFL_BLOCK_SIZE; if (sweep == limit) return 0; @@ -564,10 +651,10 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, memset(metadata, 0, free_granules); memset((char*)sweep, 0, free_bytes); - struct nofl_block_summary *summary = nofl_block_summary_for_addr(sweep); - summary->hole_count++; - GC_ASSERT(free_granules <= NOFL_GRANULES_PER_BLOCK - summary->free_granules); - summary->free_granules += free_granules; + alloc->block.summary->hole_count++; + GC_ASSERT(free_granules <= + NOFL_GRANULES_PER_BLOCK - alloc->block.summary->free_granules); + alloc->block.summary->free_granules += free_granules; alloc->alloc = sweep; alloc->sweep = sweep + free_bytes; @@ -585,10 +672,10 @@ nofl_allocator_finish_sweeping_in_block(struct nofl_allocator *alloc, static void nofl_allocator_release_block(struct nofl_allocator *alloc, struct nofl_space *space) { - GC_ASSERT(alloc->block); + GC_ASSERT(nofl_allocator_has_block(alloc)); if (alloc->alloc < alloc->sweep && - alloc->sweep == alloc->block + NOFL_BLOCK_SIZE && - nofl_block_summary_for_addr(alloc->block)->holes_with_fragmentation == 0) { + alloc->sweep == alloc->block.addr + NOFL_BLOCK_SIZE && + alloc->block.summary->holes_with_fragmentation == 0) { nofl_allocator_release_partly_full_block(alloc, space); } else if (space->evacuating) { nofl_allocator_release_full_evacuation_target(alloc, space); @@ -600,34 +687,19 @@ nofl_allocator_release_block(struct nofl_allocator *alloc, static void nofl_allocator_finish(struct nofl_allocator *alloc, struct nofl_space *space) { - if (alloc->block) + if (nofl_allocator_has_block(alloc)) nofl_allocator_release_block(alloc, space); } -static int -nofl_maybe_release_swept_empty_block(struct nofl_allocator *alloc, - struct nofl_space *space) { - GC_ASSERT(alloc->block); - uintptr_t block = alloc->block; - if (atomic_load_explicit(&space->pending_unavailable_bytes, - 
memory_order_acquire) <= 0) - return 0; - - nofl_push_unavailable_block(space, block); - atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); - nofl_allocator_reset(alloc); - return 1; -} - static int nofl_allocator_acquire_block_to_sweep(struct nofl_allocator *alloc, struct nofl_space *space) { - uintptr_t block = nofl_pop_block(&space->to_sweep); - if (block) { - alloc->block = alloc->alloc = alloc->sweep = block; - return 1; - } - return 0; + struct nofl_block_ref block = nofl_pop_block(&space->to_sweep); + if (nofl_block_is_null(block)) + return 0; + alloc->block = block; + alloc->alloc = alloc->sweep = block.addr; + return 1; } static size_t @@ -636,17 +708,16 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, nofl_allocator_finish_hole(alloc); // Sweep current block for a hole. - if (alloc->block) { + if (nofl_allocator_has_block(alloc)) { size_t granules = nofl_allocator_next_hole_in_block(alloc, space->sweep_mask); if (granules) return granules; else nofl_allocator_release_full_block(alloc, space); + GC_ASSERT(!nofl_allocator_has_block(alloc)); } - GC_ASSERT(alloc->block == 0); - { size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space); if (granules) @@ -654,16 +725,14 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, } while (nofl_allocator_acquire_block_to_sweep(alloc, space)) { - struct nofl_block_summary *summary = - nofl_block_summary_for_addr(alloc->block); // This block was marked in the last GC and needs sweeping. // As we sweep we'll want to record how many bytes were live // at the last collection. As we allocate we'll record how // many granules were wasted because of fragmentation. - summary->hole_count = 0; - summary->free_granules = 0; - summary->holes_with_fragmentation = 0; - summary->fragmentation_granules = 0; + alloc->block.summary->hole_count = 0; + alloc->block.summary->free_granules = 0; + alloc->block.summary->holes_with_fragmentation = 0; + alloc->block.summary->fragmentation_granules = 0; size_t granules = nofl_allocator_next_hole_in_block(alloc, space->sweep_mask); if (granules) @@ -709,7 +778,7 @@ nofl_evacuation_allocate(struct nofl_allocator* alloc, struct nofl_space *space, size_t granules) { size_t avail = (alloc->sweep - alloc->alloc) >> NOFL_GRANULE_SIZE_LOG_2; while (avail < granules) { - if (alloc->block) + if (nofl_allocator_has_block(alloc)) // No need to finish the hole, these mark bytes are zero. nofl_allocator_release_full_evacuation_target(alloc, space); avail = nofl_allocator_acquire_evacuation_target(alloc, space); @@ -830,11 +899,10 @@ nofl_space_fragmentation(struct nofl_space *space) { static void nofl_space_prepare_evacuation(struct nofl_space *space) { GC_ASSERT(!space->evacuating); - { - uintptr_t block; - while ((block = nofl_pop_block(&space->evacuation_targets))) - nofl_push_empty_block(space, block); - } + struct nofl_block_ref block; + while (!nofl_block_is_null + (block = nofl_pop_block(&space->evacuation_targets))) + nofl_push_empty_block(space, block); // Blocks are either to_sweep, empty, or unavailable. 
GC_ASSERT_EQ(nofl_block_count(&space->partly_full), 0); GC_ASSERT_EQ(nofl_block_count(&space->full), 0); @@ -863,15 +931,12 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { const size_t bucket_count = 33; size_t histogram[33] = {0,}; size_t bucket_size = NOFL_GRANULES_PER_BLOCK / 32; - { - uintptr_t block = space->to_sweep.blocks; - while (block) { - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; - size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; - histogram[bucket]++; - block = nofl_block_summary_next(summary); - } + for (struct nofl_block_ref b = nofl_block_for_addr(space->to_sweep.blocks); + !nofl_block_is_null(b); + b = nofl_block_next(b)) { + size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - b.summary->free_granules; + size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; + histogram[bucket]++; } // Now select a number of blocks that is likely to fill the space in @@ -889,19 +954,16 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { // Having selected the number of blocks, now we set the evacuation // candidate flag on all blocks that have live objects. - { - uintptr_t block = space->to_sweep.blocks; - while (block) { - struct nofl_block_summary *summary = nofl_block_summary_for_addr(block); - size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - summary->free_granules; - size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; - if (histogram[bucket]) { - nofl_block_summary_set_flag(summary, NOFL_BLOCK_EVACUATE); - histogram[bucket]--; - } else { - nofl_block_summary_clear_flag(summary, NOFL_BLOCK_EVACUATE); - } - block = nofl_block_summary_next(summary); + for (struct nofl_block_ref b = nofl_block_for_addr(space->to_sweep.blocks); + !nofl_block_is_null(b); + b = nofl_block_next(b)) { + size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - b.summary->free_granules; + size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; + if (histogram[bucket]) { + nofl_block_set_flag(b, NOFL_BLOCK_EVACUATE); + histogram[bucket]--; + } else { + nofl_block_clear_flag(b, NOFL_BLOCK_EVACUATE); } } } @@ -940,16 +1002,16 @@ nofl_space_start_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { // Any block that was the target of allocation in the last cycle will need to // be swept next cycle. 
- uintptr_t block; - while ((block = nofl_pop_block(&space->partly_full))) + struct nofl_block_ref block; + while (!nofl_block_is_null(block = nofl_pop_block(&space->partly_full))) nofl_push_block(&space->to_sweep, block); - while ((block = nofl_pop_block(&space->full))) + while (!nofl_block_is_null(block = nofl_pop_block(&space->full))) nofl_push_block(&space->to_sweep, block); if (gc_kind != GC_COLLECTION_MINOR) { - while ((block = nofl_pop_block(&space->promoted))) + while (!nofl_block_is_null(block = nofl_pop_block(&space->promoted))) nofl_push_block(&space->to_sweep, block); - while ((block = nofl_pop_block(&space->old))) + while (!nofl_block_is_null(block = nofl_pop_block(&space->old))) nofl_push_block(&space->to_sweep, block); } @@ -969,17 +1031,17 @@ nofl_space_finish_evacuation(struct nofl_space *space) { size_t reserve = space->evacuation_minimum_reserve * (total - unavailable); GC_ASSERT(nofl_block_count(&space->evacuation_targets) == 0); while (reserve--) { - uintptr_t block = nofl_pop_block(&space->empty); - if (!block) break; + struct nofl_block_ref block = nofl_pop_block(&space->empty); + if (nofl_block_is_null(block)) break; nofl_push_block(&space->evacuation_targets, block); } } static void nofl_space_promote_blocks(struct nofl_space *space) { - uintptr_t block; - while ((block = nofl_pop_block(&space->promoted))) { - struct nofl_allocator alloc = { block, block, block }; + struct nofl_block_ref block; + while (!nofl_block_is_null(block = nofl_pop_block(&space->promoted))) { + struct nofl_allocator alloc = { block.addr, block.addr, block }; nofl_allocator_finish_sweeping_in_block(&alloc, space->sweep_mask); nofl_push_block(&space->old, block); } @@ -994,13 +1056,14 @@ static void nofl_space_verify_sweepable_blocks(struct nofl_space *space, struct nofl_block_list *list) { - uintptr_t addr = list->blocks; - while (addr) { - struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr); + for (struct nofl_block_ref b = nofl_block_for_addr(list->blocks); + !nofl_block_is_null(b); + b = nofl_block_next(b)) { // Iterate objects in the block, verifying that the END bytes correspond to // the measured object size. + uintptr_t addr = b.addr; uintptr_t limit = addr + NOFL_BLOCK_SIZE; - uint8_t *meta = nofl_metadata_byte_for_addr(addr); + uint8_t *meta = nofl_metadata_byte_for_addr(b.addr); while (addr < limit) { if (meta[0] & space->live_mask) { struct gc_ref obj = gc_ref(addr); @@ -1019,18 +1082,18 @@ nofl_space_verify_sweepable_blocks(struct nofl_space *space, } } GC_ASSERT(addr == limit); - addr = nofl_block_summary_next(summary); } } static void nofl_space_verify_swept_blocks(struct nofl_space *space, struct nofl_block_list *list) { - uintptr_t addr = list->blocks; - while (addr) { - struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr); + for (struct nofl_block_ref b = nofl_block_for_addr(list->blocks); + !nofl_block_is_null(b); + b = nofl_block_next(b)) { // Iterate objects in the block, verifying that the END bytes correspond to // the measured object size. 
+ uintptr_t addr = b.addr; uintptr_t limit = addr + NOFL_BLOCK_SIZE; uint8_t *meta = nofl_metadata_byte_for_addr(addr); while (addr < limit) { @@ -1053,7 +1116,6 @@ nofl_space_verify_swept_blocks(struct nofl_space *space, } } GC_ASSERT(addr == limit); - addr = nofl_block_summary_next(summary); } } @@ -1061,16 +1123,17 @@ static void nofl_space_verify_empty_blocks(struct nofl_space *space, struct nofl_block_list *list, int paged_in) { - uintptr_t addr = list->blocks; - while (addr) { - struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr); + for (struct nofl_block_ref b = nofl_block_for_addr(list->blocks); + !nofl_block_is_null(b); + b = nofl_block_next(b)) { // Iterate objects in the block, verifying that the END bytes correspond to // the measured object size. + uintptr_t addr = b.addr; uintptr_t limit = addr + NOFL_BLOCK_SIZE; uint8_t *meta = nofl_metadata_byte_for_addr(addr); while (addr < limit) { GC_ASSERT_EQ(*meta, 0); - if (paged_in && nofl_block_summary_has_flag(summary, NOFL_BLOCK_ZERO)) { + if (paged_in && nofl_block_has_flag(b, NOFL_BLOCK_ZERO)) { char zeroes[NOFL_GRANULE_SIZE] = { 0, }; GC_ASSERT_EQ(memcmp((char*)addr, zeroes, NOFL_GRANULE_SIZE), 0); } @@ -1078,7 +1141,6 @@ nofl_space_verify_empty_blocks(struct nofl_space *space, addr += NOFL_GRANULE_SIZE; } GC_ASSERT(addr == limit); - addr = nofl_block_summary_next(summary); } } @@ -1118,13 +1180,14 @@ nofl_space_finish_gc(struct nofl_space *space, { struct nofl_block_list to_sweep = {0,}; - uintptr_t block; - while ((block = nofl_pop_block(&space->to_sweep))) { - if (nofl_block_is_marked(block)) { + struct nofl_block_ref block; + while (!nofl_block_is_null(block = nofl_pop_block(&space->to_sweep))) { + if (nofl_block_is_marked(block.addr)) { nofl_push_block(&to_sweep, block); } else { // Block is empty. - memset(nofl_metadata_byte_for_addr(block), 0, NOFL_GRANULES_PER_BLOCK); + memset(nofl_metadata_byte_for_addr(block.addr), 0, + NOFL_GRANULES_PER_BLOCK); if (!nofl_push_evacuation_target_if_possible(space, block)) nofl_push_empty_block(space, block); } @@ -1153,8 +1216,8 @@ nofl_space_reacquire_memory(struct nofl_space *space, size_t bytes) { ssize_t pending = atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; while (pending + NOFL_BLOCK_SIZE <= 0) { - uintptr_t block = nofl_pop_unavailable_block(space); - GC_ASSERT(block); + struct nofl_block_ref block = nofl_pop_unavailable_block(space); + GC_ASSERT(!nofl_block_is_null(block)); if (!nofl_push_evacuation_target_if_needed(space, block)) nofl_push_empty_block(space, block); pending = atomic_fetch_add(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE) @@ -1171,8 +1234,8 @@ nofl_space_sweep_until_memory_released(struct nofl_space *space, // > 0 and other mutators happen to identify empty blocks, they will // be unmapped directly and moved to the unavailable list. while (pending > 0) { - uintptr_t block = nofl_pop_empty_block(space); - if (!block) + struct nofl_block_ref block = nofl_pop_empty_block(space); + if (nofl_block_is_null(block)) break; // Note that we may have competing uses; if we're evacuating, // perhaps we should push this block to the evacuation target list. @@ -1182,7 +1245,8 @@ nofl_space_sweep_until_memory_released(struct nofl_space *space, // the fruits of our labor. Probably this second use-case is more // important. 
nofl_push_unavailable_block(space, block); - pending = atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); + pending = atomic_fetch_sub(&space->pending_unavailable_bytes, + NOFL_BLOCK_SIZE); pending -= NOFL_BLOCK_SIZE; } // Otherwise, sweep, transitioning any empty blocks to unavailable and @@ -1201,9 +1265,8 @@ static inline int nofl_space_should_evacuate(struct nofl_space *space, struct gc_ref obj) { if (!space->evacuating) return 0; - struct nofl_block_summary *summary = - nofl_block_summary_for_addr(gc_ref_value(obj)); - return nofl_block_summary_has_flag(summary, NOFL_BLOCK_EVACUATE); + return nofl_block_has_flag(nofl_block_for_addr(gc_ref_value(obj)), + NOFL_BLOCK_EVACUATE); } static inline int @@ -1389,8 +1452,7 @@ nofl_space_mark_conservative_ref(struct nofl_space *space, return gc_ref_null(); // Addr in block that has been paged out? Not an object. - struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr); - if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE)) + if (nofl_block_has_flag(nofl_block_for_addr(addr), NOFL_BLOCK_UNAVAILABLE)) return gc_ref_null(); uint8_t *loc = nofl_metadata_byte_for_addr(addr); @@ -1486,14 +1548,14 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, for (size_t slab = 0; slab < nslabs; slab++) { for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; + struct nofl_block_ref block = nofl_block_for_addr(addr); + nofl_block_set_flag(block, NOFL_BLOCK_ZERO); if (reserved > size) { - nofl_push_unavailable_block(space, addr); + nofl_push_unavailable_block(space, block); reserved -= NOFL_BLOCK_SIZE; } else { - nofl_block_summary_set_flag(nofl_block_summary_for_addr(addr), - NOFL_BLOCK_ZERO); - if (!nofl_push_evacuation_target_if_needed(space, addr)) - nofl_push_empty_block(space, addr); + if (!nofl_push_evacuation_target_if_needed(space, block)) + nofl_push_empty_block(space, block); } } } From 1a7d08baac5adf53a8742ba41f7360b90565d572 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 30 Aug 2024 15:27:17 +0200 Subject: [PATCH 276/403] Add separate extents header Will be useful to let slab heaps grow. 
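As a rough illustration of how the new header is meant to be used (a standalone sketch, not part of the patch: the `main`, the `malloc`ed stand-in slabs, and the sizes are invented; only `extents_allocate`, `extents_adjoin`, and `extents_contain_addr` come from the `src/extents.h` added below, and building it assumes the tree's include paths):

```
// Hypothetical standalone exercise of the extents API: register two
// discontiguous address ranges and query membership.
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#include "extents.h"

int main(void) {
  struct extents *e = extents_allocate(2);
  char *slab_a = malloc(4 * 1024 * 1024);
  char *slab_b = malloc(4 * 1024 * 1024);
  // extents_adjoin may reallocate (or merge adjacent ranges), so always
  // use its return value.
  e = extents_adjoin(e, slab_a, 4 * 1024 * 1024);
  e = extents_adjoin(e, slab_b, 4 * 1024 * 1024);
  assert(extents_contain_addr(e, (uintptr_t)slab_a));
  assert(extents_contain_addr(e, (uintptr_t)slab_b + 100));
  assert(!extents_contain_addr(e, 0));
  free(slab_a);
  free(slab_b);
  free(e);
  return 0;
}
```

The ranges are kept sorted by `extents_adjoin`, so `extents_contain_addr` can binary-search them; that is what lets a space answer "does this address belong to me?" without assuming its slabs are contiguous.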
--- src/copy-space.h | 21 +++--------- src/extents.h | 88 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 16 deletions(-) create mode 100644 src/extents.h diff --git a/src/copy-space.h b/src/copy-space.h index a863ca51a..e51f9f8ee 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -10,6 +10,7 @@ #include "assert.h" #include "debug.h" +#include "extents.h" #include "gc-align.h" #include "gc-attrs.h" #include "gc-inline.h" @@ -100,11 +101,6 @@ copy_space_object_region(struct gc_ref obj) { return (gc_ref_value(obj) / COPY_SPACE_REGION_SIZE) & 1; } -struct copy_space_extent { - uintptr_t low_addr; - uintptr_t high_addr; -}; - struct copy_space { struct copy_space_block *empty; struct copy_space_block *partly_full; @@ -119,8 +115,7 @@ struct copy_space { uint8_t atomic_forward; size_t allocated_bytes_at_last_gc; size_t fragmentation_at_last_gc; - struct copy_space_extent *extents; - size_t nextents; + struct extents *extents; struct copy_space_slab *slabs; size_t nslabs; }; @@ -542,11 +537,7 @@ copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge, static inline int copy_space_contains(struct copy_space *space, struct gc_ref ref) { - for (size_t i = 0; i < space->nextents; i++) - if (space->extents[i].low_addr <= gc_ref_value(ref) && - gc_ref_value(ref) < space->extents[i].high_addr) - return 1; - return 0; + return extents_contain_addr(space->extents, gc_ref_value(ref)); } static inline void @@ -607,10 +598,8 @@ copy_space_init(struct copy_space *space, size_t size, int atomic) { space->atomic_forward = atomic; space->allocated_bytes_at_last_gc = 0; space->fragmentation_at_last_gc = 0; - space->extents = calloc(1, sizeof(struct copy_space_extent)); - space->extents[0].low_addr = (uintptr_t) slabs; - space->extents[0].high_addr = space->extents[0].low_addr + reserved; - space->nextents = 1; + space->extents = extents_allocate(10); + extents_adjoin(space->extents, slabs, reserved); space->slabs = slabs; space->nslabs = nslabs; for (size_t slab = 0; slab < nslabs; slab++) { diff --git a/src/extents.h b/src/extents.h new file mode 100644 index 000000000..62dba92b9 --- /dev/null +++ b/src/extents.h @@ -0,0 +1,88 @@ +#ifndef EXTENTS_H +#define EXTENTS_H + +#include +#include + +#include "gc-assert.h" + +struct extent_range { + uintptr_t lo_addr; + uintptr_t hi_addr; +}; + +struct extents { + size_t size; + size_t capacity; + struct extent_range ranges[]; +}; + +static inline int +extents_contain_addr(struct extents *extents, uintptr_t addr) { + size_t lo = 0; + size_t hi = extents->size; + while (lo != hi) { + size_t mid = (lo + hi) / 2; + struct extent_range range = extents->ranges[mid]; + if (addr < range.lo_addr) { + hi = mid; + } else if (addr < range.hi_addr) { + return 1; + } else { + lo = mid + 1; + } + } + return 0; +} + +static struct extents* +extents_allocate(size_t capacity) { + size_t byte_size = + sizeof(struct extents) + sizeof(struct extent_range) * capacity; + struct extents *ret = malloc(byte_size); + if (!ret) __builtin_trap(); + memset(ret, 0, byte_size); + ret->capacity = capacity; + return ret; +} + +static struct extents* +extents_insert(struct extents *old, size_t idx, struct extent_range range) { + if (old->size < old->capacity) { + size_t bytes_to_move = sizeof(struct extent_range) * (old->size - idx); + memmove(&old->ranges[idx + 1], &old->ranges[idx], bytes_to_move); + old->ranges[idx] = range; + old->size++; + return old; + } else { + struct extents *new_ = extents_allocate(old->capacity * 2 + 1); + 
memcpy(&new_->ranges[0], &old->ranges[0], + sizeof(struct extent_range) * idx); + memcpy(&new_->ranges[idx + 1], &old->ranges[idx], + sizeof(struct extent_range) * (old->size - idx)); + new_->ranges[idx] = range; + new_->size = old->size + 1; + free(old); + return new_; + } +} + +static struct extents* +extents_adjoin(struct extents *extents, void *lo_addr, size_t size) { + size_t i; + struct extent_range range = { (uintptr_t)lo_addr, (uintptr_t)lo_addr + size }; + for (i = 0; i < extents->size; i++) { + if (range.hi_addr < extents->ranges[i].lo_addr) { + break; + } else if (range.hi_addr == extents->ranges[i].lo_addr) { + extents->ranges[i].lo_addr = range.lo_addr; + return extents; + } else if (range.lo_addr == extents->ranges[i].hi_addr) { + extents->ranges[i].hi_addr = range.hi_addr; + return extents; + } + } + return extents_insert(extents, i, range); +} + +#endif // EXTENTS_H From 8d6db735fdd541c1d2f4730fd3246a8dd45f4ba4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 30 Aug 2024 16:51:30 +0200 Subject: [PATCH 277/403] Nofl space can have discontiguous slabs --- src/nofl-space.h | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 14dfb12e2..3df0960f0 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -13,6 +13,7 @@ #include "assert.h" #include "debug.h" +#include "extents.h" #include "gc-align.h" #include "gc-attrs.h" #include "gc-inline.h" @@ -140,8 +141,7 @@ struct nofl_space { uint8_t live_mask; uint8_t marked_mask; uint8_t evacuating; - uintptr_t low_addr; - size_t extent; + struct extents *extents; size_t heap_size; uint8_t last_collection_was_minor; struct nofl_block_list empty; @@ -156,7 +156,7 @@ struct nofl_space { double evacuation_reserve; double promotion_threshold; ssize_t pending_unavailable_bytes; // atomically - struct nofl_slab *slabs; + struct nofl_slab **slabs; size_t nslabs; uintptr_t granules_freed_by_last_collection; // atomically uintptr_t fragmentation_granules_since_last_collection; // atomically @@ -847,7 +847,7 @@ nofl_space_trace_remembered_set(struct nofl_space *space, struct gc_heap *heap) { GC_ASSERT(!space->evacuating); for (size_t s = 0; s < space->nslabs; s++) { - struct nofl_slab *slab = &space->slabs[s]; + struct nofl_slab *slab = space->slabs[s]; uint8_t *remset = slab->remembered_set; for (size_t card_base = 0; card_base < NOFL_REMSET_BYTES_PER_SLAB; @@ -868,9 +868,8 @@ nofl_space_trace_remembered_set(struct nofl_space *space, static void nofl_space_clear_remembered_set(struct nofl_space *space) { if (!GC_GENERATIONAL) return; - // FIXME: Don't assume slabs are contiguous. 
for (size_t slab = 0; slab < space->nslabs; slab++) { - memset(space->slabs[slab].remembered_set, 0, NOFL_REMSET_BYTES_PER_SLAB); + memset(space->slabs[slab]->remembered_set, 0, NOFL_REMSET_BYTES_PER_SLAB); } } @@ -982,7 +981,7 @@ nofl_space_update_mark_patterns(struct nofl_space *space, static void nofl_space_clear_block_marks(struct nofl_space *space) { for (size_t s = 0; s < space->nslabs; s++) { - struct nofl_slab *slab = &space->slabs[s]; + struct nofl_slab *slab = space->slabs[s]; memset(slab->header.block_marks, 0, NOFL_BLOCKS_PER_SLAB / 8); } } @@ -1373,7 +1372,7 @@ nofl_space_evacuate_or_mark_object(struct nofl_space *space, static inline int nofl_space_contains_address(struct nofl_space *space, uintptr_t addr) { - return addr - space->low_addr < space->extent; + return extents_contain_addr(space->extents, addr); } static inline int @@ -1526,6 +1525,20 @@ nofl_allocate_slabs(size_t nslabs) { return (struct nofl_slab*) aligned_base; } +static void +nofl_space_add_slabs(struct nofl_space *space, struct nofl_slab *slabs, + size_t nslabs) { + size_t old_size = space->nslabs * sizeof(struct nofl_slab*); + size_t additional_size = nslabs * sizeof(struct nofl_slab*); + space->extents = extents_adjoin(space->extents, slabs, + nslabs * sizeof(struct nofl_slab)); + space->slabs = realloc(space->slabs, old_size + additional_size); + if (!space->slabs) + GC_CRASH(); + while (nslabs--) + space->slabs[space->nslabs++] = slabs++; +} + static int nofl_space_init(struct nofl_space *space, size_t size, int atomic, double promotion_threshold) { @@ -1538,10 +1551,8 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, space->marked_mask = NOFL_METADATA_BYTE_MARK_0; nofl_space_update_mark_patterns(space, 0); - space->slabs = slabs; - space->nslabs = nslabs; - space->low_addr = (uintptr_t) slabs; - space->extent = reserved; + space->extents = extents_allocate(10); + nofl_space_add_slabs(space, slabs, nslabs); space->evacuation_minimum_reserve = 0.02; space->evacuation_reserve = space->evacuation_minimum_reserve; space->promotion_threshold = promotion_threshold; From cf129f10defd844cb4d15538890c7c180caa74a3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 30 Aug 2024 21:19:51 +0200 Subject: [PATCH 278/403] nofl: Block marks are bytes There was no need to use a bitvector, and the marks were only being partially cleared. More straightforward (and still low overhead) as bytes. 
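For intuition, here is a minimal sketch of the byte-per-block scheme (not the patch's code: the constants, struct, and `main` are stand-ins; the real slab layout and the relaxed-atomic loads and stores are in `src/nofl-space.h`):

```
// One mark byte per block: marking is a plain store, testing is a plain
// load, and clearing a slab's marks is a single memset over the array.
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define SLAB_SIZE (4 * 1024 * 1024)              /* assumed, for the sketch */
#define BLOCK_SIZE (64 * 1024)                   /* assumed, for the sketch */
#define BLOCKS_PER_SLAB (SLAB_SIZE / BLOCK_SIZE)

struct slab_header { uint8_t block_marks[BLOCKS_PER_SLAB]; };

static uint8_t *block_mark_loc(struct slab_header *header, uintptr_t offset) {
  return &header->block_marks[(offset / BLOCK_SIZE) % BLOCKS_PER_SLAB];
}

int main(void) {
  struct slab_header header;
  memset(header.block_marks, 0, sizeof header.block_marks);  // full clear
  *block_mark_loc(&header, 3 * BLOCK_SIZE + 128) = 1;        // mark block 3
  assert(header.block_marks[3] == 1);
  assert(header.block_marks[2] == 0);
  return 0;
}
```

Compared to the bitvector, this spends BLOCKS_PER_SLAB bytes instead of BLOCKS_PER_SLAB / 8, but stores need no read-modify-write, and a memset over the whole array clears every mark rather than only part of them.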
--- src/nofl-space.h | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 3df0960f0..037dff0d2 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -236,30 +236,24 @@ nofl_metadata_byte_for_object(struct gc_ref ref) { return nofl_metadata_byte_for_addr(gc_ref_value(ref)); } -static int -nofl_block_is_marked(uintptr_t addr) { +static uint8_t* +nofl_block_mark_loc(uintptr_t addr) { uintptr_t base = align_down(addr, NOFL_SLAB_SIZE); struct nofl_slab *slab = (struct nofl_slab *) base; unsigned block_idx = (addr / NOFL_BLOCK_SIZE) % NOFL_BLOCKS_PER_SLAB; - uint8_t mark_byte = block_idx / 8; - GC_ASSERT(mark_byte < NOFL_HEADER_BYTES_PER_SLAB); - uint8_t mark_mask = 1U << (block_idx % 8); - uint8_t byte = atomic_load_explicit(&slab->header.block_marks[mark_byte], - memory_order_relaxed); - return byte & mark_mask; + return &slab->header.block_marks[block_idx]; +} + +static int +nofl_block_is_marked(uintptr_t addr) { + return atomic_load_explicit(nofl_block_mark_loc(addr), memory_order_relaxed); } static void nofl_block_set_mark(uintptr_t addr) { - uintptr_t base = align_down(addr, NOFL_SLAB_SIZE); - struct nofl_slab *slab = (struct nofl_slab *) base; - unsigned block_idx = (addr / NOFL_BLOCK_SIZE) % NOFL_BLOCKS_PER_SLAB; - uint8_t mark_byte = block_idx / 8; - GC_ASSERT(mark_byte < NOFL_HEADER_BYTES_PER_SLAB); - uint8_t mark_mask = 1U << (block_idx % 8); - atomic_fetch_or_explicit(&slab->header.block_marks[mark_byte], - mark_mask, - memory_order_relaxed); + uint8_t *loc = nofl_block_mark_loc(addr); + if (!atomic_load_explicit(loc, memory_order_relaxed)) + atomic_store_explicit(loc, 1, memory_order_relaxed); } #define NOFL_GRANULES_PER_BLOCK (NOFL_BLOCK_SIZE / NOFL_GRANULE_SIZE) @@ -982,7 +976,7 @@ static void nofl_space_clear_block_marks(struct nofl_space *space) { for (size_t s = 0; s < space->nslabs; s++) { struct nofl_slab *slab = space->slabs[s]; - memset(slab->header.block_marks, 0, NOFL_BLOCKS_PER_SLAB / 8); + memset(slab->header.block_marks, 0, sizeof(slab->header.block_marks)); } } @@ -1282,8 +1276,7 @@ static inline int nofl_space_set_nonempty_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte, struct gc_ref ref) { nofl_space_set_mark(space, metadata, byte); - if (!nofl_block_is_marked(gc_ref_value(ref))) - nofl_block_set_mark(gc_ref_value(ref)); + nofl_block_set_mark(gc_ref_value(ref)); return 1; } From 44a7240e16de8fd906bcef5ddb6d329800d420db Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 2 Sep 2024 13:18:07 +0200 Subject: [PATCH 279/403] Rename "whippet" collector to "mmc": mostly marking collector --- Makefile | 50 ++++++++-------- README.md | 57 +++++++++++-------- api/{whippet-attrs.h => mmc-attrs.h} | 6 +- benchmarks/README.md | 6 +- ...{collector-whippet.md => collector-mmc.md} | 49 +++++++--------- doc/collectors.md | 17 +++--- doc/guile.md | 2 +- doc/manual.md | 42 ++++++++------ embed.mk | 26 ++++----- src/{whippet.c => mmc.c} | 2 +- 10 files changed, 133 insertions(+), 124 deletions(-) rename api/{whippet-attrs.h => mmc-attrs.h} (95%) rename doc/{collector-whippet.md => collector-mmc.md} (80%) rename src/{whippet.c => mmc.c} (99%) diff --git a/Makefile b/Makefile index 56c7325c4..2a1fded30 100644 --- a/Makefile +++ b/Makefile @@ -5,21 +5,21 @@ COLLECTORS = \ scc \ pcc \ \ - whippet \ - stack-conservative-whippet \ - heap-conservative-whippet \ + mmc \ + stack-conservative-mmc \ + heap-conservative-mmc \ \ - parallel-whippet \ - stack-conservative-parallel-whippet \ - 
heap-conservative-parallel-whippet \ + parallel-mmc \ + stack-conservative-parallel-mmc \ + heap-conservative-parallel-mmc \ \ - generational-whippet \ - stack-conservative-generational-whippet \ - heap-conservative-generational-whippet \ + generational-mmc \ + stack-conservative-generational-mmc \ + heap-conservative-generational-mmc \ \ - parallel-generational-whippet \ - stack-conservative-parallel-generational-whippet \ - heap-conservative-parallel-generational-whippet + parallel-generational-mmc \ + stack-conservative-parallel-generational-mmc \ + heap-conservative-parallel-generational-mmc DEFAULT_BUILD := opt @@ -70,28 +70,28 @@ GC_CFLAGS_scc = -DGC_PRECISE_ROOTS=1 GC_STEM_pcc = pcc GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 -define whippet_variant -GC_STEM_$(1) = whippet +define mmc_variant +GC_STEM_$(1) = mmc GC_CFLAGS_$(1) = $(2) endef -define generational_whippet_variants -$(call whippet_variant,$(1)whippet,$(2)) -$(call whippet_variant,$(1)generational_whippet,$(2) -DGC_GENERATIONAL=1) +define generational_mmc_variants +$(call mmc_variant,$(1)mmc,$(2)) +$(call mmc_variant,$(1)generational_mmc,$(2) -DGC_GENERATIONAL=1) endef -define parallel_whippet_variants -$(call generational_whippet_variants,$(1),$(2)) -$(call generational_whippet_variants,$(1)parallel_,$(2) -DGC_PARALLEL=1) +define parallel_mmc_variants +$(call generational_mmc_variants,$(1),$(2)) +$(call generational_mmc_variants,$(1)parallel_,$(2) -DGC_PARALLEL=1) endef -define trace_whippet_variants -$(call parallel_whippet_variants,,-DGC_PRECISE_ROOTS=1) -$(call parallel_whippet_variants,stack_conservative_,-DGC_CONSERVATIVE_ROOTS=1) -$(call parallel_whippet_variants,heap_conservative_,-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1) +define trace_mmc_variants +$(call parallel_mmc_variants,,-DGC_PRECISE_ROOTS=1) +$(call parallel_mmc_variants,stack_conservative_,-DGC_CONSERVATIVE_ROOTS=1) +$(call parallel_mmc_variants,heap_conservative_,-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1) endef -$(eval $(call trace_whippet_variants)) +$(eval $(call trace_mmc_variants)) # $(1) is the benchmark, $(2) is the collector configuration make_gc_var = $$($(1)$(subst -,_,$(2))) diff --git a/README.md b/README.md index c922af64d..601646cc5 100644 --- a/README.md +++ b/README.md @@ -12,50 +12,59 @@ allocation, and provides a number of implementations of that API. See the [documentation](./doc/README.md). +## Features + + - Per-object pinning (with `mmc` collectors) + - Finalization (supporting resuscitation) + - Ephemerons (except on `bdw`, which has a polyfill) + - Conservative roots (optionally with `mmc` or always with `bdw`) + - Precise roots (optionally with `mmc` or always with `semi` / `pcc` / + `scc`) + - Precise embedder-parameterized heap tracing (except with `bdw`) + - Conservative heap tracing (optionally with `mmc`, always with `bdw`) + - Parallel tracing (except `semi` and `scc`) + - Parallel mutators (except `semi`) + - Inline allocation / write barrier fast paths (supporting JIT) + - One unified API with no-overhead abstraction: switch collectors when + you like + ## Source repository structure * [api/](./api/): The user-facing API. Also, the "embedder API"; see the [manual](./doc/manual.md) for more. * [doc/](./doc/): Documentation, such as it is. - * [src/](./src/): The actual GC implementation. 
The specific - implementations of the Whippet API are [`semi.c`](./src/semi.c), a - semi-space collector; [`bdw.c`](./src/bdw.c), the third-party - [BDW-GC](https://github.com/ivmai/bdwgc) conservative parallel - stop-the-world mark-sweep segregated-fits collector with lazy - sweeping; and [`whippet.c`](./src/whippet.c), the whippet collector. + * [src/](./src/): The actual GC implementation, containing a number of + collector implementations. The embedder chooses which collector to + use at compile-time. See the [documentation](./doc/collectors.md) + for more on the different collectors (`semi`, `bdw`, `scc`, `pcc`, + and the different flavors of `mmc`). * [benchmarks/](./benchmarks/): Benchmarks. A work in progress. * [test/](./test/): A dusty attic of minimal testing. -## To do +## Status and roadmap -### Missing features before Guile can use Whippet +As of September 2024, Whippet is almost feature-complete. The main +missing feature is dynamic heap growth and shrinkage +(https://github.com/wingo/whippet/issues/5), which should land soon. - - [X] Pinning - - [X] Conservative stacks - - [X] Conservative data segments - - [ ] Heap growth/shrinking - - [ ] Debugging/tracing - - [X] Finalizers - - [X] Weak references / weak maps +After that, the next phase on the roadmap is support for tracing, and +some performance noodling. -### Features that would improve Whippet performance - - - [X] Immix-style opportunistic evacuation - - ~~[ ] Overflow allocation~~ (should just evacuate instead) - - [X] Generational GC via sticky mark bits - - [ ] Generational GC with semi-space nursery - - [ ] Concurrent marking with SATB barrier +Once that is done, the big task is integrating Whippet into the [Guile +Scheme](https://gnu.org/s/guile) language run-time, replacing BDW-GC. +Fingers crossed! ## About the name It sounds better than WIP (work-in-progress) garbage collector, doesn't it? Also apparently a whippet is a kind of dog that is fast for its -size. It would be nice if whippet-gc turns out to have this property. +size. It would be nice if the Whippet collectors turn out to have this +property. ## License ``` -Copyright (c) 2022-2023 Andy Wingo +Copyright (c) 2022-2024 Andy Wingo Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/api/whippet-attrs.h b/api/mmc-attrs.h similarity index 95% rename from api/whippet-attrs.h rename to api/mmc-attrs.h index e6e5b22b9..111b2512c 100644 --- a/api/whippet-attrs.h +++ b/api/mmc-attrs.h @@ -1,5 +1,5 @@ -#ifndef WHIPPET_ATTRS_H -#define WHIPPET_ATTRS_H +#ifndef MMC_ATTRS_H +#define MMC_ATTRS_H #include "gc-config.h" #include "gc-assert.h" @@ -61,4 +61,4 @@ static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; } -#endif // WHIPPET_ATTRS_H +#endif // MMC_ATTRS_H diff --git a/benchmarks/README.md b/benchmarks/README.md index 1a9f1ac87..00ec1f731 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -7,14 +7,14 @@ threads. We analytically compute the peak amount of live data and then size the GC heap as a multiplier of that size. It has a peak heap consumption of 10 MB or so per mutator thread: not very large. 
- At a 2x heap multiplier, it causes about 30 collections for the - whippet collector, and runs somewhere around 200-400 milliseconds in + At a 2x heap multiplier, it causes about 30 collections for the `mmc` + collector, and runs somewhere around 200-400 milliseconds in single-threaded mode, on the machines I have in 2022. For low thread counts, the GCBench benchmark is small; but then again many Guile processes also are quite short-lived, so perhaps it is useful to ensure that small heaps remain lightweight. - To stress Whippet's handling of fragmentation, we modified this + To stress `mmc`'s handling of fragmentation, we modified this benchmark to intersperse pseudorandomly-sized holes between tree nodes. diff --git a/doc/collector-whippet.md b/doc/collector-mmc.md similarity index 80% rename from doc/collector-whippet.md rename to doc/collector-mmc.md index 23fb2a1bb..e88b1409d 100644 --- a/doc/collector-whippet.md +++ b/doc/collector-mmc.md @@ -1,19 +1,14 @@ -# Whippet collector +# Mostly-copying collector -One collector implementation in the Whippet garbage collection library -is also called Whippet. Naming-wise this is a somewhat confusing -situation; perhaps it will change. - -Anyway, the `whippet` collector is mainly a mark-region collector, -inspired by +The `mmc` collector is mainly a mark-region collector, inspired by [Immix](http://users.cecs.anu.edu.au/~steveb/pubs/papers/immix-pldi-2008.pdf). -To a first approximation, Whippet is a whole-heap Immix collector with a +To a first approximation, `mmc` is a whole-heap Immix collector with a large object space on the side. -When tracing, `whippet` mostly marks objects in place. If the heap is +When tracing, `mmc` mostly marks objects in place. If the heap is too fragmented, it can compact the heap by choosing to evacuate sparsely-populated heap blocks instead of marking in place. However -evacuation is strictly optional, which means that `whippet` is also +evacuation is strictly optional, which means that `mmc` is also compatible with conservative root-finding, making it a good replacement for embedders that currently use the [Boehm-Demers-Weiser collector](./collector-bdw.md). @@ -33,7 +28,7 @@ recycled block from the global block store, it allocates into those holes. For an exposition of Immix, see the lovely detailed [Rust implementation](http://users.cecs.anu.edu.au/~steveb/pubs/papers/rust-ismm-2016.pdf). -The essential difference of `whippet` from Immix stems from a simple +The essential difference of `mmc` from Immix stems from a simple observation: Immix needs a side table of line mark bytes and also a mark bit or bits in each object (or in a side table). But if instead you choose to store mark bytes instead of bits (for concurrency reasons) in @@ -54,13 +49,13 @@ just read the mark table and can avoid looking at object memory. ## Optional features -The `whippet` collector has a few feature flags that can be turned on or +The `mmc` collector has a few feature flags that can be turned on or off. 
If you use the [standard embedder makefile include](../embed.mk), -then there is a name for each combination of features: `whippet` has no -additional features, `parallel-whippet` enables parallel marking, -`parallel-generational-whippet` enables generations, -`stack-conservative-parallel-generational-whippet` uses conservative -root-finding, and `heap-conservative-parallel-generational-whippet` +then there is a name for each combination of features: `mmc` has no +additional features, `parallel-mmc` enables parallel marking, +`parallel-generational-mmc` enables generations, +`stack-conservative-parallel-generational-mmc` uses conservative +root-finding, and `heap-conservative-parallel-generational-mmc` additionally traces the heap conservatively. You can leave off components of the name to get a collector without those features. Underneath this corresponds to some pre-processor definitions passed to @@ -68,7 +63,7 @@ the compiler on the command line. ### Generations -Whippet supports generational tracing via the [sticky mark-bit +`mmc` supports generational tracing via the [sticky mark-bit algorithm](https://wingolog.org/archives/2022/10/22/the-sticky-mark-bit-algorithm). This requires that the embedder emit [write barriers](https://github.com/wingo/whippet/blob/main/doc/manual.md#write-barriers); @@ -84,7 +79,7 @@ two-megabyte aligned slabs. ### Parallel tracing -You almost certainly want this on! `parallel-whippet` uses a the +You almost certainly want this on! `parallel-mmc` uses a the [fine-grained work-stealing parallel tracer](../src/parallel-tracer.h). Each trace worker maintains a [local queue of objects that need tracing](../src/local-worklist.h), which currently has a capacity of @@ -96,17 +91,17 @@ then will try to steal from other workers. The memory used for the external worklist is dynamically allocated from the OS and is not currently counted as contributing to the heap size. -If you absolutely need to avoid dynamic allocation during GC, `whippet` -(even serial whippet) would need some work for your use case, to -allocate a fixed-size space for a marking queue and to gracefully handle -mark queue overflow. +If you absolutely need to avoid dynamic allocation during GC, `mmc` +(even `serial-mmc`) would need some work for your use case, to allocate +a fixed-size space for a marking queue and to gracefully handle mark +queue overflow. ### Conservative stack scanning With `semi` and `pcc`, embedders must precisely enumerate the set of *roots*: the edges into the heap from outside. Commonly, roots include global variables, as well as working variables from each mutator's -stack. Whippet can optionally mark mutator stacks *conservatively*: +stack. `mmc` can optionally mark mutator stacks *conservatively*: treating each word on the stack as if it may be an object reference, and marking any object at that address. @@ -124,7 +119,7 @@ place roots in traceable locations published to the garbage collector. And the [performance question is still open](https://dl.acm.org/doi/10.1145/2660193.2660198). -Anyway. Whippet can scan roots conservatively. Those roots are pinned +Anyway. `mmc` can scan roots conservatively. Those roots are pinned for the collection; even if the collection will compact via evacuation, referents of conservative roots won't be moved. Objects not directly referenced by roots can be evacuated, however. @@ -133,14 +128,14 @@ referenced by roots can be evacuated, however. 
In addition to stack and global references, the Boehm-Demers-Weiser collector scans heap objects conservatively as well, treating each word -of each heap object as if it were a reference. Whippet can do that, if +of each heap object as if it were a reference. `mmc` can do that, if the embedder is unable to provide a `gc_trace_object` implementation. However this is generally a performance lose, and it prevents evacuation. ## Other implementation tidbits -`whippet` does lazy sweeping: as a mutator grabs a fresh block, it +`mmc` does lazy sweeping: as a mutator grabs a fresh block, it reclaims memory that was unmarked in the previous collection before making the memory available for allocation. This makes sweeping naturally cache-friendly and parallel. diff --git a/doc/collectors.md b/doc/collectors.md index cdf4dcb8b..1c23f3e9d 100644 --- a/doc/collectors.md +++ b/doc/collectors.md @@ -7,7 +7,7 @@ Whippet has five collectors currently: but with support for multiple mutator threads. - [Parallel copying collector (`pcc`)](./collector-pcc.md): Like `scc`, but with support for multiple tracing threads. - - [Whippet collector (`whippet`)](./collector-whippet.md): + - [Mostly marking collector (`mmc`)](./collector-mmc.md): Immix-inspired collector. Optionally parallel, conservative (stack and/or heap), and/or generational. - [Boehm-Demers-Weiser collector (`bdw`)](./collector-bdw.md): @@ -17,11 +17,11 @@ Whippet has five collectors currently: ## How to choose? If you are migrating an embedder off BDW-GC, then it could be reasonable -to first go to `bdw`, then `stack-conservative-parallel-whippet`. +to first go to `bdw`, then `stack-conservative-parallel-mmc`. If you have an embedder with precise roots, use `pcc`. That will shake out mutator/embedder bugs. Then if memory is tight, switch to -`parallel-whippet`, possibly `parallel-generational-whippet`. +`parallel-mmc`, possibly `parallel-generational-mmc`. If you are aiming for maximum simplicity and minimal code size (ten kilobytes or so), use `semi`. @@ -30,17 +30,16 @@ Only use `scc` if you are investigating GC internals. If you are writing a new project, you have a choice as to whether to pay the development cost of precise roots or not. If you choose to not have -precise roots, then go for `stack-conservative-parallel-whippet` -directly. +precise roots, then go for `stack-conservative-parallel-mmc` directly. ## More collectors It would be nice to have a classic generational GC, perhaps using -parallel-whippet for the old generation but a pcc-style copying nursery. +`parallel-mmc` for the old generation but a pcc-style copying nursery. -Support for concurrent marking in `whippet` would be good as well, -perhaps with a SATB barrier. (Or, if you are the sort of person to bet -on conservative stack scanning, perhaps a retreating-wavefront barrier +Support for concurrent marking in `mmc` would be good as well, perhaps +with a SATB barrier. (Or, if you are the sort of person to bet on +conservative stack scanning, perhaps a retreating-wavefront barrier would be more appropriate.) Contributions are welcome, provided they have no more dependencies! diff --git a/doc/guile.md b/doc/guile.md index 05bc17e15..12bdb97fc 100644 --- a/doc/guile.md +++ b/doc/guile.md @@ -1,6 +1,6 @@ # Whippet and Guile -If the Whippet collector works out, it could replace Guile's garbage +If the `mmc` collector works out, it could replace Guile's garbage collector. Guile currently uses BDW-GC. Guile has a widely used C API and implements part of its run-time in C. 
For this reason it may be infeasible to require precise enumeration of GC roots -- we may need to diff --git a/doc/manual.md b/doc/manual.md index 73aa537eb..d856df219 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -21,7 +21,7 @@ for full details, but for a cheat sheet, you might do something like this to copy Whippet into the `whippet/` directory of your project root: ``` -git remote add whippet https://github.com/wingo/whippet-gc +git remote add whippet https://github.com/wingo/whippet git fetch whippet git merge -s ours --no-commit --allow-unrelated-histories whippet/main git read-tree --prefix=whippet/ -u whippet/main @@ -92,7 +92,7 @@ that all "large" or potentially large objects have a flag bit reserved for use of the garbage collector. A large object is one whose size exceeds the `gc_allocator_large_threshold()` (see [`gc-attrs.h`](../api/gc-attrs.h)), which is a collector-specific value. -Currently the only generational collector is the in-place Whippet +Currently the only generational collector is the in-place `mmc` collector, whose large object threshold is 4096 bytes. The `gc_object_set_remembered`, `gc_object_is_remembered_nonatomic`, and `gc_object_clear_remembered_nonatomic` embedder functions manage the @@ -187,15 +187,10 @@ implementations of that API: `semi`, a simple semi-space collector; `pcc`, a parallel copying collector (like semi but multithreaded); `bdw`, an implementation via the third-party [Boehm-Demers-Weiser](https://github.com/ivmai/bdwgc) conservative -collector; and `whippet`, an Immix-like collector. - -There is a bit of name overloading between the Whippet abstract API, the -collection of GC implementations, and the specific Whippet collector; -our apologies. It's just like that, and we hope to make the usage -obvious from context. +collector; and `mmc`, a mostly-marking collector inspired by Immix. The program that embeds Whippet selects the collector implementation at -build-time. In the case of the specific Whippet collector, the program +build-time. In the case of the `mmc` collector, the program also configures a specific collector mode, again at build-time: generational or not, parallel or not, stack-conservative or not, and heap-conservative or not. It may be nice in the future to be able to @@ -353,15 +348,26 @@ $(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 \ -include foo-embedder.h -o gc.o -c bdw.c ``` -#### Building `whippet` +#### Building `pcc` -Finally, there is the whippet collector. It can collect roots precisely -or conservatively, trace precisely or conservatively, be parallel or -not, and be generational or not. +The parallel copying collector is like `semi` but better in every way: +it supports multiple mutator threads, and evacuates in parallel if +multiple threads are available. + +``` +$(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 \ + -include foo-embedder.h -o gc.o -c pcc.c +``` + +#### Building `mmc` + +Finally, there is the mostly-marking collector. It can collect roots +precisely or conservatively, trace precisely or conservatively, be +parallel or not, and be generational or not. ``` $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \ - -include foo-embedder.h -o gc.o -c whippet.c + -include foo-embedder.h -o gc.o -c mvv.c ``` ### Compiling your program @@ -370,12 +376,12 @@ Any compilation unit that uses the GC API should have the same set of compile-time options defined as when compiling the collector. 
Additionally those compilation units should include the "attributes" header for the collector in question, namely `semi-attrs.h`, -`bdw-attrs.h`, or `whippet-attrs.h`. For example, for parallel -generational whippet, you might have: +`bdw-attrs.h`, `pcc-attrs.h`, or `mmc-attrs.h`. For example, for +parallel generational mmc, you might have: ``` $(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \ - -include whippet-attrs.h -o my-program.o -c my-program.c + -include mmc-attrs.h -o my-program.o -c my-program.c ``` ### Linking the collector into your program @@ -462,7 +468,7 @@ defined for all collectors: You can set these options via `gc_option_set_int` and so on; see [`gc-options.h`](../api/gc-options.h). Or, you can parse options from -strings: `heap-size-policy`, `heap-size`, `maximum-heap-size`, and so +trings: `heap-size-policy`, `heap-size`, `maximum-heap-size`, and so on. Use `gc_option_from_string` to determine if a string is really an option. Use `gc_option_parse_and_set` to parse a value for an option. Use `gc_options_parse_and_set_many` to parse a number of comma-delimited diff --git a/embed.mk b/embed.mk index 020cb10d3..fef8de8b1 100644 --- a/embed.mk +++ b/embed.mk @@ -48,28 +48,28 @@ GC_CFLAGS_scc = -DGC_PRECISE_ROOTS=1 GC_STEM_pcc = pcc GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 -define whippet_variant -GC_STEM_$(1) = whippet +define mmc_variant +GC_STEM_$(1) = mmc GC_CFLAGS_$(1) = $(2) endef -define generational_whippet_variants -$(call whippet_variant,$(1)whippet,$(2)) -$(call whippet_variant,$(1)generational_whippet,$(2) -DGC_GENERATIONAL=1) +define generational_mmc_variants +$(call mmc_variant,$(1)mmc,$(2)) +$(call mmc_variant,$(1)generational_mmc,$(2) -DGC_GENERATIONAL=1) endef -define parallel_whippet_variants -$(call generational_whippet_variants,$(1),$(2)) -$(call generational_whippet_variants,$(1)parallel_,$(2) -DGC_PARALLEL=1) +define parallel_mmc_variants +$(call generational_mmc_variants,$(1),$(2)) +$(call generational_mmc_variants,$(1)parallel_,$(2) -DGC_PARALLEL=1) endef -define trace_whippet_variants -$(call parallel_whippet_variants,,-DGC_PRECISE_ROOTS=1) -$(call parallel_whippet_variants,stack_conservative_,-DGC_CONSERVATIVE_ROOTS=1) -$(call parallel_whippet_variants,heap_conservative_,-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1) +define trace_mmc_variants +$(call parallel_mmc_variants,,-DGC_PRECISE_ROOTS=1) +$(call parallel_mmc_variants,stack_conservative_,-DGC_CONSERVATIVE_ROOTS=1) +$(call parallel_mmc_variants,heap_conservative_,-DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1) endef -$(eval $(call trace_whippet_variants)) +$(eval $(call trace_mmc_variants)) gc_var = $($(1)$(subst -,_,$(2))) gc_impl = $(call gc_var,GC_STEM_,$(1)).c diff --git a/src/whippet.c b/src/mmc.c similarity index 99% rename from src/whippet.c rename to src/mmc.c index 05597b4f8..f33ed4509 100644 --- a/src/whippet.c +++ b/src/mmc.c @@ -26,7 +26,7 @@ #include "serial-tracer.h" #endif #include "spin.h" -#include "whippet-attrs.h" +#include "mmc-attrs.h" #define LARGE_OBJECT_THRESHOLD 8192 From 519949edf38bbfa11b3ff9c543473ac8922eb858 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 2 Sep 2024 13:24:38 +0200 Subject: [PATCH 280/403] Update .gitignore --- .gitignore | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 231806c14..507595694 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1,16 @@ /*.o /*.bdw /*.semi -/*.whippet -/*.generational-whippet -/*.parallel-whippet 
-/*.parallel-generational-whippet -/*.stack-conservative-whippet -/*.stack-conservative-generational-whippet -/*.stack-conservative-parallel-whippet -/*.stack-conservative-parallel-generational-whippet -/*.heap-conservative-whippet -/*.heap-conservative-generational-whippet -/*.heap-conservative-parallel-whippet -/*.heap-conservative-parallel-generational-whippet +/*.mmc +/*.generational-mmc +/*.parallel-mmc +/*.parallel-generational-mmc +/*.stack-conservative-mmc +/*.stack-conservative-generational-mmc +/*.stack-conservative-parallel-mmc +/*.stack-conservative-parallel-generational-mmc +/*.heap-conservative-mmc +/*.heap-conservative-generational-mmc +/*.heap-conservative-parallel-mmc +/*.heap-conservative-parallel-generational-mmc /.deps/ From 2915b052e44595567f8dcc159acedd2eb9bf6fa3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 2 Sep 2024 14:20:52 +0200 Subject: [PATCH 281/403] Whoops, typo on mmc docs --- doc/collector-mmc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/collector-mmc.md b/doc/collector-mmc.md index e88b1409d..20af1d581 100644 --- a/doc/collector-mmc.md +++ b/doc/collector-mmc.md @@ -1,4 +1,4 @@ -# Mostly-copying collector +# Mostly-marking collector The `mmc` collector is mainly a mark-region collector, inspired by [Immix](http://users.cecs.anu.edu.au/~steveb/pubs/papers/immix-pldi-2008.pdf). From 8604ad6bebb795682c93a1dcd04268b12eff098a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 8 Sep 2024 09:54:16 +0200 Subject: [PATCH 282/403] mmc reformatting --- src/mmc.c | 57 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/src/mmc.c b/src/mmc.c index f33ed4509..4c3cb9dc8 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -1185,7 +1185,8 @@ gc_set_finalizer_callback(struct gc_heap *heap, gc_finalizer_state_set_callback(heap->finalizer_state, callback); } -static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { +static int +heap_prepare_pending_ephemerons(struct gc_heap *heap) { struct gc_pending_ephemerons *cur = heap->pending_ephemerons; size_t target = heap->size * heap->pending_ephemerons_size_factor; double slop = heap->pending_ephemerons_size_slop; @@ -1198,31 +1199,44 @@ static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { struct gc_options { struct gc_common_options common; }; -int gc_option_from_string(const char *str) { + +int +gc_option_from_string(const char *str) { return gc_common_option_from_string(str); } -struct gc_options* gc_allocate_options(void) { + +struct gc_options* +gc_allocate_options(void) { struct gc_options *ret = malloc(sizeof(struct gc_options)); gc_init_common_options(&ret->common); return ret; } -int gc_options_set_int(struct gc_options *options, int option, int value) { + +int +gc_options_set_int(struct gc_options *options, int option, int value) { return gc_common_options_set_int(&options->common, option, value); } -int gc_options_set_size(struct gc_options *options, int option, - size_t value) { + +int +gc_options_set_size(struct gc_options *options, int option, + size_t value) { return gc_common_options_set_size(&options->common, option, value); } -int gc_options_set_double(struct gc_options *options, int option, - double value) { + +int +gc_options_set_double(struct gc_options *options, int option, + double value) { return gc_common_options_set_double(&options->common, option, value); } -int gc_options_parse_and_set(struct gc_options *options, int option, - const char *value) { + +int +gc_options_parse_and_set(struct 
gc_options *options, int option, + const char *value) { return gc_common_options_parse_and_set(&options->common, option, value); } -static int heap_init(struct gc_heap *heap, const struct gc_options *options) { +static int +heap_init(struct gc_heap *heap, const struct gc_options *options) { // *heap is already initialized to 0. pthread_mutex_init(&heap->lock, NULL); @@ -1252,10 +1266,11 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { return 1; } -int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, - struct gc_heap **heap, struct gc_mutator **mut, - struct gc_event_listener event_listener, - void *event_listener_data) { +int +gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, + struct gc_heap **heap, struct gc_mutator **mut, + struct gc_event_listener event_listener, + void *event_listener_data) { GC_ASSERT_EQ(gc_allocator_small_granule_size(), NOFL_GRANULE_SIZE); GC_ASSERT_EQ(gc_allocator_large_threshold(), LARGE_OBJECT_THRESHOLD); GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), @@ -1305,7 +1320,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, return 1; } -struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, +struct gc_mutator* +gc_init_for_thread(struct gc_stack_addr *stack_base, struct gc_heap *heap) { struct gc_mutator *ret = calloc(1, sizeof(struct gc_mutator)); if (!ret) @@ -1315,13 +1331,15 @@ struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, return ret; } -void gc_finish_for_thread(struct gc_mutator *mut) { +void +gc_finish_for_thread(struct gc_mutator *mut) { remove_mutator(mutator_heap(mut), mut); mutator_mark_buf_destroy(&mut->mark_buf); free(mut); } -static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { +static void +deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); heap_lock(heap); @@ -1334,7 +1352,8 @@ static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { heap_unlock(heap); } -static void reactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { +static void +reactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { heap_lock(heap); while (mutators_are_stopping(heap)) pthread_cond_wait(&heap->mutator_cond, &heap->lock); From 9f437485ec1f49ee4369f9aeaf4dd324d71c4b26 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 9 Sep 2024 15:02:12 +0200 Subject: [PATCH 283/403] MMC marks roots in parallel during the pause, not while stopping Following the analysis in https://wingolog.org/archives/2024/09/06/on-taking-advantage-of-ragged-stops, we simplify MMC by traversing roots only during the pause. This lets us use gc_tracer parallel root-tracing. 
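To make the shape of the change concrete, here is a simplified model of the pattern (not the mmc API: the queue, lock, names, and thread count are invented for the sketch). While mutators stop, their roots are only recorded; tracing happens afterwards, during the pause, with however many workers are available:

```
// Toy version of "enqueue roots while stopping, trace them in parallel
// during the pause".  Workers claim root descriptors from a shared list.
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ROOTS 64

struct root { const char *name; };     // stand-in for struct gc_root

static struct root roots[MAX_ROOTS];
static int root_count;                 // filled while mutators stop
static int next_root;                  // claimed by workers during the pause
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void add_root(const char *name) {          // cf. gc_tracer_add_root
  roots[root_count++] = (struct root){ name };
}

static void *worker(void *arg) {
  for (;;) {
    pthread_mutex_lock(&lock);
    int idx = next_root < root_count ? next_root++ : -1;
    pthread_mutex_unlock(&lock);
    if (idx < 0)
      return NULL;
    printf("worker %ld traces %s\n", (long)(intptr_t)arg, roots[idx].name);
  }
}

int main(void) {
  // Stop-the-world setup: roots are recorded, not traced.
  add_root("mutator-0 stack (conservative)");
  add_root("mutator-1 stack (conservative)");
  add_root("global roots");
  // The pause: root descriptors are traced in parallel.
  pthread_t threads[2];
  for (long i = 0; i < 2; i++)
    pthread_create(&threads[i], NULL, worker, (void*)(intptr_t)i);
  for (long i = 0; i < 2; i++)
    pthread_join(threads[i], NULL);
  return 0;
}
```

The real tracer's workers steal work rather than taking a lock, but the division of labor is the same: stopping mutators publish root descriptors, and all tracing of those descriptors happens inside the collection.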
--- src/mmc.c | 551 +++++++++++------------------------------- src/nofl-space.h | 52 ++-- src/parallel-tracer.h | 114 ++++++--- src/root.h | 39 ++- src/serial-tracer.h | 35 +-- src/tracer.h | 13 +- 6 files changed, 312 insertions(+), 492 deletions(-) diff --git a/src/mmc.c b/src/mmc.c index 4c3cb9dc8..e6335dced 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -40,7 +40,6 @@ struct gc_heap { pthread_cond_t mutator_cond; size_t size; int collecting; - int mark_while_stopping; int check_pending_ephemerons; struct gc_pending_ephemerons *pending_ephemerons; struct gc_finalizer_state *finalizer_state; @@ -50,10 +49,9 @@ struct gc_heap { size_t paused_mutator_count; size_t inactive_mutator_count; struct gc_heap_roots *roots; - struct gc_mutator *mutator_trace_list; + struct gc_mutator *mutators; long count; uint8_t last_collection_was_minor; - struct gc_mutator *inactive_mutators; struct gc_tracer tracer; double fragmentation_low_threshold; double fragmentation_high_threshold; @@ -71,24 +69,14 @@ struct gc_heap { #define MUTATOR_EVENT(mut, event, ...) \ (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) -struct gc_mutator_mark_buf { - size_t size; - size_t capacity; - struct gc_ref *objects; -}; - struct gc_mutator { struct nofl_allocator allocator; struct gc_heap *heap; struct gc_stack stack; struct gc_mutator_roots *roots; - struct gc_mutator_mark_buf mark_buf; void *event_listener_data; - // Three uses for this in-object linked-list pointer: - // - inactive (blocked in syscall) mutators - // - grey objects when stopping active mutators for mark-in-place - // - untraced mutators when stopping active mutators for evacuation struct gc_mutator *next; + struct gc_mutator *prev; }; struct gc_trace_worker_data { @@ -126,9 +114,6 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, nofl_allocator_finish(&data.allocator, heap_nofl_space(heap)); } -static void collect(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) GC_NEVER_INLINE; - static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, struct gc_trace_worker *worker) { @@ -180,34 +165,6 @@ gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { GC_CRASH(); } -static inline struct gc_ref -do_trace_conservative_ref(struct gc_heap *heap, struct gc_conservative_ref ref, - int possibly_interior) { - if (!gc_conservative_ref_might_be_a_heap_object(ref, possibly_interior)) - return gc_ref_null(); - - struct nofl_space *nofl_space = heap_nofl_space(heap); - if (GC_LIKELY(nofl_space_contains_conservative_ref(nofl_space, ref))) - return nofl_space_mark_conservative_ref(nofl_space, ref, possibly_interior); - - struct large_object_space *lospace = heap_large_object_space(heap); - return large_object_space_mark_conservative_ref(lospace, ref, - possibly_interior); -} - -static inline struct gc_ref -trace_conservative_ref(struct gc_heap *heap, struct gc_conservative_ref ref, - int possibly_interior) { - struct gc_ref ret = do_trace_conservative_ref(heap, ref, possibly_interior); - - if (gc_ref_is_heap_object(ret) && - GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, - memory_order_relaxed))) - gc_resolve_pending_ephemerons(ret, heap); - - return ret; -} - static int mutators_are_stopping(struct gc_heap *heap) { return atomic_load_explicit(&heap->collecting, memory_order_relaxed); @@ -241,6 +198,13 @@ add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { pthread_cond_wait(&heap->mutator_cond, &heap->lock); if (heap->mutator_count == 1) 
heap->multithreaded = 1; + mut->next = mut->prev = NULL; + struct gc_mutator *tail = heap->mutators; + if (tail) { + mut->next = tail; + tail->prev = mut; + } + heap->mutators = mut; heap->mutator_count++; heap_unlock(heap); } @@ -252,6 +216,12 @@ remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = NULL; heap_lock(heap); heap->mutator_count--; + if (mut->next) + mut->next->prev = mut->prev; + if (mut->prev) + mut->prev->next = mut->next; + else + heap->mutators = mut->next; // We have no roots. If there is a GC stop currently in progress, // maybe tell the controller it can continue. if (mutators_are_stopping(heap) && all_mutators_stopped(heap)) @@ -285,72 +255,6 @@ heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { nofl_space_reacquire_memory(heap_nofl_space(heap), bytes); } -static void -mutator_mark_buf_grow(struct gc_mutator_mark_buf *buf) { - size_t old_capacity = buf->capacity; - size_t old_bytes = old_capacity * sizeof(struct gc_ref); - - size_t new_bytes = old_bytes ? old_bytes * 2 : getpagesize(); - size_t new_capacity = new_bytes / sizeof(struct gc_ref); - - void *mem = mmap(NULL, new_bytes, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("allocating mutator mark buffer failed"); - GC_CRASH(); - } - if (old_bytes) { - memcpy(mem, buf->objects, old_bytes); - munmap(buf->objects, old_bytes); - } - buf->objects = mem; - buf->capacity = new_capacity; -} - -static void -mutator_mark_buf_push(struct gc_mutator_mark_buf *buf, struct gc_ref ref) { - if (GC_UNLIKELY(buf->size == buf->capacity)) - mutator_mark_buf_grow(buf); - buf->objects[buf->size++] = ref; -} - -static void -mutator_mark_buf_release(struct gc_mutator_mark_buf *buf) { - size_t bytes = buf->size * sizeof(struct gc_ref); - if (bytes >= getpagesize()) - madvise(buf->objects, align_up(bytes, getpagesize()), MADV_DONTNEED); - buf->size = 0; -} - -static void -mutator_mark_buf_destroy(struct gc_mutator_mark_buf *buf) { - size_t bytes = buf->capacity * sizeof(struct gc_ref); - if (bytes) - munmap(buf->objects, bytes); -} - -static void -enqueue_mutator_for_tracing(struct gc_mutator *mut) { - struct gc_heap *heap = mutator_heap(mut); - GC_ASSERT(mut->next == NULL); - struct gc_mutator *next = - atomic_load_explicit(&heap->mutator_trace_list, memory_order_acquire); - do { - mut->next = next; - } while (!atomic_compare_exchange_weak(&heap->mutator_trace_list, - &next, mut)); -} - -static int -heap_should_mark_while_stopping(struct gc_heap *heap) { - return atomic_load_explicit(&heap->mark_while_stopping, memory_order_acquire); -} - -static int -mutator_should_mark_while_stopping(struct gc_mutator *mut) { - return heap_should_mark_while_stopping(mutator_heap(mut)); -} - void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots) { mut->roots = roots; @@ -373,68 +277,42 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); } -static void -trace_and_enqueue_locally(struct gc_edge edge, struct gc_heap *heap, - void *data) { - struct gc_mutator *mut = data; - if (trace_edge(heap, edge, NULL)) - mutator_mark_buf_push(&mut->mark_buf, gc_edge_ref(edge)); +static inline struct gc_ref +do_trace_conservative_ref(struct gc_heap *heap, struct gc_conservative_ref ref, + int possibly_interior) { + if (!gc_conservative_ref_might_be_a_heap_object(ref, possibly_interior)) + return gc_ref_null(); + + struct nofl_space *nofl_space = heap_nofl_space(heap); + if 
(GC_LIKELY(nofl_space_contains_conservative_ref(nofl_space, ref))) + return nofl_space_mark_conservative_ref(nofl_space, ref, possibly_interior); + + struct large_object_space *lospace = heap_large_object_space(heap); + return large_object_space_mark_conservative_ref(lospace, ref, + possibly_interior); +} + +static inline struct gc_ref +trace_conservative_ref(struct gc_heap *heap, struct gc_conservative_ref ref, + int possibly_interior) { + struct gc_ref ret = do_trace_conservative_ref(heap, ref, possibly_interior); + + if (gc_ref_is_heap_object(ret) && + GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, + memory_order_relaxed))) + gc_resolve_pending_ephemerons(ret, heap); + + return ret; } static inline void -do_trace_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data, - int possibly_interior) { - struct gc_mutator *mut = data; - struct gc_ref object = trace_conservative_ref(heap, ref, possibly_interior); - if (gc_ref_is_heap_object(object)) - mutator_mark_buf_push(&mut->mark_buf, object); -} - -static void -trace_possibly_interior_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data) { - return do_trace_conservative_ref_and_enqueue_locally(ref, heap, data, 1); -} - -static void -trace_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data) { - return do_trace_conservative_ref_and_enqueue_locally(ref, heap, data, 0); -} - -static void -trace_and_enqueue_globally(struct gc_edge edge, struct gc_heap *heap, - void *unused) { - if (trace_edge(heap, edge, NULL)) - gc_tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge)); -} - -static inline void -do_trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data, - int possibly_interior) { - struct gc_ref object = trace_conservative_ref(heap, ref, possibly_interior); - if (gc_ref_is_heap_object(object)) - gc_tracer_enqueue_root(&heap->tracer, object); -} - -static void -trace_possibly_interior_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data) { - return do_trace_conservative_ref_and_enqueue_globally(ref, heap, data, 1); -} - -static void -trace_conservative_ref_and_enqueue_globally(struct gc_conservative_ref ref, - struct gc_heap *heap, - void *data) { - return do_trace_conservative_ref_and_enqueue_globally(ref, heap, data, 0); +tracer_trace_conservative_ref(struct gc_conservative_ref ref, + struct gc_heap *heap, + struct gc_trace_worker *worker, + int possibly_interior) { + struct gc_ref resolved = trace_conservative_ref(heap, ref, possibly_interior); + if (gc_ref_is_heap_object(resolved)) + gc_trace_worker_enqueue(worker, resolved); } static inline struct gc_conservative_ref @@ -446,26 +324,13 @@ load_conservative_ref(uintptr_t addr) { } static inline void -trace_conservative_edges(uintptr_t low, - uintptr_t high, - void (*trace)(struct gc_conservative_ref, - struct gc_heap *, void *), - struct gc_heap *heap, - void *data) { +trace_conservative_edges(uintptr_t low, uintptr_t high, int possibly_interior, + struct gc_heap *heap, struct gc_trace_worker *worker) { GC_ASSERT(low == align_down(low, sizeof(uintptr_t))); GC_ASSERT(high == align_down(high, sizeof(uintptr_t))); for (uintptr_t addr = low; addr < high; addr += sizeof(uintptr_t)) - trace(load_conservative_ref(addr), heap, data); -} - -static inline void -tracer_trace_conservative_ref(struct gc_conservative_ref 
ref, - struct gc_heap *heap, void *data) { - struct gc_trace_worker *worker = data; - int possibly_interior = 0; - struct gc_ref resolved = trace_conservative_ref(heap, ref, possibly_interior); - if (gc_ref_is_heap_object(resolved)) - gc_trace_worker_enqueue(worker, resolved); + tracer_trace_conservative_ref(load_conservative_ref(addr), heap, worker, + possibly_interior); } static inline void @@ -486,10 +351,10 @@ trace_one_conservatively(struct gc_ref ref, struct gc_heap *heap, } else { bytes = large_object_space_object_size(heap_large_object_space(heap), ref); } - trace_conservative_edges(gc_ref_value(ref), - gc_ref_value(ref) + bytes, - tracer_trace_conservative_ref, heap, - worker); + // Intraheap edges are not interior. + int possibly_interior = 0; + trace_conservative_edges(gc_ref_value(ref), gc_ref_value(ref) + bytes, + possibly_interior, heap, worker); } static inline void @@ -511,6 +376,14 @@ trace_root(struct gc_root root, struct gc_heap *heap, case GC_ROOT_KIND_MUTATOR: gc_trace_mutator_roots(root.mutator->roots, tracer_visit, heap, worker); break; + case GC_ROOT_KIND_CONSERVATIVE_EDGES: + trace_conservative_edges(root.range.lo_addr, root.range.hi_addr, 0, + heap, worker); + break; + case GC_ROOT_KIND_CONSERVATIVE_POSSIBLY_INTERIOR_EDGES: + trace_conservative_edges(root.range.lo_addr, root.range.hi_addr, 1, + heap, worker); + break; case GC_ROOT_KIND_RESOLVED_EPHEMERONS: gc_trace_resolved_ephemerons(root.resolved_ephemerons, tracer_visit, heap, worker); @@ -518,101 +391,52 @@ trace_root(struct gc_root root, struct gc_heap *heap, case GC_ROOT_KIND_EDGE: tracer_visit(root.edge, heap, worker); break; + case GC_ROOT_KIND_REMEMBERED_OBJECT: + trace_one(root.ref, heap, worker); + break; + case GC_ROOT_KIND_REMEMBERED_SLAB: + nofl_space_trace_remembered_slab(heap_nofl_space(heap), root.idx, + trace_one, heap, worker); + break; default: GC_CRASH(); } } static void -visit_root_edge(struct gc_edge edge, struct gc_heap *heap, void *unused) { - gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); +enqueue_conservative_roots(uintptr_t low, uintptr_t high, + struct gc_heap *heap, void *data) { + int *possibly_interior = data; + gc_tracer_add_root(&heap->tracer, + gc_root_conservative_edges(low, high, *possibly_interior)); } static void -mark_and_globally_enqueue_mutator_conservative_roots(uintptr_t low, - uintptr_t high, - struct gc_heap *heap, - void *data) { - trace_conservative_edges(low, high, - gc_mutator_conservative_roots_may_be_interior() - ? 
trace_possibly_interior_conservative_ref_and_enqueue_globally - : trace_conservative_ref_and_enqueue_globally, - heap, data); +enqueue_mutator_conservative_roots(struct gc_heap *heap) { + if (gc_has_mutator_conservative_roots()) { + int possibly_interior = gc_mutator_conservative_roots_may_be_interior(); + for (struct gc_mutator *mut = heap->mutators; + mut; + mut = mut->next) + gc_stack_visit(&mut->stack, enqueue_conservative_roots, heap, + &possibly_interior); + } } static void -mark_and_globally_enqueue_heap_conservative_roots(uintptr_t low, - uintptr_t high, - struct gc_heap *heap, - void *data) { - trace_conservative_edges(low, high, - trace_conservative_ref_and_enqueue_globally, - heap, data); +enqueue_global_conservative_roots(struct gc_heap *heap) { + if (gc_has_global_conservative_roots()) { + int possibly_interior = 0; + gc_platform_visit_global_conservative_roots + (enqueue_conservative_roots, heap, &possibly_interior); + } } static void -mark_and_locally_enqueue_mutator_conservative_roots(uintptr_t low, - uintptr_t high, - struct gc_heap *heap, - void *data) { - trace_conservative_edges(low, high, - gc_mutator_conservative_roots_may_be_interior() - ? trace_possibly_interior_conservative_ref_and_enqueue_locally - : trace_conservative_ref_and_enqueue_locally, - heap, data); -} - -static inline void -trace_mutator_conservative_roots(struct gc_mutator *mut, - void (*trace_range)(uintptr_t low, - uintptr_t high, - struct gc_heap *heap, - void *data), - struct gc_heap *heap, - void *data) { - if (gc_has_mutator_conservative_roots()) - gc_stack_visit(&mut->stack, trace_range, heap, data); -} - -// Mark the roots of a mutator that is stopping for GC. We can't -// enqueue them directly, so we send them to the controller in a buffer. -static void -trace_stopping_mutator_roots(struct gc_mutator *mut) { - GC_ASSERT(mutator_should_mark_while_stopping(mut)); - struct gc_heap *heap = mutator_heap(mut); - trace_mutator_conservative_roots(mut, - mark_and_locally_enqueue_mutator_conservative_roots, - heap, mut); - gc_trace_mutator_roots(mut->roots, trace_and_enqueue_locally, heap, mut); -} - -static void -trace_mutator_conservative_roots_with_lock(struct gc_mutator *mut) { - trace_mutator_conservative_roots(mut, - mark_and_globally_enqueue_mutator_conservative_roots, - mutator_heap(mut), - NULL); -} - -static void -trace_mutator_roots_with_lock(struct gc_mutator *mut) { - trace_mutator_conservative_roots_with_lock(mut); - gc_trace_mutator_roots(mut->roots, trace_and_enqueue_globally, - mutator_heap(mut), NULL); -} - -static void -trace_mutator_roots_with_lock_before_stop(struct gc_mutator *mut) { - gc_stack_capture_hot(&mut->stack); - if (mutator_should_mark_while_stopping(mut)) - trace_mutator_roots_with_lock(mut); - else - enqueue_mutator_for_tracing(mut); -} - -static void -release_stopping_mutator_roots(struct gc_mutator *mut) { - mutator_mark_buf_release(&mut->mark_buf); +enqueue_pinned_roots(struct gc_heap *heap) { + GC_ASSERT(!heap_nofl_space(heap)->evacuating); + enqueue_mutator_conservative_roots(heap); + enqueue_global_conservative_roots(heap); } static void @@ -622,56 +446,6 @@ wait_for_mutators_to_stop(struct gc_heap *heap) { pthread_cond_wait(&heap->collector_cond, &heap->lock); } -static void -trace_mutator_conservative_roots_after_stop(struct gc_heap *heap) { - int active_mutators_already_marked = heap_should_mark_while_stopping(heap); - if (!active_mutators_already_marked) - for (struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); - mut; - mut = mut->next) - 
trace_mutator_conservative_roots_with_lock(mut); - - for (struct gc_mutator *mut = heap->inactive_mutators; - mut; - mut = mut->next) - trace_mutator_conservative_roots_with_lock(mut); -} - -static void -trace_mutator_roots_after_stop(struct gc_heap *heap) { - struct gc_mutator *mut = atomic_load(&heap->mutator_trace_list); - int active_mutators_already_marked = heap_should_mark_while_stopping(heap); - while (mut) { - // Also collect any already-marked grey objects and put them on the - // global trace queue. - if (active_mutators_already_marked) - gc_tracer_enqueue_roots(&heap->tracer, mut->mark_buf.objects, - mut->mark_buf.size); - else - trace_mutator_roots_with_lock(mut); - // Also unlink mutator_trace_list chain. - struct gc_mutator *next = mut->next; - mut->next = NULL; - mut = next; - } - atomic_store(&heap->mutator_trace_list, NULL); - - for (struct gc_mutator *mut = heap->inactive_mutators; mut; mut = mut->next) - trace_mutator_roots_with_lock(mut); -} - -static void -trace_global_conservative_roots(struct gc_heap *heap) { - if (gc_has_global_conservative_roots()) - gc_platform_visit_global_conservative_roots - (mark_and_globally_enqueue_heap_conservative_roots, heap, NULL); -} - -static void -enqueue_generational_root(struct gc_ref ref, struct gc_heap *heap) { - gc_tracer_enqueue_root(&heap->tracer, ref); -} - void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { @@ -679,22 +453,6 @@ gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, gc_object_set_remembered(obj); } -static void -trace_generational_roots(struct gc_heap *heap) { - // TODO: Add lospace nursery. - if (atomic_load(&heap->gc_kind) == GC_COLLECTION_MINOR) { - nofl_space_trace_remembered_set(heap_nofl_space(heap), - enqueue_generational_root, - heap); - large_object_space_trace_remembered_set(heap_large_object_space(heap), - enqueue_generational_root, - heap); - } else { - nofl_space_clear_remembered_set(heap_nofl_space(heap)); - large_object_space_clear_remembered_set(heap_large_object_space(heap)); - } -} - static enum gc_collection_kind pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) GC_NEVER_INLINE; @@ -733,11 +491,6 @@ pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { MUTATOR_EVENT(mut, mutator_stopping); nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); gc_stack_capture_hot(&mut->stack); - if (mutator_should_mark_while_stopping(mut)) - // No need to collect results in mark buf; we can enqueue roots directly. 
- trace_mutator_roots_with_lock(mut); - else - enqueue_mutator_for_tracing(mut); return pause_mutator_for_collection(heap, mut); } @@ -749,13 +502,9 @@ pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { MUTATOR_EVENT(mut, mutator_stopping); nofl_finish_sweeping(&mut->allocator, heap_nofl_space(heap)); gc_stack_capture_hot(&mut->stack); - if (mutator_should_mark_while_stopping(mut)) - trace_stopping_mutator_roots(mut); - enqueue_mutator_for_tracing(mut); heap_lock(heap); pause_mutator_for_collection(heap, mut); heap_unlock(heap); - release_stopping_mutator_roots(mut); } static inline void @@ -838,7 +587,6 @@ determine_collection_kind(struct gc_heap *heap, struct nofl_space *nofl_space = heap_nofl_space(heap); enum gc_collection_kind previous_gc_kind = atomic_load(&heap->gc_kind); enum gc_collection_kind gc_kind; - int mark_while_stopping = 1; double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); ssize_t pending = atomic_load_explicit(&nofl_space->pending_unavailable_bytes, @@ -856,17 +604,6 @@ determine_collection_kind(struct gc_heap *heap, // free blocks, and we decided not to expand the heap. Let's do an // evacuating major collection to maximize the free block yield. gc_kind = GC_COLLECTION_COMPACTING; - - // Generally speaking, we allow mutators to mark their own stacks - // before pausing. This is a limited form of concurrent marking, as - // other mutators might be running, not having received the signal - // to stop yet. In a compacting collection, this results in pinned - // roots, because we haven't started evacuating yet and instead mark - // in place. However as in this case we are trying to reclaim free - // blocks, try to avoid any pinning caused by the ragged-stop - // marking. Of course if the mutator has conservative roots we will - // have pinning anyway and might as well allow ragged stops. - mark_while_stopping = gc_has_conservative_roots(); } else if (previous_gc_kind == GC_COLLECTION_COMPACTING && fragmentation >= heap->fragmentation_low_threshold) { DEBUG("continuing evacuation due to fragmentation %.2f%% > %.2f%%\n", @@ -912,7 +649,6 @@ determine_collection_kind(struct gc_heap *heap, gc_kind == GC_COLLECTION_COMPACTING) { DEBUG("welp. 
conservative heap scanning, no evacuation for you\n"); gc_kind = GC_COLLECTION_MAJOR; - mark_while_stopping = 1; } // If this is the first in a series of minor collections, reset the @@ -927,34 +663,49 @@ determine_collection_kind(struct gc_heap *heap, yield * 100., clamped * 100.); } - atomic_store_explicit(&heap->mark_while_stopping, mark_while_stopping, - memory_order_release); - atomic_store(&heap->gc_kind, gc_kind); return gc_kind; } static void -trace_conservative_roots_after_stop(struct gc_heap *heap) { - GC_ASSERT(!heap_nofl_space(heap)->evacuating); - if (gc_has_mutator_conservative_roots()) - trace_mutator_conservative_roots_after_stop(heap); - if (gc_has_global_conservative_roots()) - trace_global_conservative_roots(heap); +enqueue_root_edge(struct gc_edge edge, struct gc_heap *heap, void *unused) { + gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); } static void -trace_pinned_roots_after_stop(struct gc_heap *heap) { - GC_ASSERT(!heap_nofl_space(heap)->evacuating); - trace_conservative_roots_after_stop(heap); +enqueue_remembered_object(struct gc_ref ref, struct gc_heap *heap) { + gc_tracer_add_root(&heap->tracer, gc_root_remembered_object(ref)); } static void -trace_roots_after_stop(struct gc_heap *heap) { - trace_mutator_roots_after_stop(heap); - gc_trace_heap_roots(heap->roots, trace_and_enqueue_globally, heap, NULL); - gc_visit_finalizer_roots(heap->finalizer_state, visit_root_edge, heap, NULL); - trace_generational_roots(heap); +enqueue_generational_roots(struct gc_heap *heap, + enum gc_collection_kind gc_kind) { + // TODO: Add lospace nursery. + if (gc_kind == GC_COLLECTION_MINOR) { + for (size_t i = 0; i < heap_nofl_space(heap)->nslabs; i++) + gc_tracer_add_root(&heap->tracer, gc_root_remembered_slab(i)); + large_object_space_trace_remembered_set(heap_large_object_space(heap), + enqueue_remembered_object, + heap); + } else { + nofl_space_clear_remembered_set(heap_nofl_space(heap)); + large_object_space_clear_remembered_set(heap_large_object_space(heap)); + } +} + +static void +enqueue_relocatable_roots(struct gc_heap *heap, + enum gc_collection_kind gc_kind) { + for (struct gc_mutator *mut = heap->mutators; + mut; + mut = mut->next) { + if (mut->roots) + gc_tracer_add_root(&heap->tracer, gc_root_mutator(mut)); + } + if (heap->roots) + gc_tracer_add_root(&heap->tracer, gc_root_heap(heap)); + gc_visit_finalizer_roots(heap->finalizer_state, enqueue_root_edge, heap, NULL); + enqueue_generational_roots(heap, gc_kind); } static void @@ -970,15 +721,6 @@ resolve_ephemerons_eagerly(struct gc_heap *heap) { gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); } -static int -enqueue_resolved_ephemerons(struct gc_heap *heap) { - struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); - if (!resolved) - return 0; - gc_trace_resolved_ephemerons(resolved, trace_and_enqueue_globally, heap, NULL); - return 1; -} - static void trace_resolved_ephemerons(struct gc_heap *heap) { for (struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); @@ -995,7 +737,7 @@ resolve_finalizers(struct gc_heap *heap) { priority < gc_finalizer_priority_count(); priority++) { if (gc_resolve_finalizers(heap->finalizer_state, priority, - visit_root_edge, heap, NULL)) { + enqueue_root_edge, heap, NULL)) { gc_tracer_trace(&heap->tracer); trace_resolved_ephemerons(heap); } @@ -1008,6 +750,8 @@ sweep_ephemerons(struct gc_heap *heap) { return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); } +static void collect(struct gc_mutator *mut, + enum gc_collection_kind 
requested_kind) GC_NEVER_INLINE; static void collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); @@ -1020,6 +764,13 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { } MUTATOR_EVENT(mut, mutator_cause_gc); DEBUG("start collect #%ld:\n", heap->count); + HEAP_EVENT(heap, requesting_stop); + request_mutators_to_stop(heap); + gc_stack_capture_hot(&mut->stack); + nofl_finish_sweeping(&mut->allocator, nofl_space); + HEAP_EVENT(heap, waiting_for_stop); + wait_for_mutators_to_stop(heap); + HEAP_EVENT(heap, mutators_stopped); enum gc_collection_kind gc_kind = determine_collection_kind(heap, requested_kind); int is_minor = gc_kind == GC_COLLECTION_MINOR; @@ -1029,22 +780,17 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { gc_extern_space_start_gc(exspace, is_minor); resolve_ephemerons_lazily(heap); gc_tracer_prepare(&heap->tracer); - HEAP_EVENT(heap, requesting_stop); - request_mutators_to_stop(heap); - trace_mutator_roots_with_lock_before_stop(mut); - nofl_finish_sweeping(&mut->allocator, nofl_space); - HEAP_EVENT(heap, waiting_for_stop); - wait_for_mutators_to_stop(heap); - HEAP_EVENT(heap, mutators_stopped); double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); HEAP_EVENT(heap, live_data_size, heap->size * (1 - yield)); DEBUG("last gc yield: %f; fragmentation: %f\n", yield, fragmentation); detect_out_of_memory(heap); - trace_pinned_roots_after_stop(heap); - nofl_space_start_gc(nofl_space, gc_kind); - trace_roots_after_stop(heap); + enqueue_pinned_roots(heap); + if (gc_kind == GC_COLLECTION_COMPACTING) + gc_tracer_trace_roots(&heap->tracer); HEAP_EVENT(heap, roots_traced); + enqueue_relocatable_roots(heap, gc_kind); + nofl_space_start_gc(nofl_space, gc_kind); gc_tracer_trace(&heap->tracer); HEAP_EVENT(heap, heap_traced); resolve_ephemerons_eagerly(heap); @@ -1334,7 +1080,6 @@ gc_init_for_thread(struct gc_stack_addr *stack_base, void gc_finish_for_thread(struct gc_mutator *mut) { remove_mutator(mutator_heap(mut), mut); - mutator_mark_buf_destroy(&mut->mark_buf); free(mut); } @@ -1343,8 +1088,6 @@ deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); heap_lock(heap); - mut->next = heap->inactive_mutators; - heap->inactive_mutators = mut; heap->inactive_mutator_count++; gc_stack_capture_hot(&mut->stack); if (all_mutators_stopped(heap)) @@ -1357,18 +1100,12 @@ reactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { heap_lock(heap); while (mutators_are_stopping(heap)) pthread_cond_wait(&heap->mutator_cond, &heap->lock); - struct gc_mutator **prev = &heap->inactive_mutators; - while (*prev != mut) - prev = &(*prev)->next; - *prev = mut->next; - mut->next = NULL; heap->inactive_mutator_count--; heap_unlock(heap); } -void* gc_call_without_gc(struct gc_mutator *mut, - void* (*f)(void*), - void *data) { +void* +gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), void *data) { struct gc_heap *heap = mutator_heap(mut); deactivate_mutator(heap, mut); void *ret = f(data); diff --git a/src/nofl-space.h b/src/nofl-space.h index 037dff0d2..5a217cdb0 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -808,14 +808,19 @@ nofl_space_set_ephemeron_flag(struct gc_ref ref) { } } +struct gc_trace_worker; + // Note that it's quite possible (and even likely) that any given remset // byte doesn't hold any roots, if all stores were to nursery 
objects. STATIC_ASSERT_EQ(NOFL_GRANULES_PER_REMSET_BYTE % 8, 0); static void nofl_space_trace_card(struct nofl_space *space, struct nofl_slab *slab, size_t card, - void (*enqueue)(struct gc_ref, struct gc_heap*), - struct gc_heap *heap) { + void (*trace_object)(struct gc_ref, + struct gc_heap*, + struct gc_trace_worker*), + struct gc_heap *heap, + struct gc_trace_worker *worker) { uintptr_t first_addr_in_slab = (uintptr_t) &slab->blocks[0]; size_t granule_base = card * NOFL_GRANULES_PER_REMSET_BYTE; for (size_t granule_in_remset = 0; @@ -829,32 +834,33 @@ nofl_space_trace_card(struct nofl_space *space, struct nofl_slab *slab, size_t granule = granule_base + granule_offset; uintptr_t addr = first_addr_in_slab + granule * NOFL_GRANULE_SIZE; GC_ASSERT(nofl_metadata_byte_for_addr(addr) == &slab->metadata[granule]); - enqueue(gc_ref(addr), heap); + trace_object(gc_ref(addr), heap, worker); } } } static void -nofl_space_trace_remembered_set(struct nofl_space *space, - void (*enqueue)(struct gc_ref, - struct gc_heap*), - struct gc_heap *heap) { - GC_ASSERT(!space->evacuating); - for (size_t s = 0; s < space->nslabs; s++) { - struct nofl_slab *slab = space->slabs[s]; - uint8_t *remset = slab->remembered_set; - for (size_t card_base = 0; - card_base < NOFL_REMSET_BYTES_PER_SLAB; - card_base += 8) { - uint64_t remset_bytes = load_eight_aligned_bytes(remset + card_base); - if (!remset_bytes) continue; - memset(remset + card_base, 0, 8); - while (remset_bytes) { - size_t card_offset = count_zero_bytes(remset_bytes); - remset_bytes &= ~(((uint64_t)0xff) << (card_offset * 8)); - nofl_space_trace_card(space, slab, card_base + card_offset, - enqueue, heap); - } +nofl_space_trace_remembered_slab(struct nofl_space *space, + size_t slab_idx, + void (*trace_object)(struct gc_ref, + struct gc_heap*, + struct gc_trace_worker*), + struct gc_heap *heap, + struct gc_trace_worker *worker) { + GC_ASSERT(slab_idx < space->nslabs); + struct nofl_slab *slab = space->slabs[slab_idx]; + uint8_t *remset = slab->remembered_set; + for (size_t card_base = 0; + card_base < NOFL_REMSET_BYTES_PER_SLAB; + card_base += 8) { + uint64_t remset_bytes = load_eight_aligned_bytes(remset + card_base); + if (!remset_bytes) continue; + memset(remset + card_base, 0, 8); + while (remset_bytes) { + size_t card_offset = count_zero_bytes(remset_bytes); + remset_bytes &= ~(((uint64_t)0xff) << (card_offset * 8)); + nofl_space_trace_card(space, slab, card_base + card_offset, + trace_object, heap, worker); } } } diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 888c0fad6..20d66730f 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -57,6 +57,7 @@ struct gc_tracer { long epoch; pthread_mutex_t lock; pthread_cond_t cond; + int trace_roots_only; struct root_worklist roots; struct gc_trace_worker workers[TRACE_WORKERS_MAX_COUNT]; }; @@ -112,6 +113,7 @@ gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, tracer->heap = heap; atomic_init(&tracer->active_tracers, 0); tracer->epoch = 0; + tracer->trace_roots_only = 0; pthread_mutex_init(&tracer->lock, NULL); pthread_cond_init(&tracer->cond, NULL); root_worklist_init(&tracer->roots); @@ -299,32 +301,52 @@ trace_with_data(struct gc_tracer *tracer, atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); worker->data = data; - size_t n = 0; DEBUG("tracer #%zu: running trace loop\n", worker->id); - do { - struct gc_root root = root_worklist_pop(&tracer->roots); - if (root.kind == GC_ROOT_KIND_NONE) - break; - trace_root(root, heap, worker); - } 
while (1); - - do { - while (1) { - struct gc_ref ref; - if (!local_worklist_empty(&worker->local)) { - ref = local_worklist_pop(&worker->local); - } else { - ref = trace_worker_steal(worker); - if (!gc_ref_is_heap_object(ref)) - break; - } - trace_one(ref, heap, worker); + { + DEBUG("tracer #%zu: tracing roots\n", worker->id); + size_t n = 0; + do { + struct gc_root root = root_worklist_pop(&tracer->roots); + if (root.kind == GC_ROOT_KIND_NONE) + break; + trace_root(root, heap, worker); n++; - } - } while (trace_worker_should_continue(worker)); + } while (1); - DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); + DEBUG("tracer #%zu: done tracing roots, %zu roots traced\n", worker->id, n); + } + + if (tracer->trace_roots_only) { + // Unlike the full trace where work is generated during the trace, a + // roots-only trace consumes work monotonically; any object enqueued as a + // result of marking roots isn't ours to deal with. However we do need to + // synchronize with remote workers to ensure they have completed their + // work items. + if (worker->id == 0) { + for (size_t i = 1; i < tracer->worker_count; i++) + pthread_mutex_lock(&tracer->workers[i].lock); + } + } else { + DEBUG("tracer #%zu: tracing objects\n", worker->id); + size_t n = 0; + do { + while (1) { + struct gc_ref ref; + if (!local_worklist_empty(&worker->local)) { + ref = local_worklist_pop(&worker->local); + } else { + ref = trace_worker_steal(worker); + if (!gc_ref_is_heap_object(ref)) + break; + } + trace_one(ref, heap, worker); + n++; + } + } while (trace_worker_should_continue(worker)); + + DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); + } worker->data = NULL; atomic_fetch_sub_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); @@ -336,17 +358,28 @@ trace_worker_trace(struct gc_trace_worker *worker) { worker->heap, worker); } -static inline void -gc_tracer_enqueue_root(struct gc_tracer *tracer, struct gc_ref ref) { - struct shared_worklist *worker0_deque = &tracer->workers[0].shared; - shared_worklist_push(worker0_deque, ref); -} +static inline int +gc_tracer_should_parallelize(struct gc_tracer *tracer) { + if (root_worklist_size(&tracer->roots) > 1) + return 1; -static inline void -gc_tracer_enqueue_roots(struct gc_tracer *tracer, struct gc_ref *objv, - size_t count) { - struct shared_worklist *worker0_deque = &tracer->workers[0].shared; - shared_worklist_push_many(worker0_deque, objv, count); + if (tracer->trace_roots_only) + return 0; + + size_t nonempty_worklists = 0; + ssize_t parallel_threshold = + LOCAL_WORKLIST_SIZE - LOCAL_WORKLIST_SHARE_AMOUNT; + for (size_t i = 0; i < tracer->worker_count; i++) { + ssize_t size = shared_worklist_size(&tracer->workers[i].shared); + if (!size) + continue; + nonempty_worklists++; + if (nonempty_worklists > 1) + return 1; + if (size >= parallel_threshold) + return 1; + } + return 0; } static inline void @@ -356,10 +389,7 @@ gc_tracer_trace(struct gc_tracer *tracer) { for (int i = 1; i < tracer->worker_count; i++) pthread_mutex_unlock(&tracer->workers[i].lock); - ssize_t parallel_threshold = - LOCAL_WORKLIST_SIZE - LOCAL_WORKLIST_SHARE_AMOUNT; - if (root_worklist_size(&tracer->roots) > 1 || - shared_worklist_size(&tracer->workers[0].shared) >= parallel_threshold) { + if (gc_tracer_should_parallelize(tracer)) { DEBUG("waking workers\n"); tracer_unpark_all_workers(tracer); } else { @@ -372,4 +402,16 @@ gc_tracer_trace(struct gc_tracer *tracer) { DEBUG("trace finished\n"); } +static inline void +gc_tracer_trace_roots(struct 
gc_tracer *tracer) { + DEBUG("starting roots-only trace\n"); + + tracer->trace_roots_only = 1; + gc_tracer_trace(tracer); + tracer->trace_roots_only = 0; + + GC_ASSERT_EQ(atomic_load(&tracer->active_tracers), 0); + DEBUG("roots-only trace finished\n"); +} + #endif // PARALLEL_TRACER_H diff --git a/src/root.h b/src/root.h index 5228dcb4f..46e019b06 100644 --- a/src/root.h +++ b/src/root.h @@ -2,6 +2,7 @@ #define ROOT_H #include "gc-edge.h" +#include "extents.h" struct gc_ephemeron; struct gc_heap; @@ -11,8 +12,12 @@ enum gc_root_kind { GC_ROOT_KIND_NONE, GC_ROOT_KIND_HEAP, GC_ROOT_KIND_MUTATOR, + GC_ROOT_KIND_CONSERVATIVE_EDGES, + GC_ROOT_KIND_CONSERVATIVE_POSSIBLY_INTERIOR_EDGES, GC_ROOT_KIND_RESOLVED_EPHEMERONS, GC_ROOT_KIND_EDGE, + GC_ROOT_KIND_REMEMBERED_OBJECT, + GC_ROOT_KIND_REMEMBERED_SLAB, }; struct gc_root { @@ -21,22 +26,38 @@ struct gc_root { struct gc_heap *heap; struct gc_mutator *mutator; struct gc_ephemeron *resolved_ephemerons; + struct extent_range range; struct gc_edge edge; + struct gc_ref ref; + size_t idx; }; }; -static inline struct gc_root gc_root_heap(struct gc_heap* heap) { +static inline struct gc_root +gc_root_heap(struct gc_heap* heap) { struct gc_root ret = { GC_ROOT_KIND_HEAP }; ret.heap = heap; return ret; } -static inline struct gc_root gc_root_mutator(struct gc_mutator* mutator) { +static inline struct gc_root +gc_root_mutator(struct gc_mutator* mutator) { struct gc_root ret = { GC_ROOT_KIND_MUTATOR }; ret.mutator = mutator; return ret; } +static inline struct gc_root +gc_root_conservative_edges(uintptr_t lo_addr, uintptr_t hi_addr, + int possibly_interior) { + enum gc_root_kind kind = possibly_interior + ? GC_ROOT_KIND_CONSERVATIVE_POSSIBLY_INTERIOR_EDGES + : GC_ROOT_KIND_CONSERVATIVE_EDGES; + struct gc_root ret = { kind }; + ret.range = (struct extent_range) {lo_addr, hi_addr}; + return ret; +} + static inline struct gc_root gc_root_resolved_ephemerons(struct gc_ephemeron* resolved) { struct gc_root ret = { GC_ROOT_KIND_RESOLVED_EPHEMERONS }; @@ -51,4 +72,18 @@ gc_root_edge(struct gc_edge edge) { return ret; } +static inline struct gc_root +gc_root_remembered_object(struct gc_ref ref) { + struct gc_root ret = { GC_ROOT_KIND_REMEMBERED_OBJECT }; + ret.ref = ref; + return ret; +} + +static inline struct gc_root +gc_root_remembered_slab(size_t idx) { + struct gc_root ret = { GC_ROOT_KIND_REMEMBERED_SLAB }; + ret.idx = idx; + return ret; +} + #endif // ROOT_H diff --git a/src/serial-tracer.h b/src/serial-tracer.h index 96ab7e563..b9575fddb 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -12,6 +12,7 @@ struct gc_tracer { struct gc_heap *heap; + int trace_roots_only; struct root_worklist roots; struct simple_worklist worklist; }; @@ -30,6 +31,7 @@ static int gc_tracer_init(struct gc_tracer *tracer, struct gc_heap *heap, size_t parallelism) { tracer->heap = heap; + tracer->trace_roots_only = 0; root_worklist_init(&tracer->roots); return simple_worklist_init(&tracer->worklist); } @@ -43,19 +45,11 @@ gc_tracer_add_root(struct gc_tracer *tracer, struct gc_root root) { root_worklist_push(&tracer->roots, root); } -static inline void -gc_tracer_enqueue_root(struct gc_tracer *tracer, struct gc_ref obj) { - simple_worklist_push(&tracer->worklist, obj); -} -static inline void -gc_tracer_enqueue_roots(struct gc_tracer *tracer, struct gc_ref *objs, - size_t count) { - simple_worklist_push_many(&tracer->worklist, objs, count); -} static inline void gc_trace_worker_enqueue(struct gc_trace_worker *worker, struct gc_ref ref) { - 
gc_tracer_enqueue_root(worker->tracer, ref); + simple_worklist_push(&worker->tracer->worklist, ref); } + static inline void tracer_trace_with_data(struct gc_tracer *tracer, struct gc_heap *heap, struct gc_trace_worker *worker, @@ -68,12 +62,14 @@ tracer_trace_with_data(struct gc_tracer *tracer, struct gc_heap *heap, trace_root(root, heap, worker); } while (1); root_worklist_reset(&tracer->roots); - do { - struct gc_ref obj = simple_worklist_pop(&tracer->worklist); - if (!gc_ref_is_heap_object(obj)) - break; - trace_one(obj, heap, worker); - } while (1); + if (!tracer->trace_roots_only) { + do { + struct gc_ref obj = simple_worklist_pop(&tracer->worklist); + if (!gc_ref_is_heap_object(obj)) + break; + trace_one(obj, heap, worker); + } while (1); + } } static inline void gc_tracer_trace(struct gc_tracer *tracer) { @@ -82,4 +78,11 @@ gc_tracer_trace(struct gc_tracer *tracer) { &worker); } +static inline void +gc_tracer_trace_roots(struct gc_tracer *tracer) { + tracer->trace_roots_only = 1; + gc_tracer_trace(tracer); + tracer->trace_roots_only = 0; +} + #endif // SERIAL_TRACER_H diff --git a/src/tracer.h b/src/tracer.h index ec6a140b1..c563a7018 100644 --- a/src/tracer.h +++ b/src/tracer.h @@ -47,11 +47,8 @@ static void gc_tracer_prepare(struct gc_tracer *tracer); static void gc_tracer_release(struct gc_tracer *tracer); // Add root objects to the trace. Call before tracer_trace. -static inline void gc_tracer_enqueue_root(struct gc_tracer *tracer, - struct gc_ref obj); -static inline void gc_tracer_enqueue_roots(struct gc_tracer *tracer, - struct gc_ref *objs, - size_t count); +static inline void gc_tracer_add_root(struct gc_tracer *tracer, + struct gc_root root); // Given that an object has been shaded grey, enqueue for tracing. static inline void gc_trace_worker_enqueue(struct gc_trace_worker *worker, @@ -59,10 +56,10 @@ static inline void gc_trace_worker_enqueue(struct gc_trace_worker *worker, static inline struct gc_trace_worker_data* gc_trace_worker_data(struct gc_trace_worker *worker) GC_ALWAYS_INLINE; -static inline void gc_tracer_add_root(struct gc_tracer *tracer, - struct gc_root root); +// Just trace roots. +static inline void gc_tracer_trace_roots(struct gc_tracer *tracer); -// Run the full trace. +// Run the full trace, including roots. static inline void gc_tracer_trace(struct gc_tracer *tracer); #endif // TRACER_H From 6545b34073528c3dbc7a6ed133cef2c96303d780 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 10 Sep 2024 10:55:38 +0200 Subject: [PATCH 284/403] Reorder events in event listener; refactors to mmc and pcc In GC, request mutators to stop before doing anything else; changes the order of the event listener interface. Also, refactor mmc to look more like pcc. 
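The reordering matters to any listener that derives timing from these hooks, as gc-basic-stats.h now does: mutator time is cut off at requesting_stop rather than at prepare_gc. As a rough illustration only (this sketch is not part of the patch, the clock helper is invented, and the remaining hooks of struct gc_event_listener are omitted), a listener written against the new order can attribute the stop-the-world ramp-up like this, assuming the api/ headers are on the include path:

#include <stdint.h>
#include <time.h>

#include "gc-event-listener.h"   /* for enum gc_collection_kind */

struct stw_stats {
  uint64_t stop_requested_usec;
  uint64_t worst_stop_usec;
};

static uint64_t now_usec(void) {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return (uint64_t)ts.tv_sec * 1000000 + (uint64_t)ts.tv_nsec / 1000;
}

/* Fires first, while some mutators may still be running. */
static void stw_requesting_stop(void *data) {
  struct stw_stats *stats = data;
  stats->stop_requested_usec = now_usec();
}

static void stw_waiting_for_stop(void *data) {}

/* Fires once every mutator has stopped; the gap is the pause ramp-up. */
static void stw_mutators_stopped(void *data) {
  struct stw_stats *stats = data;
  uint64_t pause = now_usec() - stats->stop_requested_usec;
  if (pause > stats->worst_stop_usec)
    stats->worst_stop_usec = pause;
}

/* Under the new order the collection kind arrives only after the stop, so
   it describes a cycle whose pause has already been accounted for. */
static void stw_prepare_gc(void *data, enum gc_collection_kind kind) {
  (void) data; (void) kind;
}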
--- api/gc-basic-stats.h | 17 ++-- api/gc-event-listener-chain.h | 15 ++- api/gc-event-listener.h | 2 +- api/gc-null-event-listener.h | 6 +- doc/manual.md | 4 +- src/bdw.c | 2 +- src/copy-space.h | 4 +- src/mmc.c | 169 +++++++++++++++------------------- src/pcc.c | 71 +++++++------- src/scc.c | 6 +- src/semi.c | 2 +- 11 files changed, 140 insertions(+), 158 deletions(-) diff --git a/api/gc-basic-stats.h b/api/gc-basic-stats.h index 40cdbcb3e..6b39a59c6 100644 --- a/api/gc-basic-stats.h +++ b/api/gc-basic-stats.h @@ -40,6 +40,15 @@ static inline void gc_basic_stats_init(void *data, size_t heap_size) { stats->heap_size = stats->max_heap_size = heap_size; } +static inline void gc_basic_stats_requesting_stop(void *data) { + struct gc_basic_stats *stats = data; + uint64_t now = gc_basic_stats_now(); + stats->elapsed_mutator_usec += now - stats->last_time_usec; + stats->last_time_usec = now; +} +static inline void gc_basic_stats_waiting_for_stop(void *data) {} +static inline void gc_basic_stats_mutators_stopped(void *data) {} + static inline void gc_basic_stats_prepare_gc(void *data, enum gc_collection_kind kind) { struct gc_basic_stats *stats = data; @@ -47,14 +56,8 @@ static inline void gc_basic_stats_prepare_gc(void *data, stats->minor_collection_count++; else stats->major_collection_count++; - uint64_t now = gc_basic_stats_now(); - stats->elapsed_mutator_usec += now - stats->last_time_usec; - stats->last_time_usec = now; } -static inline void gc_basic_stats_requesting_stop(void *data) {} -static inline void gc_basic_stats_waiting_for_stop(void *data) {} -static inline void gc_basic_stats_mutators_stopped(void *data) {} static inline void gc_basic_stats_roots_traced(void *data) {} static inline void gc_basic_stats_heap_traced(void *data) {} static inline void gc_basic_stats_ephemerons_traced(void *data) {} @@ -94,10 +97,10 @@ static inline void gc_basic_stats_live_data_size(void *data, size_t size) { #define GC_BASIC_STATS \ ((struct gc_event_listener) { \ gc_basic_stats_init, \ - gc_basic_stats_prepare_gc, \ gc_basic_stats_requesting_stop, \ gc_basic_stats_waiting_for_stop, \ gc_basic_stats_mutators_stopped, \ + gc_basic_stats_prepare_gc, \ gc_basic_stats_roots_traced, \ gc_basic_stats_heap_traced, \ gc_basic_stats_ephemerons_traced, \ diff --git a/api/gc-event-listener-chain.h b/api/gc-event-listener-chain.h index 96a7356a8..27b56d5c6 100644 --- a/api/gc-event-listener-chain.h +++ b/api/gc-event-listener-chain.h @@ -20,13 +20,6 @@ static inline void gc_event_listener_chain_init(void *data, size_t heap_size) { chain->tail.init(chain->tail_data, heap_size); } -static inline void gc_event_listener_chain_prepare_gc(void *data, - enum gc_collection_kind kind) { - struct gc_event_listener_chain *chain = data; - chain->head.prepare_gc(chain->head_data, kind); - chain->tail.prepare_gc(chain->tail_data, kind); -} - static inline void gc_event_listener_chain_requesting_stop(void *data) { struct gc_event_listener_chain *chain = data; chain->head.requesting_stop(chain->head_data); @@ -42,6 +35,12 @@ static inline void gc_event_listener_chain_mutators_stopped(void *data) { chain->head.mutators_stopped(chain->head_data); chain->tail.mutators_stopped(chain->tail_data); } +static inline void +gc_event_listener_chain_prepare_gc(void *data, enum gc_collection_kind kind) { + struct gc_event_listener_chain *chain = data; + chain->head.prepare_gc(chain->head_data, kind); + chain->tail.prepare_gc(chain->tail_data, kind); +} static inline void gc_event_listener_chain_roots_traced(void *data) { struct 
gc_event_listener_chain *chain = data; chain->head.roots_traced(chain->head_data); @@ -121,10 +120,10 @@ static inline void gc_event_listener_chain_live_data_size(void *data, size_t siz #define GC_EVENT_LISTENER_CHAIN \ ((struct gc_event_listener) { \ gc_event_listener_chain_init, \ - gc_event_listener_chain_prepare_gc, \ gc_event_listener_chain_requesting_stop, \ gc_event_listener_chain_waiting_for_stop, \ gc_event_listener_chain_mutators_stopped, \ + gc_event_listener_chain_prepare_gc, \ gc_event_listener_chain_roots_traced, \ gc_event_listener_chain_heap_traced, \ gc_event_listener_chain_ephemerons_traced, \ diff --git a/api/gc-event-listener.h b/api/gc-event-listener.h index f5d8180f6..66801a52c 100644 --- a/api/gc-event-listener.h +++ b/api/gc-event-listener.h @@ -5,10 +5,10 @@ struct gc_event_listener { void (*init)(void *data, size_t heap_size); - void (*prepare_gc)(void *data, enum gc_collection_kind kind); void (*requesting_stop)(void *data); void (*waiting_for_stop)(void *data); void (*mutators_stopped)(void *data); + void (*prepare_gc)(void *data, enum gc_collection_kind kind); void (*roots_traced)(void *data); void (*heap_traced)(void *data); void (*ephemerons_traced)(void *data); diff --git a/api/gc-null-event-listener.h b/api/gc-null-event-listener.h index 7563c3a46..9c032ffc2 100644 --- a/api/gc-null-event-listener.h +++ b/api/gc-null-event-listener.h @@ -4,11 +4,11 @@ #include "gc-event-listener.h" static inline void gc_null_event_listener_init(void *data, size_t size) {} -static inline void gc_null_event_listener_prepare_gc(void *data, - enum gc_collection_kind) {} static inline void gc_null_event_listener_requesting_stop(void *data) {} static inline void gc_null_event_listener_waiting_for_stop(void *data) {} static inline void gc_null_event_listener_mutators_stopped(void *data) {} +static inline void gc_null_event_listener_prepare_gc(void *data, + enum gc_collection_kind) {} static inline void gc_null_event_listener_roots_traced(void *data) {} static inline void gc_null_event_listener_heap_traced(void *data) {} static inline void gc_null_event_listener_ephemerons_traced(void *data) {} @@ -28,10 +28,10 @@ static inline void gc_null_event_listener_live_data_size(void *, size_t) {} #define GC_NULL_EVENT_LISTENER \ ((struct gc_event_listener) { \ gc_null_event_listener_init, \ - gc_null_event_listener_prepare_gc, \ gc_null_event_listener_requesting_stop, \ gc_null_event_listener_waiting_for_stop, \ gc_null_event_listener_mutators_stopped, \ + gc_null_event_listener_prepare_gc, \ gc_null_event_listener_roots_traced, \ gc_null_event_listener_heap_traced, \ gc_null_event_listener_ephemerons_traced, \ diff --git a/doc/manual.md b/doc/manual.md index d856df219..eb648f54d 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -594,8 +594,6 @@ There are three heap events: The collection events form a kind of state machine, and are called in this order: - * `prepare_gc(void* data, int is_minor, int is_compacting)`: Called at - the beginning of GC. Some mutators may still be active. * `requesting_stop(void* data)`: Called when the collector asks mutators to stop. * `waiting_for_stop(void* data)`: Called when the collector has done @@ -603,6 +601,8 @@ this order: mutators to stop. * `mutators_stopped(void* data)`: Called when all mutators have stopped; the trace phase follows. + * `prepare_gc(void* data, enum gc_collection_kind gc_kind)`: Called + to indicate which kind of collection is happening. * `roots_traced(void* data)`: Called when roots have been visited. 
* `heap_traced(void* data)`: Called when the whole heap has been traced. diff --git a/src/bdw.c b/src/bdw.c index a69901f7f..3149bf3c7 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -412,13 +412,13 @@ gc_heap_pending_ephemerons(struct gc_heap *heap) { static void on_collection_event(GC_EventType event) { switch (event) { case GC_EVENT_START: { - HEAP_EVENT(prepare_gc, GC_COLLECTION_MAJOR); HEAP_EVENT(requesting_stop); HEAP_EVENT(waiting_for_stop); break; } case GC_EVENT_MARK_START: HEAP_EVENT(mutators_stopped); + HEAP_EVENT(prepare_gc, GC_COLLECTION_MAJOR); break; case GC_EVENT_MARK_END: HEAP_EVENT(roots_traced); diff --git a/src/copy-space.h b/src/copy-space.h index e51f9f8ee..f125a08e1 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -2,6 +2,7 @@ #define COPY_SPACE_H #include +#include #include "gc-api.h" @@ -541,8 +542,7 @@ copy_space_contains(struct copy_space *space, struct gc_ref ref) { } static inline void -copy_space_allocator_init(struct copy_space_allocator *alloc, - struct copy_space *space) { +copy_space_allocator_init(struct copy_space_allocator *alloc) { memset(alloc, 0, sizeof(*alloc)); } diff --git a/src/mmc.c b/src/mmc.c index e6335dced..7975a643c 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -3,9 +3,6 @@ #include #include #include -#include -#include -#include #include "gc-api.h" @@ -44,14 +41,12 @@ struct gc_heap { struct gc_pending_ephemerons *pending_ephemerons; struct gc_finalizer_state *finalizer_state; enum gc_collection_kind gc_kind; - int multithreaded; size_t mutator_count; size_t paused_mutator_count; size_t inactive_mutator_count; struct gc_heap_roots *roots; struct gc_mutator *mutators; long count; - uint8_t last_collection_was_minor; struct gc_tracer tracer; double fragmentation_low_threshold; double fragmentation_high_threshold; @@ -116,29 +111,24 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, - struct gc_trace_worker *worker) { + struct gc_trace_worker_data *data) { if (!gc_ref_is_heap_object(ref)) return 0; - if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) { - struct nofl_allocator *alloc = - worker ? &gc_trace_worker_data(worker)->allocator : NULL; + if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) return nofl_space_evacuate_or_mark_object(heap_nofl_space(heap), edge, ref, - alloc); - } else if (large_object_space_contains(heap_large_object_space(heap), ref)) + &data->allocator); + else if (large_object_space_contains(heap_large_object_space(heap), ref)) return large_object_space_mark_object(heap_large_object_space(heap), ref); else return gc_extern_space_visit(heap_extern_space(heap), edge, ref); } -static inline int trace_edge(struct gc_heap *heap, - struct gc_edge edge, - struct gc_trace_worker *worker) GC_ALWAYS_INLINE; - static inline int -trace_edge(struct gc_heap *heap, struct gc_edge edge, struct gc_trace_worker *worker) { +trace_edge(struct gc_heap *heap, struct gc_edge edge, + struct gc_trace_worker_data *data) { struct gc_ref ref = gc_edge_ref(edge); - int is_new = do_trace(heap, edge, ref, worker); + int is_new = do_trace(heap, edge, ref, data); if (is_new && GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, @@ -191,13 +181,12 @@ add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = heap; mut->event_listener_data = heap->event_listener.mutator_added(heap->event_listener_data); + nofl_allocator_reset(&mut->allocator); heap_lock(heap); // We have no roots. 
If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. while (mutators_are_stopping(heap)) pthread_cond_wait(&heap->mutator_cond, &heap->lock); - if (heap->mutator_count == 1) - heap->multithreaded = 1; mut->next = mut->prev = NULL; struct gc_mutator *tail = heap->mutators; if (tail) { @@ -229,32 +218,6 @@ remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { heap_unlock(heap); } -static void -request_mutators_to_stop(struct gc_heap *heap) { - GC_ASSERT(!mutators_are_stopping(heap)); - atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); -} - -static void -allow_mutators_to_continue(struct gc_heap *heap) { - GC_ASSERT(mutators_are_stopping(heap)); - GC_ASSERT(all_mutators_stopped(heap)); - heap->paused_mutator_count = 0; - atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); - GC_ASSERT(!mutators_are_stopping(heap)); - pthread_cond_broadcast(&heap->mutator_cond); -} - -static void -heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { - size_t previous = heap->large_object_pages; - heap->large_object_pages = npages; - GC_ASSERT(npages <= previous); - size_t bytes = (previous - npages) << - heap_large_object_space(heap)->page_size_log2; - nofl_space_reacquire_memory(heap_nofl_space(heap), bytes); -} - void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots) { mut->roots = roots; @@ -273,7 +236,7 @@ static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { struct gc_trace_worker *worker = trace_data; - if (trace_edge(heap, edge, worker)) + if (trace_edge(heap, edge, gc_trace_worker_data(worker))) gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); } @@ -404,39 +367,29 @@ trace_root(struct gc_root root, struct gc_heap *heap, } static void -enqueue_conservative_roots(uintptr_t low, uintptr_t high, - struct gc_heap *heap, void *data) { - int *possibly_interior = data; - gc_tracer_add_root(&heap->tracer, - gc_root_conservative_edges(low, high, *possibly_interior)); +request_mutators_to_stop(struct gc_heap *heap) { + GC_ASSERT(!mutators_are_stopping(heap)); + atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); } static void -enqueue_mutator_conservative_roots(struct gc_heap *heap) { - if (gc_has_mutator_conservative_roots()) { - int possibly_interior = gc_mutator_conservative_roots_may_be_interior(); - for (struct gc_mutator *mut = heap->mutators; - mut; - mut = mut->next) - gc_stack_visit(&mut->stack, enqueue_conservative_roots, heap, - &possibly_interior); - } +allow_mutators_to_continue(struct gc_heap *heap) { + GC_ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(all_mutators_stopped(heap)); + heap->paused_mutator_count--; + atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); + GC_ASSERT(!mutators_are_stopping(heap)); + pthread_cond_broadcast(&heap->mutator_cond); } static void -enqueue_global_conservative_roots(struct gc_heap *heap) { - if (gc_has_global_conservative_roots()) { - int possibly_interior = 0; - gc_platform_visit_global_conservative_roots - (enqueue_conservative_roots, heap, &possibly_interior); - } -} - -static void -enqueue_pinned_roots(struct gc_heap *heap) { - GC_ASSERT(!heap_nofl_space(heap)->evacuating); - enqueue_mutator_conservative_roots(heap); - enqueue_global_conservative_roots(heap); +heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { + size_t previous = heap->large_object_pages; + 
heap->large_object_pages = npages; + GC_ASSERT(npages <= previous); + size_t bytes = (previous - npages) << + heap_large_object_space(heap)->page_size_log2; + nofl_space_reacquire_memory(heap_nofl_space(heap), bytes); } static void @@ -446,13 +399,6 @@ wait_for_mutators_to_stop(struct gc_heap *heap) { pthread_cond_wait(&heap->collector_cond, &heap->lock); } -void -gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) { - GC_ASSERT(obj_size > gc_allocator_large_threshold()); - gc_object_set_remembered(obj); -} - static enum gc_collection_kind pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) GC_NEVER_INLINE; @@ -466,17 +412,10 @@ pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { if (all_mutators_stopped(heap)) pthread_cond_signal(&heap->collector_cond); - // Go to sleep and wake up when the collector is done. Note, - // however, that it may be that some other mutator manages to - // trigger collection before we wake up. In that case we need to - // mark roots, not just sleep again. To detect a wakeup on this - // collection vs a future collection, we use the global GC count. - // This is safe because the count is protected by the heap lock, - // which we hold. - long epoch = heap->count; do pthread_cond_wait(&heap->mutator_cond, &heap->lock); - while (mutators_are_stopping(heap) && heap->count == epoch); + while (mutators_are_stopping(heap)); + heap->paused_mutator_count--; MUTATOR_EVENT(mut, mutator_restarted); return collection_kind; @@ -489,8 +428,6 @@ pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); GC_ASSERT(mutators_are_stopping(heap)); MUTATOR_EVENT(mut, mutator_stopping); - nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); - gc_stack_capture_hot(&mut->stack); return pause_mutator_for_collection(heap, mut); } @@ -667,6 +604,42 @@ determine_collection_kind(struct gc_heap *heap, return gc_kind; } +static void +enqueue_conservative_roots(uintptr_t low, uintptr_t high, + struct gc_heap *heap, void *data) { + int *possibly_interior = data; + gc_tracer_add_root(&heap->tracer, + gc_root_conservative_edges(low, high, *possibly_interior)); +} + +static void +enqueue_mutator_conservative_roots(struct gc_heap *heap) { + if (gc_has_mutator_conservative_roots()) { + int possibly_interior = gc_mutator_conservative_roots_may_be_interior(); + for (struct gc_mutator *mut = heap->mutators; + mut; + mut = mut->next) + gc_stack_visit(&mut->stack, enqueue_conservative_roots, heap, + &possibly_interior); + } +} + +static void +enqueue_global_conservative_roots(struct gc_heap *heap) { + if (gc_has_global_conservative_roots()) { + int possibly_interior = 0; + gc_platform_visit_global_conservative_roots + (enqueue_conservative_roots, heap, &possibly_interior); + } +} + +static void +enqueue_pinned_roots(struct gc_heap *heap) { + GC_ASSERT(!heap_nofl_space(heap)->evacuating); + enqueue_mutator_conservative_roots(heap); + enqueue_global_conservative_roots(heap); +} + static void enqueue_root_edge(struct gc_edge edge, struct gc_heap *heap, void *unused) { gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); @@ -766,7 +739,6 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { DEBUG("start collect #%ld:\n", heap->count); HEAP_EVENT(heap, requesting_stop); request_mutators_to_stop(heap); - gc_stack_capture_hot(&mut->stack); nofl_finish_sweeping(&mut->allocator, nofl_space); HEAP_EVENT(heap, waiting_for_stop); 
wait_for_mutators_to_stop(heap); @@ -786,8 +758,11 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { DEBUG("last gc yield: %f; fragmentation: %f\n", yield, fragmentation); detect_out_of_memory(heap); enqueue_pinned_roots(heap); + // Eagerly trace pinned roots if we are going to relocate objects. if (gc_kind == GC_COLLECTION_COMPACTING) gc_tracer_trace_roots(&heap->tracer); + // Process the rest of the roots in parallel. This heap event should probably + // be removed, as there is no clear cutoff time. HEAP_EVENT(heap, roots_traced); enqueue_relocatable_roots(heap, gc_kind); nofl_space_start_gc(nofl_space, gc_kind); @@ -804,7 +779,6 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { large_object_space_finish_gc(lospace, is_minor); gc_extern_space_finish_gc(exspace, is_minor); heap->count++; - heap->last_collection_was_minor = is_minor; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); HEAP_EVENT(heap, restarting_mutators); allow_mutators_to_continue(heap); @@ -815,6 +789,8 @@ trigger_collection(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); int prev_kind = -1; + gc_stack_capture_hot(&mut->stack); + nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); heap_lock(heap); while (mutators_are_stopping(heap)) prev_kind = pause_mutator_for_collection_with_lock(mut); @@ -879,6 +855,13 @@ gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } +void +gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) { + GC_ASSERT(obj_size > gc_allocator_large_threshold()); + gc_object_set_remembered(obj); +} + struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { struct gc_ref ret = diff --git a/src/pcc.c b/src/pcc.c index 3023d465d..403bfe51f 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -3,9 +3,6 @@ #include #include #include -#include -#include -#include #include "gc-api.h" @@ -88,7 +85,7 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_heap *heap, struct gc_trace_worker *worker) { struct gc_trace_worker_data data; - copy_space_allocator_init(&data.allocator, heap_copy_space(heap)); + copy_space_allocator_init(&data.allocator); f(tracer, heap, worker, &data); copy_space_allocator_finish(&data.allocator, heap_copy_space(heap)); } @@ -153,7 +150,7 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = heap; mut->event_listener_data = heap->event_listener.mutator_added(heap->event_listener_data); - copy_space_allocator_init(&mut->allocator, heap_copy_space(heap)); + copy_space_allocator_init(&mut->allocator); heap_lock(heap); // We have no roots. If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. 
@@ -171,9 +168,9 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { } static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); heap_lock(heap); heap->mutator_count--; if (mut->next) @@ -189,29 +186,6 @@ static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { heap_unlock(heap); } -static void request_mutators_to_stop(struct gc_heap *heap) { - GC_ASSERT(!mutators_are_stopping(heap)); - atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); -} - -static void allow_mutators_to_continue(struct gc_heap *heap) { - GC_ASSERT(mutators_are_stopping(heap)); - GC_ASSERT(all_mutators_stopped(heap)); - heap->paused_mutator_count--; - atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); - GC_ASSERT(!mutators_are_stopping(heap)); - pthread_cond_broadcast(&heap->mutator_cond); -} - -static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { - size_t previous = heap->large_object_pages; - heap->large_object_pages = npages; - GC_ASSERT(npages <= previous); - size_t bytes = (previous - npages) << - heap_large_object_space(heap)->page_size_log2; - copy_space_reacquire_memory(heap_copy_space(heap), bytes); -} - void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots *roots) { mut->roots = roots; @@ -263,16 +237,35 @@ static inline void trace_root(struct gc_root root, struct gc_heap *heap, } } +static void request_mutators_to_stop(struct gc_heap *heap) { + GC_ASSERT(!mutators_are_stopping(heap)); + atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); +} + +static void allow_mutators_to_continue(struct gc_heap *heap) { + GC_ASSERT(mutators_are_stopping(heap)); + GC_ASSERT(all_mutators_stopped(heap)); + heap->paused_mutator_count--; + atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); + GC_ASSERT(!mutators_are_stopping(heap)); + pthread_cond_broadcast(&heap->mutator_cond); +} + +static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { + size_t previous = heap->large_object_pages; + heap->large_object_pages = npages; + GC_ASSERT(npages <= previous); + size_t bytes = (previous - npages) << + heap_large_object_space(heap)->page_size_log2; + copy_space_reacquire_memory(heap_copy_space(heap), bytes); +} + static void wait_for_mutators_to_stop(struct gc_heap *heap) { heap->paused_mutator_count++; while (!all_mutators_stopped(heap)) pthread_cond_wait(&heap->collector_cond, &heap->lock); } -void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) { -} - static void pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) GC_NEVER_INLINE; @@ -380,16 +373,16 @@ static void collect(struct gc_mutator *mut) { struct gc_extern_space *exspace = heap_extern_space(heap); MUTATOR_EVENT(mut, mutator_cause_gc); DEBUG("start collect #%ld:\n", heap->count); - HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); - large_object_space_start_gc(lospace, 0); - gc_extern_space_start_gc(exspace, 0); - resolve_ephemerons_lazily(heap); HEAP_EVENT(heap, requesting_stop); request_mutators_to_stop(heap); HEAP_EVENT(heap, waiting_for_stop); wait_for_mutators_to_stop(heap); HEAP_EVENT(heap, mutators_stopped); + HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); copy_space_flip(copy_space); + 
large_object_space_start_gc(lospace, 0); + gc_extern_space_start_gc(exspace, 0); + resolve_ephemerons_lazily(heap); gc_tracer_prepare(&heap->tracer); add_roots(heap); HEAP_EVENT(heap, roots_traced); @@ -483,6 +476,10 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } +void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, + struct gc_edge edge, struct gc_ref new_val) { +} + struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { return gc_allocate(mut, gc_ephemeron_size()); } diff --git a/src/scc.c b/src/scc.c index 33fe8a1b1..46ab98560 100644 --- a/src/scc.c +++ b/src/scc.c @@ -88,7 +88,7 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_heap *heap, struct gc_trace_worker *worker) { struct gc_trace_worker_data data; - copy_space_allocator_init(&data.allocator, heap_copy_space(heap)); + copy_space_allocator_init(&data.allocator); f(tracer, heap, worker, &data); copy_space_allocator_finish(&data.allocator, heap_copy_space(heap)); } @@ -152,7 +152,7 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = heap; mut->event_listener_data = heap->event_listener.mutator_added(heap->event_listener_data); - copy_space_allocator_init(&mut->allocator, heap_copy_space(heap)); + copy_space_allocator_init(&mut->allocator); heap_lock(heap); // We have no roots. If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. @@ -377,7 +377,6 @@ static void collect(struct gc_mutator *mut) { struct gc_extern_space *exspace = heap_extern_space(heap); MUTATOR_EVENT(mut, mutator_cause_gc); DEBUG("start collect #%ld:\n", heap->count); - HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); large_object_space_start_gc(lospace, 0); gc_extern_space_start_gc(exspace, 0); resolve_ephemerons_lazily(heap); @@ -386,6 +385,7 @@ static void collect(struct gc_mutator *mut) { HEAP_EVENT(heap, waiting_for_stop); wait_for_mutators_to_stop(heap); HEAP_EVENT(heap, mutators_stopped); + HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); copy_space_flip(copy_space); gc_tracer_prepare(&heap->tracer); add_roots(heap); diff --git a/src/semi.c b/src/semi.c index 0ed954727..3b4d90b20 100644 --- a/src/semi.c +++ b/src/semi.c @@ -386,11 +386,11 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { struct gc_heap *heap = mutator_heap(mut); int is_minor = 0; int is_compacting = 1; - HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); HEAP_EVENT(heap, requesting_stop); HEAP_EVENT(heap, waiting_for_stop); HEAP_EVENT(heap, mutators_stopped); + HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); struct semi_space *semi = heap_semi_space(heap); struct large_object_space *large = heap_large_object_space(heap); From 1ff082705e1f0c2681c68c0766d47e86fcd392a0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 10 Sep 2024 11:06:40 +0200 Subject: [PATCH 285/403] Remove scc PCC with GC_PARALLEL=0 is exactly equivalent to SCC. Also now that PCC will dynamically avoid atomic forwarding if parallelism is disabled at run-time, there is no need to keep SCC around. 
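As a minimal sketch of the run-time side of that claim (the names below are hypothetical, not pcc's actual API): whether evacuation needs the CAS-based forwarding path can be decided from the configured number of tracing threads, falling back to plain forwarding-pointer stores whenever only one tracer can touch an object.

#include <stddef.h>

static inline int
evacuation_needs_atomic_forwarding(size_t tracing_threads) {
#if defined(GC_PARALLEL) && !GC_PARALLEL
  (void) tracing_threads;
  return 0;                    /* serial build: no racing tracers, ever */
#else
  return tracing_threads > 1;  /* parallel build, but perhaps parallelism=1 */
#endif
}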
--- Makefile | 4 - README.md | 9 +- api/scc-attrs.h | 60 ---- doc/collector-pcc.md | 86 ++++-- doc/collector-scc.md | 62 ---- doc/collectors.md | 8 +- embed.mk | 3 - src/copy-space.h | 4 +- src/pcc.c | 4 + src/scc.c | 670 ------------------------------------------- 10 files changed, 78 insertions(+), 832 deletions(-) delete mode 100644 api/scc-attrs.h delete mode 100644 doc/collector-scc.md delete mode 100644 src/scc.c diff --git a/Makefile b/Makefile index 2a1fded30..30d84ff71 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,6 @@ TESTS = quads mt-gcbench ephemerons finalizers COLLECTORS = \ bdw \ semi \ - scc \ pcc \ \ mmc \ @@ -64,9 +63,6 @@ GC_LIBS_bdw = `pkg-config --libs bdw-gc` GC_STEM_semi = semi GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 -GC_STEM_scc = scc -GC_CFLAGS_scc = -DGC_PRECISE_ROOTS=1 - GC_STEM_pcc = pcc GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 diff --git a/README.md b/README.md index 601646cc5..52e98e77b 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,10 @@ See the [documentation](./doc/README.md). - Finalization (supporting resuscitation) - Ephemerons (except on `bdw`, which has a polyfill) - Conservative roots (optionally with `mmc` or always with `bdw`) - - Precise roots (optionally with `mmc` or always with `semi` / `pcc` / - `scc`) + - Precise roots (optionally with `mmc` or always with `semi` / `pcc`) - Precise embedder-parameterized heap tracing (except with `bdw`) - Conservative heap tracing (optionally with `mmc`, always with `bdw`) - - Parallel tracing (except `semi` and `scc`) + - Parallel tracing (except `semi`) - Parallel mutators (except `semi`) - Inline allocation / write barrier fast paths (supporting JIT) - One unified API with no-overhead abstraction: switch collectors when @@ -36,8 +35,8 @@ See the [documentation](./doc/README.md). * [src/](./src/): The actual GC implementation, containing a number of collector implementations. The embedder chooses which collector to use at compile-time. See the [documentation](./doc/collectors.md) - for more on the different collectors (`semi`, `bdw`, `scc`, `pcc`, - and the different flavors of `mmc`). + for more on the different collectors (`semi`, `bdw`, `pcc`, and the + different flavors of `mmc`). * [benchmarks/](./benchmarks/): Benchmarks. A work in progress. * [test/](./test/): A dusty attic of minimal testing. 
diff --git a/api/scc-attrs.h b/api/scc-attrs.h deleted file mode 100644 index 4db408cad..000000000 --- a/api/scc-attrs.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef SCC_ATTRS_H -#define SCC_ATTRS_H - -#include "gc-config.h" -#include "gc-assert.h" -#include "gc-attrs.h" - -static const uintptr_t GC_ALIGNMENT = 8; -static const size_t GC_LARGE_OBJECT_THRESHOLD = 8192; - -static inline enum gc_allocator_kind gc_allocator_kind(void) { - return GC_ALLOCATOR_INLINE_BUMP_POINTER; -} -static inline size_t gc_allocator_small_granule_size(void) { - return GC_ALIGNMENT; -} -static inline size_t gc_allocator_large_threshold(void) { - return GC_LARGE_OBJECT_THRESHOLD; -} - -static inline size_t gc_allocator_allocation_pointer_offset(void) { - return sizeof(uintptr_t) * 0; -} -static inline size_t gc_allocator_allocation_limit_offset(void) { - return sizeof(uintptr_t) * 1; -} - -static inline size_t gc_allocator_freelist_offset(size_t size) { - GC_CRASH(); -} - -static inline size_t gc_allocator_alloc_table_alignment(void) { - return 0; -} -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { - GC_CRASH(); -} -static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { - GC_CRASH(); -} - -static inline int gc_allocator_needs_clear(void) { - return 0; -} - -static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { - return GC_WRITE_BARRIER_NONE; -} -static inline size_t gc_write_barrier_card_table_alignment(void) { - GC_CRASH(); -} -static inline size_t gc_write_barrier_card_size(void) { - GC_CRASH(); -} - -static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { - return GC_SAFEPOINT_MECHANISM_COOPERATIVE; -} - -#endif // SCC_ATTRS_H diff --git a/doc/collector-pcc.md b/doc/collector-pcc.md index f2f3ff390..64af6c769 100644 --- a/doc/collector-pcc.md +++ b/doc/collector-pcc.md @@ -1,38 +1,84 @@ # Parallel copying collector -Whippet's `pcc` collector is a copying collector, exactly like -[`scc`](./collector-scc.md), but supporting multiple tracing threads. -See the discussion of `scc` for a general overview. +Whippet's `pcc` collector is a copying collector, like the more simple +[`semi`](./collector-semi.md), but supporting multiple mutator threads, +multiple tracing threads, and using an external FIFO worklist instead of +a Cheney worklist. -Also like `scc` and `semi`, `pcc` is not generational yet. If and when -`pcc` grows a young generation, it would be a great collector. +Like `semi`, `pcc` traces by evacuation: it moves all live objects on +every collection. (Exception: objects larger than 8192 bytes are +placed into a partitioned space which traces by marking in place instead +of copying.) Evacuation requires precise roots, so if your embedder +does not support precise roots, `pcc` is not for you. + +Again like `semi`, `pcc` generally requires a heap size at least twice +as large as the maximum live heap size, and performs best with ample +heap sizes; between 3× and 5× is best. + +Overall, `pcc` is a better version of `semi`. It should have broadly +the same performance characteristics with a single mutator and with +parallelism disabled, additionally allowing multiple mutators, and +scaling better with multiple tracing threads. + +Also like `semi`, `pcc` is not generational yet. If and when `pcc` +grows a young generation, it would be a great collector. ## Implementation notes +Unlike `semi` which has a single global bump-pointer allocation region, +`pcc` structures the heap into 64-kB blocks. 
In this way it supports +multiple mutator threads: mutators do local bump-pointer allocation into +their own block, and when their block is full, they fetch another from +the global store. + +The block size is 64 kB, but really it's 128 kB, because each block has +two halves: the active region and the copy reserve. Dividing each block +in two allows the collector to easily grow and shrink the heap while +ensuring there is always enough reserve space. + +Blocks are allocated in 64-MB aligned slabs, so there are 512 blocks in +a slab. The first block in a slab is used by the collector itself, to +keep metadata for the rest of the blocks, for example a chain pointer +allowing blocks to be collected in lists, a saved allocation pointer for +partially-filled blocks, whether the block is paged in or out, and so +on. + `pcc` supports tracing in parallel. This mechanism works somewhat like allocation, in which multiple trace workers compete to evacuate objects into their local allocation buffers; when an allocation buffer is full, the trace worker grabs another, just like mutators do. -To maintain a queue of objects to trace, `pcc` uses the [fine-grained -work-stealing parallel tracer](../src/parallel-tracer.h) originally -developed for [Whippet's Immix-like collector](./collector-whippet.md). -Each trace worker maintains a [local queue of objects that need -tracing](../src/local-worklist.h), which currently has 1024 entries. If -the local queue becomes full, the worker will publish 3/4 of those -entries to the worker's [shared worklist](../src/shared-worklist.h). -When a worker runs out of local work, it will first try to remove work -from its own shared worklist, then will try to steal from other workers. +Unlike the simple semi-space collector which uses a Cheney grey +worklist, `pcc` uses an external worklist. If parallelism is disabled +at compile-time, it uses a [simple first-in, first-out queue of objects +to be traced](../src/simple-worklist.h) originally developed for +[Whippet's Immix-like collector](./collector-whippet.md). Like a Cheney +worklist, this should result in objects being copied in breadth-first +order. The literature would suggest that depth-first is generally +better for locality, but that preserving allocation order is generally +best. This is something to experiment with in the future. -If only one tracing thread is enabled (`parallelism=1`), `pcc` uses +If parallelism is enabled, as it is by default, `pcc` uses the +[fine-grained work-stealing parallel tracer](../src/parallel-tracer.h) +originally developed for [Whippet's Immix-like +collector](./collector-whippet.md). Each trace worker maintains a +[local queue of objects that need tracing](../src/local-worklist.h), +which currently has 1024 entries. If the local queue becomes full, the +worker will publish 3/4 of those entries to the worker's [shared +worklist](../src/shared-worklist.h). When a worker runs out of local +work, it will first try to remove work from its own shared worklist, +then will try to steal from other workers. + +If only one tracing thread is enabled at run-time (`parallelism=1`) (or +if parallelism is disabled at compile-time), `pcc` will evacuate by non-atomic forwarding, but if multiple threads compete to evacuate objects, `pcc` uses [atomic compare-and-swap instead of simple forwarding pointer updates](./manual.md#forwarding-objects). This imposes around a ~30% performance penalty but having multiple tracing threads is generally worth it, unless the object graph is itself serial. 
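To make that trade-off concrete, here is a minimal, self-contained sketch of the two forwarding strategies. It is illustrative only: the object layout, the `FORWARDED_TAG` bit, and the function names are invented for this example and are not the actual forwarding routines in `src/copy-space.h`.

```c
#include <stdatomic.h>
#include <stdint.h>

struct object { _Atomic uintptr_t header; /* payload follows */ };

#define FORWARDED_TAG ((uintptr_t)1)  /* assumed: low header bit marks a forwarding pointer */

/* Single tracing thread: no other worker can race with us, so a plain store
   of the forwarding pointer into the old object's header is enough. */
uintptr_t forward_nonatomic(struct object *old, struct object *copy) {
  atomic_store_explicit(&old->header, (uintptr_t)copy | FORWARDED_TAG,
                        memory_order_relaxed);
  return (uintptr_t)copy;
}

/* Multiple tracing threads: claim the object with a compare-and-swap.  If the
   CAS fails, some other worker already installed its forwarding pointer, so
   we discard our copy and use the winner's.  (A production collector would
   typically also have a "busy" state so that losers avoid copying at all.) */
uintptr_t forward_atomic(struct object *old, struct object *copy,
                         uintptr_t old_header) {
  uintptr_t desired = (uintptr_t)copy | FORWARDED_TAG;
  if (atomic_compare_exchange_strong(&old->header, &old_header, desired))
    return (uintptr_t)copy;            /* we won the race */
  return old_header & ~FORWARDED_TAG;  /* we lost: use the winner's copy */
}
```

The compare-and-swap, and the possibility of losing the race after having already copied, are where the extra cost of the atomic path comes from.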
-As with `scc`, the memory used for the external worklist is dynamically -allocated from the OS and is not currently counted as contributing to -the heap size. If you are targetting a microcontroller or something, -probably you need to choose a different kind of collector that never -dynamically allocates, such as `semi`. +The memory used for the external worklist is dynamically allocated from +the OS and is not currently counted as contributing to the heap size. +If you are targetting a microcontroller or something, probably you need +to choose a different kind of collector that never dynamically +allocates, such as `semi`. diff --git a/doc/collector-scc.md b/doc/collector-scc.md deleted file mode 100644 index 2512bb9fd..000000000 --- a/doc/collector-scc.md +++ /dev/null @@ -1,62 +0,0 @@ -# Serial copying collector - -Whippet's `scc` collector is a copying collector, like the more simple -[`semi`](./collector-semi.md), but supporting multiple mutator threads, -and using an external FIFO worklist instead of a Cheney worklist. - -Like `semi`, `scc` traces by evacuation: it moves all live objects on -every collection. (Exception: objects larger than 8192 bytes are -placed into a partitioned space which traces by marking in place instead -of copying.) Evacuation requires precise roots, so if your embedder -does not support precise roots, `scc` is not for you. - -Again like `semi`, `scc` generally requires a heap size at least twice -as large as the maximum live heap size, and performs best with ample -heap sizes; between 3× and 5× is best. - -Overall, `scc` is most useful for isolating the performance implications -of using a block-structured heap and of using an external worklist -rather than a Cheney worklist as `semi` does. It also supports multiple -mutator threads, so it is generally more useful than `semi`. Also, -compared to `pcc`, we can measure the overhead that `pcc` imposes to -atomically forward objects. - -But given a choice, you probably want `pcc`; though it's slower with -only one tracing thread, once you have more than once tracing thread -it's a win over `scc`. - -## Implementation notes - -Unlike `semi` which has a single global bump-pointer allocation region, -`scc` structures the heap into 64-kB blocks. In this way it supports -multiple mutator threads: mutators do local bump-pointer allocation into -their own block, and when their block is full, they fetch another from -the global store. - -The block size is 64 kB, but really it's 128 kB, because each block has -two halves: the active region and the copy reserve. Dividing each block -in two allows the collector to easily grow and shrink the heap while -ensuring there is always enough reserve space. - -Blocks are allocated in 64-MB aligned slabs, so there are 512 blocks in -a slab. The first block in a slab is used by the collector itself, to -keep metadata for the rest of the blocks, for example a chain pointer -allowing blocks to be collected in lists, a saved allocation pointer for -partially-filled blocks, whether the block is paged in or out, and so -on. - -Unlike the simple semi-space collector which uses a Cheney grey -worklist, `scc` uses a [simple first-in, first-out queue of objects to -be traced](../src/simple-worklist.h) originally developed for [Whippet's -Immix-like collector](./collector-whippet.md). Like a Cheney worklist, -this should result in objects being copied in breadth-first order. 
The -literature would suggest that depth-first is generally better for -locality, but that preserving allocation order is generally best. This -is something to experiment with in the future. - -The memory used for the external worklist is dynamically allocated from -the OS and is not currently counted as contributing to the heap size. -If you are targetting a microcontroller or something, probably you need -to choose a different kind of collector that never dynamically -allocates, such as `semi`. - diff --git a/doc/collectors.md b/doc/collectors.md index 1c23f3e9d..ccb95ef76 100644 --- a/doc/collectors.md +++ b/doc/collectors.md @@ -3,10 +3,8 @@ Whippet has five collectors currently: - [Semi-space collector (`semi`)](./collector-semi.md): For single-threaded embedders who are not too tight on memory. - - [Serial copying collector (`scc`)](./collector-scc.md): Like `semi`, - but with support for multiple mutator threads. - - [Parallel copying collector (`pcc`)](./collector-pcc.md): Like `scc`, - but with support for multiple tracing threads. + - [Parallel copying collector (`pcc`)](./collector-pcc.md): Like `semi`, + but with support for multiple mutator and tracing threads. - [Mostly marking collector (`mmc`)](./collector-mmc.md): Immix-inspired collector. Optionally parallel, conservative (stack and/or heap), and/or generational. @@ -26,8 +24,6 @@ out mutator/embedder bugs. Then if memory is tight, switch to If you are aiming for maximum simplicity and minimal code size (ten kilobytes or so), use `semi`. -Only use `scc` if you are investigating GC internals. - If you are writing a new project, you have a choice as to whether to pay the development cost of precise roots or not. If you choose to not have precise roots, then go for `stack-conservative-parallel-mmc` directly. 
diff --git a/embed.mk b/embed.mk index fef8de8b1..d0608b345 100644 --- a/embed.mk +++ b/embed.mk @@ -42,9 +42,6 @@ GC_LIBS_bdw = `pkg-config --libs bdw-gc` GC_STEM_semi = semi GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 -GC_STEM_scc = scc -GC_CFLAGS_scc = -DGC_PRECISE_ROOTS=1 - GC_STEM_pcc = pcc GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 diff --git a/src/copy-space.h b/src/copy-space.h index f125a08e1..ba26444bb 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -523,7 +523,7 @@ static inline int copy_space_forward(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref, struct copy_space_allocator *alloc) { - if (space->atomic_forward) + if (GC_PARALLEL && space->atomic_forward) return copy_space_forward_atomic(space, edge, old_ref, alloc); return copy_space_forward_nonatomic(space, edge, old_ref, alloc); } @@ -531,7 +531,7 @@ copy_space_forward(struct copy_space *space, struct gc_edge edge, static inline int copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref) { - if (space->atomic_forward) + if (GC_PARALLEL && space->atomic_forward) return copy_space_forward_if_traced_atomic(space, edge, old_ref); return copy_space_forward_if_traced_nonatomic(space, edge, old_ref); } diff --git a/src/pcc.c b/src/pcc.c index 403bfe51f..0f5d84abd 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -15,7 +15,11 @@ #include "gc-inline.h" #include "gc-trace.h" #include "large-object-space.h" +#if GC_PARALLEL #include "parallel-tracer.h" +#else +#include "serial-tracer.h" +#endif #include "spin.h" #include "pcc-attrs.h" diff --git a/src/scc.c b/src/scc.c deleted file mode 100644 index 46ab98560..000000000 --- a/src/scc.c +++ /dev/null @@ -1,670 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gc-api.h" - -#define GC_IMPL 1 -#include "gc-internal.h" - -#include "copy-space.h" -#include "debug.h" -#include "gc-align.h" -#include "gc-inline.h" -#include "gc-trace.h" -#include "large-object-space.h" -#include "serial-tracer.h" -#include "spin.h" -#include "scc-attrs.h" - -struct gc_heap { - struct copy_space copy_space; - struct large_object_space large_object_space; - struct gc_extern_space *extern_space; - size_t large_object_pages; - pthread_mutex_t lock; - pthread_cond_t collector_cond; - pthread_cond_t mutator_cond; - size_t size; - int collecting; - int check_pending_ephemerons; - struct gc_pending_ephemerons *pending_ephemerons; - struct gc_finalizer_state *finalizer_state; - size_t mutator_count; - size_t paused_mutator_count; - size_t inactive_mutator_count; - struct gc_heap_roots *roots; - struct gc_mutator *mutators; - long count; - struct gc_tracer tracer; - double pending_ephemerons_size_factor; - double pending_ephemerons_size_slop; - struct gc_event_listener event_listener; - void *event_listener_data; -}; - -#define HEAP_EVENT(heap, event, ...) \ - (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__) -#define MUTATOR_EVENT(mut, event, ...) 
\ - (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) - -struct gc_mutator { - struct copy_space_allocator allocator; - struct gc_heap *heap; - struct gc_mutator_roots *roots; - void *event_listener_data; - struct gc_mutator *next; - struct gc_mutator *prev; -}; - -struct gc_trace_worker_data { - struct copy_space_allocator allocator; -}; - -static inline struct copy_space* heap_copy_space(struct gc_heap *heap) { - return &heap->copy_space; -} -static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { - return &heap->large_object_space; -} -static inline struct gc_extern_space* heap_extern_space(struct gc_heap *heap) { - return heap->extern_space; -} -static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) { - return mutator->heap; -} - -static void -gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, - struct gc_heap *heap, - struct gc_trace_worker *worker, - struct gc_trace_worker_data *data), - struct gc_tracer *tracer, - struct gc_heap *heap, - struct gc_trace_worker *worker) { - struct gc_trace_worker_data data; - copy_space_allocator_init(&data.allocator); - f(tracer, heap, worker, &data); - copy_space_allocator_finish(&data.allocator, heap_copy_space(heap)); -} - -static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, - struct gc_ref ref, - struct gc_trace_worker_data *data) { - if (!gc_ref_is_heap_object(ref)) - return 0; - if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) - return copy_space_forward_nonatomic(heap_copy_space(heap), edge, ref, - &data->allocator); - else if (large_object_space_contains(heap_large_object_space(heap), ref)) - return large_object_space_mark_object(heap_large_object_space(heap), ref); - else - return gc_extern_space_visit(heap_extern_space(heap), edge, ref); -} - -static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge, - struct gc_trace_worker *worker) { - struct gc_ref ref = gc_edge_ref(edge); - struct gc_trace_worker_data *data = gc_trace_worker_data(worker); - int is_new = do_trace(heap, edge, ref, data); - - if (is_new && heap->check_pending_ephemerons) - gc_resolve_pending_ephemerons(ref, heap); - - return is_new; -} - -int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { - struct gc_ref ref = gc_edge_ref(edge); - if (!gc_ref_is_heap_object(ref)) - return 0; - if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) - return copy_space_forward_if_traced_nonatomic(heap_copy_space(heap), edge, - ref); - if (large_object_space_contains(heap_large_object_space(heap), ref)) - return large_object_space_is_copied(heap_large_object_space(heap), ref); - GC_CRASH(); -} - -static int mutators_are_stopping(struct gc_heap *heap) { - return atomic_load_explicit(&heap->collecting, memory_order_relaxed); -} - -static inline void heap_lock(struct gc_heap *heap) { - pthread_mutex_lock(&heap->lock); -} -static inline void heap_unlock(struct gc_heap *heap) { - pthread_mutex_unlock(&heap->lock); -} - -// with heap lock -static inline int all_mutators_stopped(struct gc_heap *heap) { - return heap->mutator_count == - heap->paused_mutator_count + heap->inactive_mutator_count; -} - -static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { - mut->heap = heap; - mut->event_listener_data = - heap->event_listener.mutator_added(heap->event_listener_data); - copy_space_allocator_init(&mut->allocator); - heap_lock(heap); - // We have no roots. 
If there is a GC currently in progress, we have - // nothing to add. Just wait until it's done. - while (mutators_are_stopping(heap)) - pthread_cond_wait(&heap->mutator_cond, &heap->lock); - mut->next = mut->prev = NULL; - struct gc_mutator *tail = heap->mutators; - if (tail) { - mut->next = tail; - tail->prev = mut; - } - heap->mutators = mut; - heap->mutator_count++; - heap_unlock(heap); -} - -static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { - MUTATOR_EVENT(mut, mutator_removed); - mut->heap = NULL; - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); - heap_lock(heap); - heap->mutator_count--; - if (mut->next) - mut->next->prev = mut->prev; - if (mut->prev) - mut->prev->next = mut->next; - else - heap->mutators = mut->next; - // We have no roots. If there is a GC stop currently in progress, - // maybe tell the controller it can continue. - if (mutators_are_stopping(heap) && all_mutators_stopped(heap)) - pthread_cond_signal(&heap->collector_cond); - heap_unlock(heap); -} - -static void request_mutators_to_stop(struct gc_heap *heap) { - GC_ASSERT(!mutators_are_stopping(heap)); - atomic_store_explicit(&heap->collecting, 1, memory_order_relaxed); -} - -static void allow_mutators_to_continue(struct gc_heap *heap) { - GC_ASSERT(mutators_are_stopping(heap)); - GC_ASSERT(all_mutators_stopped(heap)); - heap->paused_mutator_count--; - atomic_store_explicit(&heap->collecting, 0, memory_order_relaxed); - GC_ASSERT(!mutators_are_stopping(heap)); - pthread_cond_broadcast(&heap->mutator_cond); -} - -static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { - size_t previous = heap->large_object_pages; - heap->large_object_pages = npages; - GC_ASSERT(npages <= previous); - size_t bytes = (previous - npages) << - heap_large_object_space(heap)->page_size_log2; - copy_space_reacquire_memory(heap_copy_space(heap), bytes); -} - -void gc_mutator_set_roots(struct gc_mutator *mut, - struct gc_mutator_roots *roots) { - mut->roots = roots; -} -void gc_heap_set_roots(struct gc_heap *heap, struct gc_heap_roots *roots) { - heap->roots = roots; -} -void gc_heap_set_extern_space(struct gc_heap *heap, - struct gc_extern_space *space) { - heap->extern_space = space; -} - -static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap, - void *trace_data) GC_ALWAYS_INLINE; -static inline void -tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { - struct gc_trace_worker *worker = trace_data; - if (trace_edge(heap, edge, worker)) - gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); -} - -static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, - struct gc_trace_worker *worker) { -#ifdef DEBUG - if (copy_space_contains(heap_copy_space(heap), ref)) - GC_ASSERT(copy_space_object_region(ref) == heap_copy_space(heap)->active_region); -#endif - gc_trace_object(ref, tracer_visit, heap, worker, NULL); -} - -static inline void trace_root(struct gc_root root, struct gc_heap *heap, - struct gc_trace_worker *worker) { - switch (root.kind) { - case GC_ROOT_KIND_HEAP: - gc_trace_heap_roots(root.heap->roots, tracer_visit, heap, worker); - break; - case GC_ROOT_KIND_MUTATOR: - gc_trace_mutator_roots(root.mutator->roots, tracer_visit, heap, worker); - break; - case GC_ROOT_KIND_RESOLVED_EPHEMERONS: - gc_trace_resolved_ephemerons(root.resolved_ephemerons, tracer_visit, - heap, worker); - break; - case GC_ROOT_KIND_EDGE: - tracer_visit(root.edge, heap, worker); - break; - default: - GC_CRASH(); - } -} - -static void 
wait_for_mutators_to_stop(struct gc_heap *heap) { - heap->paused_mutator_count++; - while (!all_mutators_stopped(heap)) - pthread_cond_wait(&heap->collector_cond, &heap->lock); -} - -void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) { -} - -static void -pause_mutator_for_collection(struct gc_heap *heap, - struct gc_mutator *mut) GC_NEVER_INLINE; -static void -pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { - GC_ASSERT(mutators_are_stopping(heap)); - GC_ASSERT(!all_mutators_stopped(heap)); - MUTATOR_EVENT(mut, mutator_stopped); - heap->paused_mutator_count++; - if (all_mutators_stopped(heap)) - pthread_cond_signal(&heap->collector_cond); - - do { - pthread_cond_wait(&heap->mutator_cond, &heap->lock); - } while (mutators_are_stopping(heap)); - heap->paused_mutator_count--; - - MUTATOR_EVENT(mut, mutator_restarted); -} - -static void -pause_mutator_for_collection_with_lock(struct gc_mutator *mut) GC_NEVER_INLINE; -static void -pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { - struct gc_heap *heap = mutator_heap(mut); - GC_ASSERT(mutators_are_stopping(heap)); - MUTATOR_EVENT(mut, mutator_stopping); - pause_mutator_for_collection(heap, mut); -} - -static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) GC_NEVER_INLINE; -static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { - struct gc_heap *heap = mutator_heap(mut); - GC_ASSERT(mutators_are_stopping(heap)); - MUTATOR_EVENT(mut, mutator_stopping); - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); - heap_lock(heap); - pause_mutator_for_collection(heap, mut); - heap_unlock(heap); -} - -static inline void maybe_pause_mutator_for_collection(struct gc_mutator *mut) { - while (mutators_are_stopping(mutator_heap(mut))) - pause_mutator_for_collection_without_lock(mut); -} - -static int maybe_grow_heap(struct gc_heap *heap) { - return 0; -} - -static void visit_root_edge(struct gc_edge edge, struct gc_heap *heap, - void *unused) { - gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); -} - -static void add_roots(struct gc_heap *heap) { - for (struct gc_mutator *mut = heap->mutators; mut; mut = mut->next) - gc_tracer_add_root(&heap->tracer, gc_root_mutator(mut)); - gc_tracer_add_root(&heap->tracer, gc_root_heap(heap)); - gc_visit_finalizer_roots(heap->finalizer_state, visit_root_edge, heap, NULL); -} - -static void resolve_ephemerons_lazily(struct gc_heap *heap) { - heap->check_pending_ephemerons = 0; -} - -static void resolve_ephemerons_eagerly(struct gc_heap *heap) { - heap->check_pending_ephemerons = 1; - gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); -} - -static void trace_resolved_ephemerons(struct gc_heap *heap) { - for (struct gc_ephemeron *resolved = gc_pop_resolved_ephemerons(heap); - resolved; - resolved = gc_pop_resolved_ephemerons(heap)) { - gc_tracer_add_root(&heap->tracer, gc_root_resolved_ephemerons(resolved)); - gc_tracer_trace(&heap->tracer); - } -} - -static void resolve_finalizers(struct gc_heap *heap) { - for (size_t priority = 0; - priority < gc_finalizer_priority_count(); - priority++) { - if (gc_resolve_finalizers(heap->finalizer_state, priority, - visit_root_edge, heap, NULL)) { - gc_tracer_trace(&heap->tracer); - trace_resolved_ephemerons(heap); - } - } - gc_notify_finalizers(heap->finalizer_state, heap); -} - -static void sweep_ephemerons(struct gc_heap *heap) { - return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); -} - 
-static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; -static void collect(struct gc_mutator *mut) { - struct gc_heap *heap = mutator_heap(mut); - struct copy_space *copy_space = heap_copy_space(heap); - struct large_object_space *lospace = heap_large_object_space(heap); - struct gc_extern_space *exspace = heap_extern_space(heap); - MUTATOR_EVENT(mut, mutator_cause_gc); - DEBUG("start collect #%ld:\n", heap->count); - large_object_space_start_gc(lospace, 0); - gc_extern_space_start_gc(exspace, 0); - resolve_ephemerons_lazily(heap); - HEAP_EVENT(heap, requesting_stop); - request_mutators_to_stop(heap); - HEAP_EVENT(heap, waiting_for_stop); - wait_for_mutators_to_stop(heap); - HEAP_EVENT(heap, mutators_stopped); - HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); - copy_space_flip(copy_space); - gc_tracer_prepare(&heap->tracer); - add_roots(heap); - HEAP_EVENT(heap, roots_traced); - gc_tracer_trace(&heap->tracer); - HEAP_EVENT(heap, heap_traced); - resolve_ephemerons_eagerly(heap); - trace_resolved_ephemerons(heap); - HEAP_EVENT(heap, ephemerons_traced); - resolve_finalizers(heap); - HEAP_EVENT(heap, finalizers_traced); - sweep_ephemerons(heap); - gc_tracer_release(&heap->tracer); - copy_space_finish_gc(copy_space); - large_object_space_finish_gc(lospace, 0); - gc_extern_space_finish_gc(exspace, 0); - heap->count++; - heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); - size_t live_size = (copy_space->allocated_bytes_at_last_gc + - large_object_space_size_at_last_collection(lospace)); - HEAP_EVENT(heap, live_data_size, live_size); - maybe_grow_heap(heap); - if (!copy_space_page_out_blocks_until_memory_released(copy_space)) { - fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", - heap->size, copy_space->nslabs); - GC_CRASH(); - } - HEAP_EVENT(heap, restarting_mutators); - allow_mutators_to_continue(heap); -} - -static void trigger_collection(struct gc_mutator *mut) { - struct gc_heap *heap = mutator_heap(mut); - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); - heap_lock(heap); - long epoch = heap->count; - while (mutators_are_stopping(heap)) - pause_mutator_for_collection_with_lock(mut); - if (epoch == heap->count) - collect(mut); - heap_unlock(heap); -} - -void gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { - trigger_collection(mut); -} - -static void* allocate_large(struct gc_mutator *mut, size_t size) { - struct gc_heap *heap = mutator_heap(mut); - struct large_object_space *space = heap_large_object_space(heap); - - size_t npages = large_object_space_npages(space, size); - - copy_space_request_release_memory(heap_copy_space(heap), - npages << space->page_size_log2); - while (!copy_space_page_out_blocks_until_memory_released(heap_copy_space(heap))) - trigger_collection(mut); - atomic_fetch_add(&heap->large_object_pages, npages); - - void *ret = large_object_space_alloc(space, npages); - if (!ret) - ret = large_object_space_obtain_and_alloc(space, npages); - - if (!ret) { - perror("weird: we have the space but mmap didn't work"); - GC_CRASH(); - } - - return ret; -} - -static void get_more_empty_blocks_for_mutator(void *mut) { - trigger_collection(mut); -} - -void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { - GC_ASSERT(size > 0); // allocating 0 bytes would be silly - - if (size > gc_allocator_large_threshold()) - return allocate_large(mut, size); - - struct gc_ref ret = copy_space_allocate(&mut->allocator, - heap_copy_space(mutator_heap(mut)), - size, - 
get_more_empty_blocks_for_mutator, - mut); - gc_clear_fresh_allocation(ret, size); - return gc_ref_heap_object(ret); -} - -void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { - return gc_allocate(mut, size); -} - -struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { - return gc_allocate(mut, gc_ephemeron_size()); -} - -void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, - struct gc_ref key, struct gc_ref value) { - gc_ephemeron_init_internal(mutator_heap(mut), ephemeron, key, value); -} - -struct gc_pending_ephemerons *gc_heap_pending_ephemerons(struct gc_heap *heap) { - return heap->pending_ephemerons; -} - -unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { - return heap->count; -} - -struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { - return gc_allocate(mut, gc_finalizer_size()); -} - -void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, - unsigned priority, struct gc_ref object, - struct gc_ref closure) { - gc_finalizer_init_internal(finalizer, object, closure); - gc_finalizer_attach_internal(mutator_heap(mut)->finalizer_state, - finalizer, priority); - // No write barrier. -} - -struct gc_finalizer* gc_pop_finalizable(struct gc_mutator *mut) { - return gc_finalizer_state_pop(mutator_heap(mut)->finalizer_state); -} - -void gc_set_finalizer_callback(struct gc_heap *heap, - gc_finalizer_callback callback) { - gc_finalizer_state_set_callback(heap->finalizer_state, callback); -} - -static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { - struct gc_pending_ephemerons *cur = heap->pending_ephemerons; - size_t target = heap->size * heap->pending_ephemerons_size_factor; - double slop = heap->pending_ephemerons_size_slop; - - heap->pending_ephemerons = gc_prepare_pending_ephemerons(cur, target, slop); - - return !!heap->pending_ephemerons; -} - -struct gc_options { - struct gc_common_options common; -}; -int gc_option_from_string(const char *str) { - return gc_common_option_from_string(str); -} -struct gc_options* gc_allocate_options(void) { - struct gc_options *ret = malloc(sizeof(struct gc_options)); - gc_init_common_options(&ret->common); - return ret; -} -int gc_options_set_int(struct gc_options *options, int option, int value) { - return gc_common_options_set_int(&options->common, option, value); -} -int gc_options_set_size(struct gc_options *options, int option, - size_t value) { - return gc_common_options_set_size(&options->common, option, value); -} -int gc_options_set_double(struct gc_options *options, int option, - double value) { - return gc_common_options_set_double(&options->common, option, value); -} -int gc_options_parse_and_set(struct gc_options *options, int option, - const char *value) { - return gc_common_options_parse_and_set(&options->common, option, value); -} - -static int heap_init(struct gc_heap *heap, const struct gc_options *options) { - // *heap is already initialized to 0. 
- - pthread_mutex_init(&heap->lock, NULL); - pthread_cond_init(&heap->mutator_cond, NULL); - pthread_cond_init(&heap->collector_cond, NULL); - heap->size = options->common.heap_size; - - if (options->common.parallelism != 1) - fprintf(stderr, "warning: parallelism unimplemented in semispace copying collector\n"); - - if (!gc_tracer_init(&heap->tracer, heap, 1)) - GC_CRASH(); - - heap->pending_ephemerons_size_factor = 0.005; - heap->pending_ephemerons_size_slop = 0.5; - - if (!heap_prepare_pending_ephemerons(heap)) - GC_CRASH(); - - heap->finalizer_state = gc_make_finalizer_state(); - if (!heap->finalizer_state) - GC_CRASH(); - - return 1; -} - -int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, - struct gc_heap **heap, struct gc_mutator **mut, - struct gc_event_listener event_listener, - void *event_listener_data) { - GC_ASSERT_EQ(gc_allocator_small_granule_size(), GC_ALIGNMENT); - GC_ASSERT_EQ(gc_allocator_large_threshold(), GC_LARGE_OBJECT_THRESHOLD); - GC_ASSERT_EQ(0, offsetof(struct gc_mutator, allocator)); - GC_ASSERT_EQ(gc_allocator_allocation_pointer_offset(), - offsetof(struct copy_space_allocator, hp)); - GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), - offsetof(struct copy_space_allocator, limit)); - - if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { - fprintf(stderr, "fixed heap size is currently required\n"); - return 0; - } - - *heap = calloc(1, sizeof(struct gc_heap)); - if (!*heap) GC_CRASH(); - - if (!heap_init(*heap, options)) - GC_CRASH(); - - (*heap)->event_listener = event_listener; - (*heap)->event_listener_data = event_listener_data; - HEAP_EVENT(*heap, init, (*heap)->size); - - struct copy_space *space = heap_copy_space(*heap); - int atomic_forward = 0; - if (!copy_space_init(space, (*heap)->size, atomic_forward)) { - free(*heap); - *heap = NULL; - return 0; - } - - if (!large_object_space_init(heap_large_object_space(*heap), *heap)) - GC_CRASH(); - - *mut = calloc(1, sizeof(struct gc_mutator)); - if (!*mut) GC_CRASH(); - add_mutator(*heap, *mut); - return 1; -} - -struct gc_mutator* gc_init_for_thread(struct gc_stack_addr *stack_base, - struct gc_heap *heap) { - struct gc_mutator *ret = calloc(1, sizeof(struct gc_mutator)); - if (!ret) - GC_CRASH(); - add_mutator(heap, ret); - return ret; -} - -void gc_finish_for_thread(struct gc_mutator *mut) { - remove_mutator(mutator_heap(mut), mut); - free(mut); -} - -static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { - GC_ASSERT(mut->next == NULL); - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); - heap_lock(heap); - heap->inactive_mutator_count++; - if (all_mutators_stopped(heap)) - pthread_cond_signal(&heap->collector_cond); - heap_unlock(heap); -} - -static void reactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { - heap_lock(heap); - while (mutators_are_stopping(heap)) - pthread_cond_wait(&heap->mutator_cond, &heap->lock); - heap->inactive_mutator_count--; - heap_unlock(heap); -} - -void* gc_call_without_gc(struct gc_mutator *mut, - void* (*f)(void*), - void *data) { - struct gc_heap *heap = mutator_heap(mut); - deactivate_mutator(heap, mut); - void *ret = f(data); - reactivate_mutator(heap, mut); - return ret; -} From 4cdb47de6ac01d8212da7e165260a48e063b20f9 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 10 Sep 2024 11:28:15 +0200 Subject: [PATCH 286/403] There are four lights --- doc/collectors.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/collectors.md b/doc/collectors.md index 
ccb95ef76..6e21fcd77 100644 --- a/doc/collectors.md +++ b/doc/collectors.md @@ -1,6 +1,6 @@ # Whippet collectors -Whippet has five collectors currently: +Whippet has four collectors currently: - [Semi-space collector (`semi`)](./collector-semi.md): For single-threaded embedders who are not too tight on memory. - [Parallel copying collector (`pcc`)](./collector-pcc.md): Like `semi`, From 48085393f32454e3a157a883095eb5658d755516 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 10 Sep 2024 11:31:55 +0200 Subject: [PATCH 287/403] Update some doc links --- doc/collector-pcc.md | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/doc/collector-pcc.md b/doc/collector-pcc.md index 64af6c769..ef8085ed7 100644 --- a/doc/collector-pcc.md +++ b/doc/collector-pcc.md @@ -50,24 +50,21 @@ the trace worker grabs another, just like mutators do. Unlike the simple semi-space collector which uses a Cheney grey worklist, `pcc` uses an external worklist. If parallelism is disabled -at compile-time, it uses a [simple first-in, first-out queue of objects -to be traced](../src/simple-worklist.h) originally developed for -[Whippet's Immix-like collector](./collector-whippet.md). Like a Cheney -worklist, this should result in objects being copied in breadth-first -order. The literature would suggest that depth-first is generally -better for locality, but that preserving allocation order is generally -best. This is something to experiment with in the future. +at compile-time, it uses a simple first-in, first-out queue of objects +to be traced. Like a Cheney worklist, this should result in objects +being copied in breadth-first order. The literature would suggest that +depth-first is generally better for locality, but that preserving +allocation order is generally best. This is something to experiment +with in the future. -If parallelism is enabled, as it is by default, `pcc` uses the -[fine-grained work-stealing parallel tracer](../src/parallel-tracer.h) -originally developed for [Whippet's Immix-like -collector](./collector-whippet.md). Each trace worker maintains a -[local queue of objects that need tracing](../src/local-worklist.h), -which currently has 1024 entries. If the local queue becomes full, the -worker will publish 3/4 of those entries to the worker's [shared -worklist](../src/shared-worklist.h). When a worker runs out of local -work, it will first try to remove work from its own shared worklist, -then will try to steal from other workers. +If parallelism is enabled, as it is by default, `pcc` uses a +[fine-grained work-stealing parallel tracer](../src/parallel-tracer.h). +Each trace worker maintains a [local queue of objects that need +tracing](../src/local-worklist.h), which currently has 1024 entries. If +the local queue becomes full, the worker will publish 3/4 of those +entries to the worker's [shared worklist](../src/shared-worklist.h). +When a worker runs out of local work, it will first try to remove work +from its own shared worklist, then will try to steal from other workers. 
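For concreteness, here is a much-simplified sketch of that spill-and-steal discipline. It is not the code in `src/local-worklist.h` or `src/shared-worklist.h`: the structure and function names are invented, the shared list is fixed-size and mutex-protected purely for brevity rather than supporting fine-grained stealing, and worker termination is elided.

```c
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

#define LOCAL_CAPACITY 1024
#define SHARED_CAPACITY (1 << 16)   /* fixed size purely for brevity */

struct local_worklist { uintptr_t entries[LOCAL_CAPACITY]; size_t count; };
struct shared_worklist {
  pthread_mutex_t lock;
  uintptr_t entries[SHARED_CAPACITY];
  size_t count;
};

static void shared_push(struct shared_worklist *sh, uintptr_t obj) {
  pthread_mutex_lock(&sh->lock);
  sh->entries[sh->count++] = obj;
  pthread_mutex_unlock(&sh->lock);
}

static int shared_pop(struct shared_worklist *sh, uintptr_t *obj) {
  pthread_mutex_lock(&sh->lock);
  int ok = sh->count > 0;
  if (ok)
    *obj = sh->entries[--sh->count];
  pthread_mutex_unlock(&sh->lock);
  return ok;
}

/* Called when tracing discovers a new grey object. */
void worker_enqueue(struct local_worklist *local, struct shared_worklist *mine,
                    uintptr_t obj) {
  if (local->count == LOCAL_CAPACITY) {
    /* Local queue full: publish 3/4 of it so idle workers can help out. */
    for (size_t spill = LOCAL_CAPACITY * 3 / 4; spill; spill--)
      shared_push(mine, local->entries[--local->count]);
  }
  local->entries[local->count++] = obj;
}

/* Next object to trace: local queue first, then the worker's own shared
   worklist, then stealing from other workers.  Returning 0 is a
   simplification; a real tracer would coordinate termination with the other
   workers before concluding that tracing is done. */
int worker_dequeue(struct local_worklist *local, struct shared_worklist *mine,
                   struct shared_worklist **others, size_t n_others,
                   uintptr_t *obj) {
  if (local->count) {
    *obj = local->entries[--local->count];
    return 1;
  }
  if (shared_pop(mine, obj))
    return 1;
  for (size_t i = 0; i < n_others; i++)
    if (shared_pop(others[i], obj))
      return 1;
  return 0;
}
```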
If only one tracing thread is enabled at run-time (`parallelism=1`) (or if parallelism is disabled at compile-time), `pcc` will evacuate by From d19366bea2d85987885d2a3a12150d2a8cc47513 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 10 Sep 2024 11:35:28 +0200 Subject: [PATCH 288/403] Remove mention of concurrent marking for mmc --- doc/collector-mmc.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/collector-mmc.md b/doc/collector-mmc.md index 20af1d581..5f1ea936e 100644 --- a/doc/collector-mmc.md +++ b/doc/collector-mmc.md @@ -143,12 +143,6 @@ naturally cache-friendly and parallel. The mark byte array facilitates conservative collection by being an oracle for "does this address start an object". -There is some support for concurrent marking by having three mark bit -states (dead, survivor, marked) that rotate at each collection; some -collector configurations can have mutators mark before waiting for other -mutators to stop. True concurrent marking and associated barriers -are not yet implemented. - For a detailed introduction, see [Whippet: Towards a new local maximum](https://wingolog.org/archives/2023/02/07/whippet-towards-a-new-local-maximum), a talk given at FOSDEM 2023. From 2818958c59b2492887edc48d0a1de8ea8003dce9 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 13 Sep 2024 12:58:33 +0200 Subject: [PATCH 289/403] First version of adaptive heap sizing for pcc and mcc --- Makefile | 3 + api/gc-options.h | 2 +- doc/manual.md | 5 +- embed.mk | 3 + src/adaptive-heap-sizer.h | 173 ++++++++++++++++++++++++++++++ src/copy-space.h | 72 +++++++++++-- src/gc-internal.h | 4 + src/gc-options-internal.h | 2 +- src/gc-options.c | 4 +- src/gc-platform-gnu-linux.c | 11 ++ src/gc-platform.h | 1 + src/growable-heap-sizer.h | 58 ++++++++++ src/heap-sizer.h | 71 +++++++++++++ src/large-object-space.h | 8 ++ src/mmc.c | 130 ++++++++++++++--------- src/nofl-space.h | 206 +++++++++++++++++++++++++++++------- src/pcc.c | 59 +++++++++-- 17 files changed, 702 insertions(+), 110 deletions(-) create mode 100644 src/adaptive-heap-sizer.h create mode 100644 src/growable-heap-sizer.h create mode 100644 src/heap-sizer.h diff --git a/Makefile b/Makefile index 30d84ff71..3f7278bdc 100644 --- a/Makefile +++ b/Makefile @@ -62,13 +62,16 @@ GC_LIBS_bdw = `pkg-config --libs bdw-gc` GC_STEM_semi = semi GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 +GC_LIBS_semi = -lm GC_STEM_pcc = pcc GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 +GC_LIBS_pcc = -lm define mmc_variant GC_STEM_$(1) = mmc GC_CFLAGS_$(1) = $(2) +GC_LIBS_$(1) = -lm endef define generational_mmc_variants diff --git a/api/gc-options.h b/api/gc-options.h index 35cb7aacf..2f3f7f792 100644 --- a/api/gc-options.h +++ b/api/gc-options.h @@ -14,7 +14,7 @@ enum { GC_OPTION_HEAP_SIZE, GC_OPTION_MAXIMUM_HEAP_SIZE, GC_OPTION_HEAP_SIZE_MULTIPLIER, - GC_OPTION_HEAP_FRUGALITY, + GC_OPTION_HEAP_EXPANSIVENESS, GC_OPTION_PARALLELISM }; diff --git a/doc/manual.md b/doc/manual.md index eb648f54d..e88ac8198 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -460,8 +460,9 @@ defined for all collectors: * `GC_OPTION_HEAP_SIZE_MULTIPLIER`: For growable heaps, the target heap multiplier. A heap multiplier of 2.5 means that for 100 MB of live data, the heap should be 250 MB. - * `GC_OPTION_HEAP_FRUGALITY`: Something that will be used in adaptive - heaps, apparently! Not yet implemented. 
+ * `GC_OPTION_HEAP_EXPANSIVENESS`: For adaptive heap sizing, an + indication of how much free space will be given to heaps, as a + proportion of the square root of the live data size. * `GC_OPTION_PARALLELISM`: How many threads to devote to collection tasks during GC pauses. By default, the current number of processors, with a maximum of 8. diff --git a/embed.mk b/embed.mk index d0608b345..9ef4d8ab7 100644 --- a/embed.mk +++ b/embed.mk @@ -41,13 +41,16 @@ GC_LIBS_bdw = `pkg-config --libs bdw-gc` GC_STEM_semi = semi GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 +GC_LIBS_semi = -lm GC_STEM_pcc = pcc GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 +GC_LIBS_pcc = -lm define mmc_variant GC_STEM_$(1) = mmc GC_CFLAGS_$(1) = $(2) +GC_LIBS_$(1) = -lm endef define generational_mmc_variants diff --git a/src/adaptive-heap-sizer.h b/src/adaptive-heap-sizer.h new file mode 100644 index 000000000..796f07469 --- /dev/null +++ b/src/adaptive-heap-sizer.h @@ -0,0 +1,173 @@ +#ifndef ADAPTIVE_HEAP_SIZER_H +#define ADAPTIVE_HEAP_SIZER_H + +#include +#include +#include +#include + +#include "assert.h" +#include "debug.h" +#include "heap-sizer.h" +#include "gc-platform.h" + +// This is the MemBalancer algorithm from "Optimal Heap Limits for Reducing +// Browser Memory Use" by Marisa Kirisame, Pranav Shenoy, and Pavel Panchekha +// (https://arxiv.org/abs/2204.10455). +// +// This implementation differs slightly in that the constant "c" of the paper +// has been extracted outside the radical, and notionally reversed: it is a +// unitless "expansiveness" parameter whose domain is [0,+∞]. Also there are +// minimum and maximum heap size multipliers, and a minimum amount of free +// space. The initial collection rate is an informed guess. The initial +// allocation rate estimate is high, considering that allocation rates are often +// high on program startup. 
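Spelled out, the sizing rule the functions below implement is, for live bytes L, smoothed allocation rate g (bytes per nanosecond), smoothed collection rate s (pause nanoseconds per live byte), and expansiveness E:

    heap size = L * clamp(1 + E * sqrt(L * g / s), minimum_multiplier, maximum_multiplier)

subject to a floor of L + minimum_free_space.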
+ +struct gc_adaptive_heap_sizer { + uint64_t (*get_allocation_counter)(void *callback_data); + void (*set_heap_size)(size_t size, void *callback_data); + void *callback_data; + uint64_t smoothed_pause_time; + uint64_t smoothed_live_bytes; + uint64_t live_bytes; + double smoothed_allocation_rate; + double collection_smoothing_factor; + double allocation_smoothing_factor; + double minimum_multiplier; + double maximum_multiplier; + double minimum_free_space; + double expansiveness; + int stopping; + pthread_t thread; + pthread_mutex_t lock; + pthread_cond_t cond; +}; + +// With lock +static uint64_t +gc_adaptive_heap_sizer_calculate_size(struct gc_adaptive_heap_sizer *sizer) { + double allocation_rate = sizer->smoothed_allocation_rate; + double collection_rate = + (double)sizer->smoothed_pause_time / (double)sizer->smoothed_live_bytes; + double radicand = sizer->live_bytes * allocation_rate / collection_rate; + double multiplier = 1.0 + sizer->expansiveness * sqrt(radicand); + if (isnan(multiplier) || multiplier < sizer->minimum_multiplier) + multiplier = sizer->minimum_multiplier; + else if (multiplier > sizer->maximum_multiplier) + multiplier = sizer->maximum_multiplier; + uint64_t size = sizer->live_bytes * multiplier; + if (size - sizer->live_bytes < sizer->minimum_free_space) + size = sizer->live_bytes + sizer->minimum_free_space; + return size; +} + +static uint64_t +gc_adaptive_heap_sizer_set_expansiveness(struct gc_adaptive_heap_sizer *sizer, + double expansiveness) { + pthread_mutex_lock(&sizer->lock); + sizer->expansiveness = expansiveness; + uint64_t heap_size = gc_adaptive_heap_sizer_calculate_size(sizer); + pthread_mutex_unlock(&sizer->lock); + return heap_size; +} + +static void +gc_adaptive_heap_sizer_on_gc(struct gc_adaptive_heap_sizer *sizer, + size_t live_bytes, uint64_t pause_ns, + void (*set_heap_size)(size_t, void*), + void *data) { + pthread_mutex_lock(&sizer->lock); + sizer->live_bytes = live_bytes; + sizer->smoothed_live_bytes *= 1.0 - sizer->collection_smoothing_factor; + sizer->smoothed_live_bytes += sizer->collection_smoothing_factor * live_bytes; + sizer->smoothed_pause_time *= 1.0 - sizer->collection_smoothing_factor; + sizer->smoothed_pause_time += sizer->collection_smoothing_factor * pause_ns; + set_heap_size(gc_adaptive_heap_sizer_calculate_size(sizer), data); + pthread_mutex_unlock(&sizer->lock); +} + +static void* +gc_adaptive_heap_sizer_thread(void *data) { + struct gc_adaptive_heap_sizer *sizer = data; + uint64_t last_bytes_allocated = + sizer->get_allocation_counter(sizer->callback_data); + uint64_t last_heartbeat = gc_platform_monotonic_nanoseconds(); + pthread_mutex_lock(&sizer->lock); + while (!sizer->stopping) { + { + struct timespec ts; + if (clock_gettime(CLOCK_REALTIME, &ts)) { + perror("adaptive heap sizer thread: failed to get time!"); + break; + } + ts.tv_sec += 1; + pthread_cond_timedwait(&sizer->cond, &sizer->lock, &ts); + } + uint64_t bytes_allocated = + sizer->get_allocation_counter(sizer->callback_data); + uint64_t heartbeat = gc_platform_monotonic_nanoseconds(); + double rate = (double) (bytes_allocated - last_bytes_allocated) / + (double) (heartbeat - last_heartbeat); + // Just smooth the rate, under the assumption that the denominator is almost + // always 1. 
+ sizer->smoothed_allocation_rate *= 1.0 - sizer->allocation_smoothing_factor; + sizer->smoothed_allocation_rate += rate * sizer->allocation_smoothing_factor; + last_heartbeat = heartbeat; + last_bytes_allocated = bytes_allocated; + sizer->set_heap_size(gc_adaptive_heap_sizer_calculate_size(sizer), + sizer->callback_data); + } + pthread_mutex_unlock(&sizer->lock); + return NULL; +} + +static struct gc_adaptive_heap_sizer* +gc_make_adaptive_heap_sizer(double expansiveness, + uint64_t (*get_allocation_counter)(void *), + void (*set_heap_size)(size_t , void *), + void *callback_data) { + struct gc_adaptive_heap_sizer *sizer; + sizer = malloc(sizeof(*sizer)); + if (!sizer) + GC_CRASH(); + memset(sizer, 0, sizeof(*sizer)); + sizer->get_allocation_counter = get_allocation_counter; + sizer->set_heap_size = set_heap_size; + sizer->callback_data = callback_data; + // Baseline estimate of GC speed: 10 MB/ms, or 10 bytes/ns. However since we + // observe this speed by separately noisy measurements, we have to provide + // defaults for numerator and denominator; estimate 2ms for initial GC pauses + // for 20 MB of live data during program startup. + sizer->smoothed_pause_time = 2 * 1000 * 1000; + sizer->smoothed_live_bytes = 20 * 1024 * 1024; + // Baseline estimate of allocation rate during startup: 50 MB in 10ms, or 5 + // bytes/ns. + sizer->smoothed_allocation_rate = 5; + sizer->collection_smoothing_factor = 0.5; + sizer->allocation_smoothing_factor = 0.95; + sizer->minimum_multiplier = 1.1; + sizer->maximum_multiplier = 5; + sizer->minimum_free_space = 4 * 1024 * 1024; + sizer->expansiveness = expansiveness; + pthread_mutex_init(&sizer->lock, NULL); + pthread_cond_init(&sizer->cond, NULL); + if (pthread_create(&sizer->thread, NULL, gc_adaptive_heap_sizer_thread, + sizer)) { + perror("spawning adaptive heap size thread failed"); + GC_CRASH(); + } + return sizer; +} + +static void +gc_destroy_adaptive_heap_sizer(struct gc_adaptive_heap_sizer *sizer) { + pthread_mutex_lock(&sizer->lock); + GC_ASSERT(!sizer->stopping); + sizer->stopping = 1; + pthread_mutex_unlock(&sizer->lock); + pthread_cond_signal(&sizer->cond); + pthread_join(sizer->thread, NULL); + free(sizer); +} + +#endif // ADAPTIVE_HEAP_SIZER_H diff --git a/src/copy-space.h b/src/copy-space.h index ba26444bb..7d8ab98a2 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -117,7 +117,7 @@ struct copy_space { size_t allocated_bytes_at_last_gc; size_t fragmentation_at_last_gc; struct extents *extents; - struct copy_space_slab *slabs; + struct copy_space_slab **slabs; size_t nslabs; }; @@ -231,17 +231,25 @@ copy_space_page_out_blocks_until_memory_released(struct copy_space *space) { return 1; } -static void -copy_space_reacquire_memory(struct copy_space *space, size_t bytes) { +static ssize_t +copy_space_maybe_reacquire_memory(struct copy_space *space, size_t bytes) { ssize_t pending = atomic_fetch_sub(&space->bytes_to_page_out, bytes) - bytes; while (pending + COPY_SPACE_BLOCK_SIZE <= 0) { struct copy_space_block *block = copy_space_page_in_block(space); - GC_ASSERT(block); + if (!block) break; copy_space_push_empty_block(space, block); - pending = (atomic_fetch_add(&space->bytes_to_page_out, COPY_SPACE_BLOCK_SIZE) + pending = (atomic_fetch_add(&space->bytes_to_page_out, + COPY_SPACE_BLOCK_SIZE) + COPY_SPACE_BLOCK_SIZE); } + return pending; +} + +static void +copy_space_reacquire_memory(struct copy_space *space, size_t bytes) { + ssize_t pending = copy_space_maybe_reacquire_memory(space, bytes); + GC_ASSERT(pending + COPY_SPACE_BLOCK_SIZE > 
0); } static inline void @@ -395,6 +403,12 @@ copy_space_finish_gc(struct copy_space *space) { space->fragmentation_at_last_gc = space->fragmentation; } +static void +copy_space_add_to_allocation_counter(struct copy_space *space, + uintptr_t *counter) { + *counter += space->allocated_bytes - space->allocated_bytes_at_last_gc; +} + static void copy_space_gc_during_evacuation(void *data) { // If space is really tight and reordering of objects during @@ -578,6 +592,50 @@ copy_space_allocate_slabs(size_t nslabs) { return (struct copy_space_slab*) aligned_base; } +static void +copy_space_add_slabs(struct copy_space *space, struct copy_space_slab *slabs, + size_t nslabs) { + size_t old_size = space->nslabs * sizeof(struct copy_space_slab*); + size_t additional_size = nslabs * sizeof(struct copy_space_slab*); + space->extents = extents_adjoin(space->extents, slabs, + nslabs * sizeof(struct copy_space_slab)); + space->slabs = realloc(space->slabs, old_size + additional_size); + if (!space->slabs) + GC_CRASH(); + while (nslabs--) + space->slabs[space->nslabs++] = slabs++; +} + +static void +copy_space_shrink(struct copy_space *space, size_t bytes) { + ssize_t pending = copy_space_request_release_memory(space, bytes); + copy_space_page_out_blocks_until_memory_released(space); + + // It still may be the case we need to page out more blocks. Only collection + // can help us then! +} + +static void +copy_space_expand(struct copy_space *space, size_t bytes) { + ssize_t to_acquire = -copy_space_maybe_reacquire_memory(space, bytes); + if (to_acquire <= 0) return; + size_t reserved = align_up(to_acquire, COPY_SPACE_SLAB_SIZE); + size_t nslabs = reserved / COPY_SPACE_SLAB_SIZE; + struct copy_space_slab *slabs = copy_space_allocate_slabs(nslabs); + copy_space_add_slabs(space, slabs, nslabs); + + for (size_t slab = 0; slab < nslabs; slab++) { + for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { + struct copy_space_block *block = &slabs[slab].headers[idx]; + block->all_zeroes[0] = block->all_zeroes[1] = 1; + block->in_core = 0; + copy_space_push_paged_out_block(space, block); + reserved -= COPY_SPACE_BLOCK_SIZE; + } + } + copy_space_reacquire_memory(space, 0); +} + static int copy_space_init(struct copy_space *space, size_t size, int atomic) { size = align_up(size, COPY_SPACE_BLOCK_SIZE); @@ -599,9 +657,7 @@ copy_space_init(struct copy_space *space, size_t size, int atomic) { space->allocated_bytes_at_last_gc = 0; space->fragmentation_at_last_gc = 0; space->extents = extents_allocate(10); - extents_adjoin(space->extents, slabs, reserved); - space->slabs = slabs; - space->nslabs = nslabs; + copy_space_add_slabs(space, slabs, nslabs); for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { struct copy_space_block *block = &slabs[slab].headers[idx]; diff --git a/src/gc-internal.h b/src/gc-internal.h index 7cbb79f58..715b72a99 100644 --- a/src/gc-internal.h +++ b/src/gc-internal.h @@ -9,4 +9,8 @@ #include "gc-finalizer-internal.h" #include "gc-options-internal.h" +uint64_t gc_heap_total_bytes_allocated(struct gc_heap *heap); +void gc_mutator_adjust_heap_size(struct gc_mutator *mut, uint64_t new_size); + + #endif // GC_INTERNAL_H diff --git a/src/gc-options-internal.h b/src/gc-options-internal.h index 4190cb841..9e9fbca22 100644 --- a/src/gc-options-internal.h +++ b/src/gc-options-internal.h @@ -12,7 +12,7 @@ struct gc_common_options { size_t heap_size; size_t maximum_heap_size; double heap_size_multiplier; - double 
heap_frugality; + double heap_expansiveness; int parallelism; }; diff --git a/src/gc-options.c b/src/gc-options.c index c41b8fe51..31de02745 100644 --- a/src/gc-options.c +++ b/src/gc-options.c @@ -25,8 +25,8 @@ #define FOR_EACH_DOUBLE_GC_OPTION(M) \ M(HEAP_SIZE_MULTIPLIER, heap_size_multiplier, "heap-size-multiplier", \ double, double, 1.75, 1.0, 1e6) \ - M(HEAP_FRUGALITY, heap_frugality, "heap-frugality", \ - double, double, 1e-1, 1e-6, 1e6) + M(HEAP_EXPANSIVENESS, heap_expansiveness, "heap-expansiveness", \ + double, double, 1.0, 0.0, 50.0) typedef int gc_option_int; typedef size_t gc_option_size; diff --git a/src/gc-platform-gnu-linux.c b/src/gc-platform-gnu-linux.c index 82390d445..ebcfa5579 100644 --- a/src/gc-platform-gnu-linux.c +++ b/src/gc-platform-gnu-linux.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #define GC_IMPL 1 @@ -110,3 +111,13 @@ int gc_platform_processor_count(void) { return 1; return CPU_COUNT(&set); } + +uint64_t gc_platform_monotonic_nanoseconds(void) { + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts)) + GC_CRASH(); + uint64_t s = ts.tv_sec; + uint64_t ns = ts.tv_nsec; + uint64_t ns_per_sec = 1000000000; + return s * ns_per_sec + ns; +} diff --git a/src/gc-platform.h b/src/gc-platform.h index ea6a6aa18..42335ed7a 100644 --- a/src/gc-platform.h +++ b/src/gc-platform.h @@ -21,5 +21,6 @@ void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start, struct gc_heap *heap, void *data); GC_INTERNAL int gc_platform_processor_count(void); +GC_INTERNAL uint64_t gc_platform_monotonic_nanoseconds(void); #endif // GC_PLATFORM_H diff --git a/src/growable-heap-sizer.h b/src/growable-heap-sizer.h new file mode 100644 index 000000000..bf4200893 --- /dev/null +++ b/src/growable-heap-sizer.h @@ -0,0 +1,58 @@ +#ifndef GROWABLE_HEAP_SIZER_H +#define GROWABLE_HEAP_SIZER_H + +#include +#include +#include + +#include "assert.h" +#include "heap-sizer.h" + +// This is a simple heap-sizing algorithm that will grow the heap if it is +// smaller than a given multiplier of the live data size. It does not shrink +// the heap. 
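As a worked example of the rule described above, using the default heap-size-multiplier of 1.75: a collection that finds 100 MB of live data grows any heap currently smaller than 175 MB up to 175 MB, while a heap already at or above 175 MB is left as-is, even if the live data later shrinks.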
+ +struct gc_growable_heap_sizer { + double multiplier; + pthread_mutex_t lock; +}; + +static void +gc_growable_heap_sizer_set_multiplier(struct gc_growable_heap_sizer *sizer, + double multiplier) { + pthread_mutex_lock(&sizer->lock); + sizer->multiplier = multiplier; + pthread_mutex_unlock(&sizer->lock); +} + +static void +gc_growable_heap_sizer_on_gc(struct gc_growable_heap_sizer *sizer, + size_t heap_size, size_t live_bytes, + uint64_t pause_ns, + void (*set_heap_size)(size_t, void*), + void *data) { + pthread_mutex_lock(&sizer->lock); + size_t target_size = live_bytes * sizer->multiplier; + if (target_size > heap_size) + set_heap_size(target_size, data); + pthread_mutex_unlock(&sizer->lock); +} + +static struct gc_growable_heap_sizer* +gc_make_growable_heap_sizer(double multiplier) { + struct gc_growable_heap_sizer *sizer; + sizer = malloc(sizeof(*sizer)); + if (!sizer) + GC_CRASH(); + memset(sizer, 0, sizeof(*sizer)); + sizer->multiplier = multiplier; + pthread_mutex_init(&sizer->lock, NULL); + return sizer; +} + +static void +gc_destroy_growable_heap_sizer(struct gc_growable_heap_sizer *sizer) { + free(sizer); +} + +#endif // GROWABLE_HEAP_SIZER_H diff --git a/src/heap-sizer.h b/src/heap-sizer.h new file mode 100644 index 000000000..dc6f3d2ef --- /dev/null +++ b/src/heap-sizer.h @@ -0,0 +1,71 @@ +#ifndef HEAP_SIZER_H +#define HEAP_SIZER_H + +#include "gc-api.h" + +#include "gc-options-internal.h" +#include "growable-heap-sizer.h" +#include "adaptive-heap-sizer.h" + +struct gc_heap_sizer { + enum gc_heap_size_policy policy; + union { + struct gc_growable_heap_sizer* growable; + struct gc_adaptive_heap_sizer* adaptive; + }; +}; + +static struct gc_heap_sizer +gc_make_heap_sizer(struct gc_heap *heap, + const struct gc_common_options *options, + uint64_t (*get_allocation_counter_from_thread)(void*), + void (*set_heap_size_from_thread)(size_t, void*), + void *data) { + struct gc_heap_sizer ret = { options->heap_size_policy, }; + switch (options->heap_size_policy) { + case GC_HEAP_SIZE_FIXED: + break; + + case GC_HEAP_SIZE_GROWABLE: + ret.growable = gc_make_growable_heap_sizer(options->heap_size_multiplier); + break; + + case GC_HEAP_SIZE_ADAPTIVE: + ret.adaptive = + gc_make_adaptive_heap_sizer (options->heap_expansiveness, + get_allocation_counter_from_thread, + set_heap_size_from_thread, + heap); + break; + + default: + GC_CRASH(); + } + return ret; +} + +static void +gc_heap_sizer_on_gc(struct gc_heap_sizer sizer, size_t heap_size, + size_t live_bytes, size_t pause_ns, + void (*set_heap_size)(size_t, void*), void *data) { + switch (sizer.policy) { + case GC_HEAP_SIZE_FIXED: + break; + + case GC_HEAP_SIZE_GROWABLE: + gc_growable_heap_sizer_on_gc(sizer.growable, heap_size, live_bytes, + pause_ns, set_heap_size, data); + break; + + case GC_HEAP_SIZE_ADAPTIVE: + gc_adaptive_heap_sizer_on_gc(sizer.adaptive, live_bytes, pause_ns, + set_heap_size, data); + break; + + default: + GC_CRASH(); + } +} + + +#endif // HEAP_SIZER_H diff --git a/src/large-object-space.h b/src/large-object-space.h index 3d07cf8ad..d81369f93 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -218,6 +218,14 @@ static void large_object_space_finish_gc(struct large_object_space *space, pthread_mutex_unlock(&space->lock); } +static void +large_object_space_add_to_allocation_counter(struct large_object_space *space, + uint64_t *counter) { + size_t pages = space->total_pages - space->free_pages; + pages -= space->live_pages_at_last_collection; + *counter += pages << space->page_size_log2; +} + static 
inline struct gc_ref large_object_space_mark_conservative_ref(struct large_object_space *space, struct gc_conservative_ref ref, diff --git a/src/mmc.c b/src/mmc.c index 7975a643c..34a96e409 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -15,6 +15,7 @@ #include "gc-platform.h" #include "gc-stack.h" #include "gc-trace.h" +#include "heap-sizer.h" #include "large-object-space.h" #include "nofl-space.h" #if GC_PARALLEL @@ -36,6 +37,8 @@ struct gc_heap { pthread_cond_t collector_cond; pthread_cond_t mutator_cond; size_t size; + size_t total_allocated_bytes_at_last_gc; + size_t size_at_last_gc; int collecting; int check_pending_ephemerons; struct gc_pending_ephemerons *pending_ephemerons; @@ -55,6 +58,7 @@ struct gc_heap { double minimum_major_gc_yield_threshold; double pending_ephemerons_size_factor; double pending_ephemerons_size_slop; + struct gc_heap_sizer sizer; struct gc_event_listener event_listener; void *event_listener_data; }; @@ -450,27 +454,32 @@ maybe_pause_mutator_for_collection(struct gc_mutator *mut) { pause_mutator_for_collection_without_lock(mut); } -static int maybe_grow_heap(struct gc_heap *heap) { - return 0; +static void +resize_heap(size_t new_size, void *data) { + struct gc_heap *heap = data; + if (new_size == heap->size) + return; + DEBUG("------ resizing heap\n"); + DEBUG("------ old heap size: %zu bytes\n", heap->size); + DEBUG("------ new heap size: %zu bytes\n", new_size); + if (new_size < heap->size) + nofl_space_shrink(heap_nofl_space(heap), heap->size - new_size); + else + nofl_space_expand(heap_nofl_space(heap), new_size - heap->size); + + heap->size = new_size; + HEAP_EVENT(heap, heap_resized, new_size); } static double heap_last_gc_yield(struct gc_heap *heap) { - struct nofl_space *nofl_space = heap_nofl_space(heap); - size_t nofl_yield = nofl_space_yield(nofl_space); - size_t evacuation_reserve = nofl_space_evacuation_reserve_bytes(nofl_space); - // FIXME: Size nofl evacuation reserve based on size of nofl space, - // not heap size. 
- size_t minimum_evacuation_reserve = - heap->size * nofl_space->evacuation_minimum_reserve; - if (evacuation_reserve > minimum_evacuation_reserve) - nofl_yield += evacuation_reserve - minimum_evacuation_reserve; - struct large_object_space *lospace = heap_large_object_space(heap); - size_t lospace_yield = lospace->pages_freed_by_last_collection; - lospace_yield <<= lospace->page_size_log2; + size_t live_size = + nofl_space_live_size_at_last_collection(heap_nofl_space(heap)) + + large_object_space_size_at_last_collection(heap_large_object_space(heap)); - double yield = nofl_yield + lospace_yield; - return yield / heap->size; + if (live_size > heap->size_at_last_gc) + return 0; + return 1.0 - ((double) live_size) / heap->size_at_last_gc; } static double @@ -480,31 +489,29 @@ heap_fragmentation(struct gc_heap *heap) { return ((double)fragmentation) / heap->size; } +static size_t +heap_estimate_live_data_after_gc(struct gc_heap *heap, + size_t last_live_bytes, + double last_yield) { + size_t bytes = + nofl_space_estimate_live_bytes_after_gc(heap_nofl_space(heap), + last_yield) + + large_object_space_size_at_last_collection(heap_large_object_space(heap)); + if (bytes < last_live_bytes) + return last_live_bytes; + return bytes; +} + static void -detect_out_of_memory(struct gc_heap *heap) { - struct nofl_space *nofl_space = heap_nofl_space(heap); - struct large_object_space *lospace = heap_large_object_space(heap); - - if (heap->count == 0) +detect_out_of_memory(struct gc_heap *heap, uintptr_t allocation_since_last_gc) { + if (heap->sizer.policy != GC_HEAP_SIZE_FIXED) return; - double last_yield = heap_last_gc_yield(heap); - double fragmentation = heap_fragmentation(heap); - - double yield_epsilon = NOFL_BLOCK_SIZE * 1.0 / heap->size; - double fragmentation_epsilon = LARGE_OBJECT_THRESHOLD * 1.0 / NOFL_BLOCK_SIZE; - - if (last_yield - fragmentation > yield_epsilon) + if (allocation_since_last_gc > nofl_space_fragmentation(heap_nofl_space(heap))) return; - if (fragmentation > fragmentation_epsilon - && atomic_load(&nofl_space->evacuation_targets.count)) - return; - - // No yield in last gc and we do not expect defragmentation to - // be able to yield more space: out of memory. - fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", - heap->size, nofl_space->nslabs); + // No allocation since last gc: out of memory. 
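+  // (More precisely: the mutator allocated no more since the last collection
+  // than the space lost to fragmentation, so with a fixed-size heap a further
+  // collection cannot be expected to free up usable space.)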
+ fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); GC_CRASH(); } @@ -731,10 +738,7 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { struct nofl_space *nofl_space = heap_nofl_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); struct gc_extern_space *exspace = heap_extern_space(heap); - if (maybe_grow_heap(heap)) { - DEBUG("grew heap instead of collecting #%ld:\n", heap->count); - return; - } + uint64_t start_ns = gc_platform_monotonic_nanoseconds(); MUTATOR_EVENT(mut, mutator_cause_gc); DEBUG("start collect #%ld:\n", heap->count); HEAP_EVENT(heap, requesting_stop); @@ -747,6 +751,11 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { determine_collection_kind(heap, requested_kind); int is_minor = gc_kind == GC_COLLECTION_MINOR; HEAP_EVENT(heap, prepare_gc, gc_kind); + uint64_t allocation_counter = 0; + nofl_space_add_to_allocation_counter(nofl_space, &allocation_counter); + large_object_space_add_to_allocation_counter(lospace, &allocation_counter); + heap->total_allocated_bytes_at_last_gc += allocation_counter; + detect_out_of_memory(heap, allocation_counter); nofl_space_prepare_gc(nofl_space, gc_kind); large_object_space_start_gc(lospace, is_minor); gc_extern_space_start_gc(exspace, is_minor); @@ -754,9 +763,9 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { gc_tracer_prepare(&heap->tracer); double yield = heap_last_gc_yield(heap); double fragmentation = heap_fragmentation(heap); - HEAP_EVENT(heap, live_data_size, heap->size * (1 - yield)); + size_t live_bytes = heap->size * (1.0 - yield); + HEAP_EVENT(heap, live_data_size, live_bytes); DEBUG("last gc yield: %f; fragmentation: %f\n", yield, fragmentation); - detect_out_of_memory(heap); enqueue_pinned_roots(heap); // Eagerly trace pinned roots if we are going to relocate objects. 
if (gc_kind == GC_COLLECTION_COMPACTING) @@ -780,6 +789,13 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { gc_extern_space_finish_gc(exspace, is_minor); heap->count++; heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); + uint64_t pause_ns = gc_platform_monotonic_nanoseconds() - start_ns; + size_t live_bytes_estimate = + heap_estimate_live_data_after_gc(heap, live_bytes, yield); + DEBUG("--- total live bytes estimate: %zu\n", live_bytes_estimate); + gc_heap_sizer_on_gc(heap->sizer, heap->size, live_bytes_estimate, pause_ns, + resize_heap, heap); + heap->size_at_last_gc = heap->size; HEAP_EVENT(heap, restarting_mutators); allow_mutators_to_continue(heap); } @@ -971,7 +987,7 @@ heap_init(struct gc_heap *heap, const struct gc_options *options) { pthread_mutex_init(&heap->lock, NULL); pthread_cond_init(&heap->mutator_cond, NULL); pthread_cond_init(&heap->collector_cond, NULL); - heap->size = options->common.heap_size; + heap->size = heap->size_at_last_gc = options->common.heap_size; if (!gc_tracer_init(&heap->tracer, heap, options->common.parallelism)) GC_CRASH(); @@ -995,6 +1011,24 @@ heap_init(struct gc_heap *heap, const struct gc_options *options) { return 1; } +static uint64_t allocation_counter_from_thread(void *data) { + struct gc_heap *heap = data; + uint64_t ret = heap->total_allocated_bytes_at_last_gc; + if (pthread_mutex_trylock(&heap->lock)) return ret; + nofl_space_add_to_allocation_counter(heap_nofl_space(heap), &ret); + large_object_space_add_to_allocation_counter(heap_large_object_space(heap), + &ret); + pthread_mutex_unlock(&heap->lock); + return ret; +} + +static void set_heap_size_from_thread(size_t size, void *data) { + struct gc_heap *heap = data; + if (pthread_mutex_trylock(&heap->lock)) return; + resize_heap(size, heap); + pthread_mutex_unlock(&heap->lock); +} + int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut, @@ -1015,11 +1049,6 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, NOFL_BLOCK_SIZE / NOFL_REMSET_BYTES_PER_BLOCK); } - if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { - fprintf(stderr, "fixed heap size is currently required\n"); - return 0; - } - *heap = calloc(1, sizeof(struct gc_heap)); if (!*heap) GC_CRASH(); @@ -1042,6 +1071,11 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!large_object_space_init(heap_large_object_space(*heap), *heap)) GC_CRASH(); + (*heap)->sizer = gc_make_heap_sizer(*heap, &options->common, + allocation_counter_from_thread, + set_heap_size_from_thread, + (*heap)); + *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); gc_stack_init(&(*mut)->stack, stack_base); diff --git a/src/nofl-space.h b/src/nofl-space.h index 5a217cdb0..5c46bb7ff 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -75,6 +75,7 @@ enum nofl_block_summary_flag { NOFL_BLOCK_EVACUATE = 0x1, NOFL_BLOCK_ZERO = 0x2, NOFL_BLOCK_UNAVAILABLE = 0x4, + NOFL_BLOCK_PAGED_OUT = 0x8, NOFL_BLOCK_FLAG_UNUSED_3 = 0x8, NOFL_BLOCK_FLAG_UNUSED_4 = 0x10, NOFL_BLOCK_FLAG_UNUSED_5 = 0x20, @@ -95,7 +96,7 @@ struct nofl_block_summary { // Counters related to previous collection: how many holes there // were, and how much space they had. uint16_t hole_count; - uint16_t free_granules; + uint16_t hole_granules; // Counters related to allocation since previous collection: // wasted space due to fragmentation. 
Also used by blocks on the // "partly full" list, which have zero holes_with_fragmentation @@ -158,7 +159,9 @@ struct nofl_space { ssize_t pending_unavailable_bytes; // atomically struct nofl_slab **slabs; size_t nslabs; - uintptr_t granules_freed_by_last_collection; // atomically + uintptr_t old_generation_granules; // atomically + uintptr_t survivor_granules_at_last_collection; // atomically + uintptr_t allocated_granules_since_last_collection; // atomically uintptr_t fragmentation_granules_since_last_collection; // atomically }; @@ -271,7 +274,7 @@ nofl_block_summary_for_addr(uintptr_t addr) { static uintptr_t nofl_block_summary_has_flag(struct nofl_block_summary *summary, enum nofl_block_summary_flag flag) { - return summary->next_and_flags & flag; + return (summary->next_and_flags & flag) == flag; } static void @@ -404,12 +407,23 @@ nofl_block_count(struct nofl_block_list *list) { return atomic_load_explicit(&list->count, memory_order_acquire); } +static void +nofl_push_paged_out_block(struct nofl_space *space, + struct nofl_block_ref block) { + GC_ASSERT(nofl_block_has_flag(block, + NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT)); + nofl_block_set_flag(block, NOFL_BLOCK_UNAVAILABLE); + nofl_push_block(&space->unavailable, block); +} + static void nofl_push_unavailable_block(struct nofl_space *space, struct nofl_block_ref block) { - nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_UNAVAILABLE); - madvise((void*)block.addr, NOFL_BLOCK_SIZE, MADV_DONTNEED); - nofl_push_block(&space->unavailable, block); + if (!nofl_block_has_flag(block, NOFL_BLOCK_PAGED_OUT)) { + nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); + madvise((void*)block.addr, NOFL_BLOCK_SIZE, MADV_DONTNEED); + } + nofl_push_paged_out_block(space, block); } static struct nofl_block_ref @@ -483,7 +497,7 @@ nofl_should_promote_block(struct nofl_space *space, // until after the next full GC. if (!GC_GENERATIONAL) return 0; size_t threshold = NOFL_GRANULES_PER_BLOCK * space->promotion_threshold; - return block.summary->free_granules < threshold; + return block.summary->hole_granules < threshold; } static void @@ -492,8 +506,10 @@ nofl_allocator_release_full_block(struct nofl_allocator *alloc, GC_ASSERT(nofl_allocator_has_block(alloc)); struct nofl_block_ref block = alloc->block; GC_ASSERT(alloc->alloc == alloc->sweep); - atomic_fetch_add(&space->granules_freed_by_last_collection, - block.summary->free_granules); + atomic_fetch_add(&space->allocated_granules_since_last_collection, + block.summary->hole_granules); + atomic_fetch_add(&space->survivor_granules_at_last_collection, + NOFL_GRANULES_PER_BLOCK - block.summary->hole_granules); atomic_fetch_add(&space->fragmentation_granules_since_last_collection, block.summary->fragmentation_granules); @@ -515,15 +531,13 @@ nofl_allocator_release_full_evacuation_target(struct nofl_allocator *alloc, size_t hole_size = alloc->sweep - alloc->alloc; // FIXME: Check how this affects statistics. 
GC_ASSERT_EQ(block.summary->hole_count, 1); - GC_ASSERT_EQ(block.summary->free_granules, NOFL_GRANULES_PER_BLOCK); - atomic_fetch_add(&space->granules_freed_by_last_collection, + GC_ASSERT_EQ(block.summary->hole_granules, NOFL_GRANULES_PER_BLOCK); + atomic_fetch_add(&space->old_generation_granules, NOFL_GRANULES_PER_BLOCK); if (hole_size) { hole_size >>= NOFL_GRANULE_SIZE_LOG_2; block.summary->holes_with_fragmentation = 1; block.summary->fragmentation_granules = hole_size / NOFL_GRANULE_SIZE; - atomic_fetch_add(&space->fragmentation_granules_since_last_collection, - block.summary->fragmentation_granules); } else { GC_ASSERT_EQ(block.summary->fragmentation_granules, 0); GC_ASSERT_EQ(block.summary->holes_with_fragmentation, 0); @@ -570,14 +584,14 @@ nofl_allocator_acquire_empty_block(struct nofl_allocator *alloc, if (nofl_block_is_null(block)) return 0; block.summary->hole_count = 1; - block.summary->free_granules = NOFL_GRANULES_PER_BLOCK; + block.summary->hole_granules = NOFL_GRANULES_PER_BLOCK; block.summary->holes_with_fragmentation = 0; block.summary->fragmentation_granules = 0; alloc->block = block; alloc->alloc = block.addr; alloc->sweep = block.addr + NOFL_BLOCK_SIZE; if (nofl_block_has_flag(block, NOFL_BLOCK_ZERO)) - nofl_block_clear_flag(block, NOFL_BLOCK_ZERO); + nofl_block_clear_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); else nofl_clear_memory(block.addr, NOFL_BLOCK_SIZE); return NOFL_GRANULES_PER_BLOCK; @@ -637,22 +651,22 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, return 0; } - size_t free_granules = scan_for_byte(metadata, limit_granules, sweep_mask); - size_t free_bytes = free_granules * NOFL_GRANULE_SIZE; - GC_ASSERT(free_granules); - GC_ASSERT(free_granules <= limit_granules); + size_t hole_granules = scan_for_byte(metadata, limit_granules, sweep_mask); + size_t free_bytes = hole_granules * NOFL_GRANULE_SIZE; + GC_ASSERT(hole_granules); + GC_ASSERT(hole_granules <= limit_granules); - memset(metadata, 0, free_granules); + memset(metadata, 0, hole_granules); memset((char*)sweep, 0, free_bytes); alloc->block.summary->hole_count++; - GC_ASSERT(free_granules <= - NOFL_GRANULES_PER_BLOCK - alloc->block.summary->free_granules); - alloc->block.summary->free_granules += free_granules; + GC_ASSERT(hole_granules <= + NOFL_GRANULES_PER_BLOCK - alloc->block.summary->hole_granules); + alloc->block.summary->hole_granules += hole_granules; alloc->alloc = sweep; alloc->sweep = sweep + free_bytes; - return free_granules; + return hole_granules; } static void @@ -724,7 +738,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, // at the last collection. As we allocate we'll record how // many granules were wasted because of fragmentation. 
alloc->block.summary->hole_count = 0; - alloc->block.summary->free_granules = 0; + alloc->block.summary->hole_granules = 0; alloc->block.summary->holes_with_fragmentation = 0; alloc->block.summary->fragmentation_granules = 0; size_t granules = @@ -875,13 +889,53 @@ nofl_space_clear_remembered_set(struct nofl_space *space) { static void nofl_space_reset_statistics(struct nofl_space *space) { - space->granules_freed_by_last_collection = 0; + space->survivor_granules_at_last_collection = 0; + space->allocated_granules_since_last_collection = 0; space->fragmentation_granules_since_last_collection = 0; } static size_t -nofl_space_yield(struct nofl_space *space) { - return space->granules_freed_by_last_collection * NOFL_GRANULE_SIZE; +nofl_space_live_size_at_last_collection(struct nofl_space *space) { + size_t granules = space->old_generation_granules + + space->survivor_granules_at_last_collection; + return granules * NOFL_GRANULE_SIZE; +} + +static void +nofl_space_add_to_allocation_counter(struct nofl_space *space, + uint64_t *counter) { + *counter += + atomic_load_explicit(&space->allocated_granules_since_last_collection, + memory_order_relaxed) * NOFL_GRANULE_SIZE; +} + +static size_t +nofl_space_estimate_live_bytes_after_gc(struct nofl_space *space, + double last_yield) +{ + // The nofl space mostly traces via marking, and as such doesn't precisely + // know the live data size until after sweeping. But it is important to + // promptly compute the live size so that we can grow the heap if + // appropriate. Therefore sometimes we will estimate the live data size + // instead of measuring it precisely. + size_t bytes = 0; + bytes += nofl_block_count(&space->full) * NOFL_BLOCK_SIZE; + bytes += nofl_block_count(&space->partly_full) * NOFL_BLOCK_SIZE / 2; + GC_ASSERT_EQ(nofl_block_count(&space->promoted), 0); + bytes += space->old_generation_granules * NOFL_GRANULE_SIZE; + bytes += + nofl_block_count(&space->to_sweep) * NOFL_BLOCK_SIZE * (1 - last_yield); + + DEBUG("--- nofl estimate before adjustment: %zu\n", bytes); +/* + // Assume that if we have pending unavailable bytes after GC that there is a + // large object waiting to be allocated, and that probably it survives this GC + // cycle. 
+ bytes += atomic_load_explicit(&space->pending_unavailable_bytes, + memory_order_acquire); + DEBUG("--- nofl estimate after adjustment: %zu\n", bytes); +*/ + return bytes; } static size_t @@ -891,8 +945,12 @@ nofl_space_evacuation_reserve_bytes(struct nofl_space *space) { static size_t nofl_space_fragmentation(struct nofl_space *space) { - size_t granules = space->fragmentation_granules_since_last_collection; - return granules * NOFL_GRANULE_SIZE; + size_t young = space->fragmentation_granules_since_last_collection; + GC_ASSERT(nofl_block_count(&space->old) * NOFL_GRANULES_PER_BLOCK >= + space->old_generation_granules); + size_t old = nofl_block_count(&space->old) * NOFL_GRANULES_PER_BLOCK - + space->old_generation_granules; + return (young + old) * NOFL_GRANULE_SIZE; } static void @@ -933,7 +991,7 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { for (struct nofl_block_ref b = nofl_block_for_addr(space->to_sweep.blocks); !nofl_block_is_null(b); b = nofl_block_next(b)) { - size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - b.summary->free_granules; + size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - b.summary->hole_granules; size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; histogram[bucket]++; } @@ -956,7 +1014,7 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { for (struct nofl_block_ref b = nofl_block_for_addr(space->to_sweep.blocks); !nofl_block_is_null(b); b = nofl_block_next(b)) { - size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - b.summary->free_granules; + size_t survivor_granules = NOFL_GRANULES_PER_BLOCK - b.summary->hole_granules; size_t bucket = (survivor_granules + bucket_size - 1) / bucket_size; if (histogram[bucket]) { nofl_block_set_flag(b, NOFL_BLOCK_EVACUATE); @@ -1012,6 +1070,7 @@ nofl_space_start_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { nofl_push_block(&space->to_sweep, block); while (!nofl_block_is_null(block = nofl_pop_block(&space->old))) nofl_push_block(&space->to_sweep, block); + space->old_generation_granules = 0; } if (gc_kind == GC_COLLECTION_COMPACTING) @@ -1042,6 +1101,8 @@ nofl_space_promote_blocks(struct nofl_space *space) { while (!nofl_block_is_null(block = nofl_pop_block(&space->promoted))) { struct nofl_allocator alloc = { block.addr, block.addr, block }; nofl_allocator_finish_sweeping_in_block(&alloc, space->sweep_mask); + atomic_fetch_add(&space->old_generation_granules, + NOFL_GRANULES_PER_BLOCK - block.summary->hole_granules); nofl_push_block(&space->old, block); } } @@ -1210,18 +1271,25 @@ nofl_space_request_release_memory(struct nofl_space *space, size_t bytes) { return atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes; } -static void -nofl_space_reacquire_memory(struct nofl_space *space, size_t bytes) { +static ssize_t +nofl_space_maybe_reacquire_memory(struct nofl_space *space, size_t bytes) { ssize_t pending = atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; while (pending + NOFL_BLOCK_SIZE <= 0) { struct nofl_block_ref block = nofl_pop_unavailable_block(space); - GC_ASSERT(!nofl_block_is_null(block)); + if (nofl_block_is_null(block)) break; if (!nofl_push_evacuation_target_if_needed(space, block)) nofl_push_empty_block(space, block); pending = atomic_fetch_add(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE) + NOFL_BLOCK_SIZE; } + return pending; +} + +static void +nofl_space_reacquire_memory(struct nofl_space *space, size_t bytes) { + ssize_t pending = nofl_space_maybe_reacquire_memory(space, bytes); + GC_ASSERT(pending + 
NOFL_BLOCK_SIZE > 0); } static int @@ -1538,6 +1606,66 @@ nofl_space_add_slabs(struct nofl_space *space, struct nofl_slab *slabs, space->slabs[space->nslabs++] = slabs++; } +static void +nofl_space_shrink(struct nofl_space *space, size_t bytes) { + ssize_t pending = nofl_space_request_release_memory(space, bytes); + // First try to shrink by unmapping previously-identified empty blocks. + while (pending > 0) { + struct nofl_block_ref block = nofl_pop_empty_block(space); + if (nofl_block_is_null(block)) + break; + nofl_push_unavailable_block(space, block); + pending = atomic_fetch_sub(&space->pending_unavailable_bytes, + NOFL_BLOCK_SIZE); + pending -= NOFL_BLOCK_SIZE; + } + + // If we still need to shrink, steal from the evacuation reserve, if it's more + // than the minimum. Not racy: evacuation target lists are built during eager + // lazy sweep, which is mutually exclusive with consumption, itself either + // during trace, synchronously from gc_heap_sizer_on_gc, or async but subject + // to the heap lock. + if (pending > 0) { + size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; + size_t unavailable = nofl_block_count(&space->unavailable); + size_t target = space->evacuation_minimum_reserve * (total - unavailable); + ssize_t avail = nofl_block_count(&space->evacuation_targets); + while (avail > target && pending > 0) { + struct nofl_block_ref block = nofl_pop_block(&space->evacuation_targets); + GC_ASSERT(!nofl_block_is_null(block)); + nofl_push_unavailable_block(space, block); + pending = atomic_fetch_sub(&space->pending_unavailable_bytes, + NOFL_BLOCK_SIZE); + pending -= NOFL_BLOCK_SIZE; + } + } + + // It still may be the case we need to page out more blocks. Only evacuation + // can help us then! +} + +static void +nofl_space_expand(struct nofl_space *space, size_t bytes) { + double overhead = ((double)NOFL_META_BLOCKS_PER_SLAB) / NOFL_BLOCKS_PER_SLAB; + ssize_t to_acquire = -nofl_space_maybe_reacquire_memory(space, bytes); + if (to_acquire <= 0) return; + to_acquire *= (1 + overhead); + size_t reserved = align_up(to_acquire, NOFL_SLAB_SIZE); + size_t nslabs = reserved / NOFL_SLAB_SIZE; + struct nofl_slab *slabs = nofl_allocate_slabs(nslabs); + nofl_space_add_slabs(space, slabs, nslabs); + + for (size_t slab = 0; slab < nslabs; slab++) { + for (size_t idx = 0; idx < NOFL_NONMETA_BLOCKS_PER_SLAB; idx++) { + uintptr_t addr = (uintptr_t)slabs[slab].blocks[idx].data; + struct nofl_block_ref block = nofl_block_for_addr(addr); + nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); + nofl_push_paged_out_block(space, block); + } + } + nofl_space_reacquire_memory(space, 0); +} + static int nofl_space_init(struct nofl_space *space, size_t size, int atomic, double promotion_threshold) { @@ -1556,12 +1684,12 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, space->evacuation_reserve = space->evacuation_minimum_reserve; space->promotion_threshold = promotion_threshold; for (size_t slab = 0; slab < nslabs; slab++) { - for (size_t block = 0; block < NOFL_NONMETA_BLOCKS_PER_SLAB; block++) { - uintptr_t addr = (uintptr_t)slabs[slab].blocks[block].data; + for (size_t idx = 0; idx < NOFL_NONMETA_BLOCKS_PER_SLAB; idx++) { + uintptr_t addr = (uintptr_t)slabs[slab].blocks[idx].data; struct nofl_block_ref block = nofl_block_for_addr(addr); - nofl_block_set_flag(block, NOFL_BLOCK_ZERO); + nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); if (reserved > size) { - nofl_push_unavailable_block(space, block); + nofl_push_paged_out_block(space, block); 
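+        // (These slabs are freshly mapped: already zero and not yet resident,
+        // so they can be parked as unavailable without an madvise.)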
reserved -= NOFL_BLOCK_SIZE; } else { if (!nofl_push_evacuation_target_if_needed(space, block)) diff --git a/src/pcc.c b/src/pcc.c index 0f5d84abd..40ba60f8b 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -13,7 +13,9 @@ #include "debug.h" #include "gc-align.h" #include "gc-inline.h" +#include "gc-platform.h" #include "gc-trace.h" +#include "heap-sizer.h" #include "large-object-space.h" #if GC_PARALLEL #include "parallel-tracer.h" @@ -32,6 +34,7 @@ struct gc_heap { pthread_cond_t collector_cond; pthread_cond_t mutator_cond; size_t size; + size_t total_allocated_bytes_at_last_gc; int collecting; int check_pending_ephemerons; struct gc_pending_ephemerons *pending_ephemerons; @@ -45,6 +48,7 @@ struct gc_heap { struct gc_tracer tracer; double pending_ephemerons_size_factor; double pending_ephemerons_size_slop; + struct gc_heap_sizer sizer; struct gc_event_listener event_listener; void *event_listener_data; }; @@ -316,8 +320,20 @@ static inline void maybe_pause_mutator_for_collection(struct gc_mutator *mut) { pause_mutator_for_collection_without_lock(mut); } -static int maybe_grow_heap(struct gc_heap *heap) { - return 0; +static void resize_heap(size_t new_size, void *data) { + struct gc_heap *heap = data; + if (new_size == heap->size) + return; + DEBUG("------ resizing heap\n"); + DEBUG("------ old heap size: %zu bytes\n", heap->size); + DEBUG("------ new heap size: %zu bytes\n", new_size); + if (new_size < heap->size) + copy_space_shrink(heap_copy_space(heap), heap->size - new_size); + else + copy_space_expand(heap_copy_space(heap), new_size - heap->size); + + heap->size = new_size; + HEAP_EVENT(heap, heap_resized, new_size); } static void visit_root_edge(struct gc_edge edge, struct gc_heap *heap, @@ -375,6 +391,7 @@ static void collect(struct gc_mutator *mut) { struct copy_space *copy_space = heap_copy_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); struct gc_extern_space *exspace = heap_extern_space(heap); + uint64_t start_ns = gc_platform_monotonic_nanoseconds(); MUTATOR_EVENT(mut, mutator_cause_gc); DEBUG("start collect #%ld:\n", heap->count); HEAP_EVENT(heap, requesting_stop); @@ -383,6 +400,9 @@ static void collect(struct gc_mutator *mut) { wait_for_mutators_to_stop(heap); HEAP_EVENT(heap, mutators_stopped); HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); + uint64_t *counter_loc = &heap->total_allocated_bytes_at_last_gc; + copy_space_add_to_allocation_counter(copy_space, counter_loc); + large_object_space_add_to_allocation_counter(lospace, counter_loc); copy_space_flip(copy_space); large_object_space_start_gc(lospace, 0); gc_extern_space_start_gc(exspace, 0); @@ -406,9 +426,12 @@ static void collect(struct gc_mutator *mut) { heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); size_t live_size = (copy_space->allocated_bytes_at_last_gc + large_object_space_size_at_last_collection(lospace)); + uint64_t pause_ns = gc_platform_monotonic_nanoseconds() - start_ns; HEAP_EVENT(heap, live_data_size, live_size); - maybe_grow_heap(heap); - if (!copy_space_page_out_blocks_until_memory_released(copy_space)) { + gc_heap_sizer_on_gc(heap->sizer, heap->size, live_size, pause_ns, + resize_heap, heap); + if (!copy_space_page_out_blocks_until_memory_released(copy_space) + && heap->sizer.policy == GC_HEAP_SIZE_FIXED) { fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", heap->size, copy_space->nslabs); GC_CRASH(); @@ -584,6 +607,24 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { return 1; } +static 
uint64_t allocation_counter_from_thread(void *data) { + struct gc_heap *heap = data; + uint64_t ret = heap->total_allocated_bytes_at_last_gc; + if (pthread_mutex_trylock(&heap->lock)) return ret; + copy_space_add_to_allocation_counter(heap_copy_space(heap), &ret); + large_object_space_add_to_allocation_counter(heap_large_object_space(heap), + &ret); + pthread_mutex_unlock(&heap->lock); + return ret; +} + +static void set_heap_size_from_thread(size_t size, void *data) { + struct gc_heap *heap = data; + if (pthread_mutex_trylock(&heap->lock)) return; + resize_heap(size, heap); + pthread_mutex_unlock(&heap->lock); +} + int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut, struct gc_event_listener event_listener, @@ -596,11 +637,6 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), offsetof(struct copy_space_allocator, limit)); - if (options->common.heap_size_policy != GC_HEAP_SIZE_FIXED) { - fprintf(stderr, "fixed heap size is currently required\n"); - return 0; - } - *heap = calloc(1, sizeof(struct gc_heap)); if (!*heap) GC_CRASH(); @@ -622,6 +658,11 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!large_object_space_init(heap_large_object_space(*heap), *heap)) GC_CRASH(); + (*heap)->sizer = gc_make_heap_sizer(*heap, &options->common, + allocation_counter_from_thread, + set_heap_size_from_thread, + (*heap)); + *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); add_mutator(*heap, *mut); From d785f082b11c78de830bb223ae7c03a8a3452656 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 15 Sep 2024 11:01:49 +0200 Subject: [PATCH 290/403] Factor out adapative heap sizer background thread to own file This will let us piggy-back on the thread to asynchronously release memory to the OS. 
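A rough usage sketch of the task interface this introduces (the heartbeat task,
its counter, and attach_heartbeat below are hypothetical; the thread type, the
priority constant, and the registration calls are the ones added by this
patch). A client registers a callback, and the shared background thread then
runs it roughly once per second, in priority order:

    #include <stdio.h>
    #include "background-thread.h"   // header added by this patch

    // Hypothetical task: log a heartbeat each time the background thread ticks.
    static void log_heartbeat(void *data) {
      size_t *ticks = data;
      fprintf(stderr, "background tick %zu\n", (*ticks)++);
    }

    static size_t heartbeat_ticks;

    // Returns the task id, which can later be passed to
    // gc_background_thread_remove_task if the task should be dropped.
    static int attach_heartbeat(struct gc_background_thread *bg) {
      return gc_background_thread_add_task(bg, GC_BACKGROUND_TASK_NORMAL,
                                           log_heartbeat, &heartbeat_ticks);
    }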
--- src/adaptive-heap-sizer.h | 80 ++++++++-------------- src/background-thread.h | 138 ++++++++++++++++++++++++++++++++++++++ src/heap-sizer.h | 5 +- src/mmc.c | 6 +- src/pcc.c | 6 +- 5 files changed, 179 insertions(+), 56 deletions(-) create mode 100644 src/background-thread.h diff --git a/src/adaptive-heap-sizer.h b/src/adaptive-heap-sizer.h index 796f07469..126a493b8 100644 --- a/src/adaptive-heap-sizer.h +++ b/src/adaptive-heap-sizer.h @@ -7,6 +7,7 @@ #include #include "assert.h" +#include "background-thread.h" #include "debug.h" #include "heap-sizer.h" #include "gc-platform.h" @@ -37,10 +38,10 @@ struct gc_adaptive_heap_sizer { double maximum_multiplier; double minimum_free_space; double expansiveness; - int stopping; - pthread_t thread; pthread_mutex_t lock; - pthread_cond_t cond; + int background_task_id; + uint64_t last_bytes_allocated; + uint64_t last_heartbeat; }; // With lock @@ -86,46 +87,31 @@ gc_adaptive_heap_sizer_on_gc(struct gc_adaptive_heap_sizer *sizer, pthread_mutex_unlock(&sizer->lock); } -static void* -gc_adaptive_heap_sizer_thread(void *data) { +static void +gc_adaptive_heap_sizer_background_task(void *data) { struct gc_adaptive_heap_sizer *sizer = data; - uint64_t last_bytes_allocated = + uint64_t bytes_allocated = sizer->get_allocation_counter(sizer->callback_data); - uint64_t last_heartbeat = gc_platform_monotonic_nanoseconds(); - pthread_mutex_lock(&sizer->lock); - while (!sizer->stopping) { - { - struct timespec ts; - if (clock_gettime(CLOCK_REALTIME, &ts)) { - perror("adaptive heap sizer thread: failed to get time!"); - break; - } - ts.tv_sec += 1; - pthread_cond_timedwait(&sizer->cond, &sizer->lock, &ts); - } - uint64_t bytes_allocated = - sizer->get_allocation_counter(sizer->callback_data); - uint64_t heartbeat = gc_platform_monotonic_nanoseconds(); - double rate = (double) (bytes_allocated - last_bytes_allocated) / - (double) (heartbeat - last_heartbeat); - // Just smooth the rate, under the assumption that the denominator is almost - // always 1. - sizer->smoothed_allocation_rate *= 1.0 - sizer->allocation_smoothing_factor; - sizer->smoothed_allocation_rate += rate * sizer->allocation_smoothing_factor; - last_heartbeat = heartbeat; - last_bytes_allocated = bytes_allocated; - sizer->set_heap_size(gc_adaptive_heap_sizer_calculate_size(sizer), - sizer->callback_data); - } + uint64_t heartbeat = gc_platform_monotonic_nanoseconds(); + double rate = (double) (bytes_allocated - sizer->last_bytes_allocated) / + (double) (heartbeat - sizer->last_heartbeat); + // Just smooth the rate, under the assumption that the denominator is almost + // always 1. 
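+  // (That is, smoothed_rate := (1 - f) * smoothed_rate + f * rate, an
+  // exponentially weighted moving average with smoothing factor f.)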
+ sizer->smoothed_allocation_rate *= 1.0 - sizer->allocation_smoothing_factor; + sizer->smoothed_allocation_rate += rate * sizer->allocation_smoothing_factor; + sizer->last_heartbeat = heartbeat; + sizer->last_bytes_allocated = bytes_allocated; + sizer->set_heap_size(gc_adaptive_heap_sizer_calculate_size(sizer), + sizer->callback_data); pthread_mutex_unlock(&sizer->lock); - return NULL; } static struct gc_adaptive_heap_sizer* gc_make_adaptive_heap_sizer(double expansiveness, uint64_t (*get_allocation_counter)(void *), void (*set_heap_size)(size_t , void *), - void *callback_data) { + void *callback_data, + struct gc_background_thread *thread) { struct gc_adaptive_heap_sizer *sizer; sizer = malloc(sizeof(*sizer)); if (!sizer) @@ -149,25 +135,15 @@ gc_make_adaptive_heap_sizer(double expansiveness, sizer->maximum_multiplier = 5; sizer->minimum_free_space = 4 * 1024 * 1024; sizer->expansiveness = expansiveness; - pthread_mutex_init(&sizer->lock, NULL); - pthread_cond_init(&sizer->cond, NULL); - if (pthread_create(&sizer->thread, NULL, gc_adaptive_heap_sizer_thread, - sizer)) { - perror("spawning adaptive heap size thread failed"); - GC_CRASH(); - } + pthread_mutex_init(&thread->lock, NULL); + sizer->last_bytes_allocated = get_allocation_counter(callback_data); + sizer->last_heartbeat = gc_platform_monotonic_nanoseconds(); + sizer->background_task_id = thread + ? gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_FIRST, + gc_adaptive_heap_sizer_background_task, + sizer) + : -1; return sizer; } -static void -gc_destroy_adaptive_heap_sizer(struct gc_adaptive_heap_sizer *sizer) { - pthread_mutex_lock(&sizer->lock); - GC_ASSERT(!sizer->stopping); - sizer->stopping = 1; - pthread_mutex_unlock(&sizer->lock); - pthread_cond_signal(&sizer->cond); - pthread_join(sizer->thread, NULL); - free(sizer); -} - #endif // ADAPTIVE_HEAP_SIZER_H diff --git a/src/background-thread.h b/src/background-thread.h new file mode 100644 index 000000000..ee858ac58 --- /dev/null +++ b/src/background-thread.h @@ -0,0 +1,138 @@ +#ifndef BACKGROUND_THREAD_H +#define BACKGROUND_THREAD_H + +#include +#include +#include + +#include "assert.h" +#include "debug.h" + +enum { + GC_BACKGROUND_TASK_FIRST = 0, + GC_BACKGROUND_TASK_NORMAL = 100, + GC_BACKGROUND_TASK_LAST = 200 +}; + +struct gc_background_task { + int id; + int priority; + void (*run)(void *data); + void *data; +}; + +struct gc_background_thread { + size_t count; + size_t capacity; + struct gc_background_task *tasks; + int next_id; + int stopping; + pthread_t thread; + pthread_mutex_t lock; + pthread_cond_t cond; +}; + +static void* +gc_background_thread(void *data) { + struct gc_background_thread *thread = data; + struct timespec ts; + if (clock_gettime(CLOCK_REALTIME, &ts)) { + perror("background thread: failed to get time!"); + return NULL; + } + pthread_mutex_lock(&thread->lock); + while (!thread->stopping) { + ts.tv_sec += 1; + pthread_cond_timedwait(&thread->cond, &thread->lock, &ts); + if (thread->stopping) + break; + for (size_t i = 0; i < thread->count; i++) + thread->tasks[i].run(thread->tasks[i].data); + } + pthread_mutex_unlock(&thread->lock); + return NULL; +} + +static struct gc_background_thread* +gc_make_background_thread(void) { + struct gc_background_thread *thread; + thread = malloc(sizeof(*thread)); + if (!thread) + GC_CRASH(); + memset(thread, 0, sizeof(*thread)); + thread->tasks = NULL; + thread->count = 0; + thread->capacity = 0; + pthread_mutex_init(&thread->lock, NULL); + pthread_cond_init(&thread->cond, NULL); + if 
(pthread_create(&thread->thread, NULL, gc_background_thread, thread)) { + perror("spawning background thread failed"); + GC_CRASH(); + } + return thread; +} + +static int +gc_background_thread_add_task(struct gc_background_thread *thread, + int priority, void (*run)(void *data), + void *data) { + pthread_mutex_lock(&thread->lock); + if (thread->count == thread->capacity) { + size_t new_capacity = thread->capacity * 2 + 1; + struct gc_background_task *new_tasks = + realloc(thread->tasks, sizeof(struct gc_background_task) * new_capacity); + if (!new_tasks) { + perror("ran out of space for background tasks!"); + GC_CRASH(); + } + thread->capacity = new_capacity; + thread->tasks = new_tasks; + } + size_t insert = 0; + for (; insert < thread->count; insert++) { + if (priority < thread->tasks[insert].priority) + break; + } + size_t bytes_to_move = + (thread->count - insert) * sizeof(struct gc_background_task); + memmove(&thread->tasks[insert + 1], &thread->tasks[insert], bytes_to_move); + int id = thread->next_id++; + thread->tasks[insert].id = id; + thread->tasks[insert].priority = priority; + thread->tasks[insert].run = run; + thread->tasks[insert].data = data; + thread->count++; + pthread_mutex_unlock(&thread->lock); + return id; +} + +static void +gc_background_thread_remove_task(struct gc_background_thread *thread, + int id) { + pthread_mutex_lock(&thread->lock); + size_t remove = 0; + for (; remove < thread->count; remove++) { + if (thread->tasks[remove].id == id) + break; + } + if (remove == thread->count) + GC_CRASH(); + size_t bytes_to_move = + (thread->count - (remove + 1)) * sizeof(struct gc_background_task); + memmove(&thread->tasks[remove], &thread->tasks[remove + 1], bytes_to_move); + pthread_mutex_unlock(&thread->lock); +} + +static void +gc_destroy_background_thread(struct gc_background_thread *thread) { + pthread_mutex_lock(&thread->lock); + GC_ASSERT(!thread->stopping); + thread->stopping = 1; + pthread_mutex_unlock(&thread->lock); + pthread_cond_signal(&thread->cond); + pthread_join(thread->thread, NULL); + free(thread->tasks); + free(thread); +} + +#endif // BACKGROUND_THREAD_H diff --git a/src/heap-sizer.h b/src/heap-sizer.h index dc6f3d2ef..eb038cca9 100644 --- a/src/heap-sizer.h +++ b/src/heap-sizer.h @@ -20,7 +20,8 @@ gc_make_heap_sizer(struct gc_heap *heap, const struct gc_common_options *options, uint64_t (*get_allocation_counter_from_thread)(void*), void (*set_heap_size_from_thread)(size_t, void*), - void *data) { + void *data, + struct gc_background_thread *thread) { struct gc_heap_sizer ret = { options->heap_size_policy, }; switch (options->heap_size_policy) { case GC_HEAP_SIZE_FIXED: @@ -35,7 +36,7 @@ gc_make_heap_sizer(struct gc_heap *heap, gc_make_adaptive_heap_sizer (options->heap_expansiveness, get_allocation_counter_from_thread, set_heap_size_from_thread, - heap); + heap, thread); break; default: diff --git a/src/mmc.c b/src/mmc.c index 34a96e409..061d3b80f 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -9,6 +9,7 @@ #define GC_IMPL 1 #include "gc-internal.h" +#include "background-thread.h" #include "debug.h" #include "gc-align.h" #include "gc-inline.h" @@ -58,6 +59,7 @@ struct gc_heap { double minimum_major_gc_yield_threshold; double pending_ephemerons_size_factor; double pending_ephemerons_size_slop; + struct gc_background_thread *background_thread; struct gc_heap_sizer sizer; struct gc_event_listener event_listener; void *event_listener_data; @@ -1071,10 +1073,12 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if 
(!large_object_space_init(heap_large_object_space(*heap), *heap)) GC_CRASH(); + (*heap)->background_thread = gc_make_background_thread(); (*heap)->sizer = gc_make_heap_sizer(*heap, &options->common, allocation_counter_from_thread, set_heap_size_from_thread, - (*heap)); + (*heap), + (*heap)->background_thread); *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); diff --git a/src/pcc.c b/src/pcc.c index 40ba60f8b..2cd919e50 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -9,6 +9,7 @@ #define GC_IMPL 1 #include "gc-internal.h" +#include "background-thread.h" #include "copy-space.h" #include "debug.h" #include "gc-align.h" @@ -48,6 +49,7 @@ struct gc_heap { struct gc_tracer tracer; double pending_ephemerons_size_factor; double pending_ephemerons_size_slop; + struct gc_background_thread *background_thread; struct gc_heap_sizer sizer; struct gc_event_listener event_listener; void *event_listener_data; @@ -658,10 +660,12 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!large_object_space_init(heap_large_object_space(*heap), *heap)) GC_CRASH(); + (*heap)->background_thread = gc_make_background_thread(); (*heap)->sizer = gc_make_heap_sizer(*heap, &options->common, allocation_counter_from_thread, set_heap_size_from_thread, - (*heap)); + (*heap), + (*heap)->background_thread); *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); From 7984f60eaee9dfa3d50dbac2589e2e6c8535187b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Sep 2024 08:49:30 +0200 Subject: [PATCH 291/403] MMC and PCC defer actual page-out operations to background thread Should avoid excessive VM traffic when allocating large objects, or when the adaptive heap sizer is on and we see frequent expansions and resizes. --- src/adaptive-heap-sizer.h | 2 +- src/background-thread.h | 41 +++++++++---- src/copy-space.h | 83 +++++++++++++++++++-------- src/mmc.c | 9 ++- src/nofl-space.h | 117 ++++++++++++++++++++++++++------------ src/pcc.c | 8 ++- 6 files changed, 184 insertions(+), 76 deletions(-) diff --git a/src/adaptive-heap-sizer.h b/src/adaptive-heap-sizer.h index 126a493b8..df38f181d 100644 --- a/src/adaptive-heap-sizer.h +++ b/src/adaptive-heap-sizer.h @@ -139,7 +139,7 @@ gc_make_adaptive_heap_sizer(double expansiveness, sizer->last_bytes_allocated = get_allocation_counter(callback_data); sizer->last_heartbeat = gc_platform_monotonic_nanoseconds(); sizer->background_task_id = thread - ? gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_FIRST, + ? 
gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_MIDDLE, gc_adaptive_heap_sizer_background_task, sizer) : -1; diff --git a/src/background-thread.h b/src/background-thread.h index ee858ac58..7a141cee0 100644 --- a/src/background-thread.h +++ b/src/background-thread.h @@ -9,9 +9,9 @@ #include "debug.h" enum { - GC_BACKGROUND_TASK_FIRST = 0, - GC_BACKGROUND_TASK_NORMAL = 100, - GC_BACKGROUND_TASK_LAST = 200 + GC_BACKGROUND_TASK_START = 0, + GC_BACKGROUND_TASK_MIDDLE = 100, + GC_BACKGROUND_TASK_END = 200 }; struct gc_background_task { @@ -21,12 +21,18 @@ struct gc_background_task { void *data; }; +enum gc_background_thread_state { + GC_BACKGROUND_THREAD_STARTING, + GC_BACKGROUND_THREAD_RUNNING, + GC_BACKGROUND_THREAD_STOPPING +}; + struct gc_background_thread { size_t count; size_t capacity; struct gc_background_task *tasks; int next_id; - int stopping; + enum gc_background_thread_state state; pthread_t thread; pthread_mutex_t lock; pthread_cond_t cond; @@ -35,19 +41,20 @@ struct gc_background_thread { static void* gc_background_thread(void *data) { struct gc_background_thread *thread = data; + pthread_mutex_lock(&thread->lock); + while (thread->state == GC_BACKGROUND_THREAD_STARTING) + pthread_cond_wait(&thread->cond, &thread->lock); struct timespec ts; if (clock_gettime(CLOCK_REALTIME, &ts)) { perror("background thread: failed to get time!"); return NULL; } - pthread_mutex_lock(&thread->lock); - while (!thread->stopping) { + while (thread->state == GC_BACKGROUND_THREAD_RUNNING) { ts.tv_sec += 1; pthread_cond_timedwait(&thread->cond, &thread->lock, &ts); - if (thread->stopping) - break; - for (size_t i = 0; i < thread->count; i++) - thread->tasks[i].run(thread->tasks[i].data); + if (thread->state == GC_BACKGROUND_THREAD_RUNNING) + for (size_t i = 0; i < thread->count; i++) + thread->tasks[i].run(thread->tasks[i].data); } pthread_mutex_unlock(&thread->lock); return NULL; @@ -63,6 +70,7 @@ gc_make_background_thread(void) { thread->tasks = NULL; thread->count = 0; thread->capacity = 0; + thread->state = GC_BACKGROUND_THREAD_STARTING; pthread_mutex_init(&thread->lock, NULL); pthread_cond_init(&thread->cond, NULL); if (pthread_create(&thread->thread, NULL, gc_background_thread, thread)) { @@ -72,6 +80,15 @@ gc_make_background_thread(void) { return thread; } +static void +gc_background_thread_start(struct gc_background_thread *thread) { + pthread_mutex_lock(&thread->lock); + GC_ASSERT_EQ(thread->state, GC_BACKGROUND_THREAD_STARTING); + thread->state = GC_BACKGROUND_THREAD_RUNNING; + pthread_mutex_unlock(&thread->lock); + pthread_cond_signal(&thread->cond); +} + static int gc_background_thread_add_task(struct gc_background_thread *thread, int priority, void (*run)(void *data), @@ -126,8 +143,8 @@ gc_background_thread_remove_task(struct gc_background_thread *thread, static void gc_destroy_background_thread(struct gc_background_thread *thread) { pthread_mutex_lock(&thread->lock); - GC_ASSERT(!thread->stopping); - thread->stopping = 1; + GC_ASSERT(thread->state == GC_BACKGROUND_THREAD_RUNNING); + thread->state = GC_BACKGROUND_THREAD_STOPPING; pthread_mutex_unlock(&thread->lock); pthread_cond_signal(&thread->cond); pthread_join(thread->thread, NULL); diff --git a/src/copy-space.h b/src/copy-space.h index 7d8ab98a2..98d3f6146 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -10,6 +10,7 @@ #include "gc-internal.h" #include "assert.h" +#include "background-thread.h" #include "debug.h" #include "extents.h" #include "gc-align.h" @@ -102,13 +103,16 @@ copy_space_object_region(struct gc_ref obj) 
{ return (gc_ref_value(obj) / COPY_SPACE_REGION_SIZE) & 1; } +#define COPY_SPACE_PAGE_OUT_QUEUE_SIZE 4 + struct copy_space { struct copy_space_block *empty; struct copy_space_block *partly_full; struct copy_space_block *full ALIGNED_TO_AVOID_FALSE_SHARING; size_t allocated_bytes; size_t fragmentation; - struct copy_space_block *paged_out ALIGNED_TO_AVOID_FALSE_SHARING; + struct copy_space_block *paged_out[COPY_SPACE_PAGE_OUT_QUEUE_SIZE] + ALIGNED_TO_AVOID_FALSE_SHARING; ssize_t bytes_to_page_out ALIGNED_TO_AVOID_FALSE_SHARING; // The rest of these members are only changed rarely and with the heap // lock. @@ -186,31 +190,23 @@ copy_space_push_partly_full_block(struct copy_space *space, copy_space_push_block(&space->partly_full, block); } -static struct copy_space_block* -copy_space_pop_paged_out_block(struct copy_space *space) { - return copy_space_pop_block(&space->paged_out); -} - -static void -copy_space_push_paged_out_block(struct copy_space *space, - struct copy_space_block *block) { - copy_space_push_block(&space->paged_out, block); -} - static void copy_space_page_out_block(struct copy_space *space, struct copy_space_block *block) { - block->in_core = 0; - block->all_zeroes[0] = block->all_zeroes[1] = 1; - madvise(copy_space_block_payload(block), COPY_SPACE_BLOCK_SIZE, MADV_DONTNEED); - copy_space_push_paged_out_block(space, block); + copy_space_push_block(block->in_core + ? &space->paged_out[0] + : &space->paged_out[COPY_SPACE_PAGE_OUT_QUEUE_SIZE-1], + block); } static struct copy_space_block* copy_space_page_in_block(struct copy_space *space) { - struct copy_space_block* block = copy_space_pop_paged_out_block(space); - if (block) block->in_core = 1; - return block; + for (int age = 0; age < COPY_SPACE_PAGE_OUT_QUEUE_SIZE; age++) { + struct copy_space_block *block = + copy_space_pop_block(&space->paged_out[age]); + if (block) return block; + } + return NULL; } static ssize_t @@ -280,6 +276,7 @@ copy_space_allocator_acquire_empty_block(struct copy_space_allocator *alloc, if (copy_space_allocator_acquire_block(alloc, copy_space_pop_empty_block(space), space->active_region)) { + alloc->block->in_core = 1; if (alloc->block->all_zeroes[space->active_region]) alloc->block->all_zeroes[space->active_region] = 0; else @@ -629,15 +626,45 @@ copy_space_expand(struct copy_space *space, size_t bytes) { struct copy_space_block *block = &slabs[slab].headers[idx]; block->all_zeroes[0] = block->all_zeroes[1] = 1; block->in_core = 0; - copy_space_push_paged_out_block(space, block); + copy_space_page_out_block(space, block); reserved -= COPY_SPACE_BLOCK_SIZE; } } copy_space_reacquire_memory(space, 0); } +static void +copy_space_advance_page_out_queue(void *data) { + struct copy_space *space = data; + for (int age = COPY_SPACE_PAGE_OUT_QUEUE_SIZE - 3; age >= 0; age--) { + while (1) { + struct copy_space_block *block = + copy_space_pop_block(&space->paged_out[age]); + if (!block) break; + copy_space_push_block(&space->paged_out[age + 1], block); + } + } +} + +static void +copy_space_page_out_blocks(void *data) { + struct copy_space *space = data; + int age = COPY_SPACE_PAGE_OUT_QUEUE_SIZE - 2; + while (1) { + struct copy_space_block *block = + copy_space_pop_block(&space->paged_out[age]); + if (!block) break; + block->in_core = 0; + block->all_zeroes[0] = block->all_zeroes[1] = 1; + madvise(copy_space_block_payload(block), COPY_SPACE_BLOCK_SIZE, + MADV_DONTNEED); + copy_space_push_block(&space->paged_out[age + 1], block); + } +} + static int -copy_space_init(struct copy_space *space, size_t size, int 
atomic) { +copy_space_init(struct copy_space *space, size_t size, int atomic, + struct gc_background_thread *thread) { size = align_up(size, COPY_SPACE_BLOCK_SIZE); size_t reserved = align_up(size, COPY_SPACE_SLAB_SIZE); size_t nslabs = reserved / COPY_SPACE_SLAB_SIZE; @@ -648,7 +675,8 @@ copy_space_init(struct copy_space *space, size_t size, int atomic) { space->empty = NULL; space->partly_full = NULL; space->full = NULL; - space->paged_out = NULL; + for (int age = 0; age < COPY_SPACE_PAGE_OUT_QUEUE_SIZE; age++) + space->paged_out[age] = NULL; space->allocated_bytes = 0; space->fragmentation = 0; space->bytes_to_page_out = 0; @@ -662,16 +690,21 @@ copy_space_init(struct copy_space *space, size_t size, int atomic) { for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { struct copy_space_block *block = &slabs[slab].headers[idx]; block->all_zeroes[0] = block->all_zeroes[1] = 1; + block->in_core = 0; if (reserved > size) { - block->in_core = 0; - copy_space_push_paged_out_block(space, block); + copy_space_page_out_block(space, block); reserved -= COPY_SPACE_BLOCK_SIZE; } else { - block->in_core = 1; copy_space_push_empty_block(space, block); } } } + gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_START, + copy_space_advance_page_out_queue, + space); + gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_END, + copy_space_page_out_blocks, + space); return 1; } diff --git a/src/mmc.c b/src/mmc.c index 061d3b80f..bd0cc958b 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -1010,6 +1010,8 @@ heap_init(struct gc_heap *heap, const struct gc_options *options) { if (!heap->finalizer_state) GC_CRASH(); + heap->background_thread = gc_make_background_thread(); + return 1; } @@ -1064,7 +1066,8 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct nofl_space *space = heap_nofl_space(*heap); if (!nofl_space_init(space, (*heap)->size, options->common.parallelism != 1, - (*heap)->fragmentation_low_threshold)) { + (*heap)->fragmentation_low_threshold, + (*heap)->background_thread)) { free(*heap); *heap = NULL; return 0; @@ -1073,7 +1076,6 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!large_object_space_init(heap_large_object_space(*heap), *heap)) GC_CRASH(); - (*heap)->background_thread = gc_make_background_thread(); (*heap)->sizer = gc_make_heap_sizer(*heap, &options->common, allocation_counter_from_thread, set_heap_size_from_thread, @@ -1084,6 +1086,9 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!*mut) GC_CRASH(); gc_stack_init(&(*mut)->stack, stack_base); add_mutator(*heap, *mut); + + gc_background_thread_start((*heap)->background_thread); + return 1; } diff --git a/src/nofl-space.h b/src/nofl-space.h index 5c46bb7ff..f52ad9e28 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -137,6 +137,8 @@ struct nofl_block_list { uintptr_t blocks; }; +#define NOFL_PAGE_OUT_QUEUE_SIZE 4 + struct nofl_space { uint64_t sweep_mask; uint8_t live_mask; @@ -146,7 +148,7 @@ struct nofl_space { size_t heap_size; uint8_t last_collection_was_minor; struct nofl_block_list empty; - struct nofl_block_list unavailable; + struct nofl_block_list paged_out[NOFL_PAGE_OUT_QUEUE_SIZE]; struct nofl_block_list to_sweep; struct nofl_block_list partly_full; struct nofl_block_list full; @@ -407,31 +409,26 @@ nofl_block_count(struct nofl_block_list *list) { return atomic_load_explicit(&list->count, memory_order_acquire); } -static void -nofl_push_paged_out_block(struct nofl_space *space, - struct 
nofl_block_ref block) { - GC_ASSERT(nofl_block_has_flag(block, - NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT)); - nofl_block_set_flag(block, NOFL_BLOCK_UNAVAILABLE); - nofl_push_block(&space->unavailable, block); -} - static void nofl_push_unavailable_block(struct nofl_space *space, struct nofl_block_ref block) { - if (!nofl_block_has_flag(block, NOFL_BLOCK_PAGED_OUT)) { - nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); - madvise((void*)block.addr, NOFL_BLOCK_SIZE, MADV_DONTNEED); - } - nofl_push_paged_out_block(space, block); + nofl_block_set_flag(block, NOFL_BLOCK_UNAVAILABLE); + nofl_push_block(nofl_block_has_flag(block, NOFL_BLOCK_PAGED_OUT) + ? &space->paged_out[NOFL_PAGE_OUT_QUEUE_SIZE-1] + : &space->paged_out[0], + block); } static struct nofl_block_ref nofl_pop_unavailable_block(struct nofl_space *space) { - struct nofl_block_ref block = nofl_pop_block(&space->unavailable); - if (!nofl_block_is_null(block)) - nofl_block_clear_flag(block, NOFL_BLOCK_UNAVAILABLE); - return block; + for (int age = 0; age < NOFL_PAGE_OUT_QUEUE_SIZE; age++) { + struct nofl_block_ref block = nofl_pop_block(&space->paged_out[age]); + if (!nofl_block_is_null(block)) { + nofl_block_clear_flag(block, NOFL_BLOCK_UNAVAILABLE); + return block; + } + } + return nofl_block_null(); } static void @@ -445,14 +442,23 @@ nofl_pop_empty_block(struct nofl_space *space) { return nofl_pop_block(&space->empty); } +static size_t +nofl_active_block_count(struct nofl_space *space) { + size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; + size_t unavailable = 0; + for (int age = 0; age < NOFL_PAGE_OUT_QUEUE_SIZE; age++) + unavailable += nofl_block_count(&space->paged_out[age]); + GC_ASSERT(unavailable <= total); + return total - unavailable; +} + static int nofl_maybe_push_evacuation_target(struct nofl_space *space, struct nofl_block_ref block, double reserve) { size_t targets = nofl_block_count(&space->evacuation_targets); - size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; - size_t unavailable = nofl_block_count(&space->unavailable); - if (targets >= (total - unavailable) * reserve) + size_t active = nofl_active_block_count(space); + if (targets >= active * reserve) return 0; nofl_push_block(&space->evacuation_targets, block); @@ -1084,9 +1090,8 @@ nofl_space_finish_evacuation(struct nofl_space *space) { // repopulate the reserve. 
GC_ASSERT(space->evacuating); space->evacuating = 0; - size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; - size_t unavailable = nofl_block_count(&space->unavailable); - size_t reserve = space->evacuation_minimum_reserve * (total - unavailable); + size_t active = nofl_active_block_count(space); + size_t reserve = space->evacuation_minimum_reserve * active; GC_ASSERT(nofl_block_count(&space->evacuation_targets) == 0); while (reserve--) { struct nofl_block_ref block = nofl_pop_block(&space->empty); @@ -1214,7 +1219,8 @@ nofl_space_verify_before_restart(struct nofl_space *space) { nofl_space_verify_swept_blocks(space, &space->full); nofl_space_verify_swept_blocks(space, &space->old); nofl_space_verify_empty_blocks(space, &space->empty, 1); - nofl_space_verify_empty_blocks(space, &space->unavailable, 0); + for (int age = 0; age < NOFL_PAGE_OUT_QUEUE_SIZE; age++) + nofl_space_verify_empty_blocks(space, &space->paged_out[age], 0); // GC_ASSERT(space->last_collection_was_minor || !nofl_block_count(&space->old)); } @@ -1229,9 +1235,8 @@ nofl_space_finish_gc(struct nofl_space *space, // If we were evacuating and preferentially allocated empty blocks // to the evacuation reserve, return those blocks to the empty set // for allocation by the mutator. - size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; - size_t unavailable = nofl_block_count(&space->unavailable); - size_t target = space->evacuation_minimum_reserve * (total - unavailable); + size_t active = nofl_active_block_count(space); + size_t target = space->evacuation_minimum_reserve * active; size_t reserve = nofl_block_count(&space->evacuation_targets); while (reserve-- > target) nofl_push_block(&space->empty, @@ -1626,9 +1631,8 @@ nofl_space_shrink(struct nofl_space *space, size_t bytes) { // during trace, synchronously from gc_heap_sizer_on_gc, or async but subject // to the heap lock. if (pending > 0) { - size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; - size_t unavailable = nofl_block_count(&space->unavailable); - size_t target = space->evacuation_minimum_reserve * (total - unavailable); + size_t active = nofl_active_block_count(space); + size_t target = space->evacuation_minimum_reserve * active; ssize_t avail = nofl_block_count(&space->evacuation_targets); while (avail > target && pending > 0) { struct nofl_block_ref block = nofl_pop_block(&space->evacuation_targets); @@ -1660,15 +1664,52 @@ nofl_space_expand(struct nofl_space *space, size_t bytes) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[idx].data; struct nofl_block_ref block = nofl_block_for_addr(addr); nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); - nofl_push_paged_out_block(space, block); + nofl_push_unavailable_block(space, block); } } nofl_space_reacquire_memory(space, 0); } +static void +nofl_space_advance_page_out_queue(void *data) { + // When the nofl space goes to return a block to the OS, it goes on the head + // of the page-out queue. Every second, the background thread will age the + // queue, moving all blocks from index 0 to index 1, and so on. When a block + // reaches the end of the queue it is paged out (and stays at the end of the + // queue). In this task, invoked by the background thread, we age queue + // items, except that we don't page out yet, as it could be that some other + // background task will need to pull pages back in. 
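+  // Roughly: a still-resident block returned to the space enters the queue at
+  // index 0, advances to index 1 and then 2 on successive ticks, and only once
+  // the page-out task finds it at index 2 is it madvised away and parked at
+  // the final index; so a block has on the order of a couple of seconds in
+  // which it can be reused without having to page memory back in.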
+ struct nofl_space *space = data; + for (int age = NOFL_PAGE_OUT_QUEUE_SIZE - 3; age >= 0; age--) { + while (1) { + struct nofl_block_ref block = nofl_pop_block(&space->paged_out[age]); + if (nofl_block_is_null(block)) + break; + nofl_push_block(&space->paged_out[age + 1], block); + } + } +} + +static void +nofl_space_page_out_blocks(void *data) { + // This task is invoked by the background thread after other tasks. It + // actually pages out blocks that reached the end of the queue. + struct nofl_space *space = data; + int age = NOFL_PAGE_OUT_QUEUE_SIZE - 2; + while (1) { + struct nofl_block_ref block = nofl_pop_block(&space->paged_out[age]); + if (nofl_block_is_null(block)) + break; + nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); + madvise((void*)block.addr, NOFL_BLOCK_SIZE, MADV_DONTNEED); + nofl_push_block(&space->paged_out[age + 1], block); + } +} + static int nofl_space_init(struct nofl_space *space, size_t size, int atomic, - double promotion_threshold) { + double promotion_threshold, + struct gc_background_thread *thread) { size = align_up(size, NOFL_BLOCK_SIZE); size_t reserved = align_up(size, NOFL_SLAB_SIZE); size_t nslabs = reserved / NOFL_SLAB_SIZE; @@ -1689,7 +1730,7 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, struct nofl_block_ref block = nofl_block_for_addr(addr); nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); if (reserved > size) { - nofl_push_paged_out_block(space, block); + nofl_push_unavailable_block(space, block); reserved -= NOFL_BLOCK_SIZE; } else { if (!nofl_push_evacuation_target_if_needed(space, block)) @@ -1697,6 +1738,12 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, } } } + gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_START, + nofl_space_advance_page_out_queue, + space); + gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_END, + nofl_space_page_out_blocks, + space); return 1; } diff --git a/src/pcc.c b/src/pcc.c index 2cd919e50..54c03404e 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -606,6 +606,8 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { if (!heap->finalizer_state) GC_CRASH(); + heap->background_thread = gc_make_background_thread(); + return 1; } @@ -651,7 +653,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct copy_space *space = heap_copy_space(*heap); int atomic_forward = options->common.parallelism > 1; - if (!copy_space_init(space, (*heap)->size, atomic_forward)) { + if (!copy_space_init(space, (*heap)->size, atomic_forward, + (*heap)->background_thread)) { free(*heap); *heap = NULL; return 0; @@ -670,6 +673,9 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); add_mutator(*heap, *mut); + + gc_background_thread_start((*heap)->background_thread); + return 1; } From cf570d0206a7b3b4efb39b16b196b210e863c9f6 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Sep 2024 10:51:07 +0200 Subject: [PATCH 292/403] Don't release shared worklist buffers when less than 256 kB Fixes https://github.com/wingo/whippet/issues/8. --- src/shared-worklist.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/shared-worklist.h b/src/shared-worklist.h index 6d5ed3315..afefb11e2 100644 --- a/src/shared-worklist.h +++ b/src/shared-worklist.h @@ -29,6 +29,8 @@ struct shared_worklist_buf { // Max size: 2 GB on 64-bit systems, 1 GB on 32-bit. 
#define shared_worklist_buf_max_log_size ((unsigned) 28) +static const size_t shared_worklist_release_byte_threshold = 256 * 1024; + static int shared_worklist_buf_init(struct shared_worklist_buf *buf, unsigned log_size) { ASSERT(log_size >= shared_worklist_buf_min_log_size); @@ -59,8 +61,9 @@ shared_worklist_buf_byte_size(struct shared_worklist_buf *buf) { static void shared_worklist_buf_release(struct shared_worklist_buf *buf) { - if (buf->data) - madvise(buf->data, shared_worklist_buf_byte_size(buf), MADV_DONTNEED); + size_t byte_size = shared_worklist_buf_byte_size(buf); + if (buf->data && byte_size >= shared_worklist_release_byte_threshold) + madvise(buf->data, byte_size, MADV_DONTNEED); } static void From dcfdc547f6d548e5a302d6a725cc3ceca894c954 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Sep 2024 11:45:01 +0200 Subject: [PATCH 293/403] Whoops, fix refactor-induced locking problem --- src/adaptive-heap-sizer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/adaptive-heap-sizer.h b/src/adaptive-heap-sizer.h index df38f181d..d2c7c8612 100644 --- a/src/adaptive-heap-sizer.h +++ b/src/adaptive-heap-sizer.h @@ -90,6 +90,7 @@ gc_adaptive_heap_sizer_on_gc(struct gc_adaptive_heap_sizer *sizer, static void gc_adaptive_heap_sizer_background_task(void *data) { struct gc_adaptive_heap_sizer *sizer = data; + pthread_mutex_lock(&sizer->lock); uint64_t bytes_allocated = sizer->get_allocation_counter(sizer->callback_data); uint64_t heartbeat = gc_platform_monotonic_nanoseconds(); From 317039d9524e9967a505edcd361a4179221194da Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Sep 2024 13:07:25 +0200 Subject: [PATCH 294/403] Relax assertion when expanding the heap It could be that newly mapped blocks were already acquired by other threads. --- src/nofl-space.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index f52ad9e28..bad9c315d 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1667,7 +1667,7 @@ nofl_space_expand(struct nofl_space *space, size_t bytes) { nofl_push_unavailable_block(space, block); } } - nofl_space_reacquire_memory(space, 0); + nofl_space_maybe_reacquire_memory(space, 0); } static void From 1bf250f62aa76f73d6bc7ad27b98d8224cc6e915 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Sep 2024 13:40:09 +0200 Subject: [PATCH 295/403] Heap growth can compete with lospace for nofl blocks --- src/mmc.c | 7 ++++++- src/nofl-space.h | 6 ------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/mmc.c b/src/mmc.c index bd0cc958b..2db1b2cc5 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -395,7 +395,12 @@ heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { GC_ASSERT(npages <= previous); size_t bytes = (previous - npages) << heap_large_object_space(heap)->page_size_log2; - nofl_space_reacquire_memory(heap_nofl_space(heap), bytes); + // If heap size is fixed, we won't need to allocate any more nofl blocks, as + // nothing uses paged-out blocks except large object allocation. But if the + // heap can grow, growth can consume nofl-space blocks that were paged out to + // allow for lospace allocations, which means that here we may need to + // allocate additional slabs. 
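To make the accounting in `heap_reset_large_object_pages` concrete, here is the byte computation with invented numbers; the 4 kB large-object page size is an assumption, the real value comes from the large object space.

```
#include <stddef.h>
#include <stdio.h>

int main(void) {
  /* Illustrative numbers only: say the large-object space used 96
     pages last cycle and only 64 are in use now, with an assumed
     page_size_log2 of 12 (4 kB pages). */
  size_t previous = 96, npages = 64, page_size_log2 = 12;
  size_t bytes = (previous - npages) << page_size_log2;
  printf("%zu bytes (%zu pages) handed back to the nofl space\n",
         bytes, previous - npages);
  return 0;
}
```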
+ nofl_space_expand(heap_nofl_space(heap), bytes); } static void diff --git a/src/nofl-space.h b/src/nofl-space.h index bad9c315d..ea7fe1c31 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1291,12 +1291,6 @@ nofl_space_maybe_reacquire_memory(struct nofl_space *space, size_t bytes) { return pending; } -static void -nofl_space_reacquire_memory(struct nofl_space *space, size_t bytes) { - ssize_t pending = nofl_space_maybe_reacquire_memory(space, bytes); - GC_ASSERT(pending + NOFL_BLOCK_SIZE > 0); -} - static int nofl_space_sweep_until_memory_released(struct nofl_space *space, struct nofl_allocator *alloc) { From b7306950bca27203f82139c3f6bbeaaa9a243cea Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Sep 2024 14:19:54 +0200 Subject: [PATCH 296/403] Implement adaptive heap sizing for semi --- src/adaptive-heap-sizer.h | 74 ++++++++++++++++++++++------------- src/growable-heap-sizer.h | 9 +++-- src/heap-sizer.h | 20 +++++----- src/mmc.c | 49 +++++++++++------------ src/pcc.c | 50 +++++++++++------------- src/semi.c | 82 ++++++++++++++++++++------------------- 6 files changed, 149 insertions(+), 135 deletions(-) diff --git a/src/adaptive-heap-sizer.h b/src/adaptive-heap-sizer.h index d2c7c8612..6d3db05bb 100644 --- a/src/adaptive-heap-sizer.h +++ b/src/adaptive-heap-sizer.h @@ -9,8 +9,9 @@ #include "assert.h" #include "background-thread.h" #include "debug.h" -#include "heap-sizer.h" +#include "gc-config.h" #include "gc-platform.h" +#include "heap-sizer.h" // This is the MemBalancer algorithm from "Optimal Heap Limits for Reducing // Browser Memory Use" by Marisa Kirisame, Pranav Shenoy, and Pavel Panchekha @@ -25,9 +26,9 @@ // high on program startup. struct gc_adaptive_heap_sizer { - uint64_t (*get_allocation_counter)(void *callback_data); - void (*set_heap_size)(size_t size, void *callback_data); - void *callback_data; + uint64_t (*get_allocation_counter)(struct gc_heap *heap); + void (*set_heap_size)(struct gc_heap *heap, size_t size); + struct gc_heap *heap; uint64_t smoothed_pause_time; uint64_t smoothed_live_bytes; uint64_t live_bytes; @@ -38,12 +39,28 @@ struct gc_adaptive_heap_sizer { double maximum_multiplier; double minimum_free_space; double expansiveness; +#if GC_PARALLEL pthread_mutex_t lock; +#endif int background_task_id; uint64_t last_bytes_allocated; uint64_t last_heartbeat; }; +static void +gc_adaptive_heap_sizer_lock(struct gc_adaptive_heap_sizer *sizer) { +#if GC_PARALLEL + pthread_mutex_lock(&sizer->lock); +#endif +} + +static void +gc_adaptive_heap_sizer_unlock(struct gc_adaptive_heap_sizer *sizer) { +#if GC_PARALLEL + pthread_mutex_unlock(&sizer->lock); +#endif +} + // With lock static uint64_t gc_adaptive_heap_sizer_calculate_size(struct gc_adaptive_heap_sizer *sizer) { @@ -65,34 +82,33 @@ gc_adaptive_heap_sizer_calculate_size(struct gc_adaptive_heap_sizer *sizer) { static uint64_t gc_adaptive_heap_sizer_set_expansiveness(struct gc_adaptive_heap_sizer *sizer, double expansiveness) { - pthread_mutex_lock(&sizer->lock); + gc_adaptive_heap_sizer_lock(sizer); sizer->expansiveness = expansiveness; uint64_t heap_size = gc_adaptive_heap_sizer_calculate_size(sizer); - pthread_mutex_unlock(&sizer->lock); + gc_adaptive_heap_sizer_unlock(sizer); return heap_size; } static void gc_adaptive_heap_sizer_on_gc(struct gc_adaptive_heap_sizer *sizer, size_t live_bytes, uint64_t pause_ns, - void (*set_heap_size)(size_t, void*), - void *data) { - pthread_mutex_lock(&sizer->lock); + void (*set_heap_size)(struct gc_heap*, size_t)) { + gc_adaptive_heap_sizer_lock(sizer); 
sizer->live_bytes = live_bytes; sizer->smoothed_live_bytes *= 1.0 - sizer->collection_smoothing_factor; sizer->smoothed_live_bytes += sizer->collection_smoothing_factor * live_bytes; sizer->smoothed_pause_time *= 1.0 - sizer->collection_smoothing_factor; sizer->smoothed_pause_time += sizer->collection_smoothing_factor * pause_ns; - set_heap_size(gc_adaptive_heap_sizer_calculate_size(sizer), data); - pthread_mutex_unlock(&sizer->lock); + set_heap_size(sizer->heap, gc_adaptive_heap_sizer_calculate_size(sizer)); + gc_adaptive_heap_sizer_unlock(sizer); } static void gc_adaptive_heap_sizer_background_task(void *data) { struct gc_adaptive_heap_sizer *sizer = data; - pthread_mutex_lock(&sizer->lock); + gc_adaptive_heap_sizer_lock(sizer); uint64_t bytes_allocated = - sizer->get_allocation_counter(sizer->callback_data); + sizer->get_allocation_counter(sizer->heap); uint64_t heartbeat = gc_platform_monotonic_nanoseconds(); double rate = (double) (bytes_allocated - sizer->last_bytes_allocated) / (double) (heartbeat - sizer->last_heartbeat); @@ -102,16 +118,15 @@ gc_adaptive_heap_sizer_background_task(void *data) { sizer->smoothed_allocation_rate += rate * sizer->allocation_smoothing_factor; sizer->last_heartbeat = heartbeat; sizer->last_bytes_allocated = bytes_allocated; - sizer->set_heap_size(gc_adaptive_heap_sizer_calculate_size(sizer), - sizer->callback_data); - pthread_mutex_unlock(&sizer->lock); + sizer->set_heap_size(sizer->heap, + gc_adaptive_heap_sizer_calculate_size(sizer)); + gc_adaptive_heap_sizer_unlock(sizer); } static struct gc_adaptive_heap_sizer* -gc_make_adaptive_heap_sizer(double expansiveness, - uint64_t (*get_allocation_counter)(void *), - void (*set_heap_size)(size_t , void *), - void *callback_data, +gc_make_adaptive_heap_sizer(struct gc_heap *heap, double expansiveness, + uint64_t (*get_allocation_counter)(struct gc_heap*), + void (*set_heap_size)(struct gc_heap*, size_t), struct gc_background_thread *thread) { struct gc_adaptive_heap_sizer *sizer; sizer = malloc(sizeof(*sizer)); @@ -120,7 +135,7 @@ gc_make_adaptive_heap_sizer(double expansiveness, memset(sizer, 0, sizeof(*sizer)); sizer->get_allocation_counter = get_allocation_counter; sizer->set_heap_size = set_heap_size; - sizer->callback_data = callback_data; + sizer->heap = heap; // Baseline estimate of GC speed: 10 MB/ms, or 10 bytes/ns. However since we // observe this speed by separately noisy measurements, we have to provide // defaults for numerator and denominator; estimate 2ms for initial GC pauses @@ -136,14 +151,17 @@ gc_make_adaptive_heap_sizer(double expansiveness, sizer->maximum_multiplier = 5; sizer->minimum_free_space = 4 * 1024 * 1024; sizer->expansiveness = expansiveness; - pthread_mutex_init(&thread->lock, NULL); - sizer->last_bytes_allocated = get_allocation_counter(callback_data); + sizer->last_bytes_allocated = get_allocation_counter(heap); sizer->last_heartbeat = gc_platform_monotonic_nanoseconds(); - sizer->background_task_id = thread - ? 
gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_MIDDLE, - gc_adaptive_heap_sizer_background_task, - sizer) - : -1; +#if GC_PARALLEL + pthread_mutex_init(&thread->lock, NULL); + sizer->background_task_id = + gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_MIDDLE, + gc_adaptive_heap_sizer_background_task, + sizer); +#else + sizer->background_task_id = -1; +#endif return sizer; } diff --git a/src/growable-heap-sizer.h b/src/growable-heap-sizer.h index bf4200893..49e5ad377 100644 --- a/src/growable-heap-sizer.h +++ b/src/growable-heap-sizer.h @@ -13,6 +13,7 @@ // the heap. struct gc_growable_heap_sizer { + struct gc_heap *heap; double multiplier; pthread_mutex_t lock; }; @@ -29,22 +30,22 @@ static void gc_growable_heap_sizer_on_gc(struct gc_growable_heap_sizer *sizer, size_t heap_size, size_t live_bytes, uint64_t pause_ns, - void (*set_heap_size)(size_t, void*), - void *data) { + void (*set_heap_size)(struct gc_heap*, size_t)) { pthread_mutex_lock(&sizer->lock); size_t target_size = live_bytes * sizer->multiplier; if (target_size > heap_size) - set_heap_size(target_size, data); + set_heap_size(sizer->heap, target_size); pthread_mutex_unlock(&sizer->lock); } static struct gc_growable_heap_sizer* -gc_make_growable_heap_sizer(double multiplier) { +gc_make_growable_heap_sizer(struct gc_heap *heap, double multiplier) { struct gc_growable_heap_sizer *sizer; sizer = malloc(sizeof(*sizer)); if (!sizer) GC_CRASH(); memset(sizer, 0, sizeof(*sizer)); + sizer->heap = heap; sizer->multiplier = multiplier; pthread_mutex_init(&sizer->lock, NULL); return sizer; diff --git a/src/heap-sizer.h b/src/heap-sizer.h index eb038cca9..46ef841c8 100644 --- a/src/heap-sizer.h +++ b/src/heap-sizer.h @@ -18,9 +18,8 @@ struct gc_heap_sizer { static struct gc_heap_sizer gc_make_heap_sizer(struct gc_heap *heap, const struct gc_common_options *options, - uint64_t (*get_allocation_counter_from_thread)(void*), - void (*set_heap_size_from_thread)(size_t, void*), - void *data, + uint64_t (*get_allocation_counter_from_thread)(struct gc_heap*), + void (*set_heap_size_from_thread)(struct gc_heap*, size_t), struct gc_background_thread *thread) { struct gc_heap_sizer ret = { options->heap_size_policy, }; switch (options->heap_size_policy) { @@ -28,15 +27,16 @@ gc_make_heap_sizer(struct gc_heap *heap, break; case GC_HEAP_SIZE_GROWABLE: - ret.growable = gc_make_growable_heap_sizer(options->heap_size_multiplier); + ret.growable = + gc_make_growable_heap_sizer(heap, options->heap_size_multiplier); break; case GC_HEAP_SIZE_ADAPTIVE: ret.adaptive = - gc_make_adaptive_heap_sizer (options->heap_expansiveness, + gc_make_adaptive_heap_sizer (heap, options->heap_expansiveness, get_allocation_counter_from_thread, set_heap_size_from_thread, - heap, thread); + thread); break; default: @@ -48,19 +48,21 @@ gc_make_heap_sizer(struct gc_heap *heap, static void gc_heap_sizer_on_gc(struct gc_heap_sizer sizer, size_t heap_size, size_t live_bytes, size_t pause_ns, - void (*set_heap_size)(size_t, void*), void *data) { + void (*set_heap_size)(struct gc_heap*, size_t)) { switch (sizer.policy) { case GC_HEAP_SIZE_FIXED: break; case GC_HEAP_SIZE_GROWABLE: gc_growable_heap_sizer_on_gc(sizer.growable, heap_size, live_bytes, - pause_ns, set_heap_size, data); + pause_ns, set_heap_size); break; case GC_HEAP_SIZE_ADAPTIVE: + if (sizer.adaptive->background_task_id < 0) + gc_adaptive_heap_sizer_background_task(sizer.adaptive); gc_adaptive_heap_sizer_on_gc(sizer.adaptive, live_bytes, pause_ns, - set_heap_size, data); + set_heap_size); break; default: 
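For readers who have not seen the MemBalancer paper cited in `adaptive-heap-sizer.h`, the sketch below shows the general shape of the policy: exponentially smooth the observed live size, allocation rate, and collection speed, then grant headroom proportional to the square root of their product. The smoothing mirrors the patch; the sizing formula and the `slack_bytes` tunable are a loose paraphrase of the paper, not the code of `gc_adaptive_heap_sizer_calculate_size` (which presumably also applies the `minimum_multiplier`, `maximum_multiplier`, and `minimum_free_space` bounds visible in the struct), so treat the constants and field names as assumptions.

```
#include <math.h>
#include <stdio.h>

/* Illustrative MemBalancer-style sizer; fields and constants are
   invented for this sketch, not taken from adaptive-heap-sizer.h. */
struct toy_sizer {
  double live_bytes;    /* L: smoothed live data, bytes */
  double alloc_rate;    /* g: smoothed allocation rate, bytes/ns */
  double gc_speed;      /* s: smoothed bytes traced per ns of pause */
  double slack_bytes;   /* tunable, plays the role of expansiveness */
};

/* Exponential smoothing, as in gc_adaptive_heap_sizer_on_gc and the
   background task: new = old*(1-f) + sample*f. */
static double smooth(double old, double sample, double f) {
  return old * (1.0 - f) + sample * f;
}

/* Square-root rule, roughly: grant extra space proportional to
   sqrt(L * g / s).  Cheap-to-collect, slowly-allocating heaps stay
   close to their live size; hot heaps get more headroom. */
static double target_heap_size(const struct toy_sizer *s) {
  double extra = sqrt(s->live_bytes * (s->alloc_rate / s->gc_speed)
                      * s->slack_bytes);
  return s->live_bytes + extra;
}

int main(void) {
  struct toy_sizer s = {
    .live_bytes = 64.0 * 1024 * 1024,
    .alloc_rate = 1.0,                   /* ~1 GB/s */
    .gc_speed = 10.0,                    /* the patch's 10 bytes/ns baseline */
    .slack_bytes = 1024.0 * 1024 * 1024,
  };
  /* One collection observes 80 MB live and a 5 ms pause; 0.5 is an
     arbitrary smoothing factor for the example. */
  s.live_bytes = smooth(s.live_bytes, 80.0 * 1024 * 1024, 0.5);
  s.gc_speed = smooth(s.gc_speed, (80.0 * 1024 * 1024) / 5e6, 0.5);
  printf("target heap size: %.1f MB\n", target_heap_size(&s) / (1024 * 1024));
  return 0;
}
```

For these inputs the target lands at roughly twice the live size; a faster collector or a slower allocator pulls it back toward the live size.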
diff --git a/src/mmc.c b/src/mmc.c index 2db1b2cc5..533067b40 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -462,8 +462,7 @@ maybe_pause_mutator_for_collection(struct gc_mutator *mut) { } static void -resize_heap(size_t new_size, void *data) { - struct gc_heap *heap = data; +resize_heap(struct gc_heap *heap, size_t new_size) { if (new_size == heap->size) return; DEBUG("------ resizing heap\n"); @@ -801,7 +800,7 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { heap_estimate_live_data_after_gc(heap, live_bytes, yield); DEBUG("--- total live bytes estimate: %zu\n", live_bytes_estimate); gc_heap_sizer_on_gc(heap->sizer, heap->size, live_bytes_estimate, pause_ns, - resize_heap, heap); + resize_heap); heap->size_at_last_gc = heap->size; HEAP_EVENT(heap, restarting_mutators); allow_mutators_to_continue(heap); @@ -987,6 +986,22 @@ gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } +static uint64_t allocation_counter_from_thread(struct gc_heap *heap) { + uint64_t ret = heap->total_allocated_bytes_at_last_gc; + if (pthread_mutex_trylock(&heap->lock)) return ret; + nofl_space_add_to_allocation_counter(heap_nofl_space(heap), &ret); + large_object_space_add_to_allocation_counter(heap_large_object_space(heap), + &ret); + pthread_mutex_unlock(&heap->lock); + return ret; +} + +static void set_heap_size_from_thread(struct gc_heap *heap, size_t size) { + if (pthread_mutex_trylock(&heap->lock)) return; + resize_heap(heap, size); + pthread_mutex_unlock(&heap->lock); +} + static int heap_init(struct gc_heap *heap, const struct gc_options *options) { // *heap is already initialized to 0. @@ -1016,28 +1031,14 @@ heap_init(struct gc_heap *heap, const struct gc_options *options) { GC_CRASH(); heap->background_thread = gc_make_background_thread(); + heap->sizer = gc_make_heap_sizer(heap, &options->common, + allocation_counter_from_thread, + set_heap_size_from_thread, + heap->background_thread); return 1; } -static uint64_t allocation_counter_from_thread(void *data) { - struct gc_heap *heap = data; - uint64_t ret = heap->total_allocated_bytes_at_last_gc; - if (pthread_mutex_trylock(&heap->lock)) return ret; - nofl_space_add_to_allocation_counter(heap_nofl_space(heap), &ret); - large_object_space_add_to_allocation_counter(heap_large_object_space(heap), - &ret); - pthread_mutex_unlock(&heap->lock); - return ret; -} - -static void set_heap_size_from_thread(size_t size, void *data) { - struct gc_heap *heap = data; - if (pthread_mutex_trylock(&heap->lock)) return; - resize_heap(size, heap); - pthread_mutex_unlock(&heap->lock); -} - int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut, @@ -1081,12 +1082,6 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!large_object_space_init(heap_large_object_space(*heap), *heap)) GC_CRASH(); - (*heap)->sizer = gc_make_heap_sizer(*heap, &options->common, - allocation_counter_from_thread, - set_heap_size_from_thread, - (*heap), - (*heap)->background_thread); - *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); gc_stack_init(&(*mut)->stack, stack_base); diff --git a/src/pcc.c b/src/pcc.c index 54c03404e..34a7f47b6 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -322,8 +322,7 @@ static inline void maybe_pause_mutator_for_collection(struct gc_mutator *mut) { pause_mutator_for_collection_without_lock(mut); } -static void resize_heap(size_t new_size, void *data) { 
- struct gc_heap *heap = data; +static void resize_heap(struct gc_heap *heap, size_t new_size) { if (new_size == heap->size) return; DEBUG("------ resizing heap\n"); @@ -431,7 +430,7 @@ static void collect(struct gc_mutator *mut) { uint64_t pause_ns = gc_platform_monotonic_nanoseconds() - start_ns; HEAP_EVENT(heap, live_data_size, live_size); gc_heap_sizer_on_gc(heap->sizer, heap->size, live_size, pause_ns, - resize_heap, heap); + resize_heap); if (!copy_space_page_out_blocks_until_memory_released(copy_space) && heap->sizer.policy == GC_HEAP_SIZE_FIXED) { fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", @@ -585,6 +584,22 @@ int gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } +static uint64_t allocation_counter_from_thread(struct gc_heap *heap) { + uint64_t ret = heap->total_allocated_bytes_at_last_gc; + if (pthread_mutex_trylock(&heap->lock)) return ret; + copy_space_add_to_allocation_counter(heap_copy_space(heap), &ret); + large_object_space_add_to_allocation_counter(heap_large_object_space(heap), + &ret); + pthread_mutex_unlock(&heap->lock); + return ret; +} + +static void set_heap_size_from_thread(struct gc_heap *heap, size_t size) { + if (pthread_mutex_trylock(&heap->lock)) return; + resize_heap(heap, size); + pthread_mutex_unlock(&heap->lock); +} + static int heap_init(struct gc_heap *heap, const struct gc_options *options) { // *heap is already initialized to 0. @@ -607,28 +622,14 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { GC_CRASH(); heap->background_thread = gc_make_background_thread(); + heap->sizer = gc_make_heap_sizer(heap, &options->common, + allocation_counter_from_thread, + set_heap_size_from_thread, + heap->background_thread); return 1; } -static uint64_t allocation_counter_from_thread(void *data) { - struct gc_heap *heap = data; - uint64_t ret = heap->total_allocated_bytes_at_last_gc; - if (pthread_mutex_trylock(&heap->lock)) return ret; - copy_space_add_to_allocation_counter(heap_copy_space(heap), &ret); - large_object_space_add_to_allocation_counter(heap_large_object_space(heap), - &ret); - pthread_mutex_unlock(&heap->lock); - return ret; -} - -static void set_heap_size_from_thread(size_t size, void *data) { - struct gc_heap *heap = data; - if (pthread_mutex_trylock(&heap->lock)) return; - resize_heap(size, heap); - pthread_mutex_unlock(&heap->lock); -} - int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut, struct gc_event_listener event_listener, @@ -663,13 +664,6 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!large_object_space_init(heap_large_object_space(*heap), *heap)) GC_CRASH(); - (*heap)->background_thread = gc_make_background_thread(); - (*heap)->sizer = gc_make_heap_sizer(*heap, &options->common, - allocation_counter_from_thread, - set_heap_size_from_thread, - (*heap), - (*heap)->background_thread); - *mut = calloc(1, sizeof(struct gc_mutator)); if (!*mut) GC_CRASH(); add_mutator(*heap, *mut); diff --git a/src/semi.c b/src/semi.c index 3b4d90b20..29ac29c3e 100644 --- a/src/semi.c +++ b/src/semi.c @@ -10,6 +10,8 @@ #define GC_IMPL 1 #include "gc-internal.h" +#include "gc-platform.h" +#include "heap-sizer.h" #include "semi-attrs.h" #include "large-object-space.h" @@ -32,6 +34,7 @@ struct semi_space { struct region to_space; size_t page_size; size_t stolen_pages; + size_t live_bytes_at_last_gc; }; struct 
gc_heap { struct semi_space semi_space; @@ -42,10 +45,12 @@ struct gc_heap { double pending_ephemerons_size_factor; double pending_ephemerons_size_slop; size_t size; + size_t total_allocated_bytes_at_last_gc; long count; int check_pending_ephemerons; const struct gc_options *options; struct gc_heap_roots *roots; + struct gc_heap_sizer sizer; struct gc_event_listener event_listener; void *event_listener_data; }; @@ -134,10 +139,18 @@ static int semi_space_steal_pages(struct semi_space *space, size_t npages) { static void semi_space_finish_gc(struct semi_space *space, size_t large_object_pages) { + space->live_bytes_at_last_gc = space->hp - space->to_space.base; space->stolen_pages = large_object_pages; space->limit = 0; // set in adjust_heap_size_and_limits } +static void +semi_space_add_to_allocation_counter(struct semi_space *space, + uint64_t *counter) { + size_t base = space->to_space.base + space->live_bytes_at_last_gc; + *counter += space->hp - base; +} + static void flip(struct semi_space *space) { struct region tmp; GC_ASSERT(space->hp <= space->limit); @@ -258,10 +271,9 @@ static int grow_region_if_needed(struct region *region, size_t new_size) { if (new_size <= region->mapped_size) return 1; - new_size = max_size(new_size, region->mapped_size * 2); - void *mem = mmap(NULL, new_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + DEBUG("new size %zx\n", new_size); if (mem == MAP_FAILED) { perror("mmap failed"); return 0; @@ -286,38 +298,9 @@ static void truncate_region(struct region *region, size_t new_size) { } } -static size_t compute_new_heap_size(struct gc_heap *heap, size_t for_alloc) { +static void resize_heap(struct gc_heap *heap, size_t new_heap_size) { struct semi_space *semi = heap_semi_space(heap); - struct large_object_space *large = heap_large_object_space(heap); - size_t live_bytes = semi->hp - semi->to_space.base; - live_bytes += large->live_pages_at_last_collection * semi->page_size; - live_bytes += for_alloc; - - HEAP_EVENT(heap, live_data_size, live_bytes); - - size_t new_heap_size = heap->size; - switch (heap->options->common.heap_size_policy) { - case GC_HEAP_SIZE_FIXED: - break; - - case GC_HEAP_SIZE_GROWABLE: { - new_heap_size = - max_size(heap->size, - live_bytes * heap->options->common.heap_size_multiplier); - break; - } - - case GC_HEAP_SIZE_ADAPTIVE: - default: - GC_CRASH(); - } - return align_up(new_heap_size, semi->page_size * 2); -} - -static void adjust_heap_size_and_limits(struct gc_heap *heap, - size_t for_alloc) { - struct semi_space *semi = heap_semi_space(heap); - size_t new_heap_size = compute_new_heap_size(heap, for_alloc); + new_heap_size = align_up(new_heap_size, semi->page_size * 2); size_t new_region_size = new_heap_size / 2; // Note that there is an asymmetry in how heap size is adjusted: we @@ -386,6 +369,7 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { struct gc_heap *heap = mutator_heap(mut); int is_minor = 0; int is_compacting = 1; + uint64_t start_ns = gc_platform_monotonic_nanoseconds(); HEAP_EVENT(heap, requesting_stop); HEAP_EVENT(heap, waiting_for_stop); @@ -395,6 +379,9 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { struct semi_space *semi = heap_semi_space(heap); struct large_object_space *large = heap_large_object_space(heap); // fprintf(stderr, "start collect #%ld:\n", space->count); + uint64_t *counter_loc = &heap->total_allocated_bytes_at_last_gc; + semi_space_add_to_allocation_counter(semi, counter_loc); + large_object_space_add_to_allocation_counter(large, counter_loc); 
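The new `semi_space_add_to_allocation_counter` relies on the allocator being a bump pointer: everything the last collection copied sits at the bottom of to-space, so the bytes allocated since then are just the distance from the end of that survivor data to the current `hp`. A worked example with made-up addresses:

```
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Illustrative values only. */
  uintptr_t to_space_base      = 0x100000;
  size_t live_bytes_at_last_gc = 0x20000;   /* 128 kB survived last GC */
  uintptr_t hp                 = 0x150000;  /* current bump pointer */

  uintptr_t base = to_space_base + live_bytes_at_last_gc;
  uint64_t allocated_since_last_gc = hp - base;   /* 0x30000 = 196608 bytes */
  printf("allocated since last gc: %llu bytes\n",
         (unsigned long long)allocated_since_last_gc);
  return 0;
}
```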
large_object_space_start_gc(large, 0); gc_extern_space_start_gc(heap->extern_space, 0); flip(semi); @@ -420,7 +407,15 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { gc_extern_space_finish_gc(heap->extern_space, 0); semi_space_finish_gc(semi, large->live_pages_at_last_collection); gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); - adjust_heap_size_and_limits(heap, for_alloc); + size_t live_size = semi->live_bytes_at_last_gc; + live_size += large_object_space_size_at_last_collection(large); + live_size += for_alloc; + uint64_t pause_ns = gc_platform_monotonic_nanoseconds() - start_ns; + HEAP_EVENT(heap, live_data_size, live_size); + DEBUG("gc %zu: live size %zu, heap size %zu\n", heap->count, live_size, + heap->size); + gc_heap_sizer_on_gc(heap->sizer, heap->size, live_size, pause_ns, + resize_heap); HEAP_EVENT(heap, restarting_mutators); // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); @@ -595,7 +590,7 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { if (!heap->finalizer_state) GC_CRASH(); -return heap_prepare_pending_ephemerons(heap); + return heap_prepare_pending_ephemerons(heap); } int gc_option_from_string(const char *str) { @@ -622,6 +617,14 @@ int gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } +static uint64_t get_allocation_counter(struct gc_heap *heap) { + return heap->total_allocated_bytes_at_last_gc; +} + +static void ignore_async_heap_size_adjustment(struct gc_heap *heap, + size_t size) { +} + int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut, struct gc_event_listener event_listener, @@ -633,10 +636,6 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!options) options = gc_allocate_options(); - if (options->common.heap_size_policy == GC_HEAP_SIZE_ADAPTIVE) { - fprintf(stderr, "adaptive heap size is currently unimplemented\n"); - return 0; - } if (options->common.parallelism != 1) fprintf(stderr, "warning: parallelism unimplemented in semispace copying collector\n"); @@ -656,6 +655,11 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!large_object_space_init(heap_large_object_space(*heap), *heap)) return 0; + (*heap)->sizer = gc_make_heap_sizer(*heap, &options->common, + get_allocation_counter, + ignore_async_heap_size_adjustment, + NULL); + // Ignore stack base, as we are precise. (*mut)->roots = NULL; From 506b4187fcff9a3eda827a11f9706c2393627e5a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Sep 2024 14:33:30 +0200 Subject: [PATCH 297/403] Update manual --- README.md | 8 ++++-- doc/collector-bdw.md | 6 ++++ doc/manual.md | 67 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 71 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 52e98e77b..b3689fcd9 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ See the [documentation](./doc/README.md). - Inline allocation / write barrier fast paths (supporting JIT) - One unified API with no-overhead abstraction: switch collectors when you like + - Three policies for sizing heaps: fixed, proportional to live size, and + [MemBalancer](http://marisa.moe/balancer.html) ## Source repository structure @@ -42,9 +44,9 @@ See the [documentation](./doc/README.md). ## Status and roadmap -As of September 2024, Whippet is almost feature-complete. 
The main -missing feature is dynamic heap growth and shrinkage -(https://github.com/wingo/whippet/issues/5), which should land soon. +As of September 2024, Whippet is almost feature-complete. We need to +land a per-object pinning API, and an API for cooperative safepoints for +use by threads that are looping without allocating. After that, the next phase on the roadmap is support for tracing, and some performance noodling. diff --git a/doc/collector-bdw.md b/doc/collector-bdw.md index b86a9d3b1..5a38b4e2e 100644 --- a/doc/collector-bdw.md +++ b/doc/collector-bdw.md @@ -15,6 +15,12 @@ finalizers), and both ephemerons and finalizers only approximate the Whippet behavior, because they are implemented in terms of what BDW-GC provides. +`bdw` supports the `fixed` and `growable` heap-sizing policies, but not +`adaptive`, as BDW-GC can't reliably return memory to the OS. Also, +[`growable` has an effective limit of a 3x heap +multiplier](https://github.com/wingo/whippet/blob/main/src/bdw.c#L478). +Oh well! + It's a bit of an oddball from a Whippet perspective, but useful as a migration path if you have an embedder that is already using BDW-GC. And, it is a useful performance comparison. diff --git a/doc/manual.md b/doc/manual.md index e88ac8198..45e8d019d 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -77,13 +77,15 @@ visitor function on all outgoing edges in an object. It also includes a of an object. `trace_edge` and `size` may be `NULL`, in which case no tracing or size computation should be performed. -### Tracing ephemerons +### Tracing ephemerons and finalizers Most kinds of GC-managed object are defined by the program, but the GC -itself has support for a specific object kind: ephemerons. If the -program allocates ephemerons, it should trace them in the -`gc_trace_object` function by calling `gc_trace_ephemeron` from -[`gc-ephemerons.h`](../api/gc-ephemerons.h). +itself has support for two specific object kind: ephemerons and +finalizers. If the program allocates ephemerons, it should trace them +in the `gc_trace_object` function by calling `gc_trace_ephemeron` from +[`gc-ephemerons.h`](../api/gc-ephemerons.h). Likewise if the program +allocates finalizers, it should trace them by calling +`gc_trace_finalizer` from [`gc-finalizer.h`](../api/gc-finalizer.h). ### Remembered-set bits @@ -299,6 +301,12 @@ We do this by including the `gc-embedder-api.h` implementation, via $(COMPILE) -include foo-embedder.h -o gc-ephemeron.o -c gc-ephemeron.c ``` +As for ephemerons, finalizers also have their own compilation unit. + +``` +$(COMPILE) -include foo-embedder.h -o gc-finalizer.o -c gc-finalizer.c +``` + #### Compile-time options There are a number of pre-processor definitions that can parameterize @@ -469,7 +477,7 @@ defined for all collectors: You can set these options via `gc_option_set_int` and so on; see [`gc-options.h`](../api/gc-options.h). Or, you can parse options from -trings: `heap-size-policy`, `heap-size`, `maximum-heap-size`, and so +strings: `heap-size-policy`, `heap-size`, `maximum-heap-size`, and so on. Use `gc_option_from_string` to determine if a string is really an option. Use `gc_option_parse_and_set` to parse a value for an option. Use `gc_options_parse_and_set_many` to parse a number of comma-delimited @@ -669,4 +677,49 @@ An ephemeron association can be removed via `gc_ephemeron_mark_dead`. ### Finalizers -Not yet implemented! +A finalizer allows the embedder to be notified when an object becomes +unreachable. + +A finalizer has a priority. 
When the heap is created, the embedder +should declare how many priorities there are. Lower-numbered priorities +take precedence; if an object has a priority-0 finalizer outstanding, +that will prevent any finalizer at level 1 (or 2, ...) from firing +until no priority-0 finalizer remains. + +Call `gc_attach_finalizer`, from `gc-finalizer.h`, to attach a finalizer +to an object. + +A finalizer also references an associated GC-managed closure object. +A finalizer's reference to the closure object is strong: if a +finalizer's closure closure references its finalizable object, +directly or indirectly, the finalizer will never fire. + +When an object with a finalizer becomes unreachable, it is added to a +queue. The embedder can call `gc_pop_finalizable` to get the next +finalizable object and its associated closure. At that point the +embedder can do anything with the object, including keeping it alive. +Ephemeron associations will still be present while the finalizable +object is live. Note however that any objects referenced by the +finalizable object may themselves be already finalized; finalizers are +enqueued for objects when they become unreachable, which can concern +whole subgraphs of objects at once. + +The usual way for an embedder to know when the queue of finalizable +object is non-empty is to call `gc_set_finalizer_callback` to +provide a function that will be invoked when there are pending +finalizers. + +Arranging to call `gc_pop_finalizable` and doing something with the +finalizable object and closure is the responsibility of the embedder. +The embedder's finalization action can end up invoking arbitrary code, +so unless the embedder imposes some kind of restriction on what +finalizers can do, generally speaking finalizers should be run in a +dedicated thread instead of recursively from within whatever mutator +thread caused GC. Setting up such a thread is the responsibility of the +mutator. `gc_pop_finalizable` is thread-safe, allowing multiple +finalization threads if that is appropriate. + +`gc_allocate_finalizer` returns a finalizer, which is a fresh GC-managed +heap object. The mutator should then directly attach it to an object +using `gc_finalizer_attach`. When the finalizer is fired, it becomes +available to the mutator via `gc_pop_finalizable`. 
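A sketch of how an embedder might wire this up. The entry points are the ones named above, but their exact signatures are not reproduced in this manual text, so the call shapes used here — along with the `embedder_*` helpers, the callback signature, and the `struct gc_finalizer` accessors — are assumptions to be checked against `gc-finalizer.h`.

```
#include <stddef.h>
#include "gc-api.h"
#include "gc-finalizer.h"

/* Embedder-provided helpers, invented for this sketch. */
extern void embedder_wake_finalization_thread(void);
extern void embedder_run_finalizer(struct gc_ref object, struct gc_ref closure);

/* Attach a priority-0 finalizer to obj.  If the closure reached obj,
   directly or indirectly, the finalizer would never fire. */
void attach_finalizer_example(struct gc_mutator *mut,
                              struct gc_ref obj, struct gc_ref closure) {
  struct gc_finalizer *f = gc_allocate_finalizer(mut);          /* signature assumed */
  gc_finalizer_attach(mut, f, /* priority */ 0, obj, closure);  /* signature assumed */
}

/* Registered at startup via gc_set_finalizer_callback; invoked by the
   collector when the finalizable queue becomes non-empty. */
void on_finalizers_pending(struct gc_heap *heap, size_t count) {
  embedder_wake_finalization_thread();
}

/* Body of a dedicated finalization thread: drain the queue. */
void drain_finalizable(struct gc_mutator *mut) {
  struct gc_finalizer *f;
  while ((f = gc_pop_finalizable(mut)))               /* signature assumed */
    embedder_run_finalizer(gc_finalizer_object(f),    /* accessors assumed */
                           gc_finalizer_closure(f));
}
```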
From 2b59efd9fc347c88ec1cdf7612e6585be80d497b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Sep 2024 15:06:50 +0200 Subject: [PATCH 298/403] Fix semi-space with fixed-sized heaps --- src/semi.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/semi.c b/src/semi.c index 29ac29c3e..3be0b034b 100644 --- a/src/semi.c +++ b/src/semi.c @@ -322,6 +322,11 @@ static void resize_heap(struct gc_heap *heap, size_t new_heap_size) { heap->size = new_region_size * 2; if (heap->size != old_heap_size) HEAP_EVENT(heap, heap_resized, heap->size); +} + +static void reset_heap_limits(struct gc_heap *heap) { + struct semi_space *semi = heap_semi_space(heap); + size_t new_region_size = align_up(heap->size, semi->page_size * 2) / 2; size_t stolen = align_up(semi->stolen_pages, 2) * semi->page_size; GC_ASSERT(new_region_size > stolen/2); size_t new_active_region_size = new_region_size - stolen/2; @@ -416,6 +421,7 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { heap->size); gc_heap_sizer_on_gc(heap->sizer, heap->size, live_size, pause_ns, resize_heap); + reset_heap_limits(heap); HEAP_EVENT(heap, restarting_mutators); // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); @@ -578,6 +584,14 @@ unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { return heap->count; } +static uint64_t get_allocation_counter(struct gc_heap *heap) { + return heap->total_allocated_bytes_at_last_gc; +} + +static void ignore_async_heap_size_adjustment(struct gc_heap *heap, + size_t size) { +} + static int heap_init(struct gc_heap *heap, const struct gc_options *options) { heap->extern_space = NULL; heap->pending_ephemerons_size_factor = 0.01; @@ -590,6 +604,11 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { if (!heap->finalizer_state) GC_CRASH(); + heap->sizer = gc_make_heap_sizer(heap, &options->common, + get_allocation_counter, + ignore_async_heap_size_adjustment, + NULL); + return heap_prepare_pending_ephemerons(heap); } @@ -617,14 +636,6 @@ int gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } -static uint64_t get_allocation_counter(struct gc_heap *heap) { - return heap->total_allocated_bytes_at_last_gc; -} - -static void ignore_async_heap_size_adjustment(struct gc_heap *heap, - size_t size) { -} - int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mut, struct gc_event_listener event_listener, @@ -655,11 +666,6 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!large_object_space_init(heap_large_object_space(*heap), *heap)) return 0; - (*heap)->sizer = gc_make_heap_sizer(*heap, &options->common, - get_allocation_counter, - ignore_async_heap_size_adjustment, - NULL); - // Ignore stack base, as we are precise. 
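Worked numbers for the region arithmetic in `reset_heap_limits` above, showing how pages lent to the large-object space ("stolen pages") shrink each semi-space's active region; the 4 kB page size and the counts are invented.

```
#include <stddef.h>
#include <stdio.h>

static size_t align_up(size_t x, size_t align) {  /* align must be a power of two */
  return (x + align - 1) & ~(align - 1);
}

int main(void) {
  /* Illustrative values: 8 MB heap, 4 kB pages, 7 pages currently
     stolen by the large-object space (rounded up to 8). */
  size_t heap_size = 8 * 1024 * 1024, page_size = 4096, stolen_pages = 7;

  size_t region = align_up(heap_size, page_size * 2) / 2;  /* 4 MB per semi-space */
  size_t stolen = align_up(stolen_pages, 2) * page_size;   /* 8 pages' worth */
  size_t active = region - stolen / 2;                     /* each region gives up half */
  printf("region %zu, active %zu (lost %zu bytes to lospace)\n",
         region, active, stolen / 2);
  return 0;
}
```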
(*mut)->roots = NULL; From a722b9c13fc23207bb7bc8ef475d7a8cb9227890 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 16 Sep 2024 15:46:49 +0200 Subject: [PATCH 299/403] Force major GC before signalling OOM --- src/mmc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/mmc.c b/src/mmc.c index 533067b40..0c351f2e8 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -516,6 +516,9 @@ detect_out_of_memory(struct gc_heap *heap, uintptr_t allocation_since_last_gc) { if (allocation_since_last_gc > nofl_space_fragmentation(heap_nofl_space(heap))) return; + if (heap->gc_kind == GC_COLLECTION_MINOR) + return; + // No allocation since last gc: out of memory. fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); GC_CRASH(); @@ -753,15 +756,15 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { HEAP_EVENT(heap, waiting_for_stop); wait_for_mutators_to_stop(heap); HEAP_EVENT(heap, mutators_stopped); - enum gc_collection_kind gc_kind = - determine_collection_kind(heap, requested_kind); - int is_minor = gc_kind == GC_COLLECTION_MINOR; - HEAP_EVENT(heap, prepare_gc, gc_kind); uint64_t allocation_counter = 0; nofl_space_add_to_allocation_counter(nofl_space, &allocation_counter); large_object_space_add_to_allocation_counter(lospace, &allocation_counter); heap->total_allocated_bytes_at_last_gc += allocation_counter; detect_out_of_memory(heap, allocation_counter); + enum gc_collection_kind gc_kind = + determine_collection_kind(heap, requested_kind); + int is_minor = gc_kind == GC_COLLECTION_MINOR; + HEAP_EVENT(heap, prepare_gc, gc_kind); nofl_space_prepare_gc(nofl_space, gc_kind); large_object_space_start_gc(lospace, is_minor); gc_extern_space_start_gc(exspace, is_minor); From 9f26dbb1fc296f38c2236437dbcf82d76cb042a5 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 18 Sep 2024 10:32:38 +0200 Subject: [PATCH 300/403] Implement per-object pinning API Fixes https://github.com/wingo/whippet/issues/6. --- README.md | 4 ++-- api/bdw-attrs.h | 4 ++++ api/gc-api.h | 2 ++ api/gc-attrs.h | 2 ++ api/mmc-attrs.h | 4 ++++ api/pcc-attrs.h | 4 ++++ api/semi-attrs.h | 4 ++++ doc/manual.md | 19 +++++++++++++++++++ src/bdw.c | 4 ++++ src/mmc.c | 8 ++++++++ src/nofl-space.h | 23 ++++++++++++++++++++--- src/pcc.c | 4 ++++ src/semi.c | 4 ++++ 13 files changed, 81 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b3689fcd9..fc60c5ccf 100644 --- a/README.md +++ b/README.md @@ -45,8 +45,8 @@ See the [documentation](./doc/README.md). ## Status and roadmap As of September 2024, Whippet is almost feature-complete. We need to -land a per-object pinning API, and an API for cooperative safepoints for -use by threads that are looping without allocating. +land an API for cooperative safepoints for use by threads that are +looping without allocating. After that, the next phase on the roadmap is support for tracing, and some performance noodling. 
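Circling back to the out-of-memory change a few hunks up: the effect of the added early return is that OOM is only ever declared after a full collection has failed to make progress. A condensed paraphrase of that policy, with invented names rather than the literal `detect_out_of_memory`:

```
#include <stdbool.h>
#include <stddef.h>

enum toy_gc_kind { TOY_GC_MINOR, TOY_GC_MAJOR, TOY_GC_COMPACTING };

/* Give up only if the previous collection was a full one and the
   mutator still could not allocate more than the space lost to
   fragmentation since then. */
bool toy_should_declare_oom(size_t allocation_since_last_gc,
                            size_t fragmentation_bytes,
                            enum toy_gc_kind previous_gc_kind) {
  if (allocation_since_last_gc > fragmentation_bytes)
    return false;                     /* useful progress was made */
  if (previous_gc_kind == TOY_GC_MINOR)
    return false;                     /* escalate to a major GC first */
  return true;
}
```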
diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index e190c2cee..85c3aa250 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -54,4 +54,8 @@ static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_SIGNAL; } +static inline int gc_can_pin_objects(void) { + return 1; +} + #endif // BDW_ATTRS_H diff --git a/api/gc-api.h b/api/gc-api.h index e60be7579..071655bff 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -206,4 +206,6 @@ static inline void gc_write_barrier(struct gc_ref obj, size_t obj_size, } } +GC_API_ void gc_pin_object(struct gc_mutator *mut, struct gc_ref obj); + #endif // GC_API_H_ diff --git a/api/gc-attrs.h b/api/gc-attrs.h index c08330eaa..8e914e361 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -43,4 +43,6 @@ enum gc_safepoint_mechanism { }; static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) GC_ALWAYS_INLINE; +static inline int gc_can_pin_objects(void) GC_ALWAYS_INLINE; + #endif // GC_ATTRS_H diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index 111b2512c..d56b3d88c 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -61,4 +61,8 @@ static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; } +static inline int gc_can_pin_objects(void) { + return 1; +} + #endif // MMC_ATTRS_H diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index 5f80488a0..b8b42baf0 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -57,4 +57,8 @@ static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; } +static inline int gc_can_pin_objects(void) { + return 0; +} + #endif // PCC_ATTRS_H diff --git a/api/semi-attrs.h b/api/semi-attrs.h index be906768f..3e0511074 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -56,4 +56,8 @@ static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; } +static inline int gc_can_pin_objects(void) { + return 0; +} + #endif // SEMI_ATTRS_H diff --git a/doc/manual.md b/doc/manual.md index 45e8d019d..d2878ebd6 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -538,6 +538,25 @@ Also, the BDW collector actually uses pre-emptive safepoints: it stops threads via POSIX signals. `gc_safepoint` is (or will be) a no-op with BDW. +### Pinning + +Sometimes a mutator or embedder would like to tell the collector to not +move a particular object. This can happen for example during a foreign +function call, or if the embedder allows programs to access the address +of an object, for example to compute an identity hash code. To support +this use case, some Whippet collectors allow the embedder to *pin* +objects. Call `gc_pin_object` to prevent the collector from relocating +an object. + +Pinning is currently supported by the `bdw` collector, which never moves +objects, and also by the various `mmc` collectors, which can move +objects that have no inbound conservative references. + +Pinning is not supported on `semi` or `pcc`. + +Call `gc_can_pin_objects` to determine whether the current collector can +pin objects. + ### Statistics Sometimes a program would like some information from the GC: how many diff --git a/src/bdw.c b/src/bdw.c index 3149bf3c7..e5394427a 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -127,6 +127,10 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, return GC_malloc_atomic(size); } +void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { + // Nothing to do. 
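A small usage sketch for the pinning API above; `gc_pin_object`, `gc_can_pin_objects`, and `gc_ref_value` are as they appear in the patch, while `some_foreign_function` is an invented placeholder.

```
#include "gc-api.h"

extern void some_foreign_function(void *data);   /* invented placeholder */

/* Hand an object's address to code the collector cannot see.  Pin it
   first so a moving collector will not relocate it underneath us. */
void pass_to_foreign_code(struct gc_mutator *mut, struct gc_ref obj) {
  if (!gc_can_pin_objects()) {
    /* pcc and semi cannot pin; fall back to copying the payload out,
       using a handle, etc. */
    return;
  }
  gc_pin_object(mut, obj);
  some_foreign_function((void *) gc_ref_value(obj));
}
```

The patch adds no corresponding unpin entry point, so pinning should be treated as one-way here.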
+} + void gc_collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { switch (requested_kind) { diff --git a/src/mmc.c b/src/mmc.c index 0c351f2e8..97091e9b6 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -880,6 +880,14 @@ gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } +void +gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { + struct nofl_space *nofl = heap_nofl_space(mutator_heap(mut)); + if (nofl_space_contains(nofl, ref)) + nofl_space_pin_object(nofl, ref); + // Otherwise if it's a large or external object, it won't move. +} + void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { diff --git a/src/nofl-space.h b/src/nofl-space.h index ea7fe1c31..a962cac81 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1328,9 +1328,12 @@ nofl_space_sweep_until_memory_released(struct nofl_space *space, } static inline int -nofl_space_should_evacuate(struct nofl_space *space, struct gc_ref obj) { +nofl_space_should_evacuate(struct nofl_space *space, uint8_t metadata_byte, + struct gc_ref obj) { if (!space->evacuating) return 0; + if (metadata_byte & NOFL_METADATA_BYTE_PINNED) + return 0; return nofl_block_has_flag(nofl_block_for_addr(gc_ref_value(obj)), NOFL_BLOCK_EVACUATE); } @@ -1353,6 +1356,20 @@ nofl_space_set_nonempty_mark(struct nofl_space *space, uint8_t *metadata, return 1; } +static inline void +nofl_space_pin_object(struct nofl_space *space, struct gc_ref ref) { + uint8_t *metadata = nofl_metadata_byte_for_object(ref); + uint8_t byte = atomic_load_explicit(metadata, memory_order_relaxed); + if (byte & NOFL_METADATA_BYTE_PINNED) + return; + uint8_t new_byte; + do { + new_byte = byte | NOFL_METADATA_BYTE_PINNED; + } while (!atomic_compare_exchange_weak_explicit(metadata, &byte, new_byte, + memory_order_acq_rel, + memory_order_acquire)); +} + static inline int nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, struct gc_edge edge, @@ -1429,7 +1446,7 @@ nofl_space_evacuate_or_mark_object(struct nofl_space *space, if (byte & space->marked_mask) return 0; - if (nofl_space_should_evacuate(space, old_ref)) + if (nofl_space_should_evacuate(space, byte, old_ref)) return nofl_space_evacuate(space, metadata, byte, edge, old_ref, evacuate); @@ -1490,7 +1507,7 @@ nofl_space_forward_or_mark_if_traced(struct nofl_space *space, if (byte & space->marked_mask) return 1; - if (!nofl_space_should_evacuate(space, ref)) + if (!nofl_space_should_evacuate(space, byte, ref)) return 0; return nofl_space_forward_if_evacuated(space, edge, ref); diff --git a/src/pcc.c b/src/pcc.c index 34a7f47b6..a99e2a967 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -504,6 +504,10 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } +void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { + GC_CRASH(); +} + void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { } diff --git a/src/semi.c b/src/semi.c index 3be0b034b..84493e81c 100644 --- a/src/semi.c +++ b/src/semi.c @@ -505,6 +505,10 @@ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { return gc_allocate(mut, size); } +void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { + GC_CRASH(); +} + struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { return gc_allocate(mut, gc_ephemeron_size()); } From 8fba0e53223fd432b7f3968e53d58860fa8cbf7d Mon Sep 17 00:00:00 2001 From: Andy Wingo 
Date: Wed, 18 Sep 2024 11:31:06 +0200 Subject: [PATCH 301/403] Implement cooperative safepoint API Fixes https://github.com/wingo/whippet/issues/9. --- api/bdw-attrs.h | 4 ++++ api/gc-api.h | 20 +++++++++++++++++++ api/gc-attrs.h | 7 +++++++ api/mmc-attrs.h | 4 ++++ api/pcc-attrs.h | 4 ++++ api/semi-attrs.h | 4 ++++ doc/manual.md | 17 +++++++++++----- src/bdw.c | 3 +++ src/mmc.c | 50 ++++++++++++++++++------------------------------ src/pcc.c | 42 +++++++++++++++------------------------- src/semi.c | 3 +++ 11 files changed, 95 insertions(+), 63 deletions(-) diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index 85c3aa250..af1042af3 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -54,6 +54,10 @@ static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_SIGNAL; } +static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) { + return GC_COOPERATIVE_SAFEPOINT_NONE; +} + static inline int gc_can_pin_objects(void) { return 1; } diff --git a/api/gc-api.h b/api/gc-api.h index 071655bff..63921f628 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -208,4 +208,24 @@ static inline void gc_write_barrier(struct gc_ref obj, size_t obj_size, GC_API_ void gc_pin_object(struct gc_mutator *mut, struct gc_ref obj); +GC_API_ void gc_safepoint_slow(struct gc_mutator *mut) GC_NEVER_INLINE; +GC_API_ int* gc_safepoint_flag_loc(struct gc_mutator *mut); +static inline int gc_should_stop_for_safepoint(struct gc_mutator *mut) { + switch (gc_cooperative_safepoint_kind()) { + case GC_COOPERATIVE_SAFEPOINT_NONE: + return 0; + case GC_COOPERATIVE_SAFEPOINT_MUTATOR_FLAG: + case GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG: { + return atomic_load_explicit(gc_safepoint_flag_loc(mut), + memory_order_relaxed); + } + default: + GC_CRASH(); + } +} +static inline void gc_safepoint(struct gc_mutator *mut) { + if (GC_UNLIKELY(gc_should_stop_for_safepoint(mut))) + gc_safepoint_slow(mut); +} + #endif // GC_API_H_ diff --git a/api/gc-attrs.h b/api/gc-attrs.h index 8e914e361..b6acb4302 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -43,6 +43,13 @@ enum gc_safepoint_mechanism { }; static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) GC_ALWAYS_INLINE; +enum gc_cooperative_safepoint_kind { + GC_COOPERATIVE_SAFEPOINT_NONE, + GC_COOPERATIVE_SAFEPOINT_MUTATOR_FLAG, + GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG, +}; +static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) GC_ALWAYS_INLINE; + static inline int gc_can_pin_objects(void) GC_ALWAYS_INLINE; #endif // GC_ATTRS_H diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index d56b3d88c..e5757f6d1 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -61,6 +61,10 @@ static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; } +static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) { + return GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG; +} + static inline int gc_can_pin_objects(void) { return 1; } diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index b8b42baf0..2f02640ea 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -57,6 +57,10 @@ static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; } +static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) { + return GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG; +} + static inline int gc_can_pin_objects(void) { return 0; } diff --git a/api/semi-attrs.h b/api/semi-attrs.h index 
3e0511074..bcd8e89e0 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -56,6 +56,10 @@ static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; } +static inline enum gc_cooperative_safepoint_kind gc_cooperative_safepoint_kind(void) { + return GC_COOPERATIVE_SAFEPOINT_NONE; +} + static inline int gc_can_pin_objects(void) { return 0; } diff --git a/doc/manual.md b/doc/manual.md index d2878ebd6..7c784b626 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -531,12 +531,19 @@ temporarily mark itself as inactive by trampolining through for, for example, system calls that might block. Periodic safepoints is better for code that is active but not allocating. -Thing is, though, `gc_safepoint` is not yet implemented :) It will be, -though! - Also, the BDW collector actually uses pre-emptive safepoints: it stops -threads via POSIX signals. `gc_safepoint` is (or will be) a no-op with -BDW. +threads via POSIX signals. `gc_safepoint` is a no-op with BDW. + +Embedders can inline safepoint checks. If +`gc_cooperative_safepoint_kind()` is `GC_COOPERATIVE_SAFEPOINT_NONE`, +then the collector doesn't need safepoints, as is the case for `bdw` +which uses signals and `semi` which is single-threaded. If it is +`GC_COOPERATIVE_SAFEPOINT_HEAP_FLAG`, then calling +`gc_safepoint_flag_loc` on a mutator will return the address of an `int` +in memory, which if nonzero when loaded using relaxed atomics indicates +that the mutator should call `gc_safepoint_slow`. Similarly for +`GC_COOPERATIVE_SAFEPOINT_MUTATOR_FLAG`, except that the address is +per-mutator rather than global. ### Pinning diff --git a/src/bdw.c b/src/bdw.c index e5394427a..df579dde7 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -153,6 +153,9 @@ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { } +int* gc_safepoint_flag_loc(struct gc_mutator *mut) { GC_CRASH(); } +void gc_safepoint_slow(struct gc_mutator *mut) { GC_CRASH(); } + struct bdw_mark_state { struct GC_ms_entry *mark_stack_ptr; struct GC_ms_entry *mark_stack_limit; diff --git a/src/mmc.c b/src/mmc.c index 97091e9b6..eebc61903 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -417,6 +417,7 @@ static enum gc_collection_kind pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mutators_are_stopping(heap)); GC_ASSERT(!all_mutators_stopped(heap)); + MUTATOR_EVENT(mut, mutator_stopping); MUTATOR_EVENT(mut, mutator_stopped); heap->paused_mutator_count++; enum gc_collection_kind collection_kind = heap->gc_kind; @@ -432,35 +433,6 @@ pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { return collection_kind; } -static enum gc_collection_kind -pause_mutator_for_collection_with_lock(struct gc_mutator *mut) GC_NEVER_INLINE; -static enum gc_collection_kind -pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { - struct gc_heap *heap = mutator_heap(mut); - GC_ASSERT(mutators_are_stopping(heap)); - MUTATOR_EVENT(mut, mutator_stopping); - return pause_mutator_for_collection(heap, mut); -} - -static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) GC_NEVER_INLINE; -static void -pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { - struct gc_heap *heap = mutator_heap(mut); - GC_ASSERT(mutators_are_stopping(heap)); - MUTATOR_EVENT(mut, mutator_stopping); - nofl_finish_sweeping(&mut->allocator, heap_nofl_space(heap)); - gc_stack_capture_hot(&mut->stack); - heap_lock(heap); - 
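As a usage sketch for the cooperative safepoint API above: a thread that loops without allocating should call `gc_safepoint` on a regular basis; `embedder_more_work` and `embedder_process_one_item` are invented placeholders.

```
#include "gc-api.h"

extern int embedder_more_work(void);
extern void embedder_process_one_item(void);   /* neither allocates nor blocks */

/* A long-running, non-allocating loop: check in with the collector
   periodically so other threads' collections are not delayed. */
void crunch(struct gc_mutator *mut) {
  while (embedder_more_work()) {
    embedder_process_one_item();
    /* Inlined fast path: a relaxed load of the stop flag; only calls
       gc_safepoint_slow when a collection is pending. */
    gc_safepoint(mut);
  }
}
```

A JIT can instead emit the fast path itself: load the `int` at `gc_safepoint_flag_loc(mut)` with a relaxed atomic and branch to a call of `gc_safepoint_slow` only when it is nonzero, as the manual text above describes.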
pause_mutator_for_collection(heap, mut); - heap_unlock(heap); -} - -static inline void -maybe_pause_mutator_for_collection(struct gc_mutator *mut) { - while (mutators_are_stopping(mutator_heap(mut))) - pause_mutator_for_collection_without_lock(mut); -} - static void resize_heap(struct gc_heap *heap, size_t new_size) { if (new_size == heap->size) @@ -818,7 +790,7 @@ trigger_collection(struct gc_mutator *mut, nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); heap_lock(heap); while (mutators_are_stopping(heap)) - prev_kind = pause_mutator_for_collection_with_lock(mut); + prev_kind = pause_mutator_for_collection(heap, mut); if (prev_kind < (int)requested_kind) collect(mut, requested_kind); heap_unlock(heap); @@ -829,6 +801,22 @@ gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { trigger_collection(mut, kind); } +int* +gc_safepoint_flag_loc(struct gc_mutator *mut) { + return &mutator_heap(mut)->collecting; +} + +void +gc_safepoint_slow(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + gc_stack_capture_hot(&mut->stack); + nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); + heap_lock(heap); + while (mutators_are_stopping(mutator_heap(mut))) + pause_mutator_for_collection(heap, mut); + heap_unlock(heap); +} + static void* allocate_large(struct gc_mutator *mut, size_t size) { struct gc_heap *heap = mutator_heap(mut); @@ -894,7 +882,7 @@ gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, GC_ASSERT(obj_size > gc_allocator_large_threshold()); gc_object_set_remembered(obj); } - + struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { struct gc_ref ret = diff --git a/src/pcc.c b/src/pcc.c index a99e2a967..90194e96a 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -283,6 +283,7 @@ static void pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mutators_are_stopping(heap)); GC_ASSERT(!all_mutators_stopped(heap)); + MUTATOR_EVENT(mut, mutator_stopping); MUTATOR_EVENT(mut, mutator_stopped); heap->paused_mutator_count++; if (all_mutators_stopped(heap)) @@ -296,32 +297,6 @@ pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { MUTATOR_EVENT(mut, mutator_restarted); } -static void -pause_mutator_for_collection_with_lock(struct gc_mutator *mut) GC_NEVER_INLINE; -static void -pause_mutator_for_collection_with_lock(struct gc_mutator *mut) { - struct gc_heap *heap = mutator_heap(mut); - GC_ASSERT(mutators_are_stopping(heap)); - MUTATOR_EVENT(mut, mutator_stopping); - pause_mutator_for_collection(heap, mut); -} - -static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) GC_NEVER_INLINE; -static void pause_mutator_for_collection_without_lock(struct gc_mutator *mut) { - struct gc_heap *heap = mutator_heap(mut); - GC_ASSERT(mutators_are_stopping(heap)); - MUTATOR_EVENT(mut, mutator_stopping); - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); - heap_lock(heap); - pause_mutator_for_collection(heap, mut); - heap_unlock(heap); -} - -static inline void maybe_pause_mutator_for_collection(struct gc_mutator *mut) { - while (mutators_are_stopping(mutator_heap(mut))) - pause_mutator_for_collection_without_lock(mut); -} - static void resize_heap(struct gc_heap *heap, size_t new_size) { if (new_size == heap->size) return; @@ -447,7 +422,7 @@ static void trigger_collection(struct gc_mutator *mut) { heap_lock(heap); long epoch = heap->count; while (mutators_are_stopping(heap)) - pause_mutator_for_collection_with_lock(mut); + 
pause_mutator_for_collection(heap, mut); if (epoch == heap->count) collect(mut); heap_unlock(heap); @@ -512,6 +487,19 @@ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { } +int* gc_safepoint_flag_loc(struct gc_mutator *mut) { + return &mutator_heap(mut)->collecting; +} + +void gc_safepoint_slow(struct gc_mutator *mut) { + struct gc_heap *heap = mutator_heap(mut); + copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); + heap_lock(heap); + while (mutators_are_stopping(mutator_heap(mut))) + pause_mutator_for_collection(heap, mut); + heap_unlock(heap); +} + struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { return gc_allocate(mut, gc_ephemeron_size()); } diff --git a/src/semi.c b/src/semi.c index 84493e81c..7958b5898 100644 --- a/src/semi.c +++ b/src/semi.c @@ -457,6 +457,9 @@ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { } +int* gc_safepoint_flag_loc(struct gc_mutator *mut) { GC_CRASH(); } +void gc_safepoint_slow(struct gc_mutator *mut) { GC_CRASH(); } + static void collect_for_large_alloc(struct gc_mutator *mut, size_t npages) { collect_for_alloc(mut, npages * mutator_semi_space(mut)->page_size); } From a411599d872b70e70ee57a0b84daf82f619f6b8d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 18 Sep 2024 11:56:54 +0200 Subject: [PATCH 302/403] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fc60c5ccf..466a5ea59 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,11 @@ See the [documentation](./doc/README.md). ## Status and roadmap -As of September 2024, Whippet is almost feature-complete. We need to -land an API for cooperative safepoints for use by threads that are -looping without allocating. +As of September 2024, Whippet is feature-complete! Of course there will +surely be new features to build as Whippet gets integrated it into +language run-times, but the basics are there. -After that, the next phase on the roadmap is support for tracing, and +The next phase on the roadmap is support for tracing, and some performance noodling. Once that is done, the big task is integrating Whippet into the [Guile From 326e925f4cf1ce59233435802f680e00f77746ab Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 30 Sep 2024 10:59:47 +0200 Subject: [PATCH 303/403] Fix an ABA problem in the nofl space We use Treiber stacks to represent sets of blocks: blocks to sweep, full blocks, and so on. This is fine as long as we are only adding to or only removing from those sets, but as soon as we have concurrent add and remove, we need to avoid the ABA problem. Concurrent add and remove occurs for partly-full blocks, which are both acquired and released by mutators; empty blocks, which can be added to by heap growth at the same time as the mutator acquires them; and the paged-out queue, which is also concurrent with heap growth/shrinkage. 
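
For readers unfamiliar with the hazard: a Treiber stack is only safe when pushes and pops do not race with each other, because its pop compares just the head pointer. Below is a minimal sketch of the pre-patch pattern, using hypothetical names rather than the nofl space's own types, with the ABA window called out in a comment.

    /* Minimal sketch of a Treiber stack, as used before this patch.
       Names are illustrative; this is not the library's API. */
    #include <stdatomic.h>
    #include <stddef.h>

    struct node { struct node *next; };
    struct stack { struct node *_Atomic head; };

    static void
    stack_push(struct stack *s, struct node *n) {
      struct node *head = atomic_load_explicit(&s->head, memory_order_acquire);
      do {
        n->next = head;
      } while (!atomic_compare_exchange_weak(&s->head, &head, n));
    }

    static struct node*
    stack_pop(struct stack *s) {
      struct node *head = atomic_load_explicit(&s->head, memory_order_acquire);
      struct node *next;
      do {
        if (!head) return NULL;
        next = head->next;
        /* ABA window: if another thread pops `head` and later pushes it
           back with a different successor, the CAS below still succeeds
           and installs the stale `next`, silently dropping nodes. */
      } while (!atomic_compare_exchange_weak(&s->head, &head, next));
      head->next = NULL;
      return head;
    }

The patch keeps the lock-free list for the add-only and remove-only cases, and moves the sets that are pushed and popped concurrently (partly-full blocks, empties, and the paged-out queues) behind a mutex, as the diff below shows.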
--- src/mmc.c | 3 +- src/nofl-space.h | 286 +++++++++++++++++++++++++++++------------------ 2 files changed, 176 insertions(+), 113 deletions(-) diff --git a/src/mmc.c b/src/mmc.c index eebc61903..48d8eae59 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -828,8 +828,7 @@ allocate_large(struct gc_mutator *mut, size_t size) { nofl_space_request_release_memory(nofl_space, npages << lospace->page_size_log2); - while (!nofl_space_sweep_until_memory_released(nofl_space, - &mut->allocator)) + while (!nofl_space_shrink(nofl_space, 0)) trigger_collection(mut, GC_COLLECTION_COMPACTING); atomic_fetch_add(&heap->large_object_pages, npages); diff --git a/src/nofl-space.h b/src/nofl-space.h index a962cac81..ebf44a524 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1,6 +1,7 @@ #ifndef NOFL_SPACE_H #define NOFL_SPACE_H +#include #include #include #include @@ -131,12 +132,24 @@ struct nofl_slab { }; STATIC_ASSERT_EQ(sizeof(struct nofl_slab), NOFL_SLAB_SIZE); -// Lock-free block list. +// Lock-free block list, which either only has threads removing items +// from it or only has threads adding items to it -- i.e., adding and +// removing items don't happen concurrently. struct nofl_block_list { size_t count; uintptr_t blocks; }; +// A block list that has concurrent threads adding and removing items +// from it. +struct nofl_block_stack { + struct nofl_block_list list; +}; + +struct nofl_lock { + pthread_mutex_t *lock; +}; + #define NOFL_PAGE_OUT_QUEUE_SIZE 4 struct nofl_space { @@ -147,14 +160,15 @@ struct nofl_space { struct extents *extents; size_t heap_size; uint8_t last_collection_was_minor; - struct nofl_block_list empty; - struct nofl_block_list paged_out[NOFL_PAGE_OUT_QUEUE_SIZE]; + struct nofl_block_stack empty; + struct nofl_block_stack paged_out[NOFL_PAGE_OUT_QUEUE_SIZE]; struct nofl_block_list to_sweep; - struct nofl_block_list partly_full; + struct nofl_block_stack partly_full; struct nofl_block_list full; struct nofl_block_list promoted; struct nofl_block_list old; struct nofl_block_list evacuation_targets; + pthread_mutex_t lock; double evacuation_minimum_reserve; double evacuation_reserve; double promotion_threshold; @@ -222,6 +236,24 @@ nofl_rotate_dead_survivor_marked(uint8_t mask) { return ((mask << 1) | (mask >> 2)) & all; } +static struct nofl_lock +nofl_lock_acquire(pthread_mutex_t *lock) { + pthread_mutex_lock(lock); + return (struct nofl_lock){ lock }; +} + +static void +nofl_lock_release(struct nofl_lock *lock) { + GC_ASSERT(lock->lock); + pthread_mutex_unlock(lock->lock); + lock->lock = NULL; +} + +static struct nofl_lock +nofl_space_lock(struct nofl_space *space) { + return nofl_lock_acquire(&space->lock); +} + static struct nofl_slab* nofl_object_slab(void *obj) { uintptr_t addr = (uintptr_t) obj; @@ -381,7 +413,8 @@ nofl_block_compare_and_exchange(struct nofl_block_list *list, } static void -nofl_push_block(struct nofl_block_list *list, struct nofl_block_ref block) { +nofl_block_list_push(struct nofl_block_list *list, + struct nofl_block_ref block) { atomic_fetch_add_explicit(&list->count, 1, memory_order_acq_rel); GC_ASSERT(nofl_block_is_null(nofl_block_next(block))); struct nofl_block_ref next = nofl_block_head(list); @@ -391,7 +424,7 @@ nofl_push_block(struct nofl_block_list *list, struct nofl_block_ref block) { } static struct nofl_block_ref -nofl_pop_block(struct nofl_block_list *list) { +nofl_block_list_pop(struct nofl_block_list *list) { struct nofl_block_ref head = nofl_block_head(list); struct nofl_block_ref next; do { @@ -404,6 +437,31 @@ nofl_pop_block(struct 
nofl_block_list *list) { return head; } +static void +nofl_block_stack_push(struct nofl_block_stack *stack, + struct nofl_block_ref block, + const struct nofl_lock *lock) { + struct nofl_block_list *list = &stack->list; + list->count++; + GC_ASSERT(nofl_block_is_null(nofl_block_next(block))); + struct nofl_block_ref next = nofl_block_head(list); + nofl_block_set_next(block, next); + list->blocks = block.addr; +} + +static struct nofl_block_ref +nofl_block_stack_pop(struct nofl_block_stack *stack, + const struct nofl_lock *lock) { + struct nofl_block_list *list = &stack->list; + struct nofl_block_ref head = nofl_block_head(list); + if (!nofl_block_is_null(head)) { + list->count--; + list->blocks = nofl_block_next(head).addr; + nofl_block_set_next(head, nofl_block_null()); + } + return head; +} + static size_t nofl_block_count(struct nofl_block_list *list) { return atomic_load_explicit(&list->count, memory_order_acquire); @@ -411,18 +469,21 @@ nofl_block_count(struct nofl_block_list *list) { static void nofl_push_unavailable_block(struct nofl_space *space, - struct nofl_block_ref block) { + struct nofl_block_ref block, + const struct nofl_lock *lock) { nofl_block_set_flag(block, NOFL_BLOCK_UNAVAILABLE); - nofl_push_block(nofl_block_has_flag(block, NOFL_BLOCK_PAGED_OUT) - ? &space->paged_out[NOFL_PAGE_OUT_QUEUE_SIZE-1] - : &space->paged_out[0], - block); + nofl_block_stack_push(nofl_block_has_flag(block, NOFL_BLOCK_PAGED_OUT) + ? &space->paged_out[NOFL_PAGE_OUT_QUEUE_SIZE-1] + : &space->paged_out[0], + block, lock); } static struct nofl_block_ref -nofl_pop_unavailable_block(struct nofl_space *space) { +nofl_pop_unavailable_block(struct nofl_space *space, + const struct nofl_lock *lock) { for (int age = 0; age < NOFL_PAGE_OUT_QUEUE_SIZE; age++) { - struct nofl_block_ref block = nofl_pop_block(&space->paged_out[age]); + struct nofl_block_ref block = + nofl_block_stack_pop(&space->paged_out[age], lock); if (!nofl_block_is_null(block)) { nofl_block_clear_flag(block, NOFL_BLOCK_UNAVAILABLE); return block; @@ -433,13 +494,23 @@ nofl_pop_unavailable_block(struct nofl_space *space) { static void nofl_push_empty_block(struct nofl_space *space, - struct nofl_block_ref block) { - nofl_push_block(&space->empty, block); + struct nofl_block_ref block, + const struct nofl_lock *lock) { + nofl_block_stack_push(&space->empty, block, lock); +} + +static struct nofl_block_ref +nofl_pop_empty_block_with_lock(struct nofl_space *space, + const struct nofl_lock *lock) { + return nofl_block_stack_pop(&space->empty, lock); } static struct nofl_block_ref nofl_pop_empty_block(struct nofl_space *space) { - return nofl_pop_block(&space->empty); + struct nofl_lock lock = nofl_space_lock(space); + struct nofl_block_ref ret = nofl_pop_empty_block_with_lock(space, &lock); + nofl_lock_release(&lock); + return ret; } static size_t @@ -447,7 +518,7 @@ nofl_active_block_count(struct nofl_space *space) { size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB; size_t unavailable = 0; for (int age = 0; age < NOFL_PAGE_OUT_QUEUE_SIZE; age++) - unavailable += nofl_block_count(&space->paged_out[age]); + unavailable += nofl_block_count(&space->paged_out[age].list); GC_ASSERT(unavailable <= total); return total - unavailable; } @@ -461,7 +532,7 @@ nofl_maybe_push_evacuation_target(struct nofl_space *space, if (targets >= active * reserve) return 0; - nofl_push_block(&space->evacuation_targets, block); + nofl_block_list_push(&space->evacuation_targets, block); return 1; } @@ -520,9 +591,9 @@ nofl_allocator_release_full_block(struct 
nofl_allocator *alloc, block.summary->fragmentation_granules); if (nofl_should_promote_block(space, block)) - nofl_push_block(&space->promoted, block); + nofl_block_list_push(&space->promoted, block); else - nofl_push_block(&space->full, block); + nofl_block_list_push(&space->full, block); nofl_allocator_reset(alloc); } @@ -548,7 +619,7 @@ nofl_allocator_release_full_evacuation_target(struct nofl_allocator *alloc, GC_ASSERT_EQ(block.summary->fragmentation_granules, 0); GC_ASSERT_EQ(block.summary->holes_with_fragmentation, 0); } - nofl_push_block(&space->old, block); + nofl_block_list_push(&space->old, block); nofl_allocator_reset(alloc); } @@ -564,14 +635,19 @@ nofl_allocator_release_partly_full_block(struct nofl_allocator *alloc, size_t hole_size = alloc->sweep - alloc->alloc; GC_ASSERT(hole_size); block.summary->fragmentation_granules = hole_size / NOFL_GRANULE_SIZE; - nofl_push_block(&space->partly_full, block); + struct nofl_lock lock = nofl_space_lock(space); + nofl_block_stack_push(&space->partly_full, block, &lock); + nofl_lock_release(&lock); nofl_allocator_reset(alloc); } static size_t nofl_allocator_acquire_partly_full_block(struct nofl_allocator *alloc, struct nofl_space *space) { - struct nofl_block_ref block = nofl_pop_block(&space->partly_full); + struct nofl_lock lock = nofl_space_lock(space); + struct nofl_block_ref block = nofl_block_stack_pop(&space->partly_full, + &lock); + nofl_lock_release(&lock); if (nofl_block_is_null(block)) return 0; GC_ASSERT_EQ(block.summary->holes_with_fragmentation, 0); @@ -708,7 +784,7 @@ nofl_allocator_finish(struct nofl_allocator *alloc, struct nofl_space *space) { static int nofl_allocator_acquire_block_to_sweep(struct nofl_allocator *alloc, struct nofl_space *space) { - struct nofl_block_ref block = nofl_pop_block(&space->to_sweep); + struct nofl_block_ref block = nofl_block_list_pop(&space->to_sweep); if (nofl_block_is_null(block)) return 0; alloc->block = block; @@ -732,12 +808,6 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, GC_ASSERT(!nofl_allocator_has_block(alloc)); } - { - size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space); - if (granules) - return granules; - } - while (nofl_allocator_acquire_block_to_sweep(alloc, space)) { // This block was marked in the last GC and needs sweeping. // As we sweep we'll want to record how many bytes were live @@ -754,6 +824,12 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, nofl_allocator_release_full_block(alloc, space); } + { + size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space); + if (granules) + return granules; + } + // We are done sweeping for blocks. Now take from the empties list. if (nofl_allocator_acquire_empty_block(alloc, space)) return NOFL_GRANULES_PER_BLOCK; @@ -926,7 +1002,7 @@ nofl_space_estimate_live_bytes_after_gc(struct nofl_space *space, // instead of measuring it precisely. 
size_t bytes = 0; bytes += nofl_block_count(&space->full) * NOFL_BLOCK_SIZE; - bytes += nofl_block_count(&space->partly_full) * NOFL_BLOCK_SIZE / 2; + bytes += nofl_block_count(&space->partly_full.list) * NOFL_BLOCK_SIZE / 2; GC_ASSERT_EQ(nofl_block_count(&space->promoted), 0); bytes += space->old_generation_granules * NOFL_GRANULE_SIZE; bytes += @@ -963,16 +1039,18 @@ static void nofl_space_prepare_evacuation(struct nofl_space *space) { GC_ASSERT(!space->evacuating); struct nofl_block_ref block; + struct nofl_lock lock = nofl_space_lock(space); while (!nofl_block_is_null - (block = nofl_pop_block(&space->evacuation_targets))) - nofl_push_empty_block(space, block); + (block = nofl_block_list_pop(&space->evacuation_targets))) + nofl_push_empty_block(space, block, &lock); + nofl_lock_release(&lock); // Blocks are either to_sweep, empty, or unavailable. - GC_ASSERT_EQ(nofl_block_count(&space->partly_full), 0); + GC_ASSERT_EQ(nofl_block_count(&space->partly_full.list), 0); GC_ASSERT_EQ(nofl_block_count(&space->full), 0); GC_ASSERT_EQ(nofl_block_count(&space->promoted), 0); GC_ASSERT_EQ(nofl_block_count(&space->old), 0); GC_ASSERT_EQ(nofl_block_count(&space->evacuation_targets), 0); - size_t target_blocks = nofl_block_count(&space->empty); + size_t target_blocks = nofl_block_count(&space->empty.list); DEBUG("evacuation target block count: %zu\n", target_blocks); if (target_blocks == 0) { @@ -1066,16 +1144,17 @@ nofl_space_start_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { // Any block that was the target of allocation in the last cycle will need to // be swept next cycle. struct nofl_block_ref block; - while (!nofl_block_is_null(block = nofl_pop_block(&space->partly_full))) - nofl_push_block(&space->to_sweep, block); - while (!nofl_block_is_null(block = nofl_pop_block(&space->full))) - nofl_push_block(&space->to_sweep, block); + while (!nofl_block_is_null + (block = nofl_block_list_pop(&space->partly_full.list))) + nofl_block_list_push(&space->to_sweep, block); + while (!nofl_block_is_null(block = nofl_block_list_pop(&space->full))) + nofl_block_list_push(&space->to_sweep, block); if (gc_kind != GC_COLLECTION_MINOR) { - while (!nofl_block_is_null(block = nofl_pop_block(&space->promoted))) - nofl_push_block(&space->to_sweep, block); - while (!nofl_block_is_null(block = nofl_pop_block(&space->old))) - nofl_push_block(&space->to_sweep, block); + while (!nofl_block_is_null(block = nofl_block_list_pop(&space->promoted))) + nofl_block_list_push(&space->to_sweep, block); + while (!nofl_block_is_null(block = nofl_block_list_pop(&space->old))) + nofl_block_list_push(&space->to_sweep, block); space->old_generation_granules = 0; } @@ -1084,7 +1163,8 @@ nofl_space_start_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { } static void -nofl_space_finish_evacuation(struct nofl_space *space) { +nofl_space_finish_evacuation(struct nofl_space *space, + const struct nofl_lock *lock) { // When evacuation began, the evacuation reserve was moved to the // empties list. Now that evacuation is finished, attempt to // repopulate the reserve. 
@@ -1094,21 +1174,21 @@ nofl_space_finish_evacuation(struct nofl_space *space) { size_t reserve = space->evacuation_minimum_reserve * active; GC_ASSERT(nofl_block_count(&space->evacuation_targets) == 0); while (reserve--) { - struct nofl_block_ref block = nofl_pop_block(&space->empty); + struct nofl_block_ref block = nofl_pop_empty_block_with_lock(space, lock); if (nofl_block_is_null(block)) break; - nofl_push_block(&space->evacuation_targets, block); + nofl_block_list_push(&space->evacuation_targets, block); } } static void nofl_space_promote_blocks(struct nofl_space *space) { struct nofl_block_ref block; - while (!nofl_block_is_null(block = nofl_pop_block(&space->promoted))) { + while (!nofl_block_is_null(block = nofl_block_list_pop(&space->promoted))) { struct nofl_allocator alloc = { block.addr, block.addr, block }; nofl_allocator_finish_sweeping_in_block(&alloc, space->sweep_mask); atomic_fetch_add(&space->old_generation_granules, NOFL_GRANULES_PER_BLOCK - block.summary->hole_granules); - nofl_push_block(&space->old, block); + nofl_block_list_push(&space->old, block); } } @@ -1215,12 +1295,12 @@ nofl_space_verify_before_restart(struct nofl_space *space) { nofl_space_verify_sweepable_blocks(space, &space->promoted); // If there are full or partly full blocks, they were filled during // evacuation. - nofl_space_verify_swept_blocks(space, &space->partly_full); + nofl_space_verify_swept_blocks(space, &space->partly_full.list); nofl_space_verify_swept_blocks(space, &space->full); nofl_space_verify_swept_blocks(space, &space->old); - nofl_space_verify_empty_blocks(space, &space->empty, 1); + nofl_space_verify_empty_blocks(space, &space->empty.list, 1); for (int age = 0; age < NOFL_PAGE_OUT_QUEUE_SIZE; age++) - nofl_space_verify_empty_blocks(space, &space->paged_out[age], 0); + nofl_space_verify_empty_blocks(space, &space->paged_out[age].list, 0); // GC_ASSERT(space->last_collection_was_minor || !nofl_block_count(&space->old)); } @@ -1228,8 +1308,9 @@ static void nofl_space_finish_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { space->last_collection_was_minor = (gc_kind == GC_COLLECTION_MINOR); + struct nofl_lock lock = nofl_space_lock(space); if (space->evacuating) - nofl_space_finish_evacuation(space); + nofl_space_finish_evacuation(space, &lock); else { space->evacuation_reserve = space->evacuation_minimum_reserve; // If we were evacuating and preferentially allocated empty blocks @@ -1239,22 +1320,23 @@ nofl_space_finish_gc(struct nofl_space *space, size_t target = space->evacuation_minimum_reserve * active; size_t reserve = nofl_block_count(&space->evacuation_targets); while (reserve-- > target) - nofl_push_block(&space->empty, - nofl_pop_block(&space->evacuation_targets)); + nofl_push_empty_block(space, + nofl_block_list_pop(&space->evacuation_targets), + &lock); } { struct nofl_block_list to_sweep = {0,}; struct nofl_block_ref block; - while (!nofl_block_is_null(block = nofl_pop_block(&space->to_sweep))) { + while (!nofl_block_is_null(block = nofl_block_list_pop(&space->to_sweep))) { if (nofl_block_is_marked(block.addr)) { - nofl_push_block(&to_sweep, block); + nofl_block_list_push(&to_sweep, block); } else { // Block is empty. 
memset(nofl_metadata_byte_for_addr(block.addr), 0, NOFL_GRANULES_PER_BLOCK); if (!nofl_push_evacuation_target_if_possible(space, block)) - nofl_push_empty_block(space, block); + nofl_push_empty_block(space, block, &lock); } } atomic_store_explicit(&space->to_sweep.count, to_sweep.count, @@ -1264,6 +1346,7 @@ nofl_space_finish_gc(struct nofl_space *space, } // FIXME: Promote concurrently instead of during the pause. + nofl_lock_release(&lock); nofl_space_promote_blocks(space); nofl_space_reset_statistics(space); nofl_space_update_mark_patterns(space, 0); @@ -1280,53 +1363,19 @@ static ssize_t nofl_space_maybe_reacquire_memory(struct nofl_space *space, size_t bytes) { ssize_t pending = atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; + struct nofl_lock lock = nofl_space_lock(space); while (pending + NOFL_BLOCK_SIZE <= 0) { - struct nofl_block_ref block = nofl_pop_unavailable_block(space); + struct nofl_block_ref block = nofl_pop_unavailable_block(space, &lock); if (nofl_block_is_null(block)) break; if (!nofl_push_evacuation_target_if_needed(space, block)) - nofl_push_empty_block(space, block); + nofl_push_empty_block(space, block, &lock); pending = atomic_fetch_add(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE) + NOFL_BLOCK_SIZE; } + nofl_lock_release(&lock); return pending; } -static int -nofl_space_sweep_until_memory_released(struct nofl_space *space, - struct nofl_allocator *alloc) { - ssize_t pending = atomic_load_explicit(&space->pending_unavailable_bytes, - memory_order_acquire); - // First try to unmap previously-identified empty blocks. If pending - // > 0 and other mutators happen to identify empty blocks, they will - // be unmapped directly and moved to the unavailable list. - while (pending > 0) { - struct nofl_block_ref block = nofl_pop_empty_block(space); - if (nofl_block_is_null(block)) - break; - // Note that we may have competing uses; if we're evacuating, - // perhaps we should push this block to the evacuation target list. - // That would enable us to reach a fragmentation low water-mark in - // fewer cycles. But maybe evacuation started in order to obtain - // free blocks for large objects; in that case we should just reap - // the fruits of our labor. Probably this second use-case is more - // important. - nofl_push_unavailable_block(space, block); - pending = atomic_fetch_sub(&space->pending_unavailable_bytes, - NOFL_BLOCK_SIZE); - pending -= NOFL_BLOCK_SIZE; - } - // Otherwise, sweep, transitioning any empty blocks to unavailable and - // throwing away any non-empty block. A bit wasteful but hastening - // the next collection is a reasonable thing to do here. - while (pending > 0) { - if (!nofl_allocator_next_hole(alloc, space)) - return 0; - pending = atomic_load_explicit(&space->pending_unavailable_bytes, - memory_order_acquire); - } - return pending <= 0; -} - static inline int nofl_space_should_evacuate(struct nofl_space *space, uint8_t metadata_byte, struct gc_ref obj) { @@ -1622,15 +1671,17 @@ nofl_space_add_slabs(struct nofl_space *space, struct nofl_slab *slabs, space->slabs[space->nslabs++] = slabs++; } -static void +static int nofl_space_shrink(struct nofl_space *space, size_t bytes) { ssize_t pending = nofl_space_request_release_memory(space, bytes); + struct nofl_lock lock = nofl_space_lock(space); + // First try to shrink by unmapping previously-identified empty blocks. 
while (pending > 0) { - struct nofl_block_ref block = nofl_pop_empty_block(space); + struct nofl_block_ref block = nofl_pop_empty_block_with_lock(space, &lock); if (nofl_block_is_null(block)) break; - nofl_push_unavailable_block(space, block); + nofl_push_unavailable_block(space, block, &lock); pending = atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); pending -= NOFL_BLOCK_SIZE; @@ -1646,17 +1697,21 @@ nofl_space_shrink(struct nofl_space *space, size_t bytes) { size_t target = space->evacuation_minimum_reserve * active; ssize_t avail = nofl_block_count(&space->evacuation_targets); while (avail > target && pending > 0) { - struct nofl_block_ref block = nofl_pop_block(&space->evacuation_targets); + struct nofl_block_ref block = + nofl_block_list_pop(&space->evacuation_targets); GC_ASSERT(!nofl_block_is_null(block)); - nofl_push_unavailable_block(space, block); + nofl_push_unavailable_block(space, block, &lock); pending = atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE); pending -= NOFL_BLOCK_SIZE; } } + nofl_lock_release(&lock); + // It still may be the case we need to page out more blocks. Only evacuation // can help us then! + return pending <= 0; } static void @@ -1670,14 +1725,16 @@ nofl_space_expand(struct nofl_space *space, size_t bytes) { struct nofl_slab *slabs = nofl_allocate_slabs(nslabs); nofl_space_add_slabs(space, slabs, nslabs); + struct nofl_lock lock = nofl_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < NOFL_NONMETA_BLOCKS_PER_SLAB; idx++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[idx].data; struct nofl_block_ref block = nofl_block_for_addr(addr); nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); - nofl_push_unavailable_block(space, block); + nofl_push_unavailable_block(space, block, &lock); } } + nofl_lock_release(&lock); nofl_space_maybe_reacquire_memory(space, 0); } @@ -1691,14 +1748,15 @@ nofl_space_advance_page_out_queue(void *data) { // items, except that we don't page out yet, as it could be that some other // background task will need to pull pages back in. struct nofl_space *space = data; + struct nofl_lock lock = nofl_space_lock(space); for (int age = NOFL_PAGE_OUT_QUEUE_SIZE - 3; age >= 0; age--) { - while (1) { - struct nofl_block_ref block = nofl_pop_block(&space->paged_out[age]); - if (nofl_block_is_null(block)) - break; - nofl_push_block(&space->paged_out[age + 1], block); - } + struct nofl_block_ref block = + nofl_block_stack_pop(&space->paged_out[age], &lock); + if (nofl_block_is_null(block)) + break; + nofl_block_stack_push(&space->paged_out[age+1], block, &lock); } + nofl_lock_release(&lock); } static void @@ -1706,15 +1764,18 @@ nofl_space_page_out_blocks(void *data) { // This task is invoked by the background thread after other tasks. It // actually pages out blocks that reached the end of the queue. 
struct nofl_space *space = data; + struct nofl_lock lock = nofl_space_lock(space); int age = NOFL_PAGE_OUT_QUEUE_SIZE - 2; while (1) { - struct nofl_block_ref block = nofl_pop_block(&space->paged_out[age]); + struct nofl_block_ref block = + nofl_block_stack_pop(&space->paged_out[age], &lock); if (nofl_block_is_null(block)) break; nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); madvise((void*)block.addr, NOFL_BLOCK_SIZE, MADV_DONTNEED); - nofl_push_block(&space->paged_out[age + 1], block); + nofl_block_stack_push(&space->paged_out[age + 1], block, &lock); } + nofl_lock_release(&lock); } static int @@ -1732,23 +1793,26 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, nofl_space_update_mark_patterns(space, 0); space->extents = extents_allocate(10); nofl_space_add_slabs(space, slabs, nslabs); + pthread_mutex_init(&space->lock, NULL); space->evacuation_minimum_reserve = 0.02; space->evacuation_reserve = space->evacuation_minimum_reserve; space->promotion_threshold = promotion_threshold; + struct nofl_lock lock = nofl_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < NOFL_NONMETA_BLOCKS_PER_SLAB; idx++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[idx].data; struct nofl_block_ref block = nofl_block_for_addr(addr); nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); if (reserved > size) { - nofl_push_unavailable_block(space, block); + nofl_push_unavailable_block(space, block, &lock); reserved -= NOFL_BLOCK_SIZE; } else { if (!nofl_push_evacuation_target_if_needed(space, block)) - nofl_push_empty_block(space, block); + nofl_push_empty_block(space, block, &lock); } } } + nofl_lock_release(&lock); gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_START, nofl_space_advance_page_out_queue, space); From 691c777e7b787a09d847660be3a47496ebeb587d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 30 Sep 2024 12:29:35 +0200 Subject: [PATCH 304/403] Fix ABA problem in the copy space Same concerns as the previous fix to the nofl space. 
--- src/copy-space.h | 203 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 141 insertions(+), 62 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index 98d3f6146..b66efad97 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -1,6 +1,7 @@ #ifndef COPY_SPACE_H #define COPY_SPACE_H +#include #include #include @@ -105,13 +106,26 @@ copy_space_object_region(struct gc_ref obj) { #define COPY_SPACE_PAGE_OUT_QUEUE_SIZE 4 +struct copy_space_block_list { + struct copy_space_block *head; +}; + +struct copy_space_block_stack { + struct copy_space_block_list list; +}; + +struct copy_space_lock { + pthread_mutex_t *lock; +}; + struct copy_space { - struct copy_space_block *empty; - struct copy_space_block *partly_full; - struct copy_space_block *full ALIGNED_TO_AVOID_FALSE_SHARING; + pthread_mutex_t lock; + struct copy_space_block_stack empty; + struct copy_space_block_stack partly_full; + struct copy_space_block_list full ALIGNED_TO_AVOID_FALSE_SHARING; size_t allocated_bytes; size_t fragmentation; - struct copy_space_block *paged_out[COPY_SPACE_PAGE_OUT_QUEUE_SIZE] + struct copy_space_block_stack paged_out[COPY_SPACE_PAGE_OUT_QUEUE_SIZE] ALIGNED_TO_AVOID_FALSE_SHARING; ssize_t bytes_to_page_out ALIGNED_TO_AVOID_FALSE_SHARING; // The rest of these members are only changed rarely and with the heap @@ -131,32 +145,72 @@ struct copy_space_allocator { struct copy_space_block *block; }; +static struct copy_space_lock +copy_space_lock_acquire(pthread_mutex_t *lock) { + pthread_mutex_lock(lock); + return (struct copy_space_lock){ lock }; +} + static void -copy_space_push_block(struct copy_space_block **list, - struct copy_space_block *block) { +copy_space_lock_release(struct copy_space_lock *lock) { + GC_ASSERT(lock->lock); + pthread_mutex_unlock(lock->lock); + lock->lock = NULL; +} + +static struct copy_space_lock +copy_space_lock(struct copy_space *space) { + return copy_space_lock_acquire(&space->lock); +} + +static void +copy_space_block_list_push(struct copy_space_block_list *list, + struct copy_space_block *block) { struct copy_space_block *next = - atomic_load_explicit(list, memory_order_acquire); + atomic_load_explicit(&list->head, memory_order_acquire); do { block->next = next; - } while (!atomic_compare_exchange_weak(list, &next, block)); + } while (!atomic_compare_exchange_weak(&list->head, &next, block)); } static struct copy_space_block* -copy_space_pop_block(struct copy_space_block **list) { +copy_space_block_list_pop(struct copy_space_block_list *list) { struct copy_space_block *head = - atomic_load_explicit(list, memory_order_acquire); + atomic_load_explicit(&list->head, memory_order_acquire); struct copy_space_block *next; do { if (!head) return NULL; - } while (!atomic_compare_exchange_weak(list, &head, head->next)); + } while (!atomic_compare_exchange_weak(&list->head, &head, head->next)); head->next = NULL; return head; } +static void +copy_space_block_stack_push(struct copy_space_block_stack *stack, + struct copy_space_block *block, + const struct copy_space_lock *lock) { + struct copy_space_block *next = stack->list.head; + block->next = next; + stack->list.head = block; +} + static struct copy_space_block* -copy_space_pop_empty_block(struct copy_space *space) { - struct copy_space_block *ret = copy_space_pop_block(&space->empty); +copy_space_block_stack_pop(struct copy_space_block_stack *stack, + const struct copy_space_lock *lock) { + struct copy_space_block *head = stack->list.head; + if (head) { + stack->list.head = head->next; + head->next = NULL; + 
} + return head; +} + +static struct copy_space_block* +copy_space_pop_empty_block(struct copy_space *space, + const struct copy_space_lock *lock) { + struct copy_space_block *ret = copy_space_block_stack_pop(&space->empty, + lock); if (ret) ret->allocated = 0; return ret; @@ -164,46 +218,53 @@ copy_space_pop_empty_block(struct copy_space *space) { static void copy_space_push_empty_block(struct copy_space *space, - struct copy_space_block *block) { - copy_space_push_block(&space->empty, block); + struct copy_space_block *block, + const struct copy_space_lock *lock) { + copy_space_block_stack_push(&space->empty, block, lock); } static struct copy_space_block* copy_space_pop_full_block(struct copy_space *space) { - return copy_space_pop_block(&space->full); + return copy_space_block_list_pop(&space->full); } static void copy_space_push_full_block(struct copy_space *space, struct copy_space_block *block) { - copy_space_push_block(&space->full, block); + copy_space_block_list_push(&space->full, block); } static struct copy_space_block* -copy_space_pop_partly_full_block(struct copy_space *space) { - return copy_space_pop_block(&space->partly_full); +copy_space_pop_partly_full_block(struct copy_space *space, + const struct copy_space_lock *lock) { + return copy_space_block_stack_pop(&space->partly_full, lock); } static void copy_space_push_partly_full_block(struct copy_space *space, - struct copy_space_block *block) { - copy_space_push_block(&space->partly_full, block); + struct copy_space_block *block, + const struct copy_space_lock *lock) { + copy_space_block_stack_push(&space->partly_full, block, lock); } static void copy_space_page_out_block(struct copy_space *space, - struct copy_space_block *block) { - copy_space_push_block(block->in_core - ? &space->paged_out[0] - : &space->paged_out[COPY_SPACE_PAGE_OUT_QUEUE_SIZE-1], - block); + struct copy_space_block *block, + const struct copy_space_lock *lock) { + copy_space_block_stack_push + (block->in_core + ? 
&space->paged_out[0] + : &space->paged_out[COPY_SPACE_PAGE_OUT_QUEUE_SIZE-1], + block, + lock); } static struct copy_space_block* -copy_space_page_in_block(struct copy_space *space) { +copy_space_page_in_block(struct copy_space *space, + const struct copy_space_lock *lock) { for (int age = 0; age < COPY_SPACE_PAGE_OUT_QUEUE_SIZE; age++) { struct copy_space_block *block = - copy_space_pop_block(&space->paged_out[age]); + copy_space_block_stack_pop(&space->paged_out[age], lock); if (block) return block; } return NULL; @@ -217,28 +278,32 @@ copy_space_request_release_memory(struct copy_space *space, size_t bytes) { static int copy_space_page_out_blocks_until_memory_released(struct copy_space *space) { ssize_t pending = atomic_load(&space->bytes_to_page_out); + struct copy_space_lock lock = copy_space_lock(space); while (pending > 0) { - struct copy_space_block *block = copy_space_pop_empty_block(space); - if (!block) return 0; - copy_space_page_out_block(space, block); + struct copy_space_block *block = copy_space_pop_empty_block(space, &lock); + if (!block) break; + copy_space_page_out_block(space, block, &lock); pending = (atomic_fetch_sub(&space->bytes_to_page_out, COPY_SPACE_BLOCK_SIZE) - COPY_SPACE_BLOCK_SIZE); } - return 1; + copy_space_lock_release(&lock); + return pending <= 0; } static ssize_t copy_space_maybe_reacquire_memory(struct copy_space *space, size_t bytes) { ssize_t pending = atomic_fetch_sub(&space->bytes_to_page_out, bytes) - bytes; + struct copy_space_lock lock = copy_space_lock(space); while (pending + COPY_SPACE_BLOCK_SIZE <= 0) { - struct copy_space_block *block = copy_space_page_in_block(space); + struct copy_space_block *block = copy_space_page_in_block(space, &lock); if (!block) break; - copy_space_push_empty_block(space, block); + copy_space_push_empty_block(space, block, &lock); pending = (atomic_fetch_add(&space->bytes_to_page_out, COPY_SPACE_BLOCK_SIZE) + COPY_SPACE_BLOCK_SIZE); } + copy_space_lock_release(&lock); return pending; } @@ -273,12 +338,13 @@ copy_space_allocator_acquire_block(struct copy_space_allocator *alloc, static int copy_space_allocator_acquire_empty_block(struct copy_space_allocator *alloc, struct copy_space *space) { - if (copy_space_allocator_acquire_block(alloc, - copy_space_pop_empty_block(space), - space->active_region)) { - alloc->block->in_core = 1; - if (alloc->block->all_zeroes[space->active_region]) - alloc->block->all_zeroes[space->active_region] = 0; + struct copy_space_lock lock = copy_space_lock(space); + struct copy_space_block *block = copy_space_pop_empty_block(space, &lock); + copy_space_lock_release(&lock); + if (copy_space_allocator_acquire_block(alloc, block, space->active_region)) { + block->in_core = 1; + if (block->all_zeroes[space->active_region]) + block->all_zeroes[space->active_region] = 0; else memset((char*)alloc->hp, 0, COPY_SPACE_REGION_SIZE); return 1; @@ -289,10 +355,12 @@ copy_space_allocator_acquire_empty_block(struct copy_space_allocator *alloc, static int copy_space_allocator_acquire_partly_full_block(struct copy_space_allocator *alloc, struct copy_space *space) { - if (copy_space_allocator_acquire_block(alloc, - copy_space_pop_partly_full_block(space), - space->active_region)) { - alloc->hp += alloc->block->allocated; + struct copy_space_lock lock = copy_space_lock(space); + struct copy_space_block *block = copy_space_pop_partly_full_block(space, + &lock); + copy_space_lock_release(&lock); + if (copy_space_allocator_acquire_block(alloc, block, space->active_region)) { + alloc->hp += block->allocated; 
return 1; } return 0; @@ -322,7 +390,9 @@ copy_space_allocator_release_partly_full_block(struct copy_space_allocator *allo allocated - alloc->block->allocated, memory_order_relaxed); alloc->block->allocated = allocated; - copy_space_push_partly_full_block(space, alloc->block); + struct copy_space_lock lock = copy_space_lock(space); + copy_space_push_partly_full_block(space, alloc->block, &lock); + copy_space_lock_release(&lock); } else { // In this case, hp was bumped all the way to the limit, in which // case allocated wraps to 0; the block is full. @@ -382,12 +452,12 @@ copy_space_append_block_lists(struct copy_space_block *head, static void copy_space_flip(struct copy_space *space) { // Mutators stopped, can access nonatomically. - struct copy_space_block *flip = space->full; - flip = copy_space_append_block_lists(space->partly_full, flip); - flip = copy_space_append_block_lists(space->empty, flip); - space->empty = flip; - space->partly_full = NULL; - space->full = NULL; + struct copy_space_block* flip = space->full.head; + flip = copy_space_append_block_lists(space->partly_full.list.head, flip); + flip = copy_space_append_block_lists(space->empty.list.head, flip); + space->empty.list.head = flip; + space->partly_full.list.head = NULL; + space->full.head = NULL; space->allocated_bytes = 0; space->fragmentation = 0; space->active_region ^= 1; @@ -621,45 +691,51 @@ copy_space_expand(struct copy_space *space, size_t bytes) { struct copy_space_slab *slabs = copy_space_allocate_slabs(nslabs); copy_space_add_slabs(space, slabs, nslabs); + struct copy_space_lock lock = copy_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { struct copy_space_block *block = &slabs[slab].headers[idx]; block->all_zeroes[0] = block->all_zeroes[1] = 1; block->in_core = 0; - copy_space_page_out_block(space, block); + copy_space_page_out_block(space, block, &lock); reserved -= COPY_SPACE_BLOCK_SIZE; } } + copy_space_lock_release(&lock); copy_space_reacquire_memory(space, 0); } static void copy_space_advance_page_out_queue(void *data) { struct copy_space *space = data; + struct copy_space_lock lock = copy_space_lock(space); for (int age = COPY_SPACE_PAGE_OUT_QUEUE_SIZE - 3; age >= 0; age--) { while (1) { struct copy_space_block *block = - copy_space_pop_block(&space->paged_out[age]); + copy_space_block_stack_pop(&space->paged_out[age], &lock); if (!block) break; - copy_space_push_block(&space->paged_out[age + 1], block); + copy_space_block_stack_push(&space->paged_out[age + 1], block, &lock); } } + copy_space_lock_release(&lock); } static void copy_space_page_out_blocks(void *data) { struct copy_space *space = data; int age = COPY_SPACE_PAGE_OUT_QUEUE_SIZE - 2; + struct copy_space_lock lock = copy_space_lock(space); while (1) { struct copy_space_block *block = - copy_space_pop_block(&space->paged_out[age]); + copy_space_block_stack_pop(&space->paged_out[age], &lock); if (!block) break; block->in_core = 0; block->all_zeroes[0] = block->all_zeroes[1] = 1; madvise(copy_space_block_payload(block), COPY_SPACE_BLOCK_SIZE, MADV_DONTNEED); - copy_space_push_block(&space->paged_out[age + 1], block); + copy_space_block_stack_push(&space->paged_out[age + 1], block, &lock); } + copy_space_lock_release(&lock); } static int @@ -672,11 +748,12 @@ copy_space_init(struct copy_space *space, size_t size, int atomic, if (!slabs) return 0; - space->empty = NULL; - space->partly_full = NULL; - space->full = NULL; + pthread_mutex_init(&space->lock, 
NULL); + space->empty.list.head = NULL; + space->partly_full.list.head = NULL; + space->full.head = NULL; for (int age = 0; age < COPY_SPACE_PAGE_OUT_QUEUE_SIZE; age++) - space->paged_out[age] = NULL; + space->paged_out[age].list.head = NULL; space->allocated_bytes = 0; space->fragmentation = 0; space->bytes_to_page_out = 0; @@ -686,19 +763,21 @@ copy_space_init(struct copy_space *space, size_t size, int atomic, space->fragmentation_at_last_gc = 0; space->extents = extents_allocate(10); copy_space_add_slabs(space, slabs, nslabs); + struct copy_space_lock lock = copy_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { struct copy_space_block *block = &slabs[slab].headers[idx]; block->all_zeroes[0] = block->all_zeroes[1] = 1; block->in_core = 0; if (reserved > size) { - copy_space_page_out_block(space, block); + copy_space_page_out_block(space, block, &lock); reserved -= COPY_SPACE_BLOCK_SIZE; } else { - copy_space_push_empty_block(space, block); + copy_space_push_empty_block(space, block, &lock); } } } + copy_space_lock_release(&lock); gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_START, copy_space_advance_page_out_queue, space); From 3955d2ad96017c45baf0f056811c84cc17627e64 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 30 Sep 2024 20:52:45 +0200 Subject: [PATCH 305/403] Factor out locking utils to separate header --- src/copy-space.h | 74 ++++++++++---------------- src/gc-lock.h | 24 +++++++++ src/nofl-space.h | 134 ++++++++++++++++++++++++++--------------------- 3 files changed, 126 insertions(+), 106 deletions(-) create mode 100644 src/gc-lock.h diff --git a/src/copy-space.h b/src/copy-space.h index b66efad97..d09609dfe 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -17,6 +17,7 @@ #include "gc-align.h" #include "gc-attrs.h" #include "gc-inline.h" +#include "gc-lock.h" #include "spin.h" // A copy space: a block-structured space that traces via evacuation. 
@@ -114,10 +115,6 @@ struct copy_space_block_stack { struct copy_space_block_list list; }; -struct copy_space_lock { - pthread_mutex_t *lock; -}; - struct copy_space { pthread_mutex_t lock; struct copy_space_block_stack empty; @@ -145,22 +142,9 @@ struct copy_space_allocator { struct copy_space_block *block; }; -static struct copy_space_lock -copy_space_lock_acquire(pthread_mutex_t *lock) { - pthread_mutex_lock(lock); - return (struct copy_space_lock){ lock }; -} - -static void -copy_space_lock_release(struct copy_space_lock *lock) { - GC_ASSERT(lock->lock); - pthread_mutex_unlock(lock->lock); - lock->lock = NULL; -} - -static struct copy_space_lock +static struct gc_lock copy_space_lock(struct copy_space *space) { - return copy_space_lock_acquire(&space->lock); + return gc_lock_acquire(&space->lock); } static void @@ -189,7 +173,7 @@ copy_space_block_list_pop(struct copy_space_block_list *list) { static void copy_space_block_stack_push(struct copy_space_block_stack *stack, struct copy_space_block *block, - const struct copy_space_lock *lock) { + const struct gc_lock *lock) { struct copy_space_block *next = stack->list.head; block->next = next; stack->list.head = block; @@ -197,7 +181,7 @@ copy_space_block_stack_push(struct copy_space_block_stack *stack, static struct copy_space_block* copy_space_block_stack_pop(struct copy_space_block_stack *stack, - const struct copy_space_lock *lock) { + const struct gc_lock *lock) { struct copy_space_block *head = stack->list.head; if (head) { stack->list.head = head->next; @@ -208,7 +192,7 @@ copy_space_block_stack_pop(struct copy_space_block_stack *stack, static struct copy_space_block* copy_space_pop_empty_block(struct copy_space *space, - const struct copy_space_lock *lock) { + const struct gc_lock *lock) { struct copy_space_block *ret = copy_space_block_stack_pop(&space->empty, lock); if (ret) @@ -219,7 +203,7 @@ copy_space_pop_empty_block(struct copy_space *space, static void copy_space_push_empty_block(struct copy_space *space, struct copy_space_block *block, - const struct copy_space_lock *lock) { + const struct gc_lock *lock) { copy_space_block_stack_push(&space->empty, block, lock); } @@ -236,21 +220,21 @@ copy_space_push_full_block(struct copy_space *space, static struct copy_space_block* copy_space_pop_partly_full_block(struct copy_space *space, - const struct copy_space_lock *lock) { + const struct gc_lock *lock) { return copy_space_block_stack_pop(&space->partly_full, lock); } static void copy_space_push_partly_full_block(struct copy_space *space, struct copy_space_block *block, - const struct copy_space_lock *lock) { + const struct gc_lock *lock) { copy_space_block_stack_push(&space->partly_full, block, lock); } static void copy_space_page_out_block(struct copy_space *space, struct copy_space_block *block, - const struct copy_space_lock *lock) { + const struct gc_lock *lock) { copy_space_block_stack_push (block->in_core ? 
&space->paged_out[0] @@ -261,7 +245,7 @@ copy_space_page_out_block(struct copy_space *space, static struct copy_space_block* copy_space_page_in_block(struct copy_space *space, - const struct copy_space_lock *lock) { + const struct gc_lock *lock) { for (int age = 0; age < COPY_SPACE_PAGE_OUT_QUEUE_SIZE; age++) { struct copy_space_block *block = copy_space_block_stack_pop(&space->paged_out[age], lock); @@ -278,7 +262,7 @@ copy_space_request_release_memory(struct copy_space *space, size_t bytes) { static int copy_space_page_out_blocks_until_memory_released(struct copy_space *space) { ssize_t pending = atomic_load(&space->bytes_to_page_out); - struct copy_space_lock lock = copy_space_lock(space); + struct gc_lock lock = copy_space_lock(space); while (pending > 0) { struct copy_space_block *block = copy_space_pop_empty_block(space, &lock); if (!block) break; @@ -286,7 +270,7 @@ copy_space_page_out_blocks_until_memory_released(struct copy_space *space) { pending = (atomic_fetch_sub(&space->bytes_to_page_out, COPY_SPACE_BLOCK_SIZE) - COPY_SPACE_BLOCK_SIZE); } - copy_space_lock_release(&lock); + gc_lock_release(&lock); return pending <= 0; } @@ -294,7 +278,7 @@ static ssize_t copy_space_maybe_reacquire_memory(struct copy_space *space, size_t bytes) { ssize_t pending = atomic_fetch_sub(&space->bytes_to_page_out, bytes) - bytes; - struct copy_space_lock lock = copy_space_lock(space); + struct gc_lock lock = copy_space_lock(space); while (pending + COPY_SPACE_BLOCK_SIZE <= 0) { struct copy_space_block *block = copy_space_page_in_block(space, &lock); if (!block) break; @@ -303,7 +287,7 @@ copy_space_maybe_reacquire_memory(struct copy_space *space, size_t bytes) { COPY_SPACE_BLOCK_SIZE) + COPY_SPACE_BLOCK_SIZE); } - copy_space_lock_release(&lock); + gc_lock_release(&lock); return pending; } @@ -338,9 +322,9 @@ copy_space_allocator_acquire_block(struct copy_space_allocator *alloc, static int copy_space_allocator_acquire_empty_block(struct copy_space_allocator *alloc, struct copy_space *space) { - struct copy_space_lock lock = copy_space_lock(space); + struct gc_lock lock = copy_space_lock(space); struct copy_space_block *block = copy_space_pop_empty_block(space, &lock); - copy_space_lock_release(&lock); + gc_lock_release(&lock); if (copy_space_allocator_acquire_block(alloc, block, space->active_region)) { block->in_core = 1; if (block->all_zeroes[space->active_region]) @@ -355,10 +339,10 @@ copy_space_allocator_acquire_empty_block(struct copy_space_allocator *alloc, static int copy_space_allocator_acquire_partly_full_block(struct copy_space_allocator *alloc, struct copy_space *space) { - struct copy_space_lock lock = copy_space_lock(space); + struct gc_lock lock = copy_space_lock(space); struct copy_space_block *block = copy_space_pop_partly_full_block(space, &lock); - copy_space_lock_release(&lock); + gc_lock_release(&lock); if (copy_space_allocator_acquire_block(alloc, block, space->active_region)) { alloc->hp += block->allocated; return 1; @@ -390,9 +374,9 @@ copy_space_allocator_release_partly_full_block(struct copy_space_allocator *allo allocated - alloc->block->allocated, memory_order_relaxed); alloc->block->allocated = allocated; - struct copy_space_lock lock = copy_space_lock(space); + struct gc_lock lock = copy_space_lock(space); copy_space_push_partly_full_block(space, alloc->block, &lock); - copy_space_lock_release(&lock); + gc_lock_release(&lock); } else { // In this case, hp was bumped all the way to the limit, in which // case allocated wraps to 0; the block is full. 
@@ -691,7 +675,7 @@ copy_space_expand(struct copy_space *space, size_t bytes) { struct copy_space_slab *slabs = copy_space_allocate_slabs(nslabs); copy_space_add_slabs(space, slabs, nslabs); - struct copy_space_lock lock = copy_space_lock(space); + struct gc_lock lock = copy_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { struct copy_space_block *block = &slabs[slab].headers[idx]; @@ -701,14 +685,14 @@ copy_space_expand(struct copy_space *space, size_t bytes) { reserved -= COPY_SPACE_BLOCK_SIZE; } } - copy_space_lock_release(&lock); + gc_lock_release(&lock); copy_space_reacquire_memory(space, 0); } static void copy_space_advance_page_out_queue(void *data) { struct copy_space *space = data; - struct copy_space_lock lock = copy_space_lock(space); + struct gc_lock lock = copy_space_lock(space); for (int age = COPY_SPACE_PAGE_OUT_QUEUE_SIZE - 3; age >= 0; age--) { while (1) { struct copy_space_block *block = @@ -717,14 +701,14 @@ copy_space_advance_page_out_queue(void *data) { copy_space_block_stack_push(&space->paged_out[age + 1], block, &lock); } } - copy_space_lock_release(&lock); + gc_lock_release(&lock); } static void copy_space_page_out_blocks(void *data) { struct copy_space *space = data; int age = COPY_SPACE_PAGE_OUT_QUEUE_SIZE - 2; - struct copy_space_lock lock = copy_space_lock(space); + struct gc_lock lock = copy_space_lock(space); while (1) { struct copy_space_block *block = copy_space_block_stack_pop(&space->paged_out[age], &lock); @@ -735,7 +719,7 @@ copy_space_page_out_blocks(void *data) { MADV_DONTNEED); copy_space_block_stack_push(&space->paged_out[age + 1], block, &lock); } - copy_space_lock_release(&lock); + gc_lock_release(&lock); } static int @@ -763,7 +747,7 @@ copy_space_init(struct copy_space *space, size_t size, int atomic, space->fragmentation_at_last_gc = 0; space->extents = extents_allocate(10); copy_space_add_slabs(space, slabs, nslabs); - struct copy_space_lock lock = copy_space_lock(space); + struct gc_lock lock = copy_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { struct copy_space_block *block = &slabs[slab].headers[idx]; @@ -777,7 +761,7 @@ copy_space_init(struct copy_space *space, size_t size, int atomic, } } } - copy_space_lock_release(&lock); + gc_lock_release(&lock); gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_START, copy_space_advance_page_out_queue, space); diff --git a/src/gc-lock.h b/src/gc-lock.h new file mode 100644 index 000000000..89c5f4ac0 --- /dev/null +++ b/src/gc-lock.h @@ -0,0 +1,24 @@ +#ifndef GC_LOCK_H +#define GC_LOCK_H + +#include +#include "gc-assert.h" + +struct gc_lock { + pthread_mutex_t *lock; +}; + +static struct gc_lock +gc_lock_acquire(pthread_mutex_t *lock) { + pthread_mutex_lock(lock); + return (struct gc_lock){ lock }; +} + +static void +gc_lock_release(struct gc_lock *lock) { + GC_ASSERT(lock->lock); + pthread_mutex_unlock(lock->lock); + lock->lock = NULL; +} + +#endif // GC_LOCK_H diff --git a/src/nofl-space.h b/src/nofl-space.h index ebf44a524..93904aee5 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -18,6 +18,7 @@ #include "gc-align.h" #include "gc-attrs.h" #include "gc-inline.h" +#include "gc-lock.h" #include "spin.h" #include "swar.h" @@ -146,10 +147,6 @@ struct nofl_block_stack { struct nofl_block_list list; }; -struct nofl_lock { - pthread_mutex_t *lock; -}; - #define NOFL_PAGE_OUT_QUEUE_SIZE 4 struct nofl_space { @@ 
-208,15 +205,33 @@ struct nofl_allocator { // // When an object becomes dead after a GC, it will still have a bit set // -- maybe the young bit, or maybe a survivor bit. The sweeper has to -// clear these bits before the next collection. But, for concurrent -// marking, we will also be marking "live" objects, updating their mark -// bits. So there are four object states concurrently observable: -// young, dead, survivor, and marked. (If we didn't have concurrent -// marking we would still need the "marked" state, because marking -// mutator roots before stopping is also a form of concurrent marking.) -// Even though these states are mutually exclusive, we use separate bits -// for them because we have the space. After each collection, the dead, -// survivor, and marked states rotate by one bit. +// clear these bits before the next collection. But if we add +// concurrent marking, we will also be marking "live" objects, updating +// their mark bits. So there are four object states concurrently +// observable: young, dead, survivor, and marked. (We don't currently +// have concurrent marking, though.) Even though these states are +// mutually exclusive, we use separate bits for them because we have the +// space. After each collection, the dead, survivor, and marked states +// rotate by one bit. +// +// An object can be pinned, preventing it from being evacuated during +// collection. Pinning does not keep the object alive; if it is +// otherwise unreachable, it will be collected. To pin an object, a +// running mutator can set the pinned bit, using atomic +// compare-and-swap. +// +// For generational collectors, the nofl space supports a field-logging +// write barrier. The two logging bits correspond to the two words in a +// granule. When a field is written to, the write barrier should check +// the logged bit; if it is unset, it should try to atomically set the +// bit, and if that works, then we record the field location as a +// generational root, adding it to a sequential-store buffer. +// +// Finally, for heap-conservative collectors, nofl generally traces all +// objects in the same way, treating them as an array of conservative +// edges. But we need to know when we have an ephemeron. In that case, +// we re-use the pinned bit, because it's of no use to us anyway in that +// configuration, as all objects are pinned. 
enum nofl_metadata_byte { NOFL_METADATA_BYTE_NONE = 0, NOFL_METADATA_BYTE_YOUNG = 1, @@ -224,9 +239,10 @@ enum nofl_metadata_byte { NOFL_METADATA_BYTE_MARK_1 = 4, NOFL_METADATA_BYTE_MARK_2 = 8, NOFL_METADATA_BYTE_END = 16, - NOFL_METADATA_BYTE_EPHEMERON = 32, - NOFL_METADATA_BYTE_PINNED = 64, - NOFL_METADATA_BYTE_UNUSED_1 = 128 + NOFL_METADATA_BYTE_PINNED = 32, + NOFL_METADATA_BYTE_LOGGED_0 = 64, + NOFL_METADATA_BYTE_LOGGED_1 = 128, + NOFL_METADATA_BYTE_EPHEMERON = NOFL_METADATA_BYTE_PINNED, }; static uint8_t @@ -236,22 +252,9 @@ nofl_rotate_dead_survivor_marked(uint8_t mask) { return ((mask << 1) | (mask >> 2)) & all; } -static struct nofl_lock -nofl_lock_acquire(pthread_mutex_t *lock) { - pthread_mutex_lock(lock); - return (struct nofl_lock){ lock }; -} - -static void -nofl_lock_release(struct nofl_lock *lock) { - GC_ASSERT(lock->lock); - pthread_mutex_unlock(lock->lock); - lock->lock = NULL; -} - -static struct nofl_lock +static struct gc_lock nofl_space_lock(struct nofl_space *space) { - return nofl_lock_acquire(&space->lock); + return gc_lock_acquire(&space->lock); } static struct nofl_slab* @@ -440,7 +443,7 @@ nofl_block_list_pop(struct nofl_block_list *list) { static void nofl_block_stack_push(struct nofl_block_stack *stack, struct nofl_block_ref block, - const struct nofl_lock *lock) { + const struct gc_lock *lock) { struct nofl_block_list *list = &stack->list; list->count++; GC_ASSERT(nofl_block_is_null(nofl_block_next(block))); @@ -451,7 +454,7 @@ nofl_block_stack_push(struct nofl_block_stack *stack, static struct nofl_block_ref nofl_block_stack_pop(struct nofl_block_stack *stack, - const struct nofl_lock *lock) { + const struct gc_lock *lock) { struct nofl_block_list *list = &stack->list; struct nofl_block_ref head = nofl_block_head(list); if (!nofl_block_is_null(head)) { @@ -470,7 +473,7 @@ nofl_block_count(struct nofl_block_list *list) { static void nofl_push_unavailable_block(struct nofl_space *space, struct nofl_block_ref block, - const struct nofl_lock *lock) { + const struct gc_lock *lock) { nofl_block_set_flag(block, NOFL_BLOCK_UNAVAILABLE); nofl_block_stack_push(nofl_block_has_flag(block, NOFL_BLOCK_PAGED_OUT) ? 
&space->paged_out[NOFL_PAGE_OUT_QUEUE_SIZE-1] @@ -480,7 +483,7 @@ nofl_push_unavailable_block(struct nofl_space *space, static struct nofl_block_ref nofl_pop_unavailable_block(struct nofl_space *space, - const struct nofl_lock *lock) { + const struct gc_lock *lock) { for (int age = 0; age < NOFL_PAGE_OUT_QUEUE_SIZE; age++) { struct nofl_block_ref block = nofl_block_stack_pop(&space->paged_out[age], lock); @@ -495,21 +498,21 @@ nofl_pop_unavailable_block(struct nofl_space *space, static void nofl_push_empty_block(struct nofl_space *space, struct nofl_block_ref block, - const struct nofl_lock *lock) { + const struct gc_lock *lock) { nofl_block_stack_push(&space->empty, block, lock); } static struct nofl_block_ref nofl_pop_empty_block_with_lock(struct nofl_space *space, - const struct nofl_lock *lock) { + const struct gc_lock *lock) { return nofl_block_stack_pop(&space->empty, lock); } static struct nofl_block_ref nofl_pop_empty_block(struct nofl_space *space) { - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); struct nofl_block_ref ret = nofl_pop_empty_block_with_lock(space, &lock); - nofl_lock_release(&lock); + gc_lock_release(&lock); return ret; } @@ -635,19 +638,19 @@ nofl_allocator_release_partly_full_block(struct nofl_allocator *alloc, size_t hole_size = alloc->sweep - alloc->alloc; GC_ASSERT(hole_size); block.summary->fragmentation_granules = hole_size / NOFL_GRANULE_SIZE; - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); nofl_block_stack_push(&space->partly_full, block, &lock); - nofl_lock_release(&lock); + gc_lock_release(&lock); nofl_allocator_reset(alloc); } static size_t nofl_allocator_acquire_partly_full_block(struct nofl_allocator *alloc, struct nofl_space *space) { - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); struct nofl_block_ref block = nofl_block_stack_pop(&space->partly_full, &lock); - nofl_lock_release(&lock); + gc_lock_release(&lock); if (nofl_block_is_null(block)) return 0; GC_ASSERT_EQ(block.summary->holes_with_fragmentation, 0); @@ -1039,11 +1042,11 @@ static void nofl_space_prepare_evacuation(struct nofl_space *space) { GC_ASSERT(!space->evacuating); struct nofl_block_ref block; - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); while (!nofl_block_is_null (block = nofl_block_list_pop(&space->evacuation_targets))) nofl_push_empty_block(space, block, &lock); - nofl_lock_release(&lock); + gc_lock_release(&lock); // Blocks are either to_sweep, empty, or unavailable. GC_ASSERT_EQ(nofl_block_count(&space->partly_full.list), 0); GC_ASSERT_EQ(nofl_block_count(&space->full), 0); @@ -1164,7 +1167,7 @@ nofl_space_start_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { static void nofl_space_finish_evacuation(struct nofl_space *space, - const struct nofl_lock *lock) { + const struct gc_lock *lock) { // When evacuation began, the evacuation reserve was moved to the // empties list. Now that evacuation is finished, attempt to // repopulate the reserve. 
@@ -1308,7 +1311,7 @@ static void nofl_space_finish_gc(struct nofl_space *space, enum gc_collection_kind gc_kind) { space->last_collection_was_minor = (gc_kind == GC_COLLECTION_MINOR); - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); if (space->evacuating) nofl_space_finish_evacuation(space, &lock); else { @@ -1346,7 +1349,7 @@ nofl_space_finish_gc(struct nofl_space *space, } // FIXME: Promote concurrently instead of during the pause. - nofl_lock_release(&lock); + gc_lock_release(&lock); nofl_space_promote_blocks(space); nofl_space_reset_statistics(space); nofl_space_update_mark_patterns(space, 0); @@ -1363,7 +1366,7 @@ static ssize_t nofl_space_maybe_reacquire_memory(struct nofl_space *space, size_t bytes) { ssize_t pending = atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes; - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); while (pending + NOFL_BLOCK_SIZE <= 0) { struct nofl_block_ref block = nofl_pop_unavailable_block(space, &lock); if (nofl_block_is_null(block)) break; @@ -1372,13 +1375,15 @@ nofl_space_maybe_reacquire_memory(struct nofl_space *space, size_t bytes) { pending = atomic_fetch_add(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE) + NOFL_BLOCK_SIZE; } - nofl_lock_release(&lock); + gc_lock_release(&lock); return pending; } static inline int nofl_space_should_evacuate(struct nofl_space *space, uint8_t metadata_byte, struct gc_ref obj) { + if (gc_has_conservative_intraheap_edges()) + return 0; if (!space->evacuating) return 0; if (metadata_byte & NOFL_METADATA_BYTE_PINNED) @@ -1389,8 +1394,11 @@ nofl_space_should_evacuate(struct nofl_space *space, uint8_t metadata_byte, static inline int nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { + // Clear logged bits when we mark: after marking, there will be no + // young objects. uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 - | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2 + | NOFL_METADATA_BYTE_LOGGED_0 | NOFL_METADATA_BYTE_LOGGED_1; atomic_store_explicit(metadata, (byte & ~mask) | space->marked_mask, memory_order_relaxed); @@ -1407,6 +1415,10 @@ nofl_space_set_nonempty_mark(struct nofl_space *space, uint8_t *metadata, static inline void nofl_space_pin_object(struct nofl_space *space, struct gc_ref ref) { + // For the heap-conservative configuration, all objects are pinned, + // and we re-use the pinned bit to identify ephemerons. + if (gc_has_conservative_intraheap_edges()) + return; uint8_t *metadata = nofl_metadata_byte_for_object(ref); uint8_t byte = atomic_load_explicit(metadata, memory_order_relaxed); if (byte & NOFL_METADATA_BYTE_PINNED) @@ -1674,7 +1686,7 @@ nofl_space_add_slabs(struct nofl_space *space, struct nofl_slab *slabs, static int nofl_space_shrink(struct nofl_space *space, size_t bytes) { ssize_t pending = nofl_space_request_release_memory(space, bytes); - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); // First try to shrink by unmapping previously-identified empty blocks. while (pending > 0) { @@ -1707,7 +1719,7 @@ nofl_space_shrink(struct nofl_space *space, size_t bytes) { } } - nofl_lock_release(&lock); + gc_lock_release(&lock); // It still may be the case we need to page out more blocks. Only evacuation // can help us then! 
@@ -1725,7 +1737,7 @@ nofl_space_expand(struct nofl_space *space, size_t bytes) { struct nofl_slab *slabs = nofl_allocate_slabs(nslabs); nofl_space_add_slabs(space, slabs, nslabs); - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < NOFL_NONMETA_BLOCKS_PER_SLAB; idx++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[idx].data; @@ -1734,7 +1746,7 @@ nofl_space_expand(struct nofl_space *space, size_t bytes) { nofl_push_unavailable_block(space, block, &lock); } } - nofl_lock_release(&lock); + gc_lock_release(&lock); nofl_space_maybe_reacquire_memory(space, 0); } @@ -1748,7 +1760,7 @@ nofl_space_advance_page_out_queue(void *data) { // items, except that we don't page out yet, as it could be that some other // background task will need to pull pages back in. struct nofl_space *space = data; - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); for (int age = NOFL_PAGE_OUT_QUEUE_SIZE - 3; age >= 0; age--) { struct nofl_block_ref block = nofl_block_stack_pop(&space->paged_out[age], &lock); @@ -1756,7 +1768,7 @@ nofl_space_advance_page_out_queue(void *data) { break; nofl_block_stack_push(&space->paged_out[age+1], block, &lock); } - nofl_lock_release(&lock); + gc_lock_release(&lock); } static void @@ -1764,7 +1776,7 @@ nofl_space_page_out_blocks(void *data) { // This task is invoked by the background thread after other tasks. It // actually pages out blocks that reached the end of the queue. struct nofl_space *space = data; - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); int age = NOFL_PAGE_OUT_QUEUE_SIZE - 2; while (1) { struct nofl_block_ref block = @@ -1775,7 +1787,7 @@ nofl_space_page_out_blocks(void *data) { madvise((void*)block.addr, NOFL_BLOCK_SIZE, MADV_DONTNEED); nofl_block_stack_push(&space->paged_out[age + 1], block, &lock); } - nofl_lock_release(&lock); + gc_lock_release(&lock); } static int @@ -1797,7 +1809,7 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, space->evacuation_minimum_reserve = 0.02; space->evacuation_reserve = space->evacuation_minimum_reserve; space->promotion_threshold = promotion_threshold; - struct nofl_lock lock = nofl_space_lock(space); + struct gc_lock lock = nofl_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { for (size_t idx = 0; idx < NOFL_NONMETA_BLOCKS_PER_SLAB; idx++) { uintptr_t addr = (uintptr_t)slabs[slab].blocks[idx].data; @@ -1812,7 +1824,7 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, } } } - nofl_lock_release(&lock); + gc_lock_release(&lock); gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_START, nofl_space_advance_page_out_queue, space); From 1f4e3bdf3731a48f9758d0cf1f304a77aacf4de0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 1 Oct 2024 10:34:27 +0200 Subject: [PATCH 306/403] Add field-logging write barrier (fast path only) Add a new kind of write barrier, one which has a bit per field; the mutator that sets the bit will need to add the field's location (the edge) to a remembered set. Here we just have the fast-path implementation. 
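To make the fast path concrete, here is a minimal standalone sketch of the same
address arithmetic, with the table geometry hard-coded as illustrative
constants (4 MB table alignment, two fields per log byte, first log bit 64)
instead of going through the gc_write_barrier_field_*() accessors.
field_logging_barrier and remember_edge_slow are invented names for this
sketch, and the stub only marks where the out-of-line path would set the bit
and record the edge; it is not meant to run against a real heap.

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative values; the real ones come from gc_write_barrier_field_*(). */
#define FIELD_TABLE_ALIGNMENT (4 * 1024 * 1024)
#define FIELDS_PER_BYTE 2
#define FIRST_LOG_BIT ((uint8_t)64)

/* Stub standing in for the out-of-line slow path, which would set the log
   bit and push the edge onto a remembered set. */
static void remember_edge_slow(void **field) { (void)field; }

static inline void field_logging_barrier(void **field) {
  uintptr_t addr = (uintptr_t)field;
  uintptr_t base = addr & ~(uintptr_t)(FIELD_TABLE_ALIGNMENT - 1);
  uintptr_t field_idx = (addr & (FIELD_TABLE_ALIGNMENT - 1)) / sizeof(uintptr_t);
  uint8_t *log_byte = (uint8_t*)(base + field_idx / FIELDS_PER_BYTE);
  uint8_t log_bit = FIRST_LOG_BIT << (field_idx % FIELDS_PER_BYTE);
  uint8_t byte = atomic_load_explicit(log_byte, memory_order_relaxed);
  if (byte & log_bit)
    return;                     /* fast path: field already logged */
  remember_edge_slow(field);    /* otherwise set the bit and record the edge */
}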
--- api/bdw-attrs.h | 9 +++++++++ api/gc-api.h | 35 +++++++++++++++++++++++++++-------- api/gc-attrs.h | 6 +++++- api/mmc-attrs.h | 14 +++++++++++++- api/pcc-attrs.h | 9 +++++++++ api/semi-attrs.h | 9 +++++++++ benchmarks/mt-gcbench.c | 9 +++++---- src/bdw.c | 5 +++-- src/mmc.c | 5 +++-- src/pcc.c | 5 +++-- src/semi.c | 5 +++-- 11 files changed, 89 insertions(+), 22 deletions(-) diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index af1042af3..51b4e72f1 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -49,6 +49,15 @@ static inline size_t gc_write_barrier_card_table_alignment(void) { static inline size_t gc_write_barrier_card_size(void) { GC_CRASH(); } +static inline size_t gc_write_barrier_field_table_alignment(void) { + GC_CRASH(); +} +static inline size_t gc_write_barrier_field_fields_per_byte(void) { + GC_CRASH(); +} +static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) { + GC_CRASH(); +} static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_SIGNAL; diff --git a/api/gc-api.h b/api/gc-api.h index 63921f628..cb1e4e819 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -179,13 +179,16 @@ static inline void* gc_allocate(struct gc_mutator *mut, size_t size) { // FIXME: remove :P GC_API_ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t bytes); -GC_API_ void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) GC_NEVER_INLINE; +GC_API_ void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) GC_NEVER_INLINE; -static inline void gc_write_barrier(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) GC_ALWAYS_INLINE; -static inline void gc_write_barrier(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) { +static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) GC_ALWAYS_INLINE; +static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) { switch (gc_write_barrier_kind(obj_size)) { case GC_WRITE_BARRIER_NONE: return; @@ -198,8 +201,24 @@ static inline void gc_write_barrier(struct gc_ref obj, size_t obj_size, atomic_store_explicit((uint8_t*)(base + card), 1, memory_order_relaxed); return; } - case GC_WRITE_BARRIER_EXTERN: - gc_write_barrier_extern(obj, obj_size, edge, new_val); + case GC_WRITE_BARRIER_FIELD: { + size_t field_table_alignment = gc_write_barrier_field_table_alignment(); + size_t fields_per_byte = gc_write_barrier_field_fields_per_byte(); + uint8_t first_bit_pattern = gc_write_barrier_field_first_bit_pattern(); + + uintptr_t addr = (uintptr_t) gc_edge_loc(edge); + uintptr_t base = addr & ~(field_table_alignment - 1); + uintptr_t field = (addr & (field_table_alignment - 1)) / sizeof(uintptr_t); + uintptr_t log_byte = field / fields_per_byte; + uint8_t log_bit = first_bit_pattern << (field % fields_per_byte); + uint8_t *byte_loc = (uint8_t*)(base + log_byte); + uint8_t byte = atomic_load_explicit(byte_loc, memory_order_relaxed); + if (!(byte & log_bit)) + gc_write_barrier_slow(mut, obj, obj_size, edge, new_val); + return; + } + case GC_WRITE_BARRIER_SLOW: + gc_write_barrier_slow(mut, obj, obj_size, edge, new_val); return; default: GC_CRASH(); diff --git a/api/gc-attrs.h b/api/gc-attrs.h index b6acb4302..39faceae2 100644 --- a/api/gc-attrs.h +++ 
b/api/gc-attrs.h @@ -30,12 +30,16 @@ static inline int gc_allocator_needs_clear(void) GC_ALWAYS_INLINE; enum gc_write_barrier_kind { GC_WRITE_BARRIER_NONE, GC_WRITE_BARRIER_CARD, - GC_WRITE_BARRIER_EXTERN + GC_WRITE_BARRIER_FIELD, + GC_WRITE_BARRIER_SLOW }; static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) GC_ALWAYS_INLINE; static inline size_t gc_write_barrier_card_table_alignment(void) GC_ALWAYS_INLINE; static inline size_t gc_write_barrier_card_size(void) GC_ALWAYS_INLINE; +static inline size_t gc_write_barrier_field_table_alignment(void) GC_ALWAYS_INLINE; +static inline size_t gc_write_barrier_field_fields_per_byte(void) GC_ALWAYS_INLINE; +static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) GC_ALWAYS_INLINE; enum gc_safepoint_mechanism { GC_SAFEPOINT_MECHANISM_COOPERATIVE, diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index e5757f6d1..6da4b2a68 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -44,7 +44,7 @@ static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) if (GC_GENERATIONAL) { if (obj_size <= gc_allocator_large_threshold()) return GC_WRITE_BARRIER_CARD; - return GC_WRITE_BARRIER_EXTERN; + return GC_WRITE_BARRIER_SLOW; } return GC_WRITE_BARRIER_NONE; } @@ -56,6 +56,18 @@ static inline size_t gc_write_barrier_card_size(void) { GC_ASSERT(GC_GENERATIONAL); return 256; } +static inline size_t gc_write_barrier_field_table_alignment(void) { + GC_ASSERT(GC_GENERATIONAL); + return 4 * 1024 * 1024; +} +static inline size_t gc_write_barrier_field_fields_per_byte(void) { + GC_ASSERT(GC_GENERATIONAL); + return 2; +} +static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) { + GC_ASSERT(GC_GENERATIONAL); + return 64; // NOFL_METADATA_BYTE_LOGGED_0 +} static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index 2f02640ea..5dd38c42f 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -52,6 +52,15 @@ static inline size_t gc_write_barrier_card_table_alignment(void) { static inline size_t gc_write_barrier_card_size(void) { GC_CRASH(); } +static inline size_t gc_write_barrier_field_table_alignment(void) { + GC_CRASH(); +} +static inline size_t gc_write_barrier_field_fields_per_byte(void) { + GC_CRASH(); +} +static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) { + GC_CRASH(); +} static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; diff --git a/api/semi-attrs.h b/api/semi-attrs.h index bcd8e89e0..94e2dc814 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -51,6 +51,15 @@ static inline size_t gc_write_barrier_card_table_alignment(void) { static inline size_t gc_write_barrier_card_size(void) { GC_CRASH(); } +static inline size_t gc_write_barrier_field_table_alignment(void) { + GC_CRASH(); +} +static inline size_t gc_write_barrier_field_fields_per_byte(void) { + GC_CRASH(); +} +static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) { + GC_CRASH(); +} static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { return GC_SAFEPOINT_MECHANISM_COOPERATIVE; diff --git a/benchmarks/mt-gcbench.c b/benchmarks/mt-gcbench.c index 05ae887d0..7f342fe90 100644 --- a/benchmarks/mt-gcbench.c +++ b/benchmarks/mt-gcbench.c @@ -144,8 +144,9 @@ static void allocate_garbage(struct thread *t) { } } -static void set_field(Node *obj, Node **field, Node *val) { - gc_write_barrier(gc_ref_from_heap_object(obj), 
sizeof(Node), +static void set_field(struct gc_mutator *mut, Node *obj, + Node **field, Node *val) { + gc_write_barrier(mut, gc_ref_from_heap_object(obj), sizeof(Node), gc_edge(field), gc_ref_from_heap_object(val)); *field = val; @@ -166,8 +167,8 @@ static void populate(struct thread *t, int depth, Node *node) { NodeHandle r = { allocate_node(mut) }; PUSH_HANDLE(t, r); - set_field(HANDLE_REF(self), &HANDLE_REF(self)->left, HANDLE_REF(l)); - set_field(HANDLE_REF(self), &HANDLE_REF(self)->right, HANDLE_REF(r)); + set_field(mut, HANDLE_REF(self), &HANDLE_REF(self)->left, HANDLE_REF(l)); + set_field(mut, HANDLE_REF(self), &HANDLE_REF(self)->right, HANDLE_REF(r)); // i is 0 because the memory is zeroed. HANDLE_REF(self)->j = depth; diff --git a/src/bdw.c b/src/bdw.c index df579dde7..d4cbe4c41 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -149,8 +149,9 @@ void gc_collect(struct gc_mutator *mut, } } -void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) { +void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) { } int* gc_safepoint_flag_loc(struct gc_mutator *mut) { GC_CRASH(); } diff --git a/src/mmc.c b/src/mmc.c index 48d8eae59..63778d669 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -876,8 +876,9 @@ gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { } void -gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) { +gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) { GC_ASSERT(obj_size > gc_allocator_large_threshold()); gc_object_set_remembered(obj); } diff --git a/src/pcc.c b/src/pcc.c index 90194e96a..10d5f8382 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -483,8 +483,9 @@ void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { GC_CRASH(); } -void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) { +void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) { } int* gc_safepoint_flag_loc(struct gc_mutator *mut) { diff --git a/src/semi.c b/src/semi.c index 7958b5898..aab5af233 100644 --- a/src/semi.c +++ b/src/semi.c @@ -453,8 +453,9 @@ void gc_collect(struct gc_mutator *mut, collect(mut, 0); } -void gc_write_barrier_extern(struct gc_ref obj, size_t obj_size, - struct gc_edge edge, struct gc_ref new_val) { +void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) { } int* gc_safepoint_flag_loc(struct gc_mutator *mut) { GC_CRASH(); } From 8e1574491adf4cfa8021626e3641e86221b942e3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 1 Oct 2024 13:16:43 +0200 Subject: [PATCH 307/403] Fix ephemerons test for mmc Change to avoid detecting OOM based on no allocation since last GC, if the collection was explicitly triggered by the user. 
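The logic of the fix, reduced to a toy model (struct toy_heap and toy_collect
are invented for illustration, not mmc's API): the no-progress out-of-memory
heuristic only makes sense for collections the allocator forced, so a
requested-by-user flag threaded down from gc_collect() gates it.

#include <stdio.h>

struct toy_heap { long bytes_allocated_since_gc; };

/* Heuristic: if allocation pressure forced a collection but nothing was
   allocated since the previous GC, that GC freed nothing useful, so report
   out-of-memory.  An explicit gc_collect() call can legitimately happen with
   zero intervening allocation, so it must not trip the heuristic. */
static void toy_collect(struct toy_heap *h, int requested_by_user) {
  if (!requested_by_user && h->bytes_allocated_since_gc == 0)
    fprintf(stderr, "heap is full (no progress since last collection)\n");
  /* ... trace, sweep ... */
  h->bytes_allocated_since_gc = 0;
}

int main(void) {
  struct toy_heap h = { 0 };
  toy_collect(&h, 1);  /* user-requested: heuristic skipped */
  toy_collect(&h, 0);  /* allocator-triggered: heuristic applies */
  return 0;
}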
--- src/mmc.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/mmc.c b/src/mmc.c index 63778d669..8f9326cde 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -712,9 +712,11 @@ sweep_ephemerons(struct gc_heap *heap) { } static void collect(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) GC_NEVER_INLINE; + enum gc_collection_kind requested_kind, + int requested_by_user) GC_NEVER_INLINE; static void -collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { +collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind, + int requested_by_user) { struct gc_heap *heap = mutator_heap(mut); struct nofl_space *nofl_space = heap_nofl_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); @@ -732,7 +734,8 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { nofl_space_add_to_allocation_counter(nofl_space, &allocation_counter); large_object_space_add_to_allocation_counter(lospace, &allocation_counter); heap->total_allocated_bytes_at_last_gc += allocation_counter; - detect_out_of_memory(heap, allocation_counter); + if (!requested_by_user) + detect_out_of_memory(heap, allocation_counter); enum gc_collection_kind gc_kind = determine_collection_kind(heap, requested_kind); int is_minor = gc_kind == GC_COLLECTION_MINOR; @@ -783,7 +786,8 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { static void trigger_collection(struct gc_mutator *mut, - enum gc_collection_kind requested_kind) { + enum gc_collection_kind requested_kind, + int requested_by_user) { struct gc_heap *heap = mutator_heap(mut); int prev_kind = -1; gc_stack_capture_hot(&mut->stack); @@ -792,13 +796,13 @@ trigger_collection(struct gc_mutator *mut, while (mutators_are_stopping(heap)) prev_kind = pause_mutator_for_collection(heap, mut); if (prev_kind < (int)requested_kind) - collect(mut, requested_kind); + collect(mut, requested_kind, requested_by_user); heap_unlock(heap); } void gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { - trigger_collection(mut, kind); + trigger_collection(mut, kind, 1); } int* @@ -829,7 +833,7 @@ allocate_large(struct gc_mutator *mut, size_t size) { npages << lospace->page_size_log2); while (!nofl_space_shrink(nofl_space, 0)) - trigger_collection(mut, GC_COLLECTION_COMPACTING); + trigger_collection(mut, GC_COLLECTION_COMPACTING, 0); atomic_fetch_add(&heap->large_object_pages, npages); void *ret = large_object_space_alloc(lospace, npages); @@ -846,7 +850,7 @@ allocate_large(struct gc_mutator *mut, size_t size) { static void collect_for_small_allocation(void *mut) { - trigger_collection(mut, GC_COLLECTION_ANY); + trigger_collection(mut, GC_COLLECTION_ANY, 0); } void* From 4aa5d04f08eef6dd447d717f86b22a0d1576f145 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 1 Oct 2024 13:18:15 +0200 Subject: [PATCH 308/403] Fix sense of "large_object_space_is_copied". * src/large-object-space.h (large_object_space_is_copied): I don't understand why or how it was like it was! 
--- src/large-object-space.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/large-object-space.h b/src/large-object-space.h index d81369f93..987190187 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -145,7 +145,7 @@ static int large_object_space_is_copied(struct large_object_space *space, int copied = 0; uintptr_t addr = gc_ref_value(ref); pthread_mutex_lock(&space->lock); - copied = address_set_contains(&space->from_space, addr); + copied = address_set_contains(&space->to_space, addr); pthread_mutex_unlock(&space->lock); return copied; } From cc0476127124aaf96552308bc36d67e289b8dd68 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 1 Oct 2024 14:36:55 +0200 Subject: [PATCH 309/403] gc_object_set_remembered returns nonzero on success --- api/gc-embedder-api.h | 2 +- benchmarks/simple-gc-embedder.h | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/api/gc-embedder-api.h b/api/gc-embedder-api.h index b176d7bef..7535ea7bc 100644 --- a/api/gc-embedder-api.h +++ b/api/gc-embedder-api.h @@ -54,7 +54,7 @@ GC_EMBEDDER_API inline void gc_trace_heap_roots(struct gc_heap_roots *roots, // are in the remembered set. Large or potentially large objects // (e.g. a vector whose size is a run-time property) must have a // remembered set bit. Small objects may or may not have such a bit. -GC_EMBEDDER_API inline void gc_object_set_remembered(struct gc_ref ref); +GC_EMBEDDER_API inline int gc_object_set_remembered(struct gc_ref ref); GC_EMBEDDER_API inline int gc_object_is_remembered_nonatomic(struct gc_ref ref); GC_EMBEDDER_API inline void gc_object_clear_remembered_nonatomic(struct gc_ref ref); diff --git a/benchmarks/simple-gc-embedder.h b/benchmarks/simple-gc-embedder.h index 683cc15ca..4e5fbb83d 100644 --- a/benchmarks/simple-gc-embedder.h +++ b/benchmarks/simple-gc-embedder.h @@ -102,11 +102,18 @@ static inline void gc_object_forward_nonatomic(struct gc_ref ref, *tag_word(ref) = gc_ref_value(new_ref); } -static inline void gc_object_set_remembered(struct gc_ref ref) { +static inline int gc_object_set_remembered(struct gc_ref ref) { uintptr_t *loc = tag_word(ref); - uintptr_t tag = *loc; - while (!(tag & gcobj_remembered_bit)) - atomic_compare_exchange_weak(loc, &tag, tag | gcobj_remembered_bit); + uintptr_t tag = atomic_load_explicit(loc, memory_order_relaxed); + while (1) { + if (tag & gcobj_remembered_bit) + return 0; + if (atomic_compare_exchange_weak_explicit(loc, &tag, + tag | gcobj_remembered_bit, + memory_order_acq_rel, + memory_order_acquire)) + return 1; + } } static inline int gc_object_is_remembered_nonatomic(struct gc_ref ref) { From 42bf36d7cc10a7985fe37ecea9eeaa576eccad49 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 1 Oct 2024 14:38:08 +0200 Subject: [PATCH 310/403] Add nursery for lospace --- src/large-object-space.h | 123 +++++++++++++++++++++++---------------- src/mmc.c | 10 +++- 2 files changed, 80 insertions(+), 53 deletions(-) diff --git a/src/large-object-space.h b/src/large-object-space.h index 987190187..358826b36 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -35,6 +35,8 @@ struct large_object_space { struct address_set from_space; struct address_set to_space; + struct address_set survivor_space; + struct address_set remembered_set; struct address_set free_space; struct address_map object_pages; // for each object: size in pages. 
struct address_map predecessors; // subsequent addr -> object addr @@ -47,6 +49,8 @@ static int large_object_space_init(struct large_object_space *space, space->page_size_log2 = __builtin_ctz(space->page_size); address_set_init(&space->from_space); address_set_init(&space->to_space); + address_set_init(&space->survivor_space); + address_set_init(&space->remembered_set); address_set_init(&space->free_space); address_map_init(&space->object_pages); address_map_init(&space->predecessors); @@ -63,19 +67,14 @@ large_object_space_size_at_last_collection(struct large_object_space *space) { return space->live_pages_at_last_collection << space->page_size_log2; } -static void large_object_space_clear_one_remembered(uintptr_t addr, - void *unused) { - struct gc_ref ref = gc_ref(addr); - if (gc_object_is_remembered_nonatomic(ref)) - gc_object_clear_remembered_nonatomic(ref); -} - -static void -large_object_space_clear_remembered_set(struct large_object_space *space) { - if (!GC_GENERATIONAL) - return; - address_set_for_each(&space->to_space, - large_object_space_clear_one_remembered, NULL); +static inline int large_object_space_contains(struct large_object_space *space, + struct gc_ref ref) { + pthread_mutex_lock(&space->lock); + // ptr might be in fromspace or tospace. Just check the object_pages table, which + // contains both, as well as object_pages for free blocks. + int ret = address_map_contains(&space->object_pages, gc_ref_value(ref)); + pthread_mutex_unlock(&space->lock); + return ret; } struct large_object_space_trace_remembered_data { @@ -86,11 +85,14 @@ struct large_object_space_trace_remembered_data { static void large_object_space_trace_one_remembered(uintptr_t addr, void *data) { struct gc_ref ref = gc_ref(addr); - if (gc_object_is_remembered_nonatomic(ref)) { - gc_object_clear_remembered_nonatomic(ref); - struct large_object_space_trace_remembered_data *vdata = data; - vdata->trace(ref, vdata->heap); - } + gc_object_clear_remembered_nonatomic(ref); + struct large_object_space_trace_remembered_data *vdata = data; + vdata->trace(ref, vdata->heap); +} + +static void +large_object_space_clear_remembered_set(struct large_object_space *space) { + address_set_clear(&space->remembered_set); } static void @@ -102,22 +104,43 @@ large_object_space_trace_remembered_set(struct large_object_space *space, if (!GC_GENERATIONAL) return; - address_set_for_each(&space->to_space, + address_set_for_each(&space->remembered_set, large_object_space_trace_one_remembered, &vdata); + large_object_space_clear_remembered_set(space); +} + +static void +large_object_space_remember_object(struct large_object_space *space, + struct gc_ref ref) { + GC_ASSERT(GC_GENERATIONAL); + uintptr_t addr = gc_ref_value(ref); + pthread_mutex_lock(&space->lock); + GC_ASSERT(!address_set_contains(&space->remembered_set, addr)); + address_set_add(&space->remembered_set, addr); + pthread_mutex_unlock(&space->lock); +} + +static void large_object_space_flip_survivor(uintptr_t addr, + void *data) { + struct large_object_space *space = data; + address_set_add(&space->from_space, addr); } static void large_object_space_start_gc(struct large_object_space *space, int is_minor_gc) { - if (is_minor_gc) - return; - // Flip. Note that when we flip, fromspace is empty, but it might have // allocated storage, so we do need to do a proper swap. 
struct address_set tmp; memcpy(&tmp, &space->from_space, sizeof(tmp)); memcpy(&space->from_space, &space->to_space, sizeof(tmp)); memcpy(&space->to_space, &tmp, sizeof(tmp)); - space->live_pages_at_last_collection = 0; + + if (!is_minor_gc) { + address_set_for_each(&space->survivor_space, + large_object_space_flip_survivor, space); + address_set_clear(&space->survivor_space); + space->live_pages_at_last_collection = 0; + } } static int large_object_space_copy(struct large_object_space *space, @@ -126,14 +149,16 @@ static int large_object_space_copy(struct large_object_space *space, uintptr_t addr = gc_ref_value(ref); pthread_mutex_lock(&space->lock); if (!address_set_contains(&space->from_space, addr)) - // Already copied; object is grey or white. + // Already copied; object is grey or black. goto done; space->live_pages_at_last_collection += address_map_lookup(&space->object_pages, addr, 0); address_set_remove(&space->from_space, addr); - address_set_add(&space->to_space, addr); - // Object should be placed on mark stack for visiting its fields. (While on - // mark stack it's actually grey, not black.) + address_set_add(GC_GENERATIONAL ? &space->survivor_space : &space->to_space, + addr); + if (GC_GENERATIONAL && gc_object_is_remembered_nonatomic(ref)) + gc_object_clear_remembered_nonatomic(ref); + // Object is grey; place it on mark stack to visit its fields. copied = 1; done: pthread_mutex_unlock(&space->lock); @@ -142,14 +167,26 @@ done: static int large_object_space_is_copied(struct large_object_space *space, struct gc_ref ref) { + GC_ASSERT(large_object_space_contains(space, ref)); int copied = 0; uintptr_t addr = gc_ref_value(ref); pthread_mutex_lock(&space->lock); - copied = address_set_contains(&space->to_space, addr); + copied = !address_set_contains(&space->from_space, addr); pthread_mutex_unlock(&space->lock); return copied; } +static int large_object_space_is_old(struct large_object_space *space, + struct gc_ref ref) { + GC_ASSERT(large_object_space_contains(space, ref)); + int old = 0; + uintptr_t addr = gc_ref_value(ref); + pthread_mutex_lock(&space->lock); + old = address_set_contains(&space->survivor_space, addr); + pthread_mutex_unlock(&space->lock); + return old; +} + static int large_object_space_mark_object(struct large_object_space *space, struct gc_ref ref) { return large_object_space_copy(space, ref); @@ -202,19 +239,13 @@ static void large_object_space_reclaim_one(uintptr_t addr, void *data) { static void large_object_space_finish_gc(struct large_object_space *space, int is_minor_gc) { pthread_mutex_lock(&space->lock); - if (is_minor_gc) { - space->live_pages_at_last_collection = - space->total_pages - space->free_pages; - space->pages_freed_by_last_collection = 0; - } else { - address_set_for_each(&space->from_space, large_object_space_reclaim_one, - space); - address_set_clear(&space->from_space); - size_t free_pages = - space->total_pages - space->live_pages_at_last_collection; - space->pages_freed_by_last_collection = free_pages - space->free_pages; - space->free_pages = free_pages; - } + address_set_for_each(&space->from_space, large_object_space_reclaim_one, + space); + address_set_clear(&space->from_space); + size_t free_pages = + space->total_pages - space->live_pages_at_last_collection; + space->pages_freed_by_last_collection = free_pages - space->free_pages; + space->free_pages = free_pages; pthread_mutex_unlock(&space->lock); } @@ -258,16 +289,6 @@ large_object_space_mark_conservative_ref(struct large_object_space *space, return gc_ref_null(); } -static 
inline int large_object_space_contains(struct large_object_space *space, - struct gc_ref ref) { - pthread_mutex_lock(&space->lock); - // ptr might be in fromspace or tospace. Just check the object_pages table, which - // contains both, as well as object_pages for free blocks. - int ret = address_map_contains(&space->object_pages, gc_ref_value(ref)); - pthread_mutex_unlock(&space->lock); - return ret; -} - struct large_object_space_candidate { struct large_object_space *space; size_t min_npages; diff --git a/src/mmc.c b/src/mmc.c index 8f9326cde..e14346bec 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -641,7 +641,7 @@ enqueue_remembered_object(struct gc_ref ref, struct gc_heap *heap) { static void enqueue_generational_roots(struct gc_heap *heap, enum gc_collection_kind gc_kind) { - // TODO: Add lospace nursery. + if (!GC_GENERATIONAL) return; if (gc_kind == GC_COLLECTION_MINOR) { for (size_t i = 0; i < heap_nofl_space(heap)->nslabs; i++) gc_tracer_add_root(&heap->tracer, gc_root_remembered_slab(i)); @@ -883,8 +883,14 @@ void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { + if (!GC_GENERATIONAL) return; GC_ASSERT(obj_size > gc_allocator_large_threshold()); - gc_object_set_remembered(obj); + struct gc_heap *heap = mutator_heap(mut); + struct large_object_space *space = heap_large_object_space(heap); + if (!large_object_space_is_old(space, obj)) + return; + if (gc_object_set_remembered(obj)) + large_object_space_remember_object(space, obj); } struct gc_ephemeron* From 1493bf6398f53db468c54dd527b92ed62376a388 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 1 Oct 2024 15:44:55 +0200 Subject: [PATCH 311/403] Add gc_object_is_old_generation Will be useful for write barriers. --- api/bdw-attrs.h | 7 +++++++ api/gc-api.h | 32 +++++++++++++++++++++++++++++++- api/gc-attrs.h | 10 ++++++++++ api/mmc-attrs.h | 13 +++++++++++++ api/pcc-attrs.h | 7 +++++++ api/semi-attrs.h | 7 +++++++ src/bdw.c | 5 +++++ src/large-object-space.h | 4 ++-- src/mmc.c | 19 ++++++++++++++++++- src/nofl-space.h | 9 +++++++++ src/pcc.c | 5 +++++ src/semi.c | 5 +++++ 12 files changed, 119 insertions(+), 4 deletions(-) diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index 51b4e72f1..05f7e4cb7 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -40,6 +40,13 @@ static inline int gc_allocator_needs_clear(void) { return 0; } +static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t) { + return GC_OLD_GENERATION_CHECK_NONE; +} +static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { + GC_CRASH(); +} + static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t) { return GC_WRITE_BARRIER_NONE; } diff --git a/api/gc-api.h b/api/gc-api.h index cb1e4e819..8f5565428 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -179,6 +179,33 @@ static inline void* gc_allocate(struct gc_mutator *mut, size_t size) { // FIXME: remove :P GC_API_ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t bytes); +GC_API_ int gc_object_is_old_generation_slow(struct gc_mutator *mut, + struct gc_ref obj) GC_NEVER_INLINE; + +static inline int gc_object_is_old_generation(struct gc_mutator *mut, + struct gc_ref obj, + size_t obj_size) GC_ALWAYS_INLINE; +static inline int gc_object_is_old_generation(struct gc_mutator *mut, + struct gc_ref obj, + size_t obj_size) { + switch (gc_old_generation_check_kind(obj_size)) { + case GC_OLD_GENERATION_CHECK_ALLOC_TABLE: { + size_t alignment = gc_allocator_alloc_table_alignment(); + 
GC_ASSERT(alignment); + uintptr_t addr = gc_ref_value(obj); + uintptr_t base = addr & ~(alignment - 1); + size_t granule_size = gc_allocator_small_granule_size(); + uintptr_t granule = (addr & (alignment - 1)) / granule_size; + uint8_t *byte = (uint8_t*)(base + granule); + return (*byte) & gc_old_generation_check_alloc_table_bit_pattern(); + } + case GC_OLD_GENERATION_CHECK_SLOW: + return gc_object_is_old_generation_slow(mut, obj); + default: + GC_CRASH(); + } +} + GC_API_ void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) GC_NEVER_INLINE; @@ -202,11 +229,14 @@ static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj, return; } case GC_WRITE_BARRIER_FIELD: { + if (!gc_object_is_old_generation(mut, obj, obj_size)) + return; + size_t field_table_alignment = gc_write_barrier_field_table_alignment(); size_t fields_per_byte = gc_write_barrier_field_fields_per_byte(); uint8_t first_bit_pattern = gc_write_barrier_field_first_bit_pattern(); - uintptr_t addr = (uintptr_t) gc_edge_loc(edge); + uintptr_t addr = gc_ref_value(obj); uintptr_t base = addr & ~(field_table_alignment - 1); uintptr_t field = (addr & (field_table_alignment - 1)) / sizeof(uintptr_t); uintptr_t log_byte = field / fields_per_byte; diff --git a/api/gc-attrs.h b/api/gc-attrs.h index 39faceae2..389cb536e 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -27,6 +27,16 @@ static inline uint8_t gc_allocator_alloc_table_end_pattern(void) GC_ALWAYS_INLIN static inline int gc_allocator_needs_clear(void) GC_ALWAYS_INLINE; +enum gc_old_generation_check_kind { + GC_OLD_GENERATION_CHECK_NONE, + GC_OLD_GENERATION_CHECK_ALLOC_TABLE, + GC_OLD_GENERATION_CHECK_SLOW +}; + +static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t obj_size) GC_ALWAYS_INLINE; + +static uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) GC_ALWAYS_INLINE; + enum gc_write_barrier_kind { GC_WRITE_BARRIER_NONE, GC_WRITE_BARRIER_CARD, diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index 6da4b2a68..f527c127b 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -40,6 +40,19 @@ static inline int gc_allocator_needs_clear(void) { return 0; } +static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t obj_size) { + if (GC_GENERATIONAL) { + if (obj_size <= gc_allocator_large_threshold()) + return GC_OLD_GENERATION_CHECK_ALLOC_TABLE; + return GC_OLD_GENERATION_CHECK_SLOW; + } + return GC_OLD_GENERATION_CHECK_NONE; +} +static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { + // The three mark bits. 
+ return 2 + 4 + 8; +} + static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { if (GC_GENERATIONAL) { if (obj_size <= gc_allocator_large_threshold()) diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index 5dd38c42f..c86d79471 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -43,6 +43,13 @@ static inline int gc_allocator_needs_clear(void) { return 0; } +static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t) { + return GC_OLD_GENERATION_CHECK_NONE; +} +static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { + GC_CRASH(); +} + static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { return GC_WRITE_BARRIER_NONE; } diff --git a/api/semi-attrs.h b/api/semi-attrs.h index 94e2dc814..997b031ee 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -42,6 +42,13 @@ static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { GC_CRASH(); } +static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t) { + return GC_OLD_GENERATION_CHECK_NONE; +} +static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { + GC_CRASH(); +} + static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t) { return GC_WRITE_BARRIER_NONE; } diff --git a/src/bdw.c b/src/bdw.c index d4cbe4c41..d1478d805 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -149,6 +149,11 @@ void gc_collect(struct gc_mutator *mut, } } +int gc_object_is_old_generation_slow(struct gc_mutator *mut, + struct gc_ref obj) { + return 0; +} + void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { diff --git a/src/large-object-space.h b/src/large-object-space.h index 358826b36..323c3f2f8 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -176,8 +176,8 @@ static int large_object_space_is_copied(struct large_object_space *space, return copied; } -static int large_object_space_is_old(struct large_object_space *space, - struct gc_ref ref) { +static int large_object_space_is_survivor(struct large_object_space *space, + struct gc_ref ref) { GC_ASSERT(large_object_space_contains(space, ref)); int old = 0; uintptr_t addr = gc_ref_value(ref); diff --git a/src/mmc.c b/src/mmc.c index e14346bec..ce1571118 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -879,6 +879,23 @@ gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { // Otherwise if it's a large or external object, it won't move. 
} +int gc_object_is_old_generation_slow(struct gc_mutator *mut, + struct gc_ref obj) { + if (!GC_GENERATIONAL) + return 0; + + struct gc_heap *heap = mutator_heap(mut); + struct nofl_space *nofl_space = heap_nofl_space(heap); + if (nofl_space_contains(nofl_space, obj)) + return nofl_space_is_survivor(nofl_space, obj); + + struct large_object_space *lospace = heap_large_object_space(heap); + if (large_object_space_contains(lospace, obj)) + return large_object_space_is_survivor(lospace, obj); + + return 0; +} + void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, @@ -887,7 +904,7 @@ gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, GC_ASSERT(obj_size > gc_allocator_large_threshold()); struct gc_heap *heap = mutator_heap(mut); struct large_object_space *space = heap_large_object_space(heap); - if (!large_object_space_is_old(space, obj)) + if (!large_object_space_is_survivor(space, obj)) return; if (gc_object_set_remembered(obj)) large_object_space_remember_object(space, obj); diff --git a/src/nofl-space.h b/src/nofl-space.h index 93904aee5..e47162853 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1431,6 +1431,15 @@ nofl_space_pin_object(struct nofl_space *space, struct gc_ref ref) { memory_order_acquire)); } +static inline int +nofl_space_is_survivor(struct nofl_space *space, struct gc_ref ref) { + uint8_t *metadata = nofl_metadata_byte_for_object(ref); + uint8_t mask = NOFL_METADATA_BYTE_MARK_0 + | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + uint8_t byte = atomic_load_explicit(metadata, memory_order_relaxed); + return byte & mask; +} + static inline int nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, struct gc_edge edge, diff --git a/src/pcc.c b/src/pcc.c index 10d5f8382..593f43fc6 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -483,6 +483,11 @@ void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { GC_CRASH(); } +int gc_object_is_old_generation_slow(struct gc_mutator *mut, + struct gc_ref obj) { + return 0; +} + void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { diff --git a/src/semi.c b/src/semi.c index aab5af233..ca7a31607 100644 --- a/src/semi.c +++ b/src/semi.c @@ -453,6 +453,11 @@ void gc_collect(struct gc_mutator *mut, collect(mut, 0); } +int gc_object_is_old_generation_slow(struct gc_mutator *mut, + struct gc_ref obj) { + return 0; +} + void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { From 1ecb45a437df45ae2ae894639e91e999c9c50813 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 2 Oct 2024 21:25:09 +0200 Subject: [PATCH 312/403] Switch mmc to field-logging write barrier Instead of the card table, use metadata bytes for field-logging. More precision should lead to less work during the pause. 
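A back-of-the-envelope comparison of the two barriers' precision, using the
256-byte card size from mmc-attrs.h and pointer-sized fields on a 64-bit
target; the numbers illustrate why per-field logging should mean less
re-scanning during the pause, and are not a measurement.

#include <stdio.h>

int main(void) {
  int card_size = 256;   /* bytes covered by one dirty card (mmc-attrs.h) */
  int field_size = 8;    /* one pointer-sized field */

  /* Card marking: one old-to-young store dirties a whole card, and the next
     minor GC has to re-scan every field the card covers. */
  printf("fields re-scanned per dirty card:   %d\n", card_size / field_size);

  /* Field logging: the same store sets one logged bit in the granule's
     metadata byte and records the exact edge, so the minor GC visits that
     one field directly from the sequential-store buffer. */
  printf("fields re-visited per logged store: 1\n");
  return 0;
}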
--- api/mmc-attrs.h | 4 +- src/field-set.h | 205 +++++++++++++++++++++++++++++++++++++++ src/large-object-space.h | 77 ++++++--------- src/mmc.c | 77 +++++++++------ src/nofl-space.h | 184 ++++++++++++++++++----------------- src/root.h | 20 ++-- 6 files changed, 385 insertions(+), 182 deletions(-) create mode 100644 src/field-set.h diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index f527c127b..5d4dcb490 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -56,7 +56,7 @@ static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { if (GC_GENERATIONAL) { if (obj_size <= gc_allocator_large_threshold()) - return GC_WRITE_BARRIER_CARD; + return GC_WRITE_BARRIER_FIELD; return GC_WRITE_BARRIER_SLOW; } return GC_WRITE_BARRIER_NONE; @@ -71,7 +71,7 @@ static inline size_t gc_write_barrier_card_size(void) { } static inline size_t gc_write_barrier_field_table_alignment(void) { GC_ASSERT(GC_GENERATIONAL); - return 4 * 1024 * 1024; + return gc_allocator_alloc_table_alignment(); } static inline size_t gc_write_barrier_field_fields_per_byte(void) { GC_ASSERT(GC_GENERATIONAL); diff --git a/src/field-set.h b/src/field-set.h new file mode 100644 index 000000000..c7ddffd08 --- /dev/null +++ b/src/field-set.h @@ -0,0 +1,205 @@ +#ifndef FIELD_SET_H +#define FIELD_SET_H + +#include +#include +#include + +#include "assert.h" +#include "gc-edge.h" +#include "gc-lock.h" +#include "tracer.h" + +#define GC_EDGE_BUFFER_CAPACITY 510 + +struct gc_edge_buffer { + struct gc_edge_buffer *next; + size_t size; + struct gc_edge edges[GC_EDGE_BUFFER_CAPACITY]; +}; + +// Lock-free. +struct gc_edge_buffer_list { + struct gc_edge_buffer *head; +}; + +// With a lock. +struct gc_edge_buffer_stack { + struct gc_edge_buffer_list list; +}; + +struct gc_field_set { + struct gc_edge_buffer_list full; + struct gc_edge_buffer_stack partly_full; + struct gc_edge_buffer_list empty; + size_t count; + pthread_mutex_t lock; +}; + +struct gc_field_set_writer { + struct gc_edge_buffer *buf; + struct gc_field_set *set; +}; + +static void +gc_edge_buffer_list_push(struct gc_edge_buffer_list *list, + struct gc_edge_buffer *buf) { + struct gc_edge_buffer *next = + atomic_load_explicit(&list->head, memory_order_relaxed); + do { + buf->next = next; + } while (!atomic_compare_exchange_weak_explicit(&list->head, &next, buf, + memory_order_acq_rel, + memory_order_acquire)); +} + +static struct gc_edge_buffer* +gc_edge_buffer_list_pop(struct gc_edge_buffer_list *list) { + struct gc_edge_buffer *head = + atomic_load_explicit(&list->head, memory_order_acquire); + struct gc_edge_buffer *next; + do { + if (!head) return NULL; + next = head->next; + } while (!atomic_compare_exchange_weak_explicit(&list->head, &head, next, + memory_order_acq_rel, + memory_order_acquire)); + head->next = NULL; + return head; +} + +static void +gc_edge_buffer_stack_push(struct gc_edge_buffer_stack *stack, + struct gc_edge_buffer *buf, + const struct gc_lock *lock) { + buf->next = stack->list.head; + stack->list.head = buf; +} + +static struct gc_edge_buffer* +gc_edge_buffer_stack_pop(struct gc_edge_buffer_stack *stack, + const struct gc_lock *lock) { + struct gc_edge_buffer *head = stack->list.head; + if (head) { + stack->list.head = head->next; + head->next = NULL; + } + return head; +} + +static void +gc_field_set_init(struct gc_field_set *set) { + memset(set, 0, sizeof(*set)); + pthread_mutex_init(&set->lock, NULL); +} + +static struct gc_edge_buffer* 
+gc_field_set_acquire_buffer(struct gc_field_set *set) { + struct gc_edge_buffer *ret; + + ret = gc_edge_buffer_list_pop(&set->empty); + if (ret) return ret; + + struct gc_lock lock = gc_lock_acquire(&set->lock); + ret = gc_edge_buffer_stack_pop(&set->partly_full, &lock); + gc_lock_release(&lock); + if (ret) return ret; + + // atomic inc count + ret = malloc(sizeof(*ret)); + if (!ret) { + perror("Failed to allocate remembered set"); + GC_CRASH(); + } + memset(ret, 0, sizeof(*ret)); + return ret; +} + +static void +gc_field_set_release_buffer(struct gc_field_set *set, + struct gc_edge_buffer *buf) { + if (buf->size == GC_EDGE_BUFFER_CAPACITY) { + gc_edge_buffer_list_push(&set->full, buf); + } else { + struct gc_lock lock = gc_lock_acquire(&set->lock); + gc_edge_buffer_stack_push(&set->partly_full, buf, &lock); + gc_lock_release(&lock); + } +} + +static void +gc_field_set_add_roots(struct gc_field_set *set, struct gc_tracer *tracer) { + struct gc_edge_buffer *buf; + for (buf = set->partly_full.list.head; buf; buf = buf->next) + gc_tracer_add_root(tracer, gc_root_edge_buffer(buf)); + for (buf = set->full.head; buf; buf = buf->next) + gc_tracer_add_root(tracer, gc_root_edge_buffer(buf)); +} + +static void +gc_field_set_clear(struct gc_field_set *set, + void (*forget_edge)(struct gc_edge, struct gc_heap*), + struct gc_heap *heap) { + struct gc_edge_buffer *partly_full = set->partly_full.list.head; + struct gc_edge_buffer *full = set->full.head; + // Clear the full and partly full sets now so that if a collector + // wanted to it could re-add an edge to the remembered set. + set->partly_full.list.head = NULL; + set->full.head = NULL; + struct gc_edge_buffer *buf; + for (buf = partly_full; buf; buf = buf->next) { + for (size_t i = 0; i < buf->size; i++) + forget_edge(buf->edges[i], heap); + buf->size = 0; + gc_edge_buffer_list_push(&set->empty, buf); + } + for (buf = full; buf; buf = buf->next) { + for (size_t i = 0; i < buf->size; i++) + forget_edge(buf->edges[i], heap); + buf->size = 0; + gc_edge_buffer_list_push(&set->empty, buf); + } +} + +static inline void +gc_field_set_trace_edge_buffer(struct gc_field_set *set, + struct gc_edge_buffer *buf, + void (*tracer_visit)(struct gc_edge, + struct gc_heap*, + void *data), + struct gc_heap *heap, + struct gc_trace_worker *worker) { + for (size_t i = 0; i < buf->size; i++) + tracer_visit(buf->edges[i], heap, worker); +} + +static void +gc_field_set_writer_release_buffer(struct gc_field_set_writer *writer) { + if (writer->buf) { + gc_field_set_release_buffer(writer->set, writer->buf); + writer->buf = NULL; + } +} + +static void +gc_field_set_writer_init(struct gc_field_set_writer *writer, + struct gc_field_set *set) { + writer->set = set; + writer->buf = NULL; +} + +static void +gc_field_set_writer_add_edge(struct gc_field_set_writer *writer, + struct gc_edge edge) { + struct gc_edge_buffer *buf = writer->buf; + if (GC_UNLIKELY(!buf)) + writer->buf = buf = gc_field_set_acquire_buffer(writer->set); + GC_ASSERT(buf->size < GC_EDGE_BUFFER_CAPACITY); + buf->edges[buf->size++] = edge; + if (GC_UNLIKELY(buf->size == GC_EDGE_BUFFER_CAPACITY)) { + gc_edge_buffer_list_push(&writer->set->full, buf); + writer->buf = NULL; + } +} + +#endif // FIELD_SET_H diff --git a/src/large-object-space.h b/src/large-object-space.h index 323c3f2f8..4c7277797 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -36,7 +36,7 @@ struct large_object_space { struct address_set from_space; struct address_set to_space; struct address_set survivor_space; - struct 
address_set remembered_set; + struct address_set remembered_edges; struct address_set free_space; struct address_map object_pages; // for each object: size in pages. struct address_map predecessors; // subsequent addr -> object addr @@ -50,7 +50,7 @@ static int large_object_space_init(struct large_object_space *space, address_set_init(&space->from_space); address_set_init(&space->to_space); address_set_init(&space->survivor_space); - address_set_init(&space->remembered_set); + address_set_init(&space->remembered_edges); address_set_init(&space->free_space); address_map_init(&space->object_pages); address_map_init(&space->predecessors); @@ -77,49 +77,6 @@ static inline int large_object_space_contains(struct large_object_space *space, return ret; } -struct large_object_space_trace_remembered_data { - void (*trace)(struct gc_ref, struct gc_heap*); - struct gc_heap *heap; -}; - -static void large_object_space_trace_one_remembered(uintptr_t addr, - void *data) { - struct gc_ref ref = gc_ref(addr); - gc_object_clear_remembered_nonatomic(ref); - struct large_object_space_trace_remembered_data *vdata = data; - vdata->trace(ref, vdata->heap); -} - -static void -large_object_space_clear_remembered_set(struct large_object_space *space) { - address_set_clear(&space->remembered_set); -} - -static void -large_object_space_trace_remembered_set(struct large_object_space *space, - void (*trace)(struct gc_ref, - struct gc_heap*), - struct gc_heap *heap) { - struct large_object_space_trace_remembered_data vdata = { trace, heap }; - - if (!GC_GENERATIONAL) - return; - address_set_for_each(&space->remembered_set, - large_object_space_trace_one_remembered, &vdata); - large_object_space_clear_remembered_set(space); -} - -static void -large_object_space_remember_object(struct large_object_space *space, - struct gc_ref ref) { - GC_ASSERT(GC_GENERATIONAL); - uintptr_t addr = gc_ref_value(ref); - pthread_mutex_lock(&space->lock); - GC_ASSERT(!address_set_contains(&space->remembered_set, addr)); - address_set_add(&space->remembered_set, addr); - pthread_mutex_unlock(&space->lock); -} - static void large_object_space_flip_survivor(uintptr_t addr, void *data) { struct large_object_space *space = data; @@ -176,17 +133,41 @@ static int large_object_space_is_copied(struct large_object_space *space, return copied; } +static int +large_object_space_is_survivor_with_lock(struct large_object_space *space, + struct gc_ref ref) { + return address_set_contains(&space->survivor_space, gc_ref_value(ref)); +} + static int large_object_space_is_survivor(struct large_object_space *space, struct gc_ref ref) { GC_ASSERT(large_object_space_contains(space, ref)); - int old = 0; - uintptr_t addr = gc_ref_value(ref); pthread_mutex_lock(&space->lock); - old = address_set_contains(&space->survivor_space, addr); + int old = large_object_space_is_survivor_with_lock(space, ref); pthread_mutex_unlock(&space->lock); return old; } +static int large_object_space_remember_edge(struct large_object_space *space, + struct gc_ref obj, + struct gc_edge edge) { + int remembered = 0; + uintptr_t edge_addr = (uintptr_t)gc_edge_loc(edge); + pthread_mutex_lock(&space->lock); + if (large_object_space_is_survivor_with_lock(space, obj) + && !address_set_contains(&space->remembered_edges, edge_addr)) { + address_set_add(&space->remembered_edges, edge_addr); + remembered = 1; + } + pthread_mutex_unlock(&space->lock); + return remembered; +} + +static void +large_object_space_clear_remembered_edges(struct large_object_space *space) { + 
address_set_clear(&space->remembered_edges); +} + static int large_object_space_mark_object(struct large_object_space *space, struct gc_ref ref) { return large_object_space_copy(space, ref); diff --git a/src/mmc.c b/src/mmc.c index ce1571118..b1f4238a7 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -11,6 +11,7 @@ #include "background-thread.h" #include "debug.h" +#include "field-set.h" #include "gc-align.h" #include "gc-inline.h" #include "gc-platform.h" @@ -33,6 +34,7 @@ struct gc_heap { struct nofl_space nofl_space; struct large_object_space large_object_space; struct gc_extern_space *extern_space; + struct gc_field_set remembered_set; size_t large_object_pages; pthread_mutex_t lock; pthread_cond_t collector_cond; @@ -72,6 +74,7 @@ struct gc_heap { struct gc_mutator { struct nofl_allocator allocator; + struct gc_field_set_writer logger; struct gc_heap *heap; struct gc_stack stack; struct gc_mutator_roots *roots; @@ -188,6 +191,7 @@ add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->event_listener_data = heap->event_listener.mutator_added(heap->event_listener_data); nofl_allocator_reset(&mut->allocator); + gc_field_set_writer_init(&mut->logger, &heap->remembered_set); heap_lock(heap); // We have no roots. If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. @@ -207,6 +211,8 @@ add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); + if (GC_GENERATIONAL) + gc_field_set_writer_release_buffer(&mut->logger); MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; heap_lock(heap); @@ -360,12 +366,9 @@ trace_root(struct gc_root root, struct gc_heap *heap, case GC_ROOT_KIND_EDGE: tracer_visit(root.edge, heap, worker); break; - case GC_ROOT_KIND_REMEMBERED_OBJECT: - trace_one(root.ref, heap, worker); - break; - case GC_ROOT_KIND_REMEMBERED_SLAB: - nofl_space_trace_remembered_slab(heap_nofl_space(heap), root.idx, - trace_one, heap, worker); + case GC_ROOT_KIND_EDGE_BUFFER: + gc_field_set_trace_edge_buffer(&heap->remembered_set, root.edge_buffer, + tracer_visit, heap, worker); break; default: GC_CRASH(); @@ -633,25 +636,27 @@ enqueue_root_edge(struct gc_edge edge, struct gc_heap *heap, void *unused) { gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); } -static void -enqueue_remembered_object(struct gc_ref ref, struct gc_heap *heap) { - gc_tracer_add_root(&heap->tracer, gc_root_remembered_object(ref)); -} - static void enqueue_generational_roots(struct gc_heap *heap, enum gc_collection_kind gc_kind) { if (!GC_GENERATIONAL) return; - if (gc_kind == GC_COLLECTION_MINOR) { - for (size_t i = 0; i < heap_nofl_space(heap)->nslabs; i++) - gc_tracer_add_root(&heap->tracer, gc_root_remembered_slab(i)); - large_object_space_trace_remembered_set(heap_large_object_space(heap), - enqueue_remembered_object, - heap); - } else { - nofl_space_clear_remembered_set(heap_nofl_space(heap)); - large_object_space_clear_remembered_set(heap_large_object_space(heap)); - } + if (gc_kind == GC_COLLECTION_MINOR) + gc_field_set_add_roots(&heap->remembered_set, &heap->tracer); +} + +static inline void +forget_remembered_edge(struct gc_edge edge, struct gc_heap *heap) { + struct nofl_space *space = heap_nofl_space(heap); + if (nofl_space_contains_edge(space, edge)) + nofl_space_forget_edge(space, edge); + // Otherwise the edge is in the lospace, whose remembered edges are + // cleared in bulk. 
+} + +static void +clear_remembered_set(struct gc_heap *heap) { + gc_field_set_clear(&heap->remembered_set, forget_remembered_edge, heap); + large_object_space_clear_remembered_edges(heap_large_object_space(heap)); } static void @@ -768,6 +773,7 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind, HEAP_EVENT(heap, finalizers_traced); sweep_ephemerons(heap); gc_tracer_release(&heap->tracer); + clear_remembered_set(heap); nofl_space_finish_gc(nofl_space, gc_kind); large_object_space_finish_gc(lospace, is_minor); gc_extern_space_finish_gc(exspace, is_minor); @@ -792,6 +798,8 @@ trigger_collection(struct gc_mutator *mut, int prev_kind = -1; gc_stack_capture_hot(&mut->stack); nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); + if (GC_GENERATIONAL) + gc_field_set_writer_release_buffer(&mut->logger); heap_lock(heap); while (mutators_are_stopping(heap)) prev_kind = pause_mutator_for_collection(heap, mut); @@ -815,6 +823,8 @@ gc_safepoint_slow(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); gc_stack_capture_hot(&mut->stack); nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); + if (GC_GENERATIONAL) + gc_field_set_writer_release_buffer(&mut->logger); heap_lock(heap); while (mutators_are_stopping(mutator_heap(mut))) pause_mutator_for_collection(heap, mut); @@ -900,14 +910,16 @@ void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { + GC_ASSERT(gc_ref_is_heap_object(new_val)); if (!GC_GENERATIONAL) return; - GC_ASSERT(obj_size > gc_allocator_large_threshold()); - struct gc_heap *heap = mutator_heap(mut); - struct large_object_space *space = heap_large_object_space(heap); - if (!large_object_space_is_survivor(space, obj)) + if (gc_object_is_old_generation_slow(mut, new_val)) return; - if (gc_object_set_remembered(obj)) - large_object_space_remember_object(space, obj); + struct gc_heap *heap = mutator_heap(mut); + if ((obj_size <= gc_allocator_large_threshold()) + ? nofl_space_remember_edge(heap_nofl_space(heap), obj, edge) + : large_object_space_remember_edge(heap_large_object_space(heap), + obj, edge)) + gc_field_set_writer_add_edge(&mut->logger, edge); } struct gc_ephemeron* @@ -1032,6 +1044,7 @@ static int heap_init(struct gc_heap *heap, const struct gc_options *options) { // *heap is already initialized to 0. 
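// In sketch form, the condition under which the slow path above ends up
// remembering an edge; the helper name is illustrative, and the real logic
// is split across gc_write_barrier_fast, gc_write_barrier_slow and the two
// spaces' remember_edge functions, which also deduplicate logged edges:
static int
edge_would_be_remembered(struct gc_mutator *mut, struct gc_ref obj,
                         struct gc_ref new_val) {
  return gc_object_is_old_generation_slow(mut, obj)        // source is old...
      && !gc_object_is_old_generation_slow(mut, new_val);  // ...target is new
}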
+ gc_field_set_init(&heap->remembered_set); pthread_mutex_init(&heap->lock, NULL); pthread_cond_init(&heap->mutator_cond, NULL); pthread_cond_init(&heap->collector_cond, NULL); @@ -1080,9 +1093,11 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(), NOFL_METADATA_BYTE_YOUNG); GC_ASSERT_EQ(gc_allocator_alloc_table_end_pattern(), NOFL_METADATA_BYTE_END); if (GC_GENERATIONAL) { - GC_ASSERT_EQ(gc_write_barrier_card_table_alignment(), NOFL_SLAB_SIZE); - GC_ASSERT_EQ(gc_write_barrier_card_size(), - NOFL_BLOCK_SIZE / NOFL_REMSET_BYTES_PER_BLOCK); + GC_ASSERT_EQ(gc_write_barrier_field_table_alignment(), NOFL_SLAB_SIZE); + GC_ASSERT_EQ(gc_write_barrier_field_fields_per_byte(), + NOFL_GRANULE_SIZE / sizeof(uintptr_t)); + GC_ASSERT_EQ(gc_write_barrier_field_first_bit_pattern(), + NOFL_METADATA_BYTE_LOGGED_0); } *heap = calloc(1, sizeof(struct gc_heap)); @@ -1139,6 +1154,8 @@ static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap)); + if (GC_GENERATIONAL) + gc_field_set_writer_release_buffer(&mut->logger); heap_lock(heap); heap->inactive_mutator_count++; gc_stack_capture_hot(&mut->stack); diff --git a/src/nofl-space.h b/src/nofl-space.h index e47162853..9759b3d8e 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -45,10 +45,10 @@ STATIC_ASSERT_EQ(NOFL_MEDIUM_OBJECT_THRESHOLD, #define NOFL_NONMETA_BLOCKS_PER_SLAB (NOFL_BLOCKS_PER_SLAB - NOFL_META_BLOCKS_PER_SLAB) #define NOFL_METADATA_BYTES_PER_SLAB (NOFL_NONMETA_BLOCKS_PER_SLAB * NOFL_METADATA_BYTES_PER_BLOCK) #define NOFL_SLACK_METADATA_BYTES_PER_SLAB (NOFL_META_BLOCKS_PER_SLAB * NOFL_METADATA_BYTES_PER_BLOCK) -#define NOFL_REMSET_BYTES_PER_BLOCK (NOFL_SLACK_METADATA_BYTES_PER_SLAB / NOFL_BLOCKS_PER_SLAB) -#define NOFL_REMSET_BYTES_PER_SLAB (NOFL_REMSET_BYTES_PER_BLOCK * NOFL_NONMETA_BLOCKS_PER_SLAB) -#define NOFL_SLACK_REMSET_BYTES_PER_SLAB (NOFL_REMSET_BYTES_PER_BLOCK * NOFL_META_BLOCKS_PER_SLAB) -#define NOFL_SUMMARY_BYTES_PER_BLOCK (NOFL_SLACK_REMSET_BYTES_PER_SLAB / NOFL_BLOCKS_PER_SLAB) +#define NOFL_VESTIGIAL_BYTES_PER_BLOCK (NOFL_SLACK_METADATA_BYTES_PER_SLAB / NOFL_BLOCKS_PER_SLAB) +#define NOFL_VESTIGIAL_BYTES_PER_SLAB (NOFL_VESTIGIAL_BYTES_PER_BLOCK * NOFL_NONMETA_BLOCKS_PER_SLAB) +#define NOFL_SLACK_VESTIGIAL_BYTES_PER_SLAB (NOFL_VESTIGIAL_BYTES_PER_BLOCK * NOFL_META_BLOCKS_PER_SLAB) +#define NOFL_SUMMARY_BYTES_PER_BLOCK (NOFL_SLACK_VESTIGIAL_BYTES_PER_SLAB / NOFL_BLOCKS_PER_SLAB) #define NOFL_SUMMARY_BYTES_PER_SLAB (NOFL_SUMMARY_BYTES_PER_BLOCK * NONMETA_BLOCKS_PER_SLAB) #define NOFL_SLACK_SUMMARY_BYTES_PER_SLAB (NOFL_SUMMARY_BYTES_PER_BLOCK * NOFL_META_BLOCKS_PER_SLAB) #define NOFL_HEADER_BYTES_PER_SLAB NOFL_SLACK_SUMMARY_BYTES_PER_SLAB @@ -127,7 +127,7 @@ struct nofl_block_ref { struct nofl_slab { struct nofl_slab_header header; struct nofl_block_summary summaries[NOFL_NONMETA_BLOCKS_PER_SLAB]; - uint8_t remembered_set[NOFL_REMSET_BYTES_PER_SLAB]; + uint8_t unused[NOFL_VESTIGIAL_BYTES_PER_SLAB]; uint8_t metadata[NOFL_METADATA_BYTES_PER_SLAB]; struct nofl_block blocks[NOFL_NONMETA_BLOCKS_PER_SLAB]; }; @@ -297,8 +297,6 @@ nofl_block_set_mark(uintptr_t addr) { } #define NOFL_GRANULES_PER_BLOCK (NOFL_BLOCK_SIZE / NOFL_GRANULE_SIZE) -#define NOFL_GRANULES_PER_REMSET_BYTE \ - (NOFL_GRANULES_PER_BLOCK / NOFL_REMSET_BYTES_PER_BLOCK) static struct nofl_block_summary* nofl_block_summary_for_addr(uintptr_t addr) { @@ -909,67 +907,75 @@ 
nofl_space_set_ephemeron_flag(struct gc_ref ref) { struct gc_trace_worker; -// Note that it's quite possible (and even likely) that any given remset -// byte doesn't hold any roots, if all stores were to nursery objects. -STATIC_ASSERT_EQ(NOFL_GRANULES_PER_REMSET_BYTE % 8, 0); -static void -nofl_space_trace_card(struct nofl_space *space, struct nofl_slab *slab, - size_t card, - void (*trace_object)(struct gc_ref, - struct gc_heap*, - struct gc_trace_worker*), - struct gc_heap *heap, - struct gc_trace_worker *worker) { - uintptr_t first_addr_in_slab = (uintptr_t) &slab->blocks[0]; - size_t granule_base = card * NOFL_GRANULES_PER_REMSET_BYTE; - for (size_t granule_in_remset = 0; - granule_in_remset < NOFL_GRANULES_PER_REMSET_BYTE; - granule_in_remset += 8, granule_base += 8) { - uint64_t mark_bytes = load_eight_aligned_bytes(slab->metadata + granule_base); - mark_bytes &= space->sweep_mask; - while (mark_bytes) { - size_t granule_offset = count_zero_bytes(mark_bytes); - mark_bytes &= ~(((uint64_t)0xff) << (granule_offset * 8)); - size_t granule = granule_base + granule_offset; - uintptr_t addr = first_addr_in_slab + granule * NOFL_GRANULE_SIZE; - GC_ASSERT(nofl_metadata_byte_for_addr(addr) == &slab->metadata[granule]); - trace_object(gc_ref(addr), heap, worker); - } - } +static inline int +nofl_space_contains_address(struct nofl_space *space, uintptr_t addr) { + return extents_contain_addr(space->extents, addr); +} + +static inline int +nofl_space_contains_conservative_ref(struct nofl_space *space, + struct gc_conservative_ref ref) { + return nofl_space_contains_address(space, gc_conservative_ref_value(ref)); +} + +static inline int +nofl_space_contains(struct nofl_space *space, struct gc_ref ref) { + return nofl_space_contains_address(space, gc_ref_value(ref)); +} + +static inline int +nofl_space_contains_edge(struct nofl_space *space, struct gc_edge edge) { + return nofl_space_contains_address(space, (uintptr_t)gc_edge_loc(edge)); +} + +static inline int +nofl_space_is_survivor(struct nofl_space *space, struct gc_ref ref) { + uint8_t *metadata = nofl_metadata_byte_for_object(ref); + uint8_t mask = NOFL_METADATA_BYTE_MARK_0 + | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + uint8_t byte = atomic_load_explicit(metadata, memory_order_relaxed); + return byte & mask; +} + +static uint8_t* +nofl_field_logged_byte(struct gc_edge edge) { + return nofl_metadata_byte_for_addr((uintptr_t)gc_edge_loc(edge)); +} + +static uint8_t +nofl_field_logged_bit(struct gc_edge edge) { + GC_ASSERT_EQ(sizeof(uintptr_t) * 2, NOFL_GRANULE_SIZE); + size_t field = ((uintptr_t)gc_edge_loc(edge)) / sizeof(uintptr_t); + return NOFL_METADATA_BYTE_LOGGED_0 << (field % 2); +} + +static int +nofl_space_remember_edge(struct nofl_space *space, struct gc_ref obj, + struct gc_edge edge) { + GC_ASSERT(nofl_space_contains(space, obj)); + if (!GC_GENERATIONAL) return 0; + if (!nofl_space_is_survivor(space, obj)) + return 0; + uint8_t* loc = nofl_field_logged_byte(edge); + uint8_t bit = nofl_field_logged_bit(edge); + uint8_t byte = atomic_load_explicit(loc, memory_order_acquire); + do { + if (byte & bit) return 0; + } while (!atomic_compare_exchange_weak_explicit(loc, &byte, byte|bit, + memory_order_acq_rel, + memory_order_acquire)); + return 1; } static void -nofl_space_trace_remembered_slab(struct nofl_space *space, - size_t slab_idx, - void (*trace_object)(struct gc_ref, - struct gc_heap*, - struct gc_trace_worker*), - struct gc_heap *heap, - struct gc_trace_worker *worker) { - GC_ASSERT(slab_idx < space->nslabs); - 
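// A sketch of how the per-field log reads back; the helper name is
// illustrative.  Each two-word granule (16 bytes on 64-bit targets) shares
// one metadata byte, so its two pointer-sized fields map onto the two bits
// NOFL_METADATA_BYTE_LOGGED_0 and NOFL_METADATA_BYTE_LOGGED_1.
static int
nofl_field_is_logged(struct gc_edge edge) {
  uint8_t *loc = nofl_field_logged_byte(edge);
  uint8_t bit = nofl_field_logged_bit(edge);  // LOGGED_0 for the even word,
                                              // LOGGED_1 for the odd word
  return atomic_load_explicit(loc, memory_order_relaxed) & bit;
}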
struct nofl_slab *slab = space->slabs[slab_idx]; - uint8_t *remset = slab->remembered_set; - for (size_t card_base = 0; - card_base < NOFL_REMSET_BYTES_PER_SLAB; - card_base += 8) { - uint64_t remset_bytes = load_eight_aligned_bytes(remset + card_base); - if (!remset_bytes) continue; - memset(remset + card_base, 0, 8); - while (remset_bytes) { - size_t card_offset = count_zero_bytes(remset_bytes); - remset_bytes &= ~(((uint64_t)0xff) << (card_offset * 8)); - nofl_space_trace_card(space, slab, card_base + card_offset, - trace_object, heap, worker); - } - } -} - -static void -nofl_space_clear_remembered_set(struct nofl_space *space) { - if (!GC_GENERATIONAL) return; - for (size_t slab = 0; slab < space->nslabs; slab++) { - memset(space->slabs[slab]->remembered_set, 0, NOFL_REMSET_BYTES_PER_SLAB); - } +nofl_space_forget_edge(struct nofl_space *space, struct gc_edge edge) { + GC_ASSERT(nofl_space_contains_edge(space, edge)); + GC_ASSERT(GC_GENERATIONAL); + uint8_t* loc = nofl_field_logged_byte(edge); + // Clear both logged bits. + uint8_t bits = NOFL_METADATA_BYTE_LOGGED_0 | NOFL_METADATA_BYTE_LOGGED_1; + uint8_t byte = atomic_load_explicit(loc, memory_order_acquire); + atomic_store_explicit(loc, byte & ~bits, memory_order_release); } static void @@ -1431,13 +1437,29 @@ nofl_space_pin_object(struct nofl_space *space, struct gc_ref ref) { memory_order_acquire)); } -static inline int -nofl_space_is_survivor(struct nofl_space *space, struct gc_ref ref) { - uint8_t *metadata = nofl_metadata_byte_for_object(ref); - uint8_t mask = NOFL_METADATA_BYTE_MARK_0 - | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; - uint8_t byte = atomic_load_explicit(metadata, memory_order_relaxed); - return byte & mask; +static inline void +clear_logged_bits_in_evacuated_object(uint8_t *metadata, size_t count) { + // On a major collection, it could be that we evacuate an object that + // has one or more fields in the old-to-new remembered set. Because + // the young generation is empty after a major collection, we know the + // old-to-new remembered set will be empty also. To clear the + // remembered set, we call gc_field_set_clear, which will end up + // visiting all remembered edges and clearing their logged bits. But + // that doesn't work for evacuated objects, because their edges move: + // gc_field_set_clear will frob the pre-evacuation metadata bytes of + // the object. So here we explicitly clear logged bits for evacuated + // objects. That the bits for the pre-evacuation location are also + // frobbed by gc_field_set_clear doesn't cause a problem, as that + // memory will be swept and cleared later. + // + // This concern doesn't apply to minor collections: there we will + // never evacuate an object in the remembered set, because old objects + // aren't traced during a minor collection. + uint8_t mask = NOFL_METADATA_BYTE_LOGGED_0 | NOFL_METADATA_BYTE_LOGGED_1; + for (size_t i = 0; i < count; i++) { + if (metadata[i] & mask) + metadata[i] &= ~mask; + } } static inline int @@ -1472,6 +1494,8 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, // the object's fields need to be traced. 
uint8_t *new_metadata = nofl_metadata_byte_for_object(new_ref); memcpy(new_metadata + 1, metadata + 1, object_granules - 1); + if (GC_GENERATIONAL) + clear_logged_bits_in_evacuated_object(new_metadata, object_granules); gc_edge_update(edge, new_ref); return nofl_space_set_nonempty_mark(space, new_metadata, byte, new_ref); @@ -1523,22 +1547,6 @@ nofl_space_evacuate_or_mark_object(struct nofl_space *space, return nofl_space_set_nonempty_mark(space, metadata, byte, old_ref); } -static inline int -nofl_space_contains_address(struct nofl_space *space, uintptr_t addr) { - return extents_contain_addr(space->extents, addr); -} - -static inline int -nofl_space_contains_conservative_ref(struct nofl_space *space, - struct gc_conservative_ref ref) { - return nofl_space_contains_address(space, gc_conservative_ref_value(ref)); -} - -static inline int -nofl_space_contains(struct nofl_space *space, struct gc_ref ref) { - return nofl_space_contains_address(space, gc_ref_value(ref)); -} - static inline int nofl_space_forward_if_evacuated(struct nofl_space *space, struct gc_edge edge, diff --git a/src/root.h b/src/root.h index 46e019b06..4fc705e61 100644 --- a/src/root.h +++ b/src/root.h @@ -7,6 +7,7 @@ struct gc_ephemeron; struct gc_heap; struct gc_mutator; +struct gc_edge_buffer; enum gc_root_kind { GC_ROOT_KIND_NONE, @@ -16,8 +17,7 @@ enum gc_root_kind { GC_ROOT_KIND_CONSERVATIVE_POSSIBLY_INTERIOR_EDGES, GC_ROOT_KIND_RESOLVED_EPHEMERONS, GC_ROOT_KIND_EDGE, - GC_ROOT_KIND_REMEMBERED_OBJECT, - GC_ROOT_KIND_REMEMBERED_SLAB, + GC_ROOT_KIND_EDGE_BUFFER, }; struct gc_root { @@ -28,8 +28,7 @@ struct gc_root { struct gc_ephemeron *resolved_ephemerons; struct extent_range range; struct gc_edge edge; - struct gc_ref ref; - size_t idx; + struct gc_edge_buffer *edge_buffer; }; }; @@ -73,16 +72,9 @@ gc_root_edge(struct gc_edge edge) { } static inline struct gc_root -gc_root_remembered_object(struct gc_ref ref) { - struct gc_root ret = { GC_ROOT_KIND_REMEMBERED_OBJECT }; - ret.ref = ref; - return ret; -} - -static inline struct gc_root -gc_root_remembered_slab(size_t idx) { - struct gc_root ret = { GC_ROOT_KIND_REMEMBERED_SLAB }; - ret.idx = idx; +gc_root_edge_buffer(struct gc_edge_buffer *buf) { + struct gc_root ret = { GC_ROOT_KIND_EDGE_BUFFER }; + ret.edge_buffer = buf; return ret; } From 095515eaed34a0e494a1d75a69bd98580dbf121f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 2 Oct 2024 21:36:33 +0200 Subject: [PATCH 313/403] Rework write barrier fast/slow paths --- api/gc-api.h | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/api/gc-api.h b/api/gc-api.h index 8f5565428..3edc80862 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -210,15 +210,15 @@ GC_API_ void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) GC_NEVER_INLINE; -static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj, - size_t obj_size, struct gc_edge edge, - struct gc_ref new_val) GC_ALWAYS_INLINE; -static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj, - size_t obj_size, struct gc_edge edge, - struct gc_ref new_val) { +static inline int gc_write_barrier_fast(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) GC_ALWAYS_INLINE; +static inline int gc_write_barrier_fast(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) { switch (gc_write_barrier_kind(obj_size)) { 
case GC_WRITE_BARRIER_NONE: - return; + return 0; case GC_WRITE_BARRIER_CARD: { size_t card_table_alignment = gc_write_barrier_card_table_alignment(); size_t card_size = gc_write_barrier_card_size(); @@ -226,11 +226,11 @@ static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj, uintptr_t base = addr & ~(card_table_alignment - 1); uintptr_t card = (addr & (card_table_alignment - 1)) / card_size; atomic_store_explicit((uint8_t*)(base + card), 1, memory_order_relaxed); - return; + return 0; } case GC_WRITE_BARRIER_FIELD: { if (!gc_object_is_old_generation(mut, obj, obj_size)) - return; + return 0; size_t field_table_alignment = gc_write_barrier_field_table_alignment(); size_t fields_per_byte = gc_write_barrier_field_fields_per_byte(); @@ -243,18 +243,25 @@ static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj, uint8_t log_bit = first_bit_pattern << (field % fields_per_byte); uint8_t *byte_loc = (uint8_t*)(base + log_byte); uint8_t byte = atomic_load_explicit(byte_loc, memory_order_relaxed); - if (!(byte & log_bit)) - gc_write_barrier_slow(mut, obj, obj_size, edge, new_val); - return; + return byte & log_bit; } case GC_WRITE_BARRIER_SLOW: - gc_write_barrier_slow(mut, obj, obj_size, edge, new_val); - return; + return 1; default: GC_CRASH(); } } +static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) GC_ALWAYS_INLINE; +static inline void gc_write_barrier(struct gc_mutator *mut, struct gc_ref obj, + size_t obj_size, struct gc_edge edge, + struct gc_ref new_val) { + if (GC_UNLIKELY(gc_write_barrier_fast(mut, obj, obj_size, edge, new_val))) + gc_write_barrier_slow(mut, obj, obj_size, edge, new_val); +} + GC_API_ void gc_pin_object(struct gc_mutator *mut, struct gc_ref obj); GC_API_ void gc_safepoint_slow(struct gc_mutator *mut) GC_NEVER_INLINE; From 10017daa0c8030327cdd1214b1f57ba81503a20c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 3 Oct 2024 10:05:07 +0200 Subject: [PATCH 314/403] Inline set_field in mt-gcbench --- benchmarks/mt-gcbench.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/mt-gcbench.c b/benchmarks/mt-gcbench.c index 7f342fe90..6494c22d2 100644 --- a/benchmarks/mt-gcbench.c +++ b/benchmarks/mt-gcbench.c @@ -144,8 +144,8 @@ static void allocate_garbage(struct thread *t) { } } -static void set_field(struct gc_mutator *mut, Node *obj, - Node **field, Node *val) { +static inline void set_field(struct gc_mutator *mut, Node *obj, + Node **field, Node *val) { gc_write_barrier(mut, gc_ref_from_heap_object(obj), sizeof(Node), gc_edge(field), gc_ref_from_heap_object(val)); From b5c36b9fd857674f951dcd4892d98ddd8f429d36 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 11:40:09 +0200 Subject: [PATCH 315/403] Explicitly support immediate values Because we have to deref edges ourselves, as part of generational marking, we need to ignore edges that don't point to heap objects. 
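As a sketch of the embedder convention this assumes (the fixnum helpers
below are illustrative, not part of the API): a runtime that low-bit-tags
small integers can hand such values to the collector as immediates, and the
tracer now recognizes them and leaves them alone rather than treating them
as heap objects.

  static inline struct gc_ref fixnum_to_gc_ref(intptr_t n) {
    // Any value with the low tag bit set satisfies gc_ref_immediate's
    // alignment check and can never be confused with a heap pointer.
    return gc_ref_immediate(((uintptr_t)n << 1) | 1);
  }
  static inline int gc_ref_is_fixnum(struct gc_ref ref) {
    return !gc_ref_is_null(ref) && gc_ref_is_immediate(ref)
      && (gc_ref_value(ref) & 1);
  }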
--- api/gc-config.h | 4 ++++ api/gc-ref.h | 15 ++++++++++++++- src/field-set.h | 12 ++++++------ src/gc-finalizer.c | 5 ++--- src/mmc.c | 28 +++++++++++++++------------- src/nofl-space.h | 2 +- src/parallel-tracer.h | 8 ++++---- src/pcc.c | 10 ++++++---- src/semi.c | 5 ++++- src/serial-tracer.h | 2 +- 10 files changed, 57 insertions(+), 34 deletions(-) diff --git a/api/gc-config.h b/api/gc-config.h index 91dd555e2..ca1b38d14 100644 --- a/api/gc-config.h +++ b/api/gc-config.h @@ -5,6 +5,10 @@ #define GC_DEBUG 0 #endif +#ifndef GC_HAS_IMMEDIATES +#define GC_HAS_IMMEDIATES 1 +#endif + #ifndef GC_PARALLEL #define GC_PARALLEL 0 #endif diff --git a/api/gc-ref.h b/api/gc-ref.h index 33ac5e73b..29e1a3853 100644 --- a/api/gc-ref.h +++ b/api/gc-ref.h @@ -2,6 +2,7 @@ #define GC_REF_H #include "gc-assert.h" +#include "gc-config.h" #include @@ -19,8 +20,20 @@ static inline uintptr_t gc_ref_value(struct gc_ref ref) { static inline struct gc_ref gc_ref_null(void) { return gc_ref(0); } +static inline int gc_ref_is_null(struct gc_ref ref) { + return ref.value == 0; +} +static inline int gc_ref_is_immediate(struct gc_ref ref) { + GC_ASSERT(!gc_ref_is_null(ref)); + return GC_HAS_IMMEDIATES && (ref.value & (sizeof(void*) - 1)); +} +static inline struct gc_ref gc_ref_immediate(uintptr_t val) { + GC_ASSERT(val & (sizeof(void*) - 1)); + GC_ASSERT(GC_HAS_IMMEDIATES); + return gc_ref(val); +} static inline int gc_ref_is_heap_object(struct gc_ref ref) { - return ref.value != 0; + return !gc_ref_is_immediate(ref); } static inline struct gc_ref gc_ref_from_heap_object_or_null(void *obj) { return gc_ref((uintptr_t) obj); diff --git a/src/field-set.h b/src/field-set.h index c7ddffd08..2c93232c1 100644 --- a/src/field-set.h +++ b/src/field-set.h @@ -162,15 +162,15 @@ gc_field_set_clear(struct gc_field_set *set, } static inline void -gc_field_set_trace_edge_buffer(struct gc_field_set *set, +gc_field_set_visit_edge_buffer(struct gc_field_set *set, struct gc_edge_buffer *buf, - void (*tracer_visit)(struct gc_edge, - struct gc_heap*, - void *data), + void (*visit)(struct gc_edge, + struct gc_heap*, + void *data), struct gc_heap *heap, - struct gc_trace_worker *worker) { + void *data) { for (size_t i = 0; i < buf->size; i++) - tracer_visit(buf->edges[i], heap, worker); + visit(buf->edges[i], heap, data); } static void diff --git a/src/gc-finalizer.c b/src/gc-finalizer.c index c0e5831bf..5365899fc 100644 --- a/src/gc-finalizer.c +++ b/src/gc-finalizer.c @@ -164,8 +164,7 @@ void gc_finalizer_init_internal(struct gc_finalizer *f, // value. if (f->state != FINALIZER_STATE_INIT) GC_CRASH(); - if (gc_ref_is_heap_object(f->object)) - GC_CRASH(); + GC_ASSERT(gc_ref_is_null(f->object)); f->object = object; f->closure = closure; } @@ -179,7 +178,7 @@ void gc_finalizer_attach_internal(struct gc_finalizer_state *state, // value. 
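// A sketch of the three-way split the new predicates establish, and of the
// ordering they require; the function name is illustrative.
// gc_ref_is_immediate asserts a non-null ref, so null must be ruled out
// first -- which is why the tracers test
// gc_ref_is_null(ref) || gc_ref_is_immediate(ref) in that order.
static const char*
gc_ref_kind_name(struct gc_ref ref) {
  if (gc_ref_is_null(ref)) return "null";
  if (gc_ref_is_immediate(ref)) return "immediate";
  return "heap object";  // i.e. gc_ref_is_heap_object(ref)
}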
if (f->state != FINALIZER_STATE_INIT) GC_CRASH(); - if (!gc_ref_is_heap_object(f->object)) + if (gc_ref_is_null(f->object)) GC_CRASH(); f->state = FINALIZER_STATE_ACTIVE; diff --git a/src/mmc.c b/src/mmc.c index b1f4238a7..e5d5a95be 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -121,8 +121,6 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, struct gc_trace_worker_data *data) { - if (!gc_ref_is_heap_object(ref)) - return 0; if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) return nofl_space_evacuate_or_mark_object(heap_nofl_space(heap), edge, ref, &data->allocator); @@ -137,6 +135,9 @@ static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge, struct gc_trace_worker_data *data) { struct gc_ref ref = gc_edge_ref(edge); + if (gc_ref_is_null(ref) || gc_ref_is_immediate(ref)) + return 0; + int is_new = do_trace(heap, edge, ref, data); if (is_new && @@ -150,9 +151,10 @@ trace_edge(struct gc_heap *heap, struct gc_edge edge, int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { struct gc_ref ref = gc_edge_ref(edge); - if (!gc_ref_is_heap_object(ref)) - return 0; - + GC_ASSERT(!gc_ref_is_null(ref)); + if (gc_ref_is_immediate(ref)) + return 1; + GC_ASSERT(gc_ref_is_heap_object(ref)); struct nofl_space *nofl_space = heap_nofl_space(heap); if (GC_LIKELY(nofl_space_contains(nofl_space, ref))) return nofl_space_forward_or_mark_if_traced(nofl_space, edge, ref); @@ -271,11 +273,11 @@ static inline struct gc_ref trace_conservative_ref(struct gc_heap *heap, struct gc_conservative_ref ref, int possibly_interior) { struct gc_ref ret = do_trace_conservative_ref(heap, ref, possibly_interior); - - if (gc_ref_is_heap_object(ret) && - GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, - memory_order_relaxed))) - gc_resolve_pending_ephemerons(ret, heap); + if (!gc_ref_is_null(ret)) { + if (GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons, + memory_order_relaxed))) + gc_resolve_pending_ephemerons(ret, heap); + } return ret; } @@ -286,7 +288,7 @@ tracer_trace_conservative_ref(struct gc_conservative_ref ref, struct gc_trace_worker *worker, int possibly_interior) { struct gc_ref resolved = trace_conservative_ref(heap, ref, possibly_interior); - if (gc_ref_is_heap_object(resolved)) + if (!gc_ref_is_null(resolved)) gc_trace_worker_enqueue(worker, resolved); } @@ -367,7 +369,7 @@ trace_root(struct gc_root root, struct gc_heap *heap, tracer_visit(root.edge, heap, worker); break; case GC_ROOT_KIND_EDGE_BUFFER: - gc_field_set_trace_edge_buffer(&heap->remembered_set, root.edge_buffer, + gc_field_set_visit_edge_buffer(&heap->remembered_set, root.edge_buffer, tracer_visit, heap, worker); break; default: @@ -910,7 +912,7 @@ void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { - GC_ASSERT(gc_ref_is_heap_object(new_val)); + GC_ASSERT(!gc_ref_is_null(new_val)); if (!GC_GENERATIONAL) return; if (gc_object_is_old_generation_slow(mut, new_val)) return; diff --git a/src/nofl-space.h b/src/nofl-space.h index 9759b3d8e..bc5144205 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1483,7 +1483,7 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, size_t object_granules = nofl_space_live_object_granules(metadata); struct gc_ref new_ref = nofl_evacuation_allocate(evacuate, space, object_granules); - if (gc_ref_is_heap_object(new_ref)) { + if 
(!gc_ref_is_null(new_ref)) { // Copy object contents before committing, as we don't know what // part of the object (if any) will be overwritten by the // commit. diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 20d66730f..8115c369d 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -207,7 +207,7 @@ trace_worker_steal_from_any(struct gc_trace_worker *worker, for (size_t i = 0; i < tracer->worker_count; i++) { LOG("tracer #%zu: stealing from #%zu\n", worker->id, worker->steal_id); struct gc_ref obj = tracer_steal_from_worker(tracer, worker->steal_id); - if (gc_ref_is_heap_object(obj)) { + if (!gc_ref_is_null(obj)) { LOG("tracer #%zu: stealing got %p\n", worker->id, gc_ref_heap_object(obj)); return obj; @@ -281,13 +281,13 @@ trace_worker_steal(struct gc_trace_worker *worker) { { LOG("tracer #%zu: trying to pop worker's own deque\n", worker->id); struct gc_ref obj = shared_worklist_try_pop(&worker->shared); - if (gc_ref_is_heap_object(obj)) + if (!gc_ref_is_null(obj)) return obj; } LOG("tracer #%zu: trying to steal\n", worker->id); struct gc_ref obj = trace_worker_steal_from_any(worker, tracer); - if (gc_ref_is_heap_object(obj)) + if (!gc_ref_is_null(obj)) return obj; return gc_ref_null(); @@ -337,7 +337,7 @@ trace_with_data(struct gc_tracer *tracer, ref = local_worklist_pop(&worker->local); } else { ref = trace_worker_steal(worker); - if (!gc_ref_is_heap_object(ref)) + if (gc_ref_is_null(ref)) break; } trace_one(ref, heap, worker); diff --git a/src/pcc.c b/src/pcc.c index 593f43fc6..6b8a55c03 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -103,8 +103,6 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, struct gc_trace_worker_data *data) { - if (!gc_ref_is_heap_object(ref)) - return 0; if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) return copy_space_forward(heap_copy_space(heap), edge, ref, &data->allocator); @@ -117,6 +115,8 @@ static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge, struct gc_trace_worker *worker) { struct gc_ref ref = gc_edge_ref(edge); + if (gc_ref_is_null(ref) || gc_ref_is_immediate(ref)) + return 0; struct gc_trace_worker_data *data = gc_trace_worker_data(worker); int is_new = do_trace(heap, edge, ref, data); @@ -130,8 +130,10 @@ static inline int trace_edge(struct gc_heap *heap, struct gc_edge edge, int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { struct gc_ref ref = gc_edge_ref(edge); - if (!gc_ref_is_heap_object(ref)) - return 0; + GC_ASSERT(!gc_ref_is_null(ref)); + if (gc_ref_is_immediate(ref)) + return 1; + GC_ASSERT(gc_ref_is_heap_object(ref)); if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) return copy_space_forward_if_traced(heap_copy_space(heap), edge, ref); if (large_object_space_contains(heap_large_object_space(heap), ref)) diff --git a/src/semi.c b/src/semi.c index ca7a31607..239367999 100644 --- a/src/semi.c +++ b/src/semi.c @@ -233,7 +233,7 @@ static void visit_external_object(struct gc_heap *heap, static void visit(struct gc_edge edge, struct gc_heap *heap) { struct gc_ref ref = gc_edge_ref(edge); - if (!gc_ref_is_heap_object(ref)) + if (gc_ref_is_null(ref) || gc_ref_is_immediate(ref)) return; if (semi_space_contains(heap_semi_space(heap), ref)) visit_semi_space(heap, heap_semi_space(heap), edge, ref); @@ -250,6 +250,9 @@ gc_heap_pending_ephemerons(struct gc_heap *heap) { int 
gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { struct gc_ref ref = gc_edge_ref(edge); + GC_ASSERT(!gc_ref_is_null(ref)); + if (gc_ref_is_immediate(ref)) + return 1; GC_ASSERT(gc_ref_is_heap_object(ref)); if (semi_space_contains(heap_semi_space(heap), ref)) { uintptr_t forwarded = gc_object_forwarded_nonatomic(ref); diff --git a/src/serial-tracer.h b/src/serial-tracer.h index b9575fddb..a3289e30c 100644 --- a/src/serial-tracer.h +++ b/src/serial-tracer.h @@ -65,7 +65,7 @@ tracer_trace_with_data(struct gc_tracer *tracer, struct gc_heap *heap, if (!tracer->trace_roots_only) { do { struct gc_ref obj = simple_worklist_pop(&tracer->worklist); - if (!gc_ref_is_heap_object(obj)) + if (gc_ref_is_null(obj)) break; trace_one(obj, heap, worker); } while (1); From 1a79c3a4512be1230a64ed7be3117b5108e73326 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 11:41:08 +0200 Subject: [PATCH 316/403] mmc: only serialize root-tracing if there are pinned roots --- src/mmc.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/mmc.c b/src/mmc.c index e5d5a95be..76874510b 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -605,7 +605,7 @@ enqueue_conservative_roots(uintptr_t low, uintptr_t high, gc_root_conservative_edges(low, high, *possibly_interior)); } -static void +static int enqueue_mutator_conservative_roots(struct gc_heap *heap) { if (gc_has_mutator_conservative_roots()) { int possibly_interior = gc_mutator_conservative_roots_may_be_interior(); @@ -614,23 +614,28 @@ enqueue_mutator_conservative_roots(struct gc_heap *heap) { mut = mut->next) gc_stack_visit(&mut->stack, enqueue_conservative_roots, heap, &possibly_interior); + return 1; } + return 0; } -static void +static int enqueue_global_conservative_roots(struct gc_heap *heap) { if (gc_has_global_conservative_roots()) { int possibly_interior = 0; gc_platform_visit_global_conservative_roots (enqueue_conservative_roots, heap, &possibly_interior); + return 1; } + return 0; } -static void +static int enqueue_pinned_roots(struct gc_heap *heap) { GC_ASSERT(!heap_nofl_space(heap)->evacuating); - enqueue_mutator_conservative_roots(heap); - enqueue_global_conservative_roots(heap); + int has_pinned_roots = enqueue_mutator_conservative_roots(heap); + has_pinned_roots |= enqueue_global_conservative_roots(heap); + return has_pinned_roots; } static void @@ -757,9 +762,8 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind, size_t live_bytes = heap->size * (1.0 - yield); HEAP_EVENT(heap, live_data_size, live_bytes); DEBUG("last gc yield: %f; fragmentation: %f\n", yield, fragmentation); - enqueue_pinned_roots(heap); // Eagerly trace pinned roots if we are going to relocate objects. - if (gc_kind == GC_COLLECTION_COMPACTING) + if (enqueue_pinned_roots(heap) && gc_kind == GC_COLLECTION_COMPACTING) gc_tracer_trace_roots(&heap->tracer); // Process the rest of the roots in parallel. This heap event should probably // be removed, as there is no clear cutoff time. @@ -891,8 +895,8 @@ gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { // Otherwise if it's a large or external object, it won't move. 
} -int gc_object_is_old_generation_slow(struct gc_mutator *mut, - struct gc_ref obj) { +int +gc_object_is_old_generation_slow(struct gc_mutator *mut, struct gc_ref obj) { if (!GC_GENERATIONAL) return 0; From 3c8c956f4cb42c8ee984f92f022934a22011fcb8 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 13:49:27 +0200 Subject: [PATCH 317/403] Add gc_edge_address --- api/gc-edge.h | 3 +++ src/large-object-space.h | 2 +- src/nofl-space.h | 6 +++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/api/gc-edge.h b/api/gc-edge.h index 72d7b3e5b..ec487df9d 100644 --- a/api/gc-edge.h +++ b/api/gc-edge.h @@ -16,6 +16,9 @@ static inline struct gc_ref gc_edge_ref(struct gc_edge edge) { static inline struct gc_ref* gc_edge_loc(struct gc_edge edge) { return edge.dst; } +static inline uintptr_t gc_edge_address(struct gc_edge edge) { + return (uintptr_t)gc_edge_loc(edge); +} static inline void gc_edge_update(struct gc_edge edge, struct gc_ref ref) { *edge.dst = ref; } diff --git a/src/large-object-space.h b/src/large-object-space.h index 4c7277797..44f4095f2 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -152,7 +152,7 @@ static int large_object_space_remember_edge(struct large_object_space *space, struct gc_ref obj, struct gc_edge edge) { int remembered = 0; - uintptr_t edge_addr = (uintptr_t)gc_edge_loc(edge); + uintptr_t edge_addr = gc_edge_address(edge); pthread_mutex_lock(&space->lock); if (large_object_space_is_survivor_with_lock(space, obj) && !address_set_contains(&space->remembered_edges, edge_addr)) { diff --git a/src/nofl-space.h b/src/nofl-space.h index bc5144205..7bf818cbe 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -925,7 +925,7 @@ nofl_space_contains(struct nofl_space *space, struct gc_ref ref) { static inline int nofl_space_contains_edge(struct nofl_space *space, struct gc_edge edge) { - return nofl_space_contains_address(space, (uintptr_t)gc_edge_loc(edge)); + return nofl_space_contains_address(space, gc_edge_address(edge)); } static inline int @@ -939,13 +939,13 @@ nofl_space_is_survivor(struct nofl_space *space, struct gc_ref ref) { static uint8_t* nofl_field_logged_byte(struct gc_edge edge) { - return nofl_metadata_byte_for_addr((uintptr_t)gc_edge_loc(edge)); + return nofl_metadata_byte_for_addr(gc_edge_address(edge)); } static uint8_t nofl_field_logged_bit(struct gc_edge edge) { GC_ASSERT_EQ(sizeof(uintptr_t) * 2, NOFL_GRANULE_SIZE); - size_t field = ((uintptr_t)gc_edge_loc(edge)) / sizeof(uintptr_t); + size_t field = gc_edge_address(edge) / sizeof(uintptr_t); return NOFL_METADATA_BYTE_LOGGED_0 << (field % 2); } From e1ae9819cfde51db5f275d2aa678526ad31f4d77 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 13:49:45 +0200 Subject: [PATCH 318/403] gc_object_is_old_generation uses relaxed atomics --- api/gc-api.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/api/gc-api.h b/api/gc-api.h index 3edc80862..2b3f9fcd6 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -196,8 +196,9 @@ static inline int gc_object_is_old_generation(struct gc_mutator *mut, uintptr_t base = addr & ~(alignment - 1); size_t granule_size = gc_allocator_small_granule_size(); uintptr_t granule = (addr & (alignment - 1)) / granule_size; - uint8_t *byte = (uint8_t*)(base + granule); - return (*byte) & gc_old_generation_check_alloc_table_bit_pattern(); + uint8_t *byte_loc = (uint8_t*)(base + granule); + uint8_t byte = atomic_load_explicit(byte_loc, memory_order_relaxed); + return byte & 
gc_old_generation_check_alloc_table_bit_pattern(); } case GC_OLD_GENERATION_CHECK_SLOW: return gc_object_is_old_generation_slow(mut, obj); From 15a51c8a855c1ee52629169eb998758132230f60 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 13:50:01 +0200 Subject: [PATCH 319/403] Fix embarrassing bugs in write buffer fast path Check edge address, not object address, and reverse the sense of the check! --- api/gc-api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/gc-api.h b/api/gc-api.h index 2b3f9fcd6..2efd16ecd 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -237,14 +237,14 @@ static inline int gc_write_barrier_fast(struct gc_mutator *mut, struct gc_ref ob size_t fields_per_byte = gc_write_barrier_field_fields_per_byte(); uint8_t first_bit_pattern = gc_write_barrier_field_first_bit_pattern(); - uintptr_t addr = gc_ref_value(obj); + uintptr_t addr = gc_edge_address(edge); uintptr_t base = addr & ~(field_table_alignment - 1); uintptr_t field = (addr & (field_table_alignment - 1)) / sizeof(uintptr_t); uintptr_t log_byte = field / fields_per_byte; uint8_t log_bit = first_bit_pattern << (field % fields_per_byte); uint8_t *byte_loc = (uint8_t*)(base + log_byte); uint8_t byte = atomic_load_explicit(byte_loc, memory_order_relaxed); - return byte & log_bit; + return !(byte & log_bit); } case GC_WRITE_BARRIER_SLOW: return 1; From 478b9de798a17c0d2ae0852b63d70ddf335b23d0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 13:50:35 +0200 Subject: [PATCH 320/403] Add assertions when pushing edge buffers --- src/field-set.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/field-set.h b/src/field-set.h index 2c93232c1..4facab019 100644 --- a/src/field-set.h +++ b/src/field-set.h @@ -44,6 +44,7 @@ struct gc_field_set_writer { static void gc_edge_buffer_list_push(struct gc_edge_buffer_list *list, struct gc_edge_buffer *buf) { + GC_ASSERT(!buf->next); struct gc_edge_buffer *next = atomic_load_explicit(&list->head, memory_order_relaxed); do { @@ -72,6 +73,7 @@ static void gc_edge_buffer_stack_push(struct gc_edge_buffer_stack *stack, struct gc_edge_buffer *buf, const struct gc_lock *lock) { + GC_ASSERT(!buf->next); buf->next = stack->list.head; stack->list.head = buf; } From 745a5ab5587f306f2fd2961761f909deb70f8b9c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 13:50:57 +0200 Subject: [PATCH 321/403] Don't clear remembered set in non-generational mode --- src/mmc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mmc.c b/src/mmc.c index 76874510b..68a811947 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -662,6 +662,7 @@ forget_remembered_edge(struct gc_edge edge, struct gc_heap *heap) { static void clear_remembered_set(struct gc_heap *heap) { + if (!GC_GENERATIONAL) return; gc_field_set_clear(&heap->remembered_set, forget_remembered_edge, heap); large_object_space_clear_remembered_edges(heap_large_object_space(heap)); } From 6d48e12f788e946feba8777f8c0431ed9ede5aad Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 13:51:17 +0200 Subject: [PATCH 322/403] Add assertions when forgetting nofl edges --- src/nofl-space.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 7bf818cbe..817c10461 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -972,10 +972,18 @@ nofl_space_forget_edge(struct nofl_space *space, struct gc_edge edge) { GC_ASSERT(nofl_space_contains_edge(space, edge)); GC_ASSERT(GC_GENERATIONAL); uint8_t* loc = 
nofl_field_logged_byte(edge); - // Clear both logged bits. - uint8_t bits = NOFL_METADATA_BYTE_LOGGED_0 | NOFL_METADATA_BYTE_LOGGED_1; - uint8_t byte = atomic_load_explicit(loc, memory_order_acquire); - atomic_store_explicit(loc, byte & ~bits, memory_order_release); + if (GC_DEBUG) { + pthread_mutex_lock(&space->lock); + uint8_t bit = nofl_field_logged_bit(edge); + GC_ASSERT(*loc & bit); + *loc &= ~bit; + pthread_mutex_unlock(&space->lock); + } else { + // In release mode, race to clear both bits at once. + uint8_t byte = atomic_load_explicit(loc, memory_order_relaxed); + byte &= ~(NOFL_METADATA_BYTE_LOGGED_0 | NOFL_METADATA_BYTE_LOGGED_1); + atomic_store_explicit(loc, byte, memory_order_relaxed); + } } static void From b4ea55b9c436a320b19ef8ce5c034fe7e6b0e08c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 13:51:27 +0200 Subject: [PATCH 323/403] Don't clear log bits when marking This happens in a post-pass. --- src/nofl-space.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 817c10461..a6e2b2b13 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1408,11 +1408,8 @@ nofl_space_should_evacuate(struct nofl_space *space, uint8_t metadata_byte, static inline int nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { - // Clear logged bits when we mark: after marking, there will be no - // young objects. uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 - | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2 - | NOFL_METADATA_BYTE_LOGGED_0 | NOFL_METADATA_BYTE_LOGGED_1; + | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; atomic_store_explicit(metadata, (byte & ~mask) | space->marked_mask, memory_order_relaxed); From da4f1ec806309bc40d051b8ff00bccf86e099f1d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 13:51:49 +0200 Subject: [PATCH 324/403] Fix bug in which head byte's logged bits were not cleared --- src/nofl-space.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index a6e2b2b13..01f446ad7 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1442,8 +1442,9 @@ nofl_space_pin_object(struct nofl_space *space, struct gc_ref ref) { memory_order_acquire)); } -static inline void -clear_logged_bits_in_evacuated_object(uint8_t *metadata, size_t count) { +static inline uint8_t +clear_logged_bits_in_evacuated_object(uint8_t head, uint8_t *metadata, + size_t count) { // On a major collection, it could be that we evacuate an object that // has one or more fields in the old-to-new remembered set. Because // the young generation is empty after a major collection, we know the @@ -1461,10 +1462,11 @@ clear_logged_bits_in_evacuated_object(uint8_t *metadata, size_t count) { // never evacuate an object in the remembered set, because old objects // aren't traced during a minor collection. 
uint8_t mask = NOFL_METADATA_BYTE_LOGGED_0 | NOFL_METADATA_BYTE_LOGGED_1; - for (size_t i = 0; i < count; i++) { + for (size_t i = 1; i < count; i++) { if (metadata[i] & mask) metadata[i] &= ~mask; } + return head & ~mask; } static inline int @@ -1500,7 +1502,8 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, uint8_t *new_metadata = nofl_metadata_byte_for_object(new_ref); memcpy(new_metadata + 1, metadata + 1, object_granules - 1); if (GC_GENERATIONAL) - clear_logged_bits_in_evacuated_object(new_metadata, object_granules); + byte = clear_logged_bits_in_evacuated_object(byte, new_metadata, + object_granules); gc_edge_update(edge, new_ref); return nofl_space_set_nonempty_mark(space, new_metadata, byte, new_ref); From cff99c75a889fa58cac70a08df8fbef08691121d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 4 Oct 2024 14:00:47 +0200 Subject: [PATCH 325/403] Fix bug clearing field set --- src/field-set.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/field-set.h b/src/field-set.h index 4facab019..ef53f398d 100644 --- a/src/field-set.h +++ b/src/field-set.h @@ -148,14 +148,16 @@ gc_field_set_clear(struct gc_field_set *set, // wanted to it could re-add an edge to the remembered set. set->partly_full.list.head = NULL; set->full.head = NULL; - struct gc_edge_buffer *buf; - for (buf = partly_full; buf; buf = buf->next) { + struct gc_edge_buffer *buf, *next; + for (buf = partly_full; buf; buf = next) { + next = buf->next; for (size_t i = 0; i < buf->size; i++) forget_edge(buf->edges[i], heap); buf->size = 0; gc_edge_buffer_list_push(&set->empty, buf); } - for (buf = full; buf; buf = buf->next) { + for (buf = full; buf; buf = next) { + next = buf->next; for (size_t i = 0; i < buf->size; i++) forget_edge(buf->edges[i], heap); buf->size = 0; From ac016d5f838bc1622ebdd498a55d9a0f3af3806c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 7 Oct 2024 15:00:45 +0200 Subject: [PATCH 326/403] nofl: Fix hole count / size computation for promoted blocks --- src/nofl-space.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/nofl-space.h b/src/nofl-space.h index 01f446ad7..4bf99efdf 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1201,6 +1201,10 @@ static void nofl_space_promote_blocks(struct nofl_space *space) { struct nofl_block_ref block; while (!nofl_block_is_null(block = nofl_block_list_pop(&space->promoted))) { + block.summary->hole_count = 0; + block.summary->hole_granules = 0; + block.summary->holes_with_fragmentation = 0; + block.summary->fragmentation_granules = 0; struct nofl_allocator alloc = { block.addr, block.addr, block }; nofl_allocator_finish_sweeping_in_block(&alloc, space->sweep_mask); atomic_fetch_add(&space->old_generation_granules, From 922c13a1838077e15afce36d939560eb5621810d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 6 Nov 2024 22:32:36 +0100 Subject: [PATCH 327/403] Move mmap / munmap / madvise to gc-platform --- src/copy-space.h | 29 ++++--------------- src/gc-platform-gnu-linux.c | 57 +++++++++++++++++++++++++++++++++++++ src/gc-platform.h | 7 +++++ src/large-object-space.h | 3 +- src/nofl-space.h | 26 ++--------------- src/semi.c | 21 ++++++-------- src/shared-worklist.h | 12 ++++---- src/simple-worklist.h | 11 +++---- 8 files changed, 90 insertions(+), 76 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index d09609dfe..9be2f8d2a 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -3,7 +3,6 @@ #include #include -#include #include "gc-api.h" @@ -18,6 +17,7 @@ #include 
"gc-attrs.h" #include "gc-inline.h" #include "gc-lock.h" +#include "gc-platform.h" #include "spin.h" // A copy space: a block-structured space that traces via evacuation. @@ -620,27 +620,8 @@ copy_space_allocator_finish(struct copy_space_allocator *alloc, static struct copy_space_slab* copy_space_allocate_slabs(size_t nslabs) { - size_t size = nslabs * COPY_SPACE_SLAB_SIZE; - size_t extent = size + COPY_SPACE_SLAB_SIZE; - - char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("mmap failed"); - return NULL; - } - - uintptr_t base = (uintptr_t) mem; - uintptr_t end = base + extent; - uintptr_t aligned_base = align_up(base, COPY_SPACE_SLAB_SIZE); - uintptr_t aligned_end = aligned_base + size; - - if (aligned_base - base) - munmap((void*)base, aligned_base - base); - if (end - aligned_end) - munmap((void*)aligned_end, end - aligned_end); - - return (struct copy_space_slab*) aligned_base; + return gc_platform_acquire_memory(nslabs * COPY_SPACE_SLAB_SIZE, + COPY_SPACE_SLAB_SIZE); } static void @@ -715,8 +696,8 @@ copy_space_page_out_blocks(void *data) { if (!block) break; block->in_core = 0; block->all_zeroes[0] = block->all_zeroes[1] = 1; - madvise(copy_space_block_payload(block), COPY_SPACE_BLOCK_SIZE, - MADV_DONTNEED); + gc_platform_discard_memory(copy_space_block_payload(block), + COPY_SPACE_BLOCK_SIZE); copy_space_block_stack_push(&space->paged_out[age + 1], block, &lock); } gc_lock_release(&lock); diff --git a/src/gc-platform-gnu-linux.c b/src/gc-platform-gnu-linux.c index ebcfa5579..0b267032a 100644 --- a/src/gc-platform-gnu-linux.c +++ b/src/gc-platform-gnu-linux.c @@ -5,12 +5,14 @@ #include #include #include +#include #include #include #define GC_IMPL 1 #include "debug.h" +#include "gc-align.h" #include "gc-assert.h" #include "gc-inline.h" #include "gc-platform.h" @@ -121,3 +123,58 @@ uint64_t gc_platform_monotonic_nanoseconds(void) { uint64_t ns_per_sec = 1000000000; return s * ns_per_sec + ns; } + +size_t gc_platform_page_size(void) { + return getpagesize(); +} + +void* gc_platform_acquire_memory(size_t size, size_t alignment) { + GC_ASSERT_EQ(size, align_down(size, getpagesize())); + GC_ASSERT_EQ(alignment & (alignment - 1), 0); + GC_ASSERT_EQ(alignment, align_down(alignment, getpagesize())); + + size_t extent = size + alignment; + char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("mmap failed"); + return NULL; + } + + uintptr_t base = (uintptr_t) mem; + uintptr_t end = base + extent; + uintptr_t aligned_base = alignment ? 
align_up(base, alignment) : base; + uintptr_t aligned_end = aligned_base + size; + + if (aligned_base - base) + munmap((void*)base, aligned_base - base); + if (end - aligned_end) + munmap((void*)aligned_end, end - aligned_end); + + return (void*) aligned_base; +} + +void gc_platform_release_memory(void *ptr, size_t size) { + GC_ASSERT_EQ((uintptr_t)ptr, align_down((uintptr_t)ptr, getpagesize())); + GC_ASSERT_EQ(size, align_down(size, getpagesize())); + if (munmap(ptr, size) != 0) + perror("failed to unmap memory"); +} + +int gc_platform_populate_memory(void *ptr, size_t size) { + GC_ASSERT_EQ((uintptr_t)ptr, align_down((uintptr_t)ptr, getpagesize())); + GC_ASSERT_EQ(size, align_down(size, getpagesize())); + if (madvise(ptr, size, MADV_WILLNEED) == 0) + return 1; + perror("failed to populate memory"); + return 0; +} + +int gc_platform_discard_memory(void *ptr, size_t size) { + GC_ASSERT_EQ((uintptr_t)ptr, align_down((uintptr_t)ptr, getpagesize())); + GC_ASSERT_EQ(size, align_down(size, getpagesize())); + if (madvise(ptr, size, MADV_DONTNEED) == 0) + return 1; + perror("failed to discard memory"); + return 0; +} diff --git a/src/gc-platform.h b/src/gc-platform.h index 42335ed7a..ddedbb984 100644 --- a/src/gc-platform.h +++ b/src/gc-platform.h @@ -23,4 +23,11 @@ void gc_platform_visit_global_conservative_roots(void (*f)(uintptr_t start, GC_INTERNAL int gc_platform_processor_count(void); GC_INTERNAL uint64_t gc_platform_monotonic_nanoseconds(void); +GC_INTERNAL size_t gc_platform_page_size(void); +GC_INTERNAL void* gc_platform_acquire_memory(size_t size, size_t alignment); +GC_INTERNAL void gc_platform_release_memory(void *base, size_t size); + +GC_INTERNAL int gc_platform_populate_memory(void *addr, size_t size); +GC_INTERNAL int gc_platform_discard_memory(void *addr, size_t size); + #endif // GC_PLATFORM_H diff --git a/src/large-object-space.h b/src/large-object-space.h index 44f4095f2..7aba13d9a 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -321,8 +321,7 @@ static void* large_object_space_obtain_and_alloc(struct large_object_space *space, size_t npages) { size_t bytes = npages * space->page_size; - void *ret = mmap(NULL, bytes, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + void *ret = gc_platform_acquire_memory(bytes, 0); if (ret == MAP_FAILED) return NULL; diff --git a/src/nofl-space.h b/src/nofl-space.h index 4bf99efdf..9e4edf912 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -5,7 +5,6 @@ #include #include #include -#include #include "gc-api.h" @@ -19,6 +18,7 @@ #include "gc-attrs.h" #include "gc-inline.h" #include "gc-lock.h" +#include "gc-platform.h" #include "spin.h" #include "swar.h" @@ -1675,27 +1675,7 @@ nofl_space_object_size(struct nofl_space *space, struct gc_ref ref) { static struct nofl_slab* nofl_allocate_slabs(size_t nslabs) { - size_t size = nslabs * NOFL_SLAB_SIZE; - size_t extent = size + NOFL_SLAB_SIZE; - - char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - perror("mmap failed"); - return NULL; - } - - uintptr_t base = (uintptr_t) mem; - uintptr_t end = base + extent; - uintptr_t aligned_base = align_up(base, NOFL_SLAB_SIZE); - uintptr_t aligned_end = aligned_base + size; - - if (aligned_base - base) - munmap((void*)base, aligned_base - base); - if (end - aligned_end) - munmap((void*)aligned_end, end - aligned_end); - - return (struct nofl_slab*) aligned_base; + return gc_platform_acquire_memory(nslabs * NOFL_SLAB_SIZE, NOFL_SLAB_SIZE); } static void @@ 
-1813,7 +1793,7 @@ nofl_space_page_out_blocks(void *data) { if (nofl_block_is_null(block)) break; nofl_block_set_flag(block, NOFL_BLOCK_ZERO | NOFL_BLOCK_PAGED_OUT); - madvise((void*)block.addr, NOFL_BLOCK_SIZE, MADV_DONTNEED); + gc_platform_discard_memory((void*)block.addr, NOFL_BLOCK_SIZE); nofl_block_stack_push(&space->paged_out[age + 1], block, &lock); } gc_lock_release(&lock); diff --git a/src/semi.c b/src/semi.c index 239367999..725a75f30 100644 --- a/src/semi.c +++ b/src/semi.c @@ -2,8 +2,6 @@ #include #include #include -#include -#include #include "gc-api.h" @@ -100,13 +98,13 @@ static void region_trim_by(struct region *region, size_t newly_unavailable) { GC_ASSERT(newly_unavailable <= region->active_size); region->active_size -= newly_unavailable; - madvise((void*)(region->base + region->active_size), newly_unavailable, - MADV_DONTNEED); + gc_platform_discard_memory((void*)(region->base + region->active_size), + newly_unavailable); } static void region_set_active_size(struct region *region, size_t size) { GC_ASSERT(size <= region->mapped_size); - GC_ASSERT(size == align_up(size, getpagesize())); + GC_ASSERT(size == align_up(size, gc_platform_page_size())); if (size < region->active_size) region_trim_by(region, region->active_size - size); else @@ -274,15 +272,12 @@ static int grow_region_if_needed(struct region *region, size_t new_size) { if (new_size <= region->mapped_size) return 1; - void *mem = mmap(NULL, new_size, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + void *mem = gc_platform_acquire_memory(new_size, 0); DEBUG("new size %zx\n", new_size); - if (mem == MAP_FAILED) { - perror("mmap failed"); + if (!mem) return 0; - } if (region->mapped_size) - munmap((void*)region->base, region->mapped_size); + gc_platform_release_memory((void*)region->base, region->mapped_size); region->base = (uintptr_t)mem; region->active_size = 0; region->mapped_size = new_size; @@ -294,7 +289,7 @@ static void truncate_region(struct region *region, size_t new_size) { size_t bytes = region->mapped_size - new_size; if (bytes) { - munmap((void*)(region->base + new_size), bytes); + gc_platform_release_memory((void*)(region->base + new_size), bytes); region->mapped_size = new_size; if (region->active_size > new_size) region->active_size = new_size; @@ -569,7 +564,7 @@ static int region_init(struct region *region, size_t size) { static int semi_space_init(struct semi_space *space, struct gc_heap *heap) { // Allocate even numbers of pages. 
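// A sketch of the acquire/discard/release lifecycle the new gc-platform
// layer gives each of the spaces above; the wrapper name and the 256-page
// size are illustrative, the gc_platform_* calls are the ones added above.
static void
sketch_platform_memory_lifecycle(void) {
  size_t size = gc_platform_page_size() * 256;
  void *mem = gc_platform_acquire_memory(size, /*alignment*/ 0);
  if (!mem) return;
  gc_platform_discard_memory(mem, size);  // return pages to the OS, keep the mapping
  gc_platform_release_memory(mem, size);  // unmap entirely
}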
- size_t page_size = getpagesize(); + size_t page_size = gc_platform_page_size(); size_t size = align_up(heap->size, page_size * 2); space->page_size = page_size; diff --git a/src/shared-worklist.h b/src/shared-worklist.h index afefb11e2..979c87178 100644 --- a/src/shared-worklist.h +++ b/src/shared-worklist.h @@ -2,13 +2,12 @@ #define SHARED_WORKLIST_H #include -#include -#include #include "assert.h" #include "debug.h" #include "gc-align.h" #include "gc-inline.h" +#include "gc-platform.h" #include "spin.h" // The Chase-Lev work-stealing deque, as initially described in "Dynamic @@ -36,9 +35,8 @@ shared_worklist_buf_init(struct shared_worklist_buf *buf, unsigned log_size) { ASSERT(log_size >= shared_worklist_buf_min_log_size); ASSERT(log_size <= shared_worklist_buf_max_log_size); size_t size = (1 << log_size) * sizeof(uintptr_t); - void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { + void *mem = gc_platform_acquire_memory(size, 0); + if (!mem) { perror("Failed to grow work-stealing dequeue"); DEBUG("Failed to allocate %zu bytes", size); return 0; @@ -63,13 +61,13 @@ static void shared_worklist_buf_release(struct shared_worklist_buf *buf) { size_t byte_size = shared_worklist_buf_byte_size(buf); if (buf->data && byte_size >= shared_worklist_release_byte_threshold) - madvise(buf->data, byte_size, MADV_DONTNEED); + gc_platform_discard_memory(buf->data, byte_size); } static void shared_worklist_buf_destroy(struct shared_worklist_buf *buf) { if (buf->data) { - munmap(buf->data, shared_worklist_buf_byte_size(buf)); + gc_platform_release_memory(buf->data, shared_worklist_buf_byte_size(buf)); buf->data = NULL; buf->log_size = 0; buf->size = 0; diff --git a/src/simple-worklist.h b/src/simple-worklist.h index bae33b470..61f92a31d 100644 --- a/src/simple-worklist.h +++ b/src/simple-worklist.h @@ -1,13 +1,11 @@ #ifndef SIMPLE_WORKLIST_H #define SIMPLE_WORKLIST_H -#include -#include - #include "assert.h" #include "debug.h" #include "gc-inline.h" #include "gc-ref.h" +#include "gc-platform.h" struct simple_worklist { size_t size; @@ -22,9 +20,8 @@ static const size_t simple_worklist_release_byte_threshold = 1 * 1024 * 1024; static struct gc_ref * simple_worklist_alloc(size_t size) { - void *mem = mmap(NULL, size * sizeof(struct gc_ref), PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { + void *mem = gc_platform_acquire_memory(size * sizeof(struct gc_ref), 0); + if (!mem) { perror("Failed to grow trace queue"); DEBUG("Failed to allocate %zu bytes", size); return NULL; @@ -34,7 +31,7 @@ simple_worklist_alloc(size_t size) { static int simple_worklist_init(struct simple_worklist *q) { - q->size = getpagesize() / sizeof(struct gc_ref); + q->size = gc_platform_page_size() / sizeof(struct gc_ref); q->read = 0; q->write = 0; q->buf = simple_worklist_alloc(q->size); From 6a6f5b09e3fbe2ea0dd5f24117c55a6c068526be Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sat, 23 Nov 2024 09:13:57 +0100 Subject: [PATCH 328/403] Use PROT_NONE to reserve memory, then remap within that memory Should play better with the kernel's overcommit heuristics. 
--- src/gc-platform-gnu-linux.c | 43 +++++++++++++++++++++++++++++++------ src/gc-platform.h | 15 +++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/src/gc-platform-gnu-linux.c b/src/gc-platform-gnu-linux.c index 0b267032a..3ace1890d 100644 --- a/src/gc-platform-gnu-linux.c +++ b/src/gc-platform-gnu-linux.c @@ -128,17 +128,18 @@ size_t gc_platform_page_size(void) { return getpagesize(); } -void* gc_platform_acquire_memory(size_t size, size_t alignment) { +struct gc_reservation gc_platform_reserve_memory(size_t size, + size_t alignment) { GC_ASSERT_EQ(size, align_down(size, getpagesize())); GC_ASSERT_EQ(alignment & (alignment - 1), 0); GC_ASSERT_EQ(alignment, align_down(alignment, getpagesize())); size_t extent = size + alignment; - char *mem = mmap(NULL, extent, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + void *mem = mmap(NULL, extent, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { - perror("mmap failed"); - return NULL; + perror("failed to reserve address space"); + GC_CRASH(); } uintptr_t base = (uintptr_t) mem; @@ -151,7 +152,37 @@ void* gc_platform_acquire_memory(size_t size, size_t alignment) { if (end - aligned_end) munmap((void*)aligned_end, end - aligned_end); - return (void*) aligned_base; + return (struct gc_reservation){aligned_base, size}; +} + +void* +gc_platform_acquire_memory_from_reservation(struct gc_reservation reservation, + size_t offset, size_t size) { + GC_ASSERT_EQ(size, align_down(size, getpagesize())); + GC_ASSERT(size <= reservation.size); + GC_ASSERT(offset <= reservation.size - size); + + void *mem = mmap((void*)(reservation.base + offset), size, + PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + perror("mmap failed"); + return NULL; + } + + return mem; +} + +void +gc_platform_release_reservation(struct gc_reservation reservation) { + if (munmap((void*)reservation.base, reservation.size) != 0) + perror("failed to unmap memory"); +} + +void* +gc_platform_acquire_memory(size_t size, size_t alignment) { + struct gc_reservation reservation = + gc_platform_reserve_memory(size, alignment); + return gc_platform_acquire_memory_from_reservation(reservation, 0, size); } void gc_platform_release_memory(void *ptr, size_t size) { diff --git a/src/gc-platform.h b/src/gc-platform.h index ddedbb984..b642e8157 100644 --- a/src/gc-platform.h +++ b/src/gc-platform.h @@ -24,6 +24,21 @@ GC_INTERNAL int gc_platform_processor_count(void); GC_INTERNAL uint64_t gc_platform_monotonic_nanoseconds(void); GC_INTERNAL size_t gc_platform_page_size(void); + +struct gc_reservation { + uintptr_t base; + size_t size; +}; + +GC_INTERNAL +struct gc_reservation gc_platform_reserve_memory(size_t size, size_t alignment); +GC_INTERNAL +void* +gc_platform_acquire_memory_from_reservation(struct gc_reservation reservation, + size_t offset, size_t size); +GC_INTERNAL +void gc_platform_release_reservation(struct gc_reservation reservation); + GC_INTERNAL void* gc_platform_acquire_memory(size_t size, size_t alignment); GC_INTERNAL void gc_platform_release_memory(void *base, size_t size); From 2dcdfc24bc66a091c00b4079c3cfb4cc1a17af67 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 9 Dec 2024 14:30:30 +0100 Subject: [PATCH 329/403] Field set: per-edge clear callback is optional --- src/field-set.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/field-set.h b/src/field-set.h index ef53f398d..ee7df811d 100644 --- a/src/field-set.h +++ b/src/field-set.h @@ -151,15 +151,17 
@@ gc_field_set_clear(struct gc_field_set *set, struct gc_edge_buffer *buf, *next; for (buf = partly_full; buf; buf = next) { next = buf->next; - for (size_t i = 0; i < buf->size; i++) - forget_edge(buf->edges[i], heap); + if (forget_edge) + for (size_t i = 0; i < buf->size; i++) + forget_edge(buf->edges[i], heap); buf->size = 0; gc_edge_buffer_list_push(&set->empty, buf); } for (buf = full; buf; buf = next) { next = buf->next; - for (size_t i = 0; i < buf->size; i++) - forget_edge(buf->edges[i], heap); + if (forget_edge) + for (size_t i = 0; i < buf->size; i++) + forget_edge(buf->edges[i], heap); buf->size = 0; gc_edge_buffer_list_push(&set->empty, buf); } From 95868c70a2c94472f16b6f8fd2f542b3d87a99b3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 6 Jan 2025 11:11:10 +0100 Subject: [PATCH 330/403] Add splay tree --- src/splay-tree.h | 258 +++++++++++++++++++++++++++++++++++++++++ test/test-splay-tree.c | 116 ++++++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 src/splay-tree.h create mode 100644 test/test-splay-tree.c diff --git a/src/splay-tree.h b/src/splay-tree.h new file mode 100644 index 000000000..f4e41af18 --- /dev/null +++ b/src/splay-tree.h @@ -0,0 +1,258 @@ +// A splay tree, originally derived from Octane's `splay.js', whose +// copyright is as follows: +// +// Copyright 2009 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The splay tree has been modified to allow nodes to store spans of +// keys, for example so that we can look up an object given any address +// pointing into that object. + +#ifndef SPLAY_TREE_PREFIX +#error define SPLAY_TREE_PREFIX before including splay-tree.h +#endif + +#include +#include +#include + +#include "gc-assert.h" + +#define SPLAY___(p, n) p ## n +#define SPLAY__(p, n) SPLAY___(p, n) +#define SPLAY_(n) SPLAY__(SPLAY_TREE_PREFIX, n) + +// Data types used by the splay tree. +#define SPLAY_KEY_SPAN SPLAY_(key_span) +#define SPLAY_KEY SPLAY_(key) +#define SPLAY_VALUE SPLAY_(value) + +// Functions used by the splay tree. 
+// key_span, key -> -1|0|1 +#define SPLAY_COMPARE SPLAY_(compare) +// key_span -> key +#define SPLAY_SPAN_START SPLAY_(span_start) + +// Data types defined by the splay tree. +#define SPLAY_TREE SPLAY_(tree) +#define SPLAY_NODE SPLAY_(node) + +// Functions defined by the splay tree. +#define SPLAY_NODE_NEW SPLAY_(node_new) +#define SPLAY_INIT SPLAY_(tree_init) +#define SPLAY_SPLAY SPLAY_(tree_splay) +#define SPLAY_PREVIOUS SPLAY_(tree_previous) +#define SPLAY_LOOKUP SPLAY_(tree_lookup) +#define SPLAY_CONTAINS SPLAY_(tree_contains) +#define SPLAY_INSERT SPLAY_(tree_insert) +#define SPLAY_REMOVE SPLAY_(tree_remove) + +struct SPLAY_NODE { + SPLAY_KEY_SPAN key; + SPLAY_VALUE value; + struct SPLAY_NODE *left; + struct SPLAY_NODE *right; +}; + +struct SPLAY_TREE { + struct SPLAY_NODE *root; +}; + +static inline struct SPLAY_NODE* +SPLAY_NODE_NEW(SPLAY_KEY_SPAN key, SPLAY_VALUE value) { + struct SPLAY_NODE *ret = malloc(sizeof(*ret)); + if (!ret) GC_CRASH(); + ret->key = key; + ret->value = value; + ret->left = ret->right = NULL; + return ret; +} + +static inline void +SPLAY_INIT(struct SPLAY_TREE *tree) { + tree->root = NULL; +} + +static struct SPLAY_NODE* +SPLAY_SPLAY(struct SPLAY_TREE *tree, SPLAY_KEY key) { + struct SPLAY_NODE *current = tree->root; + if (!current) + return NULL; + // The use of the dummy node is a bit counter-intuitive: The right + // child of the dummy node will hold the L tree of the algorithm. The + // left child of the dummy node will hold the R tree of the algorithm. + // Using a dummy node, left and right will always be nodes and we + // avoid special cases. + struct SPLAY_NODE dummy; + memset(&dummy, 0, sizeof(dummy)); + struct SPLAY_NODE *left = &dummy; + struct SPLAY_NODE *right = &dummy; + +loop: + switch (SPLAY_COMPARE(key, current->key)) { + case -1: + if (!current->left) + break; + if (SPLAY_COMPARE(key, current->left->key) < 0LL) { + // Rotate right. + struct SPLAY_NODE *tmp = current->left; + current->left = tmp->right; + tmp->right = current; + current = tmp; + if (!current->left) + break; + } + // Link right. + right->left = current; + right = current; + current = current->left; + goto loop; + + case 0: + break; + + case 1: + if (!current->right) + break; + if (SPLAY_COMPARE(key, current->right->key) > 0LL) { + // Rotate left. + struct SPLAY_NODE *tmp = current->right; + current->right = tmp->left; + tmp->left = current; + current = tmp; + if (!current->right) + break; + } + // Link left. 
+ left->right = current; + left = current; + current = current->right; + goto loop; + + default: + GC_CRASH(); + } + + left->right = current->left; + right->left = current->right; + current->left = dummy.right; + current->right = dummy.left; + tree->root = current; + return current; +} + +static inline struct SPLAY_NODE* +SPLAY_PREVIOUS(struct SPLAY_NODE *node) { + node = node->left; + if (!node) return NULL; + while (node->right) + node = node->right; + return node; +} + +static inline struct SPLAY_NODE* +SPLAY_LOOKUP(struct SPLAY_TREE *tree, SPLAY_KEY key) { + struct SPLAY_NODE *node = SPLAY_SPLAY(tree, key); + if (node && SPLAY_COMPARE(key, node->key) == 0) + return node; + return NULL; +} + +static inline int +SPLAY_CONTAINS(struct SPLAY_TREE *tree, SPLAY_KEY key) { + return !!SPLAY_LOOKUP(tree, key); +} + +static inline struct SPLAY_NODE* +SPLAY_INSERT(struct SPLAY_TREE* tree, SPLAY_KEY_SPAN key, SPLAY_VALUE value) { + if (!tree->root) { + tree->root = SPLAY_NODE_NEW(key, value); + return tree->root; + } + SPLAY_KEY scalar = SPLAY_SPAN_START(key); + struct SPLAY_NODE *node = SPLAY_SPLAY(tree, scalar); + switch (SPLAY_COMPARE(scalar, node->key)) { + case -1: + node = SPLAY_NODE_NEW(key, value); + node->right = tree->root; + node->left = tree->root->left; + tree->root->left = NULL; + tree->root = node; + break; + case 0: + GC_ASSERT(memcmp(&key, &node->key, sizeof(SPLAY_KEY_SPAN)) == 0); + node->value = value; + break; + case 1: + node = SPLAY_NODE_NEW(key, value); + node->left = tree->root; + node->right = tree->root->right; + tree->root->right = NULL; + tree->root = node; + break; + default: + GC_CRASH(); + } + return node; +} + +static inline SPLAY_VALUE +SPLAY_REMOVE(struct SPLAY_TREE *tree, SPLAY_KEY key) { + GC_ASSERT(tree->root); + struct SPLAY_NODE *removed = SPLAY_SPLAY(tree, key); + GC_ASSERT(removed); + SPLAY_VALUE value = removed->value; + if (!removed->left) { + tree->root = removed->right; + } else { + struct SPLAY_NODE *right = removed->right; + tree->root = removed->left; + // Splay to make sure that the new root has an empty right child. + SPLAY_SPLAY(tree, key); + tree->root->right = right; + } + free(removed); + return value; +} + +#undef SPLAY_TREE_PREFIX +#undef SPLAY_KEY_SPAN +#undef SPLAY_KEY +#undef SPLAY_VALUE +#undef SPLAY_COMPARE +#undef SPLAY_SPAN_START +#undef SPLAY_SPANS_EQUAL +#undef SPLAY_TREE +#undef SPLAY_NODE +#undef SPLAY_NODE_NEW +#undef SPLAY_INIT +#undef SPLAY_SPLAY +#undef SPLAY_PREVIOUS +#undef SPLAY_LOOKUP +#undef SPLAY_CONTAINS +#undef SPLAY_INSERT +#undef SPLAY_REMOVE diff --git a/test/test-splay-tree.c b/test/test-splay-tree.c new file mode 100644 index 000000000..7f6e916c6 --- /dev/null +++ b/test/test-splay-tree.c @@ -0,0 +1,116 @@ +#include +#include +#include +#include +#include + +struct object { + uintptr_t addr; + size_t size; +}; + +struct data { + size_t idx; +}; + +#define SPLAY_TREE_PREFIX object_ +typedef struct object object_key_span; +typedef uintptr_t object_key; +typedef struct data object_value; +static inline int +object_compare(uintptr_t addr, struct object obj) { + if (addr < obj.addr) return -1; + if (addr - obj.addr < obj.size) return 0; + return 1; +} +static inline uintptr_t +object_span_start(struct object obj) { + return obj.addr; +} +#include "splay-tree.h" + +// A power-law distribution. Each integer was selected by starting at +// 0, taking a random number in [0,1), and then accepting the integer if +// the random number was less than 0.15, or trying again with the next +// integer otherwise. 
Useful for modelling allocation sizes or number +// of garbage objects to allocate between live allocations. +static const uint8_t power_law_distribution[256] = { + 1, 15, 3, 12, 2, 8, 4, 0, 18, 7, 9, 8, 15, 2, 36, 5, + 1, 9, 6, 11, 9, 19, 2, 0, 0, 3, 9, 6, 3, 2, 1, 1, + 6, 1, 8, 4, 2, 0, 5, 3, 7, 0, 0, 3, 0, 4, 1, 7, + 1, 8, 2, 2, 2, 14, 0, 7, 8, 0, 2, 1, 4, 12, 7, 5, + 0, 3, 4, 13, 10, 2, 3, 7, 0, 8, 0, 23, 0, 16, 1, 1, + 6, 28, 1, 18, 0, 3, 6, 5, 8, 6, 14, 5, 2, 5, 0, 11, + 0, 18, 4, 16, 1, 4, 3, 13, 3, 23, 7, 4, 10, 5, 3, 13, + 0, 14, 5, 5, 2, 5, 0, 16, 2, 0, 1, 1, 0, 0, 4, 2, + 7, 7, 0, 5, 7, 2, 1, 24, 27, 3, 7, 1, 0, 8, 1, 4, + 0, 3, 0, 7, 7, 3, 9, 2, 9, 2, 5, 10, 1, 1, 12, 6, + 2, 9, 5, 0, 4, 6, 0, 7, 2, 1, 5, 4, 1, 0, 1, 15, + 4, 0, 15, 4, 0, 0, 32, 18, 2, 2, 1, 7, 8, 3, 11, 1, + 2, 7, 11, 1, 9, 1, 2, 6, 11, 17, 1, 2, 5, 1, 14, 3, + 6, 1, 1, 15, 3, 1, 0, 6, 10, 8, 1, 3, 2, 7, 0, 1, + 0, 11, 3, 3, 5, 8, 2, 0, 0, 7, 12, 2, 5, 20, 3, 7, + 4, 4, 5, 22, 1, 5, 2, 7, 15, 2, 4, 6, 11, 8, 12, 1 +}; + +static size_t power_law(size_t *counter) { + return power_law_distribution[(*counter)++ & 0xff]; +} + +static uintptr_t allocate(size_t size) { + void *ret = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ret == MAP_FAILED) { + perror("mmap failed"); + exit(1); + } + return (uintptr_t)ret; +} + +static const size_t GB = 1024 * 1024 * 1024; + +// Page size is at least 4 kB, so we will have at most 256 * 1024 allocations. +static uintptr_t all_objects[256 * 1024 + 1]; +static size_t object_count; + +#define ASSERT(x) do { if (!(x)) abort(); } while (0) + +int main(int argc, char *arv[]) { + struct object_tree tree; + + object_tree_init(&tree); + + size_t counter = 0; + size_t page_size = getpagesize(); + + // Use mmap as a source of nonoverlapping spans. Allocate 1 GB of address space. 
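+  // Note: mapping with PROT_NONE only reserves address space; the pages
+  // are never touched by this test, so a 1 GB reservation stays cheap.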
+ size_t allocated = 0; + while (allocated < 1 * GB) { + size_t size = power_law(&counter) * page_size; + if (!size) + continue; + uintptr_t addr = allocate(size); + object_tree_insert(&tree, + (struct object){addr, size}, + (struct data){object_count}); + all_objects[object_count++] = addr; + ASSERT(object_count < sizeof(all_objects) / sizeof(all_objects[0])); + allocated += size; + } + + for (size_t i = 0; i < object_count; i++) + ASSERT(object_tree_contains(&tree, all_objects[i])); + + for (size_t i = 0; i < object_count; i++) + ASSERT(object_tree_lookup(&tree, all_objects[i])->value.idx == i); + + for (size_t i = 0; i < object_count; i++) + ASSERT(object_tree_lookup(&tree, all_objects[i] + 42)->value.idx == i); + + for (size_t i = 0; i < object_count; i++) + object_tree_remove(&tree, all_objects[i]); + + for (size_t i = 0; i < object_count; i++) + ASSERT(!object_tree_contains(&tree, all_objects[i])); + for (size_t i = 0; i < object_count; i++) + ASSERT(object_tree_lookup(&tree, all_objects[i]) == NULL); +} From 4be3e69ac130a8d8385dfba05a674d863a91b044 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 6 Jan 2025 15:49:59 +0100 Subject: [PATCH 331/403] Add asserts to address map / address set --- src/address-map.h | 3 +++ src/address-set.h | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/address-map.h b/src/address-map.h index 4b6b0c47f..57c2a0a04 100644 --- a/src/address-map.h +++ b/src/address-map.h @@ -6,6 +6,7 @@ #include #include "address-hash.h" +#include "gc-assert.h" struct hash_map_entry { uintptr_t k; @@ -28,7 +29,9 @@ static void hash_map_clear(struct hash_map *map) { static void hash_map_init(struct hash_map *map, size_t size) { map->size = size; map->data = malloc(sizeof(struct hash_map_entry) * size); + if (!map->data) GC_CRASH(); map->bits = malloc(size / 8); + if (!map->bits) GC_CRASH(); hash_map_clear(map); } static void hash_map_destroy(struct hash_map *map) { diff --git a/src/address-set.h b/src/address-set.h index 74bc08888..b1c27fa41 100644 --- a/src/address-set.h +++ b/src/address-set.h @@ -6,6 +6,7 @@ #include #include "address-hash.h" +#include "gc-assert.h" struct hash_set { uintptr_t *data; @@ -23,7 +24,9 @@ static void hash_set_clear(struct hash_set *set) { static void hash_set_init(struct hash_set *set, size_t size) { set->size = size; set->data = malloc(sizeof(uintptr_t) * size); + if (!set->data) GC_CRASH(); set->bits = malloc(size / 8); + if (!set->bits) GC_CRASH(); hash_set_clear(set); } static void hash_set_destroy(struct hash_set *set) { From 8e631ca3f3a24773e55768e420735a43b8bd97de Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 6 Jan 2025 15:47:20 +0100 Subject: [PATCH 332/403] Remove embedder requirement for per-object remset bits Since we now have a field-logging write barrier, we don't need per-object log bits. --- api/gc-embedder-api.h | 8 -------- benchmarks/simple-gc-embedder.h | 26 -------------------------- benchmarks/simple-tagging-scheme.h | 8 +++----- doc/manual.md | 16 ---------------- src/large-object-space.h | 2 -- 5 files changed, 3 insertions(+), 57 deletions(-) diff --git a/api/gc-embedder-api.h b/api/gc-embedder-api.h index 7535ea7bc..c1b272a51 100644 --- a/api/gc-embedder-api.h +++ b/api/gc-embedder-api.h @@ -50,14 +50,6 @@ GC_EMBEDDER_API inline void gc_trace_heap_roots(struct gc_heap_roots *roots, struct gc_heap *heap, void *trace_data); -// Some heap objects have space for a "remembered" bit, indicating they -// are in the remembered set. Large or potentially large objects -// (e.g. 
a vector whose size is a run-time property) must have a -// remembered set bit. Small objects may or may not have such a bit. -GC_EMBEDDER_API inline int gc_object_set_remembered(struct gc_ref ref); -GC_EMBEDDER_API inline int gc_object_is_remembered_nonatomic(struct gc_ref ref); -GC_EMBEDDER_API inline void gc_object_clear_remembered_nonatomic(struct gc_ref ref); - GC_EMBEDDER_API inline uintptr_t gc_object_forwarded_nonatomic(struct gc_ref ref); GC_EMBEDDER_API inline void gc_object_forward_nonatomic(struct gc_ref ref, struct gc_ref new_ref); diff --git a/benchmarks/simple-gc-embedder.h b/benchmarks/simple-gc-embedder.h index 4e5fbb83d..d8ad3f0ad 100644 --- a/benchmarks/simple-gc-embedder.h +++ b/benchmarks/simple-gc-embedder.h @@ -102,32 +102,6 @@ static inline void gc_object_forward_nonatomic(struct gc_ref ref, *tag_word(ref) = gc_ref_value(new_ref); } -static inline int gc_object_set_remembered(struct gc_ref ref) { - uintptr_t *loc = tag_word(ref); - uintptr_t tag = atomic_load_explicit(loc, memory_order_relaxed); - while (1) { - if (tag & gcobj_remembered_bit) - return 0; - if (atomic_compare_exchange_weak_explicit(loc, &tag, - tag | gcobj_remembered_bit, - memory_order_acq_rel, - memory_order_acquire)) - return 1; - } -} - -static inline int gc_object_is_remembered_nonatomic(struct gc_ref ref) { - uintptr_t *loc = tag_word(ref); - uintptr_t tag = *loc; - return tag & gcobj_remembered_bit; -} - -static inline void gc_object_clear_remembered_nonatomic(struct gc_ref ref) { - uintptr_t *loc = tag_word(ref); - uintptr_t tag = *loc; - *loc = tag & ~(uintptr_t)gcobj_remembered_bit; -} - static inline struct gc_atomic_forward gc_atomic_forward_begin(struct gc_ref ref) { uintptr_t tag = atomic_load_explicit(tag_word(ref), memory_order_acquire); diff --git a/benchmarks/simple-tagging-scheme.h b/benchmarks/simple-tagging-scheme.h index aa0b707e4..b6b8a924c 100644 --- a/benchmarks/simple-tagging-scheme.h +++ b/benchmarks/simple-tagging-scheme.h @@ -7,11 +7,9 @@ struct gc_header { uintptr_t tag; }; -// Alloc kind is in bits 2-7, for live objects. -static const uintptr_t gcobj_alloc_kind_mask = 0x3f; -static const uintptr_t gcobj_alloc_kind_shift = 2; -static const uintptr_t gcobj_remembered_mask = 0x2; -static const uintptr_t gcobj_remembered_bit = 0x2; +// Alloc kind is in bits 1-7, for live objects. +static const uintptr_t gcobj_alloc_kind_mask = 0x7f; +static const uintptr_t gcobj_alloc_kind_shift = 1; static const uintptr_t gcobj_forwarded_mask = 0x1; static const uintptr_t gcobj_not_forwarded_bit = 0x1; static const uintptr_t gcobj_busy = 0; diff --git a/doc/manual.md b/doc/manual.md index 7c784b626..c299128bf 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -87,22 +87,6 @@ in the `gc_trace_object` function by calling `gc_trace_ephemeron` from allocates finalizers, it should trace them by calling `gc_trace_finalizer` from [`gc-finalizer.h`](../api/gc-finalizer.h). -### Remembered-set bits - -When built to support generational garbage collection, Whippet requires -that all "large" or potentially large objects have a flag bit reserved -for use of the garbage collector. A large object is one whose size -exceeds the `gc_allocator_large_threshold()` (see -[`gc-attrs.h`](../api/gc-attrs.h)), which is a collector-specific value. -Currently the only generational collector is the in-place `mmc` -collector, whose large object threshold is 4096 bytes. 
The -`gc_object_set_remembered`, `gc_object_is_remembered_nonatomic`, and -`gc_object_clear_remembered_nonatomic` embedder functions manage the -remembered bit. Setting the remembered bit should be idempotent; -multiple threads can race to call `gc_object_set_remembered` and do not -synchronize. The query and clear functions are called without -concurrent accessors and so don't have to be atomic. - ### Forwarding objects When built with a collector that moves objects, the embedder must also diff --git a/src/large-object-space.h b/src/large-object-space.h index 7aba13d9a..18e6280da 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -113,8 +113,6 @@ static int large_object_space_copy(struct large_object_space *space, address_set_remove(&space->from_space, addr); address_set_add(GC_GENERATIONAL ? &space->survivor_space : &space->to_space, addr); - if (GC_GENERATIONAL && gc_object_is_remembered_nonatomic(ref)) - gc_object_clear_remembered_nonatomic(ref); // Object is grey; place it on mark stack to visit its fields. copied = 1; done: From d2e745ac23b5db1d31476fb8135b35aef3ff8910 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 6 Jan 2025 15:49:49 +0100 Subject: [PATCH 333/403] Rework large object space Store allocations in a splay tree so that we can efficiently map from an edge originating in the lospace to its object. Defer returning memory to the OS to a periodic background thread, using a similar strategy as for nofl and copy-space pages. Use a size-segregated freelist instead of requiring a full best-fit search for those pages that haven't yet been returned to the OS. --- src/freelist.h | 31 +++ src/large-object-space.h | 461 +++++++++++++++++++++++++-------------- src/mmc.c | 3 +- src/pcc.c | 3 +- src/semi.c | 3 +- 5 files changed, 338 insertions(+), 163 deletions(-) create mode 100644 src/freelist.h diff --git a/src/freelist.h b/src/freelist.h new file mode 100644 index 000000000..6eec6dbac --- /dev/null +++ b/src/freelist.h @@ -0,0 +1,31 @@ +#ifndef FREELIST_H +#define FREELIST_H + +// A size-segregated freelist with linear-log buckets à la +// https://pvk.ca/Blog/2015/06/27/linear-log-bucketing-fast-versatile-simple/. 
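+//
+// Roughly: sizes are binned logarithmically by power-of-two range, and
+// each range is subdivided into 2^precision linearly spaced buckets,
+// giving (max_value_bits << precision) + 1 size classes in total.  Small
+// sizes get fine-grained classes; larger sizes get proportionally
+// coarser ones.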
+ +#include "gc-assert.h" +#include "gc-histogram.h" + +#include + +#define DEFINE_FREELIST(name, max_value_bits, precision, node) \ + struct name { node buckets[((max_value_bits) << (precision)) + 1]; }; \ + static inline size_t name##_num_size_classes(void) { \ + return ((max_value_bits) << (precision)) + 1; \ + } \ + static inline uint64_t name##_bucket_min_val(size_t idx) { \ + GC_ASSERT(idx < name##_num_size_classes()); \ + return gc_histogram_bucket_min_val((precision), idx); \ + } \ + static inline void name##_init(struct name *f) { \ + memset(f, 0, sizeof(*f)); \ + } \ + static inline size_t name##_size_class(uint64_t val) { \ + return gc_histogram_bucket((max_value_bits), (precision), val); \ + } \ + static inline node* name##_bucket(struct name *f, uint64_t val) { \ + return &f->buckets[name##_size_class(val)]; \ + } + +#endif // FREELIST_H diff --git a/src/large-object-space.h b/src/large-object-space.h index 18e6280da..703c048b4 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include "gc-assert.h" @@ -14,6 +13,8 @@ #include "gc-conservative-ref.h" #include "address-map.h" #include "address-set.h" +#include "background-thread.h" +#include "freelist.h" // Logically the large object space is a treadmill space -- somewhat like a // copying collector, in that we allocate into tospace, and collection flips @@ -23,42 +24,87 @@ struct gc_heap; +struct large_object { + uintptr_t addr; + size_t size; +}; +struct large_object_node; +struct large_object_live_data { + uint8_t is_survivor; +}; +struct large_object_dead_data { + uint8_t age; + struct large_object_node **prev; + struct large_object_node *next; +}; +struct large_object_data { + uint8_t is_live; + union { + struct large_object_live_data live; + struct large_object_dead_data dead; + }; +}; + +#define SPLAY_TREE_PREFIX large_object_ +typedef struct large_object large_object_key_span; +typedef uintptr_t large_object_key; +typedef struct large_object_data large_object_value; +static inline int +large_object_compare(uintptr_t addr, struct large_object obj) { + if (addr < obj.addr) return -1; + if (addr - obj.addr < obj.size) return 0; + return 1; +} +static inline uintptr_t +large_object_span_start(struct large_object obj) { + return obj.addr; +} +#include "splay-tree.h" + +DEFINE_FREELIST(large_object_freelist, sizeof(uintptr_t) * 8 - 1, 2, + struct large_object_node*); + struct large_object_space { + // Access to all members protected by lock. pthread_mutex_t lock; + // Splay tree of objects, keyed by tuple. Useful when + // looking up object-for-address. + struct large_object_tree object_tree; + // Hash table of objects, where values are pointers to splay tree + // nodes. Useful when you have the object address and just want to + // check something about it (for example its size). + struct address_map object_map; + + // Size-segregated freelist of dead objects. Allocations are first + // served from the quarantine freelist before falling back to the OS + // if needed. Collected objects spend a second or two in quarantine + // before being returned to the OS. This is an optimization to avoid + // mucking about too much with the TLB and so on. 
+ struct large_object_freelist quarantine; + size_t page_size; size_t page_size_log2; size_t total_pages; size_t free_pages; size_t live_pages_at_last_collection; size_t pages_freed_by_last_collection; + int synchronous_release; - struct address_set from_space; - struct address_set to_space; - struct address_set survivor_space; + // A partition of the set of live objects into three sub-spaces. If + // all collections are major, the survivor space will always be empty. + // The values of these maps are splay tree nodes. + struct address_map from_space; + struct address_map to_space; + struct address_map survivor_space; + + // Set of edges from lospace that may reference young objects, + // possibly in other spaces. struct address_set remembered_edges; - struct address_set free_space; - struct address_map object_pages; // for each object: size in pages. - struct address_map predecessors; // subsequent addr -> object addr }; -static int large_object_space_init(struct large_object_space *space, - struct gc_heap *heap) { - pthread_mutex_init(&space->lock, NULL); - space->page_size = getpagesize(); - space->page_size_log2 = __builtin_ctz(space->page_size); - address_set_init(&space->from_space); - address_set_init(&space->to_space); - address_set_init(&space->survivor_space); - address_set_init(&space->remembered_edges); - address_set_init(&space->free_space); - address_map_init(&space->object_pages); - address_map_init(&space->predecessors); - return 1; -} - -static size_t large_object_space_npages(struct large_object_space *space, - size_t bytes) { +static size_t +large_object_space_npages(struct large_object_space *space, size_t bytes) { return (bytes + space->page_size - 1) >> space->page_size_log2; } @@ -67,66 +113,105 @@ large_object_space_size_at_last_collection(struct large_object_space *space) { return space->live_pages_at_last_collection << space->page_size_log2; } -static inline int large_object_space_contains(struct large_object_space *space, - struct gc_ref ref) { +static inline int +large_object_space_contains(struct large_object_space *space, + struct gc_ref ref) { pthread_mutex_lock(&space->lock); - // ptr might be in fromspace or tospace. Just check the object_pages table, which - // contains both, as well as object_pages for free blocks. - int ret = address_map_contains(&space->object_pages, gc_ref_value(ref)); + int ret = address_map_contains(&space->object_map, gc_ref_value(ref)); pthread_mutex_unlock(&space->lock); return ret; } -static void large_object_space_flip_survivor(uintptr_t addr, - void *data) { - struct large_object_space *space = data; - address_set_add(&space->from_space, addr); +static inline struct gc_ref +large_object_space_contains_edge(struct large_object_space *space, + struct gc_edge edge) { + pthread_mutex_lock(&space->lock); + struct large_object_node *node = + large_object_tree_lookup(&space->object_tree, gc_edge_address(edge)); + uintptr_t addr = node ? 
node->key.addr : 0; + pthread_mutex_unlock(&space->lock); + return gc_ref(addr); } -static void large_object_space_start_gc(struct large_object_space *space, - int is_minor_gc) { +static void +large_object_space_flip_survivor(uintptr_t addr, uintptr_t node_bits, + void *data) { + struct large_object_space *space = data; + struct large_object_node *node = (void*)node_bits; + GC_ASSERT(node->value.is_live && node->value.live.is_survivor); + node->value.live.is_survivor = 0; + address_map_add(&space->from_space, addr, (uintptr_t)node); +} + +static void +large_object_space_start_gc(struct large_object_space *space, int is_minor_gc) { // Flip. Note that when we flip, fromspace is empty, but it might have // allocated storage, so we do need to do a proper swap. - struct address_set tmp; + struct address_map tmp; memcpy(&tmp, &space->from_space, sizeof(tmp)); memcpy(&space->from_space, &space->to_space, sizeof(tmp)); memcpy(&space->to_space, &tmp, sizeof(tmp)); if (!is_minor_gc) { - address_set_for_each(&space->survivor_space, + address_map_for_each(&space->survivor_space, large_object_space_flip_survivor, space); - address_set_clear(&space->survivor_space); + address_map_clear(&space->survivor_space); space->live_pages_at_last_collection = 0; } } -static int large_object_space_copy(struct large_object_space *space, - struct gc_ref ref) { +static inline size_t +large_object_space_object_size(struct large_object_space *space, + struct gc_ref ref) { + uintptr_t node_bits = + address_map_lookup(&space->object_map, gc_ref_value(ref), 0); + GC_ASSERT(node_bits); + struct large_object_node *node = (struct large_object_node*) node_bits; + return node->key.size; +} + +static void +large_object_space_do_copy(struct large_object_space *space, + struct large_object_node *node) { + GC_ASSERT(address_map_contains(&space->from_space, node->key.addr)); + GC_ASSERT(node->value.is_live); + GC_ASSERT(!node->value.live.is_survivor); + uintptr_t addr = node->key.addr; + size_t bytes = node->key.size; + uintptr_t node_bits = (uintptr_t)node; + space->live_pages_at_last_collection += bytes >> space->page_size_log2; + address_map_remove(&space->from_space, addr); + if (GC_GENERATIONAL) { + node->value.live.is_survivor = 1; + address_map_add(&space->survivor_space, addr, node_bits); + } else { + address_map_add(&space->to_space, addr, node_bits); + } +} + +static int +large_object_space_copy(struct large_object_space *space, struct gc_ref ref) { int copied = 0; uintptr_t addr = gc_ref_value(ref); pthread_mutex_lock(&space->lock); - if (!address_set_contains(&space->from_space, addr)) - // Already copied; object is grey or black. - goto done; - space->live_pages_at_last_collection += - address_map_lookup(&space->object_pages, addr, 0); - address_set_remove(&space->from_space, addr); - address_set_add(GC_GENERATIONAL ? &space->survivor_space : &space->to_space, - addr); - // Object is grey; place it on mark stack to visit its fields. - copied = 1; -done: + uintptr_t node_bits = address_map_lookup(&space->from_space, addr, 0); + if (node_bits) { + large_object_space_do_copy(space, (struct large_object_node*) node_bits); + // Object is grey; place it on mark stack to visit its fields. 
+ copied = 1; + } pthread_mutex_unlock(&space->lock); return copied; } -static int large_object_space_is_copied(struct large_object_space *space, - struct gc_ref ref) { +static int +large_object_space_is_copied(struct large_object_space *space, + struct gc_ref ref) { GC_ASSERT(large_object_space_contains(space, ref)); int copied = 0; uintptr_t addr = gc_ref_value(ref); pthread_mutex_lock(&space->lock); - copied = !address_set_contains(&space->from_space, addr); + copied = !address_map_contains(&space->from_space, addr); pthread_mutex_unlock(&space->lock); return copied; } @@ -134,11 +219,12 @@ static int large_object_space_is_copied(struct large_object_space *space, static int large_object_space_is_survivor_with_lock(struct large_object_space *space, struct gc_ref ref) { - return address_set_contains(&space->survivor_space, gc_ref_value(ref)); + return address_map_contains(&space->survivor_space, gc_ref_value(ref)); } -static int large_object_space_is_survivor(struct large_object_space *space, - struct gc_ref ref) { +static int +large_object_space_is_survivor(struct large_object_space *space, + struct gc_ref ref) { GC_ASSERT(large_object_space_contains(space, ref)); pthread_mutex_lock(&space->lock); int old = large_object_space_is_survivor_with_lock(space, ref); @@ -146,9 +232,11 @@ static int large_object_space_is_survivor(struct large_object_space *space, return old; } -static int large_object_space_remember_edge(struct large_object_space *space, - struct gc_ref obj, - struct gc_edge edge) { +static int +large_object_space_remember_edge(struct large_object_space *space, + struct gc_ref obj, + struct gc_edge edge) { + GC_ASSERT(large_object_space_contains(space, obj)); int remembered = 0; uintptr_t edge_addr = gc_edge_address(edge); pthread_mutex_lock(&space->lock); @@ -171,61 +259,78 @@ static int large_object_space_mark_object(struct large_object_space *space, return large_object_space_copy(space, ref); } -static inline size_t large_object_space_object_size(struct large_object_space *space, - struct gc_ref ref) { - size_t npages = address_map_lookup(&space->object_pages, - gc_ref_value(ref), 0); - GC_ASSERT(npages != 0); - return npages * space->page_size; +static void +large_object_space_add_to_freelist(struct large_object_space *space, + struct large_object_node *node) { + node->value.is_live = 0; + struct large_object_dead_data *data = &node->value.dead; + memset(data, 0, sizeof(*data)); + data->age = 0; + struct large_object_node **bucket = + large_object_freelist_bucket(&space->quarantine, node->key.size); + data->next = *bucket; + if (data->next) + data->next->value.dead.prev = &data->next; + data->prev = bucket; + *bucket = node; } -static void large_object_space_reclaim_one(uintptr_t addr, void *data) { +static void +large_object_space_remove_from_freelist(struct large_object_space *space, + struct large_object_node *node) { + GC_ASSERT(!node->value.is_live); + struct large_object_dead_data *dead = &node->value.dead; + GC_ASSERT(dead->prev); + if (dead->next) + dead->next->value.dead.prev = dead->prev; + *dead->prev = dead->next; +} + +static void +large_object_space_reclaim_one(uintptr_t addr, uintptr_t node_bits, + void *data) { struct large_object_space *space = data; - size_t npages = address_map_lookup(&space->object_pages, addr, 0); - // Release the pages to the OS, and cause them to be zero on next use. 
- madvise((void*) addr, npages * space->page_size, MADV_DONTNEED); - size_t did_merge = 0; - uintptr_t pred = address_map_lookup(&space->predecessors, addr, 0); - uintptr_t succ = addr + npages * space->page_size; - if (pred && address_set_contains(&space->free_space, pred)) { - // Merge with free predecessor. - address_map_remove(&space->predecessors, addr); - address_map_remove(&space->object_pages, addr); - addr = pred; - npages += address_map_lookup(&space->object_pages, addr, 0); - did_merge = 1; - } else { - // Otherwise this is a new free object. - address_set_add(&space->free_space, addr); - } - if (address_set_contains(&space->free_space, succ)) { - // Merge with free successor. - size_t succ_npages = address_map_lookup(&space->object_pages, succ, 0); - address_map_remove(&space->predecessors, succ); - address_map_remove(&space->object_pages, succ); - address_set_remove(&space->free_space, succ); - npages += succ_npages; - succ += succ_npages * space->page_size; - did_merge = 1; - } - if (did_merge) { - // Update extents. - address_map_add(&space->object_pages, addr, npages); - address_map_add(&space->predecessors, succ, addr); - } + struct large_object_node *node = (struct large_object_node*) node_bits; + GC_ASSERT(node->value.is_live); + large_object_space_add_to_freelist(space, node); } -static void large_object_space_finish_gc(struct large_object_space *space, - int is_minor_gc) { +static void +large_object_space_process_quarantine(void *data) { + struct large_object_space *space = data; pthread_mutex_lock(&space->lock); - address_set_for_each(&space->from_space, large_object_space_reclaim_one, + for (size_t idx = 0; idx < large_object_freelist_num_size_classes(); idx++) { + struct large_object_node **link = &space->quarantine.buckets[idx]; + for (struct large_object_node *node = *link; node; node = *link) { + GC_ASSERT(!node->value.is_live); + if (++node->value.dead.age < 2) { + link = &node->value.dead.next; + } else { + struct large_object obj = node->key; + large_object_space_remove_from_freelist(space, node); + address_map_remove(&space->object_map, obj.addr); + large_object_tree_remove(&space->object_tree, obj.addr); + gc_platform_release_memory((void*)obj.addr, obj.size); + } + } + } + pthread_mutex_unlock(&space->lock); +} + +static void +large_object_space_finish_gc(struct large_object_space *space, + int is_minor_gc) { + pthread_mutex_lock(&space->lock); + address_map_for_each(&space->from_space, large_object_space_reclaim_one, space); - address_set_clear(&space->from_space); + address_map_clear(&space->from_space); size_t free_pages = space->total_pages - space->live_pages_at_last_collection; space->pages_freed_by_last_collection = free_pages - space->free_pages; space->free_pages = free_pages; pthread_mutex_unlock(&space->lock); + if (space->synchronous_release) + large_object_space_process_quarantine(space); } static void @@ -242,14 +347,9 @@ large_object_space_mark_conservative_ref(struct large_object_space *space, int possibly_interior) { uintptr_t addr = gc_conservative_ref_value(ref); - if (possibly_interior) { - // FIXME: This only allows interior pointers within the first page. - // BDW-GC doesn't have all-interior-pointers on for intraheap edges - // or edges originating in static data but by default does allow - // them from stack edges; probably we should too. - addr &= ~(space->page_size - 1); - } else { + if (!possibly_interior) { // Addr not aligned on page boundary? Not a large object. 
+ // Otherwise strip the displacement to obtain the true base address. uintptr_t displacement = addr & (space->page_size - 1); if (!gc_is_valid_conservative_ref_displacement(displacement)) return gc_ref_null(); @@ -257,59 +357,63 @@ large_object_space_mark_conservative_ref(struct large_object_space *space, } pthread_mutex_lock(&space->lock); - // ptr might be in fromspace or tospace. Just check the object_pages table, which - // contains both, as well as object_pages for free blocks. - int found = address_map_contains(&space->object_pages, addr); + struct large_object_node *node = NULL; + if (possibly_interior) { + node = large_object_tree_lookup(&space->object_tree, addr); + if (node && !address_map_contains(&space->from_space, node->key.addr)) + node = NULL; + } else { + uintptr_t node_bits = address_map_lookup(&space->from_space, addr, 0); + node = (struct large_object_node*) node_bits; + } + struct gc_ref ret = gc_ref_null(); + if (node) { + large_object_space_do_copy(space, node); + ret = gc_ref(node->key.addr); + } pthread_mutex_unlock(&space->lock); - if (found && large_object_space_copy(space, gc_ref(addr))) - return gc_ref(addr); - - return gc_ref_null(); + return ret; } -struct large_object_space_candidate { - struct large_object_space *space; - size_t min_npages; - uintptr_t addr; - size_t npages; -}; - -static int large_object_space_best_fit(uintptr_t addr, void *data) { - struct large_object_space_candidate *found = data; - size_t npages = address_map_lookup(&found->space->object_pages, addr, 0); - if (npages < found->min_npages) - return 0; - if (npages >= found->npages) - return 0; - found->addr = addr; - found->npages = npages; - return found->min_npages == npages; -} - -static void* large_object_space_alloc(struct large_object_space *space, - size_t npages) { - void *ret; +static void* +large_object_space_alloc(struct large_object_space *space, size_t npages) { + void *ret = NULL; pthread_mutex_lock(&space->lock); - ret = NULL; - struct large_object_space_candidate found = { space, npages, 0, -1 }; - address_set_find(&space->free_space, large_object_space_best_fit, &found); - if (found.addr) { - uintptr_t addr = found.addr; - ret = (void*)addr; - address_set_remove(&space->free_space, addr); - address_set_add(&space->to_space, addr); + + size_t size = npages << space->page_size_log2; + for (size_t idx = large_object_freelist_size_class(size); + idx < large_object_freelist_num_size_classes(); + idx++) { + struct large_object_node *node = space->quarantine.buckets[idx]; + while (node && node->key.size < size) + node = node->value.dead.next; + if (node) { + // We found a suitable hole in quarantine. Unlink it from the + // freelist. + large_object_space_remove_from_freelist(space, node); - if (found.npages > npages) { - uintptr_t succ = addr + npages * space->page_size; - uintptr_t succ_succ = succ + (found.npages - npages) * space->page_size; - address_map_add(&space->object_pages, addr, npages); - address_map_add(&space->object_pages, succ, found.npages - npages); - address_set_add(&space->free_space, succ); - address_map_add(&space->predecessors, succ, addr); - address_map_add(&space->predecessors, succ_succ, succ); + // Mark the hole as live. + node->value.is_live = 1; + memset(&node->value.live, 0, sizeof(node->value.live)); + + // If the hole is actually too big, trim its tail. 
+ if (node->key.size > size) { + struct large_object tail = {node->key.addr + size, node->key.size - size}; + struct large_object_data tail_value = {0,}; + node->key.size = size; + struct large_object_node *tail_node = + large_object_tree_insert(&space->object_tree, tail, tail_value); + large_object_space_add_to_freelist(space, tail_node); + } + + // Add the object to tospace. + address_map_add(&space->to_space, node->key.addr, (uintptr_t)node); + + space->free_pages -= npages; + ret = (void*)node->key.addr; + break; } - space->free_pages -= npages; } pthread_mutex_unlock(&space->lock); return ret; @@ -320,18 +424,55 @@ large_object_space_obtain_and_alloc(struct large_object_space *space, size_t npages) { size_t bytes = npages * space->page_size; void *ret = gc_platform_acquire_memory(bytes, 0); - if (ret == MAP_FAILED) + if (!ret) return NULL; uintptr_t addr = (uintptr_t)ret; + struct large_object k = { addr, bytes }; + struct large_object_data v = {0,}; + v.is_live = 1; + v.live.is_survivor = 0; + pthread_mutex_lock(&space->lock); - address_map_add(&space->object_pages, addr, npages); - address_map_add(&space->predecessors, addr + bytes, addr); - address_set_add(&space->to_space, addr); + struct large_object_node *node = + large_object_tree_insert(&space->object_tree, k, v); + uintptr_t node_bits = (uintptr_t)node; + address_map_add(&space->object_map, addr, node_bits); + address_map_add(&space->to_space, addr, node_bits); space->total_pages += npages; pthread_mutex_unlock(&space->lock); return ret; } +static int +large_object_space_init(struct large_object_space *space, + struct gc_heap *heap, + struct gc_background_thread *thread) { + memset(space, 0, sizeof(*space)); + pthread_mutex_init(&space->lock, NULL); + + space->page_size = getpagesize(); + space->page_size_log2 = __builtin_ctz(space->page_size); + + large_object_tree_init(&space->object_tree); + address_map_init(&space->object_map); + + large_object_freelist_init(&space->quarantine); + + address_map_init(&space->from_space); + address_map_init(&space->to_space); + address_map_init(&space->survivor_space); + address_set_init(&space->remembered_edges); + + if (thread) + gc_background_thread_add_task(thread, GC_BACKGROUND_TASK_START, + large_object_space_process_quarantine, + space); + else + space->synchronous_release = 1; + + return 1; +} + #endif // LARGE_OBJECT_SPACE_H diff --git a/src/mmc.c b/src/mmc.c index 68a811947..d123c4793 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -1127,7 +1127,8 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, return 0; } - if (!large_object_space_init(heap_large_object_space(*heap), *heap)) + if (!large_object_space_init(heap_large_object_space(*heap), *heap, + (*heap)->background_thread)) GC_CRASH(); *mut = calloc(1, sizeof(struct gc_mutator)); diff --git a/src/pcc.c b/src/pcc.c index 6b8a55c03..9b2a8baf2 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -661,7 +661,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, return 0; } - if (!large_object_space_init(heap_large_object_space(*heap), *heap)) + if (!large_object_space_init(heap_large_object_space(*heap), *heap, + (*heap)->background_thread)) GC_CRASH(); *mut = calloc(1, sizeof(struct gc_mutator)); diff --git a/src/semi.c b/src/semi.c index 725a75f30..29cb0c92f 100644 --- a/src/semi.c +++ b/src/semi.c @@ -674,7 +674,8 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, if (!semi_space_init(heap_semi_space(*heap), *heap)) return 0; - if 
(!large_object_space_init(heap_large_object_space(*heap), *heap)) + struct gc_background_thread *thread = NULL; + if (!large_object_space_init(heap_large_object_space(*heap), *heap, thread)) return 0; // Ignore stack base, as we are precise. From a74a2c129c5500ab5062ab4252596b6b3e564073 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 8 Dec 2024 21:04:38 +0100 Subject: [PATCH 334/403] Copy space has flags, can indicate that space is aligned This will be useful for copy-space nurseries. --- src/copy-space.h | 77 +++++++++++++++++++++++++++++++++++++++++++----- src/pcc.c | 18 ++++++----- 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index 9be2f8d2a..8377be73b 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -115,6 +115,11 @@ struct copy_space_block_stack { struct copy_space_block_list list; }; +enum copy_space_flags { + COPY_SPACE_ATOMIC_FORWARDING = 1, + COPY_SPACE_ALIGNED = 2, +}; + struct copy_space { pthread_mutex_t lock; struct copy_space_block_stack empty; @@ -129,6 +134,7 @@ struct copy_space { // lock. uint8_t active_region ALIGNED_TO_AVOID_FALSE_SHARING; uint8_t atomic_forward; + uint32_t flags; size_t allocated_bytes_at_last_gc; size_t fragmentation_at_last_gc; struct extents *extents; @@ -606,6 +612,39 @@ copy_space_contains(struct copy_space *space, struct gc_ref ref) { return extents_contain_addr(space->extents, gc_ref_value(ref)); } +static int +copy_space_is_aligned(struct copy_space *space) { + return space->flags & COPY_SPACE_ALIGNED; +} + +static int +copy_space_fixed_size(struct copy_space *space) { + // If the extent is aligned, it is fixed. + return copy_space_is_aligned(space); +} + +static inline uintptr_t +copy_space_low_aligned_address(struct copy_space *space) { + GC_ASSERT(copy_space_is_aligned(space)); + GC_ASSERT_EQ(space->extents->size, 1); + return space->extents->ranges[0].lo_addr; +} + +static inline uintptr_t +copy_space_high_aligned_address(struct copy_space *space) { + GC_ASSERT(copy_space_is_aligned(space)); + GC_ASSERT_EQ(space->extents->size, 1); + return space->extents->ranges[0].hi_addr; +} + +static inline int +copy_space_contains_address_aligned(struct copy_space *space, uintptr_t addr) { + uintptr_t low_addr = copy_space_low_aligned_address(space); + uintptr_t high_addr = copy_space_high_aligned_address(space); + uintptr_t size = high_addr - low_addr; + return (addr - low_addr) < size; +} + static inline void copy_space_allocator_init(struct copy_space_allocator *alloc) { memset(alloc, 0, sizeof(*alloc)); @@ -618,10 +657,27 @@ copy_space_allocator_finish(struct copy_space_allocator *alloc, copy_space_allocator_release_partly_full_block(alloc, space); } +static size_t copy_space_is_power_of_two(size_t n) { + GC_ASSERT(n != 0); + return (n & (n - 1)) == 0; +} + +static size_t copy_space_round_up_power_of_two(size_t n) { + if (copy_space_is_power_of_two(n)) + return n; + + return 1ULL << (sizeof(size_t) * 8 - __builtin_clzll(n)); +} + static struct copy_space_slab* -copy_space_allocate_slabs(size_t nslabs) { - return gc_platform_acquire_memory(nslabs * COPY_SPACE_SLAB_SIZE, - COPY_SPACE_SLAB_SIZE); +copy_space_allocate_slabs(size_t nslabs, uint32_t flags) { + size_t size = nslabs * COPY_SPACE_SLAB_SIZE; + size_t alignment = COPY_SPACE_SLAB_SIZE; + if (flags & COPY_SPACE_ALIGNED) { + GC_ASSERT(copy_space_is_power_of_two(size)); + alignment = size; + } + return gc_platform_acquire_memory(size, alignment); } static void @@ -649,11 +705,13 @@ copy_space_shrink(struct copy_space *space, 
size_t bytes) { static void copy_space_expand(struct copy_space *space, size_t bytes) { + GC_ASSERT(!copy_space_fixed_size(space)); ssize_t to_acquire = -copy_space_maybe_reacquire_memory(space, bytes); if (to_acquire <= 0) return; size_t reserved = align_up(to_acquire, COPY_SPACE_SLAB_SIZE); size_t nslabs = reserved / COPY_SPACE_SLAB_SIZE; - struct copy_space_slab *slabs = copy_space_allocate_slabs(nslabs); + struct copy_space_slab *slabs = + copy_space_allocate_slabs(nslabs, space->flags); copy_space_add_slabs(space, slabs, nslabs); struct gc_lock lock = copy_space_lock(space); @@ -704,12 +762,14 @@ copy_space_page_out_blocks(void *data) { } static int -copy_space_init(struct copy_space *space, size_t size, int atomic, +copy_space_init(struct copy_space *space, size_t size, uint32_t flags, struct gc_background_thread *thread) { size = align_up(size, COPY_SPACE_BLOCK_SIZE); size_t reserved = align_up(size, COPY_SPACE_SLAB_SIZE); + if (flags & COPY_SPACE_ALIGNED) + reserved = copy_space_round_up_power_of_two(reserved); size_t nslabs = reserved / COPY_SPACE_SLAB_SIZE; - struct copy_space_slab *slabs = copy_space_allocate_slabs(nslabs); + struct copy_space_slab *slabs = copy_space_allocate_slabs(nslabs, flags); if (!slabs) return 0; @@ -723,10 +783,11 @@ copy_space_init(struct copy_space *space, size_t size, int atomic, space->fragmentation = 0; space->bytes_to_page_out = 0; space->active_region = 0; - space->atomic_forward = atomic; + space->atomic_forward = flags & COPY_SPACE_ATOMIC_FORWARDING; + space->flags = flags; space->allocated_bytes_at_last_gc = 0; space->fragmentation_at_last_gc = 0; - space->extents = extents_allocate(10); + space->extents = extents_allocate((flags & COPY_SPACE_ALIGNED) ? 1 : 10); copy_space_add_slabs(space, slabs, nslabs); struct gc_lock lock = copy_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { diff --git a/src/pcc.c b/src/pcc.c index 9b2a8baf2..0ca94327e 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -652,13 +652,17 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, (*heap)->event_listener_data = event_listener_data; HEAP_EVENT(*heap, init, (*heap)->size); - struct copy_space *space = heap_copy_space(*heap); - int atomic_forward = options->common.parallelism > 1; - if (!copy_space_init(space, (*heap)->size, atomic_forward, - (*heap)->background_thread)) { - free(*heap); - *heap = NULL; - return 0; + struct copy_space *copy_space = heap_copy_space(*heap); + { + uint32_t flags = 0; + if (options->common.parallelism > 1) + flags |= COPY_SPACE_ATOMIC_FORWARDING; + if (!copy_space_init(copy_space, (*heap)->size, flags, + (*heap)->background_thread)) { + free(*heap); + *heap = NULL; + return 0; + } } if (!large_object_space_init(heap_large_object_space(*heap), *heap, From b33efb27597196c91703848448e45b0345535758 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Sun, 8 Dec 2024 21:49:47 +0100 Subject: [PATCH 335/403] Copy space can reserve some blocks for field-logging bits Useful for an oldgen --- src/copy-space.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index 8377be73b..920235707 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -118,6 +118,7 @@ struct copy_space_block_stack { enum copy_space_flags { COPY_SPACE_ATOMIC_FORWARDING = 1, COPY_SPACE_ALIGNED = 2, + COPY_SPACE_HAS_FIELD_LOGGING_BITS = 4, }; struct copy_space { @@ -645,6 +646,51 @@ copy_space_contains_address_aligned(struct copy_space *space, 
uintptr_t addr) { return (addr - low_addr) < size; } +static uint8_t* +copy_space_field_logged_byte(struct gc_edge edge) { + uintptr_t addr = gc_edge_address(edge); + uintptr_t base = align_down(addr, COPY_SPACE_SLAB_SIZE); + base += offsetof(struct copy_space_slab, blocks); + uintptr_t field = (addr & (COPY_SPACE_SLAB_SIZE - 1)) / sizeof(uintptr_t); + uintptr_t byte = field / 8; + return (uint8_t*) (base + field); +} + +static uint8_t +copy_space_field_logged_bit(struct gc_edge edge) { + // Each byte has 8 bytes, covering 8 fields. + size_t field = gc_edge_address(edge) / sizeof(uintptr_t); + return 1 << (field % 8); +} + +static int +copy_space_remember_edge(struct copy_space *space, struct gc_edge edge) { + GC_ASSERT(copy_space_contains_edge(space, edge)); + uint8_t* loc = copy_space_field_logged_byte(edge); + uint8_t bit = copy_space_field_logged_bit(edge); + uint8_t byte = atomic_load_explicit(loc, memory_order_acquire); + do { + if (byte & bit) return 0; + } while (!atomic_compare_exchange_weak_explicit(loc, &byte, byte|bit, + memory_order_acq_rel, + memory_order_acquire)); + return 1; +} + +static int +copy_space_forget_edge(struct copy_space *space, struct gc_edge edge) { + GC_ASSERT(copy_space_contains_edge(space, edge)); + uint8_t* loc = copy_space_field_logged_byte(edge); + uint8_t bit = copy_space_field_logged_bit(edge); + uint8_t byte = atomic_load_explicit(loc, memory_order_acquire); + do { + if (!(byte & bit)) return 0; + } while (!atomic_compare_exchange_weak_explicit(loc, &byte, byte&~bit, + memory_order_acq_rel, + memory_order_acquire)); + return 1; +} + static inline void copy_space_allocator_init(struct copy_space_allocator *alloc) { memset(alloc, 0, sizeof(*alloc)); @@ -703,6 +749,26 @@ copy_space_shrink(struct copy_space *space, size_t bytes) { // can help us then! 
} +static int +copy_space_has_field_logging_bits(struct copy_space *space) { + return space->flags & COPY_SPACE_HAS_FIELD_LOGGING_BITS; +} + +static size_t +copy_space_field_logging_blocks(struct copy_space *space) { + if (!copy_space_has_field_logging_bits(space)) + return 0; + size_t bytes = COPY_SPACE_SLAB_SIZE / sizeof (uintptr_t) / 8; + size_t blocks = + align_up(bytes, COPY_SPACE_BLOCK_SIZE) / COPY_SPACE_BLOCK_SIZE; + return blocks; +} + +static size_t +copy_space_first_payload_block(struct copy_space *space) { + return copy_space_field_logging_blocks(space); +} + static void copy_space_expand(struct copy_space *space, size_t bytes) { GC_ASSERT(!copy_space_fixed_size(space)); @@ -716,7 +782,9 @@ copy_space_expand(struct copy_space *space, size_t bytes) { struct gc_lock lock = copy_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { - for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { + for (size_t idx = copy_space_first_payload_block(space); + idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; + idx++) { struct copy_space_block *block = &slabs[slab].headers[idx]; block->all_zeroes[0] = block->all_zeroes[1] = 1; block->in_core = 0; @@ -791,7 +859,9 @@ copy_space_init(struct copy_space *space, size_t size, uint32_t flags, copy_space_add_slabs(space, slabs, nslabs); struct gc_lock lock = copy_space_lock(space); for (size_t slab = 0; slab < nslabs; slab++) { - for (size_t idx = 0; idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; idx++) { + for (size_t idx = copy_space_first_payload_block(space); + idx < COPY_SPACE_NONHEADER_BLOCKS_PER_SLAB; + idx++) { struct copy_space_block *block = &slabs[slab].headers[idx]; block->all_zeroes[0] = block->all_zeroes[1] = 1; block->in_core = 0; From d96b53facde55871c9a5a90254efbb2af7f0d2c1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 9 Dec 2024 14:22:36 +0100 Subject: [PATCH 336/403] Missing inline decl: gc_old_generation_check_alloc_table_bit_pattern --- api/gc-attrs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/gc-attrs.h b/api/gc-attrs.h index 389cb536e..d7fc77682 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -35,7 +35,7 @@ enum gc_old_generation_check_kind { static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t obj_size) GC_ALWAYS_INLINE; -static uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) GC_ALWAYS_INLINE; +static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) GC_ALWAYS_INLINE; enum gc_write_barrier_kind { GC_WRITE_BARRIER_NONE, From 274cf438649fd920417634a9214e66edf4bfc9ee Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 9 Dec 2024 14:28:19 +0100 Subject: [PATCH 337/403] Add new old-gen predicate method: check a range of addresses --- api/gc-api.h | 16 ++++++++++++++++ api/gc-attrs.h | 1 + src/bdw.c | 10 ++++++++++ src/mmc.c | 10 ++++++++++ src/pcc.c | 11 +++++++++++ src/semi.c | 10 ++++++++++ 6 files changed, 58 insertions(+) diff --git a/api/gc-api.h b/api/gc-api.h index 2efd16ecd..4b23bb543 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -30,6 +30,11 @@ GC_API_ int gc_init(const struct gc_options *options, struct gc_event_listener event_listener, void *event_listener_data); +GC_API_ struct gc_heap* gc_mutator_heap(struct gc_mutator *mut); + +GC_API_ uintptr_t gc_small_object_nursery_low_address(struct gc_heap *heap); +GC_API_ uintptr_t gc_small_object_nursery_high_address(struct gc_heap *heap); + struct gc_mutator_roots; GC_API_ void gc_mutator_set_roots(struct gc_mutator *mut, struct gc_mutator_roots 
*roots); @@ -200,6 +205,17 @@ static inline int gc_object_is_old_generation(struct gc_mutator *mut, uint8_t byte = atomic_load_explicit(byte_loc, memory_order_relaxed); return byte & gc_old_generation_check_alloc_table_bit_pattern(); } + case GC_OLD_GENERATION_CHECK_SMALL_OBJECT_NURSERY: { + struct gc_heap *heap = gc_mutator_heap(mut); + // Note that these addresses are fixed and that the embedder might + // want to store them somewhere or inline them into the output of + // JIT-generated code. They may also be power-of-two aligned. + uintptr_t low_addr = gc_small_object_nursery_low_address(heap); + uintptr_t high_addr = gc_small_object_nursery_high_address(heap); + uintptr_t size = high_addr - low_addr; + uintptr_t addr = gc_ref_value(obj); + return addr - low_addr < size; + } case GC_OLD_GENERATION_CHECK_SLOW: return gc_object_is_old_generation_slow(mut, obj); default: diff --git a/api/gc-attrs.h b/api/gc-attrs.h index d7fc77682..344c24c27 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -30,6 +30,7 @@ static inline int gc_allocator_needs_clear(void) GC_ALWAYS_INLINE; enum gc_old_generation_check_kind { GC_OLD_GENERATION_CHECK_NONE, GC_OLD_GENERATION_CHECK_ALLOC_TABLE, + GC_OLD_GENERATION_CHECK_SMALL_OBJECT_NURSERY, GC_OLD_GENERATION_CHECK_SLOW }; diff --git a/src/bdw.c b/src/bdw.c index d1478d805..72b13012e 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -82,6 +82,16 @@ static inline size_t gc_inline_freelist_object_size(size_t idx) { return (idx + 1U) * GC_INLINE_GRANULE_BYTES; } +struct gc_heap* gc_mutator_heap(struct gc_mutator *mutator) { + return __the_bdw_gc_heap; +} +uintptr_t gc_small_object_nursery_low_address(struct gc_heap *heap) { + GC_CRASH(); +} +uintptr_t gc_small_object_nursery_high_address(struct gc_heap *heap) { + GC_CRASH(); +} + // The values of these must match the internal POINTERLESS and NORMAL // definitions in libgc, for which unfortunately there are no external // definitions. Alack. 
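The GC_OLD_GENERATION_CHECK_SMALL_OBJECT_NURSERY case added above exploits the fact that the nursery occupies one fixed [low, high) address range, so membership reduces to a single unsigned comparison that an embedder can inline into JIT-generated code. A hedged sketch of that test (the helper name is invented; note that a later patch in this series flips the returned polarity so that "old generation" means the address falls outside this range):

#include <stdint.h>

/* In-nursery test as one unsigned compare: when addr < nursery_low the
   subtraction wraps around to a huge value, so a single comparison covers
   both bounds of the range. */
static inline int addr_is_in_small_object_nursery(uintptr_t addr,
                                                  uintptr_t nursery_low,
                                                  uintptr_t nursery_size) {
  return (addr - nursery_low) < nursery_size;
}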
diff --git a/src/mmc.c b/src/mmc.c index d123c4793..0af725138 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -104,6 +104,16 @@ mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } +struct gc_heap* gc_mutator_heap(struct gc_mutator *mutator) { + return mutator_heap(mutator); +} +uintptr_t gc_small_object_nursery_low_address(struct gc_heap *heap) { + GC_CRASH(); +} +uintptr_t gc_small_object_nursery_high_address(struct gc_heap *heap) { + GC_CRASH(); +} + static void gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_heap *heap, diff --git a/src/pcc.c b/src/pcc.c index 0ca94327e..0b5b92236 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -86,6 +86,17 @@ static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } +struct gc_heap* gc_mutator_heap(struct gc_mutator *mutator) { + return mutator_heap(mutator); +} + +uintptr_t gc_small_object_nursery_low_address(struct gc_heap *heap) { + GC_CRASH(); +} +uintptr_t gc_small_object_nursery_high_address(struct gc_heap *heap) { + GC_CRASH(); +} + static void gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_heap *heap, diff --git a/src/semi.c b/src/semi.c index 29cb0c92f..c16cecabd 100644 --- a/src/semi.c +++ b/src/semi.c @@ -81,6 +81,16 @@ static inline struct semi_space* mutator_semi_space(struct gc_mutator *mut) { return heap_semi_space(mutator_heap(mut)); } +struct gc_heap* gc_mutator_heap(struct gc_mutator *mutator) { + return mutator_heap(mutator); +} +uintptr_t gc_small_object_nursery_low_address(struct gc_heap *heap) { + GC_CRASH(); +} +uintptr_t gc_small_object_nursery_high_address(struct gc_heap *heap) { + GC_CRASH(); +} + static uintptr_t align_up(uintptr_t addr, size_t align) { return (addr + align - 1) & ~(align-1); } From 4f8c7bef613f045492990bd6f841c31a06c78825 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 10 Dec 2024 09:40:49 +0100 Subject: [PATCH 338/403] Refactor copy_space_forward to take src and dst spaces --- src/copy-space.h | 21 +++++++++++---------- src/pcc.c | 4 ++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index 920235707..a6382e741 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -483,7 +483,6 @@ static inline int copy_space_forward_atomic(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref, struct copy_space_allocator *alloc) { - GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) @@ -532,7 +531,6 @@ static int copy_space_forward_if_traced_atomic(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref) { - GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); switch (fwd.state) { case GC_FORWARDING_STATE_NOT_FORWARDED: @@ -559,8 +557,6 @@ static inline int copy_space_forward_nonatomic(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref, struct copy_space_allocator *alloc) { - GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); - uintptr_t forwarded = gc_object_forwarded_nonatomic(old_ref); if (forwarded) { gc_edge_update(edge, gc_ref(forwarded)); @@ -582,7 +578,6 @@ static int copy_space_forward_if_traced_nonatomic(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref) { - GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); uintptr_t forwarded = 
gc_object_forwarded_nonatomic(old_ref); if (forwarded) { gc_edge_update(edge, gc_ref(forwarded)); @@ -592,17 +587,23 @@ copy_space_forward_if_traced_nonatomic(struct copy_space *space, } static inline int -copy_space_forward(struct copy_space *space, struct gc_edge edge, +copy_space_forward(struct copy_space *src_space, struct copy_space *dst_space, + struct gc_edge edge, struct gc_ref old_ref, - struct copy_space_allocator *alloc) { - if (GC_PARALLEL && space->atomic_forward) - return copy_space_forward_atomic(space, edge, old_ref, alloc); - return copy_space_forward_nonatomic(space, edge, old_ref, alloc); + struct copy_space_allocator *dst_alloc) { + GC_ASSERT(copy_space_contains(src_space, old_ref)); + GC_ASSERT(src_space != dst_space + || copy_space_object_region(old_ref) != src_space->active_region); + if (GC_PARALLEL && src_space->atomic_forward) + return copy_space_forward_atomic(dst_space, edge, old_ref, dst_alloc); + return copy_space_forward_nonatomic(dst_space, edge, old_ref, dst_alloc); } static inline int copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref) { + GC_ASSERT(copy_space_contains(space, old_ref)); + GC_ASSERT(copy_space_object_region(old_ref) != space->active_region); if (GC_PARALLEL && space->atomic_forward) return copy_space_forward_if_traced_atomic(space, edge, old_ref); return copy_space_forward_if_traced_nonatomic(space, edge, old_ref); diff --git a/src/pcc.c b/src/pcc.c index 0b5b92236..847297407 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -115,8 +115,8 @@ static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, struct gc_trace_worker_data *data) { if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) - return copy_space_forward(heap_copy_space(heap), edge, ref, - &data->allocator); + return copy_space_forward(heap_copy_space(heap), heap_copy_space(heap), + edge, ref, &data->allocator); else if (large_object_space_contains(heap_large_object_space(heap), ref)) return large_object_space_mark_object(heap_large_object_space(heap), ref); else From 336c2dfadd8837a1080fa4d2ae0cfa513a87234f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 11 Dec 2024 11:46:00 +0100 Subject: [PATCH 339/403] Beginnings of scaffolding for generational pcc --- src/pcc.c | 54 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/src/pcc.c b/src/pcc.c index 847297407..09089584b 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -289,16 +289,24 @@ static void wait_for_mutators_to_stop(struct gc_heap *heap) { pthread_cond_wait(&heap->collector_cond, &heap->lock); } -static void +static int is_minor_collection(struct gc_heap *heap) { + if (GC_GENERATIONAL) + GC_CRASH(); + return 0; +} + +static enum gc_collection_kind pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) GC_NEVER_INLINE; -static void +static enum gc_collection_kind pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mutators_are_stopping(heap)); GC_ASSERT(!all_mutators_stopped(heap)); MUTATOR_EVENT(mut, mutator_stopping); MUTATOR_EVENT(mut, mutator_stopped); heap->paused_mutator_count++; + enum gc_collection_kind collection_kind = + is_minor_collection(heap) ? 
GC_COLLECTION_MINOR : GC_COLLECTION_COMPACTING; if (all_mutators_stopped(heap)) pthread_cond_signal(&heap->collector_cond); @@ -308,6 +316,7 @@ pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { heap->paused_mutator_count--; MUTATOR_EVENT(mut, mutator_restarted); + return collection_kind; } static void resize_heap(struct gc_heap *heap, size_t new_size) { @@ -374,8 +383,25 @@ static void sweep_ephemerons(struct gc_heap *heap) { return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); } -static void collect(struct gc_mutator *mut) GC_NEVER_INLINE; -static void collect(struct gc_mutator *mut) { +static int +heap_can_minor_gc(struct gc_heap *heap) { + if (!GC_GENERATIONAL) return 0; + GC_CRASH(); +} + +static enum gc_collection_kind +determine_collection_kind(struct gc_heap *heap, + enum gc_collection_kind requested) { + if (requested == GC_COLLECTION_MINOR && heap_can_minor_gc(heap)) + return GC_COLLECTION_MINOR; + return GC_COLLECTION_COMPACTING; +} + +static enum gc_collection_kind +collect(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) GC_NEVER_INLINE; +static enum gc_collection_kind +collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); struct copy_space *copy_space = heap_copy_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); @@ -389,6 +415,8 @@ static void collect(struct gc_mutator *mut) { wait_for_mutators_to_stop(heap); HEAP_EVENT(heap, mutators_stopped); HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); + enum gc_collection_kind gc_kind = + determine_collection_kind(heap, requested_kind); uint64_t *counter_loc = &heap->total_allocated_bytes_at_last_gc; copy_space_add_to_allocation_counter(copy_space, counter_loc); large_object_space_add_to_allocation_counter(lospace, counter_loc); @@ -427,22 +455,24 @@ static void collect(struct gc_mutator *mut) { } HEAP_EVENT(heap, restarting_mutators); allow_mutators_to_continue(heap); + return gc_kind; } -static void trigger_collection(struct gc_mutator *mut) { +static void trigger_collection(struct gc_mutator *mut, + enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); heap_lock(heap); - long epoch = heap->count; + int prev_kind = -1; while (mutators_are_stopping(heap)) - pause_mutator_for_collection(heap, mut); - if (epoch == heap->count) - collect(mut); + prev_kind = pause_mutator_for_collection(heap, mut); + if (prev_kind < (int)requested_kind) + collect(mut, requested_kind); heap_unlock(heap); } void gc_collect(struct gc_mutator *mut, enum gc_collection_kind kind) { - trigger_collection(mut); + trigger_collection(mut, kind); } static void* allocate_large(struct gc_mutator *mut, size_t size) { @@ -454,7 +484,7 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { copy_space_request_release_memory(heap_copy_space(heap), npages << space->page_size_log2); while (!copy_space_page_out_blocks_until_memory_released(heap_copy_space(heap))) - trigger_collection(mut); + trigger_collection(mut, GC_COLLECTION_COMPACTING); atomic_fetch_add(&heap->large_object_pages, npages); void *ret = large_object_space_alloc(space, npages); @@ -470,7 +500,7 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { } static void get_more_empty_blocks_for_mutator(void *mut) { - trigger_collection(mut); + trigger_collection(mut, GC_COLLECTION_MINOR); } void* gc_allocate_slow(struct gc_mutator 
*mut, size_t size) { From 095d2ff9532a5d30bac5dc7a498296b8787fd3c3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 11 Dec 2024 14:51:48 +0100 Subject: [PATCH 340/403] Copy space maintains block flag indicating which are survivors --- src/copy-space.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/copy-space.h b/src/copy-space.h index a6382e741..21e2f3719 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -57,6 +57,7 @@ struct copy_space_block { struct copy_space_block *next; uint8_t in_core; uint8_t all_zeroes[2]; + uint8_t is_survivor[2]; size_t allocated; // For partly-empty blocks. }; uint8_t padding[COPY_SPACE_HEADER_BYTES_PER_BLOCK]; @@ -135,6 +136,7 @@ struct copy_space { // lock. uint8_t active_region ALIGNED_TO_AVOID_FALSE_SHARING; uint8_t atomic_forward; + uint8_t in_gc; uint32_t flags; size_t allocated_bytes_at_last_gc; size_t fragmentation_at_last_gc; @@ -202,8 +204,10 @@ copy_space_pop_empty_block(struct copy_space *space, const struct gc_lock *lock) { struct copy_space_block *ret = copy_space_block_stack_pop(&space->empty, lock); - if (ret) + if (ret) { ret->allocated = 0; + ret->is_survivor[space->active_region] = 0; + } return ret; } @@ -222,6 +226,8 @@ copy_space_pop_full_block(struct copy_space *space) { static void copy_space_push_full_block(struct copy_space *space, struct copy_space_block *block) { + if (space->in_gc) + block->is_survivor[space->active_region] = 1; copy_space_block_list_push(&space->full, block); } @@ -452,6 +458,7 @@ copy_space_flip(struct copy_space *space) { space->allocated_bytes = 0; space->fragmentation = 0; space->active_region ^= 1; + space->in_gc = 1; } static void @@ -459,6 +466,7 @@ copy_space_finish_gc(struct copy_space *space) { // Mutators stopped, can access nonatomically. space->allocated_bytes_at_last_gc = space->allocated_bytes; space->fragmentation_at_last_gc = space->fragmentation; + space->in_gc = 0; } static void @@ -866,6 +874,7 @@ copy_space_init(struct copy_space *space, size_t size, uint32_t flags, struct copy_space_block *block = &slabs[slab].headers[idx]; block->all_zeroes[0] = block->all_zeroes[1] = 1; block->in_core = 0; + block->is_survivor[0] = block->is_survivor[1] = 0; if (reserved > size) { copy_space_page_out_block(space, block, &lock); reserved -= COPY_SPACE_BLOCK_SIZE; From c7645975bffb99322c9ead7baf0e6934c190545f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 11 Dec 2024 15:02:50 +0100 Subject: [PATCH 341/403] pcc: abstract space into which small objects are allocated This may be the nursery, in future. --- src/pcc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/pcc.c b/src/pcc.c index 09089584b..485981d70 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -76,6 +76,10 @@ struct gc_trace_worker_data { static inline struct copy_space* heap_copy_space(struct gc_heap *heap) { return &heap->copy_space; } +static inline struct copy_space* heap_allocation_space(struct gc_heap *heap) { + // The space into which small objects are allocated. 
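Patch 340 above adds the per-region is_survivor flag whose consumer, copy_space_should_promote, only lands a few patches later in this series. A comment-style reading of how the flag travels across two minor collections (this restates the patches, it adds no new behavior):

/* Minor GC n:   copy_space_flip() toggles the active region to A and sets
 *               in_gc; blocks that fill up with copied objects are pushed
 *               full with is_survivor[A] = 1.
 * Between GCs:  empty blocks popped for mutator allocation clear
 *               is_survivor[active], so ordinary allocation never produces
 *               survivor blocks.
 * Minor GC n+1: the active region toggles to B; an object still sitting in
 *               region A whose block has is_survivor[A] set has already
 *               survived one collection, which is exactly what
 *               copy_space_should_promote() tests via active_region ^ 1. */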
+ return heap_copy_space(heap); +} static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { return &heap->large_object_space; } @@ -191,7 +195,7 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { } static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); + copy_space_allocator_finish(&mut->allocator, heap_allocation_space(heap)); MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; heap_lock(heap); @@ -461,7 +465,7 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { static void trigger_collection(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); + copy_space_allocator_finish(&mut->allocator, heap_allocation_space(heap)); heap_lock(heap); int prev_kind = -1; while (mutators_are_stopping(heap)) @@ -542,7 +546,7 @@ int* gc_safepoint_flag_loc(struct gc_mutator *mut) { void gc_safepoint_slow(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); + copy_space_allocator_finish(&mut->allocator, heap_allocation_space(heap)); heap_lock(heap); while (mutators_are_stopping(mutator_heap(mut))) pause_mutator_for_collection(heap, mut); @@ -735,7 +739,7 @@ void gc_finish_for_thread(struct gc_mutator *mut) { static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); - copy_space_allocator_finish(&mut->allocator, heap_copy_space(heap)); + copy_space_allocator_finish(&mut->allocator, heap_allocation_space(heap)); heap_lock(heap); heap->inactive_mutator_count++; if (all_mutators_stopped(heap)) From 0318770266e0ec6535d0151881fda1536e874b3e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 11 Dec 2024 15:11:56 +0100 Subject: [PATCH 342/403] pcc: abstract space that has blocks paged out and in --- src/pcc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/pcc.c b/src/pcc.c index 485981d70..8dba8f32f 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -80,6 +80,10 @@ static inline struct copy_space* heap_allocation_space(struct gc_heap *heap) { // The space into which small objects are allocated. return heap_copy_space(heap); } +static inline struct copy_space* heap_resizable_space(struct gc_heap *heap) { + // The space that gets resized. 
+ return heap_copy_space(heap); +} static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { return &heap->large_object_space; } @@ -284,7 +288,7 @@ static void heap_reset_large_object_pages(struct gc_heap *heap, size_t npages) { GC_ASSERT(npages <= previous); size_t bytes = (previous - npages) << heap_large_object_space(heap)->page_size_log2; - copy_space_reacquire_memory(heap_copy_space(heap), bytes); + copy_space_reacquire_memory(heap_resizable_space(heap), bytes); } static void wait_for_mutators_to_stop(struct gc_heap *heap) { @@ -330,9 +334,9 @@ static void resize_heap(struct gc_heap *heap, size_t new_size) { DEBUG("------ old heap size: %zu bytes\n", heap->size); DEBUG("------ new heap size: %zu bytes\n", new_size); if (new_size < heap->size) - copy_space_shrink(heap_copy_space(heap), heap->size - new_size); + copy_space_shrink(heap_resizable_space(heap), heap->size - new_size); else - copy_space_expand(heap_copy_space(heap), new_size - heap->size); + copy_space_expand(heap_resizable_space(heap), new_size - heap->size); heap->size = new_size; HEAP_EVENT(heap, heap_resized, new_size); @@ -485,9 +489,9 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { size_t npages = large_object_space_npages(space, size); - copy_space_request_release_memory(heap_copy_space(heap), - npages << space->page_size_log2); - while (!copy_space_page_out_blocks_until_memory_released(heap_copy_space(heap))) + copy_space_request_release_memory(heap_resizable_space(heap), + npages << space->page_size_log2); + while (!copy_space_page_out_blocks_until_memory_released(heap_resizable_space(heap))) trigger_collection(mut, GC_COLLECTION_COMPACTING); atomic_fetch_add(&heap->large_object_pages, npages); From e65c81518d8457b42ccf548de1098567dc2632b9 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 16:02:37 +0100 Subject: [PATCH 343/403] Fix copy space compilation in debug mode Also add copy_space_should_promote --- src/copy-space.h | 55 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index 21e2f3719..7f4331b58 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -82,8 +82,7 @@ struct copy_space_slab { STATIC_ASSERT_EQ(sizeof(struct copy_space_slab), COPY_SPACE_SLAB_SIZE); static inline struct copy_space_block* -copy_space_block_header(struct copy_space_block_payload *payload) { - uintptr_t addr = (uintptr_t) payload; +copy_space_block_for_addr(uintptr_t addr) { uintptr_t base = align_down(addr, COPY_SPACE_SLAB_SIZE); struct copy_space_slab *slab = (struct copy_space_slab*) base; uintptr_t block_idx = @@ -91,6 +90,11 @@ copy_space_block_header(struct copy_space_block_payload *payload) { return &slab->headers[block_idx - COPY_SPACE_HEADER_BLOCKS_PER_SLAB]; } +static inline struct copy_space_block* +copy_space_block_header(struct copy_space_block_payload *payload) { + return copy_space_block_for_addr((uintptr_t) payload); +} + static inline struct copy_space_block_payload* copy_space_block_payload(struct copy_space_block *block) { uintptr_t addr = (uintptr_t) block; @@ -469,12 +473,34 @@ copy_space_finish_gc(struct copy_space *space) { space->in_gc = 0; } +static int +copy_space_can_allocate(struct copy_space *space, size_t bytes) { + // With lock! 
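/* (Clarifying note, an assumption rather than part of the patch: "with
   lock" appears to mean the caller must ensure space->empty is not being
   mutated concurrently, since this is a plain, non-atomic walk of the
   empty-block list; the caller added later in this series,
   heap_can_minor_gc(), runs during collection setup, after all mutators
   have stopped.) */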
+ for (struct copy_space_block *empties = space->empty.list.head; + empties; + empties = empties->next) { + if (bytes <= COPY_SPACE_REGION_SIZE) return 1; + bytes -= COPY_SPACE_REGION_SIZE; + } + return 0; +} + static void copy_space_add_to_allocation_counter(struct copy_space *space, uintptr_t *counter) { *counter += space->allocated_bytes - space->allocated_bytes_at_last_gc; } +static inline int +copy_space_contains_address(struct copy_space *space, uintptr_t addr) { + return extents_contain_addr(space->extents, addr); +} + +static inline int +copy_space_contains(struct copy_space *space, struct gc_ref ref) { + return copy_space_contains_address(space, gc_ref_value(ref)); +} + static void copy_space_gc_during_evacuation(void *data) { // If space is really tight and reordering of objects during @@ -617,11 +643,6 @@ copy_space_forward_if_traced(struct copy_space *space, struct gc_edge edge, return copy_space_forward_if_traced_nonatomic(space, edge, old_ref); } -static inline int -copy_space_contains(struct copy_space *space, struct gc_ref ref) { - return extents_contain_addr(space->extents, gc_ref_value(ref)); -} - static int copy_space_is_aligned(struct copy_space *space) { return space->flags & COPY_SPACE_ALIGNED; @@ -655,6 +676,12 @@ copy_space_contains_address_aligned(struct copy_space *space, uintptr_t addr) { return (addr - low_addr) < size; } +static inline int +copy_space_contains_edge_aligned(struct copy_space *space, + struct gc_edge edge) { + return copy_space_contains_address_aligned(space, gc_edge_address(edge)); +} + static uint8_t* copy_space_field_logged_byte(struct gc_edge edge) { uintptr_t addr = gc_edge_address(edge); @@ -672,6 +699,20 @@ copy_space_field_logged_bit(struct gc_edge edge) { return 1 << (field % 8); } +static inline int +copy_space_should_promote(struct copy_space *space, struct gc_ref ref) { + GC_ASSERT(copy_space_contains(space, ref)); + uintptr_t addr = gc_ref_value(ref); + struct copy_space_block *block = copy_space_block_for_addr(gc_ref_value(ref)); + GC_ASSERT_EQ(copy_space_object_region(ref), space->active_region ^ 1); + return block->is_survivor[space->active_region ^ 1]; +} + +static int +copy_space_contains_edge(struct copy_space *space, struct gc_edge edge) { + return copy_space_contains_address(space, gc_edge_address(edge)); +} + static int copy_space_remember_edge(struct copy_space *space, struct gc_edge edge) { GC_ASSERT(copy_space_contains_edge(space, edge)); From 47aa6f041fa866899723a2f3c6a59c0edb8e8549 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 15:57:33 +0100 Subject: [PATCH 344/403] Add ALWAYS_INLINE to gc_field_set_visit_edge_buffer --- src/field-set.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/field-set.h b/src/field-set.h index ee7df811d..60727fcaa 100644 --- a/src/field-set.h +++ b/src/field-set.h @@ -167,6 +167,14 @@ gc_field_set_clear(struct gc_field_set *set, } } +static inline void +gc_field_set_visit_edge_buffer(struct gc_field_set *set, + struct gc_edge_buffer *buf, + void (*visit)(struct gc_edge, + struct gc_heap*, + void *data), + struct gc_heap *heap, + void *data) GC_ALWAYS_INLINE; static inline void gc_field_set_visit_edge_buffer(struct gc_field_set *set, struct gc_edge_buffer *buf, From 8b96d8cf904b593d038e951a510d676ba6351c75 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 08:46:25 +0100 Subject: [PATCH 345/403] Fix embarassing use of uninitialized variable --- src/gc-finalizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc-finalizer.c 
b/src/gc-finalizer.c index 5365899fc..ae795cccf 100644 --- a/src/gc-finalizer.c +++ b/src/gc-finalizer.c @@ -223,7 +223,7 @@ size_t gc_visit_finalizer_roots(struct gc_finalizer_state *state, void *), struct gc_heap *heap, void *visit_data) { - size_t count; + size_t count = 0; for (size_t tidx = 0; tidx < state->table_count; tidx++) { struct gc_finalizer_table *table = &state->tables[tidx]; if (table->finalizer_count) { From 209be38640a9c48d43a0a92f03362eb78ca25c9d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 15:27:22 +0100 Subject: [PATCH 346/403] Pushing logged edges takes ownership of edge buffers This allows field logging to proceed during collection, which might add new edge buffers. Also fix a bug that would cause debug-mode assertion failures, where clearing a field set didn't clear edge buffer next pointers. --- src/field-set.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/field-set.h b/src/field-set.h index 60727fcaa..f5b2e42c6 100644 --- a/src/field-set.h +++ b/src/field-set.h @@ -132,10 +132,12 @@ gc_field_set_release_buffer(struct gc_field_set *set, static void gc_field_set_add_roots(struct gc_field_set *set, struct gc_tracer *tracer) { struct gc_edge_buffer *buf; - for (buf = set->partly_full.list.head; buf; buf = buf->next) + struct gc_lock lock = gc_lock_acquire(&set->lock); + while ((buf = gc_edge_buffer_stack_pop(&set->partly_full, &lock))) gc_tracer_add_root(tracer, gc_root_edge_buffer(buf)); - for (buf = set->full.head; buf; buf = buf->next) + while ((buf = gc_edge_buffer_list_pop(&set->full))) gc_tracer_add_root(tracer, gc_root_edge_buffer(buf)); + gc_lock_release(&lock); } static void @@ -151,6 +153,7 @@ gc_field_set_clear(struct gc_field_set *set, struct gc_edge_buffer *buf, *next; for (buf = partly_full; buf; buf = next) { next = buf->next; + buf->next = NULL; if (forget_edge) for (size_t i = 0; i < buf->size; i++) forget_edge(buf->edges[i], heap); @@ -159,6 +162,7 @@ gc_field_set_clear(struct gc_field_set *set, } for (buf = full; buf; buf = next) { next = buf->next; + buf->next = NULL; if (forget_edge) for (size_t i = 0; i < buf->size; i++) forget_edge(buf->edges[i], heap); @@ -185,6 +189,7 @@ gc_field_set_visit_edge_buffer(struct gc_field_set *set, void *data) { for (size_t i = 0; i < buf->size; i++) visit(buf->edges[i], heap, data); + gc_field_set_release_buffer(set, buf); } static void From e4048b5296b70f2fd161add55298c41911b98d2d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 15:42:25 +0100 Subject: [PATCH 347/403] Add offset to field-logging fast path --- api/bdw-attrs.h | 3 +++ api/gc-api.h | 5 +++-- api/gc-attrs.h | 1 + api/mmc-attrs.h | 4 ++++ api/pcc-attrs.h | 3 +++ api/semi-attrs.h | 3 +++ 6 files changed, 17 insertions(+), 2 deletions(-) diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index 05f7e4cb7..938356a5e 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -59,6 +59,9 @@ static inline size_t gc_write_barrier_card_size(void) { static inline size_t gc_write_barrier_field_table_alignment(void) { GC_CRASH(); } +static inline ptrdiff_t gc_write_barrier_field_table_offset(void) { + GC_CRASH(); +} static inline size_t gc_write_barrier_field_fields_per_byte(void) { GC_CRASH(); } diff --git a/api/gc-api.h b/api/gc-api.h index 4b23bb543..ff1a20927 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -214,7 +214,7 @@ static inline int gc_object_is_old_generation(struct gc_mutator *mut, uintptr_t high_addr = gc_small_object_nursery_high_address(heap); uintptr_t size = high_addr - low_addr; 
uintptr_t addr = gc_ref_value(obj); - return addr - low_addr < size; + return addr - low_addr >= size; } case GC_OLD_GENERATION_CHECK_SLOW: return gc_object_is_old_generation_slow(mut, obj); @@ -252,13 +252,14 @@ static inline int gc_write_barrier_fast(struct gc_mutator *mut, struct gc_ref ob size_t field_table_alignment = gc_write_barrier_field_table_alignment(); size_t fields_per_byte = gc_write_barrier_field_fields_per_byte(); uint8_t first_bit_pattern = gc_write_barrier_field_first_bit_pattern(); + ssize_t table_offset = gc_write_barrier_field_table_offset(); uintptr_t addr = gc_edge_address(edge); uintptr_t base = addr & ~(field_table_alignment - 1); uintptr_t field = (addr & (field_table_alignment - 1)) / sizeof(uintptr_t); uintptr_t log_byte = field / fields_per_byte; uint8_t log_bit = first_bit_pattern << (field % fields_per_byte); - uint8_t *byte_loc = (uint8_t*)(base + log_byte); + uint8_t *byte_loc = (uint8_t*)(base + table_offset + log_byte); uint8_t byte = atomic_load_explicit(byte_loc, memory_order_relaxed); return !(byte & log_bit); } diff --git a/api/gc-attrs.h b/api/gc-attrs.h index 344c24c27..f0a6e94e6 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -49,6 +49,7 @@ static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) static inline size_t gc_write_barrier_card_table_alignment(void) GC_ALWAYS_INLINE; static inline size_t gc_write_barrier_card_size(void) GC_ALWAYS_INLINE; static inline size_t gc_write_barrier_field_table_alignment(void) GC_ALWAYS_INLINE; +static inline ptrdiff_t gc_write_barrier_field_table_offset(void) GC_ALWAYS_INLINE; static inline size_t gc_write_barrier_field_fields_per_byte(void) GC_ALWAYS_INLINE; static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) GC_ALWAYS_INLINE; diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index 5d4dcb490..65cb434c9 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -73,6 +73,10 @@ static inline size_t gc_write_barrier_field_table_alignment(void) { GC_ASSERT(GC_GENERATIONAL); return gc_allocator_alloc_table_alignment(); } +static inline ptrdiff_t gc_write_barrier_field_table_offset(void) { + GC_ASSERT(GC_GENERATIONAL); + return 0; +} static inline size_t gc_write_barrier_field_fields_per_byte(void) { GC_ASSERT(GC_GENERATIONAL); return 2; diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index c86d79471..c3c928f54 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -62,6 +62,9 @@ static inline size_t gc_write_barrier_card_size(void) { static inline size_t gc_write_barrier_field_table_alignment(void) { GC_CRASH(); } +static inline ptrdiff_t gc_write_barrier_field_table_offset(void) { + GC_CRASH(); +} static inline size_t gc_write_barrier_field_fields_per_byte(void) { GC_CRASH(); } diff --git a/api/semi-attrs.h b/api/semi-attrs.h index 997b031ee..69a87560e 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -61,6 +61,9 @@ static inline size_t gc_write_barrier_card_size(void) { static inline size_t gc_write_barrier_field_table_alignment(void) { GC_CRASH(); } +static inline ptrdiff_t gc_write_barrier_field_table_offset(void) { + GC_CRASH(); +} static inline size_t gc_write_barrier_field_fields_per_byte(void) { GC_CRASH(); } From 555694965dde4d62b0a0c3091aca344e0c7653c1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 15:57:49 +0100 Subject: [PATCH 348/403] Looking up large object for edge returns the object --- src/large-object-space.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/large-object-space.h b/src/large-object-space.h index 
703c048b4..f3470e0e4 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -123,12 +123,12 @@ large_object_space_contains(struct large_object_space *space, } static inline struct gc_ref -large_object_space_contains_edge(struct large_object_space *space, - struct gc_edge edge) { +large_object_space_object_containing_edge(struct large_object_space *space, + struct gc_edge edge) { pthread_mutex_lock(&space->lock); struct large_object_node *node = large_object_tree_lookup(&space->object_tree, gc_edge_address(edge)); - uintptr_t addr = node ? node->key.addr : 0; + uintptr_t addr = (node && node->value.is_live) ? node->key.addr : 0; pthread_mutex_unlock(&space->lock); return gc_ref(addr); } From 0b8630145abcdca86d1a6e81a65f0168b35b7e77 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 15:48:20 +0100 Subject: [PATCH 349/403] Copy space clears log bits when obtaining fresh blocks --- src/copy-space.h | 111 ++++++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 44 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index 7f4331b58..bd2df7334 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -314,6 +314,68 @@ copy_space_reacquire_memory(struct copy_space *space, size_t bytes) { GC_ASSERT(pending + COPY_SPACE_BLOCK_SIZE > 0); } +static inline int +copy_space_contains_address(struct copy_space *space, uintptr_t addr) { + return extents_contain_addr(space->extents, addr); +} + +static inline int +copy_space_contains(struct copy_space *space, struct gc_ref ref) { + return copy_space_contains_address(space, gc_ref_value(ref)); +} + +static int +copy_space_has_field_logging_bits(struct copy_space *space) { + return space->flags & COPY_SPACE_HAS_FIELD_LOGGING_BITS; +} + +static size_t +copy_space_field_logging_blocks(struct copy_space *space) { + if (!copy_space_has_field_logging_bits(space)) + return 0; + size_t bytes = COPY_SPACE_SLAB_SIZE / sizeof (uintptr_t) / 8; + size_t blocks = + align_up(bytes, COPY_SPACE_BLOCK_SIZE) / COPY_SPACE_BLOCK_SIZE; + return blocks; +} + +static uint8_t* +copy_space_field_logged_byte(struct gc_edge edge) { + uintptr_t addr = gc_edge_address(edge); + uintptr_t base = align_down(addr, COPY_SPACE_SLAB_SIZE); + base += offsetof(struct copy_space_slab, blocks); + uintptr_t field = (addr & (COPY_SPACE_SLAB_SIZE - 1)) / sizeof(uintptr_t); + uintptr_t byte = field / 8; + return (uint8_t*) (base + field); +} + +static uint8_t +copy_space_field_logged_bit(struct gc_edge edge) { + // Each byte has 8 bytes, covering 8 fields. 
+ size_t field = gc_edge_address(edge) / sizeof(uintptr_t); + return 1 << (field % 8); +} + +static void +copy_space_clear_field_logged_bits_for_region(struct copy_space *space, + void *region_base) { + uintptr_t addr = (uintptr_t)region_base; + GC_ASSERT_EQ(addr, align_down(addr, COPY_SPACE_REGION_SIZE)); + GC_ASSERT(copy_space_contains_address(space, addr)); + if (copy_space_has_field_logging_bits(space)) + memset(copy_space_field_logged_byte(gc_edge(region_base)), + 0, + COPY_SPACE_REGION_SIZE / sizeof(uintptr_t) / 8); +} + +static void +copy_space_clear_field_logged_bits_for_block(struct copy_space *space, + struct copy_space_block *block) { + struct copy_space_block_payload *payload = copy_space_block_payload(block); + copy_space_clear_field_logged_bits_for_region(space, &payload->regions[0]); + copy_space_clear_field_logged_bits_for_region(space, &payload->regions[1]); +} + static inline void copy_space_allocator_set_block(struct copy_space_allocator *alloc, struct copy_space_block *block, @@ -344,10 +406,12 @@ copy_space_allocator_acquire_empty_block(struct copy_space_allocator *alloc, gc_lock_release(&lock); if (copy_space_allocator_acquire_block(alloc, block, space->active_region)) { block->in_core = 1; - if (block->all_zeroes[space->active_region]) + if (block->all_zeroes[space->active_region]) { block->all_zeroes[space->active_region] = 0; - else + } else { memset((char*)alloc->hp, 0, COPY_SPACE_REGION_SIZE); + copy_space_clear_field_logged_bits_for_region(space, (void*)alloc->hp); + } return 1; } return 0; @@ -491,16 +555,6 @@ copy_space_add_to_allocation_counter(struct copy_space *space, *counter += space->allocated_bytes - space->allocated_bytes_at_last_gc; } -static inline int -copy_space_contains_address(struct copy_space *space, uintptr_t addr) { - return extents_contain_addr(space->extents, addr); -} - -static inline int -copy_space_contains(struct copy_space *space, struct gc_ref ref) { - return copy_space_contains_address(space, gc_ref_value(ref)); -} - static void copy_space_gc_during_evacuation(void *data) { // If space is really tight and reordering of objects during @@ -682,23 +736,6 @@ copy_space_contains_edge_aligned(struct copy_space *space, return copy_space_contains_address_aligned(space, gc_edge_address(edge)); } -static uint8_t* -copy_space_field_logged_byte(struct gc_edge edge) { - uintptr_t addr = gc_edge_address(edge); - uintptr_t base = align_down(addr, COPY_SPACE_SLAB_SIZE); - base += offsetof(struct copy_space_slab, blocks); - uintptr_t field = (addr & (COPY_SPACE_SLAB_SIZE - 1)) / sizeof(uintptr_t); - uintptr_t byte = field / 8; - return (uint8_t*) (base + field); -} - -static uint8_t -copy_space_field_logged_bit(struct gc_edge edge) { - // Each byte has 8 bytes, covering 8 fields. - size_t field = gc_edge_address(edge) / sizeof(uintptr_t); - return 1 << (field % 8); -} - static inline int copy_space_should_promote(struct copy_space *space, struct gc_ref ref) { GC_ASSERT(copy_space_contains(space, ref)); @@ -799,21 +836,6 @@ copy_space_shrink(struct copy_space *space, size_t bytes) { // can help us then! 
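These are the same logging bytes that the embedder-visible write barrier from patch 347 consults, and the pcc attributes added just below fill in concrete constants (64 MiB field-table alignment, a 128 KiB table offset, 8 fields per byte, first bit pattern 1). A condensed, hedged sketch of what the inlined fast path computes once those constants are substituted -- a standalone illustration, not code from the patch:

#include <stdatomic.h>
#include <stdint.h>

/* Nonzero when the slow path (actually recording the field) is needed. */
static inline int pcc_field_barrier_needed(uintptr_t edge_addr) {
  uintptr_t alignment = 64ul * 1024 * 1024;             /* slab-aligned table */
  uintptr_t base  = edge_addr & ~(alignment - 1);
  uintptr_t field = (edge_addr & (alignment - 1)) / sizeof(uintptr_t);
  _Atomic uint8_t *byte_loc =
      (_Atomic uint8_t *)(base + 128 * 1024 + field / 8); /* 128 KiB offset */
  uint8_t bit = 1u << (field % 8);
  return !(atomic_load_explicit(byte_loc, memory_order_relaxed) & bit);
}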
} -static int -copy_space_has_field_logging_bits(struct copy_space *space) { - return space->flags & COPY_SPACE_HAS_FIELD_LOGGING_BITS; -} - -static size_t -copy_space_field_logging_blocks(struct copy_space *space) { - if (!copy_space_has_field_logging_bits(space)) - return 0; - size_t bytes = COPY_SPACE_SLAB_SIZE / sizeof (uintptr_t) / 8; - size_t blocks = - align_up(bytes, COPY_SPACE_BLOCK_SIZE) / COPY_SPACE_BLOCK_SIZE; - return blocks; -} - static size_t copy_space_first_payload_block(struct copy_space *space) { return copy_space_field_logging_blocks(space); @@ -874,6 +896,7 @@ copy_space_page_out_blocks(void *data) { block->all_zeroes[0] = block->all_zeroes[1] = 1; gc_platform_discard_memory(copy_space_block_payload(block), COPY_SPACE_BLOCK_SIZE); + copy_space_clear_field_logged_bits_for_block(space, block); copy_space_block_stack_push(&space->paged_out[age + 1], block, &lock); } gc_lock_release(&lock); From 65b74b5abb71be18dcc17f5b02a27062d1805357 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 15:57:28 +0100 Subject: [PATCH 350/403] Add generational support to pcc --- api/pcc-attrs.h | 26 ++- src/pcc.c | 576 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 540 insertions(+), 62 deletions(-) diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index c3c928f54..654acf8b9 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -43,15 +43,23 @@ static inline int gc_allocator_needs_clear(void) { return 0; } -static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t) { - return GC_OLD_GENERATION_CHECK_NONE; +static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t size) { + if (!GC_GENERATIONAL) + return GC_OLD_GENERATION_CHECK_NONE; + if (size <= gc_allocator_large_threshold()) + return GC_OLD_GENERATION_CHECK_SMALL_OBJECT_NURSERY; + return GC_OLD_GENERATION_CHECK_SLOW; } static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { GC_CRASH(); } static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { - return GC_WRITE_BARRIER_NONE; + if (!GC_GENERATIONAL) + return GC_WRITE_BARRIER_NONE; + if (obj_size <= gc_allocator_large_threshold()) + return GC_WRITE_BARRIER_FIELD; + return GC_WRITE_BARRIER_SLOW; } static inline size_t gc_write_barrier_card_table_alignment(void) { GC_CRASH(); @@ -60,16 +68,20 @@ static inline size_t gc_write_barrier_card_size(void) { GC_CRASH(); } static inline size_t gc_write_barrier_field_table_alignment(void) { - GC_CRASH(); + GC_ASSERT(GC_GENERATIONAL); + return 64 * 1024 * 1024; } static inline ptrdiff_t gc_write_barrier_field_table_offset(void) { - GC_CRASH(); + GC_ASSERT(GC_GENERATIONAL); + return 128 * 1024; } static inline size_t gc_write_barrier_field_fields_per_byte(void) { - GC_CRASH(); + GC_ASSERT(GC_GENERATIONAL); + return 8; } static inline uint8_t gc_write_barrier_field_first_bit_pattern(void) { - GC_CRASH(); + GC_ASSERT(GC_GENERATIONAL); + return 1; } static inline enum gc_safepoint_mechanism gc_safepoint_mechanism(void) { diff --git a/src/pcc.c b/src/pcc.c index 8dba8f32f..7ee017ae2 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -12,6 +12,7 @@ #include "background-thread.h" #include "copy-space.h" #include "debug.h" +#include "field-set.h" #include "gc-align.h" #include "gc-inline.h" #include "gc-platform.h" @@ -27,9 +28,17 @@ #include "pcc-attrs.h" struct gc_heap { - struct copy_space copy_space; +#if GC_GENERATIONAL + struct copy_space new_space; + struct copy_space old_space; +#else + struct copy_space mono_space; +#endif struct 
large_object_space large_object_space; struct gc_extern_space *extern_space; +#if GC_GENERATIONAL + struct gc_field_set remembered_set; +#endif size_t large_object_pages; pthread_mutex_t lock; pthread_cond_t collector_cond; @@ -37,6 +46,13 @@ struct gc_heap { size_t size; size_t total_allocated_bytes_at_last_gc; int collecting; +#if GC_GENERATIONAL + int is_minor_collection; + size_t per_processor_nursery_size; + size_t nursery_size; +#endif + size_t processor_count; + size_t max_active_mutator_count; int check_pending_ephemerons; struct gc_pending_ephemerons *pending_ephemerons; struct gc_finalizer_state *finalizer_state; @@ -62,6 +78,9 @@ struct gc_heap { struct gc_mutator { struct copy_space_allocator allocator; +#if GC_GENERATIONAL + struct gc_field_set_writer logger; +#endif struct gc_heap *heap; struct gc_mutator_roots *roots; void *event_listener_data; @@ -70,26 +89,116 @@ struct gc_mutator { }; struct gc_trace_worker_data { +#if GC_GENERATIONAL + struct copy_space_allocator new_allocator; + struct copy_space_allocator old_allocator; + struct gc_field_set_writer logger; +#else struct copy_space_allocator allocator; +#endif }; -static inline struct copy_space* heap_copy_space(struct gc_heap *heap) { - return &heap->copy_space; +static inline struct copy_space* heap_mono_space(struct gc_heap *heap) { +#if GC_GENERATIONAL + GC_CRASH(); +#else + return &heap->mono_space; +#endif } + +static inline struct copy_space* heap_new_space(struct gc_heap *heap) { +#if GC_GENERATIONAL + return &heap->new_space; +#else + GC_CRASH(); +#endif +} + +static inline struct copy_space* heap_old_space(struct gc_heap *heap) { +#if GC_GENERATIONAL + return &heap->old_space; +#else + GC_CRASH(); +#endif +} + +static inline struct gc_field_set* heap_remembered_set(struct gc_heap *heap) { +#if GC_GENERATIONAL + return &heap->remembered_set; +#else + GC_CRASH(); +#endif +} + +static inline struct copy_space_allocator* +trace_worker_mono_space_allocator(struct gc_trace_worker_data *data) { +#if GC_GENERATIONAL + GC_CRASH(); +#else + return &data->allocator; +#endif +} + +static inline struct copy_space_allocator* +trace_worker_new_space_allocator(struct gc_trace_worker_data *data) { +#if GC_GENERATIONAL + return &data->new_allocator; +#else + GC_CRASH(); +#endif +} + +static inline struct copy_space_allocator* +trace_worker_old_space_allocator(struct gc_trace_worker_data *data) { +#if GC_GENERATIONAL + return &data->old_allocator; +#else + GC_CRASH(); +#endif +} + +static inline struct gc_field_set_writer* +trace_worker_field_logger(struct gc_trace_worker_data *data) { +#if GC_GENERATIONAL + return &data->logger; +#else + GC_CRASH(); +#endif +} + +static inline struct gc_field_set_writer* +mutator_field_logger(struct gc_mutator *mut) { +#if GC_GENERATIONAL + return &mut->logger; +#else + GC_CRASH(); +#endif +} + +static int is_minor_collection(struct gc_heap *heap) { +#if GC_GENERATIONAL + return heap->is_minor_collection; +#else + GC_CRASH(); +#endif +} + static inline struct copy_space* heap_allocation_space(struct gc_heap *heap) { - // The space into which small objects are allocated. - return heap_copy_space(heap); + return GC_GENERATIONAL ? heap_new_space(heap) : heap_mono_space(heap); } + static inline struct copy_space* heap_resizable_space(struct gc_heap *heap) { - // The space that gets resized. - return heap_copy_space(heap); + return GC_GENERATIONAL ? 
heap_old_space(heap) : heap_mono_space(heap); } + static inline struct large_object_space* heap_large_object_space(struct gc_heap *heap) { return &heap->large_object_space; } + static inline struct gc_extern_space* heap_extern_space(struct gc_heap *heap) { return heap->extern_space; } + static inline struct gc_heap* mutator_heap(struct gc_mutator *mutator) { return mutator->heap; } @@ -99,9 +208,13 @@ struct gc_heap* gc_mutator_heap(struct gc_mutator *mutator) { } uintptr_t gc_small_object_nursery_low_address(struct gc_heap *heap) { + if (GC_GENERATIONAL) + return copy_space_low_aligned_address(heap_new_space(heap)); GC_CRASH(); } uintptr_t gc_small_object_nursery_high_address(struct gc_heap *heap) { + if (GC_GENERATIONAL) + return copy_space_high_aligned_address(heap_new_space(heap)); GC_CRASH(); } @@ -114,18 +227,147 @@ gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer, struct gc_heap *heap, struct gc_trace_worker *worker) { struct gc_trace_worker_data data; - copy_space_allocator_init(&data.allocator); + + if (GC_GENERATIONAL) { + copy_space_allocator_init(trace_worker_new_space_allocator(&data)); + copy_space_allocator_init(trace_worker_old_space_allocator(&data)); + gc_field_set_writer_init(trace_worker_field_logger(&data), + heap_remembered_set(heap)); + } else { + copy_space_allocator_init(trace_worker_mono_space_allocator(&data)); + } + f(tracer, heap, worker, &data); - copy_space_allocator_finish(&data.allocator, heap_copy_space(heap)); + + if (GC_GENERATIONAL) { + copy_space_allocator_finish(trace_worker_new_space_allocator(&data), + heap_new_space(heap)); + copy_space_allocator_finish(trace_worker_old_space_allocator(&data), + heap_old_space(heap)); + gc_field_set_writer_release_buffer(trace_worker_field_logger(&data)); + } else { + copy_space_allocator_finish(trace_worker_mono_space_allocator(&data), + heap_mono_space(heap)); + } } +static int new_space_contains_addr(struct gc_heap *heap, uintptr_t addr) { + return copy_space_contains_address_aligned(heap_new_space(heap), addr); +} + +static int new_space_contains(struct gc_heap *heap, struct gc_ref ref) { + return new_space_contains_addr(heap, gc_ref_value(ref)); +} + +static int old_space_contains(struct gc_heap *heap, struct gc_ref ref) { + return copy_space_contains(heap_old_space(heap), ref); +} + +static int remember_edge_to_survivor_object(struct gc_heap *heap, + struct gc_edge edge) { + GC_ASSERT(!new_space_contains_addr(heap, gc_edge_address(edge))); + GC_ASSERT(new_space_contains(heap, gc_edge_ref(edge))); + if (copy_space_contains_edge(heap_old_space(heap), edge)) + return copy_space_remember_edge(heap_old_space(heap), edge); + struct gc_ref large_object = + large_object_space_object_containing_edge(heap_large_object_space(heap), + edge); + if (!gc_ref_is_null(large_object)) + return large_object_space_remember_edge(heap_large_object_space(heap), + large_object, edge); + return 0; +} + +static inline int edge_is_from_survivor(struct gc_heap *heap, + struct gc_edge edge) { + // Currently only the copy-space has survivors. (A survivor is a live object + // which stays in the nursery after collection). If lospace gains a survivor + // stage, we would need to augment this check. + GC_ASSERT(is_minor_collection(heap)); + return copy_space_contains_edge_aligned(heap_new_space(heap), edge); +} + +static inline int do_minor_trace(struct gc_heap *heap, struct gc_edge edge, + struct gc_ref ref, + struct gc_trace_worker_data *data) { + // Trace EDGE for a minor GC. We only need to trace edges to young objects. 
+ // Young objects are either in the nursery copy space, or in the large object + // space. + + if (GC_LIKELY(new_space_contains(heap, ref))) { + struct copy_space *new_space = heap_new_space(heap); + struct copy_space *old_space = heap_old_space(heap); + // We are visiting an edge into newspace. Either the edge's target will be + // promoted to oldspace, or it will stay in newspace as a survivor. + // + // After the scavenge, we need to preserve the invariant that all old-to-new + // edges are part of the remembered set. So depending on where the edge + // comes from and where the object moves to, we may need to add or remove + // the edge from the remembered set. Concretely: + // + // | survivor dst | promoted dst + // ----------------+------------------+----------------- + // survivor src | nothing | nothing + // | | + // promoted src | log edge | nothing + // | | + // oldspace src | nothing | clear log + // | | + // root src | nothing | nothing + // + // However, clearing a logged field usually isn't possible, as it's not easy + // to go from field address to position in a field set, so instead we lazily + // remove old->old edges from the field set during the next minor GC. (Or, + // we will anyway; for now we ignore them.) So really we only need to log + // promoted-to-survivor edges. + // + // However however, it is hard to distinguish between edges from promoted + // objects and edges from old objects, so we mostly just rely on an + // idempotent "log if unlogged" operation instead. + int promote = copy_space_should_promote(new_space, ref); + struct copy_space *dst_space = promote ? old_space : new_space; + struct copy_space_allocator *alloc = promote + ? trace_worker_old_space_allocator(data) + : trace_worker_new_space_allocator(data); + // Update the remembered set for promoted-to-survivor edges. + if (!promote && !edge_is_from_survivor(heap, edge) + && remember_edge_to_survivor_object(heap, edge)) + gc_field_set_writer_add_edge(trace_worker_field_logger(data), edge); + return copy_space_forward(new_space, dst_space, edge, ref, alloc); + } else { + // Note that although the target of the edge might not be in lospace, this + // will do what we want and return 1 if and only if ref is was a young + // object in lospace. + return large_object_space_copy(heap_large_object_space(heap), ref); + } +} + + static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, struct gc_trace_worker_data *data) { - if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) - return copy_space_forward(heap_copy_space(heap), heap_copy_space(heap), - edge, ref, &data->allocator); - else if (large_object_space_contains(heap_large_object_space(heap), ref)) + if (GC_GENERATIONAL) { + if (GC_LIKELY(is_minor_collection(heap))) + return do_minor_trace(heap, edge, ref, data); + + // Major trace: promote all copyspace objects to oldgen. 
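To make concrete the one cell of the table above that needs action during the trace itself (the "clear log" case is handled lazily, as the comment explains), here is a hedged walk-through with invented object names:

/* Nursery object P points at nursery object N through P->f.  Before this
 * minor GC nothing about P->f is in the remembered set: P was young, so the
 * mutator's write barrier never logged its fields.  During the GC, P's block
 * turns out to be a survivor block, so P is promoted to the old space, while
 * N sits on a fresh block and stays in the nursery.  The field P->f has just
 * become an old->new edge that the next minor GC must see.  Because telling
 * a promoted source apart from a genuinely old source is hard, the code
 * simply applies the idempotent remember operation: for this freshly
 * promoted edge the logged bit was clear, so it returns 1 and the edge is
 * appended to the trace worker's field logger; for an edge that was already
 * logged it returns 0 and nothing further happens. */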
+ struct copy_space *new_space = heap_new_space(heap); + struct copy_space *old_space = heap_old_space(heap); + if (new_space_contains(heap, ref)) + return copy_space_forward(new_space, old_space, edge, ref, + trace_worker_old_space_allocator(data)); + if (old_space_contains(heap, ref)) + return copy_space_forward(old_space, old_space, edge, ref, + trace_worker_old_space_allocator(data)); + } else { + if (GC_LIKELY(copy_space_contains(heap_mono_space(heap), ref))) + return copy_space_forward(heap_mono_space(heap), heap_mono_space(heap), + edge, ref, + trace_worker_mono_space_allocator(data)); + } + + // Fall through for objects in large or extern spaces. + if (large_object_space_contains(heap_large_object_space(heap), ref)) return large_object_space_mark_object(heap_large_object_space(heap), ref); else return gc_extern_space_visit(heap_extern_space(heap), edge, ref); @@ -153,8 +395,18 @@ int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { if (gc_ref_is_immediate(ref)) return 1; GC_ASSERT(gc_ref_is_heap_object(ref)); - if (GC_LIKELY(copy_space_contains(heap_copy_space(heap), ref))) - return copy_space_forward_if_traced(heap_copy_space(heap), edge, ref); + + if (GC_GENERATIONAL) { + if (new_space_contains(heap, ref)) + return copy_space_forward_if_traced(heap_new_space(heap), edge, ref); + if (old_space_contains(heap, ref)) + return is_minor_collection(heap) || + copy_space_forward_if_traced(heap_old_space(heap), edge, ref); + } else { + if (copy_space_contains(heap_mono_space(heap), ref)) + return copy_space_forward_if_traced(heap_mono_space(heap), edge, ref); + } + if (large_object_space_contains(heap_large_object_space(heap), ref)) return large_object_space_is_copied(heap_large_object_space(heap), ref); GC_CRASH(); @@ -177,11 +429,21 @@ static inline int all_mutators_stopped(struct gc_heap *heap) { heap->paused_mutator_count + heap->inactive_mutator_count; } +// with heap lock +static void maybe_increase_max_active_mutator_count(struct gc_heap *heap) { + size_t active_mutators = heap->mutator_count - heap->inactive_mutator_count; + if (active_mutators > heap->max_active_mutator_count) + heap->max_active_mutator_count = active_mutators; +} + static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { mut->heap = heap; mut->event_listener_data = heap->event_listener.mutator_added(heap->event_listener_data); copy_space_allocator_init(&mut->allocator); + if (GC_GENERATIONAL) + gc_field_set_writer_init(mutator_field_logger(mut), + heap_remembered_set(heap)); heap_lock(heap); // We have no roots. If there is a GC currently in progress, we have // nothing to add. Just wait until it's done. 
@@ -195,11 +457,14 @@ static void add_mutator(struct gc_heap *heap, struct gc_mutator *mut) { } heap->mutators = mut; heap->mutator_count++; + maybe_increase_max_active_mutator_count(heap); heap_unlock(heap); } static void remove_mutator(struct gc_heap *heap, struct gc_mutator *mut) { copy_space_allocator_finish(&mut->allocator, heap_allocation_space(heap)); + if (GC_GENERATIONAL) + gc_field_set_writer_release_buffer(mutator_field_logger(mut)); MUTATOR_EVENT(mut, mutator_removed); mut->heap = NULL; heap_lock(heap); @@ -241,9 +506,20 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, struct gc_trace_worker *worker) { #ifdef DEBUG - if (copy_space_contains(heap_copy_space(heap), ref)) - GC_ASSERT(copy_space_object_region(ref) == heap_copy_space(heap)->active_region); + if (GC_GENERATIONAL) { + if (new_space_contains(heap, ref)) + GC_ASSERT_EQ(copy_space_object_region(ref), + heap_new_space(heap)->active_region); + else if (old_space_contains(heap, ref)) + GC_ASSERT_EQ(copy_space_object_region(ref), + heap_old_space(heap)->active_region); + } else { + if (copy_space_contains(heap_mono_space(heap), ref)) + GC_ASSERT_EQ(copy_space_object_region(ref), + heap_mono_space(heap)->active_region); + } #endif + gc_trace_object(ref, tracer_visit, heap, worker, NULL); } @@ -263,6 +539,10 @@ static inline void trace_root(struct gc_root root, struct gc_heap *heap, case GC_ROOT_KIND_EDGE: tracer_visit(root.edge, heap, worker); break; + case GC_ROOT_KIND_EDGE_BUFFER: + gc_field_set_visit_edge_buffer(heap_remembered_set(heap), root.edge_buffer, + tracer_visit, heap, worker); + break; default: GC_CRASH(); } @@ -297,12 +577,6 @@ static void wait_for_mutators_to_stop(struct gc_heap *heap) { pthread_cond_wait(&heap->collector_cond, &heap->lock); } -static int is_minor_collection(struct gc_heap *heap) { - if (GC_GENERATIONAL) - GC_CRASH(); - return 0; -} - static enum gc_collection_kind pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) GC_NEVER_INLINE; @@ -313,13 +587,17 @@ pause_mutator_for_collection(struct gc_heap *heap, struct gc_mutator *mut) { MUTATOR_EVENT(mut, mutator_stopping); MUTATOR_EVENT(mut, mutator_stopped); heap->paused_mutator_count++; - enum gc_collection_kind collection_kind = - is_minor_collection(heap) ? GC_COLLECTION_MINOR : GC_COLLECTION_COMPACTING; if (all_mutators_stopped(heap)) pthread_cond_signal(&heap->collector_cond); + enum gc_collection_kind collection_kind = GC_COLLECTION_MINOR; do { pthread_cond_wait(&heap->mutator_cond, &heap->lock); + // is_minor_collection is reset before requesting mutators to stop, so this + // will pick up either whether the last collection was minor, or whether the + // next one will be minor. 
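/* (Illustrative note, not part of the patch: the caller,
 *  trigger_collection(), compares the kind returned here against the kind it
 *  was asked for -- for example, a mutator that requested
 *  GC_COLLECTION_COMPACTING but was paused while only a minor collection ran
 *  sees prev_kind < requested_kind and goes on to trigger the stronger
 *  collection itself.  That comparison assumes gc_collection_kind orders
 *  weaker kinds before stronger ones.) */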
+ if (!GC_GENERATIONAL || !is_minor_collection(heap)) + collection_kind = GC_COLLECTION_COMPACTING; } while (mutators_are_stopping(heap)); heap->paused_mutator_count--; @@ -342,16 +620,83 @@ static void resize_heap(struct gc_heap *heap, size_t new_size) { HEAP_EVENT(heap, heap_resized, new_size); } +static size_t heap_nursery_size(struct gc_heap *heap) { +#if GC_GENERATIONAL + return heap->nursery_size; +#else + GC_CRASH(); +#endif +} + +static void heap_set_nursery_size(struct gc_heap *heap, size_t size) { +#if GC_GENERATIONAL + GC_ASSERT(size); + heap->nursery_size = size; +#else + GC_CRASH(); +#endif +} + +static size_t heap_nursery_size_for_mutator_count(struct gc_heap *heap, + size_t count) { +#if GC_GENERATIONAL + return heap->per_processor_nursery_size * count; +#else + GC_CRASH(); +#endif +} + +static void resize_nursery(struct gc_heap *heap, size_t size) { + size_t prev_size = heap_nursery_size(heap); + if (size < prev_size) + copy_space_shrink(heap_new_space(heap), prev_size - size); + else + copy_space_reacquire_memory(heap_new_space(heap), size - prev_size); + heap_set_nursery_size(heap, size); +} + +static void resize_nursery_for_active_mutator_count(struct gc_heap *heap, + size_t count) { + if (count > heap->processor_count) + count = heap->processor_count; + size_t prev_size = heap_nursery_size(heap); + size_t size = heap_nursery_size_for_mutator_count(heap, count); + // If there were more mutator processors this cycle than in the previous, + // increase the nursery size. Otherwise shrink, but with an exponential decay + // factor. + if (size < prev_size) + size = (prev_size + size) / 2; + resize_nursery(heap, size); +} + +static void resize_for_active_mutator_count(struct gc_heap *heap) { + size_t mutators = heap->max_active_mutator_count; + GC_ASSERT(mutators); + heap->max_active_mutator_count = 1; + maybe_increase_max_active_mutator_count(heap); + + if (GC_GENERATIONAL) + resize_nursery_for_active_mutator_count(heap, mutators); +} + static void visit_root_edge(struct gc_edge edge, struct gc_heap *heap, void *unused) { gc_tracer_add_root(&heap->tracer, gc_root_edge(edge)); } -static void add_roots(struct gc_heap *heap) { +static void add_roots(struct gc_heap *heap, int is_minor_gc) { for (struct gc_mutator *mut = heap->mutators; mut; mut = mut->next) gc_tracer_add_root(&heap->tracer, gc_root_mutator(mut)); gc_tracer_add_root(&heap->tracer, gc_root_heap(heap)); gc_visit_finalizer_roots(heap->finalizer_state, visit_root_edge, heap, NULL); + if (is_minor_gc) + gc_field_set_add_roots(heap_remembered_set(heap), &heap->tracer); +} + +static void +clear_remembered_set(struct gc_heap *heap) { + gc_field_set_clear(heap_remembered_set(heap), NULL, NULL); + large_object_space_clear_remembered_edges(heap_large_object_space(heap)); } static void resolve_ephemerons_lazily(struct gc_heap *heap) { @@ -394,7 +739,17 @@ static void sweep_ephemerons(struct gc_heap *heap) { static int heap_can_minor_gc(struct gc_heap *heap) { if (!GC_GENERATIONAL) return 0; - GC_CRASH(); + // Invariant: the oldgen always has enough free space to accomodate promoted + // objects from the nursery. This is a precondition for minor GC of course, + // but it is also a post-condition: after potentially promoting all nursery + // objects, we still need an additional nursery's worth of space in oldgen to + // satisfy the invariant. 
We ensure the invariant by only doing minor GC if + // the copy space can allocate as many bytes as the nursery, which is already + // twice the allocatable size because of the copy reserve. + struct copy_space *new_space = heap_new_space(heap); + struct copy_space *old_space = heap_old_space(heap); + size_t nursery_size = heap_nursery_size(heap); + return copy_space_can_allocate(old_space, nursery_size); } static enum gc_collection_kind @@ -405,13 +760,43 @@ determine_collection_kind(struct gc_heap *heap, return GC_COLLECTION_COMPACTING; } +static void +copy_spaces_start_gc(struct gc_heap *heap, int is_minor_gc) { + if (GC_GENERATIONAL) { + copy_space_flip(heap_new_space(heap)); + if (!is_minor_gc) + copy_space_flip(heap_old_space(heap)); + } else { + copy_space_flip(heap_mono_space(heap)); + } +} + +static void +copy_spaces_finish_gc(struct gc_heap *heap, int is_minor_gc) { + if (GC_GENERATIONAL) { + copy_space_finish_gc(heap_new_space(heap)); + if (!is_minor_gc) + copy_space_finish_gc(heap_old_space(heap)); + } else { + copy_space_finish_gc(heap_mono_space(heap)); + } +} + +static size_t +copy_spaces_allocated_bytes(struct gc_heap *heap) +{ + return GC_GENERATIONAL + ? (heap_new_space(heap)->allocated_bytes_at_last_gc + + heap_old_space(heap)->allocated_bytes_at_last_gc) + : heap_mono_space(heap)->allocated_bytes_at_last_gc; +} + static enum gc_collection_kind collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) GC_NEVER_INLINE; static enum gc_collection_kind collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); - struct copy_space *copy_space = heap_copy_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); struct gc_extern_space *exspace = heap_extern_space(heap); uint64_t start_ns = gc_platform_monotonic_nanoseconds(); @@ -422,18 +807,24 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { HEAP_EVENT(heap, waiting_for_stop); wait_for_mutators_to_stop(heap); HEAP_EVENT(heap, mutators_stopped); - HEAP_EVENT(heap, prepare_gc, GC_COLLECTION_COMPACTING); enum gc_collection_kind gc_kind = determine_collection_kind(heap, requested_kind); + int is_minor_gc = +#if GC_GENERATIONAL + heap->is_minor_collection = +#endif + GC_GENERATIONAL ? 
gc_kind == GC_COLLECTION_MINOR : 0; + HEAP_EVENT(heap, prepare_gc, gc_kind); uint64_t *counter_loc = &heap->total_allocated_bytes_at_last_gc; - copy_space_add_to_allocation_counter(copy_space, counter_loc); + copy_space_add_to_allocation_counter(heap_allocation_space(heap), + counter_loc); large_object_space_add_to_allocation_counter(lospace, counter_loc); - copy_space_flip(copy_space); - large_object_space_start_gc(lospace, 0); - gc_extern_space_start_gc(exspace, 0); + copy_spaces_start_gc(heap, is_minor_gc); + large_object_space_start_gc(lospace, is_minor_gc); + gc_extern_space_start_gc(exspace, is_minor_gc); resolve_ephemerons_lazily(heap); gc_tracer_prepare(&heap->tracer); - add_roots(heap); + add_roots(heap, is_minor_gc); HEAP_EVENT(heap, roots_traced); gc_tracer_trace(&heap->tracer); HEAP_EVENT(heap, heap_traced); @@ -444,22 +835,27 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { HEAP_EVENT(heap, finalizers_traced); sweep_ephemerons(heap); gc_tracer_release(&heap->tracer); - copy_space_finish_gc(copy_space); - large_object_space_finish_gc(lospace, 0); - gc_extern_space_finish_gc(exspace, 0); + copy_spaces_finish_gc(heap, is_minor_gc); + large_object_space_finish_gc(lospace, is_minor_gc); + gc_extern_space_finish_gc(exspace, is_minor_gc); + if (GC_GENERATIONAL && !is_minor_gc) + clear_remembered_set(heap); heap->count++; + resize_for_active_mutator_count(heap); heap_reset_large_object_pages(heap, lospace->live_pages_at_last_collection); - size_t live_size = (copy_space->allocated_bytes_at_last_gc + + size_t live_size = (copy_spaces_allocated_bytes(heap) + large_object_space_size_at_last_collection(lospace)); uint64_t pause_ns = gc_platform_monotonic_nanoseconds() - start_ns; HEAP_EVENT(heap, live_data_size, live_size); gc_heap_sizer_on_gc(heap->sizer, heap->size, live_size, pause_ns, resize_heap); - if (!copy_space_page_out_blocks_until_memory_released(copy_space) - && heap->sizer.policy == GC_HEAP_SIZE_FIXED) { - fprintf(stderr, "ran out of space, heap size %zu (%zu slabs)\n", - heap->size, copy_space->nslabs); - GC_CRASH(); + { + struct copy_space *space = heap_resizable_space(heap); + if (!copy_space_page_out_blocks_until_memory_released(space) + && heap->sizer.policy == GC_HEAP_SIZE_FIXED) { + fprintf(stderr, "ran out of space, heap size %zu\n", heap->size); + GC_CRASH(); + } } HEAP_EVENT(heap, restarting_mutators); allow_mutators_to_continue(heap); @@ -470,6 +866,8 @@ static void trigger_collection(struct gc_mutator *mut, enum gc_collection_kind requested_kind) { struct gc_heap *heap = mutator_heap(mut); copy_space_allocator_finish(&mut->allocator, heap_allocation_space(heap)); + if (GC_GENERATIONAL) + gc_field_set_writer_release_buffer(mutator_field_logger(mut)); heap_lock(heap); int prev_kind = -1; while (mutators_are_stopping(heap)) @@ -517,11 +915,12 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { if (size > gc_allocator_large_threshold()) return allocate_large(mut, size); - struct gc_ref ret = copy_space_allocate(&mut->allocator, - heap_copy_space(mutator_heap(mut)), - size, - get_more_empty_blocks_for_mutator, - mut); + struct gc_ref ret = + copy_space_allocate(&mut->allocator, + heap_allocation_space(mutator_heap(mut)), + size, + get_more_empty_blocks_for_mutator, + mut); gc_clear_fresh_allocation(ret, size); return gc_ref_heap_object(ret); } @@ -536,12 +935,36 @@ void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { int gc_object_is_old_generation_slow(struct gc_mutator *mut, struct gc_ref obj) { + if 
(!GC_GENERATIONAL) + return 0; + + struct gc_heap *heap = mutator_heap(mut); + + if (copy_space_contains(heap_new_space(heap), obj)) + return 0; + if (copy_space_contains(heap_old_space(heap), obj)) + return 1; + + struct large_object_space *lospace = heap_large_object_space(heap); + if (large_object_space_contains(lospace, obj)) + return large_object_space_is_survivor(lospace, obj); + return 0; } void gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, size_t obj_size, struct gc_edge edge, struct gc_ref new_val) { + GC_ASSERT(!gc_ref_is_null(new_val)); + if (!GC_GENERATIONAL) return; + if (gc_object_is_old_generation_slow(mut, new_val)) + return; + struct gc_heap *heap = mutator_heap(mut); + if ((obj_size <= gc_allocator_large_threshold()) + ? copy_space_remember_edge(heap_old_space(heap), edge) + : large_object_space_remember_edge(heap_large_object_space(heap), + obj, edge)) + gc_field_set_writer_add_edge(mutator_field_logger(mut), edge); } int* gc_safepoint_flag_loc(struct gc_mutator *mut) { @@ -551,6 +974,8 @@ int* gc_safepoint_flag_loc(struct gc_mutator *mut) { void gc_safepoint_slow(struct gc_mutator *mut) { struct gc_heap *heap = mutator_heap(mut); copy_space_allocator_finish(&mut->allocator, heap_allocation_space(heap)); + if (GC_GENERATIONAL) + gc_field_set_writer_release_buffer(mutator_field_logger(mut)); heap_lock(heap); while (mutators_are_stopping(mutator_heap(mut))) pause_mutator_for_collection(heap, mut); @@ -636,7 +1061,7 @@ int gc_options_parse_and_set(struct gc_options *options, int option, static uint64_t allocation_counter_from_thread(struct gc_heap *heap) { uint64_t ret = heap->total_allocated_bytes_at_last_gc; if (pthread_mutex_trylock(&heap->lock)) return ret; - copy_space_add_to_allocation_counter(heap_copy_space(heap), &ret); + copy_space_add_to_allocation_counter(heap_allocation_space(heap), &ret); large_object_space_add_to_allocation_counter(heap_large_object_space(heap), &ret); pthread_mutex_unlock(&heap->lock); @@ -652,10 +1077,20 @@ static void set_heap_size_from_thread(struct gc_heap *heap, size_t size) { static int heap_init(struct gc_heap *heap, const struct gc_options *options) { // *heap is already initialized to 0. + if (GC_GENERATIONAL) + gc_field_set_init(heap_remembered_set(heap)); pthread_mutex_init(&heap->lock, NULL); pthread_cond_init(&heap->mutator_cond, NULL); pthread_cond_init(&heap->collector_cond, NULL); heap->size = options->common.heap_size; + heap->processor_count = gc_platform_processor_count(); + // max_active_mutator_count never falls below 1 after this point. + heap->max_active_mutator_count = 1; + +#if GC_GENERATIONAL + // We should add an option to set this, but for now, 2 MB per processor. 
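  // For scale: resize_nursery_for_active_mutator_count() clamps the number of
  // mutators active in the last cycle to the processor count and multiplies it
  // by this per-processor figure, so a cycle with 4 active mutators on a
  // many-core machine ends up with a 4 * 2 MB = 8 MB nursery.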
+ heap->per_processor_nursery_size = 2 * 1024 * 1024; +#endif if (!gc_tracer_init(&heap->tracer, heap, options->common.parallelism)) GC_CRASH(); @@ -690,6 +1125,12 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, offsetof(struct copy_space_allocator, hp)); GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), offsetof(struct copy_space_allocator, limit)); + if (GC_GENERATIONAL) { + GC_ASSERT_EQ(gc_write_barrier_field_table_alignment(), + COPY_SPACE_SLAB_SIZE); + GC_ASSERT_EQ(gc_write_barrier_field_table_offset(), + offsetof(struct copy_space_slab, blocks)); + } *heap = calloc(1, sizeof(struct gc_heap)); if (!*heap) GC_CRASH(); @@ -701,16 +1142,38 @@ int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, (*heap)->event_listener_data = event_listener_data; HEAP_EVENT(*heap, init, (*heap)->size); - struct copy_space *copy_space = heap_copy_space(*heap); { uint32_t flags = 0; if (options->common.parallelism > 1) flags |= COPY_SPACE_ATOMIC_FORWARDING; - if (!copy_space_init(copy_space, (*heap)->size, flags, - (*heap)->background_thread)) { - free(*heap); - *heap = NULL; - return 0; + if (GC_GENERATIONAL) { + size_t nursery_size = + heap_nursery_size_for_mutator_count(*heap, (*heap)->processor_count); + heap_set_nursery_size(*heap, nursery_size); + if (!copy_space_init(heap_new_space(*heap), nursery_size, + flags | COPY_SPACE_ALIGNED, + (*heap)->background_thread)) { + free(*heap); + *heap = NULL; + return 0; + } + // Initially dimension the nursery for one mutator. + resize_nursery(*heap, heap_nursery_size_for_mutator_count(*heap, 1)); + + if (!copy_space_init(heap_old_space(*heap), (*heap)->size, + flags | COPY_SPACE_HAS_FIELD_LOGGING_BITS, + (*heap)->background_thread)) { + free(*heap); + *heap = NULL; + return 0; + } + } else { + if (!copy_space_init(heap_mono_space(*heap), (*heap)->size, flags, + (*heap)->background_thread)) { + free(*heap); + *heap = NULL; + return 0; + } } } @@ -744,6 +1207,8 @@ void gc_finish_for_thread(struct gc_mutator *mut) { static void deactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { GC_ASSERT(mut->next == NULL); copy_space_allocator_finish(&mut->allocator, heap_allocation_space(heap)); + if (GC_GENERATIONAL) + gc_field_set_writer_release_buffer(mutator_field_logger(mut)); heap_lock(heap); heap->inactive_mutator_count++; if (all_mutators_stopped(heap)) @@ -756,6 +1221,7 @@ static void reactivate_mutator(struct gc_heap *heap, struct gc_mutator *mut) { while (mutators_are_stopping(heap)) pthread_cond_wait(&heap->mutator_cond, &heap->lock); heap->inactive_mutator_count--; + maybe_increase_max_active_mutator_count(heap); heap_unlock(heap); } From c95b7ef046e1d101943a86ec743b6b569e00fb18 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 15:54:32 +0100 Subject: [PATCH 351/403] New collector configuration: generational-pcc --- Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 3f7278bdc..8de7346b9 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,9 @@ TESTS = quads mt-gcbench ephemerons finalizers COLLECTORS = \ bdw \ semi \ + \ pcc \ + generational-pcc \ \ mmc \ stack-conservative-mmc \ @@ -64,9 +66,13 @@ GC_STEM_semi = semi GC_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 GC_LIBS_semi = -lm -GC_STEM_pcc = pcc -GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 -GC_LIBS_pcc = -lm +GC_STEM_pcc = pcc +GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 +GC_LIBS_pcc = -lm + +GC_STEM_generational_pcc = $(GC_STEM_pcc) 
+GC_CFLAGS_generational_pcc = $(GC_CFLAGS_pcc) -DGC_GENERATIONAL=1 +GC_LIBS_generational_pcc = $(GC_LIBS_pcc) define mmc_variant GC_STEM_$(1) = mmc From 4ab72e92b0ac2fb2782c9eafd812aa81a308ba65 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 16:33:38 +0100 Subject: [PATCH 352/403] gpcc: Don't mix survivors and new objects in same block --- src/copy-space.h | 35 ++++++++++++++++++++++------------- src/pcc.c | 7 ++++--- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index bd2df7334..0fbf4b111 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -529,9 +529,30 @@ copy_space_flip(struct copy_space *space) { space->in_gc = 1; } +static inline void +copy_space_allocator_init(struct copy_space_allocator *alloc) { + memset(alloc, 0, sizeof(*alloc)); +} + +static inline void +copy_space_allocator_finish(struct copy_space_allocator *alloc, + struct copy_space *space) { + if (alloc->block) + copy_space_allocator_release_partly_full_block(alloc, space); +} + static void -copy_space_finish_gc(struct copy_space *space) { +copy_space_finish_gc(struct copy_space *space, int is_minor_gc) { // Mutators stopped, can access nonatomically. + if (is_minor_gc) { + // Avoid mixing survivors and new objects on the same blocks. + struct copy_space_allocator alloc; + copy_space_allocator_init(&alloc); + while (copy_space_allocator_acquire_partly_full_block(&alloc, space)) + copy_space_allocator_release_full_block(&alloc, space); + copy_space_allocator_finish(&alloc, space); + } + space->allocated_bytes_at_last_gc = space->allocated_bytes; space->fragmentation_at_last_gc = space->fragmentation; space->in_gc = 0; @@ -778,18 +799,6 @@ copy_space_forget_edge(struct copy_space *space, struct gc_edge edge) { return 1; } -static inline void -copy_space_allocator_init(struct copy_space_allocator *alloc) { - memset(alloc, 0, sizeof(*alloc)); -} - -static inline void -copy_space_allocator_finish(struct copy_space_allocator *alloc, - struct copy_space *space) { - if (alloc->block) - copy_space_allocator_release_partly_full_block(alloc, space); -} - static size_t copy_space_is_power_of_two(size_t n) { GC_ASSERT(n != 0); return (n & (n - 1)) == 0; diff --git a/src/pcc.c b/src/pcc.c index 7ee017ae2..ff10375ef 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -774,11 +774,12 @@ copy_spaces_start_gc(struct gc_heap *heap, int is_minor_gc) { static void copy_spaces_finish_gc(struct gc_heap *heap, int is_minor_gc) { if (GC_GENERATIONAL) { - copy_space_finish_gc(heap_new_space(heap)); + copy_space_finish_gc(heap_new_space(heap), is_minor_gc); if (!is_minor_gc) - copy_space_finish_gc(heap_old_space(heap)); + copy_space_finish_gc(heap_old_space(heap), 0); } else { - copy_space_finish_gc(heap_mono_space(heap)); + GC_ASSERT(!is_minor_gc); + copy_space_finish_gc(heap_mono_space(heap), 0); } } From 27f9a1f01e8c947c76923c58f32eeaa2260e1766 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 10 Jan 2025 16:39:39 +0100 Subject: [PATCH 353/403] gpcc: Temporarily always promote survivors Generational PCC is still a bit buggy. --- src/pcc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pcc.c b/src/pcc.c index ff10375ef..4b4a99700 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -324,7 +324,7 @@ static inline int do_minor_trace(struct gc_heap *heap, struct gc_edge edge, // However however, it is hard to distinguish between edges from promoted // objects and edges from old objects, so we mostly just rely on an // idempotent "log if unlogged" operation instead. 
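  // Concretely, the "log if unlogged" operation is the same pattern the write
  // barrier uses: test-and-set the field-logging bit, and only the caller that
  // newly set it enqueues the edge, e.g.
  //
  //   if (copy_space_remember_edge(old_space, edge))
  //     gc_field_set_writer_add_edge(logger, edge);
  //
  // (variable names are shorthand), the intent being that each edge lands in
  // the remembered set at most once even when several tracers race to log it.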
- int promote = copy_space_should_promote(new_space, ref); + int promote = copy_space_should_promote(new_space, ref) || 1; struct copy_space *dst_space = promote ? old_space : new_space; struct copy_space_allocator *alloc = promote ? trace_worker_old_space_allocator(data) From b37a7f3862f6bc04c860ae18b4a0fdc653915ce1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 13 Jan 2025 09:12:34 +0100 Subject: [PATCH 354/403] copy-space: Fix bug computing field logging byte location Also re-enable survivors in generational-pcc :) --- src/copy-space.h | 2 +- src/pcc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index 0fbf4b111..e95ec0322 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -346,7 +346,7 @@ copy_space_field_logged_byte(struct gc_edge edge) { base += offsetof(struct copy_space_slab, blocks); uintptr_t field = (addr & (COPY_SPACE_SLAB_SIZE - 1)) / sizeof(uintptr_t); uintptr_t byte = field / 8; - return (uint8_t*) (base + field); + return (uint8_t*) (base + byte); } static uint8_t diff --git a/src/pcc.c b/src/pcc.c index 4b4a99700..ff10375ef 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -324,7 +324,7 @@ static inline int do_minor_trace(struct gc_heap *heap, struct gc_edge edge, // However however, it is hard to distinguish between edges from promoted // objects and edges from old objects, so we mostly just rely on an // idempotent "log if unlogged" operation instead. - int promote = copy_space_should_promote(new_space, ref) || 1; + int promote = copy_space_should_promote(new_space, ref); struct copy_space *dst_space = promote ? old_space : new_space; struct copy_space_allocator *alloc = promote ? trace_worker_old_space_allocator(data) From e41000094dcc1b6e540c24c2b6841d9ffa902d39 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 13 Jan 2025 10:23:47 +0100 Subject: [PATCH 355/403] Add missing write barriers to finalizers.c --- benchmarks/finalizers.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/benchmarks/finalizers.c b/benchmarks/finalizers.c index 434283a53..537307118 100644 --- a/benchmarks/finalizers.c +++ b/benchmarks/finalizers.c @@ -93,6 +93,24 @@ static void cause_gc(struct gc_mutator *mut) { gc_collect(mut, GC_COLLECTION_MAJOR); } +static inline void set_car(struct gc_mutator *mut, Pair *obj, void *val) { + void **field = &obj->car; + if (val) + gc_write_barrier(mut, gc_ref_from_heap_object(obj), sizeof(Pair), + gc_edge(field), + gc_ref_from_heap_object(val)); + *field = val; +} + +static inline void set_cdr(struct gc_mutator *mut, Pair *obj, void *val) { + void **field = &obj->cdr; + if (val) + gc_write_barrier(mut, gc_ref_from_heap_object(obj), sizeof(Pair), + gc_edge(field), + gc_ref_from_heap_object(val)); + field = val; +} + static Pair* make_finalizer_chain(struct thread *t, size_t length) { PairHandle head = { NULL }; PairHandle tail = { NULL }; @@ -102,8 +120,8 @@ static Pair* make_finalizer_chain(struct thread *t, size_t length) { for (size_t i = 0; i < length; i++) { HANDLE_SET(tail, HANDLE_REF(head)); HANDLE_SET(head, allocate_pair(t->mut)); - HANDLE_REF(head)->car = allocate_small_object(t->mut); - HANDLE_REF(head)->cdr = HANDLE_REF(tail); + set_car(t->mut, HANDLE_REF(head), allocate_small_object(t->mut)); + set_cdr(t->mut, HANDLE_REF(head), HANDLE_REF(tail)); struct gc_finalizer *finalizer = allocate_finalizer(t->mut); gc_finalizer_attach(t->mut, finalizer, 0, gc_ref_from_heap_object(HANDLE_REF(head)), From b23b77218c5c05a8cfc6536a8cd4b3bf84aaccfb Mon Sep 
17 00:00:00 2001 From: Andy Wingo Date: Mon, 13 Jan 2025 16:28:40 +0100 Subject: [PATCH 356/403] nofl space: Fix a bug for parallel optimistic evacuation If two tracer threads visit edges to the same object on an evacuation candidate block, and they first see that the object is unmarked, then they both try to evacuate it at the same time. Thread A might try and fail before thread B manages to acquire the forwarding word. B needs to see that A marked it in place, and to do that it needs to re-load the mark byte after acquiring the forwarding word. Otherwise perhaps B could succeed and you would end up with two copies of an object, one of them garbled! --- src/nofl-space.h | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 9e4edf912..d29b3a9bd 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1411,7 +1411,8 @@ nofl_space_should_evacuate(struct nofl_space *space, uint8_t metadata_byte, } static inline int -nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { +nofl_space_set_mark_relaxed(struct nofl_space *space, uint8_t *metadata, + uint8_t byte) { uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; atomic_store_explicit(metadata, @@ -1420,10 +1421,21 @@ nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { return 1; } +static inline int +nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { + uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 + | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + atomic_store_explicit(metadata, + (byte & ~mask) | space->marked_mask, + memory_order_release); + return 1; +} + static inline int nofl_space_set_nonempty_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte, struct gc_ref ref) { - nofl_space_set_mark(space, metadata, byte); + // FIXME: Check that relaxed atomics are actually worth it. + nofl_space_set_mark_relaxed(space, metadata, byte); nofl_block_set_mark(gc_ref_value(ref)); return 1; } @@ -1490,13 +1502,24 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, // Impossible. GC_CRASH(); case GC_FORWARDING_STATE_ACQUIRED: { - // We claimed the object successfully; evacuating is up to us. + // We claimed the object successfully. + + // First check again if someone else tried to evacuate this object and ended + // up marking in place instead. + byte = atomic_load_explicit(metadata, memory_order_acquire); + if (byte & space->marked_mask) { + // Indeed, already marked in place. + gc_atomic_forward_abort(&fwd); + return 0; + } + + // Otherwise, we try to evacuate. size_t object_granules = nofl_space_live_object_granules(metadata); struct gc_ref new_ref = nofl_evacuation_allocate(evacuate, space, object_granules); if (!gc_ref_is_null(new_ref)) { - // Copy object contents before committing, as we don't know what - // part of the object (if any) will be overwritten by the + // Whee, it works! Copy object contents before committing, as we don't + // know what part of the object (if any) will be overwritten by the // commit. 
memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), object_granules * NOFL_GRANULE_SIZE); @@ -1512,11 +1535,12 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, return nofl_space_set_nonempty_mark(space, new_metadata, byte, new_ref); } else { - // Well shucks; allocation failed, marking the end of - // opportunistic evacuation. No future evacuation of this - // object will succeed. Mark in place instead. + // Well shucks; allocation failed. Mark in place and then release the + // object. + nofl_space_set_mark(space, metadata, byte); + nofl_block_set_mark(gc_ref_value(old_ref)); gc_atomic_forward_abort(&fwd); - return nofl_space_set_nonempty_mark(space, metadata, byte, old_ref); + return 1; } break; } From 5fdb14cc5ea14c0dc031e9dc020bc3b3470bfa34 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 13 Jan 2025 16:44:12 +0100 Subject: [PATCH 357/403] Remove "ABORTED" atomic forwarding state It was not distinguishable from "NOT_FORWARDED". --- api/gc-forwarding.h | 3 +-- benchmarks/simple-gc-embedder.h | 9 +++++---- src/copy-space.h | 1 - src/nofl-space.h | 5 ++--- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/api/gc-forwarding.h b/api/gc-forwarding.h index b598e47a1..25aca3011 100644 --- a/api/gc-forwarding.h +++ b/api/gc-forwarding.h @@ -8,8 +8,7 @@ enum gc_forwarding_state { GC_FORWARDING_STATE_FORWARDED, GC_FORWARDING_STATE_BUSY, GC_FORWARDING_STATE_ACQUIRED, - GC_FORWARDING_STATE_NOT_FORWARDED, - GC_FORWARDING_STATE_ABORTED + GC_FORWARDING_STATE_NOT_FORWARDED }; struct gc_atomic_forward { diff --git a/benchmarks/simple-gc-embedder.h b/benchmarks/simple-gc-embedder.h index d8ad3f0ad..904d2c740 100644 --- a/benchmarks/simple-gc-embedder.h +++ b/benchmarks/simple-gc-embedder.h @@ -122,9 +122,10 @@ gc_atomic_forward_retry_busy(struct gc_atomic_forward *fwd) { memory_order_acquire); if (tag == gcobj_busy) return 0; - if (tag & gcobj_not_forwarded_bit) - fwd->state = GC_FORWARDING_STATE_ABORTED; - else { + if (tag & gcobj_not_forwarded_bit) { + fwd->state = GC_FORWARDING_STATE_NOT_FORWARDED; + fwd->data = tag; + } else { fwd->state = GC_FORWARDING_STATE_FORWARDED; fwd->data = tag; } @@ -149,7 +150,7 @@ static inline void gc_atomic_forward_abort(struct gc_atomic_forward *fwd) { GC_ASSERT(fwd->state == GC_FORWARDING_STATE_ACQUIRED); atomic_store_explicit(tag_word(fwd->ref), fwd->data, memory_order_release); - fwd->state = GC_FORWARDING_STATE_ABORTED; + fwd->state = GC_FORWARDING_STATE_NOT_FORWARDED; } static inline size_t diff --git a/src/copy-space.h b/src/copy-space.h index e95ec0322..b866d0ff6 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -599,7 +599,6 @@ copy_space_forward_atomic(struct copy_space *space, struct gc_edge edge, switch (fwd.state) { case GC_FORWARDING_STATE_NOT_FORWARDED: - case GC_FORWARDING_STATE_ABORTED: default: // Impossible. GC_CRASH(); diff --git a/src/nofl-space.h b/src/nofl-space.h index d29b3a9bd..05759a033 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1497,7 +1497,6 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, switch (fwd.state) { case GC_FORWARDING_STATE_NOT_FORWARDED: - case GC_FORWARDING_STATE_ABORTED: default: // Impossible. GC_CRASH(); @@ -1552,7 +1551,7 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, break; yield_for_spin(spin_count); } - if (fwd.state == GC_FORWARDING_STATE_ABORTED) + if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) // Remove evacuation aborted; remote will mark and enqueue. 
return 0; ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); @@ -1599,7 +1598,7 @@ nofl_space_forward_if_evacuated(struct nofl_space *space, break; yield_for_spin(spin_count); } - if (fwd.state == GC_FORWARDING_STATE_ABORTED) + if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) // Remote evacuation aborted; remote will mark and enqueue. return 1; ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); From ba65e32b00c1df50a739c2e745920ad5e58dedb7 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 13 Jan 2025 17:22:43 +0100 Subject: [PATCH 358/403] pcc / copy-space: Allow allocations to fail This fixes an issue in which minor collection of a nursery full of live data can fail because of fragmentation, whereas really it should just fall back to promotion. --- src/copy-space.h | 59 ++++++++++++++++++------------- src/pcc.c | 92 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 104 insertions(+), 47 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index b866d0ff6..d32de0298 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -149,6 +149,17 @@ struct copy_space { size_t nslabs; }; +enum copy_space_forward_result { + // We went to forward an edge, but the target was already forwarded, so we + // just updated the edge. + COPY_SPACE_FORWARD_UPDATED, + // We went to forward an edge and evacuated the referent to a new location. + COPY_SPACE_FORWARD_EVACUATED, + // We went to forward an edge but failed to acquire memory for its new + // location. + COPY_SPACE_FORWARD_FAILED, +}; + struct copy_space_allocator { uintptr_t hp; uintptr_t limit; @@ -473,9 +484,7 @@ copy_space_allocator_release_partly_full_block(struct copy_space_allocator *allo static inline struct gc_ref copy_space_allocate(struct copy_space_allocator *alloc, struct copy_space *space, - size_t size, - void (*get_more_empty_blocks)(void *data), - void *data) { + size_t size) { GC_ASSERT(size > 0); GC_ASSERT(size <= gc_allocator_large_threshold()); size = align_up(size, gc_allocator_small_granule_size()); @@ -490,8 +499,8 @@ copy_space_allocate(struct copy_space_allocator *alloc, goto done; copy_space_allocator_release_full_block(alloc, space); } - while (!copy_space_allocator_acquire_empty_block(alloc, space)) - get_more_empty_blocks(data); + if (!copy_space_allocator_acquire_empty_block(alloc, space)) + return gc_ref_null(); // The newly acquired block is empty and is therefore large enough for // a small allocation. @@ -588,12 +597,13 @@ copy_space_gc_during_evacuation(void *data) { GC_CRASH(); } -static inline int +static inline enum copy_space_forward_result copy_space_forward_atomic(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref, struct copy_space_allocator *alloc) { struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); +retry: if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED) gc_atomic_forward_acquire(&fwd); @@ -605,33 +615,34 @@ copy_space_forward_atomic(struct copy_space *space, struct gc_edge edge, case GC_FORWARDING_STATE_ACQUIRED: { // We claimed the object successfully; evacuating is up to us. size_t bytes = gc_atomic_forward_object_size(&fwd); - struct gc_ref new_ref = - copy_space_allocate(alloc, space, bytes, - copy_space_gc_during_evacuation, NULL); + struct gc_ref new_ref = copy_space_allocate(alloc, space, bytes); + if (gc_ref_is_null(new_ref)) { + gc_atomic_forward_abort(&fwd); + return COPY_SPACE_FORWARD_FAILED; + } // Copy object contents before committing, as we don't know what // part of the object (if any) will be overwritten by the // commit. 
memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), bytes); gc_atomic_forward_commit(&fwd, new_ref); gc_edge_update(edge, new_ref); - return 1; + return COPY_SPACE_FORWARD_EVACUATED; } case GC_FORWARDING_STATE_BUSY: // Someone else claimed this object first. Spin until new address // known, or evacuation aborts. for (size_t spin_count = 0;; spin_count++) { if (gc_atomic_forward_retry_busy(&fwd)) - break; + goto retry; yield_for_spin(spin_count); } - GC_ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); - // Fall through. + GC_CRASH(); // Unreachable. case GC_FORWARDING_STATE_FORWARDED: // The object has been evacuated already. Update the edge; // whoever forwarded the object will make sure it's eventually // traced. gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); - return 0; + return COPY_SPACE_FORWARD_UPDATED; } } @@ -640,6 +651,7 @@ copy_space_forward_if_traced_atomic(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref) { struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref); +retry: switch (fwd.state) { case GC_FORWARDING_STATE_NOT_FORWARDED: return 0; @@ -648,11 +660,10 @@ copy_space_forward_if_traced_atomic(struct copy_space *space, // known. for (size_t spin_count = 0;; spin_count++) { if (gc_atomic_forward_retry_busy(&fwd)) - break; + goto retry; yield_for_spin(spin_count); } - GC_ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED); - // Fall through. + GC_CRASH(); // Unreachable. case GC_FORWARDING_STATE_FORWARDED: gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd))); return 1; @@ -661,24 +672,24 @@ copy_space_forward_if_traced_atomic(struct copy_space *space, } } -static inline int +static inline enum copy_space_forward_result copy_space_forward_nonatomic(struct copy_space *space, struct gc_edge edge, struct gc_ref old_ref, struct copy_space_allocator *alloc) { uintptr_t forwarded = gc_object_forwarded_nonatomic(old_ref); if (forwarded) { gc_edge_update(edge, gc_ref(forwarded)); - return 0; + return COPY_SPACE_FORWARD_UPDATED; } else { size_t size; gc_trace_object(old_ref, NULL, NULL, NULL, &size); - struct gc_ref new_ref = - copy_space_allocate(alloc, space, size, - copy_space_gc_during_evacuation, NULL); + struct gc_ref new_ref = copy_space_allocate(alloc, space, size); + if (gc_ref_is_null(new_ref)) + return COPY_SPACE_FORWARD_FAILED; memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref), size); gc_object_forward_nonatomic(old_ref, new_ref); gc_edge_update(edge, new_ref); - return 1; + return COPY_SPACE_FORWARD_EVACUATED; } } @@ -694,7 +705,7 @@ copy_space_forward_if_traced_nonatomic(struct copy_space *space, return 0; } -static inline int +static inline enum copy_space_forward_result copy_space_forward(struct copy_space *src_space, struct copy_space *dst_space, struct gc_edge edge, struct gc_ref old_ref, diff --git a/src/pcc.c b/src/pcc.c index ff10375ef..422276afd 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -287,6 +287,31 @@ static inline int edge_is_from_survivor(struct gc_heap *heap, return copy_space_contains_edge_aligned(heap_new_space(heap), edge); } +static inline int forward(struct copy_space *src_space, + struct copy_space *dst_space, + struct gc_edge edge, + struct gc_ref ref, + struct copy_space_allocator *dst_alloc) { + switch (copy_space_forward(src_space, dst_space, edge, ref, dst_alloc)) { + case COPY_SPACE_FORWARD_UPDATED: + return 0; + case COPY_SPACE_FORWARD_EVACUATED: + return 1; + case COPY_SPACE_FORWARD_FAILED: + // If space is really tight and reordering of objects during evacuation + 
// resulted in more end-of-block fragmentation and thus block use than + // before collection started, we can actually run out of memory while + // collecting. We should probably attempt to expand the heap here, at + // least by a single block; it's better than the alternatives. For now, + // abort. + fprintf(stderr, "Out of memory\n"); + GC_CRASH(); + break; + default: + GC_CRASH(); + } +} + static inline int do_minor_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, struct gc_trace_worker_data *data) { @@ -324,16 +349,32 @@ static inline int do_minor_trace(struct gc_heap *heap, struct gc_edge edge, // However however, it is hard to distinguish between edges from promoted // objects and edges from old objects, so we mostly just rely on an // idempotent "log if unlogged" operation instead. - int promote = copy_space_should_promote(new_space, ref); - struct copy_space *dst_space = promote ? old_space : new_space; - struct copy_space_allocator *alloc = promote - ? trace_worker_old_space_allocator(data) - : trace_worker_new_space_allocator(data); - // Update the remembered set for promoted-to-survivor edges. - if (!promote && !edge_is_from_survivor(heap, edge) - && remember_edge_to_survivor_object(heap, edge)) - gc_field_set_writer_add_edge(trace_worker_field_logger(data), edge); - return copy_space_forward(new_space, dst_space, edge, ref, alloc); + if (!copy_space_should_promote(new_space, ref)) { + // Try to leave the object in newspace as a survivor. If the edge is from + // a promoted object, we will need to add it to the remembered set. + if (!edge_is_from_survivor(heap, edge) + && remember_edge_to_survivor_object(heap, edge)) { + // Log the edge even though in rare conditions the referent could end up + // being promoted by us (if we run out of newspace) or a remote + // evacuation thread (if they run out of newspace). + gc_field_set_writer_add_edge(trace_worker_field_logger(data), edge); + } + switch (copy_space_forward(new_space, new_space, edge, ref, + trace_worker_new_space_allocator(data))) { + case COPY_SPACE_FORWARD_UPDATED: + return 0; + case COPY_SPACE_FORWARD_EVACUATED: + return 1; + case COPY_SPACE_FORWARD_FAILED: + // Ran out of newspace! Fall through to promote instead. + break; + default: + GC_CRASH(); + } + } + // Promote the object. 
+ return forward(new_space, old_space, edge, ref, + trace_worker_old_space_allocator(data)); } else { // Note that although the target of the edge might not be in lospace, this // will do what we want and return 1 if and only if ref is was a young @@ -354,16 +395,16 @@ static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, struct copy_space *new_space = heap_new_space(heap); struct copy_space *old_space = heap_old_space(heap); if (new_space_contains(heap, ref)) - return copy_space_forward(new_space, old_space, edge, ref, - trace_worker_old_space_allocator(data)); + return forward(new_space, old_space, edge, ref, + trace_worker_old_space_allocator(data)); if (old_space_contains(heap, ref)) - return copy_space_forward(old_space, old_space, edge, ref, - trace_worker_old_space_allocator(data)); + return forward(old_space, old_space, edge, ref, + trace_worker_old_space_allocator(data)); } else { if (GC_LIKELY(copy_space_contains(heap_mono_space(heap), ref))) - return copy_space_forward(heap_mono_space(heap), heap_mono_space(heap), - edge, ref, - trace_worker_mono_space_allocator(data)); + return forward(heap_mono_space(heap), heap_mono_space(heap), + edge, ref, + trace_worker_mono_space_allocator(data)); } // Fall through for objects in large or extern spaces. @@ -916,12 +957,17 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { if (size > gc_allocator_large_threshold()) return allocate_large(mut, size); - struct gc_ref ret = - copy_space_allocate(&mut->allocator, - heap_allocation_space(mutator_heap(mut)), - size, - get_more_empty_blocks_for_mutator, - mut); + struct gc_ref ret; + while (1) { + ret = copy_space_allocate(&mut->allocator, + heap_allocation_space(mutator_heap(mut)), + size); + if (gc_ref_is_null(ret)) + trigger_collection(mut, GC_COLLECTION_MINOR); + else + break; + } + gc_clear_fresh_allocation(ret, size); return gc_ref_heap_object(ret); } From f9c2ce04d415d24c209bb93fe120ce522de0d643 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 13 Jan 2025 21:08:13 +0100 Subject: [PATCH 359/403] Add generational-pcc to embed.mk --- embed.mk | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/embed.mk b/embed.mk index 9ef4d8ab7..a98f7df48 100644 --- a/embed.mk +++ b/embed.mk @@ -47,6 +47,10 @@ GC_STEM_pcc = pcc GC_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 GC_LIBS_pcc = -lm +GC_STEM_generational_pcc = $(GC_STEM_pcc) +GC_CFLAGS_generational_pcc = $(GC_CFLAGS_pcc) -DGC_GENERATIONAL=1 +GC_LIBS_generational_pcc = $(GC_LIBS_pcc) + define mmc_variant GC_STEM_$(1) = mmc GC_CFLAGS_$(1) = $(2) From cc68a9a6107dc2767a1bd5a228cca1e222ceeb0f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 13 Jan 2025 21:42:18 +0100 Subject: [PATCH 360/403] Update docs --- README.md | 6 +++--- doc/collector-pcc.md | 7 +++++-- doc/collector-semi.md | 4 ++-- doc/collectors.md | 8 +++++--- doc/manual.md | 10 ++++++++-- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 466a5ea59..9ef9e3cc9 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,9 @@ See the [documentation](./doc/README.md). ## Status and roadmap -As of September 2024, Whippet is feature-complete! Of course there will -surely be new features to build as Whippet gets integrated it into -language run-times, but the basics are there. +As of January 2025, Whippet is good to go! Of course there will surely +be new features to build as Whippet gets integrated it into language +run-times, but the basics are there. 
The next phase on the roadmap is support for tracing, and some performance noodling. diff --git a/doc/collector-pcc.md b/doc/collector-pcc.md index ef8085ed7..dc7bd1fb0 100644 --- a/doc/collector-pcc.md +++ b/doc/collector-pcc.md @@ -20,8 +20,11 @@ the same performance characteristics with a single mutator and with parallelism disabled, additionally allowing multiple mutators, and scaling better with multiple tracing threads. -Also like `semi`, `pcc` is not generational yet. If and when `pcc` -grows a young generation, it would be a great collector. +`pcc` has a generational configuration, conventionally referred to as +`generational-pcc`, in which both the nursery and the old generation are +copy spaces. Objects stay in the nursery for one cycle before moving on +to the old generation. This configuration is a bit new (January 2025) +and still needs some tuning. ## Implementation notes diff --git a/doc/collector-semi.md b/doc/collector-semi.md index 1900d2d27..ea84720df 100644 --- a/doc/collector-semi.md +++ b/doc/collector-semi.md @@ -19,5 +19,5 @@ size, and performs best with ample heap sizes; between 3× and 5× is best. The semi-space collector doesn't support multiple mutator threads. If -you want a whole-heap copying collector for a multi-threaded mutator, -look at [pcc](./collector-pcc.md). +you want a copying collector for a multi-threaded mutator, look at +[pcc](./collector-pcc.md). diff --git a/doc/collectors.md b/doc/collectors.md index 6e21fcd77..90f4867fc 100644 --- a/doc/collectors.md +++ b/doc/collectors.md @@ -3,8 +3,9 @@ Whippet has four collectors currently: - [Semi-space collector (`semi`)](./collector-semi.md): For single-threaded embedders who are not too tight on memory. - - [Parallel copying collector (`pcc`)](./collector-pcc.md): Like `semi`, - but with support for multiple mutator and tracing threads. + - [Parallel copying collector (`pcc`)](./collector-pcc.md): Like + `semi`, but with support for multiple mutator and tracing threads and + generational collection. - [Mostly marking collector (`mmc`)](./collector-mmc.md): Immix-inspired collector. Optionally parallel, conservative (stack and/or heap), and/or generational. @@ -30,8 +31,9 @@ precise roots, then go for `stack-conservative-parallel-mmc` directly. ## More collectors -It would be nice to have a classic generational GC, perhaps using +It would be nice to have a generational GC that uses the space from `parallel-mmc` for the old generation but a pcc-style copying nursery. +We have `generational-pcc` now, so this should be possible. Support for concurrent marking in `mmc` would be good as well, perhaps with a SATB barrier. (Or, if you are the sort of person to bet on diff --git a/doc/manual.md b/doc/manual.md index c299128bf..1ddfcb556 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -112,8 +112,8 @@ If the `gc_atomic_forward`'s state is `BUSY`, the collector will call `gc_atomic_forward_retry_busy`; a return value of 0 means the object is still busy, because another thread is attempting to forward it. Otherwise the forwarding state becomes either `FORWARDED`, if the other -thread succeeded in forwarding it, or `ABORTED`, indicating that the -other thread failed to forward it. +thread succeeded in forwarding it, or go back to `NOT_FORWARDED`, +indicating that the other thread failed to forward it. If the forwarding state is `FORWARDED`, the collector will call `gc_atomic_forward_address` to get the new address. 
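Condensed into a single routine, the protocol looks like the sketch below,
which mirrors `copy_space_forward_if_traced_atomic` from src/copy-space.h
after this series; the function name is illustrative and the `gc-forwarding.h`
API is assumed to be in scope:

    static inline int
    forward_edge_if_already_forwarded(struct gc_edge edge,
                                      struct gc_ref old_ref) {
      struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref);
    retry:
      switch (fwd.state) {
      case GC_FORWARDING_STATE_NOT_FORWARDED:
        return 0;   // Nobody forwarded it; the caller deals with the object.
      case GC_FORWARDING_STATE_BUSY:
        // Another thread holds the forwarding word; spin until it either
        // publishes an address (FORWARDED) or backs off (NOT_FORWARDED).
        for (size_t spin_count = 0;; spin_count++) {
          if (gc_atomic_forward_retry_busy(&fwd))
            goto retry;                  // Nonzero: state changed, re-dispatch.
          yield_for_spin(spin_count);    // Zero: still busy.
        }
      case GC_FORWARDING_STATE_FORWARDED:
        gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd)));
        return 1;
      default:
        GC_CRASH();
      }
    }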
@@ -351,6 +351,12 @@ $(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 \ -include foo-embedder.h -o gc.o -c pcc.c ``` +You can also build `pcc` in a generational configuration by passing +`-DGC_GENERATIONAL=1`. The nursery is 2 MB per active mutator, capped +to the number of processors, so if the last cycle had a maximum of 4 +mutator threads active at the same time and your machine has 24 cores, +your nursery would be 8 MB. + #### Building `mmc` Finally, there is the mostly-marking collector. It can collect roots From 685c63ab3aa92fb8a25fc1730a1bd1ea11933bf3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 15 Jan 2025 22:31:48 +0100 Subject: [PATCH 361/403] Trim remembered-set during minor GC When visiting remembered-set roots, if the target is no longer in newspace, forget the edge. --- src/field-set.h | 21 +++++++++++++-------- src/large-object-space.h | 10 ++++++++++ src/mmc.c | 9 ++++++++- src/pcc.c | 22 +++++++++++++++++++++- 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/src/field-set.h b/src/field-set.h index f5b2e42c6..ff9a68e83 100644 --- a/src/field-set.h +++ b/src/field-set.h @@ -174,21 +174,26 @@ gc_field_set_clear(struct gc_field_set *set, static inline void gc_field_set_visit_edge_buffer(struct gc_field_set *set, struct gc_edge_buffer *buf, - void (*visit)(struct gc_edge, - struct gc_heap*, - void *data), + int (*visit)(struct gc_edge, + struct gc_heap*, + void *data), struct gc_heap *heap, void *data) GC_ALWAYS_INLINE; static inline void gc_field_set_visit_edge_buffer(struct gc_field_set *set, struct gc_edge_buffer *buf, - void (*visit)(struct gc_edge, - struct gc_heap*, - void *data), + int (*visit)(struct gc_edge, + struct gc_heap*, + void *data), struct gc_heap *heap, void *data) { - for (size_t i = 0; i < buf->size; i++) - visit(buf->edges[i], heap, data); + size_t i = 0; + while (i < buf->size) { + if (visit(buf->edges[i], heap, data)) + i++; + else + buf->edges[i] = buf->edges[--buf->size]; + } gc_field_set_release_buffer(set, buf); } diff --git a/src/large-object-space.h b/src/large-object-space.h index f3470e0e4..285664d03 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -249,6 +249,16 @@ large_object_space_remember_edge(struct large_object_space *space, return remembered; } +static void +large_object_space_forget_edge(struct large_object_space *space, + struct gc_edge edge) { + uintptr_t edge_addr = gc_edge_address(edge); + pthread_mutex_lock(&space->lock); + GC_ASSERT(address_set_contains(&space->remembered_edges, edge_addr)); + address_set_remove(&space->remembered_edges, edge_addr); + pthread_mutex_unlock(&space->lock); +} + static void large_object_space_clear_remembered_edges(struct large_object_space *space) { address_set_clear(&space->remembered_edges); diff --git a/src/mmc.c b/src/mmc.c index 0af725138..445bda8ec 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -264,6 +264,13 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); } +static inline int +trace_remembered_edge(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { + tracer_visit(edge, heap, trace_data); + // Keep the edge in the remembered set; we clear these in bulk later. 
+ return 1; +} + static inline struct gc_ref do_trace_conservative_ref(struct gc_heap *heap, struct gc_conservative_ref ref, int possibly_interior) { @@ -380,7 +387,7 @@ trace_root(struct gc_root root, struct gc_heap *heap, break; case GC_ROOT_KIND_EDGE_BUFFER: gc_field_set_visit_edge_buffer(&heap->remembered_set, root.edge_buffer, - tracer_visit, heap, worker); + trace_remembered_edge, heap, worker); break; default: GC_CRASH(); diff --git a/src/pcc.c b/src/pcc.c index 422276afd..2f7178f00 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -544,6 +544,26 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) { gc_trace_worker_enqueue(worker, gc_edge_ref(edge)); } +static inline int +trace_remembered_edge(struct gc_edge edge, struct gc_heap *heap, + void *trace_data) { + GC_ASSERT(is_minor_collection(heap)); + tracer_visit(edge, heap, trace_data); + + // Return 1 if the edge should be kept in the remset, which is the + // case only for new objects that survive the minor GC, and only the + // nursery copy space has survivors. + if (new_space_contains(heap, gc_edge_ref(edge))) + return 1; // Keep edge in remset. + // Otherwise remove field-logging bit and return 0 to indicate that + // the remembered field set should remove this edge. + if (copy_space_contains_edge(heap_old_space(heap), edge)) + copy_space_forget_edge(heap_old_space(heap), edge); + else + large_object_space_forget_edge(heap_large_object_space(heap), edge); + return 0; +} + static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, struct gc_trace_worker *worker) { #ifdef DEBUG @@ -582,7 +602,7 @@ static inline void trace_root(struct gc_root root, struct gc_heap *heap, break; case GC_ROOT_KIND_EDGE_BUFFER: gc_field_set_visit_edge_buffer(heap_remembered_set(heap), root.edge_buffer, - tracer_visit, heap, worker); + trace_remembered_edge, heap, worker); break; default: GC_CRASH(); From f93777c1337214966c64b3e068584723aff85f3a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 15 Jan 2025 22:32:31 +0100 Subject: [PATCH 362/403] generational-pcc: Make a smaller pending-ephemeron nursery table Otherwise we end up visiting a pending-ephemeron set that is proportional in size to the whole heap. 
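For scale (the heap size here is an illustrative assumption; 2 MB is the
default per-processor nursery size from earlier in this series), the two
tables are dimensioned as:

    major table target:   heap->size * pending_ephemerons_size_factor
    nursery table target: (per_processor_nursery_size * 2) * pending_ephemerons_size_factor
                          = 4 MB * factor, independent of heap size

so with a 1 GiB heap, a minor collection now walks an ephemeron table sized
for 4 MiB of data instead of one sized for the whole heap.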
--- src/gc-ephemeron.c | 1 + src/pcc.c | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/gc-ephemeron.c b/src/gc-ephemeron.c index 90f82e8ca..0f1f9720a 100644 --- a/src/gc-ephemeron.c +++ b/src/gc-ephemeron.c @@ -199,6 +199,7 @@ ephemeron_list_follow(struct gc_ephemeron **loc, struct gc_ephemeron** (*get_next)(struct gc_ephemeron*), int (*is_live)(struct gc_ephemeron*)) { struct gc_ephemeron *head = atomic_load_explicit(loc, memory_order_acquire); + if (!head) return NULL; while (1) { struct gc_ephemeron *new_head = head; diff --git a/src/pcc.c b/src/pcc.c index 2f7178f00..eb2d1ed18 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -54,6 +54,9 @@ struct gc_heap { size_t processor_count; size_t max_active_mutator_count; int check_pending_ephemerons; +#if GC_GENERATIONAL + struct gc_pending_ephemerons *nursery_pending_ephemerons; +#endif struct gc_pending_ephemerons *pending_ephemerons; struct gc_finalizer_state *finalizer_state; size_t mutator_count; @@ -768,7 +771,7 @@ static void resolve_ephemerons_lazily(struct gc_heap *heap) { static void resolve_ephemerons_eagerly(struct gc_heap *heap) { atomic_store_explicit(&heap->check_pending_ephemerons, 1, memory_order_release); - gc_scan_pending_ephemerons(heap->pending_ephemerons, heap, 0, 1); + gc_scan_pending_ephemerons(gc_heap_pending_ephemerons(heap), heap, 0, 1); } static void trace_resolved_ephemerons(struct gc_heap *heap) { @@ -794,7 +797,7 @@ static void resolve_finalizers(struct gc_heap *heap) { } static void sweep_ephemerons(struct gc_heap *heap) { - return gc_sweep_pending_ephemerons(heap->pending_ephemerons, 0, 1); + return gc_sweep_pending_ephemerons(gc_heap_pending_ephemerons(heap), 0, 1); } static int @@ -1059,6 +1062,10 @@ void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, } struct gc_pending_ephemerons *gc_heap_pending_ephemerons(struct gc_heap *heap) { +#if GC_GENERATIONAL + if (is_minor_collection(heap)) + return heap->nursery_pending_ephemerons; +#endif return heap->pending_ephemerons; } @@ -1088,14 +1095,25 @@ void gc_set_finalizer_callback(struct gc_heap *heap, gc_finalizer_state_set_callback(heap->finalizer_state, callback); } -static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { - struct gc_pending_ephemerons *cur = heap->pending_ephemerons; - size_t target = heap->size * heap->pending_ephemerons_size_factor; +static int +heap_do_prepare_pending_ephemerons(struct gc_heap *heap, + struct gc_pending_ephemerons **loc, + size_t size) { + size_t target = size * heap->pending_ephemerons_size_factor; double slop = heap->pending_ephemerons_size_slop; - heap->pending_ephemerons = gc_prepare_pending_ephemerons(cur, target, slop); + return !!(*loc = gc_prepare_pending_ephemerons(*loc, target, slop)); +} - return !!heap->pending_ephemerons; +static int heap_prepare_pending_ephemerons(struct gc_heap *heap) { + return heap_do_prepare_pending_ephemerons(heap, &heap->pending_ephemerons, + heap->size) +#if GC_GENERATIONAL + && heap_do_prepare_pending_ephemerons(heap, + &heap->nursery_pending_ephemerons, + heap->per_processor_nursery_size * 2) +#endif + ; } struct gc_options { From 2a51399896f5237b38bfdd75218ce7888b061a04 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 21 Jan 2025 21:09:03 +0100 Subject: [PATCH 363/403] nofl: Disable some consistency checks when tracing conservatively --- src/nofl-space.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/nofl-space.h b/src/nofl-space.h index 05759a033..a7ee4881b 100644 --- 
a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1222,6 +1222,10 @@ static void nofl_space_verify_sweepable_blocks(struct nofl_space *space, struct nofl_block_list *list) { + if (GC_CONSERVATIVE_TRACE) + // No intrinsic way to measure object size, only the extrinsic + // metadata bytes. + return; for (struct nofl_block_ref b = nofl_block_for_addr(list->blocks); !nofl_block_is_null(b); b = nofl_block_next(b)) { @@ -1254,6 +1258,10 @@ nofl_space_verify_sweepable_blocks(struct nofl_space *space, static void nofl_space_verify_swept_blocks(struct nofl_space *space, struct nofl_block_list *list) { + if (GC_CONSERVATIVE_TRACE) + // No intrinsic way to measure object size, only the extrinsic + // metadata bytes. + return; for (struct nofl_block_ref b = nofl_block_for_addr(list->blocks); !nofl_block_is_null(b); b = nofl_block_next(b)) { From 7885ea1037c421e81f9e3edcc2856fabe1070257 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 21 Jan 2025 21:09:25 +0100 Subject: [PATCH 364/403] nofl: Prevent needless expansion Releasing memory proceeds until there is (-NOFL_BLOCK_SIZE,0] bytes to release; we should only expand when the number of bytes to reacquire is large enough. --- src/nofl-space.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index a7ee4881b..66aa0ac62 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -1770,7 +1770,7 @@ static void nofl_space_expand(struct nofl_space *space, size_t bytes) { double overhead = ((double)NOFL_META_BLOCKS_PER_SLAB) / NOFL_BLOCKS_PER_SLAB; ssize_t to_acquire = -nofl_space_maybe_reacquire_memory(space, bytes); - if (to_acquire <= 0) return; + if (to_acquire < NOFL_BLOCK_SIZE) return; to_acquire *= (1 + overhead); size_t reserved = align_up(to_acquire, NOFL_SLAB_SIZE); size_t nslabs = reserved / NOFL_SLAB_SIZE; From 7a9de35aaa776fdb9f3d329700bccc063f357b84 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 23 Jan 2025 15:06:44 +0100 Subject: [PATCH 365/403] lospace: Rely on object_map to be immutable during collection This avoids having multiple threads serialize through a mutex. We still have to allow for mutation on object_tree and remembered_edges, though. --- src/large-object-space.h | 260 +++++++++++++++++++++------------------ src/mmc.c | 7 +- src/pcc.c | 10 +- src/semi.c | 9 +- 4 files changed, 150 insertions(+), 136 deletions(-) diff --git a/src/large-object-space.h b/src/large-object-space.h index 285664d03..897a7b9e6 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -16,21 +16,25 @@ #include "background-thread.h" #include "freelist.h" -// Logically the large object space is a treadmill space -- somewhat like a -// copying collector, in that we allocate into tospace, and collection flips -// tospace to fromspace, except that we just keep a record on the side of which -// objects are in which space. That way we slot into the abstraction of a -// copying collector while not actually copying data. +// A mark-sweep space with generational support. 
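// Sketch of the mark discipline implied by the definitions below; the
// predicate is hypothetical and not part of this patch.  Objects allocated
// since the last collection keep LARGE_OBJECT_NURSERY (0); marking an object
// presumably stores the space-wide `marked` value, which flips between
// LARGE_OBJECT_MARK_0 and LARGE_OBJECT_MARK_1 at each major GC, so everything
// marked in the previous major cycle reads as stale without touching the
// objects individually.
//
//   static inline int
//   large_object_is_marked_this_cycle(struct large_object_space *space,
//                                     struct large_object_node *node) {
//     return large_object_node_get_mark(node) == space->marked;
//   }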
struct gc_heap; +enum large_object_state { + LARGE_OBJECT_NURSERY = 0, + LARGE_OBJECT_MARKED_BIT = 1, + LARGE_OBJECT_MARK_TOGGLE_BIT = 2, + LARGE_OBJECT_MARK_0 = LARGE_OBJECT_MARKED_BIT, + LARGE_OBJECT_MARK_1 = LARGE_OBJECT_MARKED_BIT | LARGE_OBJECT_MARK_TOGGLE_BIT +}; + struct large_object { uintptr_t addr; size_t size; }; struct large_object_node; struct large_object_live_data { - uint8_t is_survivor; + uint8_t mark; }; struct large_object_dead_data { uint8_t age; @@ -65,17 +69,35 @@ DEFINE_FREELIST(large_object_freelist, sizeof(uintptr_t) * 8 - 1, 2, struct large_object_node*); struct large_object_space { - // Access to all members protected by lock. + // Lock for object_map, quarantine, nursery, and marked. pthread_mutex_t lock; + // Lock for object_tree. + pthread_mutex_t object_tree_lock; + // Lock for remembered_edges. + pthread_mutex_t remembered_edges_lock; + // Locking order: You must hold the space lock when taking + // object_tree_lock. Take no other lock while holding + // object_tree_lock. remembered_edges_lock is a leaf; take no locks + // when holding it. + + // The value for a large_object_node's "mark" field indicating a + // marked object; always nonzero, and alternating between two values + // at every major GC. + uint8_t marked; // Splay tree of objects, keyed by tuple. Useful when // looking up object-for-address. struct large_object_tree object_tree; + // Hash table of objects, where values are pointers to splay tree // nodes. Useful when you have the object address and just want to // check something about it (for example its size). struct address_map object_map; + // In generational configurations, we collect all allocations in the + // last cycle into the nursery. + struct address_map nursery; + // Size-segregated freelist of dead objects. Allocations are first // served from the quarantine freelist before falling back to the OS // if needed. Collected objects spend a second or two in quarantine @@ -83,6 +105,10 @@ struct large_object_space { // mucking about too much with the TLB and so on. struct large_object_freelist quarantine; + // Set of edges from lospace that may reference young objects, + // possibly in other spaces. + struct address_set remembered_edges; + size_t page_size; size_t page_size_log2; size_t total_pages; @@ -90,17 +116,6 @@ struct large_object_space { size_t live_pages_at_last_collection; size_t pages_freed_by_last_collection; int synchronous_release; - - // A partition of the set of live objects into three sub-spaces. If - // all collections are major, the survivor space will always be empty. - // The values of these maps are splay tree nodes. - struct address_map from_space; - struct address_map to_space; - struct address_map survivor_space; - - // Set of edges from lospace that may reference young objects, - // possibly in other spaces. 
- struct address_set remembered_edges; }; static size_t @@ -113,11 +128,17 @@ large_object_space_size_at_last_collection(struct large_object_space *space) { return space->live_pages_at_last_collection << space->page_size_log2; } +static inline int +large_object_space_contains_with_lock(struct large_object_space *space, + struct gc_ref ref) { + return address_map_contains(&space->object_map, gc_ref_value(ref)); +} + static inline int large_object_space_contains(struct large_object_space *space, struct gc_ref ref) { pthread_mutex_lock(&space->lock); - int ret = address_map_contains(&space->object_map, gc_ref_value(ref)); + int ret = large_object_space_contains_with_lock(space, ref); pthread_mutex_unlock(&space->lock); return ret; } @@ -125,37 +146,22 @@ large_object_space_contains(struct large_object_space *space, static inline struct gc_ref large_object_space_object_containing_edge(struct large_object_space *space, struct gc_edge edge) { - pthread_mutex_lock(&space->lock); + pthread_mutex_lock(&space->object_tree_lock); struct large_object_node *node = large_object_tree_lookup(&space->object_tree, gc_edge_address(edge)); uintptr_t addr = (node && node->value.is_live) ? node->key.addr : 0; - pthread_mutex_unlock(&space->lock); + pthread_mutex_unlock(&space->object_tree_lock); return gc_ref(addr); } -static void -large_object_space_flip_survivor(uintptr_t addr, uintptr_t node_bits, - void *data) { - struct large_object_space *space = data; - struct large_object_node *node = (void*)node_bits; - GC_ASSERT(node->value.is_live && node->value.live.is_survivor); - node->value.live.is_survivor = 0; - address_map_add(&space->from_space, addr, (uintptr_t)node); -} - static void large_object_space_start_gc(struct large_object_space *space, int is_minor_gc) { - // Flip. Note that when we flip, fromspace is empty, but it might have - // allocated storage, so we do need to do a proper swap. - struct address_map tmp; - memcpy(&tmp, &space->from_space, sizeof(tmp)); - memcpy(&space->from_space, &space->to_space, sizeof(tmp)); - memcpy(&space->to_space, &tmp, sizeof(tmp)); - + // Take the space lock to prevent + // large_object_space_process_quarantine from concurrently mutating + // the object map. 
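/* A short sketch of the locking order documented on the space struct
 * above (space lock first, then object_tree_lock; remembered_edges_lock
 * is a leaf taken on its own), assuming the declarations of this header
 * and a helper name of our own choosing: */
static void example_lospace_lock_order(struct large_object_space *space) {
  pthread_mutex_lock(&space->lock);              /* 1: space lock */
  pthread_mutex_lock(&space->object_tree_lock);  /* 2: tree lock, only under the space lock */
  /* ... mutate object_map and object_tree together ... */
  pthread_mutex_unlock(&space->object_tree_lock);
  pthread_mutex_unlock(&space->lock);

  /* remembered_edges_lock is a leaf: take no other lock while holding it. */
  pthread_mutex_lock(&space->remembered_edges_lock);
  /* ... add or remove a remembered edge ... */
  pthread_mutex_unlock(&space->remembered_edges_lock);
}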
+ pthread_mutex_lock(&space->lock); if (!is_minor_gc) { - address_map_for_each(&space->survivor_space, - large_object_space_flip_survivor, space); - address_map_clear(&space->survivor_space); + space->marked ^= LARGE_OBJECT_MARK_TOGGLE_BIT; space->live_pages_at_last_collection = 0; } } @@ -170,56 +176,57 @@ large_object_space_object_size(struct large_object_space *space, return node->key.size; } -static void -large_object_space_do_copy(struct large_object_space *space, - struct large_object_node *node) { - GC_ASSERT(address_map_contains(&space->from_space, node->key.addr)); +static uint8_t* +large_object_node_mark_loc(struct large_object_node *node) { GC_ASSERT(node->value.is_live); - GC_ASSERT(!node->value.live.is_survivor); - uintptr_t addr = node->key.addr; - size_t bytes = node->key.size; - uintptr_t node_bits = (uintptr_t)node; - space->live_pages_at_last_collection += bytes >> space->page_size_log2; - address_map_remove(&space->from_space, addr); - if (GC_GENERATIONAL) { - node->value.live.is_survivor = 1; - address_map_add(&space->survivor_space, addr, node_bits); - } else { - address_map_add(&space->to_space, addr, node_bits); - } + return &node->value.live.mark; +} + +static uint8_t +large_object_node_get_mark(struct large_object_node *node) { + return atomic_load_explicit(large_object_node_mark_loc(node), + memory_order_acquire); +} + +static struct large_object_node* +large_object_space_lookup(struct large_object_space *space, struct gc_ref ref) { + return (struct large_object_node*) address_map_lookup(&space->object_map, + gc_ref_value(ref), + 0); } static int -large_object_space_copy(struct large_object_space *space, struct gc_ref ref) { - int copied = 0; - uintptr_t addr = gc_ref_value(ref); - pthread_mutex_lock(&space->lock); - uintptr_t node_bits = address_map_lookup(&space->from_space, addr, 0); - if (node_bits) { - large_object_space_do_copy(space, (struct large_object_node*) node_bits); - // Object is grey; place it on mark stack to visit its fields. 
- copied = 1; - } - pthread_mutex_unlock(&space->lock); - return copied; +large_object_space_mark(struct large_object_space *space, struct gc_ref ref) { + struct large_object_node *node = large_object_space_lookup(space, ref); + if (!node) + return 0; + GC_ASSERT(node->value.is_live); + + uint8_t *loc = large_object_node_mark_loc(node); + uint8_t mark = atomic_load_explicit(loc, memory_order_relaxed); + do { + if (mark == space->marked) + return 0; + } while (!atomic_compare_exchange_weak_explicit(loc, &mark, space->marked, + memory_order_acq_rel, + memory_order_acquire)); + + size_t pages = node->key.size >> space->page_size_log2; + space->live_pages_at_last_collection += pages; + + return 1; } static int -large_object_space_is_copied(struct large_object_space *space, +large_object_space_is_marked(struct large_object_space *space, struct gc_ref ref) { - GC_ASSERT(large_object_space_contains(space, ref)); - int copied = 0; - uintptr_t addr = gc_ref_value(ref); - pthread_mutex_lock(&space->lock); - copied = !address_map_contains(&space->from_space, addr); - pthread_mutex_unlock(&space->lock); - return copied; -} + struct large_object_node *node = large_object_space_lookup(space, ref); + if (!node) + return 0; + GC_ASSERT(node->value.is_live); -static int -large_object_space_is_survivor_with_lock(struct large_object_space *space, - struct gc_ref ref) { - return address_map_contains(&space->survivor_space, gc_ref_value(ref)); + return atomic_load_explicit(large_object_node_mark_loc(node), + memory_order_acquire) == space->marked; } static int @@ -227,7 +234,7 @@ large_object_space_is_survivor(struct large_object_space *space, struct gc_ref ref) { GC_ASSERT(large_object_space_contains(space, ref)); pthread_mutex_lock(&space->lock); - int old = large_object_space_is_survivor_with_lock(space, ref); + int old = large_object_space_is_marked(space, ref); pthread_mutex_unlock(&space->lock); return old; } @@ -237,15 +244,17 @@ large_object_space_remember_edge(struct large_object_space *space, struct gc_ref obj, struct gc_edge edge) { GC_ASSERT(large_object_space_contains(space, obj)); - int remembered = 0; + if (!large_object_space_is_survivor(space, obj)) + return 0; + uintptr_t edge_addr = gc_edge_address(edge); - pthread_mutex_lock(&space->lock); - if (large_object_space_is_survivor_with_lock(space, obj) - && !address_set_contains(&space->remembered_edges, edge_addr)) { + int remembered = 0; + pthread_mutex_lock(&space->remembered_edges_lock); + if (!address_set_contains(&space->remembered_edges, edge_addr)) { address_set_add(&space->remembered_edges, edge_addr); remembered = 1; } - pthread_mutex_unlock(&space->lock); + pthread_mutex_unlock(&space->remembered_edges_lock); return remembered; } @@ -253,10 +262,10 @@ static void large_object_space_forget_edge(struct large_object_space *space, struct gc_edge edge) { uintptr_t edge_addr = gc_edge_address(edge); - pthread_mutex_lock(&space->lock); + pthread_mutex_lock(&space->remembered_edges_lock); GC_ASSERT(address_set_contains(&space->remembered_edges, edge_addr)); address_set_remove(&space->remembered_edges, edge_addr); - pthread_mutex_unlock(&space->lock); + pthread_mutex_unlock(&space->remembered_edges_lock); } static void @@ -264,11 +273,6 @@ large_object_space_clear_remembered_edges(struct large_object_space *space) { address_set_clear(&space->remembered_edges); } -static int large_object_space_mark_object(struct large_object_space *space, - struct gc_ref ref) { - return large_object_space_copy(space, ref); -} - static void 
large_object_space_add_to_freelist(struct large_object_space *space, struct large_object_node *node) { @@ -297,18 +301,24 @@ large_object_space_remove_from_freelist(struct large_object_space *space, } static void -large_object_space_reclaim_one(uintptr_t addr, uintptr_t node_bits, - void *data) { +large_object_space_sweep_one(uintptr_t addr, uintptr_t node_bits, + void *data) { struct large_object_space *space = data; struct large_object_node *node = (struct large_object_node*) node_bits; + if (!GC_GENERATIONAL && !node->value.is_live) + return; GC_ASSERT(node->value.is_live); - large_object_space_add_to_freelist(space, node); + uint8_t mark = atomic_load_explicit(large_object_node_mark_loc(node), + memory_order_acquire); + if (mark != space->marked) + large_object_space_add_to_freelist(space, node); } static void large_object_space_process_quarantine(void *data) { struct large_object_space *space = data; pthread_mutex_lock(&space->lock); + pthread_mutex_lock(&space->object_tree_lock); for (size_t idx = 0; idx < large_object_freelist_num_size_classes(); idx++) { struct large_object_node **link = &space->quarantine.buckets[idx]; for (struct large_object_node *node = *link; node; node = *link) { @@ -324,16 +334,23 @@ large_object_space_process_quarantine(void *data) { } } } + pthread_mutex_unlock(&space->object_tree_lock); pthread_mutex_unlock(&space->lock); } static void large_object_space_finish_gc(struct large_object_space *space, int is_minor_gc) { - pthread_mutex_lock(&space->lock); - address_map_for_each(&space->from_space, large_object_space_reclaim_one, - space); - address_map_clear(&space->from_space); + if (GC_GENERATIONAL) { + address_map_for_each(is_minor_gc ? &space->nursery : &space->object_map, + large_object_space_sweep_one, + space); + address_map_clear(&space->nursery); + } else { + address_map_for_each(&space->object_map, + large_object_space_sweep_one, + space); + } size_t free_pages = space->total_pages - space->live_pages_at_last_collection; space->pages_freed_by_last_collection = free_pages - space->free_pages; @@ -366,24 +383,20 @@ large_object_space_mark_conservative_ref(struct large_object_space *space, addr -= displacement; } - pthread_mutex_lock(&space->lock); - struct large_object_node *node = NULL; + struct large_object_node *node; if (possibly_interior) { + pthread_mutex_lock(&space->object_tree_lock); node = large_object_tree_lookup(&space->object_tree, addr); - if (node && !address_map_contains(&space->from_space, node->key.addr)) - node = NULL; + pthread_mutex_unlock(&space->object_tree_lock); } else { - uintptr_t node_bits = address_map_lookup(&space->from_space, addr, 0); - node = (struct large_object_node*) node_bits; + node = large_object_space_lookup(space, gc_ref(addr)); } - struct gc_ref ret = gc_ref_null(); - if (node) { - large_object_space_do_copy(space, node); - ret = gc_ref(node->key.addr); - } - pthread_mutex_unlock(&space->lock); - return ret; + if (node && node->value.is_live && + large_object_space_mark(space, gc_ref(node->key.addr))) + return gc_ref(node->key.addr); + + return gc_ref_null(); } static void* @@ -417,8 +430,9 @@ large_object_space_alloc(struct large_object_space *space, size_t npages) { large_object_space_add_to_freelist(space, tail_node); } - // Add the object to tospace. - address_map_add(&space->to_space, node->key.addr, (uintptr_t)node); + // Add the object to the nursery. 
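/* A sketch of the generational flow the nursery enables, assuming a
 * generational build, the declarations of this header, and a caller
 * that already has `space` and `npages` in hand: a fresh large object
 * is recorded in the nursery; a minor collection sweeps only the
 * nursery and then clears it, so marked survivors live on in object_map
 * alone until the next major collection. */
static void example_lospace_minor_cycle(struct large_object_space *space,
                                        size_t npages) {
  void *obj = large_object_space_alloc(space, npages);   /* recorded in the nursery */
  if (!obj) return;                                       /* quarantine had nothing suitable */
  large_object_space_start_gc(space, /*is_minor_gc=*/1);
  large_object_space_mark(space, gc_ref((uintptr_t)obj)); /* survives this cycle */
  large_object_space_finish_gc(space, /*is_minor_gc=*/1); /* nursery swept, then cleared */
}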
+ if (GC_GENERATIONAL) + address_map_add(&space->nursery, node->key.addr, (uintptr_t)node); space->free_pages -= npages; ret = (void*)node->key.addr; @@ -441,15 +455,16 @@ large_object_space_obtain_and_alloc(struct large_object_space *space, struct large_object k = { addr, bytes }; struct large_object_data v = {0,}; v.is_live = 1; - v.live.is_survivor = 0; + v.live.mark = 0; pthread_mutex_lock(&space->lock); + pthread_mutex_lock(&space->object_tree_lock); struct large_object_node *node = large_object_tree_insert(&space->object_tree, k, v); uintptr_t node_bits = (uintptr_t)node; address_map_add(&space->object_map, addr, node_bits); - address_map_add(&space->to_space, addr, node_bits); space->total_pages += npages; + pthread_mutex_unlock(&space->object_tree_lock); pthread_mutex_unlock(&space->lock); return ret; @@ -461,18 +476,17 @@ large_object_space_init(struct large_object_space *space, struct gc_background_thread *thread) { memset(space, 0, sizeof(*space)); pthread_mutex_init(&space->lock, NULL); + pthread_mutex_init(&space->object_tree_lock, NULL); + pthread_mutex_init(&space->remembered_edges_lock, NULL); space->page_size = getpagesize(); space->page_size_log2 = __builtin_ctz(space->page_size); large_object_tree_init(&space->object_tree); address_map_init(&space->object_map); - + address_map_init(&space->nursery); large_object_freelist_init(&space->quarantine); - address_map_init(&space->from_space); - address_map_init(&space->to_space); - address_map_init(&space->survivor_space); address_set_init(&space->remembered_edges); if (thread) diff --git a/src/mmc.c b/src/mmc.c index 445bda8ec..266c19c41 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -135,8 +135,7 @@ do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, return nofl_space_evacuate_or_mark_object(heap_nofl_space(heap), edge, ref, &data->allocator); else if (large_object_space_contains(heap_large_object_space(heap), ref)) - return large_object_space_mark_object(heap_large_object_space(heap), - ref); + return large_object_space_mark(heap_large_object_space(heap), ref); else return gc_extern_space_visit(heap_extern_space(heap), edge, ref); } @@ -170,8 +169,8 @@ gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { return nofl_space_forward_or_mark_if_traced(nofl_space, edge, ref); struct large_object_space *lospace = heap_large_object_space(heap); - if (large_object_space_contains(lospace, ref)) - return large_object_space_is_copied(lospace, ref); + if (large_object_space_contains_with_lock(lospace, ref)) + return large_object_space_is_marked(lospace, ref); GC_CRASH(); } diff --git a/src/pcc.c b/src/pcc.c index eb2d1ed18..dd91a0317 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -382,7 +382,7 @@ static inline int do_minor_trace(struct gc_heap *heap, struct gc_edge edge, // Note that although the target of the edge might not be in lospace, this // will do what we want and return 1 if and only if ref is was a young // object in lospace. - return large_object_space_copy(heap_large_object_space(heap), ref); + return large_object_space_mark(heap_large_object_space(heap), ref); } } @@ -411,8 +411,8 @@ static inline int do_trace(struct gc_heap *heap, struct gc_edge edge, } // Fall through for objects in large or extern spaces. 
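/* The switch below from large_object_space_contains to the _with_lock
 * variant leans on the lospace change earlier in this series: during a
 * collection the object map is not mutated, so tracer threads can test
 * membership without serializing on the space lock.  A sketch of the
 * pattern, with a helper name of our own choosing: */
static inline int example_trace_large_object(struct gc_heap *heap,
                                             struct gc_ref ref) {
  struct large_object_space *lospace = heap_large_object_space(heap);
  if (large_object_space_contains_with_lock(lospace, ref))
    return large_object_space_mark(lospace, ref);
  return 0;
}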
- if (large_object_space_contains(heap_large_object_space(heap), ref)) - return large_object_space_mark_object(heap_large_object_space(heap), ref); + if (large_object_space_contains_with_lock(heap_large_object_space(heap), ref)) + return large_object_space_mark(heap_large_object_space(heap), ref); else return gc_extern_space_visit(heap_extern_space(heap), edge, ref); } @@ -451,8 +451,8 @@ int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { return copy_space_forward_if_traced(heap_mono_space(heap), edge, ref); } - if (large_object_space_contains(heap_large_object_space(heap), ref)) - return large_object_space_is_copied(heap_large_object_space(heap), ref); + if (large_object_space_contains_with_lock(heap_large_object_space(heap), ref)) + return large_object_space_is_marked(heap_large_object_space(heap), ref); GC_CRASH(); } diff --git a/src/semi.c b/src/semi.c index c16cecabd..256295c1d 100644 --- a/src/semi.c +++ b/src/semi.c @@ -207,7 +207,7 @@ static void visit_semi_space(struct gc_heap *heap, struct semi_space *space, static void visit_large_object_space(struct gc_heap *heap, struct large_object_space *space, struct gc_ref ref) { - if (large_object_space_copy(space, ref)) { + if (large_object_space_mark(space, ref)) { if (GC_UNLIKELY(heap->check_pending_ephemerons)) gc_resolve_pending_ephemerons(ref, heap); @@ -245,7 +245,8 @@ static void visit(struct gc_edge edge, struct gc_heap *heap) { return; if (semi_space_contains(heap_semi_space(heap), ref)) visit_semi_space(heap, heap_semi_space(heap), edge, ref); - else if (large_object_space_contains(heap_large_object_space(heap), ref)) + else if (large_object_space_contains_with_lock(heap_large_object_space(heap), + ref)) visit_large_object_space(heap, heap_large_object_space(heap), ref); else visit_external_object(heap, heap->extern_space, edge, ref); @@ -268,8 +269,8 @@ int gc_visit_ephemeron_key(struct gc_edge edge, struct gc_heap *heap) { return 0; gc_edge_update(edge, gc_ref(forwarded)); return 1; - } else if (large_object_space_contains(heap_large_object_space(heap), ref)) { - return large_object_space_is_copied(heap_large_object_space(heap), ref); + } else if (large_object_space_contains_with_lock(heap_large_object_space(heap), ref)) { + return large_object_space_is_marked(heap_large_object_space(heap), ref); } GC_CRASH(); } From 2c72034a1cfab60823d240aaf287bd94e9da5375 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 23 Jan 2025 15:19:09 +0100 Subject: [PATCH 366/403] Fix bug in mmc for new lospace locking discipline --- src/mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mmc.c b/src/mmc.c index 266c19c41..7394fdc18 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -134,7 +134,7 @@ do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref, if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) return nofl_space_evacuate_or_mark_object(heap_nofl_space(heap), edge, ref, &data->allocator); - else if (large_object_space_contains(heap_large_object_space(heap), ref)) + else if (large_object_space_contains_with_lock(heap_large_object_space(heap), ref)) return large_object_space_mark(heap_large_object_space(heap), ref); else return gc_extern_space_visit(heap_extern_space(heap), edge, ref); From 7a5c994613363befa8fea5c541e9f6992afd4954 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 23 Jan 2025 17:12:34 +0100 Subject: [PATCH 367/403] lospace: Add missing lock in allocation path --- src/large-object-space.h | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/src/large-object-space.h b/src/large-object-space.h index 897a7b9e6..43f2936eb 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -425,8 +425,10 @@ large_object_space_alloc(struct large_object_space *space, size_t npages) { struct large_object tail = {node->key.addr + size, node->key.size - size}; struct large_object_data tail_value = {0,}; node->key.size = size; + pthread_mutex_lock(&space->object_tree_lock); struct large_object_node *tail_node = large_object_tree_insert(&space->object_tree, tail, tail_value); + pthread_mutex_unlock(&space->object_tree_lock); large_object_space_add_to_freelist(space, tail_node); } From 70498714845727487eaba6b28a92b4d97e4f780c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 23 Jan 2025 17:22:29 +0100 Subject: [PATCH 368/403] lospace: Fix bug when splitting freelist entries --- src/large-object-space.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/large-object-space.h b/src/large-object-space.h index 43f2936eb..38b7e51a4 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -429,6 +429,9 @@ large_object_space_alloc(struct large_object_space *space, size_t npages) { struct large_object_node *tail_node = large_object_tree_insert(&space->object_tree, tail, tail_value); pthread_mutex_unlock(&space->object_tree_lock); + uintptr_t tail_node_bits = (uintptr_t)tail_node; + address_map_add(&space->object_map, tail_node->key.addr, + tail_node_bits); large_object_space_add_to_freelist(space, tail_node); } From 68e3a692f58b5ceb34dcd4296d12e9aa3298ce34 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 23 Jan 2025 19:24:57 +0100 Subject: [PATCH 369/403] Fix bug with lospace in generational configurations --- src/large-object-space.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/large-object-space.h b/src/large-object-space.h index 38b7e51a4..a7c762056 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -298,6 +298,8 @@ large_object_space_remove_from_freelist(struct large_object_space *space, if (dead->next) dead->next->value.dead.prev = dead->prev; *dead->prev = dead->next; + dead->prev = NULL; + dead->next = NULL; } static void @@ -305,7 +307,7 @@ large_object_space_sweep_one(uintptr_t addr, uintptr_t node_bits, void *data) { struct large_object_space *space = data; struct large_object_node *node = (struct large_object_node*) node_bits; - if (!GC_GENERATIONAL && !node->value.is_live) + if (!node->value.is_live) return; GC_ASSERT(node->value.is_live); uint8_t mark = atomic_load_explicit(large_object_node_mark_loc(node), From b517464d7f1301710e3249129b2b67b301b9db28 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 24 Jan 2025 16:11:11 +0100 Subject: [PATCH 370/403] copy-space: refactor to copy_space_can_allocate --- src/copy-space.h | 10 +++++----- src/pcc.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/copy-space.h b/src/copy-space.h index d32de0298..19b00e5fa 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -567,16 +567,16 @@ copy_space_finish_gc(struct copy_space *space, int is_minor_gc) { space->in_gc = 0; } -static int +static size_t copy_space_can_allocate(struct copy_space *space, size_t bytes) { // With lock! 
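/* A worked illustration of the new contract, using a hypothetical
 * helper and an assumed 64 kB region size chosen only to make the
 * numbers concrete: the caller asks for `bytes` and gets back how many
 * bytes of empty regions are actually on hand, so heap_can_minor_gc
 * below can require the full nursery size rather than a yes/no answer. */
static size_t example_can_allocate(size_t empty_regions, size_t bytes) {
  const size_t region_size = 64 * 1024;   /* stand-in for COPY_SPACE_REGION_SIZE */
  size_t count = 0;
  for (size_t i = 0; i < empty_regions && count < bytes; i++)
    count += region_size;
  /* e.g. 100 empty regions against an 8 MiB request yields 6.25 MiB,
     so a minor collection would not be attempted. */
  return count;
}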
+ size_t count = 0; for (struct copy_space_block *empties = space->empty.list.head; - empties; + empties && count < bytes; empties = empties->next) { - if (bytes <= COPY_SPACE_REGION_SIZE) return 1; - bytes -= COPY_SPACE_REGION_SIZE; + count += COPY_SPACE_REGION_SIZE; } - return 0; + return count; } static void diff --git a/src/pcc.c b/src/pcc.c index dd91a0317..1abf70bf6 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -813,7 +813,7 @@ heap_can_minor_gc(struct gc_heap *heap) { struct copy_space *new_space = heap_new_space(heap); struct copy_space *old_space = heap_old_space(heap); size_t nursery_size = heap_nursery_size(heap); - return copy_space_can_allocate(old_space, nursery_size); + return copy_space_can_allocate(old_space, nursery_size) >= nursery_size; } static enum gc_collection_kind From cca54736a01bff8e031c6b6e1fa0a425d2fe9322 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 10 Feb 2025 12:45:25 +0100 Subject: [PATCH 371/403] Add build support for tracepoints via lttng-ust --- Makefile | 14 ++++++-- api/gc-lttng.h | 83 +++++++++++++++++++++++++++++++++++++++++++++ api/gc-tracepoint.h | 17 ++++++++++ embed.mk | 10 ++++-- manifest.scm | 1 + src/gc-tracepoint.c | 6 ++++ 6 files changed, 126 insertions(+), 5 deletions(-) create mode 100644 api/gc-lttng.h create mode 100644 api/gc-tracepoint.h create mode 100644 src/gc-tracepoint.c diff --git a/Makefile b/Makefile index 8de7346b9..c92cda4d5 100644 --- a/Makefile +++ b/Makefile @@ -30,10 +30,16 @@ BUILD_CFLAGS_debug = -O0 -g -DGC_DEBUG=1 BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) +USE_LTTNG := $(shell pkg-config --exists lttng-ust && echo 1) +LTTNG_CPPFLAGS := $(if $(USE_LTTNG), $(shell pkg-config --cflags lttng-ust),) +LTTNG_LIBS := $(if $(USE_LTTNG), $(shell pkg-config --libs lttng-ust),) +TRACEPOINT_CPPFLAGS = $(if $(USE_LTTNG),$(LTTNG_CPPFLAGS) -DGC_TRACEPOINT_LTTNG=1,) +TRACEPOINT_LIBS = $(LTTNG_LIBS) + CC = gcc CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) -CPPFLAGS = -Iapi -LDFLAGS = -lpthread -flto=auto +CPPFLAGS = -Iapi $(TRACEPOINT_CPPFLAGS) +LDFLAGS = -lpthread -flto=auto $(TRACEPOINT_LIBS) DEPFLAGS = -MMD -MP -MF $(@:obj/%.o=.deps/%.d) COMPILE = $(CC) $(CFLAGS) $(CPPFLAGS) $(DEPFLAGS) -o $@ LINK = $(CC) $(LDFLAGS) -o $@ @@ -52,6 +58,8 @@ obj/gc-stack.o: src/gc-stack.c | .deps obj $(COMPILE) -c $< obj/gc-options.o: src/gc-options.c | .deps obj $(COMPILE) -c $< +obj/gc-tracepoint.o: src/gc-tracepoint.c | .deps obj + $(COMPILE) -c $< obj/%.gc-ephemeron.o: src/gc-ephemeron.c | .deps obj $(COMPILE) -include benchmarks/$*-embedder.h -c $< obj/%.gc-finalizer.o: src/gc-finalizer.c | .deps obj @@ -110,7 +118,7 @@ obj/$(1).$(2).gc.o: src/$(call gc_impl,$(2)) | .deps obj $$(COMPILE) $(call gc_cflags,$(2)) $(call gc_impl_cflags,$(2)) -include benchmarks/$(1)-embedder.h -c $$< obj/$(1).$(2).o: benchmarks/$(1).c | .deps obj $$(COMPILE) $(call gc_cflags,$(2)) -include api/$(call gc_attrs,$(2)) -c $$< -bin/$(1).$(2): obj/$(1).$(2).gc.o obj/$(1).$(2).o obj/gc-stack.o obj/gc-options.o obj/gc-platform.o obj/$(1).gc-ephemeron.o obj/$(1).gc-finalizer.o | bin +bin/$(1).$(2): obj/$(1).$(2).gc.o obj/$(1).$(2).o obj/gc-stack.o obj/gc-options.o obj/gc-platform.o obj/gc-tracepoint.o obj/$(1).gc-ephemeron.o obj/$(1).gc-finalizer.o | bin $$(LINK) $$^ $(call gc_libs,$(2)) endef diff --git a/api/gc-lttng.h b/api/gc-lttng.h new file mode 100644 index 000000000..630e7c543 --- /dev/null +++ b/api/gc-lttng.h @@ -0,0 +1,83 @@ +#define LTTNG_UST_TRACEPOINT_PROVIDER whippet + +#undef 
LTTNG_UST_TRACEPOINT_INCLUDE +#define LTTNG_UST_TRACEPOINT_INCLUDE "gc-lttng.h" + +#if !defined(_TP_H) || defined(LTTNG_UST_TRACEPOINT_HEADER_MULTI_READ) +#define _TP_H + +#include + +LTTNG_UST_TRACEPOINT_ENUM( + whippet, gc_kind, + LTTNG_UST_TP_ENUM_VALUES + (lttng_ust_field_enum_value("MINOR", 1) + lttng_ust_field_enum_value("MAJOR", 2) + lttng_ust_field_enum_value("COMPACTING", 3))) + +LTTNG_UST_TRACEPOINT_EVENT_CLASS( + whippet, tracepoint, + LTTNG_UST_TP_ARGS(), + LTTNG_UST_TP_FIELDS()) + +LTTNG_UST_TRACEPOINT_EVENT_CLASS( + whippet, size_tracepoint, + LTTNG_UST_TP_ARGS(size_t, size), + LTTNG_UST_TP_FIELDS(lttng_ust_field_integer(size_t, size, size))) + + +/* The tracepoint instances */ +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, size_tracepoint, whippet, init, + LTTNG_UST_TP_ARGS(size_t, size)) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, size_tracepoint, whippet, heap_resized, + LTTNG_UST_TP_ARGS(size_t, size)) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, size_tracepoint, whippet, live_data_size, + LTTNG_UST_TP_ARGS(size_t, size)) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, requesting_stop, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, waiting_for_stop, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, mutators_stopped, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT( + whippet, prepare_gc, + LTTNG_UST_TP_ARGS(int, gc_kind), + LTTNG_UST_TP_FIELDS( + lttng_ust_field_enum(whippet, gc_kind, int, gc_kind, gc_kind))) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, roots_traced, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, heap_traced, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, ephemerons_traced, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, finalizers_traced, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, restarting_mutators, LTTNG_UST_TP_ARGS()) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, mutator_added, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, mutator_cause_gc, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, mutator_stopping, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, mutator_stopped, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, mutator_restarted, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, mutator_removed, LTTNG_UST_TP_ARGS()) + +/* + * Use LTTNG_UST_TRACEPOINT_EVENT(), LTTNG_UST_TRACEPOINT_EVENT_CLASS(), + * LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(), and + * LTTNG_UST_TRACEPOINT_LOGLEVEL() here. + */ + +#endif /* _TP_H */ + +#include diff --git a/api/gc-tracepoint.h b/api/gc-tracepoint.h new file mode 100644 index 000000000..598d0bc44 --- /dev/null +++ b/api/gc-tracepoint.h @@ -0,0 +1,17 @@ +#ifndef GC_TRACEPOINT_H +#define GC_TRACEPOINT_H + +#ifdef GC_TRACEPOINT_LTTNG + +#include "gc-lttng.h" + +#define GC_TRACEPOINT(...) \ + lttng_ust_tracepoint(whippet, __VA_ARGS__) + +#else // GC_TRACEPOINT_LTTNG + +#define GC_TRACEPOINT(...) 
do {} while (0) + +#endif // GC_TRACEPOINT_LTTNG + +#endif // GC_TRACEPOINT_H diff --git a/embed.mk b/embed.mk index a98f7df48..4612a0bb1 100644 --- a/embed.mk +++ b/embed.mk @@ -12,11 +12,17 @@ V ?= 1 v_0 = @ v_1 = +GC_USE_LTTNG := $(shell pkg-config --exists lttng-ust && echo 1) +GC_LTTNG_CPPFLAGS := $(if $(GC_USE_LTTNG), $(shell pkg-config --cflags lttng-ust),) +GC_LTTNG_LIBS := $(if $(GC_USE_LTTNG), $(shell pkg-config --libs lttng-ust),) +GC_TRACEPOINT_CPPFLAGS = $(if $(GC_USE_LTTNG),$(GC_LTTNG_CPPFLAGS) -DGC_TRACEPOINT_LTTNG=1,) +GC_TRACEPOINT_LIBS = $(GC_LTTNG_LIBS) + GC_V = $(v_$(V)) GC_CC = gcc GC_CFLAGS = -Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(GC_BUILD_CFLAGS) -GC_CPPFLAGS = -I$(WHIPPET)api -GC_LDFLAGS = -lpthread -flto=auto +GC_CPPFLAGS = -I$(WHIPPET)api $(GC_TRACEPOINT_CPPFLAGS) +GC_LDFLAGS = -lpthread -flto=auto $(GC_TRACEPOINT_LIBS) GC_DEPFLAGS = GC_COMPILE = $(GC_V)$(GC_CC) $(GC_CFLAGS) $(GC_CPPFLAGS) $(GC_DEPFLAGS) -o $@ GC_LINK = $(GC_V)$(GC_CC) $(GC_LDFLAGS) -o $@ diff --git a/manifest.scm b/manifest.scm index fbb5d428c..ea35cf3d2 100644 --- a/manifest.scm +++ b/manifest.scm @@ -4,6 +4,7 @@ '("bash" "coreutils" "gcc-toolchain" + "lttng-ust" "glibc" "libgc" "make" diff --git a/src/gc-tracepoint.c b/src/gc-tracepoint.c new file mode 100644 index 000000000..aa8ebc4a1 --- /dev/null +++ b/src/gc-tracepoint.c @@ -0,0 +1,6 @@ +#include +#ifdef GC_TRACEPOINT_LTTNG +#define LTTNG_UST_TRACEPOINT_DEFINE +#define LTTNG_UST_TRACEPOINT_CREATE_PROBES +#include "gc-lttng.h" +#endif // GC_TRACEPOINT_LTTNG From 461efa98a08d399975c64f3a56f63d7d6de7d0bc Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 10 Feb 2025 13:41:19 +0100 Subject: [PATCH 372/403] Wire up tracepoints for event-listener interface --- src/bdw.c | 19 +++++++++++++------ src/mmc.c | 14 ++++++++++---- src/pcc.c | 14 ++++++++++---- src/semi.c | 14 ++++++++++---- 4 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/bdw.c b/src/bdw.c index 72b13012e..5f90057a7 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -5,6 +5,7 @@ #include "gc-api.h" #include "gc-ephemeron.h" +#include "gc-tracepoint.h" #define GC_IMPL 1 #include "gc-internal.h" @@ -70,11 +71,16 @@ struct gc_mutator { }; struct gc_heap *__the_bdw_gc_heap; -#define HEAP_EVENT(event, ...) \ - __the_bdw_gc_heap->event_listener.event(__the_bdw_gc_heap->event_listener_data, ##__VA_ARGS__) -#define MUTATOR_EVENT(mut, event, ...) \ - __the_bdw_gc_heap->event_listener.event(mut->event_listener_data, ##__VA_ARGS__) - +#define HEAP_EVENT(event, ...) do { \ + __the_bdw_gc_heap->event_listener.event(__the_bdw_gc_heap->event_listener_data, \ + ##__VA_ARGS__); \ + GC_TRACEPOINT(event, ##__VA_ARGS__); \ + } while (0) +#define MUTATOR_EVENT(mut, event, ...) 
do { \ + __the_bdw_gc_heap->event_listener.event(mut->event_listener_data, \ + ##__VA_ARGS__); \ + GC_TRACEPOINT(event, ##__VA_ARGS__); \ + } while (0) static inline size_t gc_inline_bytes_to_freelist_index(size_t bytes) { return (bytes - 1U) / GC_INLINE_GRANULE_BYTES; } @@ -386,7 +392,8 @@ static inline struct gc_mutator *add_mutator(struct gc_heap *heap) { struct gc_mutator *ret = GC_generic_malloc(sizeof(struct gc_mutator), mutator_gc_kind); ret->heap = heap; - ret->event_listener_data = HEAP_EVENT(mutator_added); + ret->event_listener_data = + heap->event_listener.mutator_added(heap->event_listener_data); pthread_mutex_lock(&heap->lock); ret->next = heap->mutators; diff --git a/src/mmc.c b/src/mmc.c index 7394fdc18..db7e1f512 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -17,6 +17,7 @@ #include "gc-platform.h" #include "gc-stack.h" #include "gc-trace.h" +#include "gc-tracepoint.h" #include "heap-sizer.h" #include "large-object-space.h" #include "nofl-space.h" @@ -67,10 +68,15 @@ struct gc_heap { void *event_listener_data; }; -#define HEAP_EVENT(heap, event, ...) \ - (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__) -#define MUTATOR_EVENT(mut, event, ...) \ - (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) +#define HEAP_EVENT(heap, event, ...) do { \ + (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__); \ + GC_TRACEPOINT(event, ##__VA_ARGS__); \ + } while (0) +#define MUTATOR_EVENT(mut, event, ...) do { \ + (mut)->heap->event_listener.event((mut)->event_listener_data, \ + ##__VA_ARGS__); \ + GC_TRACEPOINT(event, ##__VA_ARGS__); \ + } while (0) struct gc_mutator { struct nofl_allocator allocator; diff --git a/src/pcc.c b/src/pcc.c index 1abf70bf6..f3a94d22b 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -17,6 +17,7 @@ #include "gc-inline.h" #include "gc-platform.h" #include "gc-trace.h" +#include "gc-tracepoint.h" #include "heap-sizer.h" #include "large-object-space.h" #if GC_PARALLEL @@ -74,10 +75,15 @@ struct gc_heap { void *event_listener_data; }; -#define HEAP_EVENT(heap, event, ...) \ - (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__) -#define MUTATOR_EVENT(mut, event, ...) \ - (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) +#define HEAP_EVENT(heap, event, ...) do { \ + (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__); \ + GC_TRACEPOINT(event, ##__VA_ARGS__); \ + } while (0) +#define MUTATOR_EVENT(mut, event, ...) do { \ + (mut)->heap->event_listener.event((mut)->event_listener_data, \ + ##__VA_ARGS__); \ + GC_TRACEPOINT(event, ##__VA_ARGS__); \ + } while (0) struct gc_mutator { struct copy_space_allocator allocator; diff --git a/src/semi.c b/src/semi.c index 256295c1d..0d0c9ecca 100644 --- a/src/semi.c +++ b/src/semi.c @@ -9,6 +9,7 @@ #include "gc-internal.h" #include "gc-platform.h" +#include "gc-tracepoint.h" #include "heap-sizer.h" #include "semi-attrs.h" #include "large-object-space.h" @@ -59,10 +60,15 @@ struct gc_mutator { void *event_listener_data; }; -#define HEAP_EVENT(heap, event, ...) \ - (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__) -#define MUTATOR_EVENT(mut, event, ...) \ - (mut)->heap->event_listener.event((mut)->event_listener_data, ##__VA_ARGS__) +#define HEAP_EVENT(heap, event, ...) do { \ + (heap)->event_listener.event((heap)->event_listener_data, ##__VA_ARGS__); \ + GC_TRACEPOINT(event, ##__VA_ARGS__); \ + } while (0) +#define MUTATOR_EVENT(mut, event, ...) 
do { \ + (mut)->heap->event_listener.event((mut)->event_listener_data, \ + ##__VA_ARGS__); \ + GC_TRACEPOINT(event, ##__VA_ARGS__); \ + } while (0) static inline void clear_memory(uintptr_t addr, size_t size) { memset((char*)addr, 0, size); From d675a9b8f10ccfe49a0633475c6a72bd1876ad1f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 11 Feb 2025 16:14:27 +0100 Subject: [PATCH 373/403] Add tracepoints to tracer itself Also fix an issue whereby the main thread would spin, waiting for other active threads to finish, doing no work itself. --- api/gc-lttng.h | 29 +++++++++++++++--- src/parallel-tracer.h | 71 +++++++++++++++++++++++++++---------------- 2 files changed, 68 insertions(+), 32 deletions(-) diff --git a/api/gc-lttng.h b/api/gc-lttng.h index 630e7c543..9df639b7d 100644 --- a/api/gc-lttng.h +++ b/api/gc-lttng.h @@ -72,11 +72,30 @@ LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( whippet, tracepoint, whippet, mutator_removed, LTTNG_UST_TP_ARGS()) -/* - * Use LTTNG_UST_TRACEPOINT_EVENT(), LTTNG_UST_TRACEPOINT_EVENT_CLASS(), - * LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(), and - * LTTNG_UST_TRACEPOINT_LOGLEVEL() here. - */ +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_unpark_all, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_share, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_check_termination_begin, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_check_termination_end, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_steal_begin, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_steal_end, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_roots_begin, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_roots_end, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_objects_begin, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_objects_end, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_worker_begin, LTTNG_UST_TP_ARGS()) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( + whippet, tracepoint, whippet, trace_worker_end, LTTNG_UST_TP_ARGS()) #endif /* _TP_H */ diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 8115c369d..e368c36d6 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -9,6 +9,7 @@ #include "assert.h" #include "debug.h" #include "gc-inline.h" +#include "gc-tracepoint.h" #include "local-worklist.h" #include "root-worklist.h" #include "shared-worklist.h" @@ -157,6 +158,7 @@ tracer_unpark_all_workers(struct gc_tracer *tracer) { long epoch = old_epoch + 1; DEBUG("starting trace; %zu workers; epoch=%ld\n", tracer->worker_count, epoch); + GC_TRACEPOINT(trace_unpark_all); pthread_cond_broadcast(&tracer->cond); } @@ -171,6 +173,7 @@ tracer_maybe_unpark_workers(struct gc_tracer *tracer) { static inline void tracer_share(struct gc_trace_worker *worker) { DEBUG("tracer #%zu: sharing\n", worker->id); + GC_TRACEPOINT(trace_share); size_t to_share = LOCAL_WORKLIST_SHARE_AMOUNT; while (to_share) { struct gc_ref *objv; @@ -235,40 +238,45 @@ trace_worker_can_steal_from_any(struct gc_trace_worker *worker, return 0; } -static int 
-trace_worker_should_continue(struct gc_trace_worker *worker) { +static size_t +trace_worker_should_continue(struct gc_trace_worker *worker, size_t spin_count) { // Helper workers should park themselves immediately if they have no work. if (worker->id != 0) return 0; struct gc_tracer *tracer = worker->tracer; - for (size_t spin_count = 0;; spin_count++) { - if (atomic_load_explicit(&tracer->active_tracers, - memory_order_acquire) == 1) { - // All trace workers have exited except us, the main worker. We are - // probably done, but we need to synchronize to be sure that there is no - // work pending, for example if a worker had a spurious wakeup. Skip - // worker 0 (the main worker). - size_t locked = 1; - while (locked < tracer->worker_count) { - if (pthread_mutex_trylock(&tracer->workers[locked].lock) == 0) - locked++; - else - break; - } - int done = (locked == tracer->worker_count) && - !trace_worker_can_steal_from_any(worker, tracer); - if (done) - return 0; - while (locked > 1) - pthread_mutex_unlock(&tracer->workers[--locked].lock); - return 1; - } - // spin - LOG("checking for termination: spinning #%zu\n", spin_count); + if (atomic_load_explicit(&tracer->active_tracers, memory_order_acquire) != 1) { + LOG("checking for termination: tracers active, spinning #%zu\n", spin_count); yield_for_spin(spin_count); + return 1; } + + // All trace workers have exited except us, the main worker. We are + // probably done, but we need to synchronize to be sure that there is no + // work pending, for example if a worker had a spurious wakeup. Skip + // worker 0 (the main worker). + + GC_TRACEPOINT(trace_check_termination_begin); + size_t locked = 1; + while (locked < tracer->worker_count) { + if (pthread_mutex_trylock(&tracer->workers[locked].lock) == 0) + locked++; + else + break; + } + int done = (locked == tracer->worker_count) && + !trace_worker_can_steal_from_any(worker, tracer); + GC_TRACEPOINT(trace_check_termination_end); + + if (done) + return 0; + while (locked > 1) + pthread_mutex_unlock(&tracer->workers[--locked].lock); + + LOG("checking for termination: failed to lock, spinning #%zu\n", spin_count); + yield_for_spin(spin_count); + return 1; } static struct gc_ref @@ -285,8 +293,10 @@ trace_worker_steal(struct gc_trace_worker *worker) { return obj; } + GC_TRACEPOINT(trace_steal_begin); LOG("tracer #%zu: trying to steal\n", worker->id); struct gc_ref obj = trace_worker_steal_from_any(worker, tracer); + GC_TRACEPOINT(trace_steal_end); if (!gc_ref_is_null(obj)) return obj; @@ -329,7 +339,9 @@ trace_with_data(struct gc_tracer *tracer, } } else { DEBUG("tracer #%zu: tracing objects\n", worker->id); + GC_TRACEPOINT(trace_objects_begin); size_t n = 0; + size_t spin_count = 0; do { while (1) { struct gc_ref ref; @@ -343,7 +355,8 @@ trace_with_data(struct gc_tracer *tracer, trace_one(ref, heap, worker); n++; } - } while (trace_worker_should_continue(worker)); + } while (trace_worker_should_continue(worker, spin_count++)); + GC_TRACEPOINT(trace_objects_end); DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); } @@ -354,8 +367,10 @@ trace_with_data(struct gc_tracer *tracer, static void trace_worker_trace(struct gc_trace_worker *worker) { + GC_TRACEPOINT(trace_worker_begin); gc_trace_worker_call_with_data(trace_with_data, worker->tracer, worker->heap, worker); + GC_TRACEPOINT(trace_worker_end); } static inline int @@ -406,9 +421,11 @@ static inline void gc_tracer_trace_roots(struct gc_tracer *tracer) { DEBUG("starting roots-only trace\n"); + GC_TRACEPOINT(trace_roots_begin); 
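/* The begin/end pairs added in this patch (trace_roots_begin/_end,
 * trace_objects_begin/_end, trace_worker_begin/_end, and so on) are
 * meant to be folded into spans by a trace viewer; the ctf_to_json.py
 * script added later in this series pairs them up under names like
 * "trace-roots".  The emission pattern is simply: */
GC_TRACEPOINT(trace_roots_begin);
/* ... the work being measured ... */
GC_TRACEPOINT(trace_roots_end);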
tracer->trace_roots_only = 1; gc_tracer_trace(tracer); tracer->trace_roots_only = 0; + GC_TRACEPOINT(trace_roots_end); GC_ASSERT_EQ(atomic_load(&tracer->active_tracers), 0); DEBUG("roots-only trace finished\n"); From 2f0c0f8f8e0d7b20f752fdb9a211257a2d7d1bcf Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 11 Feb 2025 21:42:13 +0100 Subject: [PATCH 374/403] Refactor lttng makefile bits --- Makefile | 10 ++++++---- embed.mk | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index c92cda4d5..c5c91ae3e 100644 --- a/Makefile +++ b/Makefile @@ -30,10 +30,12 @@ BUILD_CFLAGS_debug = -O0 -g -DGC_DEBUG=1 BUILD_CFLAGS = $(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) -USE_LTTNG := $(shell pkg-config --exists lttng-ust && echo 1) -LTTNG_CPPFLAGS := $(if $(USE_LTTNG), $(shell pkg-config --cflags lttng-ust),) -LTTNG_LIBS := $(if $(USE_LTTNG), $(shell pkg-config --libs lttng-ust),) -TRACEPOINT_CPPFLAGS = $(if $(USE_LTTNG),$(LTTNG_CPPFLAGS) -DGC_TRACEPOINT_LTTNG=1,) +USE_LTTNG_0 := +USE_LTTNG_1 := 1 +USE_LTTNG := $(shell pkg-config --exists lttng-ust && echo 1 || echo 0) +LTTNG_CPPFLAGS := $(if $(USE_LTTNG_$(USE_LTTNG)), $(shell pkg-config --cflags lttng-ust),) +LTTNG_LIBS := $(if $(USE_LTTNG_$(USE_LTTNG)), $(shell pkg-config --libs lttng-ust),) +TRACEPOINT_CPPFLAGS = $(if $(USE_LTTNG_$(USE_LTTNG)),$(LTTNG_CPPFLAGS) -DGC_TRACEPOINT_LTTNG=1,) TRACEPOINT_LIBS = $(LTTNG_LIBS) CC = gcc diff --git a/embed.mk b/embed.mk index 4612a0bb1..56e9b026d 100644 --- a/embed.mk +++ b/embed.mk @@ -12,10 +12,12 @@ V ?= 1 v_0 = @ v_1 = -GC_USE_LTTNG := $(shell pkg-config --exists lttng-ust && echo 1) -GC_LTTNG_CPPFLAGS := $(if $(GC_USE_LTTNG), $(shell pkg-config --cflags lttng-ust),) -GC_LTTNG_LIBS := $(if $(GC_USE_LTTNG), $(shell pkg-config --libs lttng-ust),) -GC_TRACEPOINT_CPPFLAGS = $(if $(GC_USE_LTTNG),$(GC_LTTNG_CPPFLAGS) -DGC_TRACEPOINT_LTTNG=1,) +GC_USE_LTTNG_0 := +GC_USE_LTTNG_1 := 1 +GC_USE_LTTNG := $(shell pkg-config --exists lttng-ust && echo 1 || echo 0) +GC_LTTNG_CPPFLAGS := $(if $(GC_USE_LTTNG_$(GC_USE_LTTNG)), $(shell pkg-config --cflags lttng-ust),) +GC_LTTNG_LIBS := $(if $(GC_USE_LTTNG_$(GC_USE_LTTNG)), $(shell pkg-config --libs lttng-ust),) +GC_TRACEPOINT_CPPFLAGS = $(if $(GC_USE_LTTNG_$(GC_USE_LTTNG)),$(GC_LTTNG_CPPFLAGS) -DGC_TRACEPOINT_LTTNG=1,) GC_TRACEPOINT_LIBS = $(GC_LTTNG_LIBS) GC_V = $(v_$(V)) From c0dd2e58d16a96fdf7d46b555e8273d9c6028bf5 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 11 Feb 2025 21:51:46 +0100 Subject: [PATCH 375/403] Fix embed.mk for tracepoints --- embed.mk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/embed.mk b/embed.mk index 56e9b026d..0d2de0df5 100644 --- a/embed.mk +++ b/embed.mk @@ -37,6 +37,8 @@ $(GC_OBJDIR)gc-stack.o: $(WHIPPET)src/gc-stack.c $(GC_COMPILE) -c $< $(GC_OBJDIR)gc-options.o: $(WHIPPET)src/gc-options.c $(GC_COMPILE) -c $< +$(GC_OBJDIR)gc-tracepoint.o: $(WHIPPET)src/gc-tracepoint.c + $(GC_COMPILE) -c $< $(GC_OBJDIR)gc-ephemeron.o: $(WHIPPET)src/gc-ephemeron.c $(GC_COMPILE) $(EMBEDDER_TO_GC_CFLAGS) -c $< $(GC_OBJDIR)gc-finalizer.o: $(WHIPPET)src/gc-finalizer.c @@ -100,4 +102,4 @@ GC_LIBS = $(call gc_libs,$(GC_COLLECTOR)) $(GC_OBJDIR)gc-impl.o: $(WHIPPET)src/$(call gc_impl,$(GC_COLLECTOR)) $(GC_COMPILE) $(GC_IMPL_CFLAGS) $(EMBEDDER_TO_GC_CFLAGS) -c $< -GC_OBJS=$(foreach O,gc-platform.o gc-stack.o gc-options.o gc-ephemeron.o gc-finalizer.o gc-impl.o,$(GC_OBJDIR)$(O)) +GC_OBJS=$(foreach O,gc-platform.o gc-stack.o gc-options.o gc-tracepoint.o gc-ephemeron.o gc-finalizer.o 
gc-impl.o,$(GC_OBJDIR)$(O)) From 81da950ebe1456a416be85d127c9fee9e74cd915 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 12 Feb 2025 14:33:05 +0100 Subject: [PATCH 376/403] Steal becomes an instant event --- api/gc-lttng.h | 4 +--- src/parallel-tracer.h | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/api/gc-lttng.h b/api/gc-lttng.h index 9df639b7d..d192be4ed 100644 --- a/api/gc-lttng.h +++ b/api/gc-lttng.h @@ -81,9 +81,7 @@ LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( whippet, tracepoint, whippet, trace_check_termination_end, LTTNG_UST_TP_ARGS()) LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( - whippet, tracepoint, whippet, trace_steal_begin, LTTNG_UST_TP_ARGS()) -LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( - whippet, tracepoint, whippet, trace_steal_end, LTTNG_UST_TP_ARGS()) + whippet, tracepoint, whippet, trace_steal, LTTNG_UST_TP_ARGS()) LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( whippet, tracepoint, whippet, trace_roots_begin, LTTNG_UST_TP_ARGS()) LTTNG_UST_TRACEPOINT_EVENT_INSTANCE( diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index e368c36d6..0b80a6dde 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -293,10 +293,9 @@ trace_worker_steal(struct gc_trace_worker *worker) { return obj; } - GC_TRACEPOINT(trace_steal_begin); + GC_TRACEPOINT(trace_steal); LOG("tracer #%zu: trying to steal\n", worker->id); struct gc_ref obj = trace_worker_steal_from_any(worker, tracer); - GC_TRACEPOINT(trace_steal_end); if (!gc_ref_is_null(obj)) return obj; From 367e04f164a5908ab98c9a0c4cd1d8210a4cdae4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 14 Feb 2025 12:30:40 +0100 Subject: [PATCH 377/403] Add documentation on tracepoints Also clean up how-to-build documentation --- ctf_to_json.py | 160 +++++++++++++++++++++++++++ doc/manual.md | 226 ++++++++++---------------------------- doc/perfetto-minor-gc.png | Bin 0 -> 173475 bytes doc/tracepoints.md | 126 +++++++++++++++++++++ 4 files changed, 346 insertions(+), 166 deletions(-) create mode 100755 ctf_to_json.py create mode 100644 doc/perfetto-minor-gc.png create mode 100644 doc/tracepoints.md diff --git a/ctf_to_json.py b/ctf_to_json.py new file mode 100755 index 000000000..f6b7f429a --- /dev/null +++ b/ctf_to_json.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +# Any copyright is dedicated to the Public Domain. +# https://creativecommons.org/publicdomain/zero/1.0/ +# +# Originally written by Andy Wingo . + +import bt2 # From the babeltrace2 package. +import sys +import json +from enum import Enum + +# Usage: ./ctf_to_json.py ~/lttng-traces/name-of-your-trace > foo.json +# +# Convert a Common Trace Format (CTF) trace, for example as produced by +# LTTng, to the JSON-based Trace Event Format (TEF), for example as +# consumed by `chrome://tracing`, `https://ui.perfetto.dev/`, or +# `https://profiler.firefox.com`. + +# The Trace Event Format is documented here: +# +# https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0 + +# By default, events are emitted as EventPhase.INSTANT. We also support +# rewriting the event stream so as to generate EventPhase.BEGIN / +# EventPhase.END events for specific named events. 
+ +synthetic_events = { + 'gc': ['whippet:mutator_cause_gc', + 'whippet:restarting_mutators'], + 'stop-the-world': ['whippet:requesting_stop', + 'whippet:mutators_stopped'], + 'trace': ['whippet:prepare_gc', + 'whippet:restarting_mutators'], + 'mutator-stopped': ['whippet:mutator_stopping', + 'whippet:mutator_restarted'], + 'trace-roots': ['whippet:trace_roots_begin', + 'whippet:trace_roots_end'], + 'trace-check-termination': ['whippet:trace_check_termination_begin', + 'whippet:trace_check_termination_end'], + 'trace-objects': ['whippet:trace_objects_begin', + 'whippet:trace_objects_end'], + 'trace-worker': ['whippet:trace_worker_begin', + 'whippet:trace_worker_end'] +} + +class EventPhase(Enum): + BEGIN = 'B' + END = 'E' + COMPLETE = 'X' + INSTANT = 'i' + COUNTER = 'C' + NESTABLE_START = 'b' + NESTABLE_INSTANT = 'n' + NESTABLE_END = 'e' + FLOW_START = 's' + FLOW_STEP = 't' + FLOW_END = 'f' + SAMPLE = 'P' + OBJECT_CREATED = 'N' + OBJECT_SNAPSHOT = 'O' + OBJECT_DESTROYED = 'D' + METADATA = 'M' + MEMORY_DUMP_GLOBAL = 'V' + MEMORY_DUMP_PROCESS = 'V' + MARK = 'R' + CLOCK_SYNC = 'c' + CONTEXT_BEGIN = '(' + CONTEXT_END = ')' + +base_time = None +def event_us(msg): + assert(msg.default_clock_snapshot.clock_class.name == 'monotonic') + assert(msg.default_clock_snapshot.clock_class.frequency == 1e9) + global base_time + ns = msg.default_clock_snapshot.value + if base_time is None: + base_time = ns + return (ns - base_time) * 1e-3 + +def lower(x): + if isinstance(x, str) or isinstance(x, int) or isinstance(x, float): + return x + if isinstance(x, dict) or isinstance(x, bt2._StructureFieldConst): + return {lower(k):lower(v) for k, v in x.items()} + if isinstance(x, bt2._BoolValueConst) or isinstance(x, bt2._BoolFieldConst): + return bool(x) + if isinstance(x, bt2._EnumerationFieldConst): + return repr(x) + if isinstance(x, bt2._IntegerValueConst) or isinstance(x, bt2._IntegerFieldConst): + return int(x) + if isinstance(x, bt2._RealValueConst) or isinstance(x, bt2._RealFieldConst): + return float(x) + if isinstance(x, bt2._StringValueConst) or isinstance(x, bt2._StringFieldConst): + return str(x) + raise ValueError("Unexpected value from trace", x) + +# Specific Whippet events. +synthetic_begin = {} +synthetic_end = {} +for synthetic, [begin, end] in synthetic_events.items(): + synthetic_begin[begin] = [] + synthetic_end[end] = [] +for synthetic, [begin, end] in synthetic_events.items(): + synthetic_begin[begin].append(synthetic) + synthetic_end[end].append(synthetic) + +def put(str): + sys.stdout.write(str) + +need_comma = False +def print_event(ev): + global need_comma + if need_comma: + sys.stdout.write(',\n ') + else: + need_comma = True + # It appears to be faster to make a string, then print the string, + # than to call json.dump with a file object. 
+ # json.dump(ev, sys.stdout, ensure_ascii=False, check_circular=False) + put(json.dumps(ev, ensure_ascii=False, check_circular=False)) + +def emit_event(msg, name, phase): + ev = {'name': name, + 'cat': 'whippet', + 'ph': phase.value, + 'ts': event_us(msg), + 'pid': lower(msg.event.common_context_field['vpid']), + 'tid': lower(msg.event.common_context_field['vtid']), + 'args': lower(msg.event.payload_field)} + print_event(ev) +def emit_begin_event(msg, name): + emit_event(msg, name, EventPhase.BEGIN) +def emit_end_event(msg, name): + emit_event(msg, name, EventPhase.END) + +def emit_events(msg): + emit_event(msg, msg.event.name, EventPhase.INSTANT) + for begin in synthetic_begin.get(msg.event.name, []): + emit_begin_event(msg, begin) + for end in synthetic_end.get(msg.event.name, []): + emit_end_event(msg, end) + +def ctf_to_json(path): + msg_it = bt2.TraceCollectionMessageIterator(path) + put('{\n') + put(' "traceEvents": [\n ') + for msg in msg_it: + if hasattr(msg, 'event'): + emit_events(msg) + put('\n') + put('\n ],\n') + put(' "displayTimeUnit": "ns"\n') + put('}\n') + +if len(sys.argv) != 2: + sys.stderr.write( + 'usage: ' + sys.argv[0] + ' ~/lttng-traces/name-of-your-trace\n') + sys.exit(1) +else: + ctf_to_json(sys.argv[1]) diff --git a/doc/manual.md b/doc/manual.md index 1ddfcb556..a6742cbe5 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -176,13 +176,14 @@ implementations of that API: `semi`, a simple semi-space collector; collector; and `mmc`, a mostly-marking collector inspired by Immix. The program that embeds Whippet selects the collector implementation at -build-time. In the case of the `mmc` collector, the program -also configures a specific collector mode, again at build-time: -generational or not, parallel or not, stack-conservative or not, and -heap-conservative or not. It may be nice in the future to be able to -configure these at run-time, but for the time being they are -compile-time options so that adding new features doesn't change the -footprint of a more minimal collector. +build-time. For `pcc`, the program can also choose whether to be +generational or not. For `mmc` collector, the program configures a +specific collector mode, again at build-time: generational or not, +parallel or not, stack-conservative or not, and heap-conservative or +not. It may be nice in the future to be able to configure these at +run-time, but for the time being they are compile-time options so that +adding new features doesn't change the footprint of a more minimal +collector. Different collectors have different allocation strategies: for example, the BDW collector allocates from thread-local freelists, whereas the @@ -199,97 +200,58 @@ compiling user code. ### Compiling the collector -Building the collector is not as easy as it should be. As an embed-only -library, we don't get to choose the One True Build System and then just -build the software in that way; instead Whippet needs to be buildable -with any build system. At some point we will have snippets that -embedders can include in their various build systems, but for now we -document the low-level structure, so that people can craft the -appropriate incantations for their program's build system. +As an embed-only library, Whippet needs to be integrated into the build +system of its host (embedder). Currently the only supported build +system uses GNU make. We would be happy to add other systems over time. 
-Whippet consists of some collector-implementation-agnostic independent -modules, and then the collector implementation itself. Though Whippet -tries to put performance-sensitive interfaces in header files, users -should also compile with link-time optimization (LTO) to remove any -overhead imposed by the division of code into separate compilation -units. +At a high level, first the embedder chooses a collector and defines how +to specialize the collector against the embedder. Whippet's `embed.mk` +Makefile snippet then defines how to build the set of object files that +define the collector, and how to specialize the embedder against the +chosen collector. -Usually you want to build with maximum optimization and no debugging -assertions. Sometimes you want minimal optimization and all assertions. -Here's what we do, as a `Makefile` snippet: +As an example, say you have a file `program.c`, and you want to compile +it against a Whippet checkout in `whippet/`. Your headers are in +`include/`, and you have written an implementation of the embedder +interface in `host-gc.h`. In that case you would have a Makefile like +this: ``` -DEFAULT_BUILD=opt -BUILD_CFLAGS_opt=-O2 -g -DNDEBUG -BUILD_CFLAGS_optdebug=-Og -g -DGC_DEBUG=1 -BUILD_CFLAGS_debug=-O0 -g -DGC_DEBUG=1 -BUILD_CFLAGS=$(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) +HOST_DIR:=$(dir $(lastword $(MAKEFILE_LIST))) +WHIPPET_DIR=$(HOST_DIR)whippet/ + +all: out + +# The collector to choose: e.g. semi, bdw, pcc, generational-pcc, mmc, +# parallel-mmc, etc. +GC_COLLECTOR=pcc + +include $(WHIPPET_DIR)embed.mk + +# Host cflags go here... +HOST_CFLAGS= + +# Whippet's embed.mk uses this variable when it compiles code that +# should be specialized against the embedder. +EMBEDDER_TO_GC_CFLAGS=$(HOST_CFLAGS) -include $(HOST_DIR)host-gc.h + +program.o: program.c + $(GC_COMPILE) $(HOST_CFLAGS) $(GC_TO_EMBEDDER_CFLAGS) -c $< +program: program.o $(GC_OBJS) + $(GC_LINK) $^ $(GC_LIBS) ``` -So if you do just plain `make`, it will do an `opt` build. You can -specify the build mode by setting `BUILD` on the command line, as in -`make BUILD=debug`. +The optimization settings passed to the C compiler are taken from +`GC_BUILD_CFLAGS`. Embedders can override this variable directly, or +via the shorthand `GC_BUILD` variable. A `GC_BUILD` of `opt` indicates +maximum optimization and no debugging assertions; `optdebug` adds +debugging assertions; and `debug` removes optimizations. -Then for the actual compilation flags, we do: - -``` -CC=gcc -CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) -INCLUDES=-I. -LDFLAGS=-lpthread -flto -COMPILE=$(CC) $(CFLAGS) $(INCLUDES) -``` - -The actual include directory (the dot in `-I.`) should be adjusted as -appropriate. - -#### Collector-implementation-agnostic independent modules - -There are currently four generic modules that don't depend on the choice -of collector. The first is `gc-stack.o`, which has supporting code to -associate mutators (threads) with slices of the native stack, in order -to support conservative root-finding. - -``` -$(COMPILE) -o gc-stack.o -c gc-stack.c -``` - -The next is a generic options interface, to allow the user to -parameterize the collector at run-time, for example to implement a -specific heap sizing strategy. - -``` -$(COMPILE) -o gc-options.o -c gc-options.c -``` - -Next, where Whippet needs to get data from the operating system, for -example the number of processors available, it does so behind an -abstract interface that is selected at compile-time. 
The only -implementation currently is for GNU/Linux, but it's a pretty thin layer, -so adding more systems should not be difficult. - -``` -PLATFORM=gnu-linux -$(COMPILE) -o gc-platform.o -c gc-platform-$(PLATFORM).c -``` - -Finally, something a little more complicated: ephemerons. Ephemerons -are objects that make a weak association between a key and a value. As -first-class objects, they need to be classifiable by the user system, -and notably via the `gc_trace_object` procedure, and therefore need to -have a header whose shape is understandable by the embedding program. -We do this by including the `gc-embedder-api.h` implementation, via -`-include`, in this case providing `foo-embedder.h`: - -``` -$(COMPILE) -include foo-embedder.h -o gc-ephemeron.o -c gc-ephemeron.c -``` - -As for ephemerons, finalizers also have their own compilation unit. - -``` -$(COMPILE) -include foo-embedder.h -o gc-finalizer.o -c gc-finalizer.c -``` +Though Whippet tries to put performance-sensitive interfaces in header +files, users should also compile with link-time optimization (LTO) to +remove any overhead imposed by the division of code into separate +compilation units. `embed.mk` includes the necessary LTO flags in +`GC_CFLAGS` and `GC_LDFLAGS`. #### Compile-time options @@ -316,82 +278,14 @@ Some collectors require specific compile-time options. For example, the semi-space collector has to be able to move all objects; this is not compatible with conservative roots or heap edges. -#### Building `semi` +#### Tracing support -Finally, let's build a collector. The simplest collector is the -semi-space collector. The entirety of the implementation can be had by -compiling `semi.c`, providing the program's embedder API implementation -via `-include`: - -``` -$(COMPILE) -DGC_PRECISE_ROOTS=1 -include foo-embedder.h -o gc.o -c semi.c -``` - -#### Building `bdw` - -The next simplest collector uses -[BDW-GC](https://github.com/ivmai/bdwgc). This collector must scan the -roots and heap conservatively. The collector is parallel if BDW-GC -itself was compiled with parallelism enabled. - -``` -$(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 \ - `pkg-config --cflags bdw-gc` \ - -include foo-embedder.h -o gc.o -c bdw.c -``` - -#### Building `pcc` - -The parallel copying collector is like `semi` but better in every way: -it supports multiple mutator threads, and evacuates in parallel if -multiple threads are available. - -``` -$(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 \ - -include foo-embedder.h -o gc.o -c pcc.c -``` - -You can also build `pcc` in a generational configuration by passing -`-DGC_GENERATIONAL=1`. The nursery is 2 MB per active mutator, capped -to the number of processors, so if the last cycle had a maximum of 4 -mutator threads active at the same time and your machine has 24 cores, -your nursery would be 8 MB. - -#### Building `mmc` - -Finally, there is the mostly-marking collector. It can collect roots -precisely or conservatively, trace precisely or conservatively, be -parallel or not, and be generational or not. - -``` -$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \ - -include foo-embedder.h -o gc.o -c mvv.c -``` - -### Compiling your program - -Any compilation unit that uses the GC API should have the same set of -compile-time options defined as when compiling the collector. -Additionally those compilation units should include the "attributes" -header for the collector in question, namely `semi-attrs.h`, -`bdw-attrs.h`, `pcc-attrs.h`, or `mmc-attrs.h`. 
For example, for
-parallel generational mmc, you might have:
-
-```
-$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \
-  -include mmc-attrs.h -o my-program.o -c my-program.c
-```
-
-### Linking the collector into your program
-
-Finally to link, pass all objects to the linker.  You will want to
-ensure that the linker enables `-flto`, for link-time optimization.  We
-do it like this:
-
-```
-$(CC) $(LDFLAGS) -o my-program \
-  my-program.o gc-stack.o gc-platform.o gc-options.o gc-ephemeron.o
-```
+Whippet includes support for low-overhead run-time tracing via
+[LTTng](https://lttng.org/).  If the support library `lttng-ust` is
+present when Whippet is compiled (as checked via `pkg-config`),
+tracepoint support will be enabled.  See
+[tracepoints.md](./tracepoints.md) for more information on how to get
+performance traces out of Whippet.
 
 ## Using the collector
 
diff --git a/doc/perfetto-minor-gc.png b/doc/perfetto-minor-gc.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c528ae426c8d309cb43e5973753dd9da06f7dad
GIT binary patch
literal 173475
[binary PNG data omitted: Perfetto screenshot of a minor-GC trace (doc/perfetto-minor-gc.png)]
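As a companion to the tracing paragraph above, here is a rough sketch of capturing such a trace with the standard LTTng command-line tools. The session name, the program name, and the `whippet:*` wildcard for the userspace tracepoint provider are assumptions for illustration (the actual provider and event names are what tracepoints.md documents); the command sequence itself is ordinary LTTng usage.

```
lttng create whippet-trace                   # session output goes under ~/lttng-traces/
lttng enable-event --userspace 'whippet:*'   # assumed provider name; see tracepoints.md
lttng start
./my-program                                 # the embedding program, built with lttng-ust available
lttng stop
lttng destroy
```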
zkC#yNZ<_8@b-OVq9QG$%m9}%fwYa+`MPMcwnhjt8h--+^77OgD5cf2~1*_0g84pA- z;s|s3`}x?7A%m`t$m~MIkcxc3i}_9V=h}2~;`9Dg8@%hT{QaZ^WC8<}d`zX4>{2&s zK4-en{H3oW$^z>*{g~Q~DMQVglIbVqv1YDP|KvhZ!DZb>Km~l_`N8^Cn zLWBbj8$Mz(ejuKZd}?i~>YD}sss1v-Ym^gIA_$YG%uB#xlJ@?VxkVYW&UT2jW!^`7 z+ZeC;YdaG4Q&p6Z_-fcyz*ntSib?=Z5Dg%@cYcwKbk>N8Pj)PmUzIYV;Yg{)&d~rxgaV*7sz?rE z(n{5>?c5l}P-4V;I!XXvNDqG{RNJ>6?xD8Hvxvmt2ivv#FNMy14T$II}X~0>L^V?e=)!1i44S0ivDC{&21n_fwYBk{eXNe375P@#9SIuw#3U} z(Hv`MGW;}MK*`lI@I+m>lU^%?>WCi(%_92pLfqAejRhskuL5zALDoU3A{R@ zmBexmfbzEojn7p_*(Or|o__$Om}0=}Z=M^hOQ_1i2_~4mu@ugUxOC>eQ)p0EyFsqv zIJ#B)e&3&n;}f^PnSH&h;4$+||G^3)$-9mMViVH1(2ACPB+dRc3G}kI+#?=?%9aJ4 z&+KWQ*!fclC5v@2S4-_kq|}YheT&Y^9@nEJIv8&?H8tP(w(7@{uC^CsXTQ!L4{j$6 zNOR-YTDoAFPO&4c&g~Sv7=*|lPQ*$R@9gE2%4DlXmZn`jq`0tp!=F5 zxmL+f5|&O8E1OTY`ad1CRo8L_r)=gPB(c3nK?WF6B;S2vYAnNX7z`FfTB&-Mz^BVv zs+kN$7^;9%#20xMdN(gGZTg9&J3gpf~(Pw(N0rJZbeY>5R% z)WfEsf=)I>yUn&BApfF9YR8b|p_zZ}qRDxqQFp@&1IQH#9L}E?Y=hvm4#;&pQ9ss8 z=avb!Sf8! zoH3aJ0T}MPgbnAIDTWv4ztm^8*%`Yv#JBcqgl$mR_R8VvPZJQhqa)IAfl8}@O2@Uv z0whAlmZNcpt0VE<1P7YMN2!5?+N7x?u=#`-CL)mKm4VxEluJ=;+acj9vE9I;ps6>B z<1M86Er~t(Q8T#*Ug5XrcD4c_eAGAC--DuBbr@9F#$ zlrf#h`wU`0IAyglz(P#pMLRpQU+ufOwkAFM?)nOQuyQ$>cR$obt>SlId6*|~o~|pM zC`KEAJOUyj;3cO)Iuj8Fj(a6lrv)KaU`5o&XC3T<1dyoi)&w0^aet9c9S;KOP7#Wx zeMRN1FrX^o%Q2B4 zd&W$@?qrZ*UM6JowLIo?2^q6&@c0%nuWWGMOz#O4nLQf_a)vS&OwLDC+oKlIX7UEc;zHFWcE&an3T_o_U+$bBN`DawWam3+ct(0r7%VXNNVTQ!jP zq~)d@_Na!83C5&=fCmU>CnxjoT`K^gB#EW1?nwIHqsc1P!V6DP(yM$4Xn{|s4m6>W ztp61|@C61;7fN_Q!Z%mKiV{Z4hkr@%pVUV~9lJT)zTKBeq9oReINHJjw-hY`i71cXRtEO3D{c5nLEm`PsT-VL)`mHTa)wd!qY5 zXdg)|Ic4~DB?8af^`$Bv;FaO#h)Wu)THm2GT$+Lw|E;TP1vI)`{sj?L&$T2+?y*8j z?iineLLogusx$N~Q)!^0Fb;6sVqSuE1yP?y17;$}`Vf_>6i(6`yG`Gb>6WNzkAU06 z4K&OjWn3WiQ}m?x6zzbpAK1Pn;SguhHjq?2Tm8&hoTA=xF1OD$#)=N4rh|p+PqU_z z7)D_kzx-;ENy#}zsY3ImZZJN50+6S4QN5(Xsz9=&m}7V@K=b-rTFZ}!rH=YR%YBMI zGaZek1q0RiZXn7ZEIo6DV)HTY*%UvnDv6(O|AQXnfspVn0U>?DS81<6aE?&Pr=RG% zi5?H?d{=4&;M5nWqk9@p-`RG!O852<&Jr z_z3^YbB?LhA=0Kpl0Y=b>Y)NkkXRi*JXE*gs;MKTcUt}HChj1;tgeBaM?TqN;v!;MIYI;IKbh{|~ta5VKHL<$2 zrJdARH9g!4OF6oWW+z80UpGWU0`jBkPxF7H6UOC=WCfih4VwP_)nROWJbp0`9KJv@ zyEPdvZ|WDFgR~O=QB0#06`F{=-QzkUOBy^vE3ug>lw#GdGuu~@oNmwc7a7VDGBCM<~NJJ1LFKR(Fn0~@H3NU%dgtROR(Wq<% zo4}e86@=;31yxn4WSIiOlxGU4TYzZ>6_+hBR|!|;;9Y7o338~YTxeagvdB;x|O zP#~l>9*x)u&hv~}ibaQ+c8KzrAf(1;W18)!%v^sGIZJ_wzGt%1jET+(!?<(0;mn~* zs@7Tv0&Fn=Q&c&%Ba9-NgfzmtL>J2zo;i)@Pq|Q2_(r_bI1A&T%i2QBXO_;0;9%X3 z=M#J{>8?ImCc}Y<^wch8WcKc4VUCm`#c}mkk;Wp<(-ZHU_P2p4=!G zOJNr@&B3E@?>qn49z@=c0n;9)AH72&A5|UuNK5V&NYblA21LuPFKrC#CPg61@z3V; zR`n&{6AWQc z)`h`^jzFAd7Z%XB;QMQNwOiL@uJydcPAR(dyp2Vwc!2OW`6uE;?0Y09a;=EZ|JY1O z30Z`)-Scizs0q!e?(Zys_vd(k(_|8^y!*ZSYqmDba3a}(nf?-Zo%pm zqESzWL1;x;SzYv_c1~Q3V9Ovw;L&98I#VqEz6QG#JTu!G7iJQ_Lqsx0PqZYD>+%mF z%!Jw@q!NA>DFG~7+AyUq#9@0DG-*2iGcD#hR*zaHx03tr5&IsKYyZsiK8Cg4zp7@O z4kv#CwvA2jk)WH|P?&ZEk^66VY@9AF@X*x2K5e~*;B&Jb%=HyZq?Q)=T;3|}fU?uf z3OcKfo!z5ca9ZTq+1W<(w*crkGOBP*8h!vab<^s0{|N7j?B>QvBVOYIsIlm9$`V>9 zbCSZhUo02Wd);efo3+|ePWB4B4H?-R;p;~ceT=saPKRxqc2QALwvCm>eCc0VT(VrDIX}n0bG5Kl%SoOjN_52*$m zrsd_uU+;OdH|9f-!UNw%&kaOJTq(8ia-ILjXU$ylf3c0+*b~%gfV` z9w`?Wy|I~V@!}sfiM6@uz|Iu>=;wvs&R}9{+PdrTDOd=x#fJcvJT)Q$Q6TeUNY+2; z8!%)xwY(s)^s;L#9`Fy68yOI?%`!nP#{;e%1bgux#; z)N4mXu;60<^V0?=@rLdHKGU189J=9rDh)OQA)>?`M81LACmunF7btMEm=m3p_9PDZ z*bppEx;w}qOR(QPXCHE&KN4m;Lr}nduo#)|V<3Ku=10F68IlQ67973bEy^sy$ZrUR z!JiHp>8(#o>#0zr*b)j8(9^J|oMh=6zrd3F4WZU;d*m~%Y4^IVfvP}s$Lroz1LcX| zdzH(A->U!p$LDdD+si}rk9%hPORC5blql^CU!s2s-x9#KQNWMC7?*s@Dm0?~(eJh7 z_+JLeG`ZM+|D>RM2p_j>haZHL*&POJ;qE8&uHDa#E5eyg`g_z!?`K<2`nTU3IykBR 
z#||kIM48R&ss52Tr>;`}pE(9aIRDpJJ~nNi{;it-YcFXPqThu7^F0oV{jWyf(4%|a z`f-oO%6}$K5fai_CsA%%aoCf2o;e)?QwBa9sh~DX@Lat#aq2!D>a#VpceS}2@BFz9 zMrnH|{)SE3?*Cj=6>CFy81q+@-AP7uo&w_ z`cuEi({@b$NS!hF^wS!tQ{yR4eY>!gBQMgT3s>fm6V-NF*93DJ(NARo<8wr*S3Fc; zoz0!EB(FW|Q2x)Ljb8D*C$OzPOfSG0QZQ*=C!ShQi%2T2-Pb)7rAF(_Nb+7?pLy=W)n~5G zVq)3oa!wbDducFbtks!+Jzs8@JkDDOi4woqK}3&{uKO{!iM}e{pJUlAl6Jp*KXW;S z^@0SFmtlm(Oy3+lV2iS2X}MO-{}|@z^^7aR!$;)hNKLOlmCJs~(Qu zy+*Iyg!p6cwpit^@Ykt!{GeV&&4*(ox_8ouvjp_7nORrDQj=efUTkh!cOrzN|y=UV+lE96?IaZ(U zJwL|Q;60Kikoi|LbM7CdpKTE%ePi5LVB@{Uka^rFevVrw1155J00D-)&_;$FY0=@W z6!ua^OL6#flP=8S{gv7|wNaXYi0$WjEzf8bho83NXyq3Hzeeiyi6fVsKaAS&hw|MD z<1DefP+07uWcTttQ|(td?>|0!@LkSwm?Mf?exjuw{P$lEf2_yKjj?4GyvR>gV&$!$ zxV#(JTX_D0u%t3u!_HgH`3i*GxzYm@Ar5AxNF^6 z$Z|1r&OZCe&-2;O?EPv#<0g`4BsKZNVc3V+)4S@x#`#xx36&4ij@Yj*&v*RzChBat z&dZtu79rlY=|z^M7Q>tM6_2&RDOo}urY=6ZPQ4yu>7)pp3dK6Nb0{Ibl)~;SL$0;J zS!q+5RP~J+V>J#dUm*)$^2`x_&86q!YyMQ%v~)Yu1s=aFKmGNE`!n{(@B2w^r%DH& zht?T4SLvSBs5)*l*G@0S)yGL^ej_yXj%2v8MbA1^R79vd0$cVOfua)q&;1l9sEd8E zEkU~lNx}U%D-E&JheSz$o*6AOQ>q*etrJJ5nq#jFQ&%P`pQ4MhGG&Dmm~+i(-ou_k z5VIqs+j^yefC{g2>)P#LJCQH5d;NOl{qH6uWA|tJ#@=AMRu{U|OgNFs(|Is&@CD*y zpPl4x|1=a0Xlhg$^^2Len zn)7lZo@k%x)`$|wMr04!6+ z62Aw&hokDmzsgXv`P)oId7n-$jPt+fOkS(_E23NXTEK+(8(evgbK9zMzeb5Hyma%> zIaj0_CTshG3wqqUas{{mbXOZt$H;MirVpe$uRBZQ&X8V&!jz`(I@|;6*P!6Y5#bKw)-!p%x{{pJ z2NlZxu-Yr8HaJ#W%Q-e+;Z4_`+<-O%V`G3Uk~4ok%*23UQT_mv74|)jENmnx5DQ5@ zI9S|L@D&*#F^-j8*^4Wk+d4(Uf7u)R)jL5z{sF~?z-7fJV(5E)Ml~>h? zE!jh@04;dy)wg_@T8w9Cb6f!q^0B{Helob=z(ygfWzS#ke-}XaJWpDohlEX#NsYOY zk3E>}z+ndxfL@YtU%%Vgw-{n;lhk&}wzh=j?$+4(Bz{y8s)CrBjyInwoze1A7TXLw za#!0?BKa*HEL`AutrsAM{^`r8HA;dqUY(%Uhmi>?4~W$$g9_ zuC6BF8Ltw>keU{3S!QZfCkx*K(}_yk`7Fs#$(xuMG~=7`5Q;v#+Qo#cL{3s%kq#OVFw1^abc~CeiCW=Ev;iRk+58 zekkAsAY}@b#~aeB0XkKIZEkP<)KF>U9eZ2kKO}<|73B*yIl|PF6QO)e7}68bOK-5A zRDbpR& z*SQ;6^`!)w^(5|c`iWw4>jE?E@Af{8%umuX@)n?^_KwUM-yI`)jj)P?Y2pG>B2k~< zL=1ptRzoJseg}rl6`U=O9IvN?U|2(|W|u0k%ZzcLeAO}g*$y+Z)thK?WS?E>%y26| zDbEoe>oB2atUueS;merTN{%bcyxs)c+8Fny$-&UX~}c|T^bMG z&f3RuGmBRX&9<2X85n>q<)+6%<>X8d+}U?VRG)uSwPX%JWX^l|}X4@Wb9j`P& z_a-&K)Db$&JjePoI3zD4Z`l7qXtDsb3ZzmKfxEi5TvC=l-nO-Kj}Z^Y-AWoK*s-z! 
z#wu?5_N}qIXja)9$}FSom}gh5=cv&upk6G;dY=`CwXyfK%)+rd;$khp_QC$P=$FOV zng;syZMJU2^E0hvsS7_!Y~EEC#s?2=(glA)p zYld?a@U^;Z)m9ItiUq5Mu(GiaTFe(Qzfz@#+YH+%&!()SHxlXgiU!WCVUV#ZH<2Y4 zpJb0slBYi$Hq`?-7zv2g-z;2y%^xsW12^oV3INdUtj0*v!rrmnX1Q@o29W&nwdn`kzX(}JSJ!bKPvDiGc&xWao|QE6*WM4u?5sWS>_^9{*@R29E^ylDsuK|BFVM=W&o<>pXmfJI&_{&rHXlvE%=#@INd0X% z`*e_nC{~DyPoB|9PM~Tm+cz+hQ$JK@0DJ9`|QOB$Pk&6ZJD9b}= z*Jssy<1Zv?Xa0K^C=4aZPJQ>gm{fRFcm#ikevoyA^d@=r7nDZ57jk#(-bGs*8mvz6 zK&d{l*dpar`bTVUv1a-E-FObWR?*qd*Hk`T-}D+d%q*~h^$mO1_FKCSppNGK=*G|f z?4lu#&P*#p6KV&WH)w6N4LUZ|*)|57Kpm|twok5uVCQ{>1%&UktK|ElBsa#(;IDia*MJTVKl~Qw|YX{5y5AfX8lBvPw9poN1 z;ir-VDd170`DP!bXHn*HSNqFSxe-l07eDL>)H}{Y5X!uBVn)mrk^cTk` z-={0SN>2saIOh^q6qGfWlA!kfFsJeG0bT!nzGVM)x02uIO$f7#qPEGABcc+jhl*Q< zm%5CWY}?<<;e^r?p10Rxh=0@D;H2_kfk!ca@y)bNMbS%2*1Ih-9e=3bX9qsg$k7H5 zp)Y^^2}xd0ELEXQ1P<9|R3sc)+pm@O`gaFPTUMT=uzeKWkCV!L2ktt@SORt3=Qf#R zJfYdQQ$wZ?*<1*-7Z0DUc!i3O?nI~1lr7i5E&|uK7?o$k$GVaYXiss)hvtZ!uAw#> zo&XNMoh)sR0RB5o*akDP^*^;gU}e)h*B+axi|%dpBAOU(n#%2t)qC3*o z^ZXg+%b;p9Il-kiWJgv*eV@h)PS5NXjz1!6Ik|dpdvGRB(8jpzOg^x=g5R3O$p|F zOCP8wd$)L0PR1QuL4CqU{`WKkKzVU|H}h30lfY-iE9hcPL4u@|0a|vu>G5vo&~$wK zJgMTwE@jy-oeuP+o~Pt6X8vZ73Xo=8k!6|1xeMs?wLaTfflt||d9O^fQ#WH;Uj9E@ z_@vtaHXkK?A@k(E%S`>;80wiP?536P7U~@fnwuYVHsw7tjUFzMN`Qg>DZK$O@MTy; zOct-Dsem0lfDB}J_|3GSgBd2+n)O22#e0Vp(DGb0cqYbcd3 zHX}os9Fu=pS0Go+@3ZI^je-kRaFS1drX2K&tXJ6g_v2=2ID5qZ3VdKl-(S?apSA>6 z{=^!E53y2(+93Bv%*KEJ^1A5CsfNe8SLpvn_LVzaBBUS{%OLwK=Ka)uA4TwLNhJIu z+q={Cbo~E4t*9O4r+}+`U|ZsKPs>dzX@CS+eHTNGz9bOZ{tQV{%H+j=pX8VG@JZ|Y zCGX>M@qZuGiE%aKNw@0CEmpdq_>Zi3+3^KigajMmaEN1#sY*e% z$o_@>qz#Owh-`*5r~Qc=PH2w-&`e-Ai~!!*1n}Z2y|ol2DFU(ta+c^A@um)z2mNX; zw{<(ZVj5_PLrL931qB;xHY(xW*n6yRfT`cmPmqY_m)!_U^DsquXu(Q3!}2RGaM~zZ zDqu-@MSIHmnYTPuPz$nZZeS*&2h`o6)MW2OD%edYAV7Y+Bu6!8L z(v-=g5GmL0G46|aK=HE4*?VTVd~4`UbCM(nUJN7G1gX3|ykq1s(PPfic zx+#FR$=n-V7{Lcsu%#)+W&9{yz$5}ci%*6BoDv_X$#|ps@d>KXwSIAADu1|iK)#y? 
zo)!>=vQ;SIGD@I#*L_E2HtF$efT&@p{vAVL2_lhK;3g1Qp9V@y*=?*nYXOL>rcZZ> zb;~~-&Pj~sm?y{jwm;%mC-vdHFKf=Q%Qo$-xy8Yjoo9M=ZBpb-DXme4!tkclu>(@t zp|+@lD&4}A*w*1CJ5QJ;x4nOl0#~ss@>I2GFr5y-v0Kl%94M|6v+ zzcUBI1_B#qerh*hF6~k+PIOGq!2i_o!Q?h;J``rO5Pp?MD=YYI@dEyE41Lze>|pYc zrt(*bjRFsSqUM%h*=r9)3gtd(=o=1zK$OjRaE1gJ%=P%qMi&TraXv<=mx&5KeT^-5 zyl&r()X#)wsZ@X0X0D5BMgRS854dVUu>fsE65bvUyENaE`;f`&8^K9bU?aG|U1wp_ znO0n)FY&PTw{Ts7tnxxKWiqADe8S$kA)v4jUL*B{%~6vT+$qraU-O)+U2Li z%&Fx!^TZxeoXaf7L;fc~Q5BZ}(6Z?*6TUN-op1M#vVV4#}ZO_r0wHtuZI z!*Jej2aL-%3oXr8JKA3ga5;~&Q7W3gGeDKoKq5-18C&r`+5)Y+lG(RyTnj*xu9`#b zX=ttC&`c;31}LT;=0w)sO!#E;X1Xt^X@}dQz>QpUwO3e1H-{{)$14g$9YlT1HYU*G zw$)>?hL>M2M_@?>pxb__#>UtlO=FocyXKpMk*dQ;F*W|xx3q(|U1OxIfXA*Y*Wyyj z*Tq@qQy`P@YyBfm({~&Z-;5NgqO1v}P`sl>Rpn(Waif!j-|_Cu;!`D2v2r~HUP3B7 z`AJIlp{^B_B^CGPOYLRY_M`Y(P)o5nj?j!q&yfcV`A<_yx^)4Fz(bD$>5(LEIzyTn z+jNLbtW(^WiF_5xOb1vh3NZa`x)=yVCio)Qtk4HENf!3%u^94d8TVhqw_WH#(Hpvc6n+=u^pK$-(HV9Q5bZx|my8mQ;@TkO6 z!gr^$EdmPf zgQBt($i760>U9w%k`vP`aS%vD3EhnDTMPf;g3|);1l{#Oy_6KwB9{m$OOoW4!oaj+ zwVqpK<0#34Pb}itM^}_xztD@DRy_`hO-xn2q9Sd_VeSa&eR+8xTix*N<#jCu6CtlG zf{GpmL3h7$iBdqmlFIrBL;`F!@80rhGq1pnsM6}4v5dTezt4QVF396>L7W+z_NS2l zB$AbEc6!yJIxfQ1lQz>JCc|x- zz57od;7=A1P979I*5_~0m8<$5_`DyM@zac27hpo(EeBT?*b|A^hyMKv5d!#{o_LSX zNui=?W(;Gr`n3YDHZ;_$Ph=1xJOhXQh{i~a5Rf~Uv{HbHO@!p^p?GhSDKU@#5S78f z*Yx;h<8G1RLzZui@Qa=hOr1k77>xZ)o|2TPW|~7Oz;K^bXoI)2<=*RX%*RUSR(i|u zOj0m)dG)flEn~b(7D!1_)KJp5Emo6WGKPu4aC)m?Qpp`90Ny(CNuUksc2j zssYkuyaj~TwCp8p?*0DK6v0e)_3u&1P^TLrP5--Kjx9SXQuAw}=mOV0>lE6@r6Dbr zsOuar==sAV9lk2NKSMVUlaA-Dir4w^irH~6-HOLkHtJ11a*)z$&Kc%X+F2x{ODgsI z({N|?Pbx;O!XvK!iO9YdI;Lu1^bd03*A-~dAsuFo-4cS}zu#X27H$p~Y5g0zzxLt_JHSx9w7nxBE-lpbQC zlALel9=E|QuQf{2xrLBUHjh%I-y60}s zQ2!uJM2Ni-aa_N!_;j6%L@tj#<)Wny+Dq^2hcTK-(-i+c75@&FYQM?rkCBtvv(k(H z)YZq3YOi!|9-p(PEcgaBYD1H?0yfN&8BkIpzRPh7OKkrMop^w z;yph{zYW3^2ka*_@D97P741(eg8)wp>M9R zUb)4n^QOubhL@JZKTU2*h_D3uT&b%0&^C7dfF=g8RTC@X3}sl=RjwbxN0xnl%()r< zXlZkCNcSliK?q20N8s9NL9#1kHpMd6GpsY~r3p)6>93^zpcO|y9KOcJ#@51b`~dcK zG78JI*R7;!lf&zN^OO7;Hi`Oi&4LeGL9^I!i{(*+y#>P0=Y8kv*D+~61Iw0)wbw>I zrM2+y=sp=6T?H_njCvUURUK<^eY(Y7M1z8CYHjPao;UnQ((8tH?NNn9T~Pgep)n?G*f@id*lUn zbNqM?^TU^R@bA`ojavzYFr2nQY z^l0ANw4Y(RMt5Y!6jh#~=uM;r8hu}`Mp{Ex&LN`6@n;N1u=N$n9f6VjPBQZ^P|4=o z8F<9g42PcSYkIBAuB`8&fy7tt9_>JXwK1Q6!-w5F{zO&bQ`BbN6u5qLbUI%otMs2i zdA9_288p0Qdb*tiqeWt?WC9R|T7yyTL@%(wtJcGQ@Km{O#r0!`P0!nCIPU~LEN=DC za@PHT%PbZ+x~e>rE)n(F--e6CGx-BuZuu z__e$x2efx`adFu#&EMP*#$F@i;NVke#vWY65{&)Z8^a`rVf{0&JdJ$n zt3R8XnmVhD2|RdIHboQqyEICn3gq39Gao2_leC;yVmj9Es+ncyXppu$LJDrvq*0lN zv&}UccN}q}=DI4rj~QB`1s)A2)(36Ow0lY9xp>jQz`ujNcba;so$+(mz1=K_}!%UXucd?DyORU4J1$Ti2Ww9uXWvFzA2$mo1*+z z(lY&%;n7~d6C%5Fdn0_curUs}L#T(4&_92E8+C<(8JHJ<9F|=DI@%ixNZB5OvlUD4 ze5kmMy$h;uGTQ$c2Q?dMtPvSOS{eN6(K4(E|NT@AzH(w;og}Nlq4~=0cLiayM&qQX z*P=e4;qMtUUYGZ6#hGS+bWBaFu%Fv$5@iKVhUX%|2}jrPLn&W7)>S+GK&m9>;>P|t zI5eu?_3wOmx8F$FvW2BlliX;(RU4UETHnK*JO8Dx?I4AeAIHf7yc+70yv|3w9)GmS z3bVr;EW4kKAVUK4P$ylFc);||oU>g_Tr+~zFZ<P zIG(gJZv#efJ^$?QiBNJ0+E0wU4lmRzKbcn9NF5w~Hj{w!1x=1vXKGRW#% z4MQQ|rT@K_&vl5YhX@>v4HvGO$aVXfF!o@YUNikjR5(6@Lrt3Mhj}4z|YC78H(fnuhjhD)s z{Tr-+_`kzo{@&i+%ZDX3)HuezGvmtUou2DnR}S+*LvyEPo9hm=iP14pY?w+z*$_ti zD`jA_=1<4s2%HTs@xlX^htbW>ov@RonX}QGhJv7DvXm`fr{-m{;w_Q0NEN1~kjJbQ9{WAcqQVhl`GiLN52fDj@w_6pKsdP$ zeLsQtAM5>J^ya&+O|AJ~FuZu_`Wx7=@Ign42``*PGiGUl#&DXac0zQ5C*e|nR$g}=J$AI{^bk^n@$hwDt@os=V^|eZB_aEu)wW)nul$(MI zh=>r#yFGeiIn(L}baV2Tl;6qMDr)&ka_$~}&0s2q{6ceO| z%3gP>KvKH9EFp*w$bHrQYAxJg;@(SsU}BX}98*fXduckF?*T8s&xav_(+rJc{$Ppx zEC}jY?vK_or8~#UN#gV)Hyx?_s$=~!f3Aj?ytYK{sUP4UiFMAOM8qEhO8Iul_q-YXMquuw(=D9c^S;;aHzVoUk7d 
zIq)lGkJCzuayUGeHyuiLxhJ%8vAMM0FU%R4XQzGMTz6 z7Z%aY-%QqbZf4l?3^UVpQuY%76T`yXE6a0vpLWGsg7;(9Lzh97+1!e2vdF2VG|tbS zqudvua|HqyMEeK_E_}MFk;u>=POXc=cYHN9={1& zJ2$2|rry&YGb}E39_!6J4#FH^d)UPDZYxa*Kbq20&q3(l|LDhp!m{OFUmPJw2^!eS zni;h6o;jao>&{QkN zm#1Yd?=h>?Iv53$E;DoJ&7lEVazOZ?-gKH>q#h3!=DJYv^m-7ZdP(VEN!HYpVd}f8 zEA^f_`IHKkvL!vIYa;xB^q!n=+yDE}p2S-gVypXiV}r!_y>Pl;F>JJ7_ywJG7A_ysMQ=1&JzGb_kKd$4v3?&*?V3Of z`5u&>qw>XiZ^TLHz9NKI!(_kDdO77mU76&DbH-O}Gc>X{S;b&;U=eOQapGjrT>f)N zR#o-3p&c{`OfWR2_6I{k)qI!a0HwT_f||1aC0UQfv$Uwm$O`O2`{2Agc)_B89Wgnr+N#S2guZ#ETwH3!E1Wf6X zifhq-OhtAsk5xo9F8n~h#d-cM+TJ^mh!0NHi{ifSuy3p>6c>*@QwB%LSoA!rText8 zk9_gKC|5ZBV!ir?7B~Cm@!%^v7-rVZbKVa`l8-(DLZ`+T{uy#?;}aYb+V|wFmSVOC zyu->5+N8U8$h!t1-J_|l9o(N ztn$U>GRIbfdWvGoxgUqoszJk{t86?iq8v^MheV`VL#pM2#;;p~=-j_uQeD?GL$#~} zanu#1hpRuWzj~~PhQm`o{YVESp6IvYeCgAD|Ctv9IFG)Tn|3Q)p0)KB$smffB@;ia zKi{u{@(19{1@~>e-3??FE*J7W_!->}F6WayI0j8HGEd)5$!34p@!2Xm#wV;&#bFph z?=%b*pVdDzI!a{lb_SL-;R6>9s)_ENM86a4J?Ubt$ZLTw9P0=7C(c$pn90JpUx6deAlM4gW2WDhH!6?9H9 z>*+OFAyu0rkJ)(LzJt#DW?7bd^^?_*sM_4uEOdG2jN%!%h>D%C1HGkm{H3GIywJ1H za81?yjgR(oGgm92q1rouy7GNII=KNwf?2wB`CLPuM0vW(wr9Qgx>e3+1lhvWvBK+y zI=r38Zsp{qyS~u2@%?8}QSX$~P_glQRmW{!Ns>ciKg9ZKvX6;MIrT1uW&Y)cj|>GU zSDe+J8IlX27zvJU{c}+J&W?P`gH(H!9a#xWfF1<;3k!eS{|_)0Z?iky{n+Fh8owRM z1y8cRcW2_OZ3_b8GL>RgvS#tp6ASpQv!+0vIYJ*CwXam4%K*;+oaE)VH^g?RvJH3vl z%b5!O+ljmLc*A|7YB7qK(`*RpjIdZXy>(3aV+;`xD9rK#m$o`gopx@UxG#}_6lc~T zC(jDpyiWn^Yf(2oklKM!{A4e;4pjPZ*O~=R={8WEs2lzm{1pZ=G@k`XkrrP<%S?Rp zU=ySwKTwU?96}8&{~+LMYP#B@fBoo-J?f6l*v7O623h`o@Y^+zE6-6NcqX?g*oT{A zj)q-a&oKZNUfdnb=bS9T zPTMZ<}6_V?(n!YDZ! zPKe!8!wqv(rVNc=8g>!222yWV>ZPGWtK0|~C&JmVl!nRsrAzRHzy@X{VBo zlLv9$58Fu?t=~%GDnjZKF7@g2e{uCP7I;7ln+^M^j|K)(DGd00#T); zO>O2%*I+3tVudViY#l`R9s^u>2_*Inde&*Xn7H+&{9M7?nf_<_^BU=2-W6fglefKX2!A)#U>m%XgLO z=8o0n^A#PC{Vv?+cS(V#X9whomKA=Jtv$zk_Hy6bgK%$WYf)uqcUGrv94jRs%aAW% zVb0bQIN(~>H-_o?XTD3$5V~!7iwA>@yJw*nlwgBD4sNvv0JQouDx{&yhNS6-oYm7v z5s%JZdxL#n?^`pRy{>#kB^}=&_?TF7xYT}Ei$b)azB)gMp{kLrCln@gNS>0`jo%=_ zh@ac?&GfVLs+R%qeamU8);`=8GM=icLO=Hgr~LTW=h9_WnBK9b3=2+qS zU>b9G%IgCc7=2rx4~YFbKE8>soF;3KAa}3+L=wsNfE((HsnZICC%Uj{{jwZXX#c$p z=WWQc*fe(qo?+SJvt}NbWLq>R-^zCZB9SZ!iVh@PJwQcDLY{K=#Z8qwq5r(+Vz4Pa zy$P(yD#1uZoscpCg5s|AIfvR&1ed&3)gaJi!1^!@?(@97KkI%w=ldK;6fa!pML&7A zWyb+AU;~u{IJd%(5W<+aQZCZ=&L4W>%g+&7689SI%9JBATaKDjR9sq>n!4eunDfxj zHdG)^gk?8)Qggxbnv1Zd<*SqWaIu3QH1Pt93;^8xNk>tm0091Bl)IbwVJhH$Svm6u zDfQ|;g;2AeA0={DbmGSvt%`vWGs zM7H}oL;1HvnfDCIzJWob-T5mM5kaG=8{w~fSE0q;WeElpz>#Ori0k-#bG2|b=O6=$ zo`~zsdKUE2R#hzpsaU2&S+D_hLYgEP4X81meHH^;|0B2xeZdp3X0QQ8{^jLH2@Ji; zKnri+g>w*p`whR#xAU(SEtpX%021uq=omyX?j>p%G~H1GjLXw^JA}F*G~WXKQ8?v{ z`yqHXHMO#vU^J=y2T4Bm;pO@ z5935c*DHRe@U1+W3w^@lG-ZPiAK3zePNWa#J6z^HH4^qq?@V$y3-cRa6GqaIuIM@k z=`=V_iTZ5csWR>Zs)=B8PL?sczcmHljs{iPmO8z*nFLNBB{0kDlGquV*H)#^bJf+J zV=t1iC3;~hKN6N<=#XYj(+OD6MtCUDs55`Ve4d525&q#D`vb5Ce5K@uWTuvK{M#UhF*MU!Dn?cmL@8hu`90e;^`2Xwg{f_mPLc2|x2AweivCMaw?zlz-O z6#&5%my>$Ztt2Dwe$~xRE*m!ej4G&|<%#LyZjeZGWu5#OiXF0Ca$icldHLeO_e7$z z-|q4fD2P2;)ZCX(@&sQPv?fu5&&Z+ORY5wRfhX*L!Fl_#p#$tZgx{)I_LH7T2$jb= z|Mg+MV07eOy&7xliFp_e3!btdQCCw~-Nz55*+?d$xtk1_O;Hd=Pf5q!TeO~9;Xs^N zy(WxInsj3X@)#qjgm3_voOM4BjYtFv^YZ4OMJi2`+!z(P?Dp<0~Pxj>r|h>!=V!FU64Q+n`F>r~Ob> zmiBt75ukFr4MGGAegv>MNW_77BN1?C50#80H3 zrqd8-PV-W!Rm(YlBRKv5y=s>+|ET0fkn2Lx$pZiZL!%NEALFvv6^|-1X&h37j#GxE zl#Kg;M&7Gh|PTjhSd4l3WNVx1TFQnd!TwDTqWmq8}pvdw3=sKKXz zJO*50zTZTXC&#@@k&YI_)O8w^7^fTWp~%uIF3cmF6}MH|FUB>g3cRf!@-dD(Po`K9 zGPH6yIi_~YlXzY0*_rrrQ>{}F(p;>2k5u|l#%5)|y!hPD=RcF&KaZt$PU&BsYLG_F z(>G>r(bj!!WM%Wp#ChNIp}dt1TAO{*ztIVJ39eXTzwAY*I_G}5dR-~MpQc4mfl>uc 
zo=)AKb!5rgSKU3&?49scMpjO_8IQ{p*E&Muk5Py2j16O|HkE(Tt4%Cb+62i4$)JYG zG_|M9Rn?uKb@%M!H3g{!>%wj4;rQxqLJ0tLGBy(KbFOqi6atWiFU7a)0p^4SRTgL- zR>(4WVlo(WBO7m^-kM5i?`WQ}?}H)% zf{QJi#XDQx9?NJ61cZ?#^K8l=5a=>~gWe2?Y2Bptr-|r^AlQIAmt);8861~Gy_t00 z=Fq2}SHd{Zqd;JbreUwDy<~Y7^c<}{+^tP}pjDHwa9IEgs7W%goO?C^ysZ<(z}%)7`6f>j4CrClLb`rS{KF zQ(fgYH4pR%zY4GnySq9Y+34Qq$^pS_({9}?>PQAfM!}9Tr zC(r!Q+@3?B_j=r$aSFnAPbw2l~0XJ+~TQ&n(pD*^3km~I#DwDOCDU2v{_T;ki!%Le9bxEBwy9=Q!1!5 zBFFlMIyf8|zhXXpF&V;XLpzc1A*YHT;ly(U`@`_ssm-xH_cM$xhn;~8UyiaSG^-VT zhPM++LNLmojNK5V20aD<2&yU+QZ6h zt!7^+iVgz+yMU!qUGpq=tQs20RcfX~&7o9R z5>na~T?7<%iflG+#S9t=XS(T@#uq7N)P->Xc6064F&|bdqUx+(W%Bm1){q5Krz8vl zb*lYGuQuG~o4AiYHY|DxwjdgY-k|<-scJpNSP6N~Qr>0cf7U$gIq^NyZNoowaAx$M zEymljWO%{>PtWl|;5j14;%636vqe%=!(4)c>gt=3Q0Jw5 z0YLt!Xa6|-!iX;3P_2e7kd+eJ-Dqfn6RE-b2Yn=Z-#>AK631mk?NGwae(OgtOsMwc zd;8u_@OE!-Rk;qS2Ha5Ojd!ObP`iDni1fT$5~_kJ|H|GwV*9v)V!Mq95Y7QpL9{;- z?&8F}CB;6W)fhg|%;bU{2ws@yU6Sm{CL7HD9Fgh?H`vts$wqym%>r@h4lGl&nna<) zH)hvS)NI;`%S~$^`shMe#5_8c!lA&^_e*AE*wffr@QJa7#Fv>6vvC1A6cYIs1VkhW zM9p|~)Srs1it+;l8TLITaN-z7TLPBDEjQPxb1F$#RZw1Cb&4QlRAG50o53()S!~^> z`{dgBkS#X)OD9TIRn3K{j|-x-u3-dw>~vBXI!^3@+5A0fObrL*qFTF*_qExDzci8D z&>6dzkIw5B>m<3l4l*9)I^ou8biYrDIlQ8>^rBg*a(wyj-yz9r^wzi@Sn(@PZ$8x)+kA*kr(=17h|9B_kv2AGX|cH%HHJ4f*SR_aIVhdb4GHgx5$CXA z=?;bQ5NZ*(t#cf-5dzrs;GxJ@yqkJ&Z8c5YDovcz0Y0t0dUt$|bhOyvAF0TCJk|%n zzaR?fc}jH7hIU zLOohYqn(=S{vvwwDzvdG8`~wfK@?`{nBYfTzg zs&9#5lB?oS?v~ie{&d=CK#03?Z39L(EzmeR4=R7>{+39!XhniS)3P1R)YZU!YuUKc z*uNA|61R+hskR#sxrGQe;QTE)+4xBVj;F$c8Af!kLNBT;oY(ME1EqAbd__V`?QPK! zcMLdfCLjR<8ITyIhvI5Zo2wd+w zP1fRMPPzD5GDi-uA#TM14S)9C%1F`L`H1vHfFOEPr?Xq9Hd>I#WC>`>1xto zH1CCg=|haaJrDX)oV=ZvCYK@o`30S`vN ziVHg35x@e1WxjRVB%?Jemthv)`|&*EeFi2F`-Y4)y&ZtB8rSp9AZL?`q*(QB(Xi&4 z`#8eTV#_?@Gp%V9ZGB`9v|GsEG}4uZ#!yF^Jy9!*lSpQt=2Zpp-(HaVxXcu@D8lWn{D#J;uw_e(61B6IIQ;RjH+oBa?=8niVvSJCn}>W(PR& zR6fN;4_n~|MQC4{m}o@uKT^H8EWOCFqmbcOJqG_hO`lSZR}Fh^dgxC)@}_t$;_%X|{2{rMx)1Aq3O7z|r(WT2o0Y$Xxe;2&`r zyfPmDYHIKW8#l4-ii4J#Y&>txQ^jMfIA#Ba9x^B~cUbLqKQX@>d7CO%u%#+PD%fidiV3!blYq?h-;xfUteb#x??h5 z*?Aj3;3m(?<2G|5TaKfw$dfi}d4|;)6ACS;tTG(5ve8-?GWyn0rq%{j_RaxysJosmunpUOoYN`_BP1O^BE3!sVZ>KVX6x1?+n*L&4 zsd&s?()g3%@v!*>urfzA)s@+tpwTl@0yi*h{%kqFca7It*8P1s7XBJ^#MnoqX>Jiqr1&vB0j-`6y9HOr{In8(4 z(GT|UJTl$M07(+>DISDSZ#T5EN0o3Jy;ibM#E1T`7vS!}49z2P6jTrspKXn7e-`yI zj??pDNCjRfW#cF%ED|kN#p&`Hj9q;-GZ4awkg(XytAhM_LT-& zJKA8)Z_&yV19j*KP|s@wFDZ!GxE);j{=Li(*ZJzo`#akeK||A_@wb*{(0(;MbM-A4 zU>*A)QQHZdriMS)Y-sU}$TJ*D_!$#86Y0T09R`?ClZ=v~anQYX*xo@{O&W7;$__kq< zyHV^W{9eBjq`Xyycsbmw%?X*D5+Sq|1M438n`2!YOVq}S;F4Dd`a=NCVfdxTX07Np zKg?R5wMH1(r}mLPp(MKLBgH#1OVQEUlp`&xyf3=StmPNf?4v!&T4yX1QWR)W}DOFxF3e1+dDq)BrYGN_U;qRlZGCqL2Z`idxgY9UzPbNIJu37he}{liU4 zQi7)E)fw*F%|~0pMS9F6AjeCy&-;EVy2ao-4hM2aKF+ocP#D%;*hW$3hZZC*j%sKY zg;yAVa1*cAm(P*--Q9s;BKjrE;WTsLF6J26o>HG;_HRv$O}-x+kEZWe3kQ~!V5kAV zXdVgv9A%`;K%v&6vcIC@5sh<@r{T{*p>CV|m~7rQv6!nmgRZg!E!5=AMPV%_6^9=_ z{oVErf&*Kh4Xz&w&6{3ImG4h`46V^>%9#iBIiEN7;i@U#q(-A5%;2XmJt}JbD0=Ey z1J;|mXhx_jBdRz3@bg8aCU!mOwLQqfSEpW>b8|G1P&bT@8VDptuc8^--`dfV8L9qU zhzG3j_+$e$)SOPHa)WsFM(r(ZD*su^s|)Ae{qAuWH-(6xzunheh~5Hulg^@7rHEB! 
zfj}}V=0xY?;Lw#&&Gj)oT~yfp;Jl()cWRaP1=Ufix@%Qq!cu&kl3CY4R# z1NBh6dA#|Y-@TOn=ilBju$oy%rt!4Cpf)p>;-V0(uSbeSJJ5f%{KhPAa@2ATzeOQS zgKVssP@a|5);VSHCP7mT0U5riDB5fn=!vTdIxfw9_V=`{@q4#lphhIdje;^7yKaWb zLH;i4Q!kh^#^?@pmEKkWKCt*Al`zMy6Xm)rlW%3~rBe2BqoKIiC7IJY;$O+(NoqG7 z85+45)5r6R-NIaZUv~F~mY!Hi;=~FCmHRq8U^9ki&9CgK77yn-%5d-Mj=d5CD7$zP zV#k}(gh~;{!8>DV)m(iAfXuS+9YQ+R{JGIgksv25qVlVsfswZOb6)$arw&lXBma(5 zfL&ta5#Dm+@uW1AJ3E6OTCy4Y88?-!F#Ox_#(wCNt~kU|j`+7~hE2|>xwY}D(=o0^ zYMOwf>6tswv%;rC)8|~2#b7PFFyFWW7%7j4=5z8{@?h!IoTj6C9h)c}h462y;gd{2 z`-JlIcK!$2>Sqehu!M17%>jLF?xm|X35EV@&q1_Xn*(TLK1^lzcOvHVXNlGIn<9|| zHH1x5LG(I=M%6k8&{hzQb#wZxy+a@_v9X=$nS)7vL*)WtCiXLvYC)%Yp;RNssd|H9 zBdM|Zi#waA1+-oW>UX)+oZ|l4sY@|^v}B~wC5|e{b*-rY!Ir~*!QAQg8y^(B%4Nj& zm{KJP)2#T%$fbLQ5>#G*A2kSE&$wN?|3=oER=2-TeKS)(rED2GzvX1_j|Ez+xoa}~ zM?{Pa{a&m?X>{ra*ngE%5!J#bR*JT(!!=Yq;v264(EB!_dDPF{v#oK?zv~xHEUgK2J=biJCQe8 zk~sZYFB)6Ll6=CkMgC%YUjChi>Lk6|u2eP#d)$aTj77$$&tBsY@68X<4c`j7+xy-x zF=LQYgb=>?dQ>Wa`N6c?0iD|dIy2k4tR&*T0pk?7{IE<4Fe|`TfLBG?>@o&FB=|u< zr!$?ojAwx$zbn$IP(2R%h$tKKUHjd(^m8iKq=pY{ZAK?JeuXK*xen&xHlYIJi-$|u zEt#twbVvqaa#E6wY!S>=QyBFeG(4IR1#s--7FMz*?-wDo9X6#XX5e>MuYB!mk)&DI zF7P3z6gdiT@`WXlFtYuzgA2KzdR+a49{y%vmuUahJ16eDvK-K}wG`|IZ|bN?EI2?4 zQ;3}M*e1>qSq|MO<@(mvl`en7Lxv=vwf=b@x8i($yyh2>IT40%U*Jidoa8o_3A4#! zg|S+&#Zult6h;1CI6*W+4rea!iTH0Aa6wMrFYpw!RAuL6A7j>3cH6scbN-kRwyb}J zPMsR2@b`!dnTUxhO}cTi3=SBZiVGvnH>-kZsJD=+?re?a!g5+l+uF&cyd3Hp&X`^@ z7c*0IPYWv(i*IgloX|LetLI*#K?wDuL4hj^tlPPE$kPi-?|3B5Tg-pQc%x^ z+)D!LvH2TAy#H`NwckJ=vkqx}{+bu~cY=;`iSqUr9bfB3g70pJ1<}5Nz?hgi{&2J+ z#Cy^wQ5B-OIRH#f0Sb~ub+Vf1dbApDzjvn23cQXHhbz?{$D@lQP@hwVj;h7?t28QH zSB1-K(HSwg8T>~9WJW zTCMDd9PYNaC84dq zvFFjb9m307ke8(F!ZoVGWfcz)Fg@4)PH*p$l?-%%+UprNmILykz=G7kw~XPq{H94j z6cpaC5v-DXN^5rkEm*F3;sX)xIOU`}O8${CCt2TRtov?is)$}2^Qn1iO{snMWrj3P zp&I=jg-=SxaI~#nKnmVXKu4Bt_52?p=S$M4AHu&3VfKIx#4R+xvA4Y3VNftMHq-|< z;H3AtSNkI%8F3(QOCi!em$A?D&1x~0gy9~xEu6^4rxt2aI3{nRn%saAlZTS;9dCDDKv~t*Yr^xQvNxa%|9g7ugb(GyAiDBZ=7vgry*OoA{Gxg zaK!xk8x<+$5O61H!~VkW0#z1*u$#ex8NP~0hk(5VGUA~Z=-N7Qrr&t;7kx~Q1&40l z{YpvxBNDmnWKNw@;XL=__BxmDa#ETCC<4_Jf*4z$C~#m6y5OUv+Z_Y)OtA(uYJV8R zN)i9s^Umk&9A1K;OnO#NKmlX=5J9hf=Zx%vG&6|ie%Sq~yP2MC+9rn2f_iRf&(_fO zh|(ACffAkt9pDliG`<}_0URVvt#4WK&UnOZ2`8cxnJ^*W_yjb6l>jOH(mmL*5>O9G z_&1P1FOjABb`&=x?OK9U($&5=G~(!aQ&90m-YETCL17OmU#rT31E?nP!=-!p$v)B6 zT!w)BSw@ogroaY8+Dk(LzqCHhubZ%NslukmlTg+wp15t zw1hI>YaAYT(OGY(Gq!Gxe==sREG%h|p4e zI(J9l8P3-3V&_cO*oql4F?MuP>St(7dBk(4eiKJP5XDVo|K~3_cY!dA`HJ^&T zT84JACQAFj(RSO8{&ek z{L-^&hILg{kh$41FR&q}fLI*@v=PJ-0@A> zhY*IhLHjDu`jyb3F;4}?zv_tipGGZ`Y>l0V%01pB{QNY5B!W0Ro*N*#2 zU0ggm0gda4^$WA3!9KZ!9)cfn)zm(Jr3M(q_U~)5+Vf-HNN@PLdyKO)GKqK(CT(_) zW3(y(P%B3Xw5LfKHy2qBc$hvO#9;dF4YvJS|LuM%`C3AZD&zeAQiU58y74SrZuG0w z*N1f-Q%YqP`{*iw=o@KCqY;iPH+1h!!+vpLv2Dc~j-&~x!JCpz^&<-P{?bnUA|VrB zb@tz-RN;k%u65;X$d*&gh5ZhUvqNzli6E#`3%0j!Iau`nko<$+Fbt&`fTOL4dO&s% zr~iNsaD!hmg!L4VPx7w{Miw&yQV|iUH_qz{9QonTLp{A~)c6raY8SZ*7gXZlEaBe1 zC!q3g+7Qw)6*IXarex+DSF&!CbT6>IRX9!Lapqaro#yh_@8glpBCMEmVaw$JUsGs4D&UU+k&d1(Gu`c`~*`$!is+&i7CNiR_C}+H5m| zwgAgtGN6`ry+TJ1#ETGmZNO~nR6gkiHM^-;s&HFk5lMig@YSXi2~IADvkN%^UfsQR z2b^aLq}PIWhrQcL?{fLc-|X_Q&(Tw!6&9GF1_5Rv-|6|EZT+6ro>r6hwwe;Di~hC* zyy|kP`qH_<*gzON0UABGg=M*}tewEK_2p;dH3k)rCl>Fbi>aAXGhkC~P4U*yWot<= z&GQO%!_={hrAPVhOePn zNvX~a&Vd;2@sH~;c@~uA?9|<1TX;6M_Nug@PKp#S!Gl;Uj_Bf1MFe%*b?;*l|?Dp@ftxghMvP^s*TR`(~pL=%f4LORm+$4(M0kH3kIyo?^x%czHu>VlG z@#6kazZ786LqKkd;ZP260t<@=&oN^G$|NM|WF4*0O1}#=GhFA~v%24hC23M4asHv* zi@4ri=%lo%<73~S`(*+2J2+2@CC74osa6phFdz-6oS8YaE`58S+f~ zYjD{5Ms|yV;mKbokhsWi*7l9*$1*vKNpk9u_tYWGKZBPX!PFyzGlp>i;~s29Y^NZ8 zRRsrI+ 
zQ2j-fkn5IM^8BN3=TsH0c=MV3v*o6c&3aH+W;3 zu#oRo9aJk_Q-W051M$oP%P)GUqsPtWv&VhE*ZS^p#aL9DQ05&=of0-@n_V3o2{^gj z{yWRdi*c@!uhDXLg-2?yN80E$Kd%XODHBM>InD_xseS| z-H5wHwlPo6w|c?7{qUfg=3R$de>>>@_uBHtba-P)Xg=-apEs(idhNUJ=i4Ky1iv6Q zI_bx`rOa)*!y*q}&)=iecQGt_@GvegBRxFl>re+Bk$x4W63!{7Xys~>Cs-n)>ny-P zxwY7K#y;*{b^zAx01~PzN12jpYJhcJscSmDl_2DOssmhrj%x$j(d9_|P*3t%I+IQ< zoLqtOc7BhuZBI2I8MAkJS_$78>u3uem<3M%&cS<~Zq>Q+YeB#K4P{$( zK6l(0bvOykh2;a#*^CqdU>cC8Djax$pud4B-i40p_gAod`taDW9s^*)ZJb4K`_Tk+2YIfd}G{GK}RNp_@+^F4=4PJlgp zw=OR(mr&b!@|_2xml5z#C-c=t6Cv+4j|y$F89P=5^4lk|6mWvDT0hX5_3Z6Cytjj& zha~ST2l%Kv%_|(aFr%B$p7(0s`noouItyTZQf+lTqT&KO z_U3O-P+S`yY0X!W-;2wg7uBn^epaESSDz*f*i*mE-8dQ#S}Ru++qbu*)XcnfbxsAm z0r>ZHJhV7FN1g^FE3C-#TwcT+-rP@Tl|OQ{oG-v|Q;q9+?j<_flyJ9Y(c2dXtkq(b zzfXfK!$*Ek^Q&*hCos{To?o=1AzuJB=7f}>#wyOEFudhwRY<2xadR|3bZcC4y=C?tfEswq<@Qqsv|jv48rXUD=ao+maj9{24;cnQd2Fx%Fg2G-5i;?lJqG z5#{hdkG&T-^cxq2=$C!&+QG8F{IYfeP?$LQ%>{W$3&1l=)41^9+O!pXLq zZPj~^NgyQ&W9?{Lj+mp<{Ix6G3Y6E3eNjoktMJvHBqd?~*>FAw9>;;R`5a}rlN+?v zP(4w<>qc|8?7!bM*#)-F60hgg4)*_qOa8u&rn20Z!7nwVJhLw*%$D;5H?o)N6~km{ z{hQ z*aHQ1n@583Cdzltdw@I5*LoOv>T16>m<=4DMLT0pCz4uic_gw4c^;|(5T5l|Huq>tJ^bVrw zZ?@{S{Q~jEsD^Kb-4Q>m1$h4GQ@3cP1?ii{Zs27yAr{zRHw@L@`?5v`z5zwr-{Bo={#1@1ZTPv9^@EKn%d>T&!EC0CH zo#o~%Rn*W>fo@B2n{&VieB|x^nW=MD`1tV;W75=61<(!%k`$C-EvN-8cm*vgu_G?>d^Ef;o)Kjt85YQQuNqd7$$^XHnk)))WWoUB(*sGk$AWWa zaw8!=yzqro4A6EX+&Oh-;W+bWrgR_U{LpB6O9kMYcRuqdbXd2Kz+4S$W}N1lwt#v9 z$Rfxq5(BVm$4`z%L5O31^Nj=Uezo8YzuVFPe#Bb{Hl>9g%wBhxf(!yKzG2gm9$u=h z`>Eoe!ANE3o~Ka`ZL z9Okf9C-h4<0sIpX(#0tUpFitVt|Eh7NJ7d4yA#g}JRL_i(L1|A*Oo-W1V%Z3R)f>b z95FH9;5Tstnz$ptpdr`sYG0h@3GLX}0AAe8cIySG6+WiRpXE9KLeX5OCR#ZUmdbwW z+*0;PR+I#M6jXe`)(g~fwu9TcwwBS=YXVO1kv?XE+G%_)Pq-zKQmnap2vftf!9$v@IZ zF@I;E4sq{%;@0jnG8SOn-gPfJ%?Qr$T;m_q`ZxaRa9bU0Y+c~nBd;fG0ZmCU9aF>E zcA2JNeF3fTNQ#vMa!Q50cf|~HGeEgri6MaQUfeHKec=LH|FVY0y2O1%K}ru5P&mQH z<{5b6mQ_*2Ajqi5$i{(=Ke&&7bUUJjk6bTJ;L=Q0Et}R1Y)jGv{*ns$BVoDaHc@5g zFZ>U(142MYrt6|jAwaCXp@B4@Z&&{HL{5yYIvpcpFx-dX%Xp|j{P*V9i{a4w}-D8KH=>Hnl%SQC9xdlejOsm zE$SUMC(>q+d7%;bqs+XmH%|@L{uU0pXq2~<>rNkXCm#A=G{IWZ3F#x=2vHQ2mpfdg z${%RujM1|uMb_5N^9DC3H5^>w`UzwI?V`) zMZOOcmX4s`f)36D+|mfo^_2j&)~TpbePyf(qiYNZa`}q^$wxOA9zT5Pc+`EKE8LCXVtK6Wudxj z=>ZHsTTLGcjrV{FeC`0>Dvp^y31x$NUGib+6xfe1Mj$g=1Y=`Y%dnBilvy8*lABg$ zoy7H<7zeUV5t;ih9>1j`V_1H5_2w;uuZrvweA%Zg7IJN05VF}~a#?}251!rUPve8) z!D6trrb;KFXIAn5|etWwx$Q z42(1RLdYVfgx`pE5ooNeR~a>TtoA#xyxVM~Ag7)Y9yk+664E>6U5&Y|#PjRoyS7`mR_iZ*_ zv)NV8CpvM}B$ghlll;^WCdNq8vL0(U<)yOj?Z1`zR|-LK6a>Q!1UwTMF#D7D>N6qp zS`u2z_y5GavF=x|%sO`lU-CsuIIIeFVD~~}`LR`4 zZHEy*sjak{4J%^QljRoq^`{tN${sK-a9uMiC44r8hrU-OBbOELR^dj(`eHw;0U?_>6yP&_W2rFdt#B$5 zJG7EhX3TQ-+@{irz-RM&=H-R0#lUf_d4dc|x%+M6!1SMy<{cKWD*U@a%M?3IQ_9Mt z;XP^*lg1;if?6N#tvm5(Y_*P#($Lr+W>`NFQUl9VS%KIYypk21yQ0<6_#-FuJmkjI*@eRPP=O!a9nzSm|JvC|Z5b0p#?E~}O&;bh)G5wlj1!xk0z z&gw@4{=<(lzUT$QnH$$k3}>u#I6LdzEZ_)xrNdEwRvoxi@j#->j}r!Uo|cp9Y0Ldz z9=I`>O6oJoEmH0!;A%f?s(<0=YMGkvT(Xd?lelV>3c#^Xd8@tVTxJ! 
znLJCEoRZOL4b9zVP*v-)c8h-}InIKPFPE>_oY?ZXwov{37hB(L&k1<7{5Wr7ezr;S zBZA3@d^)EYAzEc6=Hlm)^3%DyeF-TFOj}ZnDz5@1YfO^0&404KoTrxLRp#Gx*h`5b z50Nov-cA)VLN)8JtoQ>622sMZvV}(I@+^=abp669~#%+TxR!eiri!s=ktR3 z-sQL=#Du@J+*Q28+%0GKtu2H%MFb(Bs>W2D;@=j%@FU94JSu$?y8ntbb*DK>FIA;Q zq+5ihTl=kii^x|mMft@Z*J(~Tg+iOuMETLtDy0o)DIivxPU#W-#=xB^dkak*{|Yj)1b<}87}SxGbmcqjN(mJht>%j`ev#J&axcmG z8E`U^n=}->1sS8d#-5QQa{jjyEvb!@_Fl5s6fc&TntCpbbW=u7qaP0{Zzu#u zOg0f~7fUldDVL4Oo-7la+C%;XZ@ig_KtyjzH}3QIk7-jdnN3xcR2n-o-Dpx6sWp6O zb#!``>A5g?N<;~?s;9up=?8&@JLUHD@yx|1k!2DlqR`OEo!(YLZkCwIox#>uRc0(P z46$~5bsvfgxA0;-_G(Isrx-+c&5s)(?!pQ+!<^Y}8=8I=$0GgR#w3>B$6sJHN2c@z zsXrek-g|KWk~lOhC0FNRILIHd)sAWebX0f7khmQ zF(+$gc%p{|OkB15Gl$d@ss-;9(hdYg;v!7y1k9nNryu2?ks3?ULSKX=M;H;gy1TC& zdD1hbYL(o3(b;9@==fQN_KlyPa4};`)<>Mx`++F_Z3YJ~yZ93v2w#NYyg|SFN4Mff zoXlDpkSBW|f-TmepPY=PH<_6->*W4tMO88e4IB(uDkVKmYNXtZG zzUY*QlcBx2qUvm@I6ptnNQxNN5MGJ4GBo_4$|%VU@!~cBa~NR9xW2h@YFeO8=$jw} z4|5F-3;QIMtI168))xJR^Ml~mU{>}Qr(8adodl*__D5(=?>W^tWBKxZi}Nn{nRI<{BwO=|A-rx=P}{S(B0K_`ZVsEN*NT%WSHM zfC*AyQ`=E**r&IXIumC>x&ATJcenX>-mXA#aTfh?umn0s+$STVEc(LZE2@y>AR`jP zlyBcdIp>$rUo49KbNyD0rn%r3t-34WOw(DOx)FCGLgu%&&7Zc5XrZHjZNvV%{Kh2S zj2!bXbVt{YQvT;f;%EN9A37h(JtO@`R6t(!hHs6<@8m{pvYY9b=;62eV(*}N7EAK( zcU-ieFz*I_-(bj3dT`q*DLR?fvD&4l8IjYwM-Im5{P^adv9g@oogs@}Y>+UYT5;UF z(fMfpf6p8cn7?;7q_KO;g-@)KGB4!^Y~{VpIa`%XTQl(cU-}PSZ5C5B+ ztNm$F`CU0hr7BAv@Z<-RkyD?;L-5ZXJR=5<>L|@_ z0{nxA{RC0ogRs56J9~PdsJvJoGS;rI?{x{|zg%NuclA^DjovL^3LWuJHSkB+H)0<= zSn3|}OBxR~_dkX1vb#nHMF=4F&;pi{G6O`@Z`|*m;3*+oEbIyPKX|8~}cT|Ob@Wm3g@ zOJ+{96T=v;C>!z+d-h^JGssF=#DSwOVBdnw|9bj2YfFey0A<_l?Lzrv9v)c22jl;d zO#K#oV}R0RldTo|C!2@|AzJ3;;a=nIv9x;X#sQfw6IXH>IuXd9 zBPU4PO+t0s-N3spYZ||uCQy-41%=!N2ice5DWzQ{K^?yM8-F`UP(T{uA*!i%9a+*Yg0wjpVy@v-nFzq3jw_d90g! z-s=;?Dk=FXF3DE5`H?nC#4gDZ{#PTH`rh)whlU3)=ZS1ZgRHhZ%M@iV2aK_9(Uu zg_l45FlR=(CF+!HX^B2*4UW|FaAo6>o6gn-jVyC%@N!>-^1F%eW<7YSZ2G7XzjEvx zat~+xf%8)vVuJGPec@s5gt_ynAO)D!N!bv4^U3-5#Ig#~fyrrlZb$#TjG{xt(Y-r*qIeJX+{YhUJXwiGLo?u7V8 zHBI$ra-#jLOoheLYHGTRZ|tv914dX61u7E4C_J%QMvyT)ye*l(ioH`Hdh$0+BjaF& zU8eQ>q5FUrC`iQTM5_|OQ6v$iDwKiOQ3wm0JmA~^Qk(JX+0?={?z1tR1vKZTLcTxx zWOqZaAbBZ#%*Z2(4WV=Sr5bk{_vRDAhXLaHkAQ4FQ=2XWdQBgA(8##>3SF5zLp+zt&C9!3oC9&Fq#JCiOq4`t4R z$0ZAWhWTCHodm>q-@4ZV-xlO(+g-*Sw=;^RWe0-6(w6}9qMDyH&GF373uT&cHi@chCQ+GwjD406C5euG ze!R;TVA_2n|8gY?>kR5RA3a0HdL2tiOLB@widnf6dXZK51aY2=()p$X{Lc?83kCAB zAmdZhAXVznjve9kR_egl6j%Lpnl~{%n*IyvdwzR#r<9_}(KIAv0%m)Ql{>_g33t(F z82ex8&N{t%<<9giVscpms;R`az}lRrrb&i{{}P@+O&rO-vTKUSw|UR9-Og>N!0UnN zVAFK#@Im=YJ}W+%Gr9Iv)pXa5F!7yi=)3Q9cp=w~3}tq-1SGFU=M@+!Xt+nk7mm+d z_q;@WFc#3C)-Aw z*S*zsJ?IXvyV)8L2!}N1wtTBm7f)Ukph7WLTU$l6h;y0`zksS*)L~+tR~{Z^Lf0ZZ z9A~9l#U}D?#!ZarpNt^aQn^Iv+7b2{S9Rs=7L(=#28i?1*31C1)Q&!R>|3tL+J(;X z68rVq^4kznV7i#m#?`kQ4vnGXp?xO0QC2?dT41u}Um)Br zeWjGf$W_tJrG&QLD1$hA@}pxmWb{7rDt54GjkE0PHw-Az|48`jcNb(N|NOz_cAeL90nB`H z&QAAYyBzQ7L(eq_(ZuvA_xu>48zSIRe)#nW&*)J!p11UN7$DI5G((s~kIWX25hJ5D zNXa|Z%il|(xf9lOhH*-PRje0DO8(f?l$46f#8t%GG&&!OD@EY`VZ-i3cgW;U7xSHF z!zc{bF%Q%C*;zIJ3AA^=55c*ERsa4`2+m5-%YVfGpWoYnpNS>EjM#ZWVGw9F>`qI_`ag55>Wr(4llT*qEoZ)J`!ATgn^p32M#u3z zh#D6EgZVm{->V9$_vm7f-?sj?4RiS~=Iz{-P*R1oolwr6XHXhK1r!?6hvI-o^e3kQ zaD;;te=^#d<&j|_+sjqkw3q+PU?qsdVpeC?@k9eUlU7CEZzHoD(G<|xnA+-zuvWz< zutT5>u(n}O8Vfp{JprJQ7weD!e-c%U+Y|?LyNYb=ymO)<)wmxUtRbH*ENi8t+8Kij>gDn(r$w8e@gJvMpL;vkTBHT-G%z zOn1W=95llwH%wg%gEca5?!oQY%rA7>*0fWm7CseJgsM`%125sSHGAk%tT0;K)!jUK zm}PV&@A;%STsm~i`|NQ{^IytY_}JxhB&59gu2%xa{VuV^y!T?}BsIqCH`3RVAXx>^PHRL) zS_@h~)p=qiEuO*mS{-iJ@b`omyBqZK z+U)4$cNZmBQ|6s(PY5RC4CZr_IWIR$nb5t^7 zNFu;g-bk5Bv%uDri-m>_+v6(V7SI$eRQ*?+t_SXe@vlw)y@N-DF-9bhMz&UW7Rq 
zZ!z?fPN8mVNk=fISj}J92%L*{i`}rg-yU-jHA?OOVnQj@awIPDO!iasdkSvgIEOE1 z674KXug=ZC|6N4d!m8;(_sR0ZLfegCs1G*uu+L%C!&mSr??QBk-j@Ydq4ACw1HaJ4 zL<@c(=+n9u34bjb^(LEsZ<`dh4p>82V!EhV#p?4~gW4R$kQ?1neR*ycfOL<^Qc19qGG5OTqGoVM7b-P1{7K3fJ?Fc=%)^6m?~pL*~<$4W0demen8V# zPVto{t9hE*{yGwhFWk@<>q-m^Afpqb?i;2&T98J_s7cfTAQ<+q{X$j6%bX)L0{OUK4KT@KmX_<83%Xa)dTgrsc$y>(SW$O68A#e9g%k4f^9-{ zT>T#Xg}$8o$RqrL->|39ir1X>E2lEXCi(xk)NtPP#x4yN4i8~8|^VQ7w_$A$@s0IHoJF%bVD)S`Z^o9%tm zmh9YxFlR#)n5;{__7e~=g%jhf#0cVW2Ra7+6>%A|l7a|GEdOCj{`axTE*|%PQ8J$1_iH{vgmr{%{mFHMn;d%J@2FupLOC@DD!YF zjOLdslcyAyavG|%Wx@=YreKCx2sFe&9S6{6Ja=Ki;BHxCA^|Xd25QM<-^M&5&duTG znYB6l@8!i{BJU@u9fI&H5S^W-RBav?s?;5j;*rFEQyLEW{^1VzIJFazTla9wtLq*y z#I)UtVq8$9IXAW92^L!Z=?zV7lLdxy%%5aPqzzfldP#j+E7Xx@#%oAIa@5O?Ip9Dg z^ha_et^nmO67CS_b#!ln1G;^-m=E$h>5kqW&2{M}jUwr|y^0DuV!b-Dd!p6PBcKoSe9Y;L_5 zG&?KN^4^xd^Vl?Smecf^*Ap*2DmSjt&@pRNCVYv#85Lkh`Dn<}Uqo z@;&YV(N|vsI(F|#|E7w1TSV&=xr9_>k%2ckB#f>gRjL;TysqjHENVcfn<+Nnp`M(v z%f^`BCBskC3K!m^@XSy%hrA#DJa9VwwXZ#GZy0eOjdPWrQuw6=^*{X0(wcG`-%!1O z!-Ix6O;0N_cUgIgy<+1PQ+cfhtdBrkuHxpxAh!+v5(Qr}+rDp-HkXEM_K0%O;6|ZY z(v8d?5mvo-Z3WR9ufHa!HjNq%jzwi34~Y73$UZ6WkX>xWq+XH$54a_0P27}G{3s}U z5m!J@Z}Z)M_KZ*WA(`)W)3@X8GKPCFBU$UDJAcx;I{)dB4>5>iw4-kqR%{BB>jGu6 zQuShmdbi&9*tpySeszhp0WY$2rFy>Mn0bv*&e-G?{zTgAf-x)06e%4YQh9k!*j1{Q zoZQpAygdBp&xhRU9f;D=C@n4y5%#qmKs%3KxlRg;(oj&WHDtm(uA{-2y6mcQFC<+8 zKI90{G4m)*qJ%1?RoN*ueC^u(lh!peN=VKxd1taX4n$AXQ=f)4v0E@|-~EUMxfoCT z9%^MR0Y^$&%?@}Tv#=XgmRZh-O^+0j|B@8{H=*i)0||?iJG>TAZ;C|JCYcM8flqIK zjSzNlIcJ*)e|CnXgZ?b9n8Q!Bbyd;j5Wz5Kw8x2k$E8-^8{aMtW(d-~RIc(ge4`QN z>-+-PaPTT1nd(HrP)uH8f`yWgODAwE+Ib(Ds^{QA&&}#$OymS0Zs4C<*=+O_??dsHzzT1dc(G)&HH*_u$=y@!Z+l<1&mTBPy6 zT!4u}gQd?-5Cw6C*t^->nfVckiznp%ldvc$QZA>{_kIg;>(E?_1-fMG#ILn zn#`}Tts*T^@`F4@B)avUyk)LzPMpB)x7y7G%>*V_70SSX#(c*Ae{E>u^l2@Pxbh@!EU-0`2q+f3Z}t4obiAn~XFW7|YdOkEML^y>_=qMc%3P#qV9EX-*kK zSDgrQi>KZmS!t&^4l2g+!hI}lTkOAw(}{bgyll6 zLOOSgYc{pWAZ#x`@}Rgr#5Nj}LP@YR4(Z>SBs;jldk*>PX!|*%y~VbqA*3a}&B$aD zjsJ4z!bvc)=!cfL^o0W;KtM~4iS2}&G#74JwVNo+;5N6Ys@e(1&5-m zOX3!(SKXV{o0&?p9F1}P8K|j~Zr$tBzl41+u-n!Jqm!0|O3)w3+*KBaQ{Nv_Ncr7x zRPz<9wWa`D51%zo76?-ilJz=zV?8jT_)GA{;~v1%AkmoOkn7{fmhRUCHgxTdY~t+@ zZJ*i+RS*7Pnx3P^ftl%JoOE~-y5uzxI=HxWdV*WkF5hP9xppI>6EKi~d}sE(POmTU z-eTl3s)vf(?b&#Re<4_v7;)jAHt2$u72Ly{;3g&#c>^B(2A9&_0=H|CpKRMR{#Ubr zvUZeVeUipJ1x+G`bMuWAARd{ol9E|v7n>4OXfTXPe8^2#RE`&o-KjJq+x1RC{8R!% zG-j-b?Up|d_2FBDV*qnyZ*wQzZ%uOKKR&v;NEppY+gebNhRJy@th=5a=5x2jz5gBy zNExwJA?K}xbYJ{%-zRHSef>f(@QLkP#Het6m_E{`jy<=!8C)_`i}SYqan;`7(M%<7OMwVot_f?bnXZuvxV@Y4KUEc^9-|(`) zg~DRMejtuM9ZT`X>KL`$_9po-m9Y=}N$0bBD2PWMlDSFpZV$eyja>nTUS3 zK)0698{<|qr!z-SAM`XLWQ2$ZyLr_W0T+y0_%-{AT-}>(zZVuQ;6}x9ICtD74vU^5 zZp0ih55k=D7uQvh!q;|Bi(CN)+@Hx@-3mrNO3mFy0Sd2A+)V!7)_kvf?U z3g1imk3wVFNyw?5ky7JWOHbgPVpHZjT(Tn!nZ&ap{ZCR+!e8Ce)=(`3`&bq?k-g+9 z*nVG#8Bp;R0>dKU(G7d!9ra1ZHXOKbaJc>}T{|vP1K8*b!2TM(bmOFe#Z@WTf)QU& z;5~S=Z;>`5W5t10y-yuaZ`1=dX?ELZRF&+W;3K^YxZx*akzwHv1whP&#wq->MX9pn zo5A`J@crvgO93z3Cl|&kkD#6CZzsFsVqgg}A15Aia;dsvAKc4gIzf7JCbz_)0rp=? 
zRjY@kXlV8ruW#}ff`?+$3ENIo@6waMv64oeI{w&=o%NRbC5(dcP|?uz+z-%$P<#=n zQBpS(M`*DoDUaYCVJ<7Ag0+cdqTq4)R#_I$Fz~jeB7Rz;a;%%je^hfWG%#u@*qCAJS)DK(r~z&=k7-TE_&8VD z!<#s>Me^s9JuNYrmE=Ui3@3}e9|Cpz1hjJS34EB=1C3_6>093M*<hws1g-oB!{V6YE81DYG!~eX^~oF^sRd!XnR#@L8MSKTH~u z-%3k^L?L#NFk5a{Fkm!Tj%3_b`tN{Deoq^=3~c7&8hdoZ=N{naaByG%GRdmuMdquo za{1K!NX5EyqNMQ^{#{IQR4Nk6{#S3i&pDt!^69on{dsQc%IGH6>E2z<3MaSnWA{8F zMkgEEgZ0kZ+qRkXY@|Ahmu)jVlGc7jKA}lw0#*27GC0!+U6t#x80V^w1==Fv?5aF# zSdSJ^!9&-Y5h&b=_ZmZ`pd^5>HQP0a8fT}M!e#pB9V^8C`__3u6Po0T|4N@BB(W?| zqtXc@Y_95Nk9a29Xa{_*@!CKkDILc^8m90}g$$<3E&MIof$w*Q@Hto9_{!|1FR*IP z9zGSnK=rxX?RGn~pcnU-zjd*``cI}Q4UHyUuo6kJFUlSncjpO=S1|o;MqJ1a{oThQ z%UeIa<}>ZTSH2A|O3n|=r^_#;iNml!3MX3frQ1d(oa*OCFWX**VvSGTyn!%qSRwly zhhZrkt$G&7fd?HTD9YEK<5Wc3K*~`eO~j8q?r(>@l}5^K3!Kun%Q(2-WVHLu23%l~ z4^)9|Ulj-D_?YbXz-JL)T6{A~zeP#R%2;5wpcEf`2cN|=h zeJ+mN|Q_@UYGSU!+e#w=N5bEtfZMrgt-Gx9ak`U?^iItAobr6*T zPc;ef%5BO6M8`ZoEVj4IUUCcfz5m-8wX%D5II_@o_>o~%n1@Tn{PuA1oku3|SBRUo z_l?W>We;i?`yiL-^na)gPQ&rIqEZScl3a9ScW^j-S~1lDje#W*E*M7!NMlt*V;}8I zt9P5H3U=RW`_xUO6BC-e8(Gqq%7s^-+mBeict&JL7e|u*C4BUhFK|r37f;*idrxC; zsYgFi6uut3frY#A+5=ZH#N3zcNFuy!@SWhQTQleCv^H$CglwmO0XCed?j1(^*aPw( z@VrU?Cjdo!1pbV4e<7H^!~FXlOy?8D5YH{WxJom#n3xlbO69}qy&6B54-AM%e2yAu zQhnpT#vDl}ZME;kYO$m&6U*RSxIdOQMgEz>)79A6=KpZ^7C==-UE45#Kbbho5*90ci-?(Xg`={z9aARyf>{cZ4hpLxIe=bwM(zh@W@hjZ?G$J%RMYprYT zO_h++>+lXoJ0_{hf-C&*uPjE^(Se);Tb3MXt@m=C`RH*>OAAiV+wRA(*3d{E{l&|t zAN0ok(xJOS(hu^tA#^b^cH$L|zLF@c8j?<7Zg zg)1wwU~iyz5tUzc#>a1Z57Zzt$4UI23*3cN0;_QWrl4X#=WWxheg-Q^iM0`M6P(Mq*=M6|b08dD53)ks$_gJ0h+R!o65K9R2Hrg0(ow0&> zAfq@fSAOFFC@p$>G&o*97C<3-beSuY0!IGFIQczWl8JRzWXxUPf*eff^%c&mWInj? z$CQjsjZ~!kQj1~<;nw`Ocd{Xm9SRUH)rLF3+`{=>rTFzF@^>y7&KMCIoU8+uVqTal zUt}~%xLgga02I_X2G;?l(62-~q-M#snjC~1TY}67Xe6VLl49KV3;jF1%xxDs%GqlK zsrxSShnt`1lgiNH^*RWC69qrncHy_!K$W5EjcYD}^<-yOPpjiBSI(SY4h6@k z{hKd=mX!J_vyj&d3^@ya|Ct0yHDW112i=?*S)QDFj|=91Wn~?Yi*nWc#arG;$waWU zrq2G#F3e(%t~zgyHHCiV4Y1n$Rb?+#3)Fx-vO}0&T>tx8&gq#Dh*Cpl(#uB`@cGr* zzKEpMeKxbE3dX})OCkc+V&eW7u$vY?%(`>u;8*JZWmyoCVTUQsp$O>A0eTpmcF&X*#=h@Ir3qr}Mi?{*1&zthT>Yo^|zJvK0B>!q(~_H`~cmSnJ$1jFeSiWK7) zixMX$v5TMkiX{hx)6zF@R?UL}5K;`i$iRDlp29IG=d}GzuqTTpgg`X+fQiKR-vSU| zV(JyU7#g$0!rLNU;XmgRA_dLWXhO?p7#ZUc#==TX*RhQK?XZL_Wp`Ch4OkVoZx&QX z?VA`(6Q=s?u@#4!GPYi-xs8}Mro+Vj94J1v-j(`?$pzf9&}FU^6luZHmn-6P$OMq> zvsaQ}iPcfspB?b+y-8dN=_`;~aA~u&&!mcZA4}c;r_&R-5miNn_tCqp_9Vgp)JE34 zKb@3Tn;dBkQq?mUwUrJU8)-Ecv9G$2PB~}AKN}lmuTKn`VsxHqsX=z}gN=*{%KuN- zoZdQ)+ZU3gHk=XZEh8=c{H&T5)PkBLhxBg@SDMsV<^DXNn`<30aan56oEVBW=Mw}U?j>bsA?dTu<85?j!dT1B9A?0+HI6MdwfMcn zclYko;M`ARt6S{E^)RM##(qg!TI}xA1udb3N9>Pi8t}UO12+ttBcuHezmV>&@9x+; z**>>*iv|>xBMlft+W7qj;>Xo(2`~f!dpZY%j(UAbfH>Y&(4ta?Up3QdPgQ9yYM`ev zK5hmRTgy?nrmIVCQ^>&qt+bKJ6)E%R>jk26HB+mDdX2KEOPX9{G4+0Sx*05R{q&4_ z8a4^%y-jf`DF+J|2h~5h;Rki46q|QMpp+RbBF!w>-FB491Ug&9gJeX!BP?F*qIW@# zx{fN1!(zqC>fk`tJiw9rZM-uT-$szN_85O5Vmw!!oBkQQXwOaO%H>dztd2q5St+2ElIYD88sPdLA2&qCZ zo&t({>OD*vL#+$5N~tT1-$J6DRDqcxsAH%w*bku$?6=N??HI4zK~j7GJThj3&EIGT z__?+cfH(IbW_E?_u&?;;pA1S7-09$bxqAu|!#;kLjIt=LEUznbd7NQ%x3Ydwnf?B| zFe}ar-xodT%q?_S%#VsNoD94Wz1vyt`=Qj1fWw8lMM!3HQHNtXl5V|hy0MQbG ztuPVLhRfUD_e8vQNW_Dw%{1x9?zTMcY5bWr;GEROVS#RyHpUpdkmj0179Te{H&--d z!J|K*5h?%_S9S3BI?;A#xura zt=bD3%I0`9%GW(!HCGpzG(>|c4Wo-%`!4q=%t25+Gy&XmWY4SP>LQBAlf zXpYld+;c5WDu4Lui+xhhl=DfPK-@3EQ_j!oh?X9Dsx>-_D zQ(k;)%j%egy2c@q zOq3_#2w=uGB6BFj$w@To*Xi>pwZv#pJ6Y^M(bstc*Ewd5D6=l@Np0p2H(3n(wZi=O z>ffG@N0(-#+kz^!HcLCjByCXNq@Y|8g>hmftV~mvOa>+!#FUQe`cCZe7SQsLv)4UXyKv*9ey%aSYPyy9r zx#ZR39Ptw?gWZmGw>G1Zr2@wHGMM99-C%;c%xq-T{W|v7Dp(;y^V`x@7degXRcF`n 
z+M0b>maWkhH(NcP@2Qh|j4dyj>9mVZTIu(A_D(CF(xCAD{q4P>@J$yxI5=2NLBTdf z0L@CMqobp4!BT^6E0vPYvQM69aU0{!@XS+1DONTNpgeVN;Y6;_m*1Ovgpjzo$LEMM zV0T*9t{x3GaTWvkh8iLwo~`ia%^N9B;mp??84I-!LB{vP^EbX3e3;{!-!dpc@n7in zs39UeHwa_-1g;1RMUp(MErD=8Ho9iNWgU2gozk50bA_+G!YPHp;OpX~f&dJ(=?*`v zHP60@9YP&Sr>6`cw)=jtP_kBoGC>yD!dWB;5_ujd*-z=2#JE6}{`-0HufzJMFWoFi zfya2g7fK4+<)M<8%~5~`sGIr;mx>z}ML95-R6iXlHhU`fW6Z|>7A5$FjiF40Pak0UDf-b#W!$}d&B z_q=!RN^e*d)oTJBmIhd2T0+jKEpsyuvZa{dAL{;G+6boiEayY@@XvW*;6y{q4ZC`t z__Q}AB$d4VZ+%@|@>UEh*uDaY^5)UOMuFtrL6l;9io};0s*EGuz@{^ugj4v?3e!u! z+$1jtssg=qV9zs0JV0SA1)Rv=b^?fqjn-mt|3^zh0bw>dW8Y#tLQzjR3MBV_i_=cHfpMC~JE^h84 zC!VP2Xs81RU=5|UwS5FtA7^J^O|xKPZm38E88=$NfWj+YKJRqhzWGzL_;Orua3F|O z6-@4?Mdjq{S2i#~5@!yaIvzGWB2~~;>UtFiN)JI4Q@K2?;1kH7HIDRQo{@P>}1$Vvq1mQOPx0_`p;{|4OP zIfim|8WkoBZvb>#%Bm2s3rG}YU4Dy$Few$4Z@zx{^vLP6f{YBRtxvBpClVIIN^<3t z5dc)3SOEoYn=W8CjgJaJ%Vo~LLP!fDERpo|GE8ki(N%!bvmi4l`Rm}pgPig+{i%+G zOJ0`E3sh|6a-ENML)A%j(CKP?yaNGcnz*=47pg`mHafGr2L&k+dADOu_qJrJAgi%k zR6LUufu^jNUXM}yX9a_dhhjU2Ploj5rDrc#fCHM7F~}gvAI+|=@IaG}`+^>c%fM%y zZP*@p6TsfOs&DbfgCh0jLe<%i_G0U*GjX-t9=*0aZ!*bU7GpSnI}zL4tlZ$9a84xI zTHn03K^B}s(6}ew6JE@2cWs>bE4n>`lVYY~^3;Ro{OIF{i9s$=qcX)HuH~c*g8?oX zTts`CHtT}bDzzj^362eHnMI>rY;tnyH&NnK78aL%trHJ@$tgIH$`2%Sy&LN8E42a6 zSmJ}RdWcd0I;CNV5b=i6x02k%XUXQEyoVMy1T8?QELHDR3sm>?F(86H2=4h^F9--! z?DySoTyxAs79!RxXQpjyZ4Pu8CQb=|=ll?P`*NN%%X;Y#Iq7o5_a72s%f8F-;2|`af15Fr)+Ld!opvS;>(vWRaI5L&s@S5KmU1wPoaLQ*d+*$ zjDdm?e!&DF#a!`9I`JsU!`~_3gTKbdvxXyVM^C4Oh5lVvv}A1md6~5|YEufxlGq$u zoGamP%jIz8WA7qS!v&?rNC}bAm}xRT5{Qx$Yjz3jb#}bVX!ESe04*q&<318AQ{gK; zh2K9NY(ZXjmrPZF_e{{lb3W?1Ae_|0zrgP^Kud&*CEwB#kG8e5NHZW>TwQz=YS2|9 zSDkFj>ON|PM-9KAA7EmX4w`Uq)%LUPUqi&}f-XTSN;{mE6!i3A%A_L8TC+<#UZXx- zI6p?2DzCY|U$DA2lN0~DZeu`f{@*of^}hVw|NS;oS~d#R@5r#$w(0tTD=@OCsGLv0 z2>SaY$}J|jtuHyjcaRV5?#Nw5L5)cC<(?qtHkJy4PlcnWb^7}I_rgS{Bo2Ee?(1xJ zf+s(GKO8;h;XjYpsd{J&@&_=EGOZXFJe^sFDo?EeUy$g;s>yq4(c_&UMGDmZg10+n z!(;e68TE9d5m}+fEevNzPuXV7u@Gl%(m1grWWuS#;% zmsRCkTRk(!^XR0j@@bhdU85+FZ#D>Y&vY(!hL0!F`N`2tcF8Kx=+8?&_w9ta!*`U)bGW`luxbfv)2A3&)EMrD z@QEA+$?C#_WfjsKe4q2<_dCER-ZYH)yG3|b=1q7wBR z)Xw-`6ZJ4jYh6#U_u=z+3D?g1EG+A+p|n2xStib0e`U^}yY=Day~p2H95z#ON&aKu zBvZo4+e_44WD0h^Lsy=7&Z5g!KkHoM^3BP{UOdjwR*%~nrIlsP^hxprDhV9+`=>Rn zr*9l`gKo;v`)IJgjYWQ>!S7rX?@3IQ?W(DU2~KVdroP`X9*?$I%}$+{9Ep(RyjRrj ziHz>KYUPPYa6OzB|AT>Pqcet=jK+Mxb)m1_y$AhMEqVYy@bvd6y<6&Xq)|DT=8om# zvcq-RNInyW;hSM&h1DD_YXZ=U^I}(=0jbl)ATl7qqFMihZ+uZn)Z7SJmAoxR#7#L_ zg|_KrO4RuTg&*#%)wmPx#h%_yKfglD7E&4sCxb^H88-jbD;>t2eiU?zFY5T~Di)0~cn;&$ZkMWfWC8)+lQ@?Vq|Wv|o{QfbCRQcJ!+67X z&?!jQZXw^`d29;l^yrHZEN^}fYhS_8P8h-%tfTDJrRy?W7lbhwo!1xLv|*WYm0@|x zk6|&`&hn((P3BV0)!`y~pMJcQQga}qvl5}Y9FeODYIWZu=@i&&{womxmf}cW_rtpT zDqm?VxxKO8uaI(=%AG%Zgq%{O(ZuOYO00`7!|uBf3f&@h-88vDn&=ex2s0?;1--u_ zAhRmzte`pWks(G6`lV(Eleq8=x}R2`@vLzmgq=GqY1t<@r9N(i8H?HW>M4E+OF>Pm z_t@S$W7T&!mx4zIYh{km6DchZBpCgTWYOD}Jt4s-FuuX55@$mpRqRu_dQ101R7s{< zOz@H29HbP~fB~YgdxaF^| z3bzF{#VQJdLPA+6CS4Cb#Gm-@D)7VSMgw{Cb1NGRX{SGbDb!_m5yH%Y`rzOia zzfZoJ*@Jd2ORTQk-yrv?ib6xGitvNI^>ejvo~&zK*udzeJEZ)*vHy!#Al8}j2Mbf4 z!IDaOj=jukfx77L3QAuup1|^8mxaf=WAgrpn&RU7ed|P&4x^qe8SZU;4W|LkEJGv3 zMpyKhc&TdFN?RM3(fZ?7!fq*tsz&AtKsKuyv{(uMWy2E*^b;_~Nb=pk zwRf#KZVzh-F101vfNbv!)}6xk@CE3?V3x~y3RRJmqawkX6mx#y_|=aW#@`EmrcM1R zsAGj_1n%Mjqn>jW9+uvh!}>1QMVkU@5*4c^t*G+oBqL);b1B4=-b2z za%m|yl)st``V&G&O!$(hXn90tFNGG7kULz6MIbGoZlby6D?CZg5ShN*$AT|^s+>DB zb#IVZ7Z%bZ{abzdSL}D4&WLF0w#MEhu|D^lT}Tuk=r`Q_Ua9Dg)S# z4&SAm&bMh?Lzfs#XE11PdtRltBdT~FayM+iz3%;q}`z^cgWZ*QG44;MRq*{kd8Iz@8i;&xR;VD)@{ib zAl|HO;c@Pmk1ikD)eYehA;vV^KTCztS-OfnQr=%pxW6Miv1*8Xc*vf6>v}=y9loQ; 
zB;znOkmOw7M>qki^h;&9fuCPs=-0?#3n*}Nz2H{DJKr;y$Xp_)B}o*nT@+rcv2VJY zh8>T^T!r?*rx{%srCg@U>c@-IHS9bgr)iFwBDzLCxY>hz3r%iUa$Km6U!T?c$UEbY4Xr8wRP2J2F5ZuIt zKC)ETAMKwdozEZ##<$uY9vZ_zrMCRQ1)@bC;hN{p0}nmRKtFRxZFLaXL}PsRD_j{BMm5lrsG zyuNqrRJFYBLU1SOzK;3y0GA#vvDWPWg?t`S)c>Ei4OL?dlLwy#2zgahFa-t%x=@pI z!bV3${YXufwY0R<_8Z-CD7j>How{xL5AX}W$dAWbl3hNvV+d{u{c|f${Xf`MCVeu8 z6e+sA1$ou|Q6?YTuSkhDl%D;rt#2>(M+)WtU_#qh@#}AA6Cz0U`Yn=P{kQo^#K^;M zq^N@oMu~A5uep*g$o?T?8LzSAHdbx7WYd9%g&9kMm)K#Skb#C8&g{eO{!XVuNW1AfMR=mW8nW^O@CSP6ak)BkY{!!WU?WRh)OQy(Jj|`QK zH>lx@`qMmhOjy|-R^~G2x-19U|G|-POx_$S^wC8ILVKm~e*2 z8hSUaHdbgC=r49q&UsumQ>oZ`ofz4ffk{_8`|H*z zwo{x0od~@yfDBzLk~lVMau}09F6s zd3m1u-e?XDEpqx>4ia*@p6D3iy5?^Ywf91olg0{mPL;k)?w6W>!V&-$%c%B1;r_%g zVK*`9&5QW(4WBW(j0=n|FfU_V)|Fd!%S}N zO!zV=cR5TB#h3QE$^`mG8hv$~>=7xJsznpgDUQK7+P+p zA{ifvOmQxg+?O|ScwCTup(1w5j9sO?Qfc)!q}q(YQfQT&&K#y-1QdseANz5B3}SPE zUOPf_fsiMSsNtO--;k(`DAhlQjCpANT{|0(X$fygiD!q)%8}v!@lUj8m25R5F-?_h3PTH5k)Ue|>-SURP=0AXy zii1#NaDS>ex6&rwEj3;jzF;uvtg_T+LBwOG`zzZ$Sm#v%z1P`WYLVhW(>c$+P+)NQ znA<`^p)E#?t9Lu5-J??JUCvx;wD~!xAsMSf=RD9Ahx3xyq1ZY2?EZ$zp)t~D${kDA z=n2eo{dMM(4A&Rc6Aw=_TI$ZFO!DB2jB!gkkD*Gro{9c`d&Z*t#c++jg#u4zc$}{ITg)>MT+w!oeAE|Wsbzzt zM`ADoQabVE;5T#pPs<%AS2UB^;cb5R7$KVPx|Qgbg~k_&t%gh86y<}Y={0I}H=RU% z1bopt>a@-z#=oJ2L){xK(i8hCFbo#Rm>RcVm`=KXNOfZKZ-|&lPL-X6+&_PdZ}O)J zg;%02&iQXaP-ExjS7SLjj@X0<_UX#~3)L8LbHSGz?uRPDsdgkG@N8LEle?PGoM;rC z&*abv_4?D$RwljU=lQD08&F(ulglV8_k;@C`)6-a5z7DM5HZ$S(?xS@>Ff>`zO4CF zxm<6B8U|-tFfXv!TiHj%ik`S2&a=|fF2NfHgB%;%ggQn!cOy&X*!044E)jYN#Qglv z>%ghA6>3TV>Q_#s+V*i5vw3mL#mIje9eAfL=KjM?%nGENBsgI(1mZuMt@yunY`tqt zdy&gc0F(|5!gU*o60@zSF8OLzty1>-p)ywG-b5;zMPb(-v}^fzbWy=)EcwX?Ce$ve zZt-Elz=dU_(<(_uWb#jHW)82zYF1^2vujR~Xa=6WCHzA1yO5N`n8Tqb-F-|KqFEqI zjG0mI0{_`rztPQAdrt~|hL{<5 z2Tllff5oXB{P1Yd$E31m^-v*Z!b0c2ZLobqDvh*dmxBkm zyY+ud4Ycoz0l$gKekPeCi4f*NrgQpfJwKBE`bznFfEd%PErtKwpQ66Ay_mySX7@Z{ zVY!>ZLQ2yCJ~GCG=1dh|Mz$vgwZm7}QAcD&hd`}lUZZ}^e>mJQzUgwDH_7fqR@JRT z@cYe~h5Myw@~0R89!+&PY+Q`boMu#VJ4LMiY4{;0U+7SgWJ@qJt#@te9j>&px=me^ zhV8XtKHp^9vk$V>(~Ht5e+z1t3ui{Q)WGcL|E6}oheQY|5&^BHzt1rRmNR2kGO6L{q|BRe5j~{bL(M>svvZZ)3?uxZD_nCO7 z7d9JSO-GI2bTT1O`Rz+pM9etcs`nXjyyg%`sf&)LA{%ba^XD%WVt3dY(V0;@_N{fXYsN8SftBDGV=^h8i!^gR z-~j5~sp>%&%9XD^H_|-D=$2|tf}Kn0T2gY53R4GwHsN6S7N=;u zK@a|eF%69r^GrBCNB{OS1nY(|#W{Z8l$#q-AZo7LK+y~Q1#9gJR)M| z(jQ{3%mGeuZJ8Ti6CACVlg9JkA7F$%dvU#k!@1=(UVsGJGI&$|H*r9mJR`rwnZ!tk zTQ#*Kku_>wSMhLSf7fAA%$1~1D9Wq7-me-rILQqah#_k&@ddCE{-RtHm%Vh9D-p7{(Lm9PS3ge}Xk#dw(d6Z+eDHuYwt{|g*xLW9F)Gg3 zp;dcC9qv69FIZxkolY%sXd5G8yM|E@mA`Jq_=_I0?`P@^@)&8l(91Y-dHc$CRY=V* zYQW5HU&vKDvMLTWCkH2?3Xyn#5%ijDSg2y%B>@`A99kN0)3(8J5tAxk3nAFz?7(JS zLkm>l75L2loNaaF^;*|3qv?ofoMRWeNin@{XVyvi48gAR&OKLg!l40%^$-+g%vuLl zu!sJM6i`#Y_BxO+@$#ncp6VNGdM~V{S39uJ2nJ%adAzsk@fzQA4H|X4YyVM>z_GEO zs!+Y5%w&hqN5B=oGUeeLAN+)XmAyO39fLXMo`QSohW^W$>(Rs;=C!0b@nM{gJS(}@ z-O)gqysVixZASp{I#K=UQES5?ezVr|nEhJn{-eqz$0hd*fCT|(YIasaOG}I1ht!xj zb=Yrer)%}-)Sj}rq+AajsyufEplSG%!fXdDU>Fz-cAo&1`sAw?YL^5FRBR`f_F);9wXaR+axL%GU~s9~(98;T zJHM|X64YWd=0YxB$LBtF*erUuYwD@* zP*D^sFYFN$+cr%>t8K|yUA2un?3PFgzwnclrh-Vwx^-AbqU)agHCZHPM|WM>OlWIL z#8(h^*3GVw{XiU^R#X(6k&&?&C_eOojEwBh#ztgf;@njFX+yHP2XxWC^UL}m{aijqJ4lbq^Y?FpaPTCDUDXV}An9gfUQPO%v_DkHyiKK-%UpHwpK!BGx)EvBmCePaQdiiI~<)eYyayvi?t!mGsMBH)DKWtKM=9$BiPkxw?H9F><|(l z{{nrUk50Ov}3(3nlII$2p4k3-Y9MM_3QUt9`4ng-0=14k;y2Y=5H-1 z|BXT0Mp&F%ASTA1U!_L+@A8`6&%Z;?p*aO%UI$KE;n1-E#;qy(%AAj;=4E-B_TqY+ zm@TAIHo6YR3PK{&aq*abW5}p~T&}`!C-4<(H2@3*R#vBGEjT3H@u}tB_c6hK0e%T$ z1B4S(>0XT1oE6SRqlUwZUG>GkhLcz*m3&A^p*3#g9DiUg-2f>J)%JO8BXwa!BG!QT4;;{jw%* z&UQ-q55AM(EGWW^VBLses(1ul5x0k5~9JGrT{G+?xW%WZEj_v{Wf;BoG3 
zl>+=ztp}bA8U$OL3v2t45EI|Z(GX_U z&e_oG6pH@# zj0PKoyRx9~Ge_mu(JKU`(6L4{gEYBEXEn5-dgWKd(g$}NA`xJJZzq=f5K~)k>U#W$ zp`<7!4PxZ_--$&Rac5?=d`gQ~`O2Msz@F`df%{U$2g~owi5p6i=Svq<6wT;zNkp3&Mh7i;X%#e9hZ_;7b&2{4^o`7z4{b_Nc!mB_M&!qNgr^& z?Angn1i1vx1Sci|*SdT~b^9^y9-hVajGX+FrIdu66`6pbivqLh@k@^o_0UU}=t$*- zx@aox@7+)PZ9xr!Gh;*_X?ZCIWH|YFwq)*uUVuYJfc{*9{9IG>z*w*%j5e^|Ah>4H204*#76i|Ir zRjsNz*fSC`22(ouVQK;krubVw-OFGDCXvt3MBZ-ODt$fKPt8_X-~5Ag#%K1>zToSXf@2)ni_q?oJvJPFM>2J+t_kE5|G= zw&}h?BMJ7=2xie%4TJ_YtI2Xfo|uo*z9l(|)i9Ew@xHFPHYe0X0K)GY`Om$eWyZ;F z)>R3cBKe~HUAj*2 zWm&O`AiF>peWK+pbT5VQ8m#`s>GJFU^@O$386o0Xd)d?F&?@DJz+H|h-Z!~b`KvdP zySnC?{HlV2JK;OlELHnmkmVwT!dXg zj?XFM=tEd;BtG4nBl)@cTdJS}-;kcKrHAny;%EZ?A#x$$O%zQ~7m9qL6Tf?fUB$tBgFin^rEHQcqpxRHYe z8qjgaSC_Okh!5dqPrgTZWlQ0(t*+IWqRKO#Pm5;WG38(~kO|JJRBFSwRJ%jyM%D(W zVDWq{{rTjkO-*gC5IM0;$65weh5nWgA;;s*obD}pAEk106G3)`E-YEd45>89L`=&1 zdT7vw-N}te1#faBPZvCuD=<3o|3q9kt%c*PvLjh|Iij&I)%yK@@Vj8%iy$Ys5OGb} zZm&)+Q948lVq#cX=e-6ccf>|$pVxaL!~H)f%MOONZAtWu9M^KT9aU&7&ADvdi~TAA z6eRge6rn9Yf;J!+VcdRem94#blMXVl8!~k0Sq2)(Dm^cbsbP5O&*5fa8xuzIUuS>4vh~8(BEU_ig%GNF77t zS6Y?f)F42FBV>x*8M=gMFc*fj#S>5~&XS;h z+WWOd+5GX}TF5H4g$?j0YNSql zhR`_M7n0XD;S67#G-SYb-q9Mo5X)S?Ouxak1TshK%Tp1iF|L~IrcX`i16;@XjJ99@ znXh*-!scCSNsAxf(A5Xcw|DV6oUMm)*4gSKe!JRJ?UD0&SQNeqIeHpHUom9?FTJ~H zvpjVwdblFqk(tH{mk+`zUYXfz^mhCM6OMj=90pB_2REIuOYJK6z703JNHWUL!<+Rh zq5J>g0#w#J-Kl_sZZHqsrUoWHMNf&DMi@ zZH$;AH1hmHmu&SJo}`Ro6c8|SX6Un!V%K<&$q6Z(%8UVtkc<94{H{Qsi)EV)xrMst zFkqur61ZW5wy3c4xN)c0f%u``wo35}L?hKjr6yBN*+Z@*Xc@C>JA54bTE{Nuq1Ep{ zpQK158EI%%f>-TSiZ?41R+~jo;s@+E7CWjd^$~}LvMTB?v-at&|2RG*o|w`)1*L;I zH%<(Y5}gP2HE^Cw)%}5(6l;izv}Gw`Cj%4(|B@L`CVA(pp@vF*yy85M0c%ikoERvICk7pwq ztR8dTG(KII0=y_BqRjvl`IlQ7y`}dYA1&M=Hp`+pD&QkYieqm zo}ZsTX58mQ>@^lHLX6X4gAY-RFnZbuwV7WesCVwgAV~NvDUR0 zaNZQGADL$4pWGU3gfpvCz9Gg&T(RAn<3Q!?WRVs~f9_t%%>Og;d zI&8$8JNMU|IH3s%30m=awd>)`ih_c$M@~uB9P?A@Ai)g2y1lb=w9+88!*x9FY47PU@D|}B`YF?%qSh{#343V z<_~AD5%2#M1ECrn#m^rOb~cchI2lkUI!Qiew0(GBR@IF$1Lo9|kAS50qkrYB$Za#o z(|%q*vX9gk>I-FX;V*mgaX7I_`*U2*OW$wMU+|cyTm4m1WkufNNVmLi!UbstI$3Bx z&F#4p^uK4kms~pktDZK zriyE>hcq#!tMkKb7wfP>)5hj^?=O;NH4! zieWC;Vi`jlVGtrmtM82#QilF(0QsBZI#>65 z$ckyr%lz*KyHze|X%ZoQWobi5C4D1CEW39?GT!IgIe1g5CD@jn73Up56bmrQ|l^Mp`nYnI-nqL?21)9z`Z!MhnQ3h3Q zm^bFnv&WN|Kg|zBa^HQTqVj@dWVe2`91+uYO(p=)u zX^>S`ffH}J+;1l>>c~pJp?Lwj)a9@}hL&p1VZl_!JC9>Il z<6~zPMeT}^32=O{0kMqokqh6|Kx~}D2$F^SU6`g`(R@v)^Aseasz`%)?~4@Qz1g6F z*GH1!_GJl@N*xZF*u;W8D_92ItyjL7(i<}ei_%-Ou`n;Huj>BTEg23b*}Y$?<6^zw zhPX$u-yiD3;O|!{xn89`#q{_+hCkipOF(~5K!f1uLauL|Isow2>K+9oz%ZUAW^V;{ z=^A}?Jz2J|&6RA`=I(M&R=#|#c+n;PVMSt_KGKuEDgcH_eiqzMKsn?pgq-k>j#eHT zZRnCkf=q)$E}mz(-7pCuQf>q;%$&Acg!C{^XvbQ2*-Rf1^S4p^Qk~D3Y>nP&rKk{n zq)cNPViE{trZJt&?eLa(uFHl9&49t_Ob~L(IENo^VJbw<=aV6BmV>mX0<_Y>iLFU%j&39q zcbyxhnYv5JW)7AH^EYdMsU*a^_1?7p#R|mKZvR~#*^9>-`pCxF|E(eT;@VwP52d>L zM?C*&UYO1JOptX0uM$Qn#q?;Dne0G+TiHyU<1IzJ$B{J5^sdAFR(#!9WhhrE(4$E% z{|PcAUysl;Ta40Zxt`ALs$aY!$sJnTF1_EDgWu_0MN;p1I^sk+{r-tU{TJSr0`~_I zRQ0fDYNh zBU0X&-k^PSV7hVrQl`A8Jz1Z>>{2?RA%En7a+{m_&p9FSqYeQyoU%LmR-GwxS;skw z-HTyNl0I&IR958R<6F-nYU zZpgJZpiy4N>0>=`ltft{SR!zF2E;L3_E{7Tey~6Sw!{xi7!Z$N1tH&=z{u4s+b3C~ zfB};XNwnF5y8T^-EFT;F;88`;U*U{((UM8SCM{?D>G z92Q%AKq{0vTWH?KVLJ9>v}}8xUxPZf9u91VW?>5Nwq3aYcYh!6qWb~*UTPY>?rb8M zx432$i8? 
z0HxkcyWR5SXQk|;RDiepevhO4Z6;vFTF6XC-J+wP{ppk_=xku_pXoKaO4oNQx`##z zom!vxf`~Ik=b%OkDJ`q9-YOA_&YVNleKsLWDGqVQHL9R5WTLDt4X09lF0o8l7@FqL z(kP3Wk|HKM(Fv|*86X`dELakK&gPwWVEVgC?=_|3;nnA>nqHC)GXkr`9m&$Mv(SF1 zdfLZwvUTIylBR~(aJJN?yH5MbJt32)&OknKU!=K)+b?$ITJEETf*2Rn;t+{1aKJ!z zSoyZiKVBmEv0>Bm20h%QtiJN1oC4mPvV-!NXSR$zR5XY;U|H*&`-J)iD`&d=p!9SzSS50Tz+n4NTF{-i5f?4kursZ|4OJ z!#^-3I`5gIoJnx>w1;}9BqLu-)m!k{g{^ALI{d}4i%G8yGz4p62^GvYau zd~V;y^w$^$_GkR+CW{)L*7T_%Us%bbzr_KUC5MS|5niw47V6GE1BKx$NUrmz>0m(M zjmn^&Zs~*9q+ToD)HXB@E6`5Ex+w+7ri#WJU62~d1)5(#jTz`}eP4%fa*9AbheLMerp^qs zTU7`To^K8sRej)6fpD@txJmB+Ena(Bv_)Nj{319jzE%XDLu~0PQBah-pP25M)qQ6p zur`$o`~VX&rGE+i#n%w3vv>vMV%{+KcT{o{tCptNdnt}UcYu5L?(U6(!HhE+`ah`6 zUhMa(F=N7Ij`!7helKPS4>d_oJvXrevKwAw%bv1kAQmF@DrV7!t{-C^4`VIJ%+S)8VJ$jGzP!<~)Sg@a)*Igzg>@prY|2rE@ zjZg6O5?Z(JbY&y0^Zs}^(Q$wZhQ8+)4h%u6#}MR4=NrQa>zl%P3+=*r1vQ>2qd0O= za-GoL-sI*Y(d4{r%cbNB^7&}_WhKT%MHQfg0rgze9MljdG zoQ~av9G9t4S-JVPTa;eD9$NOEX^EGSGi7wDIrg&e5T0qYEBLT*>E(EN|{ z5=zNci$O#>XC7L;NwX&K!619=VmJEpPg?|#?#RDio!fK6rlK}Bisd@Qm8k9{84z(M zg%ymNpmSclH|h}5B1BmzUX2wt0Ygbk45ML2@JfZ6Qm5^N6}TZHXJD5igc;Ahj;NHL zEhbbxN;3i}XV1u6l~vhO)mQ!OMu1(lO+*&nN+>A3W?@HcNluh71&KJ>MV%bm8-z}a z1Ek&9%D8pK7*U}%LV^M37^KtDea2fJFIg$%_NtoALFg(!6wrI>Zr&M6-zHYrRrmPJLIv%eIT=T`OCJ2w7FLYz|^WQEMsJe#os96a{(u2wm9&eCl zqWLT5V^ztGf#FSNH*eixpU&p#(NoeAR*-jz9ObVBWiB7hQ+8elcbhRU=QZTQI;F+X zol|K1vYwh1=CGWg%e<gM9I gx)a>)itthnn)as4Pgb-_(iTBbRUMTQCCkwN01|6BbpQYW literal 0 HcmV?d00001 diff --git a/doc/tracepoints.md b/doc/tracepoints.md new file mode 100644 index 000000000..a6dcbcdfb --- /dev/null +++ b/doc/tracepoints.md @@ -0,0 +1,126 @@ +# Whippet performance tracing + +Whippet includes support for run-time tracing via +[LTTng](https://LTTng.org) user-space tracepoints. This allows you to +get a detailed look at how Whippet is performing on your system. +Tracing support is currently limited to Linux systems. + +## Getting started + +First, you need to build Whippet with LTTng support. Usually this is as +easy as building it in an environment where the `lttng-ust` library is +present, as determined by `pkg-config --libs lttng-ust`. You can know +if your Whippet has tracing support by seeing if the resulting binaries +are dynamically linked to `liblttng-ust`. + +If we take as an example the `mt-gcbench` test in the Whippet source +tree, we would have: + +``` +$ ldd bin/mt-gcbench.pcc | grep lttng +... +liblttng-ust.so.1 => ... +... +``` + +### Capturing traces + +Actually capturing traces is a little annoying; it's not as easy as +`perf run`. The [LTTng +documentation](https://lttng.org/docs/v2.13/#doc-controlling-tracing) is +quite thorough, but here is a summary. + +First, create your tracing session: + +``` +$ lttng create +Session auto-20250214-091153 created. +Traces will be output to $HOME/lttng-traces/auto-20250214-091153 +``` + +You run all these commands as your own user; they don't require root +permissions or system-wide modifications, as all of the Whippet +tracepoints are user-space tracepoints (UST). + +Just having an LTTng session created won't do anything though; you need +to configure the session. Monotonic nanosecond-resolution timestamps +are already implicitly part of each event. 
We also want to have process +and thread IDs for all events: + +``` +$ lttng add-context --userspace --type=vpid --type=vtid +ust context vpid added to all channels +ust context vtid added to all channels +``` + +Now enable Whippet events: + +``` +$ lttng enable-event --userspace 'whippet:*' +ust event whippet:* created in channel channel0 +``` + +And now, start recording: + +``` +$ lttng start +Tracing started for session auto-20250214-091153 +``` + +With this, traces will be captured for our program of interest: + +``` +$ bin/mt-gcbench.pcc 2.5 8 +... +``` + +Now stop the trace: + +``` +$ lttng stop +Waiting for data availability +Tracing stopped for session auto-20250214-091153 +``` + +Whew. If we did it right, our data is now in +$HOME/lttng-traces/auto-20250214-091153. + +### Visualizing traces + +LTTng produces traces in the [Common Trace Format +(CTF)](https://diamon.org/ctf/). My favorite trace viewing tool is the +family of web-based trace viewers derived from `chrome://tracing`. The +best of these appear to be [the Firefox +profiler](https://profiler.firefox.com) and +[Perfetto](https://ui.perfetto.dev). Unfortunately neither of these can +work with CTF directly, so we instead need to run a trace converter. + +Oddly, there is no trace converter that can read CTF and write something +that Perfetto (e.g.) can read. However there is a JSON-based tracing +format that Perfetto can read, and [Python bindings for Babeltrace, a +library that works with CTF](https://babeltrace.org/), so that's what we +do: + +``` +$ python3 ctf_to_json.py ~/lttng-traces/auto-20250214-091153 > trace.json +``` + +While Firefox Profiler can load this file, it works better on Perfetto, +as the Whippet events are visually rendered on their respective threads. + +![Screenshot of part of Perfetto UI showing a minor GC](./perfetto-minor-gc.png) + +### Expanding the set of events + +As of February 2025, +the current set of tracepoints includes the [heap +events](https://github.com/wingo/whippet/blob/main/doc/manual.md#statistics) +and some detailed internals of the parallel tracer. We expect this set +of tracepoints to expand over time. + +### Overhead of tracepoints + +When tracepoints are compiled in but no events are enabled, tracepoints +appear to have no impact on run-time. When event collection is on, for +x86-64 hardware, [emitting a tracepoint event takes about +100ns](https://discuss.systems/@DesnoyersMa/113986344940256872). From 0bef1e943588d0a50d54d161867b8c6bc6223cf4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 14 Feb 2025 12:32:54 +0100 Subject: [PATCH 378/403] Doc rewording --- doc/tracepoints.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tracepoints.md b/doc/tracepoints.md index a6dcbcdfb..f90bb051c 100644 --- a/doc/tracepoints.md +++ b/doc/tracepoints.md @@ -35,7 +35,7 @@ First, create your tracing session: ``` $ lttng create Session auto-20250214-091153 created. -Traces will be output to $HOME/lttng-traces/auto-20250214-091153 +Traces will be output to ~/lttng-traces/auto-20250214-091153 ``` You run all these commands as your own user; they don't require root @@ -83,7 +83,7 @@ Tracing stopped for session auto-20250214-091153 ``` Whew. If we did it right, our data is now in -$HOME/lttng-traces/auto-20250214-091153. +~/lttng-traces/auto-20250214-091153. 
### Visualizing traces From e19bf100b9c99ab2b84f47ea6a8ae8f72fbaf686 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 14 Feb 2025 12:33:09 +0100 Subject: [PATCH 379/403] Doc rewording --- doc/tracepoints.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tracepoints.md b/doc/tracepoints.md index f90bb051c..0fa5b8652 100644 --- a/doc/tracepoints.md +++ b/doc/tracepoints.md @@ -83,7 +83,7 @@ Tracing stopped for session auto-20250214-091153 ``` Whew. If we did it right, our data is now in -~/lttng-traces/auto-20250214-091153. +`~/lttng-traces/auto-20250214-091153`. ### Visualizing traces From c410992d55ad9504391dd2b616d6bff7ac371f30 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 14 Feb 2025 12:34:17 +0100 Subject: [PATCH 380/403] Doc rewording --- doc/tracepoints.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/tracepoints.md b/doc/tracepoints.md index 0fa5b8652..18b7d8f29 100644 --- a/doc/tracepoints.md +++ b/doc/tracepoints.md @@ -96,10 +96,11 @@ profiler](https://profiler.firefox.com) and work with CTF directly, so we instead need to run a trace converter. Oddly, there is no trace converter that can read CTF and write something -that Perfetto (e.g.) can read. However there is a JSON-based tracing -format that Perfetto can read, and [Python bindings for Babeltrace, a -library that works with CTF](https://babeltrace.org/), so that's what we -do: +that Perfetto (e.g.) can read. However there is a [JSON-based tracing +format that these tools can +read](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw), +and [Python bindings for Babeltrace, a library that works with +CTF](https://babeltrace.org/), so that's what we do: ``` $ python3 ctf_to_json.py ~/lttng-traces/auto-20250214-091153 > trace.json From e780d2795933ed836e4ee993ebc3f903e78b0fae Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 5 Mar 2025 10:08:03 +0100 Subject: [PATCH 381/403] nofl: Refactor SWAR mark-matching routines We are going to try to use fewer bits for mark state. --- src/nofl-space.h | 23 ++++++++-------- src/swar.h | 71 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 75 insertions(+), 19 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 66aa0ac62..f98530b02 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -150,7 +150,6 @@ struct nofl_block_stack { #define NOFL_PAGE_OUT_QUEUE_SIZE 4 struct nofl_space { - uint64_t sweep_mask; uint8_t live_mask; uint8_t marked_mask; uint8_t evacuating; @@ -558,7 +557,7 @@ nofl_clear_memory(uintptr_t addr, size_t size) { static size_t nofl_space_live_object_granules(uint8_t *metadata) { - return scan_for_byte(metadata, -1, broadcast_byte(NOFL_METADATA_BYTE_END)) + 1; + return scan_for_byte_with_bits(metadata, -1, NOFL_METADATA_BYTE_END) + 1; } static void @@ -704,7 +703,7 @@ nofl_allocator_finish_hole(struct nofl_allocator *alloc) { // reached the end of the block. static size_t nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, - uintptr_t sweep_mask) { + uint8_t live_mask) { GC_ASSERT(nofl_allocator_has_block(alloc)); GC_ASSERT_EQ(alloc->alloc, alloc->sweep); uintptr_t sweep = alloc->sweep; @@ -721,7 +720,7 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, // right after a hole, which can point to either the end of the // block or to a live object. Assume that a live object is more // common. 
- while (limit_granules && (metadata[0] & sweep_mask)) { + while (limit_granules && (metadata[0] & live_mask)) { // Object survived collection; skip over it and continue sweeping. size_t object_granules = nofl_space_live_object_granules(metadata); sweep += object_granules * NOFL_GRANULE_SIZE; @@ -734,7 +733,8 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, return 0; } - size_t hole_granules = scan_for_byte(metadata, limit_granules, sweep_mask); + size_t hole_granules = scan_for_byte_with_bits(metadata, limit_granules, + live_mask); size_t free_bytes = hole_granules * NOFL_GRANULE_SIZE; GC_ASSERT(hole_granules); GC_ASSERT(hole_granules <= limit_granules); @@ -754,10 +754,10 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, static void nofl_allocator_finish_sweeping_in_block(struct nofl_allocator *alloc, - uintptr_t sweep_mask) { + uint8_t live_mask) { do { nofl_allocator_finish_hole(alloc); - } while (nofl_allocator_next_hole_in_block(alloc, sweep_mask)); + } while (nofl_allocator_next_hole_in_block(alloc, live_mask)); } static void @@ -771,7 +771,7 @@ nofl_allocator_release_block(struct nofl_allocator *alloc, } else if (space->evacuating) { nofl_allocator_release_full_evacuation_target(alloc, space); } else { - nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask); + nofl_allocator_finish_sweeping_in_block(alloc, space->live_mask); nofl_allocator_release_full_block(alloc, space); } } @@ -801,7 +801,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, // Sweep current block for a hole. if (nofl_allocator_has_block(alloc)) { size_t granules = - nofl_allocator_next_hole_in_block(alloc, space->sweep_mask); + nofl_allocator_next_hole_in_block(alloc, space->live_mask); if (granules) return granules; else @@ -819,7 +819,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, alloc->block.summary->holes_with_fragmentation = 0; alloc->block.summary->fragmentation_granules = 0; size_t granules = - nofl_allocator_next_hole_in_block(alloc, space->sweep_mask); + nofl_allocator_next_hole_in_block(alloc, space->live_mask); if (granules) return granules; nofl_allocator_release_full_block(alloc, space); @@ -1134,7 +1134,6 @@ nofl_space_update_mark_patterns(struct nofl_space *space, if (advance_mark_mask) space->marked_mask = next_marked_mask; space->live_mask = survivor_mask | next_marked_mask; - space->sweep_mask = broadcast_byte(space->live_mask); } static void @@ -1206,7 +1205,7 @@ nofl_space_promote_blocks(struct nofl_space *space) { block.summary->holes_with_fragmentation = 0; block.summary->fragmentation_granules = 0; struct nofl_allocator alloc = { block.addr, block.addr, block }; - nofl_allocator_finish_sweeping_in_block(&alloc, space->sweep_mask); + nofl_allocator_finish_sweeping_in_block(&alloc, space->live_mask); atomic_fetch_add(&space->old_generation_granules, NOFL_GRANULES_PER_BLOCK - block.summary->hole_granules); nofl_block_list_push(&space->old, block); diff --git a/src/swar.h b/src/swar.h index 293d99ec2..e516ed83f 100644 --- a/src/swar.h +++ b/src/swar.h @@ -26,23 +26,80 @@ load_eight_aligned_bytes(uint8_t *ptr) { return word; } +static inline uint64_t +match_bytes_against_bits(uint64_t bytes, uint8_t mask) { + return bytes & broadcast_byte(mask); +} + static size_t -scan_for_byte(uint8_t *ptr, size_t limit, uint64_t mask) { +scan_for_byte_with_bits(uint8_t *ptr, size_t limit, uint8_t mask) { size_t n = 0; size_t unaligned = ((uintptr_t) ptr) & 7; if (unaligned) { uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> 
(unaligned * 8); - bytes &= mask; - if (bytes) - return count_zero_bytes(bytes); + uint64_t match = match_bytes_against_bits(bytes, mask); + if (match) + return count_zero_bytes(match); n += 8 - unaligned; } for(; n < limit; n += 8) { uint64_t bytes = load_eight_aligned_bytes(ptr + n); - bytes &= mask; - if (bytes) - return n + count_zero_bytes(bytes); + uint64_t match = match_bytes_against_bits(bytes, mask); + if (match) + return n + count_zero_bytes(match); + } + + return limit; +} + +static inline uint64_t +match_bytes_against_2_tags(uint64_t bytes, uint8_t mask, uint8_t tag1, + uint8_t tag2) +{ + // Precondition: tags are covered by within mask. + GC_ASSERT_EQ(tag1 & mask, tag1); + GC_ASSERT_EQ(tag2 & mask, tag2); + // Precondition: high bit of mask byte is empty, so that we can add without + // overflow. + GC_ASSERT_EQ(mask & 0x7f, mask); + // Precondition: mask is low bits of byte. + GC_ASSERT(mask); + GC_ASSERT_EQ(mask & (mask + 1), 0); + + uint64_t vmask = broadcast_byte(mask); + uint64_t vtest = broadcast_byte(mask + 1); + uint64_t vtag1 = broadcast_byte(tag1); + uint64_t vtag2 = broadcast_byte(tag2); + + bytes &= vmask; + uint64_t m1 = (bytes ^ vtag1) + vmask; + uint64_t m2 = (bytes ^ vtag2) + vmask; + return ((m1 & m2) & vtest) ^ vtest; +} + +static size_t +scan_for_byte_with_tags(uint8_t *ptr, size_t limit, uint8_t mask, + uint8_t tag1, uint8_t tag2) { + // The way we handle unaligned reads by padding high bytes with zeroes assumes + // that all-zeroes is not a matching byte. + GC_ASSERT(tag1 && tag2); + + size_t n = 0; + size_t unaligned = ((uintptr_t) ptr) & 7; + if (unaligned) { + uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8); + uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2); + if (match) + return count_zero_bytes(match); + n += 8 - unaligned; + } + + for(; n < limit; n += 8) { + uint64_t bytes = load_eight_aligned_bytes(ptr + n); + uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2); + if (match) + return n + count_zero_bytes(match); } return limit; From 5c4d5a72e48db6ddb8a7d09fcb75e7d5fd41c1e2 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 5 Mar 2025 11:32:32 +0100 Subject: [PATCH 382/403] Fix embarrassing bug in lospace initial mark value --- src/large-object-space.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/large-object-space.h b/src/large-object-space.h index a7c762056..b33616290 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -421,6 +421,7 @@ large_object_space_alloc(struct large_object_space *space, size_t npages) { // Mark the hole as live. node->value.is_live = 1; memset(&node->value.live, 0, sizeof(node->value.live)); + node->value.live.mark = LARGE_OBJECT_NURSERY; // If the hole is actually too big, trim its tail. 
if (node->key.size > size) { @@ -462,7 +463,7 @@ large_object_space_obtain_and_alloc(struct large_object_space *space, struct large_object k = { addr, bytes }; struct large_object_data v = {0,}; v.is_live = 1; - v.live.mark = 0; + v.live.mark = LARGE_OBJECT_NURSERY; pthread_mutex_lock(&space->lock); pthread_mutex_lock(&space->object_tree_lock); @@ -489,6 +490,8 @@ large_object_space_init(struct large_object_space *space, space->page_size = getpagesize(); space->page_size_log2 = __builtin_ctz(space->page_size); + space->marked = LARGE_OBJECT_MARK_0; + large_object_tree_init(&space->object_tree); address_map_init(&space->object_map); address_map_init(&space->nursery); From 8c1b98d3063ac7f0caee308c203edbd1c78ff937 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 5 Mar 2025 11:32:53 +0100 Subject: [PATCH 383/403] Make parallel tracer less verbose; tracepoints are better --- src/parallel-tracer.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/parallel-tracer.h b/src/parallel-tracer.h index 0b80a6dde..db8afae8a 100644 --- a/src/parallel-tracer.h +++ b/src/parallel-tracer.h @@ -172,7 +172,7 @@ tracer_maybe_unpark_workers(struct gc_tracer *tracer) { static inline void tracer_share(struct gc_trace_worker *worker) { - DEBUG("tracer #%zu: sharing\n", worker->id); + LOG("tracer #%zu: sharing\n", worker->id); GC_TRACEPOINT(trace_share); size_t to_share = LOCAL_WORKLIST_SHARE_AMOUNT; while (to_share) { @@ -310,10 +310,10 @@ trace_with_data(struct gc_tracer *tracer, atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_acq_rel); worker->data = data; - DEBUG("tracer #%zu: running trace loop\n", worker->id); + LOG("tracer #%zu: running trace loop\n", worker->id); { - DEBUG("tracer #%zu: tracing roots\n", worker->id); + LOG("tracer #%zu: tracing roots\n", worker->id); size_t n = 0; do { struct gc_root root = root_worklist_pop(&tracer->roots); @@ -323,7 +323,7 @@ trace_with_data(struct gc_tracer *tracer, n++; } while (1); - DEBUG("tracer #%zu: done tracing roots, %zu roots traced\n", worker->id, n); + LOG("tracer #%zu: done tracing roots, %zu roots traced\n", worker->id, n); } if (tracer->trace_roots_only) { @@ -337,7 +337,7 @@ trace_with_data(struct gc_tracer *tracer, pthread_mutex_lock(&tracer->workers[i].lock); } } else { - DEBUG("tracer #%zu: tracing objects\n", worker->id); + LOG("tracer #%zu: tracing objects\n", worker->id); GC_TRACEPOINT(trace_objects_begin); size_t n = 0; size_t spin_count = 0; @@ -357,7 +357,7 @@ trace_with_data(struct gc_tracer *tracer, } while (trace_worker_should_continue(worker, spin_count++)); GC_TRACEPOINT(trace_objects_end); - DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); + LOG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n); } worker->data = NULL; @@ -398,27 +398,27 @@ gc_tracer_should_parallelize(struct gc_tracer *tracer) { static inline void gc_tracer_trace(struct gc_tracer *tracer) { - DEBUG("starting trace; %zu workers\n", tracer->worker_count); + LOG("starting trace; %zu workers\n", tracer->worker_count); for (int i = 1; i < tracer->worker_count; i++) pthread_mutex_unlock(&tracer->workers[i].lock); if (gc_tracer_should_parallelize(tracer)) { - DEBUG("waking workers\n"); + LOG("waking workers\n"); tracer_unpark_all_workers(tracer); } else { - DEBUG("starting in local-only mode\n"); + LOG("starting in local-only mode\n"); } trace_worker_trace(&tracer->workers[0]); root_worklist_reset(&tracer->roots); - DEBUG("trace finished\n"); + LOG("trace finished\n"); } 
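
The `GC_TRACEPOINT` invocations in this diff are the hooks behind the LTTng user-space tracepoints described in `doc/tracepoints.md`. For readers who have not used lttng-ust before, the sketch below shows how a provider and an event are conventionally declared and fired with the classic lttng-ust macros; the provider name, event name, field, and file names are purely illustrative and are not Whippet's actual definitions.

```c
/* gcdemo-tp.h -- hypothetical tracepoint provider header (illustrative only) */
#undef TRACEPOINT_PROVIDER
#define TRACEPOINT_PROVIDER gcdemo

#undef TRACEPOINT_INCLUDE
#define TRACEPOINT_INCLUDE "./gcdemo-tp.h"

#if !defined(GCDEMO_TP_H) || defined(TRACEPOINT_HEADER_MULTI_READ)
#define GCDEMO_TP_H

#include <lttng/tracepoint.h>

/* One event with a single integer payload field. */
TRACEPOINT_EVENT(
    gcdemo,
    objects_traced,
    TP_ARGS(size_t, count),
    TP_FIELDS(
        ctf_integer(size_t, count, count)
    )
)

#endif /* GCDEMO_TP_H */

#include <lttng/tracepoint-event.h>
```

```c
/* gcdemo.c -- instantiates the probes and fires the event.
   Build (typically): cc -o gcdemo gcdemo.c -llttng-ust -ldl */
#define TRACEPOINT_CREATE_PROBES
#define TRACEPOINT_DEFINE
#include "gcdemo-tp.h"

int main(void) {
  /* Shows up as `gcdemo:objects_traced` once `lttng enable-event
     --userspace 'gcdemo:*'` is active in a recording session. */
  tracepoint(gcdemo, objects_traced, 42);
  return 0;
}
```
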
static inline void gc_tracer_trace_roots(struct gc_tracer *tracer) { - DEBUG("starting roots-only trace\n"); + LOG("starting roots-only trace\n"); GC_TRACEPOINT(trace_roots_begin); tracer->trace_roots_only = 1; @@ -427,7 +427,7 @@ gc_tracer_trace_roots(struct gc_tracer *tracer) { GC_TRACEPOINT(trace_roots_end); GC_ASSERT_EQ(atomic_load(&tracer->active_tracers), 0); - DEBUG("roots-only trace finished\n"); + LOG("roots-only trace finished\n"); } #endif // PARALLEL_TRACER_H From 4d271e74925e1f9f291a9f67d6bad6b34c33b22b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 5 Mar 2025 16:08:41 +0100 Subject: [PATCH 384/403] nofl space: refactor to add NOFL_METADATA_BYTE_MARK_MASK --- src/nofl-space.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index f98530b02..47c352154 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -237,6 +237,10 @@ enum nofl_metadata_byte { NOFL_METADATA_BYTE_MARK_0 = 2, NOFL_METADATA_BYTE_MARK_1 = 4, NOFL_METADATA_BYTE_MARK_2 = 8, + NOFL_METADATA_BYTE_MARK_MASK = (NOFL_METADATA_BYTE_YOUNG | + NOFL_METADATA_BYTE_MARK_0 | + NOFL_METADATA_BYTE_MARK_1 | + NOFL_METADATA_BYTE_MARK_2), NOFL_METADATA_BYTE_END = 16, NOFL_METADATA_BYTE_PINNED = 32, NOFL_METADATA_BYTE_LOGGED_0 = 64, @@ -1420,8 +1424,7 @@ nofl_space_should_evacuate(struct nofl_space *space, uint8_t metadata_byte, static inline int nofl_space_set_mark_relaxed(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { - uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 - | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + uint8_t mask = NOFL_METADATA_BYTE_MARK_MASK; atomic_store_explicit(metadata, (byte & ~mask) | space->marked_mask, memory_order_relaxed); @@ -1430,8 +1433,7 @@ nofl_space_set_mark_relaxed(struct nofl_space *space, uint8_t *metadata, static inline int nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { - uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0 - | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; + uint8_t mask = NOFL_METADATA_BYTE_MARK_MASK; atomic_store_explicit(metadata, (byte & ~mask) | space->marked_mask, memory_order_release); From 29cf0f40d329cde87ed9cba1e2717701065ae9b1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 5 Mar 2025 17:01:51 +0100 Subject: [PATCH 385/403] nofl space: Rework treatment of mark bits to avoid masks This will allow us to free up some metadata bits. 
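
This patch replaces the broadcast-mask sweep test with an equality test on the mark field, using a SWAR helper (`match_bytes_against_tag` in `src/swar.h`) that finds, within eight packed metadata bytes at once, the lanes whose masked value equals the current mark. The standalone sketch below (not part of the tree; the byte values and the 3-bit mask are made up for illustration) reproduces the trick and cross-checks it against a plain per-byte loop.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t broadcast_byte(uint8_t b) {
  return (uint64_t)b * 0x0101010101010101ULL;
}

/* Nonzero byte lanes in the result mark positions where (byte & mask) == tag,
   assuming tag fits within mask and mask occupies the low bits of the byte. */
static uint64_t match_bytes_against_tag(uint64_t bytes, uint8_t mask,
                                        uint8_t tag) {
  uint64_t vmask = broadcast_byte(mask);
  uint64_t vtest = broadcast_byte(mask + 1);
  uint64_t vtag = broadcast_byte(tag);
  bytes &= vmask;
  /* A lane equal to tag XORs to zero, so adding mask leaves the test bit
     clear; any other lane value carries into the test bit. */
  uint64_t m = (bytes ^ vtag) + vmask;
  return (m & vtest) ^ vtest;
}

int main(void) {
  /* Eight made-up metadata bytes; the mark state lives in the low 3 bits. */
  uint8_t meta[8] = { 0x01, 0x02, 0x13, 0x02, 0x00, 0x22, 0x02, 0x03 };
  uint64_t bytes;
  memcpy(&bytes, meta, sizeof bytes);  /* little-endian load, as on x86-64 */

  uint8_t mask = 0x07, tag = 0x02;     /* "current mark is MARK_0", say */
  uint64_t match = match_bytes_against_tag(bytes, mask, tag);

  for (int i = 0; i < 8; i++) {
    int swar = ((match >> (8 * i)) & 0xff) != 0;
    int scalar = (meta[i] & mask) == tag;
    assert(swar == scalar);
    printf("byte %d: %#04x -> %s\n", i, meta[i], swar ? "match" : "no match");
  }
  return 0;
}
```
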
--- src/nofl-space.h | 94 ++++++++++++++++++++++++++---------------------- src/swar.h | 50 ++++++++++++++++++++++++-- 2 files changed, 100 insertions(+), 44 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 47c352154..e13572653 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -150,8 +150,8 @@ struct nofl_block_stack { #define NOFL_PAGE_OUT_QUEUE_SIZE 4 struct nofl_space { - uint8_t live_mask; - uint8_t marked_mask; + uint8_t current_mark; + uint8_t survivor_mark; uint8_t evacuating; struct extents *extents; size_t heap_size; @@ -249,10 +249,17 @@ enum nofl_metadata_byte { }; static uint8_t -nofl_rotate_dead_survivor_marked(uint8_t mask) { - uint8_t all = - NOFL_METADATA_BYTE_MARK_0 | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; - return ((mask << 1) | (mask >> 2)) & all; +nofl_advance_current_mark(uint8_t mark) { + switch (mark) { + case NOFL_METADATA_BYTE_MARK_0: + return NOFL_METADATA_BYTE_MARK_1; + case NOFL_METADATA_BYTE_MARK_1: + return NOFL_METADATA_BYTE_MARK_2; + case NOFL_METADATA_BYTE_MARK_2: + return NOFL_METADATA_BYTE_MARK_0; + default: + GC_CRASH(); + } } static struct gc_lock @@ -702,12 +709,23 @@ nofl_allocator_finish_hole(struct nofl_allocator *alloc) { } } +static inline int +nofl_metadata_byte_has_mark(uint8_t byte, uint8_t marked) { + return (byte & NOFL_METADATA_BYTE_MARK_MASK) == marked; +} + +static inline int +nofl_metadata_byte_is_young_or_has_mark(uint8_t byte, uint8_t marked) { + return (nofl_metadata_byte_has_mark(byte, NOFL_METADATA_BYTE_YOUNG) + || nofl_metadata_byte_has_mark(byte, marked)); +} + // Sweep some heap to reclaim free space, advancing alloc->alloc and // alloc->sweep. Return the size of the hole in granules, or 0 if we // reached the end of the block. static size_t nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, - uint8_t live_mask) { + uint8_t survivor_mark) { GC_ASSERT(nofl_allocator_has_block(alloc)); GC_ASSERT_EQ(alloc->alloc, alloc->sweep); uintptr_t sweep = alloc->sweep; @@ -724,7 +742,8 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, // right after a hole, which can point to either the end of the // block or to a live object. Assume that a live object is more // common. - while (limit_granules && (metadata[0] & live_mask)) { + while (limit_granules && + nofl_metadata_byte_has_mark(metadata[0], survivor_mark)) { // Object survived collection; skip over it and continue sweeping. 
size_t object_granules = nofl_space_live_object_granules(metadata); sweep += object_granules * NOFL_GRANULE_SIZE; @@ -737,8 +756,9 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, return 0; } - size_t hole_granules = scan_for_byte_with_bits(metadata, limit_granules, - live_mask); + size_t hole_granules = scan_for_byte_with_tag(metadata, limit_granules, + NOFL_METADATA_BYTE_MARK_MASK, + survivor_mark); size_t free_bytes = hole_granules * NOFL_GRANULE_SIZE; GC_ASSERT(hole_granules); GC_ASSERT(hole_granules <= limit_granules); @@ -758,10 +778,10 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc, static void nofl_allocator_finish_sweeping_in_block(struct nofl_allocator *alloc, - uint8_t live_mask) { + uint8_t survivor_mark) { do { nofl_allocator_finish_hole(alloc); - } while (nofl_allocator_next_hole_in_block(alloc, live_mask)); + } while (nofl_allocator_next_hole_in_block(alloc, survivor_mark)); } static void @@ -775,7 +795,7 @@ nofl_allocator_release_block(struct nofl_allocator *alloc, } else if (space->evacuating) { nofl_allocator_release_full_evacuation_target(alloc, space); } else { - nofl_allocator_finish_sweeping_in_block(alloc, space->live_mask); + nofl_allocator_finish_sweeping_in_block(alloc, space->survivor_mark); nofl_allocator_release_full_block(alloc, space); } } @@ -805,7 +825,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, // Sweep current block for a hole. if (nofl_allocator_has_block(alloc)) { size_t granules = - nofl_allocator_next_hole_in_block(alloc, space->live_mask); + nofl_allocator_next_hole_in_block(alloc, space->survivor_mark); if (granules) return granules; else @@ -823,7 +843,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, alloc->block.summary->holes_with_fragmentation = 0; alloc->block.summary->fragmentation_granules = 0; size_t granules = - nofl_allocator_next_hole_in_block(alloc, space->live_mask); + nofl_allocator_next_hole_in_block(alloc, space->survivor_mark); if (granules) return granules; nofl_allocator_release_full_block(alloc, space); @@ -1130,16 +1150,6 @@ nofl_space_prepare_evacuation(struct nofl_space *space) { } } -static void -nofl_space_update_mark_patterns(struct nofl_space *space, - int advance_mark_mask) { - uint8_t survivor_mask = space->marked_mask; - uint8_t next_marked_mask = nofl_rotate_dead_survivor_marked(survivor_mask); - if (advance_mark_mask) - space->marked_mask = next_marked_mask; - space->live_mask = survivor_mask | next_marked_mask; -} - static void nofl_space_clear_block_marks(struct nofl_space *space) { for (size_t s = 0; s < space->nslabs; s++) { @@ -1152,7 +1162,7 @@ static void nofl_space_prepare_gc(struct nofl_space *space, enum gc_collection_kind kind) { int is_minor = kind == GC_COLLECTION_MINOR; if (!is_minor) { - nofl_space_update_mark_patterns(space, 1); + space->current_mark = nofl_advance_current_mark(space->current_mark); nofl_space_clear_block_marks(space); } } @@ -1209,7 +1219,7 @@ nofl_space_promote_blocks(struct nofl_space *space) { block.summary->holes_with_fragmentation = 0; block.summary->fragmentation_granules = 0; struct nofl_allocator alloc = { block.addr, block.addr, block }; - nofl_allocator_finish_sweeping_in_block(&alloc, space->live_mask); + nofl_allocator_finish_sweeping_in_block(&alloc, space->current_mark); atomic_fetch_add(&space->old_generation_granules, NOFL_GRANULES_PER_BLOCK - block.summary->hole_granules); nofl_block_list_push(&space->old, block); @@ -1238,7 +1248,7 @@ nofl_space_verify_sweepable_blocks(struct nofl_space *space, uintptr_t 
limit = addr + NOFL_BLOCK_SIZE; uint8_t *meta = nofl_metadata_byte_for_addr(b.addr); while (addr < limit) { - if (meta[0] & space->live_mask) { + if (nofl_metadata_byte_has_mark(meta[0], space->current_mark)) { struct gc_ref obj = gc_ref(addr); size_t obj_bytes; gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); @@ -1275,8 +1285,7 @@ nofl_space_verify_swept_blocks(struct nofl_space *space, uint8_t *meta = nofl_metadata_byte_for_addr(addr); while (addr < limit) { if (meta[0]) { - GC_ASSERT(meta[0] & space->marked_mask); - GC_ASSERT_EQ(meta[0] & ~(space->marked_mask | NOFL_METADATA_BYTE_END), 0); + GC_ASSERT(nofl_metadata_byte_has_mark(meta[0], space->current_mark)); struct gc_ref obj = gc_ref(addr); size_t obj_bytes; gc_trace_object(obj, NULL, NULL, NULL, &obj_bytes); @@ -1381,7 +1390,7 @@ nofl_space_finish_gc(struct nofl_space *space, gc_lock_release(&lock); nofl_space_promote_blocks(space); nofl_space_reset_statistics(space); - nofl_space_update_mark_patterns(space, 0); + space->survivor_mark = space->current_mark; if (GC_DEBUG) nofl_space_verify_before_restart(space); } @@ -1426,7 +1435,7 @@ nofl_space_set_mark_relaxed(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { uint8_t mask = NOFL_METADATA_BYTE_MARK_MASK; atomic_store_explicit(metadata, - (byte & ~mask) | space->marked_mask, + (byte & ~mask) | space->current_mark, memory_order_relaxed); return 1; } @@ -1435,7 +1444,7 @@ static inline int nofl_space_set_mark(struct nofl_space *space, uint8_t *metadata, uint8_t byte) { uint8_t mask = NOFL_METADATA_BYTE_MARK_MASK; atomic_store_explicit(metadata, - (byte & ~mask) | space->marked_mask, + (byte & ~mask) | space->current_mark, memory_order_release); return 1; } @@ -1515,7 +1524,7 @@ nofl_space_evacuate(struct nofl_space *space, uint8_t *metadata, uint8_t byte, // First check again if someone else tried to evacuate this object and ended // up marking in place instead. byte = atomic_load_explicit(metadata, memory_order_acquire); - if (byte & space->marked_mask) { + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) { // Indeed, already marked in place. gc_atomic_forward_abort(&fwd); return 0; @@ -1581,7 +1590,7 @@ nofl_space_evacuate_or_mark_object(struct nofl_space *space, struct nofl_allocator *evacuate) { uint8_t *metadata = nofl_metadata_byte_for_object(old_ref); uint8_t byte = *metadata; - if (byte & space->marked_mask) + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) return 0; if (nofl_space_should_evacuate(space, byte, old_ref)) @@ -1626,7 +1635,7 @@ nofl_space_forward_or_mark_if_traced(struct nofl_space *space, struct gc_ref ref) { uint8_t *metadata = nofl_metadata_byte_for_object(ref); uint8_t byte = *metadata; - if (byte & space->marked_mask) + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) return 1; if (!nofl_space_should_evacuate(space, byte, ref)) @@ -1663,13 +1672,12 @@ nofl_space_mark_conservative_ref(struct nofl_space *space, uint8_t byte = atomic_load_explicit(loc, memory_order_relaxed); // Already marked object? Nothing to do. - if (byte & space->marked_mask) + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) return gc_ref_null(); // Addr is the not start of an unmarked object? Search backwards if // we have interior pointers, otherwise not an object. 
- uint8_t object_start_mask = space->live_mask | NOFL_METADATA_BYTE_YOUNG; - if (!(byte & object_start_mask)) { + if (!nofl_metadata_byte_is_young_or_has_mark(byte, space->survivor_mark)) { if (!possibly_interior) return gc_ref_null(); @@ -1685,9 +1693,12 @@ nofl_space_mark_conservative_ref(struct nofl_space *space, // Ran into the end of some other allocation? Not an object, then. if (byte & NOFL_METADATA_BYTE_END) return gc_ref_null(); + // Object already marked? Nothing to do. + if (nofl_metadata_byte_has_mark(byte, space->current_mark)) + return gc_ref_null(); // Continue until we find object start. - } while (!(byte & object_start_mask)); + } while (!nofl_metadata_byte_is_young_or_has_mark(byte, space->survivor_mark)); // Found object start, and object is unmarked; adjust addr. addr = block_base + (loc - loc_base) * NOFL_GRANULE_SIZE; @@ -1842,8 +1853,7 @@ nofl_space_init(struct nofl_space *space, size_t size, int atomic, if (!slabs) return 0; - space->marked_mask = NOFL_METADATA_BYTE_MARK_0; - nofl_space_update_mark_patterns(space, 0); + space->current_mark = space->survivor_mark = NOFL_METADATA_BYTE_MARK_0; space->extents = extents_allocate(10); nofl_space_add_slabs(space, slabs, nslabs); pthread_mutex_init(&space->lock, NULL); diff --git a/src/swar.h b/src/swar.h index e516ed83f..d8598c8b5 100644 --- a/src/swar.h +++ b/src/swar.h @@ -31,7 +31,7 @@ match_bytes_against_bits(uint64_t bytes, uint8_t mask) { return bytes & broadcast_byte(mask); } -static size_t +static inline size_t scan_for_byte_with_bits(uint8_t *ptr, size_t limit, uint8_t mask) { size_t n = 0; size_t unaligned = ((uintptr_t) ptr) & 7; @@ -53,6 +53,52 @@ scan_for_byte_with_bits(uint8_t *ptr, size_t limit, uint8_t mask) { return limit; } +static inline uint64_t +match_bytes_against_tag(uint64_t bytes, uint8_t mask, uint8_t tag) { + // Precondition: tag within mask. + GC_ASSERT_EQ(tag & mask, tag); + // Precondition: high bit of mask byte is empty, so that we can add without + // overflow. + GC_ASSERT_EQ(mask & 0x7f, mask); + // Precondition: mask is low bits of byte. + GC_ASSERT(mask); + GC_ASSERT_EQ(mask & (mask + 1), 0); + + uint64_t vmask = broadcast_byte(mask); + uint64_t vtest = broadcast_byte(mask + 1); + uint64_t vtag = broadcast_byte(tag); + + bytes &= vmask; + uint64_t m = (bytes ^ vtag) + vmask; + return (m & vtest) ^ vtest; +} + +static inline size_t +scan_for_byte_with_tag(uint8_t *ptr, size_t limit, uint8_t mask, uint8_t tag) { + // The way we handle unaligned reads by padding high bytes with zeroes assumes + // that all-zeroes is not a matching byte. 
+ GC_ASSERT(tag); + + size_t n = 0; + size_t unaligned = ((uintptr_t) ptr) & 7; + if (unaligned) { + uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8); + uint64_t match = match_bytes_against_tag(bytes, mask, tag); + if (match) + return count_zero_bytes(match); + n += 8 - unaligned; + } + + for(; n < limit; n += 8) { + uint64_t bytes = load_eight_aligned_bytes(ptr + n); + uint64_t match = match_bytes_against_tag(bytes, mask, tag); + if (match) + return n + count_zero_bytes(match); + } + + return limit; +} + static inline uint64_t match_bytes_against_2_tags(uint64_t bytes, uint8_t mask, uint8_t tag1, uint8_t tag2) @@ -78,7 +124,7 @@ match_bytes_against_2_tags(uint64_t bytes, uint8_t mask, uint8_t tag1, return ((m1 & m2) & vtest) ^ vtest; } -static size_t +static inline size_t scan_for_byte_with_tags(uint8_t *ptr, size_t limit, uint8_t mask, uint8_t tag1, uint8_t tag2) { // The way we handle unaligned reads by padding high bytes with zeroes assumes From ed5db9bc3620971c768524ee46eb50d3d2a1044c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 5 Mar 2025 17:17:55 +0100 Subject: [PATCH 386/403] nofl: Reclaim another mark bit if concurrent tracing is disabled --- api/gc-config.h | 4 ++++ src/nofl-space.h | 22 +++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/api/gc-config.h b/api/gc-config.h index ca1b38d14..867af63d2 100644 --- a/api/gc-config.h +++ b/api/gc-config.h @@ -33,4 +33,8 @@ #define GC_CONSERVATIVE_TRACE 0 #endif +#ifndef GC_CONCURRENT_TRACE +#define GC_CONCURRENT_TRACE 0 +#endif + #endif // GC_CONFIG_H diff --git a/src/nofl-space.h b/src/nofl-space.h index e13572653..9e7245ebf 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -235,12 +235,16 @@ enum nofl_metadata_byte { NOFL_METADATA_BYTE_NONE = 0, NOFL_METADATA_BYTE_YOUNG = 1, NOFL_METADATA_BYTE_MARK_0 = 2, - NOFL_METADATA_BYTE_MARK_1 = 4, - NOFL_METADATA_BYTE_MARK_2 = 8, - NOFL_METADATA_BYTE_MARK_MASK = (NOFL_METADATA_BYTE_YOUNG | - NOFL_METADATA_BYTE_MARK_0 | - NOFL_METADATA_BYTE_MARK_1 | - NOFL_METADATA_BYTE_MARK_2), + NOFL_METADATA_BYTE_MARK_1 = 3, +#if GC_CONCURRENT_TRACE + NOFL_METADATA_BYTE_MARK_2 = 4, + NOFL_METADATA_BYTE_MARK_MASK = 7, + /* NOFL_METADATA_BYTE_UNUSED_0 = 8, */ +#else + NOFL_METADATA_BYTE_MARK_MASK = 3, + /* NOFL_METADATA_BYTE_UNUSED_0 = 4, */ + /* NOFL_METADATA_BYTE_UNUSED_1 = 8, */ +#endif NOFL_METADATA_BYTE_END = 16, NOFL_METADATA_BYTE_PINNED = 32, NOFL_METADATA_BYTE_LOGGED_0 = 64, @@ -254,8 +258,10 @@ nofl_advance_current_mark(uint8_t mark) { case NOFL_METADATA_BYTE_MARK_0: return NOFL_METADATA_BYTE_MARK_1; case NOFL_METADATA_BYTE_MARK_1: +#if GC_CONCURRENT_TRACE return NOFL_METADATA_BYTE_MARK_2; case NOFL_METADATA_BYTE_MARK_2: +#endif return NOFL_METADATA_BYTE_MARK_0; default: GC_CRASH(); @@ -955,10 +961,8 @@ nofl_space_contains_edge(struct nofl_space *space, struct gc_edge edge) { static inline int nofl_space_is_survivor(struct nofl_space *space, struct gc_ref ref) { uint8_t *metadata = nofl_metadata_byte_for_object(ref); - uint8_t mask = NOFL_METADATA_BYTE_MARK_0 - | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2; uint8_t byte = atomic_load_explicit(metadata, memory_order_relaxed); - return byte & mask; + return nofl_metadata_byte_has_mark(byte, space->survivor_mark); } static uint8_t* From 3db1e48ea69dfdb0878324f9a016837362a15c1f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 7 Mar 2025 13:25:47 +0100 Subject: [PATCH 387/403] Fix race in large object space when counting live object pages Embarrassing! 
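
The one-line fix that follows swaps a plain `+=` on `live_pages_at_last_collection` for an `atomic_fetch_add`, since more than one tracing thread may mark large objects at the same time. The standalone sketch below (not Whippet code; the thread count and counter names are invented for illustration) shows the lost-update behaviour of the racy increment next to the atomic one.

```c
/* build: cc -pthread counter-race.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define THREADS 8
#define MARKS_PER_THREAD 100000

static size_t plain_counter;            /* racy: += is a load/add/store */
static _Atomic size_t atomic_counter;   /* safe: increments cannot be lost */

static void* mark_many(void *arg) {
  (void)arg;
  for (int i = 0; i < MARKS_PER_THREAD; i++) {
    plain_counter += 1;                    /* data race; typically drops updates */
    atomic_fetch_add(&atomic_counter, 1);  /* what the fix does */
  }
  return NULL;
}

int main(void) {
  pthread_t threads[THREADS];
  for (int i = 0; i < THREADS; i++)
    pthread_create(&threads[i], NULL, mark_many, NULL);
  for (int i = 0; i < THREADS; i++)
    pthread_join(threads[i], NULL);
  printf("expected %d, plain += got %zu, atomic_fetch_add got %zu\n",
         THREADS * MARKS_PER_THREAD, plain_counter,
         atomic_load(&atomic_counter));
  return 0;
}
```
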
--- src/large-object-space.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/large-object-space.h b/src/large-object-space.h index b33616290..f7286614e 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -212,7 +212,7 @@ large_object_space_mark(struct large_object_space *space, struct gc_ref ref) { memory_order_acquire)); size_t pages = node->key.size >> space->page_size_log2; - space->live_pages_at_last_collection += pages; + atomic_fetch_add(&space->live_pages_at_last_collection, pages); return 1; } From f1b660484ee296759ca427d966b3ffaafcead1e3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 7 Mar 2025 09:32:03 +0100 Subject: [PATCH 388/403] Remove gc_allocator_needs_clear Whether the returned object needs to be cleared or not depends on a couple things: - Whether the embedder actually needs the object to be cleared. - Whether the collector allocated the object from memory that was all zeroes already. The goal of course would be to prevent clearing memory if the mutator was just going to write all over it. But it's hard to know statically if the memory would have been all zeroes anyway, and in that case if you did clear it you'd be doing double work. In the end it's simpler to just require collectors to clear memory in bulk. We can revisit this later if it is an issue. --- api/bdw-attrs.h | 4 ---- api/gc-api.h | 10 ---------- api/gc-attrs.h | 2 -- api/mmc-attrs.h | 4 ---- api/pcc-attrs.h | 4 ---- api/semi-attrs.h | 4 ---- src/pcc.c | 1 - src/semi.c | 3 +-- 8 files changed, 1 insertion(+), 31 deletions(-) diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index 938356a5e..19d5250f2 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -36,10 +36,6 @@ static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { GC_CRASH(); } -static inline int gc_allocator_needs_clear(void) { - return 0; -} - static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t) { return GC_OLD_GENERATION_CHECK_NONE; } diff --git a/api/gc-api.h b/api/gc-api.h index ff1a20927..58cd5c02d 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -56,14 +56,6 @@ GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), GC_API_ void gc_collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind); -static inline void gc_clear_fresh_allocation(struct gc_ref obj, - size_t size) GC_ALWAYS_INLINE; -static inline void gc_clear_fresh_allocation(struct gc_ref obj, - size_t size) { - if (!gc_allocator_needs_clear()) return; - memset(gc_ref_heap_object(obj), 0, size); -} - static inline void gc_update_alloc_table(struct gc_ref obj, size_t size) GC_ALWAYS_INLINE; static inline void gc_update_alloc_table(struct gc_ref obj, @@ -119,7 +111,6 @@ static inline void* gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, *hp_loc = new_hp; - gc_clear_fresh_allocation(gc_ref(hp), size); gc_update_alloc_table(gc_ref(hp), size); return (void*)hp; @@ -140,7 +131,6 @@ static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut, size *freelist_loc = *(void**)head; - gc_clear_fresh_allocation(gc_ref_from_heap_object(head), size); gc_update_alloc_table(gc_ref_from_heap_object(head), size); return head; diff --git a/api/gc-attrs.h b/api/gc-attrs.h index f0a6e94e6..bb563c0e2 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -25,8 +25,6 @@ static inline size_t gc_allocator_alloc_table_alignment(void) GC_ALWAYS_INLINE; static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) GC_ALWAYS_INLINE; static inline uint8_t 
gc_allocator_alloc_table_end_pattern(void) GC_ALWAYS_INLINE; -static inline int gc_allocator_needs_clear(void) GC_ALWAYS_INLINE; - enum gc_old_generation_check_kind { GC_OLD_GENERATION_CHECK_NONE, GC_OLD_GENERATION_CHECK_ALLOC_TABLE, diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index 65cb434c9..fe9edb0a4 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -36,10 +36,6 @@ static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { return 16; } -static inline int gc_allocator_needs_clear(void) { - return 0; -} - static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t obj_size) { if (GC_GENERATIONAL) { if (obj_size <= gc_allocator_large_threshold()) diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index 654acf8b9..913208cdd 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -39,10 +39,6 @@ static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { GC_CRASH(); } -static inline int gc_allocator_needs_clear(void) { - return 0; -} - static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t size) { if (!GC_GENERATIONAL) return GC_OLD_GENERATION_CHECK_NONE; diff --git a/api/semi-attrs.h b/api/semi-attrs.h index 69a87560e..b8baab693 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -28,10 +28,6 @@ static inline size_t gc_allocator_freelist_offset(size_t size) { GC_CRASH(); } -static inline int gc_allocator_needs_clear(void) { - return 1; -} - static inline size_t gc_allocator_alloc_table_alignment(void) { return 0; } diff --git a/src/pcc.c b/src/pcc.c index f3a94d22b..c480021f9 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -997,7 +997,6 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { break; } - gc_clear_fresh_allocation(ret, size); return gc_ref_heap_object(ret); } diff --git a/src/semi.c b/src/semi.c index 0d0c9ecca..86541f913 100644 --- a/src/semi.c +++ b/src/semi.c @@ -437,6 +437,7 @@ static void collect(struct gc_mutator *mut, size_t for_alloc) { gc_heap_sizer_on_gc(heap->sizer, heap->size, live_size, pause_ns, resize_heap); reset_heap_limits(heap); + clear_memory(semi->hp, semi->limit - semi->hp); HEAP_EVENT(heap, restarting_mutators); // fprintf(stderr, "%zd bytes copied\n", (space->size>>1)-(space->limit-space->hp)); @@ -520,8 +521,6 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { continue; } space->hp = new_hp; - // FIXME: Allow allocator to avoid clearing memory? 
- clear_memory(addr, size); return (void *)addr; } } From 4a95a514b73a3ac729b876b8e31a97749a88c67d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 7 Mar 2025 10:30:56 +0100 Subject: [PATCH 389/403] Fix inline old-generation check for nofl space mark byte changes --- api/bdw-attrs.h | 5 ++++- api/gc-api.h | 4 +++- api/gc-attrs.h | 3 ++- api/mmc-attrs.h | 14 +++++++------- api/pcc-attrs.h | 5 ++++- api/semi-attrs.h | 5 ++++- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index 19d5250f2..a22445f14 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -39,7 +39,10 @@ static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t) { return GC_OLD_GENERATION_CHECK_NONE; } -static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { +static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) { + GC_CRASH(); +} +static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) { GC_CRASH(); } diff --git a/api/gc-api.h b/api/gc-api.h index 58cd5c02d..23113060c 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -193,7 +193,9 @@ static inline int gc_object_is_old_generation(struct gc_mutator *mut, uintptr_t granule = (addr & (alignment - 1)) / granule_size; uint8_t *byte_loc = (uint8_t*)(base + granule); uint8_t byte = atomic_load_explicit(byte_loc, memory_order_relaxed); - return byte & gc_old_generation_check_alloc_table_bit_pattern(); + uint8_t mask = gc_old_generation_check_alloc_table_tag_mask(); + uint8_t young = gc_old_generation_check_alloc_table_young_tag(); + return (byte & mask) != young; } case GC_OLD_GENERATION_CHECK_SMALL_OBJECT_NURSERY: { struct gc_heap *heap = gc_mutator_heap(mut); diff --git a/api/gc-attrs.h b/api/gc-attrs.h index bb563c0e2..0ef3e082b 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -34,7 +34,8 @@ enum gc_old_generation_check_kind { static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t obj_size) GC_ALWAYS_INLINE; -static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) GC_ALWAYS_INLINE; +static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) GC_ALWAYS_INLINE; +static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) GC_ALWAYS_INLINE; enum gc_write_barrier_kind { GC_WRITE_BARRIER_NONE, diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index fe9edb0a4..7066caf23 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -44,9 +44,11 @@ static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(siz } return GC_OLD_GENERATION_CHECK_NONE; } -static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { - // The three mark bits. 
- return 2 + 4 + 8; +static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) { + return 7; +} +static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) { + return 1; } static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) { @@ -58,12 +60,10 @@ static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) return GC_WRITE_BARRIER_NONE; } static inline size_t gc_write_barrier_card_table_alignment(void) { - GC_ASSERT(GC_GENERATIONAL); - return 4 * 1024 * 1024; + GC_CRASH(); } static inline size_t gc_write_barrier_card_size(void) { - GC_ASSERT(GC_GENERATIONAL); - return 256; + GC_CRASH(); } static inline size_t gc_write_barrier_field_table_alignment(void) { GC_ASSERT(GC_GENERATIONAL); diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index 913208cdd..727d134c0 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -46,7 +46,10 @@ static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(siz return GC_OLD_GENERATION_CHECK_SMALL_OBJECT_NURSERY; return GC_OLD_GENERATION_CHECK_SLOW; } -static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { +static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) { + GC_CRASH(); +} +static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) { GC_CRASH(); } diff --git a/api/semi-attrs.h b/api/semi-attrs.h index b8baab693..c749fd3cf 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -41,7 +41,10 @@ static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t) { return GC_OLD_GENERATION_CHECK_NONE; } -static inline uint8_t gc_old_generation_check_alloc_table_bit_pattern(void) { +static inline uint8_t gc_old_generation_check_alloc_table_tag_mask(void) { + GC_CRASH(); +} +static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) { GC_CRASH(); } From 6c444b33f10b555ce90caace758a0dc499e37677 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 7 Mar 2025 10:33:10 +0100 Subject: [PATCH 390/403] Remove card-marking barriers Field barriers aren't slower and are more precise. 
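For reference, the removed fast path dirtied one byte per card covering the
written object, so a minor collection had to re-scan every field on a dirty
card; the field barrier logs individual edges instead.  A stand-alone sketch
of the card-marking scheme being removed (constants taken from the old mmc
values of 4 MB card-table alignment and 256-byte cards; the function name is
illustrative):

    #include <stdatomic.h>
    #include <stdint.h>

    /* Dirty the card containing addr.  As the removed code computes it,
       the card table occupies the start of each 4 MB-aligned region,
       one byte per 256-byte card. */
    static void card_mark(uintptr_t addr) {
      uintptr_t alignment = 4 * 1024 * 1024;
      uintptr_t card_size = 256;
      uintptr_t base = addr & ~(alignment - 1);
      uintptr_t card = (addr & (alignment - 1)) / card_size;
      atomic_store_explicit((uint8_t*)(base + card), 1,
                            memory_order_relaxed);
    }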
--- api/bdw-attrs.h | 6 ------ api/gc-api.h | 9 --------- api/gc-attrs.h | 3 --- api/mmc-attrs.h | 6 ------ api/pcc-attrs.h | 6 ------ api/semi-attrs.h | 6 ------ 6 files changed, 36 deletions(-) diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index a22445f14..7c03144b5 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -49,12 +49,6 @@ static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) { static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t) { return GC_WRITE_BARRIER_NONE; } -static inline size_t gc_write_barrier_card_table_alignment(void) { - GC_CRASH(); -} -static inline size_t gc_write_barrier_card_size(void) { - GC_CRASH(); -} static inline size_t gc_write_barrier_field_table_alignment(void) { GC_CRASH(); } diff --git a/api/gc-api.h b/api/gc-api.h index 23113060c..2b08d18b3 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -228,15 +228,6 @@ static inline int gc_write_barrier_fast(struct gc_mutator *mut, struct gc_ref ob switch (gc_write_barrier_kind(obj_size)) { case GC_WRITE_BARRIER_NONE: return 0; - case GC_WRITE_BARRIER_CARD: { - size_t card_table_alignment = gc_write_barrier_card_table_alignment(); - size_t card_size = gc_write_barrier_card_size(); - uintptr_t addr = gc_ref_value(obj); - uintptr_t base = addr & ~(card_table_alignment - 1); - uintptr_t card = (addr & (card_table_alignment - 1)) / card_size; - atomic_store_explicit((uint8_t*)(base + card), 1, memory_order_relaxed); - return 0; - } case GC_WRITE_BARRIER_FIELD: { if (!gc_object_is_old_generation(mut, obj, obj_size)) return 0; diff --git a/api/gc-attrs.h b/api/gc-attrs.h index 0ef3e082b..e7ea8c8f3 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -39,14 +39,11 @@ static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) GC_ALW enum gc_write_barrier_kind { GC_WRITE_BARRIER_NONE, - GC_WRITE_BARRIER_CARD, GC_WRITE_BARRIER_FIELD, GC_WRITE_BARRIER_SLOW }; static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) GC_ALWAYS_INLINE; -static inline size_t gc_write_barrier_card_table_alignment(void) GC_ALWAYS_INLINE; -static inline size_t gc_write_barrier_card_size(void) GC_ALWAYS_INLINE; static inline size_t gc_write_barrier_field_table_alignment(void) GC_ALWAYS_INLINE; static inline ptrdiff_t gc_write_barrier_field_table_offset(void) GC_ALWAYS_INLINE; static inline size_t gc_write_barrier_field_fields_per_byte(void) GC_ALWAYS_INLINE; diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index 7066caf23..0c6162dc9 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -59,12 +59,6 @@ static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) } return GC_WRITE_BARRIER_NONE; } -static inline size_t gc_write_barrier_card_table_alignment(void) { - GC_CRASH(); -} -static inline size_t gc_write_barrier_card_size(void) { - GC_CRASH(); -} static inline size_t gc_write_barrier_field_table_alignment(void) { GC_ASSERT(GC_GENERATIONAL); return gc_allocator_alloc_table_alignment(); diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index 727d134c0..064e5181a 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -60,12 +60,6 @@ static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t obj_size) return GC_WRITE_BARRIER_FIELD; return GC_WRITE_BARRIER_SLOW; } -static inline size_t gc_write_barrier_card_table_alignment(void) { - GC_CRASH(); -} -static inline size_t gc_write_barrier_card_size(void) { - GC_CRASH(); -} static inline size_t gc_write_barrier_field_table_alignment(void) { GC_ASSERT(GC_GENERATIONAL); return 64 * 1024 * 1024; diff --git 
a/api/semi-attrs.h b/api/semi-attrs.h index c749fd3cf..763f45606 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -51,12 +51,6 @@ static inline uint8_t gc_old_generation_check_alloc_table_young_tag(void) { static inline enum gc_write_barrier_kind gc_write_barrier_kind(size_t) { return GC_WRITE_BARRIER_NONE; } -static inline size_t gc_write_barrier_card_table_alignment(void) { - GC_CRASH(); -} -static inline size_t gc_write_barrier_card_size(void) { - GC_CRASH(); -} static inline size_t gc_write_barrier_field_table_alignment(void) { GC_CRASH(); } From 5bddd522cf20e871a363e5c0fbefa6bf755762d3 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 7 Mar 2025 10:47:41 +0100 Subject: [PATCH 391/403] Rework large_object_space_alloc to also map if needed Also fix a bug in which objects reclaimed from freelists were not zeroed. --- src/large-object-space.h | 49 ++++++++++++++++++---------------------- src/mmc.c | 2 -- src/pcc.c | 2 -- src/semi.c | 2 -- 4 files changed, 22 insertions(+), 33 deletions(-) diff --git a/src/large-object-space.h b/src/large-object-space.h index f7286614e..887cdef0e 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -444,40 +444,35 @@ large_object_space_alloc(struct large_object_space *space, size_t npages) { space->free_pages -= npages; ret = (void*)node->key.addr; + memset(ret, 0, size); break; } } + + // If we didn't find anything in the quarantine, get fresh pages from the OS. + if (!ret) { + ret = gc_platform_acquire_memory(size, 0); + if (ret) { + uintptr_t addr = (uintptr_t)ret; + struct large_object k = { addr, size }; + struct large_object_data v = {0,}; + v.is_live = 1; + v.live.mark = LARGE_OBJECT_NURSERY; + + pthread_mutex_lock(&space->object_tree_lock); + struct large_object_node *node = + large_object_tree_insert(&space->object_tree, k, v); + uintptr_t node_bits = (uintptr_t)node; + address_map_add(&space->object_map, addr, node_bits); + space->total_pages += npages; + pthread_mutex_unlock(&space->object_tree_lock); + } + } + pthread_mutex_unlock(&space->lock); return ret; } -static void* -large_object_space_obtain_and_alloc(struct large_object_space *space, - size_t npages) { - size_t bytes = npages * space->page_size; - void *ret = gc_platform_acquire_memory(bytes, 0); - if (!ret) - return NULL; - - uintptr_t addr = (uintptr_t)ret; - struct large_object k = { addr, bytes }; - struct large_object_data v = {0,}; - v.is_live = 1; - v.live.mark = LARGE_OBJECT_NURSERY; - - pthread_mutex_lock(&space->lock); - pthread_mutex_lock(&space->object_tree_lock); - struct large_object_node *node = - large_object_tree_insert(&space->object_tree, k, v); - uintptr_t node_bits = (uintptr_t)node; - address_map_add(&space->object_map, addr, node_bits); - space->total_pages += npages; - pthread_mutex_unlock(&space->object_tree_lock); - pthread_mutex_unlock(&space->lock); - - return ret; -} - static int large_object_space_init(struct large_object_space *space, struct gc_heap *heap, diff --git a/src/mmc.c b/src/mmc.c index db7e1f512..0c9448e62 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -876,8 +876,6 @@ allocate_large(struct gc_mutator *mut, size_t size) { atomic_fetch_add(&heap->large_object_pages, npages); void *ret = large_object_space_alloc(lospace, npages); - if (!ret) - ret = large_object_space_obtain_and_alloc(lospace, npages); if (!ret) { perror("weird: we have the space but mmap didn't work"); diff --git a/src/pcc.c b/src/pcc.c index c480021f9..877827ed3 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -965,8 +965,6 @@ static void* allocate_large(struct 
gc_mutator *mut, size_t size) { atomic_fetch_add(&heap->large_object_pages, npages); void *ret = large_object_space_alloc(space, npages); - if (!ret) - ret = large_object_space_obtain_and_alloc(space, npages); if (!ret) { perror("weird: we have the space but mmap didn't work"); diff --git a/src/semi.c b/src/semi.c index 86541f913..833cfabef 100644 --- a/src/semi.c +++ b/src/semi.c @@ -496,8 +496,6 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { collect_for_large_alloc(mut, npages); void *ret = large_object_space_alloc(space, npages); - if (!ret) - ret = large_object_space_obtain_and_alloc(space, npages); if (!ret) { perror("weird: we have the space but mmap didn't work"); From 521cd44ebd85a5b093efe8f7a66abfdd2c89e22f Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 7 Mar 2025 10:58:05 +0100 Subject: [PATCH 392/403] Add gc_allocation_kind argument to gc_allocate Adapt all users. Will eventually allow for mmc to have untagged allocations. --- api/bdw-attrs.h | 20 +++++++++-- api/gc-allocation-kind.h | 19 ++++++++++ api/gc-api.h | 65 ++++++++++++++++++++--------------- api/gc-attrs.h | 6 ++-- api/mmc-attrs.h | 5 +-- api/pcc-attrs.h | 4 +-- api/semi-attrs.h | 5 +-- benchmarks/simple-allocator.h | 4 +-- src/bdw.c | 44 ++++++++++++++++++------ src/mmc.c | 22 +++++++----- src/nofl-space.h | 5 +-- src/pcc.c | 17 +++++---- src/semi.c | 17 +++++---- 13 files changed, 158 insertions(+), 75 deletions(-) create mode 100644 api/gc-allocation-kind.h diff --git a/api/bdw-attrs.h b/api/bdw-attrs.h index 7c03144b5..7f8000b3f 100644 --- a/api/bdw-attrs.h +++ b/api/bdw-attrs.h @@ -21,15 +21,29 @@ static inline size_t gc_allocator_allocation_limit_offset(void) { GC_CRASH(); } -static inline size_t gc_allocator_freelist_offset(size_t size) { +static inline size_t gc_allocator_freelist_offset(size_t size, + enum gc_allocation_kind kind) { GC_ASSERT(size); - return sizeof(void*) * ((size - 1) / gc_allocator_small_granule_size()); + size_t base; + switch (kind) { + case GC_ALLOCATION_TAGGED: + case GC_ALLOCATION_UNTAGGED_CONSERVATIVE: + base = 0; + break; + case GC_ALLOCATION_UNTAGGED_POINTERLESS: + case GC_ALLOCATION_TAGGED_POINTERLESS: + base = (sizeof(void*) * gc_allocator_large_threshold() / + gc_allocator_small_granule_size()); + break; + } + size_t bucket = (size - 1) / gc_allocator_small_granule_size(); + return base + sizeof(void*) * bucket; } static inline size_t gc_allocator_alloc_table_alignment(void) { return 0; } -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { +static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind) { GC_CRASH(); } static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { diff --git a/api/gc-allocation-kind.h b/api/gc-allocation-kind.h new file mode 100644 index 000000000..72de3b6be --- /dev/null +++ b/api/gc-allocation-kind.h @@ -0,0 +1,19 @@ +#ifndef GC_ALLOCATION_KIND_H +#define GC_ALLOCATION_KIND_H + +enum gc_allocation_kind { + // An object whose type can be inspected at run-time based on its contents, + // and whose fields be traced via the gc_trace_object procedure. + GC_ALLOCATION_TAGGED, + // Like GC_ALLOCATION_TAGGED, but not containing any fields that reference + // GC-managed objects. The GC may choose to handle these specially. + GC_ALLOCATION_TAGGED_POINTERLESS, + // A raw allocation whose type cannot be inspected at trace-time, and whose + // fields should be traced conservatively. 
+ GC_ALLOCATION_UNTAGGED_CONSERVATIVE, + // A raw allocation whose type cannot be inspected at trace-time, but + // containing no fields that reference GC-managed objects. + GC_ALLOCATION_UNTAGGED_POINTERLESS +}; + +#endif // GC_ALLOCATION_KIND_H diff --git a/api/gc-api.h b/api/gc-api.h index 2b08d18b3..245784b33 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -2,6 +2,7 @@ #define GC_API_H_ #include "gc-config.h" +#include "gc-allocation-kind.h" #include "gc-assert.h" #include "gc-attrs.h" #include "gc-collection-kind.h" @@ -56,10 +57,10 @@ GC_API_ void* gc_call_without_gc(struct gc_mutator *mut, void* (*f)(void*), GC_API_ void gc_collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind); -static inline void gc_update_alloc_table(struct gc_ref obj, - size_t size) GC_ALWAYS_INLINE; -static inline void gc_update_alloc_table(struct gc_ref obj, - size_t size) { +static inline void gc_update_alloc_table(struct gc_ref obj, size_t size, + enum gc_allocation_kind kind) GC_ALWAYS_INLINE; +static inline void gc_update_alloc_table(struct gc_ref obj, size_t size, + enum gc_allocation_kind kind) { size_t alignment = gc_allocator_alloc_table_alignment(); if (!alignment) return; @@ -69,7 +70,7 @@ static inline void gc_update_alloc_table(struct gc_ref obj, uintptr_t granule = (addr & (alignment - 1)) / granule_size; uint8_t *alloc = (uint8_t*)(base + granule); - uint8_t begin_pattern = gc_allocator_alloc_table_begin_pattern(); + uint8_t begin_pattern = gc_allocator_alloc_table_begin_pattern(kind); uint8_t end_pattern = gc_allocator_alloc_table_end_pattern(); if (end_pattern) { size_t granules = size / granule_size; @@ -86,11 +87,15 @@ static inline void gc_update_alloc_table(struct gc_ref obj, } } -GC_API_ void* gc_allocate_slow(struct gc_mutator *mut, size_t bytes) GC_NEVER_INLINE; +GC_API_ void* gc_allocate_slow(struct gc_mutator *mut, size_t bytes, + enum gc_allocation_kind kind) GC_NEVER_INLINE; static inline void* -gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; -static inline void* gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, size_t size) { +gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) GC_ALWAYS_INLINE; +static inline void* gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, + size_t size, + enum gc_allocation_kind kind) { GC_ASSERT(size <= gc_allocator_large_threshold()); size_t granule_size = gc_allocator_small_granule_size(); @@ -111,17 +116,20 @@ static inline void* gc_allocate_small_fast_bump_pointer(struct gc_mutator *mut, *hp_loc = new_hp; - gc_update_alloc_table(gc_ref(hp), size); + gc_update_alloc_table(gc_ref(hp), size, kind); return (void*)hp; } static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut, - size_t size) GC_ALWAYS_INLINE; -static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut, size_t size) { + size_t size, + enum gc_allocation_kind kind) GC_ALWAYS_INLINE; +static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut, + size_t size, + enum gc_allocation_kind kind) { GC_ASSERT(size <= gc_allocator_large_threshold()); - size_t freelist_offset = gc_allocator_freelist_offset(size); + size_t freelist_offset = gc_allocator_freelist_offset(size, kind); uintptr_t base_addr = (uintptr_t)mut; void **freelist_loc = (void**)(base_addr + freelist_offset); @@ -131,21 +139,23 @@ static inline void* gc_allocate_small_fast_freelist(struct gc_mutator *mut, size *freelist_loc = *(void**)head; - 
gc_update_alloc_table(gc_ref_from_heap_object(head), size); + gc_update_alloc_table(gc_ref_from_heap_object(head), size, kind); return head; } -static inline void* gc_allocate_small_fast(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; -static inline void* gc_allocate_small_fast(struct gc_mutator *mut, size_t size) { +static inline void* gc_allocate_small_fast(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) GC_ALWAYS_INLINE; +static inline void* gc_allocate_small_fast(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) { GC_ASSERT(size != 0); GC_ASSERT(size <= gc_allocator_large_threshold()); switch (gc_allocator_kind()) { case GC_ALLOCATOR_INLINE_BUMP_POINTER: - return gc_allocate_small_fast_bump_pointer(mut, size); + return gc_allocate_small_fast_bump_pointer(mut, size, kind); case GC_ALLOCATOR_INLINE_FREELIST: - return gc_allocate_small_fast_freelist(mut, size); + return gc_allocate_small_fast_freelist(mut, size, kind); case GC_ALLOCATOR_INLINE_NONE: return NULL; default: @@ -153,27 +163,28 @@ static inline void* gc_allocate_small_fast(struct gc_mutator *mut, size_t size) } } -static inline void* gc_allocate_fast(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; -static inline void* gc_allocate_fast(struct gc_mutator *mut, size_t size) { +static inline void* gc_allocate_fast(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) GC_ALWAYS_INLINE; +static inline void* gc_allocate_fast(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) { GC_ASSERT(size != 0); if (size > gc_allocator_large_threshold()) return NULL; - return gc_allocate_small_fast(mut, size); + return gc_allocate_small_fast(mut, size, kind); } -static inline void* gc_allocate(struct gc_mutator *mut, size_t size) GC_ALWAYS_INLINE; -static inline void* gc_allocate(struct gc_mutator *mut, size_t size) { - void *ret = gc_allocate_fast(mut, size); +static inline void* gc_allocate(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) GC_ALWAYS_INLINE; +static inline void* gc_allocate(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) { + void *ret = gc_allocate_fast(mut, size, kind); if (GC_LIKELY(ret != NULL)) return ret; - return gc_allocate_slow(mut, size); + return gc_allocate_slow(mut, size, kind); } -// FIXME: remove :P -GC_API_ void* gc_allocate_pointerless(struct gc_mutator *mut, size_t bytes); - GC_API_ int gc_object_is_old_generation_slow(struct gc_mutator *mut, struct gc_ref obj) GC_NEVER_INLINE; diff --git a/api/gc-attrs.h b/api/gc-attrs.h index e7ea8c8f3..44d5d47e6 100644 --- a/api/gc-attrs.h +++ b/api/gc-attrs.h @@ -2,6 +2,7 @@ #define GC_ATTRS_H #include "gc-inline.h" +#include "gc-allocation-kind.h" #include #include @@ -19,10 +20,11 @@ static inline size_t gc_allocator_small_granule_size(void) GC_ALWAYS_INLINE; static inline size_t gc_allocator_allocation_pointer_offset(void) GC_ALWAYS_INLINE; static inline size_t gc_allocator_allocation_limit_offset(void) GC_ALWAYS_INLINE; -static inline size_t gc_allocator_freelist_offset(size_t size) GC_ALWAYS_INLINE; +static inline size_t gc_allocator_freelist_offset(size_t size, + enum gc_allocation_kind kind) GC_ALWAYS_INLINE; static inline size_t gc_allocator_alloc_table_alignment(void) GC_ALWAYS_INLINE; -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) GC_ALWAYS_INLINE; +static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind kind) GC_ALWAYS_INLINE; static inline uint8_t gc_allocator_alloc_table_end_pattern(void) 
GC_ALWAYS_INLINE; enum gc_old_generation_check_kind { diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index 0c6162dc9..3241677dd 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -22,14 +22,15 @@ static inline size_t gc_allocator_allocation_limit_offset(void) { return sizeof(uintptr_t) * 1; } -static inline size_t gc_allocator_freelist_offset(size_t size) { +static inline size_t gc_allocator_freelist_offset(size_t size, + enum gc_allocation_kind kind) { GC_CRASH(); } static inline size_t gc_allocator_alloc_table_alignment(void) { return 4 * 1024 * 1024; } -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { +static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind kind) { return 1; } static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { diff --git a/api/pcc-attrs.h b/api/pcc-attrs.h index 064e5181a..12a555a5d 100644 --- a/api/pcc-attrs.h +++ b/api/pcc-attrs.h @@ -25,14 +25,14 @@ static inline size_t gc_allocator_allocation_limit_offset(void) { return sizeof(uintptr_t) * 1; } -static inline size_t gc_allocator_freelist_offset(size_t size) { +static inline size_t gc_allocator_freelist_offset(size_t size, enum gc_allocation_kind kind) { GC_CRASH(); } static inline size_t gc_allocator_alloc_table_alignment(void) { return 0; } -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { +static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind kind) { GC_CRASH(); } static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { diff --git a/api/semi-attrs.h b/api/semi-attrs.h index 763f45606..f2efbd831 100644 --- a/api/semi-attrs.h +++ b/api/semi-attrs.h @@ -24,14 +24,15 @@ static inline size_t gc_allocator_allocation_limit_offset(void) { return sizeof(uintptr_t) * 1; } -static inline size_t gc_allocator_freelist_offset(size_t size) { +static inline size_t gc_allocator_freelist_offset(size_t size, + enum gc_allocation_kind kind) { GC_CRASH(); } static inline size_t gc_allocator_alloc_table_alignment(void) { return 0; } -static inline uint8_t gc_allocator_alloc_table_begin_pattern(void) { +static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind kind) { GC_CRASH(); } static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { diff --git a/benchmarks/simple-allocator.h b/benchmarks/simple-allocator.h index 1edba85d3..09ed8f3be 100644 --- a/benchmarks/simple-allocator.h +++ b/benchmarks/simple-allocator.h @@ -6,14 +6,14 @@ static inline void* gc_allocate_with_kind(struct gc_mutator *mut, enum alloc_kind kind, size_t bytes) { - void *obj = gc_allocate(mut, bytes); + void *obj = gc_allocate(mut, bytes, GC_ALLOCATION_TAGGED); *tag_word(gc_ref_from_heap_object(obj)) = tag_live(kind); return obj; } static inline void* gc_allocate_pointerless_with_kind(struct gc_mutator *mut, enum alloc_kind kind, size_t bytes) { - void *obj = gc_allocate_pointerless(mut, bytes); + void *obj = gc_allocate(mut, bytes, GC_ALLOCATION_TAGGED_POINTERLESS); *tag_word(gc_ref_from_heap_object(obj)) = tag_live(kind); return obj; } diff --git a/src/bdw.c b/src/bdw.c index 5f90057a7..332e4a7ec 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -63,6 +63,7 @@ struct gc_heap { struct gc_mutator { void *freelists[GC_INLINE_FREELIST_COUNT]; + void *pointerless_freelists[GC_INLINE_FREELIST_COUNT]; struct gc_heap *heap; struct gc_mutator_roots *roots; struct gc_mutator *next; // with heap lock @@ -122,27 +123,48 @@ allocate_small(void **freelist, size_t idx, enum gc_inline_kind kind) { } *freelist = *(void 
**)(head); + + if (kind == GC_INLINE_KIND_POINTERLESS) + memset(head, 0, gc_inline_freelist_object_size(idx)); + return head; } -void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { +void* gc_allocate_slow(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) { GC_ASSERT(size != 0); if (size <= gc_allocator_large_threshold()) { size_t idx = gc_inline_bytes_to_freelist_index(size); - return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL); + void **freelists; + enum gc_inline_kind freelist_kind; + switch (kind) { + case GC_ALLOCATION_TAGGED: + case GC_ALLOCATION_UNTAGGED_CONSERVATIVE: + return allocate_small(&mut->freelists[idx], idx, GC_INLINE_KIND_NORMAL); + case GC_ALLOCATION_TAGGED_POINTERLESS: + case GC_ALLOCATION_UNTAGGED_POINTERLESS: + return allocate_small(&mut->pointerless_freelists[idx], idx, + GC_INLINE_KIND_POINTERLESS); + default: + GC_CRASH(); + } } else { - return GC_malloc(size); + switch (kind) { + case GC_ALLOCATION_TAGGED: + case GC_ALLOCATION_UNTAGGED_CONSERVATIVE: + return GC_malloc(size); + case GC_ALLOCATION_TAGGED_POINTERLESS: + case GC_ALLOCATION_UNTAGGED_POINTERLESS: { + void *ret = GC_malloc_atomic(size); + memset(ret, 0, size); + return ret; + } + default: + GC_CRASH(); + } } } -void* gc_allocate_pointerless(struct gc_mutator *mut, - size_t size) { - // Because the BDW API requires us to implement a custom marker so - // that the pointerless freelist gets traced, even though it's in a - // pointerless region, we punt on thread-local pointerless freelists. - return GC_malloc_atomic(size); -} - void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { // Nothing to do. } diff --git a/src/mmc.c b/src/mmc.c index 0c9448e62..d5716558f 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -891,7 +891,15 @@ collect_for_small_allocation(void *mut) { } void* -gc_allocate_slow(struct gc_mutator *mut, size_t size) { +gc_allocate_slow(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) { + if (GC_UNLIKELY(kind != GC_ALLOCATION_TAGGED + && kind != GC_ALLOCATION_TAGGED_POINTERLESS)) { + fprintf(stderr, "mmc collector cannot make allocations of kind %d\n", + (int)kind); + GC_CRASH(); + } + GC_ASSERT(size > 0); // allocating 0 bytes would be silly if (size > gc_allocator_large_threshold()) @@ -900,12 +908,7 @@ gc_allocate_slow(struct gc_mutator *mut, size_t size) { return gc_ref_heap_object(nofl_allocate(&mut->allocator, heap_nofl_space(mutator_heap(mut)), size, collect_for_small_allocation, - mut)); -} - -void* -gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { - return gc_allocate(mut, size); + mut, kind)); } void @@ -952,7 +955,8 @@ gc_write_barrier_slow(struct gc_mutator *mut, struct gc_ref obj, struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { struct gc_ref ret = - gc_ref_from_heap_object(gc_allocate(mut, gc_ephemeron_size())); + gc_ref_from_heap_object(gc_allocate(mut, gc_ephemeron_size(), + GC_ALLOCATION_TAGGED)); nofl_space_set_ephemeron_flag(ret); return gc_ref_heap_object(ret); } @@ -977,7 +981,7 @@ gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { - return gc_allocate(mut, gc_finalizer_size()); + return gc_allocate(mut, gc_finalizer_size(), GC_ALLOCATION_TAGGED); } void diff --git a/src/nofl-space.h b/src/nofl-space.h index 9e7245ebf..2ad64dc4f 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -871,7 +871,8 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc, static struct gc_ref nofl_allocate(struct 
nofl_allocator *alloc, struct nofl_space *space, - size_t size, void (*gc)(void*), void *gc_data) { + size_t size, void (*gc)(void*), void *gc_data, + enum gc_allocation_kind kind) { GC_ASSERT(size > 0); GC_ASSERT(size <= gc_allocator_large_threshold()); size = align_up(size, NOFL_GRANULE_SIZE); @@ -890,7 +891,7 @@ nofl_allocate(struct nofl_allocator *alloc, struct nofl_space *space, struct gc_ref ret = gc_ref(alloc->alloc); alloc->alloc += size; - gc_update_alloc_table(ret, size); + gc_update_alloc_table(ret, size, kind); return ret; } diff --git a/src/pcc.c b/src/pcc.c index 877827ed3..e4c4e37e3 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -978,7 +978,14 @@ static void get_more_empty_blocks_for_mutator(void *mut) { trigger_collection(mut, GC_COLLECTION_MINOR); } -void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { +void* gc_allocate_slow(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) { + if (GC_UNLIKELY(kind != GC_ALLOCATION_TAGGED + && kind != GC_ALLOCATION_TAGGED_POINTERLESS)) { + fprintf(stderr, "pcc collector cannot make allocations of kind %d\n", + (int)kind); + GC_CRASH(); + } GC_ASSERT(size > 0); // allocating 0 bytes would be silly if (size > gc_allocator_large_threshold()) @@ -998,10 +1005,6 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { return gc_ref_heap_object(ret); } -void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { - return gc_allocate(mut, size); -} - void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { GC_CRASH(); } @@ -1056,7 +1059,7 @@ void gc_safepoint_slow(struct gc_mutator *mut) { } struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { - return gc_allocate(mut, gc_ephemeron_size()); + return gc_allocate(mut, gc_ephemeron_size(), GC_ALLOCATION_TAGGED); } void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, @@ -1077,7 +1080,7 @@ unsigned gc_heap_ephemeron_trace_epoch(struct gc_heap *heap) { } struct gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { - return gc_allocate(mut, gc_finalizer_size()); + return gc_allocate(mut, gc_finalizer_size(), GC_ALLOCATION_TAGGED); } void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, diff --git a/src/semi.c b/src/semi.c index 833cfabef..b1abee836 100644 --- a/src/semi.c +++ b/src/semi.c @@ -505,7 +505,15 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { return ret; } -void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { +void* gc_allocate_slow(struct gc_mutator *mut, size_t size, + enum gc_allocation_kind kind) { + if (GC_UNLIKELY(kind != GC_ALLOCATION_TAGGED + && kind != GC_ALLOCATION_TAGGED_POINTERLESS)) { + fprintf(stderr, "semispace collector cannot make allocations of kind %d\n", + (int)kind); + GC_CRASH(); + } + if (size > gc_allocator_large_threshold()) return allocate_large(mut, size); @@ -522,16 +530,13 @@ void* gc_allocate_slow(struct gc_mutator *mut, size_t size) { return (void *)addr; } } -void* gc_allocate_pointerless(struct gc_mutator *mut, size_t size) { - return gc_allocate(mut, size); -} void gc_pin_object(struct gc_mutator *mut, struct gc_ref ref) { GC_CRASH(); } struct gc_ephemeron* gc_allocate_ephemeron(struct gc_mutator *mut) { - return gc_allocate(mut, gc_ephemeron_size()); + return gc_allocate(mut, gc_ephemeron_size(), GC_ALLOCATION_TAGGED); } void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, @@ -540,7 +545,7 @@ void gc_ephemeron_init(struct gc_mutator *mut, struct gc_ephemeron *ephemeron, } struct 
gc_finalizer* gc_allocate_finalizer(struct gc_mutator *mut) { - return gc_allocate(mut, gc_finalizer_size()); + return gc_allocate(mut, gc_finalizer_size(), GC_ALLOCATION_TAGGED); } void gc_finalizer_attach(struct gc_mutator *mut, struct gc_finalizer *finalizer, From d22eb889482f3022eaeaab17bd67123d0e2f9990 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 7 Mar 2025 12:54:29 +0100 Subject: [PATCH 393/403] nofl space / mmc supports untagged allocations --- api/mmc-attrs.h | 33 +++++++++++- src/gc-trace.h | 12 +++++ src/large-object-space.h | 28 +++++++++-- src/mmc.c | 106 ++++++++++++++++++++++++++------------- src/nofl-space.h | 79 +++++++++++++++++++++++------ src/pcc.c | 2 +- src/semi.c | 2 +- 7 files changed, 203 insertions(+), 59 deletions(-) diff --git a/api/mmc-attrs.h b/api/mmc-attrs.h index 3241677dd..9371f8abe 100644 --- a/api/mmc-attrs.h +++ b/api/mmc-attrs.h @@ -31,10 +31,39 @@ static inline size_t gc_allocator_alloc_table_alignment(void) { return 4 * 1024 * 1024; } static inline uint8_t gc_allocator_alloc_table_begin_pattern(enum gc_allocation_kind kind) { - return 1; + uint8_t young = 1; + uint8_t trace_precisely = 0; + uint8_t trace_none = 8; + uint8_t trace_conservatively = 16; + uint8_t pinned = 16; + if (GC_CONSERVATIVE_TRACE) { + switch (kind) { + case GC_ALLOCATION_TAGGED: + case GC_ALLOCATION_UNTAGGED_CONSERVATIVE: + return young | trace_conservatively; + case GC_ALLOCATION_TAGGED_POINTERLESS: + return young | trace_none; + case GC_ALLOCATION_UNTAGGED_POINTERLESS: + return young | trace_none; + default: + GC_CRASH(); + }; + } else { + switch (kind) { + case GC_ALLOCATION_TAGGED: + return young | trace_precisely; + case GC_ALLOCATION_TAGGED_POINTERLESS: + return young | trace_none; + case GC_ALLOCATION_UNTAGGED_POINTERLESS: + return young | trace_none | pinned; + case GC_ALLOCATION_UNTAGGED_CONSERVATIVE: + default: + GC_CRASH(); + }; + } } static inline uint8_t gc_allocator_alloc_table_end_pattern(void) { - return 16; + return 32; } static inline enum gc_old_generation_check_kind gc_old_generation_check_kind(size_t obj_size) { diff --git a/src/gc-trace.h b/src/gc-trace.h index b9e4691e8..cc1dd2808 100644 --- a/src/gc-trace.h +++ b/src/gc-trace.h @@ -28,6 +28,18 @@ static inline int gc_has_conservative_roots(void) { gc_has_global_conservative_roots(); } +enum gc_trace_kind { + GC_TRACE_PRECISELY, + GC_TRACE_NONE, + GC_TRACE_CONSERVATIVELY, + GC_TRACE_EPHEMERON, +}; + +struct gc_trace_plan { + enum gc_trace_kind kind; + size_t size; // For conservative tracing. 
+}; + static inline int gc_conservative_ref_might_be_a_heap_object(struct gc_conservative_ref ref, int possibly_interior) { diff --git a/src/large-object-space.h b/src/large-object-space.h index 887cdef0e..cdd798343 100644 --- a/src/large-object-space.h +++ b/src/large-object-space.h @@ -11,6 +11,7 @@ #include "gc-assert.h" #include "gc-ref.h" #include "gc-conservative-ref.h" +#include "gc-trace.h" #include "address-map.h" #include "address-set.h" #include "background-thread.h" @@ -35,6 +36,7 @@ struct large_object { struct large_object_node; struct large_object_live_data { uint8_t mark; + enum gc_trace_kind trace; }; struct large_object_dead_data { uint8_t age; @@ -166,14 +168,27 @@ large_object_space_start_gc(struct large_object_space *space, int is_minor_gc) { } } -static inline size_t -large_object_space_object_size(struct large_object_space *space, - struct gc_ref ref) { +static inline struct gc_trace_plan +large_object_space_object_trace_plan(struct large_object_space *space, + struct gc_ref ref) { uintptr_t node_bits = address_map_lookup(&space->object_map, gc_ref_value(ref), 0); GC_ASSERT(node_bits); struct large_object_node *node = (struct large_object_node*) node_bits; - return node->key.size; + switch (node->value.live.trace) { + case GC_TRACE_PRECISELY: + return (struct gc_trace_plan){ GC_TRACE_PRECISELY, }; + case GC_TRACE_NONE: + return (struct gc_trace_plan){ GC_TRACE_NONE, }; +#if GC_CONSERVATIVE_TRACE + case GC_TRACE_CONSERVATIVELY: { + return (struct gc_trace_plan){ GC_TRACE_CONSERVATIVELY, node->key.size }; + } + // No large ephemerons. +#endif + default: + GC_CRASH(); + } } static uint8_t* @@ -402,7 +417,8 @@ large_object_space_mark_conservative_ref(struct large_object_space *space, } static void* -large_object_space_alloc(struct large_object_space *space, size_t npages) { +large_object_space_alloc(struct large_object_space *space, size_t npages, + enum gc_trace_kind trace) { void *ret = NULL; pthread_mutex_lock(&space->lock); @@ -422,6 +438,7 @@ large_object_space_alloc(struct large_object_space *space, size_t npages) { node->value.is_live = 1; memset(&node->value.live, 0, sizeof(node->value.live)); node->value.live.mark = LARGE_OBJECT_NURSERY; + node->value.live.trace = trace; // If the hole is actually too big, trim its tail. if (node->key.size > size) { @@ -458,6 +475,7 @@ large_object_space_alloc(struct large_object_space *space, size_t npages) { struct large_object_data v = {0,}; v.is_live = 1; v.live.mark = LARGE_OBJECT_NURSERY; + v.live.trace = trace; pthread_mutex_lock(&space->object_tree_lock); struct large_object_node *node = diff --git a/src/mmc.c b/src/mmc.c index d5716558f..081d7b83a 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -332,37 +332,41 @@ trace_conservative_edges(uintptr_t low, uintptr_t high, int possibly_interior, possibly_interior); } -static inline void -trace_one_conservatively(struct gc_ref ref, struct gc_heap *heap, - struct gc_trace_worker *worker) { - size_t bytes; +static inline struct gc_trace_plan +trace_plan(struct gc_heap *heap, struct gc_ref ref) { if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) { - // Generally speaking we trace conservatively and don't allow much - // in the way of incremental precise marking on a - // conservative-by-default heap. But, we make an exception for - // ephemerons. 
- if (GC_UNLIKELY(nofl_is_ephemeron(ref))) { - gc_trace_ephemeron(gc_ref_heap_object(ref), tracer_visit, heap, - worker); - return; - } - bytes = nofl_space_object_size(heap_nofl_space(heap), ref); + return nofl_space_object_trace_plan(heap_nofl_space(heap), ref); } else { - bytes = large_object_space_object_size(heap_large_object_space(heap), ref); + return large_object_space_object_trace_plan(heap_large_object_space(heap), + ref); } - // Intraheap edges are not interior. - int possibly_interior = 0; - trace_conservative_edges(gc_ref_value(ref), gc_ref_value(ref) + bytes, - possibly_interior, heap, worker); } static inline void trace_one(struct gc_ref ref, struct gc_heap *heap, struct gc_trace_worker *worker) { - if (gc_has_conservative_intraheap_edges()) - trace_one_conservatively(ref, heap, worker); - else - gc_trace_object(ref, tracer_visit, heap, worker, NULL); + struct gc_trace_plan plan = trace_plan(heap, ref); + switch (plan.kind) { + case GC_TRACE_PRECISELY: + gc_trace_object(ref, tracer_visit, heap, worker, NULL); + break; + case GC_TRACE_NONE: + break; + case GC_TRACE_CONSERVATIVELY: { + // Intraheap edges are not interior. + uintptr_t addr = gc_ref_value(ref); + int possibly_interior = 0; + trace_conservative_edges(addr, addr + plan.size, possibly_interior, + heap, worker); + break; + } + case GC_TRACE_EPHEMERON: + gc_trace_ephemeron(gc_ref_heap_object(ref), tracer_visit, heap, + worker); + break; + default: + GC_CRASH(); + } } static inline void @@ -860,8 +864,36 @@ gc_safepoint_slow(struct gc_mutator *mut) { heap_unlock(heap); } +static enum gc_trace_kind +compute_trace_kind(enum gc_allocation_kind kind) { + if (GC_CONSERVATIVE_TRACE) { + switch (kind) { + case GC_ALLOCATION_TAGGED: + case GC_ALLOCATION_UNTAGGED_CONSERVATIVE: + return GC_TRACE_CONSERVATIVELY; + case GC_ALLOCATION_TAGGED_POINTERLESS: + case GC_ALLOCATION_UNTAGGED_POINTERLESS: + return GC_TRACE_NONE; + default: + GC_CRASH(); + }; + } else { + switch (kind) { + case GC_ALLOCATION_TAGGED: + return GC_TRACE_PRECISELY; + case GC_ALLOCATION_TAGGED_POINTERLESS: + case GC_ALLOCATION_UNTAGGED_POINTERLESS: + return GC_TRACE_NONE; + case GC_ALLOCATION_UNTAGGED_CONSERVATIVE: + default: + GC_CRASH(); + }; + } +} + static void* -allocate_large(struct gc_mutator *mut, size_t size) { +allocate_large(struct gc_mutator *mut, size_t size, + enum gc_trace_kind kind) { struct gc_heap *heap = mutator_heap(mut); struct nofl_space *nofl_space = heap_nofl_space(heap); struct large_object_space *lospace = heap_large_object_space(heap); @@ -875,7 +907,7 @@ allocate_large(struct gc_mutator *mut, size_t size) { trigger_collection(mut, GC_COLLECTION_COMPACTING, 0); atomic_fetch_add(&heap->large_object_pages, npages); - void *ret = large_object_space_alloc(lospace, npages); + void *ret = large_object_space_alloc(lospace, npages, kind); if (!ret) { perror("weird: we have the space but mmap didn't work"); @@ -893,17 +925,10 @@ collect_for_small_allocation(void *mut) { void* gc_allocate_slow(struct gc_mutator *mut, size_t size, enum gc_allocation_kind kind) { - if (GC_UNLIKELY(kind != GC_ALLOCATION_TAGGED - && kind != GC_ALLOCATION_TAGGED_POINTERLESS)) { - fprintf(stderr, "mmc collector cannot make allocations of kind %d\n", - (int)kind); - GC_CRASH(); - } - GC_ASSERT(size > 0); // allocating 0 bytes would be silly if (size > gc_allocator_large_threshold()) - return allocate_large(mut, size); + return allocate_large(mut, size, compute_trace_kind(kind)); return gc_ref_heap_object(nofl_allocate(&mut->allocator, 
heap_nofl_space(mutator_heap(mut)), @@ -1121,7 +1146,20 @@ gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, GC_ASSERT_EQ(gc_allocator_allocation_limit_offset(), offsetof(struct nofl_allocator, sweep)); GC_ASSERT_EQ(gc_allocator_alloc_table_alignment(), NOFL_SLAB_SIZE); - GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(), NOFL_METADATA_BYTE_YOUNG); + GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(GC_ALLOCATION_TAGGED), + NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_TRACE_PRECISELY); + GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(GC_ALLOCATION_TAGGED_POINTERLESS), + NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_TRACE_NONE); + if (GC_CONSERVATIVE_TRACE) { + GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(GC_ALLOCATION_UNTAGGED_CONSERVATIVE), + NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_TRACE_CONSERVATIVELY); + GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(GC_ALLOCATION_UNTAGGED_POINTERLESS), + NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_TRACE_NONE); + } else { + GC_ASSERT_EQ(gc_allocator_alloc_table_begin_pattern(GC_ALLOCATION_UNTAGGED_POINTERLESS), + NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_TRACE_NONE | + NOFL_METADATA_BYTE_PINNED); + } GC_ASSERT_EQ(gc_allocator_alloc_table_end_pattern(), NOFL_METADATA_BYTE_END); if (GC_GENERATIONAL) { GC_ASSERT_EQ(gc_write_barrier_field_table_alignment(), NOFL_SLAB_SIZE); diff --git a/src/nofl-space.h b/src/nofl-space.h index 2ad64dc4f..9a7f29304 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -183,6 +183,11 @@ struct nofl_allocator { struct nofl_block_ref block; }; +#if GC_CONSERVATIVE_TRACE && GC_CONCURRENT_TRACE +// There are just not enough bits in the mark table. +#error Unsupported configuration +#endif + // Each granule has one mark byte stored in a side table. A granule's // mark state is a whole byte instead of a bit to facilitate parallel // marking. (Parallel markers are allowed to race.) 
We also use this @@ -236,32 +241,32 @@ enum nofl_metadata_byte { NOFL_METADATA_BYTE_YOUNG = 1, NOFL_METADATA_BYTE_MARK_0 = 2, NOFL_METADATA_BYTE_MARK_1 = 3, -#if GC_CONCURRENT_TRACE NOFL_METADATA_BYTE_MARK_2 = 4, NOFL_METADATA_BYTE_MARK_MASK = 7, - /* NOFL_METADATA_BYTE_UNUSED_0 = 8, */ -#else - NOFL_METADATA_BYTE_MARK_MASK = 3, - /* NOFL_METADATA_BYTE_UNUSED_0 = 4, */ - /* NOFL_METADATA_BYTE_UNUSED_1 = 8, */ -#endif - NOFL_METADATA_BYTE_END = 16, - NOFL_METADATA_BYTE_PINNED = 32, + NOFL_METADATA_BYTE_TRACE_PRECISELY = 0, + NOFL_METADATA_BYTE_TRACE_NONE = 8, + NOFL_METADATA_BYTE_TRACE_CONSERVATIVELY = 16, + NOFL_METADATA_BYTE_TRACE_EPHEMERON = 24, + NOFL_METADATA_BYTE_TRACE_KIND_MASK = 0|8|16|24, + NOFL_METADATA_BYTE_PINNED = 16, + NOFL_METADATA_BYTE_END = 32, NOFL_METADATA_BYTE_LOGGED_0 = 64, NOFL_METADATA_BYTE_LOGGED_1 = 128, - NOFL_METADATA_BYTE_EPHEMERON = NOFL_METADATA_BYTE_PINNED, }; +STATIC_ASSERT_EQ(0, + NOFL_METADATA_BYTE_TRACE_PRECISELY&NOFL_METADATA_BYTE_PINNED); +STATIC_ASSERT_EQ(0, + NOFL_METADATA_BYTE_TRACE_NONE&NOFL_METADATA_BYTE_PINNED); + static uint8_t nofl_advance_current_mark(uint8_t mark) { switch (mark) { case NOFL_METADATA_BYTE_MARK_0: return NOFL_METADATA_BYTE_MARK_1; case NOFL_METADATA_BYTE_MARK_1: -#if GC_CONCURRENT_TRACE return NOFL_METADATA_BYTE_MARK_2; case NOFL_METADATA_BYTE_MARK_2: -#endif return NOFL_METADATA_BYTE_MARK_0; default: GC_CRASH(); @@ -925,14 +930,16 @@ nofl_finish_sweeping(struct nofl_allocator *alloc, static inline int nofl_is_ephemeron(struct gc_ref ref) { uint8_t meta = *nofl_metadata_byte_for_addr(gc_ref_value(ref)); - return meta & NOFL_METADATA_BYTE_EPHEMERON; + uint8_t kind = meta & NOFL_METADATA_BYTE_TRACE_KIND_MASK; + return kind == NOFL_METADATA_BYTE_TRACE_EPHEMERON; } static void nofl_space_set_ephemeron_flag(struct gc_ref ref) { if (gc_has_conservative_intraheap_edges()) { uint8_t *metadata = nofl_metadata_byte_for_addr(gc_ref_value(ref)); - *metadata |= NOFL_METADATA_BYTE_EPHEMERON; + uint8_t byte = *metadata & ~NOFL_METADATA_BYTE_TRACE_KIND_MASK; + *metadata = byte | NOFL_METADATA_BYTE_TRACE_EPHEMERON; } } @@ -1465,8 +1472,8 @@ nofl_space_set_nonempty_mark(struct nofl_space *space, uint8_t *metadata, static inline void nofl_space_pin_object(struct nofl_space *space, struct gc_ref ref) { - // For the heap-conservative configuration, all objects are pinned, - // and we re-use the pinned bit to identify ephemerons. + // For the heap-conservative configuration, all objects are pinned, and we use + // the pinned bit instead to identify an object's trace kind. 
if (gc_has_conservative_intraheap_edges()) return; uint8_t *metadata = nofl_metadata_byte_for_object(ref); @@ -1721,6 +1728,46 @@ nofl_space_object_size(struct nofl_space *space, struct gc_ref ref) { return granules * NOFL_GRANULE_SIZE; } +static inline enum gc_trace_kind +nofl_metadata_byte_trace_kind(uint8_t byte) +{ + switch (byte & NOFL_METADATA_BYTE_TRACE_KIND_MASK) { + case NOFL_METADATA_BYTE_TRACE_PRECISELY: + return GC_TRACE_PRECISELY; + case NOFL_METADATA_BYTE_TRACE_NONE: + return GC_TRACE_NONE; +#if GC_CONSERVATIVE_TRACE + case NOFL_METADATA_BYTE_TRACE_CONSERVATIVELY: + return GC_TRACE_CONSERVATIVELY; + case NOFL_METADATA_BYTE_TRACE_EPHEMERON: + return GC_TRACE_EPHEMERON; +#endif + default: + GC_CRASH(); + } +} +static inline struct gc_trace_plan +nofl_space_object_trace_plan(struct nofl_space *space, struct gc_ref ref) { + uint8_t *loc = nofl_metadata_byte_for_object(ref); + uint8_t byte = atomic_load_explicit(loc, memory_order_relaxed); + enum gc_trace_kind kind = nofl_metadata_byte_trace_kind(byte); + switch (kind) { + case GC_TRACE_PRECISELY: + case GC_TRACE_NONE: + return (struct gc_trace_plan){ kind, }; +#if GC_CONSERVATIVE_TRACE + case GC_TRACE_CONSERVATIVELY: { + size_t granules = nofl_space_live_object_granules(loc); + return (struct gc_trace_plan){ kind, granules * NOFL_GRANULE_SIZE }; + } + case GC_TRACE_EPHEMERON: + return (struct gc_trace_plan){ kind, }; +#endif + default: + GC_CRASH(); + } +} + static struct nofl_slab* nofl_allocate_slabs(size_t nslabs) { return gc_platform_acquire_memory(nslabs * NOFL_SLAB_SIZE, NOFL_SLAB_SIZE); diff --git a/src/pcc.c b/src/pcc.c index e4c4e37e3..b605eb762 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -964,7 +964,7 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { trigger_collection(mut, GC_COLLECTION_COMPACTING); atomic_fetch_add(&heap->large_object_pages, npages); - void *ret = large_object_space_alloc(space, npages); + void *ret = large_object_space_alloc(space, npages, GC_TRACE_PRECISELY); if (!ret) { perror("weird: we have the space but mmap didn't work"); diff --git a/src/semi.c b/src/semi.c index b1abee836..0626e02b0 100644 --- a/src/semi.c +++ b/src/semi.c @@ -495,7 +495,7 @@ static void* allocate_large(struct gc_mutator *mut, size_t size) { while (!semi_space_steal_pages(semi_space, npages)) collect_for_large_alloc(mut, npages); - void *ret = large_object_space_alloc(space, npages); + void *ret = large_object_space_alloc(space, npages, GC_TRACE_PRECISELY); if (!ret) { perror("weird: we have the space but mmap didn't work"); From 3a86fedcded198a65d5c32cc519fe4bca77b9de1 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 7 Mar 2025 21:15:47 +0100 Subject: [PATCH 394/403] Update nofl metadata byte comment --- src/nofl-space.h | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/src/nofl-space.h b/src/nofl-space.h index 9a7f29304..2668232c6 100644 --- a/src/nofl-space.h +++ b/src/nofl-space.h @@ -197,32 +197,42 @@ struct nofl_allocator { // Because we want to allow for conservative roots, we need to know // whether an address indicates an object or not. That means that when // an object is allocated, it has to set a bit, somewhere. We use the -// metadata byte for this purpose, setting the "young" bit. +// metadata byte for this purpose, setting the "young" mark. 
// -// The "young" bit's name might make you think about generational +// The "young" mark's name might make you think about generational // collection, and indeed all objects collected in a minor collection // will have this bit set. However, the nofl space never needs to check -// for the young bit; if it weren't for the need to identify -// conservative roots, we wouldn't need a young bit at all. Perhaps in +// for the young mark; if it weren't for the need to identify +// conservative roots, we wouldn't need a young mark at all. Perhaps in // an all-precise system, we would be able to avoid the overhead of // initializing mark byte upon each fresh allocation. // -// When an object becomes dead after a GC, it will still have a bit set -// -- maybe the young bit, or maybe a survivor bit. The sweeper has to -// clear these bits before the next collection. But if we add +// When an object becomes dead after a GC, it will still have a mark set +// -- maybe the young mark, or maybe a survivor mark. The sweeper has +// to clear these marks before the next collection. If we add // concurrent marking, we will also be marking "live" objects, updating -// their mark bits. So there are four object states concurrently -// observable: young, dead, survivor, and marked. (We don't currently -// have concurrent marking, though.) Even though these states are -// mutually exclusive, we use separate bits for them because we have the -// space. After each collection, the dead, survivor, and marked states -// rotate by one bit. +// their mark bits. So there are three and possibly four object states +// concurrently observable: young, dead, survivor, and marked. (We +// don't currently have concurrent marking, though.) We store this +// state in the low 3 bits of the byte. After each major collection, +// the dead, survivor, and marked states rotate. +// +// It can be useful to support "raw" allocations, most often +// pointerless, but for compatibility with BDW-GC, sometimes +// conservatively-traced tagless data. We reserve one or two bits for +// the "kind" of the allocation: either a normal object traceable via +// `gc_trace_object`, a pointerless untagged allocation that doesn't +// need tracing, an allocation that should be traced conservatively, or +// an ephemeron. The latter two states are only used when conservative +// tracing is enabled. // // An object can be pinned, preventing it from being evacuated during // collection. Pinning does not keep the object alive; if it is // otherwise unreachable, it will be collected. To pin an object, a // running mutator can set the pinned bit, using atomic -// compare-and-swap. +// compare-and-swap. This bit overlaps the "trace conservatively" and +// "ephemeron" trace kinds, but that's OK because we don't use the +// pinned bit in those cases, as all objects are implicitly pinned. // // For generational collectors, the nofl space supports a field-logging // write barrier. The two logging bits correspond to the two words in a @@ -230,12 +240,6 @@ struct nofl_allocator { // the logged bit; if it is unset, it should try to atomically set the // bit, and if that works, then we record the field location as a // generational root, adding it to a sequential-store buffer. -// -// Finally, for heap-conservative collectors, nofl generally traces all -// objects in the same way, treating them as an array of conservative -// edges. But we need to know when we have an ephemeron. 
In that case, -// we re-use the pinned bit, because it's of no use to us anyway in that -// configuration, as all objects are pinned. enum nofl_metadata_byte { NOFL_METADATA_BYTE_NONE = 0, NOFL_METADATA_BYTE_YOUNG = 1, From 0cff6ffba484e7dd43acc0c5b5c0d1759b58275c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 13 Mar 2025 13:43:41 +0100 Subject: [PATCH 395/403] Basic stats also record CPU time --- api/gc-basic-stats.h | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/api/gc-basic-stats.h b/api/gc-basic-stats.h index 6b39a59c6..055340817 100644 --- a/api/gc-basic-stats.h +++ b/api/gc-basic-stats.h @@ -9,6 +9,7 @@ #include #include #include +#include GC_DEFINE_HISTOGRAM(gc_latency, 25, 4); @@ -16,8 +17,11 @@ struct gc_basic_stats { uint64_t major_collection_count; uint64_t minor_collection_count; uint64_t last_time_usec; + uint64_t last_cpu_time_usec; uint64_t elapsed_mutator_usec; uint64_t elapsed_collector_usec; + uint64_t cpu_mutator_usec; + uint64_t cpu_collector_usec; size_t heap_size; size_t max_heap_size; size_t max_live_data_size; @@ -33,18 +37,31 @@ static inline uint64_t gc_basic_stats_now(void) { return ret; } +static inline uint64_t gc_basic_stats_cpu_time(void) { + struct timespec ts; + clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &ts); + uint64_t ret = ts.tv_sec; + ret *= 1000 * 1000; + ret += ts.tv_nsec / 1000; + return ret; +} + static inline void gc_basic_stats_init(void *data, size_t heap_size) { struct gc_basic_stats *stats = data; memset(stats, 0, sizeof(*stats)); stats->last_time_usec = gc_basic_stats_now(); + stats->last_cpu_time_usec = gc_basic_stats_cpu_time(); stats->heap_size = stats->max_heap_size = heap_size; } static inline void gc_basic_stats_requesting_stop(void *data) { struct gc_basic_stats *stats = data; uint64_t now = gc_basic_stats_now(); + uint64_t cpu_time = gc_basic_stats_cpu_time(); stats->elapsed_mutator_usec += now - stats->last_time_usec; + stats->cpu_mutator_usec += cpu_time - stats->last_cpu_time_usec; stats->last_time_usec = now; + stats->last_cpu_time_usec = cpu_time; } static inline void gc_basic_stats_waiting_for_stop(void *data) {} static inline void gc_basic_stats_mutators_stopped(void *data) {} @@ -66,10 +83,14 @@ static inline void gc_basic_stats_finalizers_traced(void *data) {} static inline void gc_basic_stats_restarting_mutators(void *data) { struct gc_basic_stats *stats = data; uint64_t now = gc_basic_stats_now(); + uint64_t cpu_time = gc_basic_stats_cpu_time(); uint64_t pause_time = now - stats->last_time_usec; + uint64_t pause_cpu_time = cpu_time - stats->last_cpu_time_usec; stats->elapsed_collector_usec += pause_time; + stats->cpu_collector_usec += pause_cpu_time; gc_latency_record(&stats->pause_times, pause_time); stats->last_time_usec = now; + stats->last_cpu_time_usec = cpu_time; } static inline void* gc_basic_stats_mutator_added(void *data) { @@ -118,8 +139,11 @@ static inline void gc_basic_stats_live_data_size(void *data, size_t size) { static inline void gc_basic_stats_finish(struct gc_basic_stats *stats) { uint64_t now = gc_basic_stats_now(); + uint64_t cpu_time = gc_basic_stats_cpu_time(); stats->elapsed_mutator_usec += now - stats->last_time_usec; + stats->cpu_mutator_usec += cpu_time - stats->last_cpu_time_usec; stats->last_time_usec = now; + stats->last_cpu_time_usec = cpu_time; } static inline void gc_basic_stats_print(struct gc_basic_stats *stats, FILE *f) { @@ -127,10 +151,15 @@ static inline void gc_basic_stats_print(struct gc_basic_stats *stats, FILE *f) { 
stats->major_collection_count, stats->minor_collection_count); uint64_t stopped = stats->elapsed_collector_usec; uint64_t elapsed = stats->elapsed_mutator_usec + stopped; + uint64_t cpu_stopped = stats->cpu_collector_usec; + uint64_t cpu_total = stats->cpu_mutator_usec + cpu_stopped; uint64_t ms = 1000; // per usec fprintf(f, "%" PRIu64 ".%.3" PRIu64 " ms total time " + "(%" PRIu64 ".%.3" PRIu64 " stopped); " + "%" PRIu64 ".%.3" PRIu64 " ms CPU time " "(%" PRIu64 ".%.3" PRIu64 " stopped).\n", - elapsed / ms, elapsed % ms, stopped / ms, stopped % ms); + elapsed / ms, elapsed % ms, stopped / ms, stopped % ms, + cpu_total / ms, cpu_total % ms, cpu_stopped / ms, cpu_stopped % ms); uint64_t pause_median = gc_latency_median(&stats->pause_times); uint64_t pause_p95 = gc_latency_percentile(&stats->pause_times, 0.95); uint64_t pause_max = gc_latency_max(&stats->pause_times); From c51a48eae8b76a1985b7eb830fa25378689947fc Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 14 Mar 2025 09:15:03 +0100 Subject: [PATCH 396/403] Fix prototype of copy_space_add_to_allocation_counter --- src/copy-space.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/copy-space.h b/src/copy-space.h index 19b00e5fa..7f262c221 100644 --- a/src/copy-space.h +++ b/src/copy-space.h @@ -581,7 +581,7 @@ copy_space_can_allocate(struct copy_space *space, size_t bytes) { static void copy_space_add_to_allocation_counter(struct copy_space *space, - uintptr_t *counter) { + uint64_t *counter) { *counter += space->allocated_bytes - space->allocated_bytes_at_last_gc; } From 05e8aba462202278b2dcce233e7df2de2994059e Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Fri, 14 Mar 2025 09:44:18 +0100 Subject: [PATCH 397/403] Add gc_allocation_counter API --- api/gc-api.h | 2 ++ src/adaptive-heap-sizer.h | 25 ++++++++++++++----------- src/bdw.c | 4 ++++ src/mmc.c | 17 +++++++++++++++-- src/pcc.c | 17 +++++++++++++++-- src/semi.c | 4 ++++ 6 files changed, 54 insertions(+), 15 deletions(-) diff --git a/api/gc-api.h b/api/gc-api.h index 245784b33..78d8b2bdb 100644 --- a/api/gc-api.h +++ b/api/gc-api.h @@ -31,6 +31,8 @@ GC_API_ int gc_init(const struct gc_options *options, struct gc_event_listener event_listener, void *event_listener_data); +GC_API_ uint64_t gc_allocation_counter(struct gc_heap *heap); + GC_API_ struct gc_heap* gc_mutator_heap(struct gc_mutator *mut); GC_API_ uintptr_t gc_small_object_nursery_low_address(struct gc_heap *heap); diff --git a/src/adaptive-heap-sizer.h b/src/adaptive-heap-sizer.h index 6d3db05bb..225b44baf 100644 --- a/src/adaptive-heap-sizer.h +++ b/src/adaptive-heap-sizer.h @@ -109,17 +109,20 @@ gc_adaptive_heap_sizer_background_task(void *data) { gc_adaptive_heap_sizer_lock(sizer); uint64_t bytes_allocated = sizer->get_allocation_counter(sizer->heap); - uint64_t heartbeat = gc_platform_monotonic_nanoseconds(); - double rate = (double) (bytes_allocated - sizer->last_bytes_allocated) / - (double) (heartbeat - sizer->last_heartbeat); - // Just smooth the rate, under the assumption that the denominator is almost - // always 1. - sizer->smoothed_allocation_rate *= 1.0 - sizer->allocation_smoothing_factor; - sizer->smoothed_allocation_rate += rate * sizer->allocation_smoothing_factor; - sizer->last_heartbeat = heartbeat; - sizer->last_bytes_allocated = bytes_allocated; - sizer->set_heap_size(sizer->heap, - gc_adaptive_heap_sizer_calculate_size(sizer)); + // bytes_allocated being 0 means the request failed; retry later. 
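// (The rate update a few lines below is a plain exponentially weighted
// moving average.  The helper here is an illustration only, not part
// of the sizer; "factor" plays the role of allocation_smoothing_factor
// and lies in (0, 1].)
static inline double
sketch_smooth_rate(double smoothed, double sample, double factor) {
  // A larger factor weights the newest bytes-per-nanosecond sample more
  // heavily; a smaller one remembers more history.
  return smoothed * (1.0 - factor) + sample * factor;
}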
+ if (bytes_allocated) { + uint64_t heartbeat = gc_platform_monotonic_nanoseconds(); + double rate = (double) (bytes_allocated - sizer->last_bytes_allocated) / + (double) (heartbeat - sizer->last_heartbeat); + // Just smooth the rate, under the assumption that the denominator is almost + // always 1. + sizer->smoothed_allocation_rate *= 1.0 - sizer->allocation_smoothing_factor; + sizer->smoothed_allocation_rate += rate * sizer->allocation_smoothing_factor; + sizer->last_heartbeat = heartbeat; + sizer->last_bytes_allocated = bytes_allocated; + sizer->set_heap_size(sizer->heap, + gc_adaptive_heap_sizer_calculate_size(sizer)); + } gc_adaptive_heap_sizer_unlock(sizer); } diff --git a/src/bdw.c b/src/bdw.c index 332e4a7ec..18d1b893b 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -505,6 +505,10 @@ static void on_heap_resize(GC_word size) { HEAP_EVENT(heap_resized, size); } +uint64_t gc_allocation_counter(struct gc_heap *heap) { + return GC_get_total_bytes(); +} + int gc_init(const struct gc_options *options, struct gc_stack_addr *stack_base, struct gc_heap **heap, struct gc_mutator **mutator, struct gc_event_listener event_listener, diff --git a/src/mmc.c b/src/mmc.c index 081d7b83a..661b7084b 100644 --- a/src/mmc.c +++ b/src/mmc.c @@ -1080,12 +1080,25 @@ gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } -static uint64_t allocation_counter_from_thread(struct gc_heap *heap) { +// with heap lock +static uint64_t allocation_counter(struct gc_heap *heap) { uint64_t ret = heap->total_allocated_bytes_at_last_gc; - if (pthread_mutex_trylock(&heap->lock)) return ret; nofl_space_add_to_allocation_counter(heap_nofl_space(heap), &ret); large_object_space_add_to_allocation_counter(heap_large_object_space(heap), &ret); + return ret; +} + +uint64_t gc_allocation_counter(struct gc_heap *heap) { + pthread_mutex_lock(&heap->lock); + uint64_t ret = allocation_counter(heap); + pthread_mutex_unlock(&heap->lock); + return ret; +} + +static uint64_t allocation_counter_from_thread(struct gc_heap *heap) { + if (pthread_mutex_trylock(&heap->lock)) return 0; + uint64_t ret = allocation_counter(heap); pthread_mutex_unlock(&heap->lock); return ret; } diff --git a/src/pcc.c b/src/pcc.c index b605eb762..ca8be1c11 100644 --- a/src/pcc.c +++ b/src/pcc.c @@ -1149,12 +1149,25 @@ int gc_options_parse_and_set(struct gc_options *options, int option, return gc_common_options_parse_and_set(&options->common, option, value); } -static uint64_t allocation_counter_from_thread(struct gc_heap *heap) { +// with heap lock +static uint64_t allocation_counter(struct gc_heap *heap) { uint64_t ret = heap->total_allocated_bytes_at_last_gc; - if (pthread_mutex_trylock(&heap->lock)) return ret; copy_space_add_to_allocation_counter(heap_allocation_space(heap), &ret); large_object_space_add_to_allocation_counter(heap_large_object_space(heap), &ret); + return ret; +} + +uint64_t gc_allocation_counter(struct gc_heap *heap) { + pthread_mutex_lock(&heap->lock); + uint64_t ret = allocation_counter(heap); + pthread_mutex_unlock(&heap->lock); + return ret; +} + +static uint64_t allocation_counter_from_thread(struct gc_heap *heap) { + if (pthread_mutex_trylock(&heap->lock)) return 0; + uint64_t ret = allocation_counter(heap); pthread_mutex_unlock(&heap->lock); return ret; } diff --git a/src/semi.c b/src/semi.c index 0626e02b0..6f902534d 100644 --- a/src/semi.c +++ b/src/semi.c @@ -618,6 +618,10 @@ static uint64_t get_allocation_counter(struct gc_heap *heap) { return 
heap->total_allocated_bytes_at_last_gc; } +uint64_t gc_allocation_counter(struct gc_heap *heap) { + return get_allocation_counter(heap); +} + static void ignore_async_heap_size_adjustment(struct gc_heap *heap, size_t size) { } From d1aa0d894f2b5c1542e2f9a9f518da49b5ecf7b4 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 27 Mar 2025 12:00:21 +0100 Subject: [PATCH 398/403] bdw: Fix embarrassing failure to mark atomic freelists --- src/bdw.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/bdw.c b/src/bdw.c index 18d1b893b..ea446557d 100644 --- a/src/bdw.c +++ b/src/bdw.c @@ -399,6 +399,13 @@ mark_mutator(GC_word *addr, struct GC_ms_entry *mark_stack_ptr, state.mark_stack_limit, NULL); + for (int i = 0; i < GC_INLINE_FREELIST_COUNT; i++) + for (void *head = mut->pointerless_freelists[i]; head; head = *(void**)head) + state.mark_stack_ptr = GC_MARK_AND_PUSH (head, + state.mark_stack_ptr, + state.mark_stack_limit, + NULL); + if (mut->roots) gc_trace_mutator_roots(mut->roots, bdw_mark_edge, mut->heap, &state); From 0c35069f58e55d76eb86bf362725abed17981d08 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Apr 2025 12:13:48 +0200 Subject: [PATCH 399/403] Add autotools embedding files --- embed.am | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++++ whippet.m4 | 181 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 391 insertions(+) create mode 100644 embed.am create mode 100644 whippet.m4 diff --git a/embed.am b/embed.am new file mode 100644 index 000000000..81f93d287 --- /dev/null +++ b/embed.am @@ -0,0 +1,210 @@ +# Automake snippet for embedding Whippet in an autotools project. +# +# The including Makefile.am needs to do this, assuming Whippet is in the +# whippet/ subdirectory: +# +# noinst_LTLIBRARIES = +# WHIPPET_EMBEDDER_CPPFLAGS = -include src/my-embedder.h +# include whippet/embed.am +# +# my-embedder.h should provide the various hooks that Whippet needs to +# specialize itself to the embedder's object representation. +# +# The result is a libwhippet.la. To compile and link against it: +# +# AM_CFLAGS = $(WHIPPET_CPPFLAGS) $(WHIPPET_CFLAGS) $(WHIPPET_TO_EMBEDDER_CPPFLAGS) +# LDADD = libwhippet.la +# AM_LDFLAGS = $(WHIPPET_TO_EMBEDDER_LDFLAGS) +# +# The assumption is that the embedder will build a single copy of +# Whippet, specialized against a single collector, a single set of +# embedder hooks, and a single target platform. The collector and +# platform should be chosen at configure-time. Because Automake really +# wants the set of source files to be visible to it at automake-time, we +# need to implement these conditions via AM_CONDITIONAL in a +# configure.ac. For example for a parallel-mmc configuration on +# gnu-linux, we would need: +# +# AM_SUBST(WHIPPET_COLLECTOR, parallel-mmc) +# AM_CONDITIONAL(WHIPPET_COLLECTOR_SEMI, 0) +# AM_CONDITIONAL(WHIPPET_COLLECTOR_PCC, 0) +# AM_CONDITIONAL(WHIPPET_COLLECTOR_BDW, 0) +# AM_CONDITIONAL(WHIPPET_COLLECTOR_MMC, 1) +# AM_CONDITIONAL(WHIPPET_PLATFORM_GNU_LINUX, 1) +# +# Then there are other conditionals for compilation options: +# +# AM_CONDITIONAL(WHIPPET_ENABLE_DEBUG, 0) +# AM_CONDITIONAL(WHIPPET_USE_LTTNG, 1) +# +# Finally, LTO should be enabled, for best performance. This should be +# added to CFLAGS at configure-time. +# +# Getting all of this in there is gnarly. See the example configure.ac +# for one take on the topic. 
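# As a rough sketch (not the actual whippet.m4 contents), the
# conditionals above can be derived in configure.ac from a single
# collector name chosen at configure time, for example:
#
#   AC_SUBST([WHIPPET_COLLECTOR], [$whippet_collector])
#   AM_CONDITIONAL([WHIPPET_COLLECTOR_SEMI], [test "$whippet_collector" = semi])
#   AM_CONDITIONAL([WHIPPET_COLLECTOR_PCC],  [test "$whippet_collector" = pcc])
#   AM_CONDITIONAL([WHIPPET_COLLECTOR_BDW],  [test "$whippet_collector" = bdw])
#   AM_CONDITIONAL([WHIPPET_COLLECTOR_MMC],
#                  [echo "$whippet_collector" | grep -q 'mmc$'])
#
# The WHIPPET_PKG macros in whippet.m4, added alongside this file,
# package this derivation up together with the platform and tracing
# checks.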
+ +noinst_LTLIBRARIES += libwhippet-common.la libwhippet.la + +libwhippet_common_la_SOURCES = \ + %D%/src/gc-options-internal.h \ + %D%/src/gc-options.c \ + %D%/src/gc-stack.c \ + %D%/src/gc-stack.h \ + %D%/src/gc-tracepoint.c + +if WHIPPET_PLATFORM_GNU_LINUX +libwhippet_common_la_SOURCES += %D%/src/gc-platform-gnu-linux.c +endif + +libwhippet_la_SOURCES = \ + %D%/src/adaptive-heap-sizer.h \ + %D%/src/address-hash.h \ + %D%/src/address-map.h \ + %D%/src/address-set.h \ + %D%/src/assert.h \ + %D%/src/background-thread.h \ + %D%/src/copy-space.h \ + %D%/src/debug.h \ + %D%/src/extents.h \ + %D%/src/field-set.h \ + %D%/src/freelist.h \ + %D%/src/gc-align.h \ + %D%/src/gc-ephemeron-internal.h \ + %D%/src/gc-ephemeron.c \ + %D%/src/gc-finalizer-internal.h \ + %D%/src/gc-finalizer.c \ + %D%/src/gc-internal.h \ + %D%/src/gc-lock.h \ + %D%/src/gc-platform.h \ + %D%/src/gc-trace.h \ + %D%/src/generational-roots.h \ + %D%/src/growable-heap-sizer.h \ + %D%/src/heap-sizer.h \ + %D%/src/large-object-space.h \ + %D%/src/local-worklist.h \ + %D%/src/nofl-space.h \ + %D%/src/nursery.h \ + %D%/src/parallel-tracer.h \ + %D%/src/remembered-edges.h \ + %D%/src/root.h \ + %D%/src/root-worklist.h \ + %D%/src/serial-tracer.h \ + %D%/src/shared-worklist.h \ + %D%/src/simple-worklist.h \ + %D%/src/spin.h \ + %D%/src/splay-tree.h \ + %D%/src/swar.h \ + %D%/src/tracer.h + +WHIPPET_CFLAGS_bdw = -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 +WHIPPET_CFLAGS_semi = -DGC_PRECISE_ROOTS=1 +WHIPPET_CFLAGS_pcc = -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 +WHIPPET_CFLAGS_generational_pcc = $(WHIPPET_CFLAGS_pcc) -DGC_GENERATIONAL=1 +WHIPPET_CFLAGS_mmc = \ + -DGC_PRECISE_ROOTS=1 +WHIPPET_CFLAGS_generational_mmc = \ + -DGC_PRECISE_ROOTS=1 -DGC_GENERATIONAL=1 +WHIPPET_CFLAGS_parallel_mmc = \ + -DGC_PRECISE_ROOTS=1 -DGC_PARALLEL=1 +WHIPPET_CFLAGS_parallel_generational_mmc = \ + -DGC_PRECISE_ROOTS=1 -DGC_GENERATIONAL=1 -DGC_PARALLEL=1 +WHIPPET_CFLAGS_stack_conservative_mmc = \ + -DGC_CONSERVATIVE_ROOTS=1 +WHIPPET_CFLAGS_stack_conservative_generational_mmc = \ + -DGC_CONSERVATIVE_ROOTS=1 -DGC_GENERATIONAL=1 +WHIPPET_CFLAGS_stack_conservative_parallel_mmc = \ + -DGC_CONSERVATIVE_ROOTS=1 -DGC_PARALLEL=1 +WHIPPET_CFLAGS_stack_conservative_parallel_generational_mmc = \ + -DGC_CONSERVATIVE_ROOTS=1 -DGC_GENERATIONAL=1 -DGC_PARALLEL=1 +WHIPPET_CFLAGS_heap_conservative_mmc = \ + -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 +WHIPPET_CFLAGS_heap_conservative_generational_mmc = \ + -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_GENERATIONAL=1 +WHIPPET_CFLAGS_heap_conservative_parallel_mmc = \ + -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_PARALLEL=1 +WHIPPET_CFLAGS_heap_conservative_parallel_generational_mmc = \ + -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 -DGC_GENERATIONAL=1 -DGC_PARALLEL=1 + +WHIPPET_CFLAGS = $(WHIPPET_CFLAGS_$(subst -,_,$(WHIPPET_COLLECTOR))) +WHIPPET_IMPL_CFLAGS = +WHIPPET_LIBS = -lm +WHIPPET_CPPFLAGS = -I%D%/api +WHIPPET_TO_EMBEDDER_CPPFLAGS = $(WHIPPET_CPPFLAGS) + +if WHIPPET_ENABLE_DEBUG +WHIPPET_CFLAGS += -DGC_DEBUG=1 +endif + +if WHIPPET_COLLECTOR_SEMI +libwhippet_la_SOURCES += %D%/src/semi.c +WHIPPET_TO_EMBEDDER_CPPFLAGS += -include %D%/api/semi-attrs.h +endif + +if WHIPPET_COLLECTOR_PCC +libwhippet_la_SOURCES += %D%/src/pcc.c +WHIPPET_TO_EMBEDDER_CPPFLAGS += -include %D%/api/pcc-attrs.h +endif + +if WHIPPET_COLLECTOR_BDW +libwhippet_la_SOURCES += %D%/src/bdw.c +WHIPPET_IMPL_CFLAGS += $(WHIPPET_BDW_CFLAGS) +WHIPPET_LIBS += $(WHIPPET_BDW_LIBS) +WHIPPET_TO_EMBEDDER_CPPFLAGS += 
-include %D%/api/bdw-attrs.h +endif + +if WHIPPET_COLLECTOR_MMC +libwhippet_la_SOURCES += %D%/src/mmc.c +WHIPPET_TO_EMBEDDER_CPPFLAGS += -include %D%/api/mmc-attrs.h +endif + +# add to cflags: -flto -fvisibility=hidden -fno-strict-aliasing + +libwhippet_common_la_CPPFLAGS = $(WHIPPET_CPPFLAGS) +libwhippet_common_la_CFLAGS = -Wall -Wno-unused $(CFLAGS) +libwhippet_common_la_CFLAGS += $(WHIPPET_CFLAGS) +libwhippet_common_la_LDFLAGS = -lpthread $(LDFLAGS) +libwhippet_common_la_LIBADD = $(LIBS) + +if WHIPPET_USE_LTTNG +libwhippet_common_la_CPPFLAGS += $(WHIPPET_LTTNG_CFLAGS) -DGC_TRACEPOINT_LTTNG=1 +WHIPPET_LIBS += $(WHIPPET_LTTNG_LIBS) +endif + +if !WHIPPET_ENABLE_DEBUG +libwhippet_common_la_CFLAGS += -DNDEBUG +endif + +libwhippet_la_CPPFLAGS = $(libwhippet_common_la_CPPFLAGS) $(WHIPPET_EMBEDDER_CPPFLAGS) +libwhippet_la_CFLAGS = $(libwhippet_common_la_CFLAGS) +libwhippet_la_CFLAGS += $(WHIPPET_IMPL_CFLAGS) +libwhippet_la_LDFLAGS = $(libwhippet_common_la_LDFLAGS) $(WHIPPET_LIBS) +libwhippet_la_LIBADD = libwhippet-common.la + +noinst_HEADERS = \ + %D%/api/bdw-attrs.h \ + %D%/api/gc-allocation-kind.h \ + %D%/api/gc-api.h \ + %D%/api/gc-assert.h \ + %D%/api/gc-attrs.h \ + %D%/api/gc-basic-stats.h \ + %D%/api/gc-collection-kind.h \ + %D%/api/gc-config.h \ + %D%/api/gc-conservative-ref.h \ + %D%/api/gc-edge.h \ + %D%/api/gc-embedder-api.h \ + %D%/api/gc-ephemeron.h \ + %D%/api/gc-event-listener-chain.h \ + %D%/api/gc-event-listener.h \ + %D%/api/gc-finalizer.h \ + %D%/api/gc-forwarding.h \ + %D%/api/gc-histogram.h \ + %D%/api/gc-inline.h \ + %D%/api/gc-lttng.h \ + %D%/api/gc-null-event-listener.h \ + %D%/api/gc-options.h \ + %D%/api/gc-ref.h \ + %D%/api/gc-tracepoint.h \ + %D%/api/gc-visibility.h \ + %D%/api/mmc-attrs.h \ + %D%/api/pcc-attrs.h \ + %D%/api/semi-attrs.h diff --git a/whippet.m4 b/whippet.m4 new file mode 100644 index 000000000..9cd5c3449 --- /dev/null +++ b/whippet.m4 @@ -0,0 +1,181 @@ +AC_DEFUN([WHIPPET_ENABLE_LTO], + [AC_REQUIRE([AC_PROG_CC]) + AC_MSG_CHECKING([whether the compiler supports -flto]) + old_CFLAGS="$CFLAGS" + LTO_CFLAGS="-flto" + CFLAGS="$CFLAGS $LTO_CFLAGS" + AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo;], [])],, [LTO_CFLAGS=]) + CFLAGS="$old_CFLAGS" + if test -n "$LTO_CFLAGS"; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + + AC_ARG_ENABLE(lto, + [AS_HELP_STRING([--enable-lto] + [enable link-time optimization])], + [], + [if test -z "$LTO_CFLAGS"; then enable_lto=no; else enable_lto=yes; fi]) + case "$enable_lto" in + yes | y) + if test -z "$LTO_CFLAGS"; then + AC_MSG_ERROR([--enable-lto=$enable_lto unsupported for $CC]) + fi + CFLAGS="$CFLAGS $LTO_CFLAGS" + AC_MSG_CHECKING([for lto-specific prefix for ar, nm, objcopy, ranlib]) + if test "$GCC" = yes; then + TOOLCHAIN_PREFIX=gcc + else + # Assuming LLVM if not GCC. Probably won't hurt. 
+ TOOLCHAIN_PREFIX=llvm + fi + AC_MSG_RESULT([$TOOLCHAIN_PREFIX]) + AC_CHECK_TOOLS([AR], [$TOOLCHAIN_PREFIX-ar ar]) + AC_CHECK_TOOLS([NM], [$TOOLCHAIN_PREFIX-nm nm]) + AC_CHECK_TOOLS([OBJCOPY], [$TOOLCHAIN_PREFIX-objcopy objcopy]) + AC_CHECK_TOOLS([RANLIB], [$TOOLCHAIN_PREFIX-ranlib ranlib]) + ;; + no | n) + ;; + *) + AC_MSG_ERROR([unexpected --enable-lto=$enable_lto]) + ;; + esac]) + +AC_DEFUN([WHIPPET_PKG_PLATFORM], + [# Detect the target system + AC_MSG_CHECKING([which platform support library the garbage collector should use]) + case "$host_os" in + *linux-gnu*) + AC_MSG_RESULT(gnu-linux) + whippet_platform=gnu-linux + ;; + *) + AC_MSG_ERROR([unsupported host OS: $host_os]) + ;; + esac + AM_CONDITIONAL(WHIPPET_PLATFORM_GNU_LINUX, [test "$whippet_platform" = gnu-linux])]) + +AC_DEFUN([WHIPPET_PKG_TRACING], + [WHIPPET_TRACING_DEFAULT="m4_default([$1], [auto])" + AC_ARG_WITH(gc-lttng, + AS_HELP_STRING([--with-gc-lttng], + [Compile GC library with LTTng tracing support (default: $WHIPPET_TRACING_DEFAULT)]), + [whippet_with_lttng=$withval], + [whippet_with_lttng=auto]) + PKG_CHECK_MODULES(WHIPPET_LTTNG, lttng-ust, + [whippet_have_lttng=yes], [whippet_have_lttng=no]) + AC_MSG_CHECKING(whether to compile GC library with LTTng tracing support) + if test "$whippet_with_lttng" = auto; then + if test "$whippet_have_lttng" = no; then + whippet_use_lttng=no + else + whippet_use_lttng=yes + fi + else + whippet_use_lttng=$whippet_with_lttng + fi + AC_MSG_RESULT($whippet_use_lttng) + + if test "$whippet_use_lttng" != no && test "$whippet_have_lttng" = no; then + AC_MSG_ERROR([LTTng support explicitly required, but lttng not found]) + fi + AM_CONDITIONAL(WHIPPET_USE_LTTNG, [test "$whippet_use_lttng" != no]) + AC_SUBST(WHIPPET_LTTNG_CFLAGS) + AC_SUBST(WHIPPET_LTTNG_LIBS)]) + +AC_DEFUN([WHIPPET_PKG_COLLECTOR], + [PKG_CHECK_MODULES(WHIPPET_BDW, bdw-gc, + [whippet_have_bdw=yes], [whippet_have_bdw=no]) + AC_SUBST(WHIPPET_BDW_CFLAGS) + AC_SUBST(WHIPPET_BDW_LIBS) + + WHIPPET_COLLECTOR_DEFAULT="m4_default([$1], [pcc])" + AC_ARG_WITH(gc, + AS_HELP_STRING([--with-gc], + [Select garbage collector implementation (see --with-gc=help)]), + [whippet_collector=$withval], + [whippet_collector=$WHIPPET_COLLECTOR_DEFAULT]) + + WHIPPET_ALL_COLLECTORS=$(echo < Date: Thu, 10 Apr 2025 13:39:29 +0200 Subject: [PATCH 400/403] Add some text about autotools --- doc/manual.md | 71 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/doc/manual.md b/doc/manual.md index a6742cbe5..14577932d 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -201,8 +201,10 @@ compiling user code. ### Compiling the collector As an embed-only library, Whippet needs to be integrated into the build -system of its host (embedder). Currently the only supported build -system uses GNU make. We would be happy to add other systems over time. +system of its host (embedder). There are two build systems supported +currently; we would be happy to add other systems over time. + +#### GNU make At a high level, first the embedder chooses a collector and defines how to specialize the collector against the embedder. Whippet's `embed.mk` @@ -253,6 +255,71 @@ remove any overhead imposed by the division of code into separate compilation units. `embed.mk` includes the necessary LTO flags in `GC_CFLAGS` and `GC_LDFLAGS`. +#### GNU Autotools + +To use Whippet from an autotools project, the basic idea is to include a +`Makefile.am` snippet from the subdirectory containing the Whippet +checkout. 
That will build `libwhippet.la`, which you should link into +your binary. There are some `m4` autoconf macros that need to be +invoked, for example to select the collector. + +Let us imagine you have checked out Whippet in `whippet/`. Let us also +assume for the moment that we are going to build `mt-gcbench`, a program +included in Whippet itself. + +A top-level autoconf file (`configure.ac`) might look like this: + +```autoconf +AC_PREREQ([2.69]) +AC_INIT([whippet-autotools-example],[0.1.0]) +AC_CONFIG_SRCDIR([whippet/benchmarks/mt-gcbench.c]) +AC_CONFIG_AUX_DIR([build-aux]) +AC_CONFIG_MACRO_DIRS([m4 whippet]) +AM_INIT_AUTOMAKE([subdir-objects foreign]) + +WHIPPET_ENABLE_LTO + +LT_INIT + +WARN_CFLAGS=-Wall +AC_ARG_ENABLE([Werror], + AS_HELP_STRING([--disable-Werror], + [Don't stop the build on errors]), + [], + WARN_CFLAGS="-Wall -Werror") +CFLAGS="$CFLAGS $WARN_CFLAGS" + +WHIPPET_PKG + +AC_CONFIG_FILES(Makefile) +AC_OUTPUT +``` + +Then your `Makefile.am` might look like this: + +```automake +noinst_LTLIBRARIES = +WHIPPET_EMBEDDER_CPPFLAGS = -include whippet/benchmarks/mt-gcbench-embedder.h +include whippet/embed.am + +noinst_PROGRAMS = whippet/benchmarks/mt-gcbench + +AM_CFLAGS = $(WHIPPET_CPPFLAGS) $(WHIPPET_CFLAGS) $(WHIPPET_TO_EMBEDDER_CPPFLAGS) +LDADD = libwhippet.la +``` + +To actually build, you do the usual autotools dance: + +```bash +autoreconf -vif && ./configure && make +``` + +See `./configure --help` for a list of user-facing options. Before the +`WHIPPET_PKG`, you can run e.g. `WHIPPET_PKG_COLLECTOR(mmc)` to set the +default collector to `mmc`; if you don't do that, the default collector +is `pcc`. There are also `WHIPPET_PKG_DEBUG`, `WHIPPET_PKG_TRACING`, +and `WHIPPET_PKG_PLATFORM`; see `whippet.m4` for more details. + #### Compile-time options There are a number of pre-processor definitions that can parameterize From 1664913ebd45aec0d132a23b329383a7982f7d0d Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Apr 2025 13:54:14 +0200 Subject: [PATCH 401/403] Update autotools and docs --- doc/manual.md | 14 +++++++++++++- embed.am | 13 +++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/manual.md b/doc/manual.md index 14577932d..edde53585 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -299,15 +299,27 @@ Then your `Makefile.am` might look like this: ```automake noinst_LTLIBRARIES = -WHIPPET_EMBEDDER_CPPFLAGS = -include whippet/benchmarks/mt-gcbench-embedder.h +WHIPPET_EMBEDDER_CPPFLAGS = -include $(srcdir)/whippet/benchmarks/mt-gcbench-embedder.h include whippet/embed.am noinst_PROGRAMS = whippet/benchmarks/mt-gcbench +whippet_benchmarks_mt_gcbench_SOURCES = \ + whippet/benchmarks/mt-gcbench.c \ + whippet/benchmarks/mt-gcbench-types.h \ + whippet/benchmarks/simple-tagging-scheme.h \ + whippet/benchmarks/simple-roots-api.h \ + whippet/benchmarks/simple-roots-types.h \ + whippet/benchmarks/simple-allocator.h \ + whippet/benchmarks/heap-objects.h \ + whippet/benchmarks/mt-gcbench-embedder.h AM_CFLAGS = $(WHIPPET_CPPFLAGS) $(WHIPPET_CFLAGS) $(WHIPPET_TO_EMBEDDER_CPPFLAGS) LDADD = libwhippet.la ``` +We have to list all the little header files it uses because, well, +autotools. 
+ To actually build, you do the usual autotools dance: ```bash diff --git a/embed.am b/embed.am index 81f93d287..af49e5ead 100644 --- a/embed.am +++ b/embed.am @@ -77,15 +77,12 @@ libwhippet_la_SOURCES = \ %D%/src/gc-lock.h \ %D%/src/gc-platform.h \ %D%/src/gc-trace.h \ - %D%/src/generational-roots.h \ %D%/src/growable-heap-sizer.h \ %D%/src/heap-sizer.h \ %D%/src/large-object-space.h \ %D%/src/local-worklist.h \ %D%/src/nofl-space.h \ - %D%/src/nursery.h \ %D%/src/parallel-tracer.h \ - %D%/src/remembered-edges.h \ %D%/src/root.h \ %D%/src/root-worklist.h \ %D%/src/serial-tracer.h \ @@ -128,7 +125,7 @@ WHIPPET_CFLAGS_heap_conservative_parallel_generational_mmc = \ WHIPPET_CFLAGS = $(WHIPPET_CFLAGS_$(subst -,_,$(WHIPPET_COLLECTOR))) WHIPPET_IMPL_CFLAGS = WHIPPET_LIBS = -lm -WHIPPET_CPPFLAGS = -I%D%/api +WHIPPET_CPPFLAGS = -I$(srcdir)/%D%/api WHIPPET_TO_EMBEDDER_CPPFLAGS = $(WHIPPET_CPPFLAGS) if WHIPPET_ENABLE_DEBUG @@ -137,24 +134,24 @@ endif if WHIPPET_COLLECTOR_SEMI libwhippet_la_SOURCES += %D%/src/semi.c -WHIPPET_TO_EMBEDDER_CPPFLAGS += -include %D%/api/semi-attrs.h +WHIPPET_TO_EMBEDDER_CPPFLAGS += -include $(srcdir)/%D%/api/semi-attrs.h endif if WHIPPET_COLLECTOR_PCC libwhippet_la_SOURCES += %D%/src/pcc.c -WHIPPET_TO_EMBEDDER_CPPFLAGS += -include %D%/api/pcc-attrs.h +WHIPPET_TO_EMBEDDER_CPPFLAGS += -include $(srcdir)/%D%/api/pcc-attrs.h endif if WHIPPET_COLLECTOR_BDW libwhippet_la_SOURCES += %D%/src/bdw.c WHIPPET_IMPL_CFLAGS += $(WHIPPET_BDW_CFLAGS) WHIPPET_LIBS += $(WHIPPET_BDW_LIBS) -WHIPPET_TO_EMBEDDER_CPPFLAGS += -include %D%/api/bdw-attrs.h +WHIPPET_TO_EMBEDDER_CPPFLAGS += -include $(srcdir)/%D%/api/bdw-attrs.h endif if WHIPPET_COLLECTOR_MMC libwhippet_la_SOURCES += %D%/src/mmc.c -WHIPPET_TO_EMBEDDER_CPPFLAGS += -include %D%/api/mmc-attrs.h +WHIPPET_TO_EMBEDDER_CPPFLAGS += -include $(srcdir)/%D%/api/mmc-attrs.h endif # add to cflags: -flto -fvisibility=hidden -fno-strict-aliasing From 96c8bb8ce9ceb576eb2dc54ff950ce4ecf22687b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Apr 2025 14:03:29 +0200 Subject: [PATCH 402/403] Fix unused variable in mt-gcbench.c --- benchmarks/mt-gcbench.c | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/mt-gcbench.c b/benchmarks/mt-gcbench.c index 6494c22d2..9b2521043 100644 --- a/benchmarks/mt-gcbench.c +++ b/benchmarks/mt-gcbench.c @@ -327,7 +327,6 @@ static void *join_thread(void *data) { } int main(int argc, char *argv[]) { - size_t sizeof_double_array = sizeof(size_t); size_t heap_max_live = tree_size(long_lived_tree_depth) * sizeof(Node) + tree_size(max_tree_depth) * sizeof(Node) + From f909438596f63ecd44c5b9a28385260714857361 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 10 Apr 2025 14:07:52 +0200 Subject: [PATCH 403/403] Fix distcheck, better docs --- doc/manual.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/doc/manual.md b/doc/manual.md index edde53585..7b889e364 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -304,14 +304,15 @@ include whippet/embed.am noinst_PROGRAMS = whippet/benchmarks/mt-gcbench whippet_benchmarks_mt_gcbench_SOURCES = \ - whippet/benchmarks/mt-gcbench.c \ + whippet/benchmarks/heap-objects.h \ + whippet/benchmarks/mt-gcbench-embedder.h \ whippet/benchmarks/mt-gcbench-types.h \ - whippet/benchmarks/simple-tagging-scheme.h \ + whippet/benchmarks/mt-gcbench.c \ + whippet/benchmarks/simple-allocator.h \ + whippet/benchmarks/simple-gc-embedder.h \ whippet/benchmarks/simple-roots-api.h \ whippet/benchmarks/simple-roots-types.h \ - 
whippet/benchmarks/simple-allocator.h \ - whippet/benchmarks/heap-objects.h \ - whippet/benchmarks/mt-gcbench-embedder.h + whippet/benchmarks/simple-tagging-scheme.h AM_CFLAGS = $(WHIPPET_CPPFLAGS) $(WHIPPET_CFLAGS) $(WHIPPET_TO_EMBEDDER_CPPFLAGS) LDADD = libwhippet.la @@ -330,7 +331,10 @@ See `./configure --help` for a list of user-facing options. Before the `WHIPPET_PKG`, you can run e.g. `WHIPPET_PKG_COLLECTOR(mmc)` to set the default collector to `mmc`; if you don't do that, the default collector is `pcc`. There are also `WHIPPET_PKG_DEBUG`, `WHIPPET_PKG_TRACING`, -and `WHIPPET_PKG_PLATFORM`; see `whippet.m4` for more details. +and `WHIPPET_PKG_PLATFORM`; see [`whippet.m4`](../whippet.m4) for more +details. See also +[`whippet-autotools`](https://github.com/wingo/whippet-autotools) for an +example of how this works. #### Compile-time options
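As a small illustration of the collector-selection macro mentioned
above (macro names as documented; an actual project's configure.ac may
differ), picking `mmc` as the default looks like this:

```autoconf
# Make mmc the default collector; users can still override the choice
# at configure time via --with-gc=...
WHIPPET_PKG_COLLECTOR([mmc])
WHIPPET_PKG
```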