Rename mark-sweep "markers" to "tracers"

There could be other reasons than marking to trace the heap.
2025-07-15 13:40:39 +02:00 · 2022-04-18 15:19:55 +02:00 · 2022-04-18 15:19:55 +02:00 · 119e273fa4
commit 119e273fa4
parent 19f7f72b68
6 changed files with 843 additions and 839 deletions
--- a/4
+++ b/4
@ -17,10 +17,10 @@ bdw-%: bdw.h conservative-roots.h %-types.h %.c
 semi-%: semi.h precise-roots.h %-types.h heap-objects.h %.c
 	$(COMPILE) -DGC_SEMI -o $@ $*.c

-mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %-types.h heap-objects.h %.c
+mark-sweep-%: mark-sweep.h precise-roots.h serial-tracer.h assert.h debug.h %-types.h heap-objects.h %.c
 	$(COMPILE) -DGC_MARK_SWEEP -o $@ $*.c

-parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-marker.h assert.h debug.h %-types.h heap-objects.h %.c
+parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-tracer.h assert.h debug.h %-types.h heap-objects.h %.c
 	$(COMPILE) -DGC_PARALLEL_MARK_SWEEP -o $@ $*.c

 check: $(addprefix test-$(TARGET),$(TARGETS))
--- a/mark-sweep.h
+++ b/mark-sweep.h
@ -11,9 +11,9 @@
 #include "inline.h"
 #include "precise-roots.h"
 #ifdef GC_PARALLEL_MARK
-#include "parallel-marker.h"
+#include "parallel-tracer.h"
 #else
-#include "serial-marker.h"
+#include "serial-tracer.h"
 #endif

 #define GRANULE_SIZE 8
@ -124,12 +124,12 @@ struct mark_space {
  void *mem;
  size_t mem_size;
  long count;
-  struct marker marker;
  struct mutator *deactivated_mutators;
 };

 struct heap {
  struct mark_space mark_space;
+  struct tracer tracer;
 };

 struct mutator_mark_buf {
@ -148,8 +148,8 @@ struct mutator {
  struct mutator *next;
 };

-static inline struct marker* mark_space_marker(struct mark_space *space) {
-  return &space->marker;
+static inline struct tracer* heap_tracer(struct heap *heap) {
+  return &heap->tracer;
 }
 static inline struct mark_space* heap_mark_space(struct heap *heap) {
  return &heap->mark_space;
@ -174,7 +174,7 @@ static inline void clear_memory(uintptr_t addr, size_t size) {
  memset((char*)addr, 0, size);
 }

-static void collect(struct mark_space *space, struct mutator *mut) NEVER_INLINE;
+static void collect(struct heap *heap, struct mutator *mut) NEVER_INLINE;

 static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) {
  ASSERT(space->heap_base <= (uintptr_t) obj);
@ -183,8 +183,8 @@ static inline uint8_t* mark_byte(struct mark_space *space, struct gcobj *obj) {
  return &space->mark_bytes[granule];
 }

-static inline int mark_object(struct mark_space *space, struct gcobj *obj) {
-  uint8_t *byte = mark_byte(space, obj);
+static inline int trace_object(struct heap *heap, struct gcobj *obj) {
+  uint8_t *byte = mark_byte(heap_mark_space(heap), obj);
  if (*byte)
    return 0;
  *byte = 1;
@ -195,7 +195,7 @@ static inline void trace_one(struct gcobj *obj, void *mark_data) {
  switch (tag_live_alloc_kind(obj->tag)) {
 #define SCAN_OBJECT(name, Name, NAME) \
    case ALLOC_KIND_##NAME: \
-      visit_##name##_fields((Name*)obj, marker_visit, mark_data); \
+      visit_##name##_fields((Name*)obj, tracer_visit, mark_data); \
      break;
    FOR_EACH_HEAP_OBJECT_KIND(SCAN_OBJECT)
 #undef SCAN_OBJECT
@ -317,11 +317,12 @@ static void mutator_mark_buf_destroy(struct mutator_mark_buf *buf) {
 // Mark the roots of a mutator that is stopping for GC.  We can't
 // enqueue them directly, so we send them to the controller in a buffer.
 static void mark_stopping_mutator_roots(struct mutator *mut) {
-  struct mark_space *space = mutator_mark_space(mut);
+  struct heap *heap = mutator_heap(mut);
+  struct mark_space *space = heap_mark_space(heap);
  struct mutator_mark_buf *local_roots = &mut->mark_buf;
  for (struct handle *h = mut->roots; h; h = h->next) {
    struct gcobj *root = h->v;
-    if (root && mark_object(space, root))
+    if (root && trace_object(heap, root))
      mutator_mark_buf_push(local_roots, root);
  }

@ -336,11 +337,11 @@ static void mark_stopping_mutator_roots(struct mutator *mut) {

 // Mark the roots of the mutator that causes GC.
 static void mark_controlling_mutator_roots(struct mutator *mut) {
-  struct mark_space *space = mutator_mark_space(mut);
+  struct heap *heap = mutator_heap(mut);
  for (struct handle *h = mut->roots; h; h = h->next) {
    struct gcobj *root = h->v;
-    if (root && mark_object(space, root))
-      marker_enqueue_root(&space->marker, root);
+    if (root && trace_object(heap, root))
+      tracer_enqueue_root(&heap->tracer, root);
  }
 }

@ -359,16 +360,17 @@ static void mark_inactive_mutators(struct mark_space *space) {
    mark_controlling_mutator_roots(mut);
 }

-static void mark_global_roots(struct mark_space *space) {
+static void mark_global_roots(struct heap *heap) {
+  struct mark_space *space = heap_mark_space(heap);
  for (struct handle *h = space->global_roots; h; h = h->next) {
    struct gcobj *obj = h->v;
-    if (obj && mark_object(space, obj))
-      marker_enqueue_root(&space->marker, obj);
+    if (obj && trace_object(heap, obj))
+      tracer_enqueue_root(&heap->tracer, obj);
  }

  struct mutator_mark_buf *roots = atomic_load(&space->mutator_roots);
  for (; roots; roots = roots->next)
-    marker_enqueue_roots(&space->marker, roots->objects, roots->size);
+    tracer_enqueue_roots(&heap->tracer, roots->objects, roots->size);
  atomic_store(&space->mutator_roots, NULL);
 }

@ -425,16 +427,17 @@ static void reset_sweeper(struct mark_space *space) {
  space->sweep = space->heap_base;
 }

-static void collect(struct mark_space *space, struct mutator *mut) {
+static void collect(struct heap *heap, struct mutator *mut) {
+  struct mark_space *space = heap_mark_space(heap);
  DEBUG("start collect #%ld:\n", space->count);
-  marker_prepare(space);
+  tracer_prepare(heap);
  request_mutators_to_stop(space);
  mark_controlling_mutator_roots(mut);
  wait_for_mutators_to_stop(space);
  mark_inactive_mutators(space);
-  mark_global_roots(space);
-  marker_trace(space);
-  marker_release(space);
+  mark_global_roots(heap);
+  tracer_trace(heap);
+  tracer_release(heap);
  clear_global_freelists(space);
  reset_sweeper(space);
  space->count++;
@ -651,7 +654,7 @@ static void* allocate_medium(struct mutator *mut, enum alloc_kind kind,
      fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size);
      abort();
    } else {
-      collect(space, mut);
+      collect(mutator_heap(mut), mut);
      swept_from_beginning = 1;
    }
  }
@ -739,7 +742,7 @@ static void fill_small_from_global(struct mutator *mut,
        fprintf(stderr, "ran out of space, heap size %zu\n", space->heap_size);
        abort();
      } else {
-        collect(space, mut);
+        collect(mutator_heap(mut), mut);
        swept_from_beginning = 1;
      }
    }
@ -838,7 +841,7 @@ static int initialize_gc(size_t size, struct heap **heap,
  space->heap_base = ((uintptr_t) mem) + overhead;
  space->heap_size = size - overhead;
  space->sweep = space->heap_base + space->heap_size;
-  if (!marker_init(space))
+  if (!tracer_init(*heap))
    abort();
  reclaim(space, NULL, NOT_SMALL_OBJECT, (void*)space->heap_base,
          size_to_granules(space->heap_size));
--- a/parallel-marker.h
+++ b/parallel-marker.h
@ -1,642 +0,0 @@
-#ifndef PARALLEL_MARKER_H
-#define PARALLEL_MARKER_H
-
-#include <pthread.h>
-#include <stdatomic.h>
-#include <sys/mman.h>
-#include <unistd.h>
-
-#include "assert.h"
-#include "debug.h"
-#include "inline.h"
-
-// The Chase-Lev work-stealing deque, as initially described in "Dynamic
-// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05)
-// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf)
-// and improved with C11 atomics in "Correct and Efficient Work-Stealing
-// for Weak Memory Models" (Lê et al, PPoPP'13)
-// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf).
-
-struct gcobj;
-
-struct mark_buf {
-  unsigned log_size;
-  size_t size;
-  struct gcobj **data;
-};
-
-// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit.
-#define mark_buf_min_log_size ((unsigned) 10)
-// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit.
-#define mark_buf_max_log_size ((unsigned) 28)
-
-static int
-mark_buf_init(struct mark_buf *buf, unsigned log_size) {
-  ASSERT(log_size >= mark_buf_min_log_size);
-  ASSERT(log_size <= mark_buf_max_log_size);
-  size_t size = (1 << log_size) * sizeof(struct gcobj *);
-  void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE,
-                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-  if (mem == MAP_FAILED) {
-    perror("Failed to grow work-stealing dequeue");
-    DEBUG("Failed to allocate %zu bytes", size);
-    return 0;
-  }
-  buf->log_size = log_size;
-  buf->size = 1 << log_size;
-  buf->data = mem;
-  return 1;
-}
-  
-static inline size_t
-mark_buf_size(struct mark_buf *buf) {
-  return buf->size;
-}
-
-static inline size_t
-mark_buf_byte_size(struct mark_buf *buf) {
-  return mark_buf_size(buf) * sizeof(struct gcobj *);
-}
-
-static void
-mark_buf_release(struct mark_buf *buf) {
-  if (buf->data)
-    madvise(buf->data, mark_buf_byte_size(buf), MADV_DONTNEED);
-}
-
-static void
-mark_buf_destroy(struct mark_buf *buf) {
-  if (buf->data) {
-    munmap(buf->data, mark_buf_byte_size(buf));
-    buf->data = NULL;
-    buf->log_size = 0;
-    buf->size = 0;
-  }
-}
-
-static inline struct gcobj *
-mark_buf_get(struct mark_buf *buf, size_t i) {
-  return atomic_load_explicit(&buf->data[i & (buf->size - 1)],
-                              memory_order_relaxed);
-}
-
-static inline void
-mark_buf_put(struct mark_buf *buf, size_t i, struct gcobj * o) {
-  return atomic_store_explicit(&buf->data[i & (buf->size - 1)],
-                               o,
-                               memory_order_relaxed);
-}
-
-static inline int
-mark_buf_grow(struct mark_buf *from, struct mark_buf *to,
-              size_t b, size_t t) {
-  if (from->log_size == mark_buf_max_log_size)
-    return 0;
-  if (!mark_buf_init (to, from->log_size + 1))
-    return 0;
-  for (size_t i=t; i<b; i++)
-    mark_buf_put(to, i, mark_buf_get(from, i));
-  return 1;
-}
-
-// Chase-Lev work-stealing deque.  One thread pushes data into the deque
-// at the bottom, and many threads compete to steal data from the top.
-struct mark_deque {
-  // Ensure bottom and top are on different cache lines.
-  union {
-    atomic_size_t bottom;
-    char bottom_padding[64];
-  };
-  union {
-    atomic_size_t top;
-    char top_padding[64];
-  };
-  atomic_int active; // Which mark_buf is active.
-  struct mark_buf bufs[(mark_buf_max_log_size - mark_buf_min_log_size) + 1];
-};
-
-#define LOAD_RELAXED(loc) atomic_load_explicit(loc, memory_order_relaxed)
-#define STORE_RELAXED(loc, o) atomic_store_explicit(loc, o, memory_order_relaxed)
-
-#define LOAD_ACQUIRE(loc) atomic_load_explicit(loc, memory_order_acquire)
-#define STORE_RELEASE(loc, o) atomic_store_explicit(loc, o, memory_order_release)
-
-#define LOAD_CONSUME(loc) atomic_load_explicit(loc, memory_order_consume)
-
-static int
-mark_deque_init(struct mark_deque *q) {
-  memset(q, 0, sizeof (*q));
-  int ret = mark_buf_init(&q->bufs[0], mark_buf_min_log_size);
-  // Note, this fence isn't in the paper, I added it out of caution.
-  atomic_thread_fence(memory_order_release);
-  return ret;
-}
-
-static void
-mark_deque_release(struct mark_deque *q) {
-  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
-    mark_buf_release(&q->bufs[i]);
-}
-
-static void
-mark_deque_destroy(struct mark_deque *q) {
-  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
-    mark_buf_destroy(&q->bufs[i]);
-}
-
-static int
-mark_deque_grow(struct mark_deque *q, int cur, size_t b, size_t t) {
-  if (!mark_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) {
-    fprintf(stderr, "failed to grow deque!!\n");
-    abort();
-  }
-
-  cur++;
-  STORE_RELAXED(&q->active, cur);
-  return cur;
-}
-
-static void
-mark_deque_push(struct mark_deque *q, struct gcobj * x) {
-  size_t b = LOAD_RELAXED(&q->bottom);
-  size_t t = LOAD_ACQUIRE(&q->top);
-  int active = LOAD_RELAXED(&q->active);
-
-  if (b - t > mark_buf_size(&q->bufs[active]) - 1) /* Full queue. */
-    active = mark_deque_grow(q, active, b, t);
-
-  mark_buf_put(&q->bufs[active], b, x);
-  atomic_thread_fence(memory_order_release);
-  STORE_RELAXED(&q->bottom, b + 1);
-}
-
-static void
-mark_deque_push_many(struct mark_deque *q, struct gcobj **objv, size_t count) {
-  size_t b = LOAD_RELAXED(&q->bottom);
-  size_t t = LOAD_ACQUIRE(&q->top);
-  int active = LOAD_RELAXED(&q->active);
-
-  while (b - t > mark_buf_size(&q->bufs[active]) - count) /* Full queue. */
-    active = mark_deque_grow(q, active, b, t);
-
-  for (size_t i = 0; i < count; i++)
-    mark_buf_put(&q->bufs[active], b + i, objv[i]);
-  atomic_thread_fence(memory_order_release);
-  STORE_RELAXED(&q->bottom, b + count);
-}
-
-static struct gcobj *
-mark_deque_try_pop(struct mark_deque *q) {
-  size_t b = LOAD_RELAXED(&q->bottom);
-  b = b - 1;
-  int active = LOAD_RELAXED(&q->active);
-  STORE_RELAXED(&q->bottom, b);
-  atomic_thread_fence(memory_order_seq_cst);
-  size_t t = LOAD_RELAXED(&q->top);
-  struct gcobj * x;
-  if (t <= b) { // Non-empty queue.
-    x = mark_buf_get(&q->bufs[active], b);
-    if (t == b) { // Single last element in queue.
-      if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
-                                                   memory_order_seq_cst,
-                                                   memory_order_relaxed))
-        // Failed race.
-        x = NULL;
-      STORE_RELAXED(&q->bottom, b + 1);
-    }
-  } else { // Empty queue.
-    x = NULL;
-    STORE_RELAXED(&q->bottom, b + 1);
-  }
-  return x;
-}
-
-static struct gcobj *
-mark_deque_steal(struct mark_deque *q) {
-  while (1) {
-    size_t t = LOAD_ACQUIRE(&q->top);
-    atomic_thread_fence(memory_order_seq_cst);
-    size_t b = LOAD_ACQUIRE(&q->bottom);
-    if (t >= b)
-      return NULL;
-    int active = LOAD_CONSUME(&q->active);
-    struct gcobj *x = x = mark_buf_get(&q->bufs[active], t);
-    if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
-                                                 memory_order_seq_cst,
-                                                 memory_order_relaxed))
-      // Failed race.
-      continue;
-    return x;
-  }
-}
-
-static int
-mark_deque_can_steal(struct mark_deque *q) {
-  size_t t = LOAD_ACQUIRE(&q->top);
-  atomic_thread_fence(memory_order_seq_cst);
-  size_t b = LOAD_ACQUIRE(&q->bottom);
-  return t < b;
-}
-
-#undef LOAD_RELAXED
-#undef STORE_RELAXED
-#undef LOAD_ACQUIRE
-#undef STORE_RELEASE
-#undef LOAD_CONSUME
-
-#define LOCAL_MARK_QUEUE_SIZE 1024
-#define LOCAL_MARK_QUEUE_MASK (LOCAL_MARK_QUEUE_SIZE - 1)
-#define LOCAL_MARK_QUEUE_SHARE_AMOUNT (LOCAL_MARK_QUEUE_SIZE * 3 / 4)
-struct local_mark_queue {
-  size_t read;
-  size_t write;
-  struct gcobj * data[LOCAL_MARK_QUEUE_SIZE];
-};
-
-static inline void
-local_mark_queue_init(struct local_mark_queue *q) {
-  q->read = q->write = 0;
-}
-static inline void
-local_mark_queue_poison(struct local_mark_queue *q) {
-  q->read = 0; q->write = LOCAL_MARK_QUEUE_SIZE;
-}
-static inline size_t
-local_mark_queue_size(struct local_mark_queue *q) {
-  return q->write - q->read;
-}
-static inline int
-local_mark_queue_empty(struct local_mark_queue *q) {
-  return local_mark_queue_size(q) == 0;
-}
-static inline int
-local_mark_queue_full(struct local_mark_queue *q) {
-  return local_mark_queue_size(q) >= LOCAL_MARK_QUEUE_SIZE;
-}
-static inline void
-local_mark_queue_push(struct local_mark_queue *q, struct gcobj * v) {
-  q->data[q->write++ & LOCAL_MARK_QUEUE_MASK] = v;
-}
-static inline struct gcobj *
-local_mark_queue_pop(struct local_mark_queue *q) {
-  return q->data[q->read++ & LOCAL_MARK_QUEUE_MASK];
-}
-
-enum mark_worker_state {
-  MARK_WORKER_STOPPED,
-  MARK_WORKER_IDLE,
-  MARK_WORKER_MARKING,
-  MARK_WORKER_STOPPING,
-  MARK_WORKER_DEAD
-};
-
-struct mark_worker {
-  struct mark_space *space;
-  size_t id;
-  size_t steal_id;
-  pthread_t thread;
-  enum mark_worker_state state;
-  pthread_mutex_t lock;
-  pthread_cond_t cond;
-  struct mark_deque deque;
-};
-
-#define MARK_WORKERS_MAX_COUNT 8
-
-struct marker {
-  atomic_size_t active_markers;
-  size_t worker_count;
-  atomic_size_t running_markers;
-  long count;
-  pthread_mutex_t lock;
-  pthread_cond_t cond;
-  struct mark_worker workers[MARK_WORKERS_MAX_COUNT];
-};
-
-struct local_marker {
-  struct mark_worker *worker;
-  struct mark_deque *share_deque;
-  struct mark_space *space;
-  struct local_mark_queue local;
-};
-
-struct context;
-static inline struct marker* mark_space_marker(struct mark_space *space);
-
-static size_t number_of_current_processors(void) { return 1; }
-
-static int
-mark_worker_init(struct mark_worker *worker, struct mark_space *space,
-                 struct marker *marker, size_t id) {
-  worker->space = space;
-  worker->id = id;
-  worker->steal_id = 0;
-  worker->thread = 0;
-  worker->state = MARK_WORKER_STOPPED;
-  pthread_mutex_init(&worker->lock, NULL);
-  pthread_cond_init(&worker->cond, NULL);
-  return mark_deque_init(&worker->deque);
-}
-
-static void mark_worker_mark(struct mark_worker *worker);
-
-static void*
-mark_worker_thread(void *data) {
-  struct mark_worker *worker = data;
-
-  pthread_mutex_lock(&worker->lock);
-  while (1) {
-    switch (worker->state) {
-    case MARK_WORKER_IDLE:
-      pthread_cond_wait(&worker->cond, &worker->lock);
-      break;
-    case MARK_WORKER_MARKING:
-      mark_worker_mark(worker);
-      worker->state = MARK_WORKER_IDLE;
-      break;
-    case MARK_WORKER_STOPPING:
-      worker->state = MARK_WORKER_DEAD;
-      pthread_mutex_unlock(&worker->lock);
-      return NULL;
-    default:
-      abort();
-    }
-  }
-}
-
-static int
-mark_worker_spawn(struct mark_worker *worker) {
-  pthread_mutex_lock(&worker->lock);
-  ASSERT(worker->state == MARK_WORKER_STOPPED);
-  worker->state = MARK_WORKER_IDLE;
-  pthread_mutex_unlock(&worker->lock);
-
-  if (pthread_create(&worker->thread, NULL, mark_worker_thread, worker)) {
-    perror("spawning marker thread failed");
-    worker->state = MARK_WORKER_STOPPED;
-    return 0;
-  }
-
-  return 1;
-}
-
-static void
-mark_worker_request_mark(struct mark_worker *worker) {
-  struct marker *marker = mark_space_marker(worker->space);
-    
-  pthread_mutex_lock(&worker->lock);
-  ASSERT(worker->state == MARK_WORKER_IDLE);
-  worker->state = MARK_WORKER_MARKING;
-  pthread_cond_signal(&worker->cond);
-  pthread_mutex_unlock(&worker->lock);
-}  
-
-static void
-mark_worker_finished_marking(struct mark_worker *worker) {
-  // Signal controller that we are done with marking.
-  struct marker *marker = mark_space_marker(worker->space);
-    
-  if (atomic_fetch_sub(&marker->running_markers, 1) == 1) {
-    pthread_mutex_lock(&marker->lock);
-    marker->count++;
-    pthread_cond_signal(&marker->cond);
-    pthread_mutex_unlock(&marker->lock);
-  }
-}
-
-static void
-mark_worker_request_stop(struct mark_worker *worker) {
-  pthread_mutex_lock(&worker->lock);
-  ASSERT(worker->state == MARK_WORKER_IDLE);
-  worker->state = MARK_WORKER_STOPPING;
-  pthread_cond_signal(&worker->cond);
-  pthread_mutex_unlock(&worker->lock);
-}  
-
-static int
-marker_init(struct mark_space *space) {
-  struct marker *marker = mark_space_marker(space);
-  atomic_init(&marker->active_markers, 0);
-  atomic_init(&marker->running_markers, 0);
-  marker->count = 0;
-  pthread_mutex_init(&marker->lock, NULL);
-  pthread_cond_init(&marker->cond, NULL);
-  size_t desired_worker_count = 0;
-  if (getenv("GC_MARKERS"))
-    desired_worker_count = atoi(getenv("GC_MARKERS"));
-  if (desired_worker_count == 0)
-    desired_worker_count = number_of_current_processors();
-  if (desired_worker_count > MARK_WORKERS_MAX_COUNT)
-    desired_worker_count = MARK_WORKERS_MAX_COUNT;
-  for (size_t i = 0; i < desired_worker_count; i++) {
-    if (!mark_worker_init(&marker->workers[i], space, marker, i))
-      break;
-    if (mark_worker_spawn(&marker->workers[i]))
-      marker->worker_count++;
-    else
-      break;
-  }
-  return marker->worker_count > 0;
-}
-
-static void marker_prepare(struct mark_space *space) {
-  struct marker *marker = mark_space_marker(space);
-  for (size_t i = 0; i < marker->worker_count; i++)
-    marker->workers[i].steal_id = 0;
-}
-static void marker_release(struct mark_space *space) {
-  struct marker *marker = mark_space_marker(space);
-  for (size_t i = 0; i < marker->worker_count; i++)
-    mark_deque_release(&marker->workers[i].deque);
-}
-
-struct gcobj;
-static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE;
-static inline void trace_one(struct gcobj *obj, void *mark_data) ALWAYS_INLINE;
-static inline int mark_object(struct mark_space *space,
-                              struct gcobj *obj) ALWAYS_INLINE;
-
-static inline void
-marker_share(struct local_marker *mark) {
-  DEBUG("marker #%zu: sharing\n", mark->worker->id);
-  for (size_t i = 0; i < LOCAL_MARK_QUEUE_SHARE_AMOUNT; i++)
-    mark_deque_push(mark->share_deque, local_mark_queue_pop(&mark->local));
-}
-
-static inline void
-marker_visit(void **loc, void *mark_data) {
-  struct local_marker *mark = mark_data;
-  struct gcobj *obj = *loc;
-  if (obj && mark_object(mark->space, obj)) {
-    if (local_mark_queue_full(&mark->local))
-      marker_share(mark);
-    local_mark_queue_push(&mark->local, obj);
-  }
-}
-
-static struct gcobj *
-marker_steal_from_worker(struct marker *marker, size_t id) {
-  ASSERT(id < marker->worker_count);
-  return mark_deque_steal(&marker->workers[id].deque);
-}
-
-static int
-marker_can_steal_from_worker(struct marker *marker, size_t id) {
-  ASSERT(id < marker->worker_count);
-  return mark_deque_can_steal(&marker->workers[id].deque);
-}
-
-static struct gcobj *
-mark_worker_steal_from_any(struct mark_worker *worker, struct marker *marker) {
-  size_t steal_id = worker->steal_id;
-  for (size_t i = 0; i < marker->worker_count; i++) {
-    steal_id = (steal_id + 1) % marker->worker_count;
-    DEBUG("marker #%zu: stealing from #%zu\n", worker->id, steal_id);
-    struct gcobj * obj = marker_steal_from_worker(marker, steal_id);
-    if (obj) {
-      DEBUG("marker #%zu: stealing got %p\n", worker->id, obj);
-      worker->steal_id = steal_id;
-      return obj;
-    }
-  }
-  DEBUG("marker #%zu: failed to steal\n", worker->id);
-  return 0;
-}
-
-static int
-mark_worker_can_steal_from_any(struct mark_worker *worker, struct marker *marker) {
-  size_t steal_id = worker->steal_id;
-  DEBUG("marker #%zu: checking if any worker has tasks\n", worker->id);
-  for (size_t i = 0; i < marker->worker_count; i++) {
-    steal_id = (steal_id + 1) % marker->worker_count;
-    int res = marker_can_steal_from_worker(marker, steal_id);
-    if (res) {
-      DEBUG("marker #%zu: worker #%zu has tasks!\n", worker->id, steal_id);
-      worker->steal_id = steal_id;
-      return 1;
-    }
-  }
-  DEBUG("marker #%zu: nothing to steal\n", worker->id);
-  return 0;
-}
-
-static int
-mark_worker_check_termination(struct mark_worker *worker,
-                              struct marker *marker) {
-  // We went around all workers and nothing.  Enter termination phase.
-  if (atomic_fetch_sub_explicit(&marker->active_markers, 1,
-                                memory_order_relaxed) == 1) {
-    DEBUG("  ->> marker #%zu: DONE (no spinning) <<-\n", worker->id);
-    return 1;
-  }
-
-  size_t spin_count = 0;
-  while (1) {
-    if (mark_worker_can_steal_from_any(worker, marker)) {
-      atomic_fetch_add_explicit(&marker->active_markers, 1,
-                                memory_order_relaxed);
-      return 0;
-    }
-    if (atomic_load_explicit(&marker->active_markers,
-                             memory_order_relaxed) == 0) {
-      DEBUG("  ->> marker #%zu: DONE <<-\n", worker->id);
-      return 1;
-    }
-    // spin
-    DEBUG("marker #%zu: spinning #%zu\n", worker->id, spin_count);
-    if (spin_count < 10)
-      __builtin_ia32_pause();
-    else if (spin_count < 20)
-      sched_yield();
-    else if (spin_count < 40)
-      usleep(0);
-    else
-      usleep(1);
-    spin_count++;
-  }
-}
-
-static struct gcobj *
-mark_worker_steal(struct local_marker *mark) {
-  struct marker *marker = mark_space_marker(mark->space);
-  struct mark_worker *worker = mark->worker;
-
-  while (1) {
-    DEBUG("marker #%zu: trying to steal\n", worker->id);
-    struct gcobj *obj = mark_worker_steal_from_any(worker, marker);
-    if (obj)
-      return obj;
-
-    if (mark_worker_check_termination(worker, marker))
-      return NULL;
-  }
-}
-
-static void
-mark_worker_mark(struct mark_worker *worker) {
-  struct local_marker mark;
-  mark.worker = worker;
-  mark.share_deque = &worker->deque;
-  mark.space = worker->space;
-  local_mark_queue_init(&mark.local);
-
-  size_t n = 0;
-  DEBUG("marker #%zu: running mark loop\n", worker->id);
-  while (1) {
-    struct gcobj * obj;
-    if (!local_mark_queue_empty(&mark.local)) {
-      obj = local_mark_queue_pop(&mark.local);
-    } else {
-      obj = mark_worker_steal(&mark);
-      if (!obj)
-        break;
-    }
-    trace_one(obj, &mark);
-    n++;
-  }
-  DEBUG("marker #%zu: done marking, %zu objects traced\n", worker->id, n);
-
-  mark_worker_finished_marking(worker);
-}
-
-static inline void
-marker_enqueue_root(struct marker *marker, struct gcobj *obj) {
-  struct mark_deque *worker0_deque = &marker->workers[0].deque;
-  mark_deque_push(worker0_deque, obj);
-}
-
-static inline void
-marker_enqueue_roots(struct marker *marker, struct gcobj **objv,
-                     size_t count) {
-  struct mark_deque *worker0_deque = &marker->workers[0].deque;
-  mark_deque_push_many(worker0_deque, objv, count);
-}
-
-static inline void
-marker_trace(struct mark_space *space) {
-  struct marker *marker = mark_space_marker(space);
-
-  pthread_mutex_lock(&marker->lock);
-  long mark_count = marker->count;
-  pthread_mutex_unlock(&marker->lock);
-
-  DEBUG("starting trace; %zu workers\n", marker->worker_count);
-  DEBUG("waking workers\n");
-  atomic_store_explicit(&marker->active_markers, marker->worker_count,
-                        memory_order_release);
-  atomic_store_explicit(&marker->running_markers, marker->worker_count,
-                        memory_order_release);
-  for (size_t i = 0; i < marker->worker_count; i++)
-    mark_worker_request_mark(&marker->workers[i]);
-
-  DEBUG("waiting on markers\n");
-
-  pthread_mutex_lock(&marker->lock);
-  while (marker->count <= mark_count)
-    pthread_cond_wait(&marker->cond, &marker->lock);
-  pthread_mutex_unlock(&marker->lock);
-
-  DEBUG("trace finished\n");
-}
-
-#endif // PARALLEL_MARKER_H
--- a/parallel-tracer.h
+++ b/parallel-tracer.h
@ -0,0 +1,643 @@
+#ifndef PARALLEL_TRACER_H
+#define PARALLEL_TRACER_H
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "assert.h"
+#include "debug.h"
+#include "inline.h"
+
+// The Chase-Lev work-stealing deque, as initially described in "Dynamic
+// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05)
+// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf)
+// and improved with C11 atomics in "Correct and Efficient Work-Stealing
+// for Weak Memory Models" (Lê et al, PPoPP'13)
+// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf).
+
+struct gcobj;
+
+struct trace_buf {
+  unsigned log_size;
+  size_t size;
+  struct gcobj **data;
+};
+
+// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit.
+#define trace_buf_min_log_size ((unsigned) 10)
+// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit.
+#define trace_buf_max_log_size ((unsigned) 28)
+
+static int
+trace_buf_init(struct trace_buf *buf, unsigned log_size) {
+  ASSERT(log_size >= trace_buf_min_log_size);
+  ASSERT(log_size <= trace_buf_max_log_size);
+  size_t size = (1 << log_size) * sizeof(struct gcobj *);
+  void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE,
+                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  if (mem == MAP_FAILED) {
+    perror("Failed to grow work-stealing dequeue");
+    DEBUG("Failed to allocate %zu bytes", size);
+    return 0;
+  }
+  buf->log_size = log_size;
+  buf->size = 1 << log_size;
+  buf->data = mem;
+  return 1;
+}
+  
+static inline size_t
+trace_buf_size(struct trace_buf *buf) {
+  return buf->size;
+}
+
+static inline size_t
+trace_buf_byte_size(struct trace_buf *buf) {
+  return trace_buf_size(buf) * sizeof(struct gcobj *);
+}
+
+static void
+trace_buf_release(struct trace_buf *buf) {
+  if (buf->data)
+    madvise(buf->data, trace_buf_byte_size(buf), MADV_DONTNEED);
+}
+
+static void
+trace_buf_destroy(struct trace_buf *buf) {
+  if (buf->data) {
+    munmap(buf->data, trace_buf_byte_size(buf));
+    buf->data = NULL;
+    buf->log_size = 0;
+    buf->size = 0;
+  }
+}
+
+static inline struct gcobj *
+trace_buf_get(struct trace_buf *buf, size_t i) {
+  return atomic_load_explicit(&buf->data[i & (buf->size - 1)],
+                              memory_order_relaxed);
+}
+
+static inline void
+trace_buf_put(struct trace_buf *buf, size_t i, struct gcobj * o) {
+  return atomic_store_explicit(&buf->data[i & (buf->size - 1)],
+                               o,
+                               memory_order_relaxed);
+}
+
+static inline int
+trace_buf_grow(struct trace_buf *from, struct trace_buf *to,
+              size_t b, size_t t) {
+  if (from->log_size == trace_buf_max_log_size)
+    return 0;
+  if (!trace_buf_init (to, from->log_size + 1))
+    return 0;
+  for (size_t i=t; i<b; i++)
+    trace_buf_put(to, i, trace_buf_get(from, i));
+  return 1;
+}
+
+// Chase-Lev work-stealing deque.  One thread pushes data into the deque
+// at the bottom, and many threads compete to steal data from the top.
+struct trace_deque {
+  // Ensure bottom and top are on different cache lines.
+  union {
+    atomic_size_t bottom;
+    char bottom_padding[64];
+  };
+  union {
+    atomic_size_t top;
+    char top_padding[64];
+  };
+  atomic_int active; // Which trace_buf is active.
+  struct trace_buf bufs[(trace_buf_max_log_size - trace_buf_min_log_size) + 1];
+};
+
+#define LOAD_RELAXED(loc) atomic_load_explicit(loc, memory_order_relaxed)
+#define STORE_RELAXED(loc, o) atomic_store_explicit(loc, o, memory_order_relaxed)
+
+#define LOAD_ACQUIRE(loc) atomic_load_explicit(loc, memory_order_acquire)
+#define STORE_RELEASE(loc, o) atomic_store_explicit(loc, o, memory_order_release)
+
+#define LOAD_CONSUME(loc) atomic_load_explicit(loc, memory_order_consume)
+
+static int
+trace_deque_init(struct trace_deque *q) {
+  memset(q, 0, sizeof (*q));
+  int ret = trace_buf_init(&q->bufs[0], trace_buf_min_log_size);
+  // Note, this fence isn't in the paper, I added it out of caution.
+  atomic_thread_fence(memory_order_release);
+  return ret;
+}
+
+static void
+trace_deque_release(struct trace_deque *q) {
+  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
+    trace_buf_release(&q->bufs[i]);
+}
+
+static void
+trace_deque_destroy(struct trace_deque *q) {
+  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
+    trace_buf_destroy(&q->bufs[i]);
+}
+
+static int
+trace_deque_grow(struct trace_deque *q, int cur, size_t b, size_t t) {
+  if (!trace_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) {
+    fprintf(stderr, "failed to grow deque!!\n");
+    abort();
+  }
+
+  cur++;
+  STORE_RELAXED(&q->active, cur);
+  return cur;
+}
+
+static void
+trace_deque_push(struct trace_deque *q, struct gcobj * x) {
+  size_t b = LOAD_RELAXED(&q->bottom);
+  size_t t = LOAD_ACQUIRE(&q->top);
+  int active = LOAD_RELAXED(&q->active);
+
+  if (b - t > trace_buf_size(&q->bufs[active]) - 1) /* Full queue. */
+    active = trace_deque_grow(q, active, b, t);
+
+  trace_buf_put(&q->bufs[active], b, x);
+  atomic_thread_fence(memory_order_release);
+  STORE_RELAXED(&q->bottom, b + 1);
+}
+
+static void
+trace_deque_push_many(struct trace_deque *q, struct gcobj **objv, size_t count) {
+  size_t b = LOAD_RELAXED(&q->bottom);
+  size_t t = LOAD_ACQUIRE(&q->top);
+  int active = LOAD_RELAXED(&q->active);
+
+  while (b - t > trace_buf_size(&q->bufs[active]) - count) /* Full queue. */
+    active = trace_deque_grow(q, active, b, t);
+
+  for (size_t i = 0; i < count; i++)
+    trace_buf_put(&q->bufs[active], b + i, objv[i]);
+  atomic_thread_fence(memory_order_release);
+  STORE_RELAXED(&q->bottom, b + count);
+}
+
+static struct gcobj *
+trace_deque_try_pop(struct trace_deque *q) {
+  size_t b = LOAD_RELAXED(&q->bottom);
+  b = b - 1;
+  int active = LOAD_RELAXED(&q->active);
+  STORE_RELAXED(&q->bottom, b);
+  atomic_thread_fence(memory_order_seq_cst);
+  size_t t = LOAD_RELAXED(&q->top);
+  struct gcobj * x;
+  if (t <= b) { // Non-empty queue.
+    x = trace_buf_get(&q->bufs[active], b);
+    if (t == b) { // Single last element in queue.
+      if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
+                                                   memory_order_seq_cst,
+                                                   memory_order_relaxed))
+        // Failed race.
+        x = NULL;
+      STORE_RELAXED(&q->bottom, b + 1);
+    }
+  } else { // Empty queue.
+    x = NULL;
+    STORE_RELAXED(&q->bottom, b + 1);
+  }
+  return x;
+}
+
+static struct gcobj *
+trace_deque_steal(struct trace_deque *q) {
+  while (1) {
+    size_t t = LOAD_ACQUIRE(&q->top);
+    atomic_thread_fence(memory_order_seq_cst);
+    size_t b = LOAD_ACQUIRE(&q->bottom);
+    if (t >= b)
+      return NULL;
+    int active = LOAD_CONSUME(&q->active);
+    struct gcobj *x = x = trace_buf_get(&q->bufs[active], t);
+    if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
+                                                 memory_order_seq_cst,
+                                                 memory_order_relaxed))
+      // Failed race.
+      continue;
+    return x;
+  }
+}
+
+static int
+trace_deque_can_steal(struct trace_deque *q) {
+  size_t t = LOAD_ACQUIRE(&q->top);
+  atomic_thread_fence(memory_order_seq_cst);
+  size_t b = LOAD_ACQUIRE(&q->bottom);
+  return t < b;
+}
+
+#undef LOAD_RELAXED
+#undef STORE_RELAXED
+#undef LOAD_ACQUIRE
+#undef STORE_RELEASE
+#undef LOAD_CONSUME
+
+#define LOCAL_TRACE_QUEUE_SIZE 1024
+#define LOCAL_TRACE_QUEUE_MASK (LOCAL_TRACE_QUEUE_SIZE - 1)
+#define LOCAL_TRACE_QUEUE_SHARE_AMOUNT (LOCAL_TRACE_QUEUE_SIZE * 3 / 4)
+struct local_trace_queue {
+  size_t read;
+  size_t write;
+  struct gcobj * data[LOCAL_TRACE_QUEUE_SIZE];
+};
+
+static inline void
+local_trace_queue_init(struct local_trace_queue *q) {
+  q->read = q->write = 0;
+}
+static inline void
+local_trace_queue_poison(struct local_trace_queue *q) {
+  q->read = 0; q->write = LOCAL_TRACE_QUEUE_SIZE;
+}
+static inline size_t
+local_trace_queue_size(struct local_trace_queue *q) {
+  return q->write - q->read;
+}
+static inline int
+local_trace_queue_empty(struct local_trace_queue *q) {
+  return local_trace_queue_size(q) == 0;
+}
+static inline int
+local_trace_queue_full(struct local_trace_queue *q) {
+  return local_trace_queue_size(q) >= LOCAL_TRACE_QUEUE_SIZE;
+}
+static inline void
+local_trace_queue_push(struct local_trace_queue *q, struct gcobj * v) {
+  q->data[q->write++ & LOCAL_TRACE_QUEUE_MASK] = v;
+}
+static inline struct gcobj *
+local_trace_queue_pop(struct local_trace_queue *q) {
+  return q->data[q->read++ & LOCAL_TRACE_QUEUE_MASK];
+}
+
+enum trace_worker_state {
+  TRACE_WORKER_STOPPED,
+  TRACE_WORKER_IDLE,
+  TRACE_WORKER_TRACING,
+  TRACE_WORKER_STOPPING,
+  TRACE_WORKER_DEAD
+};
+
+struct heap;
+struct trace_worker {
+  struct heap *heap;
+  size_t id;
+  size_t steal_id;
+  pthread_t thread;
+  enum trace_worker_state state;
+  pthread_mutex_t lock;
+  pthread_cond_t cond;
+  struct trace_deque deque;
+};
+
+#define TRACE_WORKERS_MAX_COUNT 8
+
+struct tracer {
+  atomic_size_t active_tracers;
+  size_t worker_count;
+  atomic_size_t running_tracers;
+  long count;
+  pthread_mutex_t lock;
+  pthread_cond_t cond;
+  struct trace_worker workers[TRACE_WORKERS_MAX_COUNT];
+};
+
+struct local_tracer {
+  struct trace_worker *worker;
+  struct trace_deque *share_deque;
+  struct heap *heap;
+  struct local_trace_queue local;
+};
+
+struct context;
+static inline struct tracer* heap_tracer(struct heap *heap);
+
+static size_t number_of_current_processors(void) { return 1; }
+
+static int
+trace_worker_init(struct trace_worker *worker, struct heap *heap,
+                 struct tracer *tracer, size_t id) {
+  worker->heap = heap;
+  worker->id = id;
+  worker->steal_id = 0;
+  worker->thread = 0;
+  worker->state = TRACE_WORKER_STOPPED;
+  pthread_mutex_init(&worker->lock, NULL);
+  pthread_cond_init(&worker->cond, NULL);
+  return trace_deque_init(&worker->deque);
+}
+
+static void trace_worker_trace(struct trace_worker *worker);
+
+static void*
+trace_worker_thread(void *data) {
+  struct trace_worker *worker = data;
+
+  pthread_mutex_lock(&worker->lock);
+  while (1) {
+    switch (worker->state) {
+    case TRACE_WORKER_IDLE:
+      pthread_cond_wait(&worker->cond, &worker->lock);
+      break;
+    case TRACE_WORKER_TRACING:
+      trace_worker_trace(worker);
+      worker->state = TRACE_WORKER_IDLE;
+      break;
+    case TRACE_WORKER_STOPPING:
+      worker->state = TRACE_WORKER_DEAD;
+      pthread_mutex_unlock(&worker->lock);
+      return NULL;
+    default:
+      abort();
+    }
+  }
+}
+
+static int
+trace_worker_spawn(struct trace_worker *worker) {
+  pthread_mutex_lock(&worker->lock);
+  ASSERT(worker->state == TRACE_WORKER_STOPPED);
+  worker->state = TRACE_WORKER_IDLE;
+  pthread_mutex_unlock(&worker->lock);
+
+  if (pthread_create(&worker->thread, NULL, trace_worker_thread, worker)) {
+    perror("spawning tracer thread failed");
+    worker->state = TRACE_WORKER_STOPPED;
+    return 0;
+  }
+
+  return 1;
+}
+
+static void
+trace_worker_request_trace(struct trace_worker *worker) {
+  struct tracer *tracer = heap_tracer(worker->heap);
+    
+  pthread_mutex_lock(&worker->lock);
+  ASSERT(worker->state == TRACE_WORKER_IDLE);
+  worker->state = TRACE_WORKER_TRACING;
+  pthread_cond_signal(&worker->cond);
+  pthread_mutex_unlock(&worker->lock);
+}  
+
+static void
+trace_worker_finished_tracing(struct trace_worker *worker) {
+  // Signal controller that we are done with tracing.
+  struct tracer *tracer = heap_tracer(worker->heap);
+    
+  if (atomic_fetch_sub(&tracer->running_tracers, 1) == 1) {
+    pthread_mutex_lock(&tracer->lock);
+    tracer->count++;
+    pthread_cond_signal(&tracer->cond);
+    pthread_mutex_unlock(&tracer->lock);
+  }
+}
+
+static void
+trace_worker_request_stop(struct trace_worker *worker) {
+  pthread_mutex_lock(&worker->lock);
+  ASSERT(worker->state == TRACE_WORKER_IDLE);
+  worker->state = TRACE_WORKER_STOPPING;
+  pthread_cond_signal(&worker->cond);
+  pthread_mutex_unlock(&worker->lock);
+}  
+
+static int
+tracer_init(struct heap *heap) {
+  struct tracer *tracer = heap_tracer(heap);
+  atomic_init(&tracer->active_tracers, 0);
+  atomic_init(&tracer->running_tracers, 0);
+  tracer->count = 0;
+  pthread_mutex_init(&tracer->lock, NULL);
+  pthread_cond_init(&tracer->cond, NULL);
+  size_t desired_worker_count = 0;
+  if (getenv("GC_TRACERS"))
+    desired_worker_count = atoi(getenv("GC_TRACERS"));
+  if (desired_worker_count == 0)
+    desired_worker_count = number_of_current_processors();
+  if (desired_worker_count > TRACE_WORKERS_MAX_COUNT)
+    desired_worker_count = TRACE_WORKERS_MAX_COUNT;
+  for (size_t i = 0; i < desired_worker_count; i++) {
+    if (!trace_worker_init(&tracer->workers[i], heap, tracer, i))
+      break;
+    if (trace_worker_spawn(&tracer->workers[i]))
+      tracer->worker_count++;
+    else
+      break;
+  }
+  return tracer->worker_count > 0;
+}
+
+static void tracer_prepare(struct heap *heap) {
+  struct tracer *tracer = heap_tracer(heap);
+  for (size_t i = 0; i < tracer->worker_count; i++)
+    tracer->workers[i].steal_id = 0;
+}
+static void tracer_release(struct heap *heap) {
+  struct tracer *tracer = heap_tracer(heap);
+  for (size_t i = 0; i < tracer->worker_count; i++)
+    trace_deque_release(&tracer->workers[i].deque);
+}
+
+struct gcobj;
+static inline void tracer_visit(void **loc, void *trace_data) ALWAYS_INLINE;
+static inline void trace_one(struct gcobj *obj, void *trace_data) ALWAYS_INLINE;
+static inline int trace_object(struct heap *heap,
+                               struct gcobj *obj) ALWAYS_INLINE;
+
+static inline void
+tracer_share(struct local_tracer *trace) {
+  DEBUG("tracer #%zu: sharing\n", trace->worker->id);
+  for (size_t i = 0; i < LOCAL_TRACE_QUEUE_SHARE_AMOUNT; i++)
+    trace_deque_push(trace->share_deque, local_trace_queue_pop(&trace->local));
+}
+
+static inline void
+tracer_visit(void **loc, void *trace_data) {
+  struct local_tracer *trace = trace_data;
+  struct gcobj *obj = *loc;
+  if (obj && trace_object(trace->heap, obj)) {
+    if (local_trace_queue_full(&trace->local))
+      tracer_share(trace);
+    local_trace_queue_push(&trace->local, obj);
+  }
+}
+
+static struct gcobj *
+tracer_steal_from_worker(struct tracer *tracer, size_t id) {
+  ASSERT(id < tracer->worker_count);
+  return trace_deque_steal(&tracer->workers[id].deque);
+}
+
+static int
+tracer_can_steal_from_worker(struct tracer *tracer, size_t id) {
+  ASSERT(id < tracer->worker_count);
+  return trace_deque_can_steal(&tracer->workers[id].deque);
+}
+
+static struct gcobj *
+trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer) {
+  size_t steal_id = worker->steal_id;
+  for (size_t i = 0; i < tracer->worker_count; i++) {
+    steal_id = (steal_id + 1) % tracer->worker_count;
+    DEBUG("tracer #%zu: stealing from #%zu\n", worker->id, steal_id);
+    struct gcobj * obj = tracer_steal_from_worker(tracer, steal_id);
+    if (obj) {
+      DEBUG("tracer #%zu: stealing got %p\n", worker->id, obj);
+      worker->steal_id = steal_id;
+      return obj;
+    }
+  }
+  DEBUG("tracer #%zu: failed to steal\n", worker->id);
+  return 0;
+}
+
+static int
+trace_worker_can_steal_from_any(struct trace_worker *worker, struct tracer *tracer) {
+  size_t steal_id = worker->steal_id;
+  DEBUG("tracer #%zu: checking if any worker has tasks\n", worker->id);
+  for (size_t i = 0; i < tracer->worker_count; i++) {
+    steal_id = (steal_id + 1) % tracer->worker_count;
+    int res = tracer_can_steal_from_worker(tracer, steal_id);
+    if (res) {
+      DEBUG("tracer #%zu: worker #%zu has tasks!\n", worker->id, steal_id);
+      worker->steal_id = steal_id;
+      return 1;
+    }
+  }
+  DEBUG("tracer #%zu: nothing to steal\n", worker->id);
+  return 0;
+}
+
+static int
+trace_worker_check_termination(struct trace_worker *worker,
+                              struct tracer *tracer) {
+  // We went around all workers and nothing.  Enter termination phase.
+  if (atomic_fetch_sub_explicit(&tracer->active_tracers, 1,
+                                memory_order_relaxed) == 1) {
+    DEBUG("  ->> tracer #%zu: DONE (no spinning) <<-\n", worker->id);
+    return 1;
+  }
+
+  size_t spin_count = 0;
+  while (1) {
+    if (trace_worker_can_steal_from_any(worker, tracer)) {
+      atomic_fetch_add_explicit(&tracer->active_tracers, 1,
+                                memory_order_relaxed);
+      return 0;
+    }
+    if (atomic_load_explicit(&tracer->active_tracers,
+                             memory_order_relaxed) == 0) {
+      DEBUG("  ->> tracer #%zu: DONE <<-\n", worker->id);
+      return 1;
+    }
+    // spin
+    DEBUG("tracer #%zu: spinning #%zu\n", worker->id, spin_count);
+    if (spin_count < 10)
+      __builtin_ia32_pause();
+    else if (spin_count < 20)
+      sched_yield();
+    else if (spin_count < 40)
+      usleep(0);
+    else
+      usleep(1);
+    spin_count++;
+  }
+}
+
+static struct gcobj *
+trace_worker_steal(struct local_tracer *trace) {
+  struct tracer *tracer = heap_tracer(trace->heap);
+  struct trace_worker *worker = trace->worker;
+
+  while (1) {
+    DEBUG("tracer #%zu: trying to steal\n", worker->id);
+    struct gcobj *obj = trace_worker_steal_from_any(worker, tracer);
+    if (obj)
+      return obj;
+
+    if (trace_worker_check_termination(worker, tracer))
+      return NULL;
+  }
+}
+
+static void
+trace_worker_trace(struct trace_worker *worker) {
+  struct local_tracer trace;
+  trace.worker = worker;
+  trace.share_deque = &worker->deque;
+  trace.heap = worker->heap;
+  local_trace_queue_init(&trace.local);
+
+  size_t n = 0;
+  DEBUG("tracer #%zu: running trace loop\n", worker->id);
+  while (1) {
+    struct gcobj * obj;
+    if (!local_trace_queue_empty(&trace.local)) {
+      obj = local_trace_queue_pop(&trace.local);
+    } else {
+      obj = trace_worker_steal(&trace);
+      if (!obj)
+        break;
+    }
+    trace_one(obj, &trace);
+    n++;
+  }
+  DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n);
+
+  trace_worker_finished_tracing(worker);
+}
+
+static inline void
+tracer_enqueue_root(struct tracer *tracer, struct gcobj *obj) {
+  struct trace_deque *worker0_deque = &tracer->workers[0].deque;
+  trace_deque_push(worker0_deque, obj);
+}
+
+static inline void
+tracer_enqueue_roots(struct tracer *tracer, struct gcobj **objv,
+                     size_t count) {
+  struct trace_deque *worker0_deque = &tracer->workers[0].deque;
+  trace_deque_push_many(worker0_deque, objv, count);
+}
+
+static inline void
+tracer_trace(struct heap *heap) {
+  struct tracer *tracer = heap_tracer(heap);
+
+  pthread_mutex_lock(&tracer->lock);
+  long trace_count = tracer->count;
+  pthread_mutex_unlock(&tracer->lock);
+
+  DEBUG("starting trace; %zu workers\n", tracer->worker_count);
+  DEBUG("waking workers\n");
+  atomic_store_explicit(&tracer->active_tracers, tracer->worker_count,
+                        memory_order_release);
+  atomic_store_explicit(&tracer->running_tracers, tracer->worker_count,
+                        memory_order_release);
+  for (size_t i = 0; i < tracer->worker_count; i++)
+    trace_worker_request_trace(&tracer->workers[i]);
+
+  DEBUG("waiting on tracers\n");
+
+  pthread_mutex_lock(&tracer->lock);
+  while (tracer->count <= trace_count)
+    pthread_cond_wait(&tracer->cond, &tracer->lock);
+  pthread_mutex_unlock(&tracer->lock);
+
+  DEBUG("trace finished\n");
+}
+
+#endif // PARALLEL_TRACER_H
--- a/serial-marker.h
+++ b/serial-marker.h
@ -1,168 +0,0 @@
-#ifndef SERIAL_TRACE_H
-#define SERIAL_TRACE_H
-
-#include <sys/mman.h>
-#include <unistd.h>
-
-#include "assert.h"
-#include "debug.h"
-
-struct gcobj;
-
-struct mark_queue {
-  size_t size;
-  size_t read;
-  size_t write;
-  struct gcobj **buf;
-};
-
-static const size_t mark_queue_max_size =
-  (1ULL << (sizeof(struct gcobj *) * 8 - 1)) / sizeof(struct gcobj *);
-static const size_t mark_queue_release_byte_threshold = 1 * 1024 * 1024;
-
-static struct gcobj **
-mark_queue_alloc(size_t size) {
-  void *mem = mmap(NULL, size * sizeof(struct gcobj *), PROT_READ|PROT_WRITE,
-                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-  if (mem == MAP_FAILED) {
-    perror("Failed to grow mark queue");
-    DEBUG("Failed to allocate %zu bytes", size);
-    return NULL;
-  }
-  return mem;
-}
-
-static int
-mark_queue_init(struct mark_queue *q) {
-  q->size = getpagesize() / sizeof(struct gcobj *);
-  q->read = 0;
-  q->write = 0;
-  q->buf = mark_queue_alloc(q->size);
-  return !!q->buf;
-}
-  
-static inline struct gcobj *
-mark_queue_get(struct mark_queue *q, size_t idx) {
-  return q->buf[idx & (q->size - 1)];
-}
-
-static inline void
-mark_queue_put(struct mark_queue *q, size_t idx, struct gcobj *x) {
-  q->buf[idx & (q->size - 1)] = x;
-}
-
-static int mark_queue_grow(struct mark_queue *q) NEVER_INLINE;
-
-static int
-mark_queue_grow(struct mark_queue *q) {
-  size_t old_size = q->size;
-  struct gcobj **old_buf = q->buf;
-  if (old_size >= mark_queue_max_size) {
-    DEBUG("mark queue already at max size of %zu bytes", old_size);
-    return 0;
-  }
-
-  size_t new_size = old_size * 2;
-  struct gcobj **new_buf = mark_queue_alloc(new_size);
-  if (!new_buf)
-    return 0;
-
-  size_t old_mask = old_size - 1;
-  size_t new_mask = new_size - 1;
-
-  for (size_t i = q->read; i < q->write; i++)
-    new_buf[i & new_mask] = old_buf[i & old_mask];
-
-  munmap(old_buf, old_size * sizeof(struct gcobj *));
-
-  q->size = new_size;
-  q->buf = new_buf;
-  return 1;
-}
-  
-static inline void
-mark_queue_push(struct mark_queue *q, struct gcobj *p) {
-  if (UNLIKELY(q->write - q->read == q->size)) {
-    if (!mark_queue_grow(q))
-      abort();
-  }
-  mark_queue_put(q, q->write++, p);
-}
-
-static inline void
-mark_queue_push_many(struct mark_queue *q, struct gcobj **pv, size_t count) {
-  while (q->size - (q->write - q->read) < count) {
-    if (!mark_queue_grow(q))
-      abort();
-  }
-  for (size_t i = 0; i < count; i++)
-    mark_queue_put(q, q->write++, pv[i]);
-}
-
-static inline struct gcobj*
-mark_queue_pop(struct mark_queue *q) {
-  if (UNLIKELY(q->read == q->write))
-    return NULL;
-  return mark_queue_get(q, q->read++);
-}
-
-static void
-mark_queue_release(struct mark_queue *q) {
-  size_t byte_size = q->size * sizeof(struct gcobj *);
-  if (byte_size >= mark_queue_release_byte_threshold)
-    madvise(q->buf, byte_size, MADV_DONTNEED);
-  q->read = q->write = 0;
-}
-
-static void
-mark_queue_destroy(struct mark_queue *q) {
-  size_t byte_size = q->size * sizeof(struct gcobj *);
-  munmap(q->buf, byte_size);
-}
-
-struct marker {
-  struct mark_queue queue;
-};
-
-struct mark_space;
-static inline struct marker* mark_space_marker(struct mark_space *space);
-
-static int
-marker_init(struct mark_space *space) {
-  return mark_queue_init(&mark_space_marker(space)->queue);
-}
-static void marker_prepare(struct mark_space *space) {}
-static void marker_release(struct mark_space *space) {
-  mark_queue_release(&mark_space_marker(space)->queue);
-}
-
-struct gcobj;
-static inline void marker_visit(void **loc, void *mark_data) ALWAYS_INLINE;
-static inline void trace_one(struct gcobj *obj, void *mark_data) ALWAYS_INLINE;
-static inline int mark_object(struct mark_space *space,
-                              struct gcobj *obj) ALWAYS_INLINE;
-
-static inline void
-marker_enqueue_root(struct marker *marker, struct gcobj *obj) {
-  mark_queue_push(&marker->queue, obj);
-}
-static inline void
-marker_enqueue_roots(struct marker *marker, struct gcobj **objs,
-                     size_t count) {
-  mark_queue_push_many(&marker->queue, objs, count);
-}
-static inline void
-marker_visit(void **loc, void *mark_data) {
-  struct mark_space *space = mark_data;
-  struct gcobj *obj = *loc;
-  if (obj && mark_object(space, obj))
-    marker_enqueue_root(mark_space_marker(space), obj);
-}
-static inline void
-marker_trace(struct mark_space *space) {
-  struct gcobj *obj;
-  while ((obj = mark_queue_pop(&mark_space_marker(space)->queue)))
-    trace_one(obj, space);
-}
-
-#endif // SERIAL_MARK_H
--- a/serial-tracer.h
+++ b/serial-tracer.h
@ -0,0 +1,168 @@
+#ifndef SERIAL_TRACER_H
+#define SERIAL_TRACER_H
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "assert.h"
+#include "debug.h"
+
+struct gcobj;
+
+struct trace_queue {
+  size_t size;
+  size_t read;
+  size_t write;
+  struct gcobj **buf;
+};
+
+static const size_t trace_queue_max_size =
+  (1ULL << (sizeof(struct gcobj *) * 8 - 1)) / sizeof(struct gcobj *);
+static const size_t trace_queue_release_byte_threshold = 1 * 1024 * 1024;
+
+static struct gcobj **
+trace_queue_alloc(size_t size) {
+  void *mem = mmap(NULL, size * sizeof(struct gcobj *), PROT_READ|PROT_WRITE,
+                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  if (mem == MAP_FAILED) {
+    perror("Failed to grow trace queue");
+    DEBUG("Failed to allocate %zu bytes", size);
+    return NULL;
+  }
+  return mem;
+}
+
+static int
+trace_queue_init(struct trace_queue *q) {
+  q->size = getpagesize() / sizeof(struct gcobj *);
+  q->read = 0;
+  q->write = 0;
+  q->buf = trace_queue_alloc(q->size);
+  return !!q->buf;
+}
+  
+static inline struct gcobj *
+trace_queue_get(struct trace_queue *q, size_t idx) {
+  return q->buf[idx & (q->size - 1)];
+}
+
+static inline void
+trace_queue_put(struct trace_queue *q, size_t idx, struct gcobj *x) {
+  q->buf[idx & (q->size - 1)] = x;
+}
+
+static int trace_queue_grow(struct trace_queue *q) NEVER_INLINE;
+
+static int
+trace_queue_grow(struct trace_queue *q) {
+  size_t old_size = q->size;
+  struct gcobj **old_buf = q->buf;
+  if (old_size >= trace_queue_max_size) {
+    DEBUG("trace queue already at max size of %zu bytes", old_size);
+    return 0;
+  }
+
+  size_t new_size = old_size * 2;
+  struct gcobj **new_buf = trace_queue_alloc(new_size);
+  if (!new_buf)
+    return 0;
+
+  size_t old_mask = old_size - 1;
+  size_t new_mask = new_size - 1;
+
+  for (size_t i = q->read; i < q->write; i++)
+    new_buf[i & new_mask] = old_buf[i & old_mask];
+
+  munmap(old_buf, old_size * sizeof(struct gcobj *));
+
+  q->size = new_size;
+  q->buf = new_buf;
+  return 1;
+}
+  
+static inline void
+trace_queue_push(struct trace_queue *q, struct gcobj *p) {
+  if (UNLIKELY(q->write - q->read == q->size)) {
+    if (!trace_queue_grow(q))
+      abort();
+  }
+  trace_queue_put(q, q->write++, p);
+}
+
+static inline void
+trace_queue_push_many(struct trace_queue *q, struct gcobj **pv, size_t count) {
+  while (q->size - (q->write - q->read) < count) {
+    if (!trace_queue_grow(q))
+      abort();
+  }
+  for (size_t i = 0; i < count; i++)
+    trace_queue_put(q, q->write++, pv[i]);
+}
+
+static inline struct gcobj*
+trace_queue_pop(struct trace_queue *q) {
+  if (UNLIKELY(q->read == q->write))
+    return NULL;
+  return trace_queue_get(q, q->read++);
+}
+
+static void
+trace_queue_release(struct trace_queue *q) {
+  size_t byte_size = q->size * sizeof(struct gcobj *);
+  if (byte_size >= trace_queue_release_byte_threshold)
+    madvise(q->buf, byte_size, MADV_DONTNEED);
+  q->read = q->write = 0;
+}
+
+static void
+trace_queue_destroy(struct trace_queue *q) {
+  size_t byte_size = q->size * sizeof(struct gcobj *);
+  munmap(q->buf, byte_size);
+}
+
+struct tracer {
+  struct trace_queue queue;
+};
+
+struct heap;
+static inline struct tracer* heap_tracer(struct heap *heap);
+
+static int
+tracer_init(struct heap *heap) {
+  return trace_queue_init(&heap_tracer(heap)->queue);
+}
+static void tracer_prepare(struct heap *heap) {}
+static void tracer_release(struct heap *heap) {
+  trace_queue_release(&heap_tracer(heap)->queue);
+}
+
+struct gcobj;
+static inline void tracer_visit(void **loc, void *trace_data) ALWAYS_INLINE;
+static inline void trace_one(struct gcobj *obj, void *trace_data) ALWAYS_INLINE;
+static inline int trace_object(struct heap *heap,
+                               struct gcobj *obj) ALWAYS_INLINE;
+
+static inline void
+tracer_enqueue_root(struct tracer *tracer, struct gcobj *obj) {
+  trace_queue_push(&tracer->queue, obj);
+}
+static inline void
+tracer_enqueue_roots(struct tracer *tracer, struct gcobj **objs,
+                     size_t count) {
+  trace_queue_push_many(&tracer->queue, objs, count);
+}
+static inline void
+tracer_visit(void **loc, void *trace_data) {
+  struct heap *heap = trace_data;
+  struct gcobj *obj = *loc;
+  if (obj && trace_object(heap, obj))
+    tracer_enqueue_root(heap_tracer(heap), obj);
+}
+static inline void
+tracer_trace(struct heap *heap) {
+  struct gcobj *obj;
+  while ((obj = trace_queue_pop(&heap_tracer(heap)->queue)))
+    trace_one(obj, heap);
+}
+
+#endif // SERIAL_TRACER_H