diff --git a/GCBench.c b/GCBench.c
index f00f71016..f229de866 100644
--- a/GCBench.c
+++ b/GCBench.c
@@ -50,6 +50,9 @@
 #include "semi.h"
 #elif defined(GC_MARK_SWEEP)
 #include "mark-sweep.h"
+#elif defined(GC_PARALLEL_MARK_SWEEP)
+#define GC_PARALLEL_MARK 1
+#include "mark-sweep.h"
 #else
 #error unknown gc
 #endif
diff --git a/Makefile b/Makefile
index 81723fd93..04a23ed6c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 TESTS=GCBench # MT_GCBench MT_GCBench2
-COLLECTORS=bdw semi mark-sweep
+COLLECTORS=bdw semi mark-sweep parallel-mark-sweep
 
 CC=gcc
 CFLAGS=-Wall -O2 -g
@@ -17,6 +17,9 @@ semi-%: semi.h precise-roots.h %.c
 mark-sweep-%: mark-sweep.h precise-roots.h serial-marker.h assert.h debug.h %.c
 	$(CC) $(CFLAGS) -I. -DNDEBUG -DGC_MARK_SWEEP -o $@ $*.c
 
+parallel-mark-sweep-%: mark-sweep.h precise-roots.h parallel-marker.h assert.h debug.h %.c
+	$(CC) $(CFLAGS) -I. -DNDEBUG -DGC_PARALLEL_MARK_SWEEP -o $@ $*.c
+
 check: $(addprefix test-$(TARGET),$(TARGETS))
 
 test-%: $(ALL_TESTS)
diff --git a/mark-sweep.h b/mark-sweep.h
index 0920821db..f6b000fc2 100644
--- a/mark-sweep.h
+++ b/mark-sweep.h
@@ -7,7 +7,11 @@
 #include "assert.h"
 #include "debug.h"
 #include "precise-roots.h"
+#ifdef GC_PARALLEL_MARK
+#include "parallel-marker.h"
+#else
 #include "serial-marker.h"
+#endif
 
 #define GRANULE_SIZE 8
 #define GRANULE_SIZE_LOG_2 3
diff --git a/parallel-marker.h b/parallel-marker.h
new file mode 100644
index 000000000..8bfac725a
--- /dev/null
+++ b/parallel-marker.h
@@ -0,0 +1,269 @@
+#ifndef PARALLEL_TRACE_H
+#define PARALLEL_TRACE_H
+
+#include <stdatomic.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include "assert.h"
+#include "debug.h"
+
+// The Chase-Lev work-stealing deque, as initially described in "Dynamic
+// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05)
+// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf)
+// and improved with C11 atomics in "Correct and Efficient Work-Stealing
+// for Weak Memory Models" (LĂȘ et al, PPoPP'13)
+// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf).
+
+struct mark_buf {
+  unsigned log_size;
+  size_t size;
+  atomic_uintptr_t *data;
+};
+
+// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit.
+#define mark_buf_min_log_size ((unsigned) 10)
+// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit.
+#define mark_buf_max_log_size ((unsigned) 28)
+
+static int
+mark_buf_init(struct mark_buf *buf, unsigned log_size) {
+  ASSERT(log_size >= mark_buf_min_log_size);
+  ASSERT(log_size <= mark_buf_max_log_size);
+  size_t size = (1 << log_size) * sizeof(uintptr_t);
+  void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE,
+                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  if (mem == MAP_FAILED) {
+    perror("Failed to grow work-stealing deque");
+    DEBUG("Failed to allocate %zu bytes", size);
+    return 0;
+  }
+  buf->log_size = log_size;
+  buf->size = 1 << log_size;
+  buf->data = mem;
+  return 1;
+}
+
+static inline size_t
+mark_buf_size(struct mark_buf *buf) {
+  return buf->size;
+}
+
+static inline size_t
+mark_buf_byte_size(struct mark_buf *buf) {
+  return mark_buf_size(buf) * sizeof(uintptr_t);
+}
+
+static void
+mark_buf_release(struct mark_buf *buf) {
+  if (buf->data)
+    madvise(buf->data, mark_buf_byte_size(buf), MADV_DONTNEED);
+}
+
+static void
+mark_buf_destroy(struct mark_buf *buf) {
+  if (buf->data) {
+    munmap(buf->data, mark_buf_byte_size(buf));
+    buf->data = NULL;
+    buf->log_size = 0;
+    buf->size = 0;
+  }
+}
+
+static inline uintptr_t
+mark_buf_get(struct mark_buf *buf, size_t i) {
+  return atomic_load_explicit(&buf->data[i & (buf->size - 1)],
+                              memory_order_relaxed);
+}
+
+static inline void
+mark_buf_put(struct mark_buf *buf, size_t i, uintptr_t o) {
+  atomic_store_explicit(&buf->data[i & (buf->size - 1)],
+                        o,
+                        memory_order_relaxed);
+}
+
+static inline int
+mark_buf_grow(struct mark_buf *from, struct mark_buf *to,
+              size_t b, size_t t) {
+  if (from->log_size == mark_buf_max_log_size)
+    return 0;
+  if (!mark_buf_init (to, from->log_size + 1))
+    return 0;
+  for (size_t i = t; i < b; i++)
+    mark_buf_put(to, i, mark_buf_get(from, i));
+  return 1;
+}
+
+// Sentinel values returned by pop/steal.  Marked objects are aligned,
+// non-null pointers, so they can never collide with these.
+#define mark_deque_empty ((uintptr_t)0)
+#define mark_deque_abort ((uintptr_t)1)
+
+// The owner pushes and pops at "bottom"; thieves compete to steal at
+// "top".  Growing installs the next-larger buffer and bumps "active";
+// old buffers are kept so concurrent stealers can still read them.
+struct mark_deque {
+  atomic_size_t bottom;
+  atomic_size_t top;
+  atomic_int active; // Index of the active buf in bufs.
+  struct mark_buf bufs[(mark_buf_max_log_size - mark_buf_min_log_size) + 1];
+};
+
+#define LOAD_RELAXED(loc) atomic_load_explicit(loc, memory_order_relaxed)
+#define STORE_RELAXED(loc, o) atomic_store_explicit(loc, o, memory_order_relaxed)
+#define LOAD_ACQUIRE(loc) atomic_load_explicit(loc, memory_order_acquire)
+#define STORE_RELEASE(loc, o) atomic_store_explicit(loc, o, memory_order_release)
+#define LOAD_CONSUME(loc) atomic_load_explicit(loc, memory_order_consume)
+
+static int
+mark_deque_init(struct mark_deque *q) {
+  memset(q, 0, sizeof(*q));
+  int ret = mark_buf_init(&q->bufs[0], mark_buf_min_log_size);
+  // Note, this fence isn't in the paper, I added it out of caution.
+  atomic_thread_fence(memory_order_release);
+  return ret;
+}
+
+static void
+mark_deque_release(struct mark_deque *q) {
+  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
+    mark_buf_release(&q->bufs[i]);
+}
+
+static void
+mark_deque_destroy(struct mark_deque *q) {
+  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
+    mark_buf_destroy(&q->bufs[i]);
+}
+
+static int
+mark_deque_grow(struct mark_deque *q, int cur, size_t b, size_t t) {
+  if (!mark_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) {
+    fprintf(stderr, "failed to grow deque!!\n");
+    abort();
+  }
+
+  cur++;
+  STORE_RELAXED(&q->active, cur);
+  return cur;
+}
+
+static void
+mark_deque_push(struct mark_deque *q, uintptr_t x) {
+  size_t b = LOAD_RELAXED(&q->bottom);
+  size_t t = LOAD_ACQUIRE(&q->top);
+  int active = LOAD_RELAXED(&q->active);
+
+  if (b - t > mark_buf_size(&q->bufs[active]) - 1) /* Full queue. */
+    active = mark_deque_grow(q, active, b, t);
+
+  mark_buf_put(&q->bufs[active], b, x);
+  atomic_thread_fence(memory_order_release);
+  STORE_RELAXED(&q->bottom, b + 1);
+}
+
+static uintptr_t
+mark_deque_try_pop(struct mark_deque *q) {
+  size_t b = LOAD_RELAXED(&q->bottom);
+  b = b - 1;
+  int active = LOAD_RELAXED(&q->active);
+  STORE_RELAXED(&q->bottom, b);
+  atomic_thread_fence(memory_order_seq_cst);
+  size_t t = LOAD_RELAXED(&q->top);
+  uintptr_t x;
+  if (t <= b) { // Non-empty queue.
+    x = mark_buf_get(&q->bufs[active], b);
+    if (t == b) { // Single last element in queue.
+      if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
+                                                   memory_order_seq_cst,
+                                                   memory_order_relaxed))
+        // Failed race.
+        x = mark_deque_empty;
+      STORE_RELAXED(&q->bottom, b + 1);
+    }
+  } else { // Empty queue.
+    x = mark_deque_empty;
+    STORE_RELAXED(&q->bottom, b + 1);
+  }
+  return x;
+}
+
+static uintptr_t
+mark_deque_steal(struct mark_deque *q) {
+  size_t t = LOAD_ACQUIRE(&q->top);
+  atomic_thread_fence(memory_order_seq_cst);
+  size_t b = LOAD_ACQUIRE(&q->bottom);
+  uintptr_t x = mark_deque_empty;
+  if (t < b) { // Non-empty queue.
+    int active = LOAD_CONSUME(&q->active);
+    x = mark_buf_get(&q->bufs[active], t);
+    if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
+                                                 memory_order_seq_cst,
+                                                 memory_order_relaxed))
+      // Failed race.
+      return mark_deque_abort;
+  }
+  return x;
+}
+
+#undef LOAD_RELAXED
+#undef STORE_RELAXED
+#undef LOAD_ACQUIRE
+#undef STORE_RELEASE
+#undef LOAD_CONSUME
+
+struct marker {
+  struct mark_deque deque;
+};
+
+struct context;
+static inline struct marker* context_marker(struct context *cx);
+
+static int
+marker_init(struct context *cx) {
+  return mark_deque_init(&context_marker(cx)->deque);
+}
+static void marker_prepare(struct context *cx) {}
+static void marker_release(struct context *cx) {
+  mark_deque_release(&context_marker(cx)->deque);
+}
+
+struct gcobj;
+static inline void marker_visit(struct context *cx, void **loc) __attribute__((always_inline));
+static inline void marker_trace(struct context *cx,
+                                void (*)(struct context *, struct gcobj *))
+  __attribute__((always_inline));
+static inline int mark_object(struct context *cx,
+                              struct gcobj *obj) __attribute__((always_inline));
+
+static inline void
+marker_visit(struct context *cx, void **loc) {
+  struct gcobj *obj = *loc;
+  if (obj && mark_object(cx, obj))
+    mark_deque_push(&context_marker(cx)->deque, (uintptr_t)obj);
+}
+static inline void
+marker_visit_root(struct context *cx, void **loc) {
+  marker_visit(cx, loc);
+}
+static inline void
+marker_trace(struct context *cx,
+             void (*process)(struct context *, struct gcobj *)) {
+  while (1) {
+    uintptr_t addr = mark_deque_steal(&context_marker(cx)->deque);
+    if (addr == mark_deque_empty)
+      return;
+    if (addr == mark_deque_abort)
+      continue;
+    process(cx, (struct gcobj*)addr);
+  }
+}
+
+#endif // PARALLEL_TRACE_H
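
With the Makefile hook in place, the new configuration builds like the existing ones; given TESTS=GCBench, the target is parallel-mark-sweep-GCBench (make parallel-mark-sweep-GCBench). Note that nothing in this patch spawns marker threads yet: marking still runs on one thread, with marker_trace draining the deque via mark_deque_steal, so the effect of the patch is to put the work-stealing structure in place for parallel mark threads to come.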
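For a quick single-threaded sanity check of the deque outside the collector, a harness along the following lines works. It is a sketch, not part of the patch: deque-demo.c is a hypothetical file, and struct context, context_marker, and mark_object are stand-ins for the definitions that mark-sweep.h normally supplies. Compile with something like cc -I. deque-demo.c -o deque-demo.

/* deque-demo.c: hypothetical harness exercising parallel-marker.h. */
#include <stdio.h>

#include "parallel-marker.h"

struct context { struct marker marker; };
static inline struct marker* context_marker(struct context *cx) {
  return &cx->marker;
}
// Stand-in for the collector's mark bit test: pretend every visited
// object is freshly marked.
static inline int mark_object(struct context *cx, struct gcobj *obj) {
  return 1;
}

int main(void) {
  struct mark_deque q;
  if (!mark_deque_init(&q))
    return 1;
  // The owning thread pushes work at the bottom of the deque...
  for (uintptr_t i = 1; i <= 4; i++)
    mark_deque_push(&q, i * 8); // Fake aligned, non-null addresses.
  // ...and pops from the same end, LIFO.
  uintptr_t x;
  while ((x = mark_deque_try_pop(&q)) != mark_deque_empty)
    printf("popped %#zx\n", (size_t)x);
  mark_deque_destroy(&q);
  return 0;
}

A thief thread would instead call mark_deque_steal, which takes elements FIFO from the top and returns mark_deque_abort when it loses a race, in which case the caller simply retries.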