Mirror of https://git.savannah.gnu.org/git/guile.git
Factor trace deque out to shared-worklist.h
Also increase alignment to account for cache line prefetcher.
commit dd3953ef1a (parent b4bf949df6)
3 changed files with 280 additions and 255 deletions
gc-align.h:
@@ -14,4 +14,9 @@ static inline uintptr_t align_up(uintptr_t addr, size_t align) {
   return align_down(addr + align - 1, align);
 }
 
+// Poor man's equivalent of std::hardware_destructive_interference_size.
+#define AVOID_FALSE_SHARING 128
+#define ALIGNED_TO_AVOID_FALSE_SHARING \
+  __attribute__((aligned(AVOID_FALSE_SHARING)))
+
 #endif // GC_ALIGN_H
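The 128-byte AVOID_FALSE_SHARING value is larger than a single 64-byte cache line so that, per the commit message, fields written by different threads also stay clear of the adjacent-line hardware prefetcher. A minimal sketch of how the two macros are meant to be used (the struct and its fields below are hypothetical illustrations, not code from this commit):

// Hypothetical example: two counters updated by different threads each get
// their own AVOID_FALSE_SHARING-sized slot, so stores to one never drag the
// other counter's cache line (or its prefetched neighbour) between cores.
#include <stdatomic.h>

#define AVOID_FALSE_SHARING 128            /* as added to gc-align.h above */
#define ALIGNED_TO_AVOID_FALSE_SHARING \
  __attribute__((aligned(AVOID_FALSE_SHARING)))

struct stats {
  union { atomic_size_t pushed; char pushed_padding[AVOID_FALSE_SHARING]; };
  union { atomic_size_t stolen; char stolen_padding[AVOID_FALSE_SHARING]; };
};

static struct stats stats ALIGNED_TO_AVOID_FALSE_SHARING;

This is the same padding pattern that the new struct shared_worklist below uses for its bottom and top indices.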
Second changed file — the parallel tracer:
@@ -9,249 +9,9 @@
 #include "assert.h"
 #include "debug.h"
 #include "gc-inline.h"
+#include "shared-worklist.h"
 #include "spin.h"
 
-// The Chase-Lev work-stealing deque, as initially described in "Dynamic
-// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05)
-// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf)
-// and improved with C11 atomics in "Correct and Efficient Work-Stealing
-// for Weak Memory Models" (Lê et al, PPoPP'13)
-// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf).
-
-struct trace_buf {
-  unsigned log_size;
-  size_t size;
-  uintptr_t *data;
-};
-
-// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit.
-#define trace_buf_min_log_size ((unsigned) 10)
-// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit.
-#define trace_buf_max_log_size ((unsigned) 28)
-
-static int
-trace_buf_init(struct trace_buf *buf, unsigned log_size) {
-  ASSERT(log_size >= trace_buf_min_log_size);
-  ASSERT(log_size <= trace_buf_max_log_size);
-  size_t size = (1 << log_size) * sizeof(uintptr_t);
-  void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE,
-                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-  if (mem == MAP_FAILED) {
-    perror("Failed to grow work-stealing dequeue");
-    DEBUG("Failed to allocate %zu bytes", size);
-    return 0;
-  }
-  buf->log_size = log_size;
-  buf->size = 1 << log_size;
-  buf->data = mem;
-  return 1;
-}
-
-static inline size_t
-trace_buf_size(struct trace_buf *buf) {
-  return buf->size;
-}
-
-static inline size_t
-trace_buf_byte_size(struct trace_buf *buf) {
-  return trace_buf_size(buf) * sizeof(uintptr_t);
-}
-
-static void
-trace_buf_release(struct trace_buf *buf) {
-  if (buf->data)
-    madvise(buf->data, trace_buf_byte_size(buf), MADV_DONTNEED);
-}
-
-static void
-trace_buf_destroy(struct trace_buf *buf) {
-  if (buf->data) {
-    munmap(buf->data, trace_buf_byte_size(buf));
-    buf->data = NULL;
-    buf->log_size = 0;
-    buf->size = 0;
-  }
-}
-
-static inline struct gc_ref
-trace_buf_get(struct trace_buf *buf, size_t i) {
-  return gc_ref(atomic_load_explicit(&buf->data[i & (buf->size - 1)],
-                                     memory_order_relaxed));
-}
-
-static inline void
-trace_buf_put(struct trace_buf *buf, size_t i, struct gc_ref ref) {
-  return atomic_store_explicit(&buf->data[i & (buf->size - 1)],
-                               gc_ref_value(ref),
-                               memory_order_relaxed);
-}
-
-static inline int
-trace_buf_grow(struct trace_buf *from, struct trace_buf *to,
-               size_t b, size_t t) {
-  if (from->log_size == trace_buf_max_log_size)
-    return 0;
-  if (!trace_buf_init (to, from->log_size + 1))
-    return 0;
-  for (size_t i=t; i<b; i++)
-    trace_buf_put(to, i, trace_buf_get(from, i));
-  return 1;
-}
-
-// Chase-Lev work-stealing deque. One thread pushes data into the deque
-// at the bottom, and many threads compete to steal data from the top.
-struct trace_deque {
-  // Ensure bottom and top are on different cache lines.
-  union {
-    atomic_size_t bottom;
-    char bottom_padding[64];
-  };
-  union {
-    atomic_size_t top;
-    char top_padding[64];
-  };
-  atomic_int active; // Which trace_buf is active.
-  struct trace_buf bufs[(trace_buf_max_log_size - trace_buf_min_log_size) + 1];
-};
-
-#define LOAD_RELAXED(loc) atomic_load_explicit(loc, memory_order_relaxed)
-#define STORE_RELAXED(loc, o) atomic_store_explicit(loc, o, memory_order_relaxed)
-
-#define LOAD_ACQUIRE(loc) atomic_load_explicit(loc, memory_order_acquire)
-#define STORE_RELEASE(loc, o) atomic_store_explicit(loc, o, memory_order_release)
-
-#define LOAD_CONSUME(loc) atomic_load_explicit(loc, memory_order_consume)
-
-static int
-trace_deque_init(struct trace_deque *q) {
-  memset(q, 0, sizeof (*q));
-  int ret = trace_buf_init(&q->bufs[0], trace_buf_min_log_size);
-  // Note, this fence isn't in the paper, I added it out of caution.
-  atomic_thread_fence(memory_order_release);
-  return ret;
-}
-
-static void
-trace_deque_release(struct trace_deque *q) {
-  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
-    trace_buf_release(&q->bufs[i]);
-}
-
-static void
-trace_deque_destroy(struct trace_deque *q) {
-  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
-    trace_buf_destroy(&q->bufs[i]);
-}
-
-static int
-trace_deque_grow(struct trace_deque *q, int cur, size_t b, size_t t) {
-  if (!trace_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) {
-    fprintf(stderr, "failed to grow deque!!\n");
-    GC_CRASH();
-  }
-
-  cur++;
-  STORE_RELAXED(&q->active, cur);
-  return cur;
-}
-
-static void
-trace_deque_push(struct trace_deque *q, struct gc_ref x) {
-  size_t b = LOAD_RELAXED(&q->bottom);
-  size_t t = LOAD_ACQUIRE(&q->top);
-  int active = LOAD_RELAXED(&q->active);
-
-  ssize_t size = b - t;
-  if (size > trace_buf_size(&q->bufs[active]) - 1) /* Full queue. */
-    active = trace_deque_grow(q, active, b, t);
-
-  trace_buf_put(&q->bufs[active], b, x);
-  atomic_thread_fence(memory_order_release);
-  STORE_RELAXED(&q->bottom, b + 1);
-}
-
-static void
-trace_deque_push_many(struct trace_deque *q, struct gc_ref *objv, size_t count) {
-  size_t b = LOAD_RELAXED(&q->bottom);
-  size_t t = LOAD_ACQUIRE(&q->top);
-  int active = LOAD_RELAXED(&q->active);
-
-  ssize_t size = b - t;
-  while (size > trace_buf_size(&q->bufs[active]) - count) /* Full queue. */
-    active = trace_deque_grow(q, active, b, t);
-
-  for (size_t i = 0; i < count; i++)
-    trace_buf_put(&q->bufs[active], b + i, objv[i]);
-  atomic_thread_fence(memory_order_release);
-  STORE_RELAXED(&q->bottom, b + count);
-}
-
-static struct gc_ref
-trace_deque_try_pop(struct trace_deque *q) {
-  size_t b = LOAD_RELAXED(&q->bottom);
-  int active = LOAD_RELAXED(&q->active);
-  STORE_RELAXED(&q->bottom, b - 1);
-  atomic_thread_fence(memory_order_seq_cst);
-  size_t t = LOAD_RELAXED(&q->top);
-  struct gc_ref x;
-  ssize_t size = b - t;
-  if (size > 0) { // Non-empty queue.
-    x = trace_buf_get(&q->bufs[active], b - 1);
-    if (size == 1) { // Single last element in queue.
-      if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
-                                                   memory_order_seq_cst,
-                                                   memory_order_relaxed))
-        // Failed race.
-        x = gc_ref_null();
-      STORE_RELAXED(&q->bottom, b);
-    }
-  } else { // Empty queue.
-    x = gc_ref_null();
-    STORE_RELAXED(&q->bottom, b);
-  }
-  return x;
-}
-
-static struct gc_ref
-trace_deque_steal(struct trace_deque *q) {
-  while (1) {
-    size_t t = LOAD_ACQUIRE(&q->top);
-    atomic_thread_fence(memory_order_seq_cst);
-    size_t b = LOAD_ACQUIRE(&q->bottom);
-    ssize_t size = b - t;
-    if (size <= 0)
-      return gc_ref_null();
-    int active = LOAD_CONSUME(&q->active);
-    struct gc_ref ref = trace_buf_get(&q->bufs[active], t);
-    if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
-                                                 memory_order_seq_cst,
-                                                 memory_order_relaxed))
-      // Failed race.
-      continue;
-    return ref;
-  }
-}
-
-static ssize_t
-trace_deque_size(struct trace_deque *q) {
-  size_t t = LOAD_ACQUIRE(&q->top);
-  atomic_thread_fence(memory_order_seq_cst);
-  size_t b = LOAD_ACQUIRE(&q->bottom);
-  ssize_t size = b - t;
-  return size;
-}
-
-static int
-trace_deque_can_steal(struct trace_deque *q) {
-  return trace_deque_size(q) > 0;
-}
-
-#undef LOAD_RELAXED
-#undef STORE_RELAXED
-#undef LOAD_ACQUIRE
-#undef STORE_RELEASE
-#undef LOAD_CONSUME
-
 #define LOCAL_TRACE_QUEUE_SIZE 1024
 #define LOCAL_TRACE_QUEUE_MASK (LOCAL_TRACE_QUEUE_SIZE - 1)
 #define LOCAL_TRACE_QUEUE_SHARE_AMOUNT (LOCAL_TRACE_QUEUE_SIZE * 3 / 4)

@@ -319,7 +79,7 @@ struct trace_worker {
   pthread_t thread;
   enum trace_worker_state state;
   pthread_mutex_t lock;
-  struct trace_deque deque;
+  struct shared_worklist deque;
 };
 
 #define TRACE_WORKERS_MAX_COUNT 8

@@ -335,7 +95,7 @@ struct tracer {
 
 struct local_tracer {
   struct trace_worker *worker;
-  struct trace_deque *share_deque;
+  struct shared_worklist *share_deque;
   struct local_trace_queue local;
 };
 

@@ -350,7 +110,7 @@ trace_worker_init(struct trace_worker *worker, struct gc_heap *heap,
   worker->steal_id = 0;
   worker->thread = 0;
   pthread_mutex_init(&worker->lock, NULL);
-  return trace_deque_init(&worker->deque);
+  return shared_worklist_init(&worker->deque);
 }
 
 static void trace_worker_trace(struct trace_worker *worker);

@@ -416,7 +176,7 @@ static void tracer_prepare(struct gc_heap *heap) {
 static void tracer_release(struct gc_heap *heap) {
   struct tracer *tracer = heap_tracer(heap);
   for (size_t i = 0; i < tracer->worker_count; i++)
-    trace_deque_release(&tracer->workers[i].deque);
+    shared_worklist_release(&tracer->workers[i].deque);
 }
 
 static inline void

@@ -453,7 +213,7 @@ tracer_share(struct local_tracer *trace) {
   while (to_share) {
     struct gc_ref *objv;
     size_t count = local_trace_queue_pop_many(&trace->local, &objv, to_share);
-    trace_deque_push_many(trace->share_deque, objv, count);
+    shared_worklist_push_many(trace->share_deque, objv, count);
     to_share -= count;
   }
   tracer_maybe_unpark_workers(heap_tracer(trace->worker->heap));

@@ -476,13 +236,13 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) {
 static struct gc_ref
 tracer_steal_from_worker(struct tracer *tracer, size_t id) {
   ASSERT(id < tracer->worker_count);
-  return trace_deque_steal(&tracer->workers[id].deque);
+  return shared_worklist_steal(&tracer->workers[id].deque);
 }
 
 static int
 tracer_can_steal_from_worker(struct tracer *tracer, size_t id) {
   ASSERT(id < tracer->worker_count);
-  return trace_deque_can_steal(&tracer->workers[id].deque);
+  return shared_worklist_can_steal(&tracer->workers[id].deque);
 }
 
 static struct gc_ref

@@ -502,7 +262,8 @@ trace_worker_steal_from_any(struct trace_worker *worker, struct tracer *tracer)
 }
 
 static int
-trace_worker_can_steal_from_any(struct trace_worker *worker, struct tracer *tracer) {
+trace_worker_can_steal_from_any(struct trace_worker *worker,
+                                struct tracer *tracer) {
   DEBUG("tracer #%zu: checking if any worker has tasks\n", worker->id);
   for (size_t i = 0; i < tracer->worker_count; i++) {
     int res = tracer_can_steal_from_worker(tracer, worker->steal_id);

@@ -561,7 +322,7 @@ trace_worker_steal(struct local_tracer *trace) {
   // something from the worker's own queue.
   {
     DEBUG("tracer #%zu: trying to pop worker's own deque\n", worker->id);
-    struct gc_ref obj = trace_deque_try_pop(&worker->deque);
+    struct gc_ref obj = shared_worklist_try_pop(&worker->deque);
     if (gc_ref_is_heap_object(obj))
       return obj;
   }

@@ -610,15 +371,15 @@ trace_worker_trace(struct trace_worker *worker) {
 
 static inline void
 tracer_enqueue_root(struct tracer *tracer, struct gc_ref ref) {
-  struct trace_deque *worker0_deque = &tracer->workers[0].deque;
-  trace_deque_push(worker0_deque, ref);
+  struct shared_worklist *worker0_deque = &tracer->workers[0].deque;
+  shared_worklist_push(worker0_deque, ref);
 }
 
 static inline void
 tracer_enqueue_roots(struct tracer *tracer, struct gc_ref *objv,
                      size_t count) {
-  struct trace_deque *worker0_deque = &tracer->workers[0].deque;
-  trace_deque_push_many(worker0_deque, objv, count);
+  struct shared_worklist *worker0_deque = &tracer->workers[0].deque;
+  shared_worklist_push_many(worker0_deque, objv, count);
 }
 
 static inline void

@@ -629,7 +390,7 @@ tracer_trace(struct gc_heap *heap) {
 
   ssize_t parallel_threshold =
     LOCAL_TRACE_QUEUE_SIZE - LOCAL_TRACE_QUEUE_SHARE_AMOUNT;
-  if (trace_deque_size(&tracer->workers[0].deque) >= parallel_threshold) {
+  if (shared_worklist_size(&tracer->workers[0].deque) >= parallel_threshold) {
     DEBUG("waking workers\n");
     tracer_unpark_all_workers(tracer);
   } else {
src/shared-worklist.h — new file, 259 lines:

#ifndef SHARED_WORKLIST_H
#define SHARED_WORKLIST_H

#include <stdatomic.h>
#include <sys/mman.h>
#include <unistd.h>

#include "assert.h"
#include "debug.h"
#include "gc-align.h"
#include "gc-inline.h"
#include "spin.h"

// The Chase-Lev work-stealing deque, as initially described in "Dynamic
// Circular Work-Stealing Deque" (Chase and Lev, SPAA'05)
// (https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf)
// and improved with C11 atomics in "Correct and Efficient Work-Stealing
// for Weak Memory Models" (Lê et al, PPoPP'13)
// (http://www.di.ens.fr/%7Ezappa/readings/ppopp13.pdf).

struct shared_worklist_buf {
  unsigned log_size;
  size_t size;
  uintptr_t *data;
};

// Min size: 8 kB on 64-bit systems, 4 kB on 32-bit.
#define shared_worklist_buf_min_log_size ((unsigned) 10)
// Max size: 2 GB on 64-bit systems, 1 GB on 32-bit.
#define shared_worklist_buf_max_log_size ((unsigned) 28)

static int
shared_worklist_buf_init(struct shared_worklist_buf *buf, unsigned log_size) {
  ASSERT(log_size >= shared_worklist_buf_min_log_size);
  ASSERT(log_size <= shared_worklist_buf_max_log_size);
  size_t size = (1 << log_size) * sizeof(uintptr_t);
  void *mem = mmap(NULL, size, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
  if (mem == MAP_FAILED) {
    perror("Failed to grow work-stealing dequeue");
    DEBUG("Failed to allocate %zu bytes", size);
    return 0;
  }
  buf->log_size = log_size;
  buf->size = 1 << log_size;
  buf->data = mem;
  return 1;
}

static inline size_t
shared_worklist_buf_size(struct shared_worklist_buf *buf) {
  return buf->size;
}

static inline size_t
shared_worklist_buf_byte_size(struct shared_worklist_buf *buf) {
  return shared_worklist_buf_size(buf) * sizeof(uintptr_t);
}

static void
shared_worklist_buf_release(struct shared_worklist_buf *buf) {
  if (buf->data)
    madvise(buf->data, shared_worklist_buf_byte_size(buf), MADV_DONTNEED);
}

static void
shared_worklist_buf_destroy(struct shared_worklist_buf *buf) {
  if (buf->data) {
    munmap(buf->data, shared_worklist_buf_byte_size(buf));
    buf->data = NULL;
    buf->log_size = 0;
    buf->size = 0;
  }
}

static inline struct gc_ref
shared_worklist_buf_get(struct shared_worklist_buf *buf, size_t i) {
  return gc_ref(atomic_load_explicit(&buf->data[i & (buf->size - 1)],
                                     memory_order_relaxed));
}

static inline void
shared_worklist_buf_put(struct shared_worklist_buf *buf, size_t i,
                        struct gc_ref ref) {
  return atomic_store_explicit(&buf->data[i & (buf->size - 1)],
                               gc_ref_value(ref),
                               memory_order_relaxed);
}

static inline int
shared_worklist_buf_grow(struct shared_worklist_buf *from,
                         struct shared_worklist_buf *to, size_t b, size_t t) {
  if (from->log_size == shared_worklist_buf_max_log_size)
    return 0;
  if (!shared_worklist_buf_init (to, from->log_size + 1))
    return 0;
  for (size_t i=t; i<b; i++)
    shared_worklist_buf_put(to, i, shared_worklist_buf_get(from, i));
  return 1;
}

// Chase-Lev work-stealing deque. One thread pushes data into the deque
// at the bottom, and many threads compete to steal data from the top.
struct shared_worklist {
  // Ensure bottom and top are on different cache lines.
  union {
    atomic_size_t bottom;
    char bottom_padding[AVOID_FALSE_SHARING];
  };
  union {
    atomic_size_t top;
    char top_padding[AVOID_FALSE_SHARING];
  };
  atomic_int active; // Which shared_worklist_buf is active.
  struct shared_worklist_buf bufs[(shared_worklist_buf_max_log_size -
                                   shared_worklist_buf_min_log_size) + 1];
};

#define LOAD_RELAXED(loc) atomic_load_explicit(loc, memory_order_relaxed)
#define STORE_RELAXED(loc, o) atomic_store_explicit(loc, o, memory_order_relaxed)

#define LOAD_ACQUIRE(loc) atomic_load_explicit(loc, memory_order_acquire)
#define STORE_RELEASE(loc, o) atomic_store_explicit(loc, o, memory_order_release)

#define LOAD_CONSUME(loc) atomic_load_explicit(loc, memory_order_consume)

static int
shared_worklist_init(struct shared_worklist *q) {
  memset(q, 0, sizeof (*q));
  int ret = shared_worklist_buf_init(&q->bufs[0],
                                     shared_worklist_buf_min_log_size);
  // Note, this fence isn't in the paper, I added it out of caution.
  atomic_thread_fence(memory_order_release);
  return ret;
}

static void
shared_worklist_release(struct shared_worklist *q) {
  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
    shared_worklist_buf_release(&q->bufs[i]);
}

static void
shared_worklist_destroy(struct shared_worklist *q) {
  for (int i = LOAD_RELAXED(&q->active); i >= 0; i--)
    shared_worklist_buf_destroy(&q->bufs[i]);
}

static int
shared_worklist_grow(struct shared_worklist *q, int cur, size_t b, size_t t) {
  if (!shared_worklist_buf_grow(&q->bufs[cur], &q->bufs[cur + 1], b, t)) {
    fprintf(stderr, "failed to grow deque!!\n");
    GC_CRASH();
  }

  cur++;
  STORE_RELAXED(&q->active, cur);
  return cur;
}

static void
shared_worklist_push(struct shared_worklist *q, struct gc_ref x) {
  size_t b = LOAD_RELAXED(&q->bottom);
  size_t t = LOAD_ACQUIRE(&q->top);
  int active = LOAD_RELAXED(&q->active);

  ssize_t size = b - t;
  if (size > shared_worklist_buf_size(&q->bufs[active]) - 1)
    active = shared_worklist_grow(q, active, b, t); /* Full queue; grow. */

  shared_worklist_buf_put(&q->bufs[active], b, x);
  atomic_thread_fence(memory_order_release);
  STORE_RELAXED(&q->bottom, b + 1);
}

static void
shared_worklist_push_many(struct shared_worklist *q, struct gc_ref *objv,
                          size_t count) {
  size_t b = LOAD_RELAXED(&q->bottom);
  size_t t = LOAD_ACQUIRE(&q->top);
  int active = LOAD_RELAXED(&q->active);

  ssize_t size = b - t;
  while (size > shared_worklist_buf_size(&q->bufs[active]) - count)
    active = shared_worklist_grow(q, active, b, t); /* Full queue; grow. */

  for (size_t i = 0; i < count; i++)
    shared_worklist_buf_put(&q->bufs[active], b + i, objv[i]);
  atomic_thread_fence(memory_order_release);
  STORE_RELAXED(&q->bottom, b + count);
}

static struct gc_ref
shared_worklist_try_pop(struct shared_worklist *q) {
  size_t b = LOAD_RELAXED(&q->bottom);
  int active = LOAD_RELAXED(&q->active);
  STORE_RELAXED(&q->bottom, b - 1);
  atomic_thread_fence(memory_order_seq_cst);
  size_t t = LOAD_RELAXED(&q->top);
  struct gc_ref x;
  ssize_t size = b - t;
  if (size > 0) { // Non-empty queue.
    x = shared_worklist_buf_get(&q->bufs[active], b - 1);
    if (size == 1) { // Single last element in queue.
      if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
                                                   memory_order_seq_cst,
                                                   memory_order_relaxed))
        // Failed race.
        x = gc_ref_null();
      STORE_RELAXED(&q->bottom, b);
    }
  } else { // Empty queue.
    x = gc_ref_null();
    STORE_RELAXED(&q->bottom, b);
  }
  return x;
}

static struct gc_ref
shared_worklist_steal(struct shared_worklist *q) {
  while (1) {
    size_t t = LOAD_ACQUIRE(&q->top);
    atomic_thread_fence(memory_order_seq_cst);
    size_t b = LOAD_ACQUIRE(&q->bottom);
    ssize_t size = b - t;
    if (size <= 0)
      return gc_ref_null();
    int active = LOAD_CONSUME(&q->active);
    struct gc_ref ref = shared_worklist_buf_get(&q->bufs[active], t);
    if (!atomic_compare_exchange_strong_explicit(&q->top, &t, t + 1,
                                                 memory_order_seq_cst,
                                                 memory_order_relaxed))
      // Failed race.
      continue;
    return ref;
  }
}

static ssize_t
shared_worklist_size(struct shared_worklist *q) {
  size_t t = LOAD_ACQUIRE(&q->top);
  atomic_thread_fence(memory_order_seq_cst);
  size_t b = LOAD_ACQUIRE(&q->bottom);
  ssize_t size = b - t;
  return size;
}

static int
shared_worklist_can_steal(struct shared_worklist *q) {
  return shared_worklist_size(q) > 0;
}

#undef LOAD_RELAXED
#undef STORE_RELAXED
#undef LOAD_ACQUIRE
#undef STORE_RELEASE
#undef LOAD_CONSUME

#endif // SHARED_WORKLIST_H
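For orientation, here is a sketch of how a tracer drives this API (condensed from the call sites patched above; the two loop functions, their names, and the explicit root-seeding are illustrative assumptions, not code from this commit). The owning thread pushes and pops at the bottom of its worklist while other workers steal from the top:

// Sketch only: the owner seeds and drains its own shared_worklist; any other
// worker may call shared_worklist_steal() on it concurrently. Thread setup,
// termination, and the local queue layer of the tracer are elided.
static void owner_loop(struct shared_worklist *q,
                       struct gc_ref *roots, size_t nroots) {
  shared_worklist_push_many(q, roots, nroots);       // seed with roots
  for (;;) {
    struct gc_ref ref = shared_worklist_try_pop(q);  // bottom end, owner only
    if (!gc_ref_is_heap_object(ref))
      break;                                         // empty, or lost a race
    // ... trace ref, calling shared_worklist_push(q, child) for each edge ...
  }
}

static void thief_loop(struct shared_worklist *victim) {
  while (shared_worklist_can_steal(victim)) {
    struct gc_ref ref = shared_worklist_steal(victim);  // top end, any thread
    if (gc_ref_is_heap_object(ref)) {
      // ... trace ref on this worker ...
    }
  }
}

shared_worklist_try_pop() returns gc_ref_null() when the deque is empty or when it loses the race for the last element, and shared_worklist_steal() returns it when the deque is empty, which is why callers test the result with gc_ref_is_heap_object() rather than assuming success.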