
Rework parallel tracing state machine

Instead of sending a message to each worker, we pthread_cond_broadcast
at the start.  Instead of having N worker threads, we have N-1 threads
and the main thread also does work.  Instead of termination being
detected by the worker threads, let the main thread detect it.  Avoid
parallelism if the mark stack is small enough, which can be the case for
ephemeron chains.  Let aux threads exit when they have no work instead
of spinning: sharing will start them up again.
Author: Andy Wingo
Date:   2023-10-23 11:26:33 +02:00
Parent: 9cc12916a9
Commit: 3d2a12c684
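The core of the new scheme is easy to see in isolation: a shared epoch counter plus a single pthread_cond_broadcast wakes the N-1 helper threads, while the coordinating thread traces on its own stack instead of waiting. Below is a minimal, self-contained sketch of that wakeup pattern in plain C11 + pthreads; the names (struct pool, do_trace_round, HELPER_COUNT) are invented for illustration and termination detection is elided, so this is not the parallel-tracer.h code itself.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define HELPER_COUNT 3   /* N-1 helpers; the calling thread is worker 0 */

struct pool {
  atomic_long epoch;      /* bumped once per trace; the wake-up condition */
  pthread_mutex_t lock;
  pthread_cond_t cond;    /* broadcast once instead of signaling each worker */
};

static struct pool pool = {
  .lock = PTHREAD_MUTEX_INITIALIZER,
  .cond = PTHREAD_COND_INITIALIZER,
};

/* Stand-in for "drain the local queue, steal, return when out of work". */
static void do_trace_round(size_t id, long epoch) {
  printf("worker %zu: tracing for epoch %ld\n", id, epoch);
}

static void* helper(void *arg) {
  size_t id = (size_t)(uintptr_t)arg;
  long seen = 0;
  pthread_mutex_lock(&pool.lock);
  for (;;) {
    /* Park until the epoch moves past the last round we completed; spurious
       wakeups and broadcasts we already handled just re-check the predicate. */
    while (atomic_load_explicit(&pool.epoch, memory_order_acquire) == seen)
      pthread_cond_wait(&pool.cond, &pool.lock);
    seen = atomic_load_explicit(&pool.epoch, memory_order_acquire);
    pthread_mutex_unlock(&pool.lock);
    do_trace_round(id, seen);          /* trace without holding the lock */
    pthread_mutex_lock(&pool.lock);
  }
  return NULL;
}

int main(void) {
  pthread_t threads[HELPER_COUNT];
  for (uintptr_t i = 0; i < HELPER_COUNT; i++)
    pthread_create(&threads[i], NULL, helper, (void *)(i + 1));

  /* Coordinator: bump the epoch, broadcast once, then do a share of the work
     on this thread rather than idling until the helpers finish. */
  long epoch =
    atomic_fetch_add_explicit(&pool.epoch, 1, memory_order_acq_rel) + 1;
  pthread_mutex_lock(&pool.lock);
  pthread_cond_broadcast(&pool.cond);
  pthread_mutex_unlock(&pool.lock);
  do_trace_round(0, epoch);

  /* Termination detection (trylock each helper and confirm no work remains)
     is elided; exiting main tears the helpers down with the process. */
  return 0;
}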


@@ -319,7 +319,6 @@ struct trace_worker {
   pthread_t thread;
   enum trace_worker_state state;
   pthread_mutex_t lock;
-  pthread_cond_t cond;
   struct trace_deque deque;
 };
@@ -328,8 +327,7 @@ struct trace_worker {
 struct tracer {
   atomic_size_t active_tracers;
   size_t worker_count;
-  atomic_size_t running_tracers;
-  long count;
+  long epoch;
   pthread_mutex_t lock;
   pthread_cond_t cond;
   struct trace_worker workers[TRACE_WORKERS_MAX_COUNT];
@@ -351,9 +349,7 @@ trace_worker_init(struct trace_worker *worker, struct gc_heap *heap,
   worker->id = id;
   worker->steal_id = 0;
   worker->thread = 0;
-  worker->state = TRACE_WORKER_STOPPED;
   pthread_mutex_init(&worker->lock, NULL);
-  pthread_cond_init(&worker->cond, NULL);
   return trace_deque_init(&worker->deque);
 }
@@ -362,89 +358,46 @@ static void trace_worker_trace(struct trace_worker *worker);
 static void*
 trace_worker_thread(void *data) {
   struct trace_worker *worker = data;
+  struct tracer *tracer = heap_tracer(worker->heap);
+  long trace_epoch = 0;
 
   pthread_mutex_lock(&worker->lock);
   while (1) {
-    switch (worker->state) {
-    case TRACE_WORKER_IDLE:
-      pthread_cond_wait(&worker->cond, &worker->lock);
-      break;
-    case TRACE_WORKER_TRACING:
+    long epoch = atomic_load_explicit(&tracer->epoch, memory_order_acquire);
+    if (trace_epoch != epoch) {
+      trace_epoch = epoch;
       trace_worker_trace(worker);
-      worker->state = TRACE_WORKER_IDLE;
-      break;
-    case TRACE_WORKER_STOPPING:
-      worker->state = TRACE_WORKER_DEAD;
-      pthread_mutex_unlock(&worker->lock);
-      return NULL;
-    default:
-      GC_CRASH();
     }
+    pthread_cond_wait(&tracer->cond, &worker->lock);
   }
+  return NULL;
 }
 
 static int
 trace_worker_spawn(struct trace_worker *worker) {
-  pthread_mutex_lock(&worker->lock);
-  ASSERT(worker->state == TRACE_WORKER_STOPPED);
-  worker->state = TRACE_WORKER_IDLE;
-  pthread_mutex_unlock(&worker->lock);
-
   if (pthread_create(&worker->thread, NULL, trace_worker_thread, worker)) {
     perror("spawning tracer thread failed");
-    worker->state = TRACE_WORKER_STOPPED;
     return 0;
   }
 
   return 1;
 }
 
-static void
-trace_worker_request_trace(struct trace_worker *worker) {
-  struct tracer *tracer = heap_tracer(worker->heap);
-
-  pthread_mutex_lock(&worker->lock);
-  ASSERT(worker->state == TRACE_WORKER_IDLE);
-  worker->state = TRACE_WORKER_TRACING;
-  pthread_cond_signal(&worker->cond);
-  pthread_mutex_unlock(&worker->lock);
-}
-
-static void
-trace_worker_finished_tracing(struct trace_worker *worker) {
-  // Signal controller that we are done with tracing.
-  struct tracer *tracer = heap_tracer(worker->heap);
-  if (atomic_fetch_sub(&tracer->running_tracers, 1) == 1) {
-    pthread_mutex_lock(&tracer->lock);
-    tracer->count++;
-    pthread_cond_signal(&tracer->cond);
-    pthread_mutex_unlock(&tracer->lock);
-  }
-}
-
-static void
-trace_worker_request_stop(struct trace_worker *worker) {
-  pthread_mutex_lock(&worker->lock);
-  ASSERT(worker->state == TRACE_WORKER_IDLE);
-  worker->state = TRACE_WORKER_STOPPING;
-  pthread_cond_signal(&worker->cond);
-  pthread_mutex_unlock(&worker->lock);
-}
-
 static int
 tracer_init(struct gc_heap *heap, size_t parallelism) {
   struct tracer *tracer = heap_tracer(heap);
   atomic_init(&tracer->active_tracers, 0);
-  atomic_init(&tracer->running_tracers, 0);
-  tracer->count = 0;
+  tracer->epoch = 0;
   pthread_mutex_init(&tracer->lock, NULL);
   pthread_cond_init(&tracer->cond, NULL);
   size_t desired_worker_count = parallelism;
   ASSERT(desired_worker_count);
   if (desired_worker_count > TRACE_WORKERS_MAX_COUNT)
     desired_worker_count = TRACE_WORKERS_MAX_COUNT;
-  for (size_t i = 0; i < desired_worker_count; i++) {
+  if (!trace_worker_init(&tracer->workers[0], heap, tracer, 0))
+    return 0;
+  tracer->worker_count++;
+  for (size_t i = 1; i < desired_worker_count; i++) {
     if (!trace_worker_init(&tracer->workers[i], heap, tracer, i))
       break;
     if (trace_worker_spawn(&tracer->workers[i]))
@@ -452,7 +405,7 @@ tracer_init(struct gc_heap *heap, size_t parallelism) {
     else
       break;
   }
-  return tracer->worker_count > 0;
+  return 1;
 }
 
 static void tracer_prepare(struct gc_heap *heap) {
@@ -466,6 +419,24 @@ static void tracer_release(struct gc_heap *heap) {
     trace_deque_release(&tracer->workers[i].deque);
 }
 
+static inline void
+tracer_unpark_all_workers(struct tracer *tracer) {
+  long old_epoch =
+    atomic_fetch_add_explicit(&tracer->epoch, 1, memory_order_acq_rel);
+  long epoch = old_epoch + 1;
+  DEBUG("starting trace; %zu workers; epoch=%ld\n", tracer->worker_count,
+        epoch);
+  pthread_cond_broadcast(&tracer->cond);
+}
+
+static inline void
+tracer_maybe_unpark_workers(struct tracer *tracer) {
+  size_t active =
+    atomic_load_explicit(&tracer->active_tracers, memory_order_acquire);
+  if (active < tracer->worker_count)
+    tracer_unpark_all_workers(tracer);
+}
+
 static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap,
                                 void *trace_data) GC_ALWAYS_INLINE;
 static inline void tracer_enqueue(struct gc_ref ref, struct gc_heap *heap,
@@ -485,6 +456,7 @@ tracer_share(struct local_tracer *trace) {
     trace_deque_push_many(trace->share_deque, objv, count);
     to_share -= count;
   }
+  tracer_maybe_unpark_workers(heap_tracer(trace->worker->heap));
 }
 
 static inline void
@@ -501,16 +473,6 @@ tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) {
     tracer_enqueue(gc_edge_ref(edge), heap, trace_data);
 }
 
-static inline void
-tracer_visit_(struct gc_edge edge, struct gc_heap *heap, void *trace_data) {
-  if (trace_edge(heap, edge)) {
-    struct local_tracer *trace = trace_data;
-    if (local_trace_queue_full(&trace->local))
-      tracer_share(trace);
-    local_trace_queue_push(&trace->local, gc_edge_ref(edge));
-  }
-}
-
 static struct gc_ref
 tracer_steal_from_worker(struct tracer *tracer, size_t id) {
   ASSERT(id < tracer->worker_count);
@@ -556,28 +518,35 @@ trace_worker_can_steal_from_any(struct trace_worker *worker, struct tracer *trac
 }
 
 static int
-trace_worker_check_termination(struct trace_worker *worker,
-                               struct tracer *tracer) {
-  // We went around all workers and nothing. Enter termination phase.
-  if (atomic_fetch_sub_explicit(&tracer->active_tracers, 1,
-                                memory_order_relaxed) == 1) {
-    DEBUG(" ->> tracer #%zu: DONE (no spinning) <<-\n", worker->id);
-    return 1;
-  }
+trace_worker_should_continue(struct trace_worker *worker) {
+  // Helper workers should park themselves immediately if they have no work.
+  if (worker->id != 0)
+    return 0;
+
+  struct tracer *tracer = heap_tracer(worker->heap);
 
   for (size_t spin_count = 0;; spin_count++) {
-    if (trace_worker_can_steal_from_any(worker, tracer)) {
-      atomic_fetch_add_explicit(&tracer->active_tracers, 1,
-                                memory_order_relaxed);
-      return 0;
-    }
     if (atomic_load_explicit(&tracer->active_tracers,
-                             memory_order_relaxed) == 0) {
-      DEBUG(" ->> tracer #%zu: DONE <<-\n", worker->id);
-      return 1;
+                             memory_order_acquire) == 1) {
+      // All trace workers have exited except us, the main worker. We are
+      // probably done, but we need to synchronize to be sure that there is no
+      // work pending, for example if a worker had a spurious wakeup. Skip
+      // worker 0 (the main worker).
+      size_t locked = 1;
+      while (locked < tracer->worker_count) {
+        if (pthread_mutex_trylock(&tracer->workers[locked].lock) == 0)
+          locked++;
+        else
+          break;
+      }
+      int done = (locked == tracer->worker_count) &&
+        !trace_worker_can_steal_from_any(worker, tracer);
+      while (locked > 1)
+        pthread_mutex_unlock(&tracer->workers[--locked].lock);
+      return !done;
     }
     // spin
-    DEBUG("tracer #%zu: spinning #%zu\n", worker->id, spin_count);
+    DEBUG("checking for termination: spinning #%zu\n", spin_count);
     yield_for_spin(spin_count);
   }
 }
@@ -597,42 +566,46 @@ trace_worker_steal(struct local_tracer *trace) {
     return obj;
   }
 
-  while (1) {
-    DEBUG("tracer #%zu: trying to steal\n", worker->id);
-    struct gc_ref obj = trace_worker_steal_from_any(worker, tracer);
-    if (gc_ref_is_heap_object(obj))
-      return obj;
-    if (trace_worker_check_termination(worker, tracer))
-      return gc_ref_null();
-  }
+  DEBUG("tracer #%zu: trying to steal\n", worker->id);
+  struct gc_ref obj = trace_worker_steal_from_any(worker, tracer);
+  if (gc_ref_is_heap_object(obj))
+    return obj;
+
+  return gc_ref_null();
 }
 
 static void
 trace_worker_trace(struct trace_worker *worker) {
+  struct gc_heap *heap = worker->heap;
+  struct tracer *tracer = heap_tracer(heap);
+  atomic_fetch_add_explicit(&tracer->active_tracers, 1, memory_order_acq_rel);
+
   struct local_tracer trace;
   trace.worker = worker;
   trace.share_deque = &worker->deque;
-  struct gc_heap *heap = worker->heap;
   local_trace_queue_init(&trace.local);
 
   size_t n = 0;
   DEBUG("tracer #%zu: running trace loop\n", worker->id);
-  while (1) {
-    struct gc_ref ref;
-    if (!local_trace_queue_empty(&trace.local)) {
-      ref = local_trace_queue_pop(&trace.local);
-    } else {
-      ref = trace_worker_steal(&trace);
-      if (!gc_ref_is_heap_object(ref))
-        break;
+
+  do {
+    while (1) {
+      struct gc_ref ref;
+      if (!local_trace_queue_empty(&trace.local)) {
+        ref = local_trace_queue_pop(&trace.local);
+      } else {
+        ref = trace_worker_steal(&trace);
+        if (!gc_ref_is_heap_object(ref))
+          break;
+      }
+      trace_one(ref, heap, &trace);
+      n++;
     }
-    trace_one(ref, heap, &trace);
-    n++;
-  }
+  } while (trace_worker_should_continue(worker));
+
   DEBUG("tracer #%zu: done tracing, %zu objects traced\n", worker->id, n);
-  trace_worker_finished_tracing(worker);
+
+  atomic_fetch_sub_explicit(&tracer->active_tracers, 1, memory_order_acq_rel);
 }
 
 static inline void
@@ -652,25 +625,18 @@ static inline void
 tracer_trace(struct gc_heap *heap) {
   struct tracer *tracer = heap_tracer(heap);
 
-  pthread_mutex_lock(&tracer->lock);
-  long trace_count = tracer->count;
-  pthread_mutex_unlock(&tracer->lock);
-
   DEBUG("starting trace; %zu workers\n", tracer->worker_count);
-  DEBUG("waking workers\n");
-  atomic_store_explicit(&tracer->active_tracers, tracer->worker_count,
-                        memory_order_release);
-  atomic_store_explicit(&tracer->running_tracers, tracer->worker_count,
-                        memory_order_release);
-  for (size_t i = 0; i < tracer->worker_count; i++)
-    trace_worker_request_trace(&tracer->workers[i]);
 
-  DEBUG("waiting on tracers\n");
+  ssize_t parallel_threshold =
+    LOCAL_TRACE_QUEUE_SIZE - LOCAL_TRACE_QUEUE_SHARE_AMOUNT;
+  if (trace_deque_size(&tracer->workers[0].deque) >= parallel_threshold) {
+    DEBUG("waking workers\n");
+    tracer_unpark_all_workers(tracer);
+  } else {
+    DEBUG("starting in local-only mode\n");
+  }
 
-  pthread_mutex_lock(&tracer->lock);
-  while (tracer->count <= trace_count)
-    pthread_cond_wait(&tracer->cond, &tracer->lock);
-  pthread_mutex_unlock(&tracer->lock);
+  trace_worker_trace(&tracer->workers[0]);
 
   DEBUG("trace finished\n");
 }