mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-30 03:40:34 +02:00
Fix hang in srfi-18.test
* libguile/threads.h (held_mutex): New field. * libguile/threads.c (enqueue, remqueue, dequeue): Use critical section to protect access to the queue. (guilify_self_1): Initialize held_mutex field. (on_thread_exit): If held_mutex non-null, unlock it. (fat_mutex_unlock, fat_cond_free, scm_make_condition_variable, fat_cond_signal, fat_cond_broadcast): Delete now unnecessary uses of c->lock. (fat_mutex_unlock): Pass m->lock to block_self() instead of c->lock; move scm_i_pthread_mutex_unlock(m->lock) call from before block_self() to after. (scm_pthread_cond_wait, scm_pthread_cond_timedwait, scm_i_thread_sleep_for_gc): Set held_mutex before pthread call; reset it afterwards. I was seeing a hang in srfi-18.test, when running make check in master, in the "exception handler installation is thread-safe" test. It wasn't 100% reproducible, so looked like a race. The problem is that wait-condition-variable is not actually atomic in the way that it is supposed to be. It unlocks the mutex, then starts waiting on the cond var. So it is possible for another thread to lock the same mutex, and signal the cond var, before the wait-condition-variable thread starts waiting. In order for wait-condition-variable to be atomic - e.g. in a race where thread A holds (Scheme-level) mutex M, and calls (wait-condition-variable C M), and thread B calls (begin (lock-mutex M) (signal-condition-variable C)) - it needs to call pthread_cond_wait with the same underlying mutex as is involved in the `lock-mutex' call. In terms of the threads.c code, this means that it has to use M->lock, not C->lock. block_self() used its mutex arg for two purposes: for protecting access and changes to the wait queue, and for the pthread_cond_wait call. But it wouldn't work reliably to use M->lock to protect C's wait queue, because in theory two threads can call (wait-condition-variable C M1) and (wait-condition-variable C M2) concurrently, with M1 and M2 different. So we either have to pass both C->lock and M->lock into block_self(), or use some other mutex to protect the wait queue. For this patch, I switched to using the critical section mutex, because that is a global and so easily available. (If that turns out to be a problem for performance, we could make each queue structure have its own mutex, but there's no reason to believe yet that it is a problem, because the critical section mutex isn't used much overall.) So then we call block_self() with M->lock, and move where M->lock is unlocked to after the block_self() call, instead of before. That solves the first hang, but introduces a new one, when a SRFI-18 thread is terminated (`thread-terminate!') between being launched (`make-thread') and started (`thread-start!'). The problem now is that pthread_cond_wait is a cancellation point (see man pthread_cancel), so the pthread_cond_wait call is one of the few places where a thread-terminate! call can take effect. If the thread is cancelled at that point, M->lock ends up still being locked, and then when do_thread_exit() tries to lock M->lock again, it hangs. The fix for that is a new `held_mutex' field in scm_i_thread, which is set to point to the mutex just before a pthread_cond_(timed)wait call, and set to NULL again afterwards. If on_thread_exit() finds that held_mutex is non-NULL, it unlocks that mutex. A detail is that checking and unlocking held_mutex must be done before on_thread_exit() calls scm_i_ensure_signal_delivery_thread(), because the innards of scm_i_ensure_signal_delivery_thread() can do another pthread_cond_wait() call and so overwrite held_mutex. But that's OK, because it's fine for the mutex check and unlock to happen outside Guile mode. Lastly, C->lock is then not needed, so I've removed it.
This commit is contained in:
parent
02e720ff76
commit
f9d44e8476
2 changed files with 34 additions and 21 deletions
|
@ -96,11 +96,13 @@ static SCM
|
|||
enqueue (SCM q, SCM t)
|
||||
{
|
||||
SCM c = scm_cons (t, SCM_EOL);
|
||||
SCM_CRITICAL_SECTION_START;
|
||||
if (scm_is_null (SCM_CDR (q)))
|
||||
SCM_SETCDR (q, c);
|
||||
else
|
||||
SCM_SETCDR (SCM_CAR (q), c);
|
||||
SCM_SETCAR (q, c);
|
||||
SCM_CRITICAL_SECTION_END;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
@ -113,6 +115,7 @@ static int
|
|||
remqueue (SCM q, SCM c)
|
||||
{
|
||||
SCM p, prev = q;
|
||||
SCM_CRITICAL_SECTION_START;
|
||||
for (p = SCM_CDR (q); !scm_is_null (p); p = SCM_CDR (p))
|
||||
{
|
||||
if (scm_is_eq (p, c))
|
||||
|
@ -120,10 +123,12 @@ remqueue (SCM q, SCM c)
|
|||
if (scm_is_eq (c, SCM_CAR (q)))
|
||||
SCM_SETCAR (q, SCM_CDR (c));
|
||||
SCM_SETCDR (prev, SCM_CDR (c));
|
||||
SCM_CRITICAL_SECTION_END;
|
||||
return 1;
|
||||
}
|
||||
prev = p;
|
||||
}
|
||||
SCM_CRITICAL_SECTION_END;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -133,14 +138,20 @@ remqueue (SCM q, SCM c)
|
|||
static SCM
|
||||
dequeue (SCM q)
|
||||
{
|
||||
SCM c = SCM_CDR (q);
|
||||
SCM c;
|
||||
SCM_CRITICAL_SECTION_START;
|
||||
c = SCM_CDR (q);
|
||||
if (scm_is_null (c))
|
||||
return SCM_BOOL_F;
|
||||
{
|
||||
SCM_CRITICAL_SECTION_END;
|
||||
return SCM_BOOL_F;
|
||||
}
|
||||
else
|
||||
{
|
||||
SCM_SETCDR (q, SCM_CDR (c));
|
||||
if (scm_is_null (SCM_CDR (q)))
|
||||
SCM_SETCAR (q, SCM_EOL);
|
||||
SCM_CRITICAL_SECTION_END;
|
||||
return SCM_CAR (c);
|
||||
}
|
||||
}
|
||||
|
@ -217,8 +228,7 @@ thread_free (SCM obj)
|
|||
interrupted. Upon return of this function, the current thread is
|
||||
no longer on QUEUE, even when the sleep has been interrupted.
|
||||
|
||||
The QUEUE data structure is assumed to be protected by MUTEX and
|
||||
the caller of block_self must hold MUTEX. It will be atomically
|
||||
The caller of block_self must hold MUTEX. It will be atomically
|
||||
unlocked while sleeping, just as with scm_i_pthread_cond_wait.
|
||||
|
||||
SLEEP_OBJECT is an arbitrary SCM value that is kept alive as long
|
||||
|
@ -266,9 +276,8 @@ block_self (SCM queue, SCM sleep_object, scm_i_pthread_mutex_t *mutex,
|
|||
return err;
|
||||
}
|
||||
|
||||
/* Wake up the first thread on QUEUE, if any. The caller must hold
|
||||
the mutex that protects QUEUE. The awoken thread is returned, or
|
||||
#f when the queue was empty.
|
||||
/* Wake up the first thread on QUEUE, if any. The awoken thread is
|
||||
returned, or #f if the queue was empty.
|
||||
*/
|
||||
static SCM
|
||||
unblock_from_queue (SCM queue)
|
||||
|
@ -441,6 +450,7 @@ guilify_self_1 (SCM_STACKITEM *base)
|
|||
t->result = SCM_BOOL_F;
|
||||
t->cleanup_handler = SCM_BOOL_F;
|
||||
t->mutexes = SCM_EOL;
|
||||
t->held_mutex = NULL;
|
||||
t->join_queue = SCM_EOL;
|
||||
t->dynamic_state = SCM_BOOL_F;
|
||||
t->dynwinds = SCM_EOL;
|
||||
|
@ -591,6 +601,14 @@ on_thread_exit (void *v)
|
|||
/* This handler is executed in non-guile mode. */
|
||||
scm_i_thread *t = (scm_i_thread *) v, **tp;
|
||||
|
||||
/* If this thread was cancelled while doing a cond wait, it will
|
||||
still have a mutex locked, so we unlock it here. */
|
||||
if (t->held_mutex)
|
||||
{
|
||||
scm_i_pthread_mutex_unlock (t->held_mutex);
|
||||
t->held_mutex = NULL;
|
||||
}
|
||||
|
||||
scm_i_pthread_setspecific (scm_i_thread_key, v);
|
||||
|
||||
/* Ensure the signal handling thread has been launched, because we might be
|
||||
|
@ -1419,17 +1437,15 @@ fat_mutex_unlock (SCM mutex, SCM cond,
|
|||
{
|
||||
int brk = 0;
|
||||
|
||||
scm_i_scm_pthread_mutex_lock (&c->lock);
|
||||
if (m->level > 0)
|
||||
m->level--;
|
||||
if (m->level == 0)
|
||||
m->owner = unblock_from_queue (m->waiting);
|
||||
|
||||
scm_i_pthread_mutex_unlock (&m->lock);
|
||||
|
||||
t->block_asyncs++;
|
||||
|
||||
err = block_self (c->waiting, cond, &c->lock, waittime);
|
||||
err = block_self (c->waiting, cond, &m->lock, waittime);
|
||||
scm_i_pthread_mutex_unlock (&m->lock);
|
||||
|
||||
if (err == 0)
|
||||
{
|
||||
|
@ -1444,7 +1460,6 @@ fat_mutex_unlock (SCM mutex, SCM cond,
|
|||
else if (err != EINTR)
|
||||
{
|
||||
errno = err;
|
||||
scm_i_pthread_mutex_unlock (&c->lock);
|
||||
scm_syserror (NULL);
|
||||
}
|
||||
|
||||
|
@ -1452,12 +1467,9 @@ fat_mutex_unlock (SCM mutex, SCM cond,
|
|||
{
|
||||
if (relock)
|
||||
scm_lock_mutex_timed (mutex, SCM_UNDEFINED, owner);
|
||||
scm_i_pthread_mutex_unlock (&c->lock);
|
||||
break;
|
||||
}
|
||||
|
||||
scm_i_pthread_mutex_unlock (&c->lock);
|
||||
|
||||
t->block_asyncs--;
|
||||
scm_async_click ();
|
||||
|
||||
|
@ -1572,7 +1584,6 @@ static size_t
|
|||
fat_cond_free (SCM mx)
|
||||
{
|
||||
fat_cond *c = SCM_CONDVAR_DATA (mx);
|
||||
scm_i_pthread_mutex_destroy (&c->lock);
|
||||
scm_gc_free (c, sizeof (fat_cond), "condition-variable");
|
||||
return 0;
|
||||
}
|
||||
|
@ -1596,7 +1607,6 @@ SCM_DEFINE (scm_make_condition_variable, "make-condition-variable", 0, 0, 0,
|
|||
SCM cv;
|
||||
|
||||
c = scm_gc_malloc (sizeof (fat_cond), "condition variable");
|
||||
scm_i_pthread_mutex_init (&c->lock, 0);
|
||||
c->waiting = SCM_EOL;
|
||||
SCM_NEWSMOB (cv, scm_tc16_condvar, (scm_t_bits) c);
|
||||
c->waiting = make_queue ();
|
||||
|
@ -1635,9 +1645,7 @@ SCM_DEFINE (scm_timed_wait_condition_variable, "wait-condition-variable", 2, 1,
|
|||
static void
|
||||
fat_cond_signal (fat_cond *c)
|
||||
{
|
||||
scm_i_scm_pthread_mutex_lock (&c->lock);
|
||||
unblock_from_queue (c->waiting);
|
||||
scm_i_pthread_mutex_unlock (&c->lock);
|
||||
}
|
||||
|
||||
SCM_DEFINE (scm_signal_condition_variable, "signal-condition-variable", 1, 0, 0,
|
||||
|
@ -1654,10 +1662,8 @@ SCM_DEFINE (scm_signal_condition_variable, "signal-condition-variable", 1, 0, 0,
|
|||
static void
|
||||
fat_cond_broadcast (fat_cond *c)
|
||||
{
|
||||
scm_i_scm_pthread_mutex_lock (&c->lock);
|
||||
while (scm_is_true (unblock_from_queue (c->waiting)))
|
||||
;
|
||||
scm_i_pthread_mutex_unlock (&c->lock);
|
||||
}
|
||||
|
||||
SCM_DEFINE (scm_broadcast_condition_variable, "broadcast-condition-variable", 1, 0, 0,
|
||||
|
@ -1806,7 +1812,9 @@ int
|
|||
scm_pthread_cond_wait (scm_i_pthread_cond_t *cond, scm_i_pthread_mutex_t *mutex)
|
||||
{
|
||||
scm_t_guile_ticket t = scm_leave_guile ();
|
||||
((scm_i_thread *)t)->held_mutex = mutex;
|
||||
int res = scm_i_pthread_cond_wait (cond, mutex);
|
||||
((scm_i_thread *)t)->held_mutex = NULL;
|
||||
scm_enter_guile (t);
|
||||
return res;
|
||||
}
|
||||
|
@ -1817,7 +1825,9 @@ scm_pthread_cond_timedwait (scm_i_pthread_cond_t *cond,
|
|||
const scm_t_timespec *wt)
|
||||
{
|
||||
scm_t_guile_ticket t = scm_leave_guile ();
|
||||
((scm_i_thread *)t)->held_mutex = mutex;
|
||||
int res = scm_i_pthread_cond_timedwait (cond, mutex, wt);
|
||||
((scm_i_thread *)t)->held_mutex = NULL;
|
||||
scm_enter_guile (t);
|
||||
return res;
|
||||
}
|
||||
|
@ -1966,7 +1976,9 @@ void
|
|||
scm_i_thread_sleep_for_gc ()
|
||||
{
|
||||
scm_i_thread *t = suspend ();
|
||||
t->held_mutex = &t->heap_mutex;
|
||||
scm_i_pthread_cond_wait (&wake_up_cond, &t->heap_mutex);
|
||||
t->held_mutex = NULL;
|
||||
resume (t);
|
||||
}
|
||||
|
||||
|
|
|
@ -56,6 +56,7 @@ typedef struct scm_i_thread {
|
|||
|
||||
scm_i_pthread_mutex_t admin_mutex;
|
||||
SCM mutexes;
|
||||
scm_i_pthread_mutex_t *held_mutex;
|
||||
|
||||
SCM result;
|
||||
int canceled;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue