[Ecls-list] [PATCH] Fix locks and condition variables.

Mon Sep 21 17:48:48 UTC 2009

1) As far as I understand, accessing the environment might
   trigger an interrupt. Since the inside of the lock functions
   is somewhat critical, get the process pointer beforehand.
2) If get-lock jumps to OUTPUT when the process seems to own
   it already, so should giveup-lock.
3) Waiting on a condition variable temporarily gives up the
   lock, and restores the ownership before returning. This
   should be reflected in the lock structure fields.
4) The lock counter is meaningless if the lock is not held
   by the current thread, so it is necessary to check both
   lock-count and lock-holder inside with-lock.
---

	Failure mode for 2: 
	 - Thread A acquires the lock, count = 1
	 - Thread A reacquires the lock, count = 2
	 - Thread A releases the lock, count = 1
	 - Thread B steals the lock, count = 2

	Failure mode for 3:
	 - Thread A waits on a condition (count left at 1)
	 - Thread B acquires the lock (count = 2)
	 - Thread B releases the lock (count = 1, holder not reset)
	 - Thread B acquires the lock 
	   (doesn't actually, since get-lock thinks that it already has it)

	Failure mode for 4:
	 - Thread A acquires the lock, count = 1
	 - Thread B reads the count
	 - Thread A releases the lock, count = 0
	 - Thread B acquires the lock via with-lock, count = 1
	 - with-lock does not release the lock

	My program very quickly trips on problems coming from 3 and 4.

	Alexander


 src/c/threads.d |   53 +++++++++++++++++++++++++++++++++++++++++++----------
 src/lsp/mp.lsp  |   12 ++++++++----
 2 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/src/c/threads.d b/src/c/threads.d
index 2b0156d..29ac771 100644
--- a/src/c/threads.d
+++ b/src/c/threads.d
@@ -516,6 +516,8 @@ mp_giveup_lock(cl_object lock)
 	}
 	if (--lock->lock.counter == 0) {
 		lock->lock.holder = Cnil;
+	} else {
+		goto OUTPUT;
 	}
 #ifdef ECL_WINDOWS_THREADS
         if (ReleaseMutex(lock->lock.mutex) == 0)
@@ -523,16 +525,18 @@ mp_giveup_lock(cl_object lock)
 #else
 	pthread_mutex_unlock(&lock->lock.mutex);
 #endif
+OUTPUT:
 	@(return Ct)
 }
 
 @(defun mp::get-lock (lock &optional (wait Ct))
-	cl_object output;
+	cl_object output, own_process;
 	int rc;
 @
+	own_process = mp_current_process();
 	if (type_of(lock) != t_lock)
 		FEwrong_type_argument(@'mp::lock', lock);
-        if (lock->lock.holder == the_env->own_process) {
+        if (lock->lock.holder == own_process) {
                 if (!lock->lock.recursive) {
                         FEerror("A recursive attempt was made to hold lock ~S",
                                 1, lock);
@@ -549,7 +553,7 @@ mp_giveup_lock(cl_object lock)
 	switch (WaitForSingleObject(lock->lock.mutex, (wait==Ct?INFINITE:0))) {
 		case WAIT_OBJECT_0:
                         lock->lock.counter++;
-                        lock->lock.holder = the_env->own_process;
+                        lock->lock.holder = own_process;
                         output = Ct;
 			break;
 		case WAIT_TIMEOUT:
@@ -572,7 +576,7 @@ mp_giveup_lock(cl_object lock)
 	}
 	if (rc == 0) {
 		lock->lock.counter++;
-		lock->lock.holder = the_env->own_process;
+		lock->lock.holder = own_process;
 		output = Ct;
 	} else if (rc == EBUSY) {
 		output = Cnil;
@@ -612,13 +616,27 @@ mp_condition_variable_wait(cl_object cv, cl_object lock)
 #ifdef ECL_WINDOWS_THREADS
 	FEerror("Condition variables are not supported under Windows.", 0);
 #else
+	int count, rc;
+	cl_object own_process = mp_current_process();
 	if (type_of(cv) != t_condition_variable)
 		FEwrong_type_argument(@'mp::condition-variable', cv);
 	if (type_of(lock) != t_lock)
 		FEwrong_type_argument(@'mp::lock', lock);
-	if (pthread_cond_wait(&cv->condition_variable.cv,
-	                      &lock->lock.mutex) == 0)
-		lock->lock.holder = mp_current_process();
+	if (lock->lock.holder != own_process) {
+		FEerror("Attempt wait on a condition variable using lock ~S that is not owned by ~S.", 2,
+			lock, own_process);
+	}
+	count = lock->lock.counter;
+	lock->lock.counter = 0;
+	lock->lock.holder = Cnil;
+	rc = pthread_cond_wait(&cv->condition_variable.cv,
+	                       &lock->lock.mutex);
+	lock->lock.counter = count;
+	lock->lock.holder = own_process;
+	if (rc != 0) {
+		FEerror("Unable to wait on a condition. Error code: %d.", 1,
+			MAKE_FIXNUM(rc));
+	}
 #endif
 	@(return Ct)
 }
@@ -629,6 +647,8 @@ mp_condition_variable_timedwait(cl_object cv, cl_object lock, cl_object seconds)
 #ifdef ECL_WINDOWS_THREADS
 	FEerror("Condition variables are not supported under Windows.", 0);
 #else
+	int count, rc;
+	cl_object own_process = mp_current_process();
 	double r;
 	struct timespec   ts;
 	struct timeval    tp;
@@ -637,6 +657,10 @@ mp_condition_variable_timedwait(cl_object cv, cl_object lock, cl_object seconds)
 		FEwrong_type_argument(@'mp::condition-variable', cv);
 	if (type_of(lock) != t_lock)
 		FEwrong_type_argument(@'mp::lock', lock);
+	if (lock->lock.holder != own_process) {
+		FEerror("Attempt wait on a condition variable using lock ~S that is not owned by ~S.", 2,
+			lock, own_process);
+	}
 	/* INV: ecl_minusp() makes sure `seconds' is real */
 	if (ecl_minusp(seconds))
 		cl_error(9, @'simple-type-error', @':format-control',
@@ -656,11 +680,20 @@ mp_condition_variable_timedwait(cl_object cv, cl_object lock, cl_object seconds)
 		ts.tv_nsec -= 1e9;
 		ts.tv_sec++;
 	}
-	if (pthread_cond_timedwait(&cv->condition_variable.cv,
-	                           &lock->lock.mutex, &ts) == 0) {
-		lock->lock.holder = mp_current_process();
+	count = lock->lock.counter;
+	lock->lock.counter = 0;
+	lock->lock.holder = Cnil;
+	rc = pthread_cond_timedwait(&cv->condition_variable.cv,
+	                            &lock->lock.mutex, &ts);
+	lock->lock.counter = count;
+	lock->lock.holder = own_process;
+	if (rc == 0) {
 		@(return Ct)
 	} else {
+		if (rc != ETIMEDOUT) {
+			FEerror("Unable to wait on a condition. Error code: %d.", 1,
+				MAKE_FIXNUM(rc));
+		}
 		@(return Cnil)
 	}
 #endif
diff --git a/src/lsp/mp.lsp b/src/lsp/mp.lsp
index 3917f08..7473798 100644
--- a/src/lsp/mp.lsp
+++ b/src/lsp/mp.lsp
@@ -89,20 +89,24 @@ WITHOUT-INTERRUPTS in:
   ;; Why do we need %count? Even if get-lock succeeeds, an interrupt may
   ;; happen between the end of get-lock and when we save the output of
   ;; the function. That means we lose the information and ignore that
-  ;; the lock was actually acquired. Furthermore, a lock can be recursive
-  ;; and mp:lock-holder is also not reliable.
+  ;; the lock was actually acquired. Furthermore, since lock-count is
+  ;; meaningless if the lock is not held by the current thread, it is
+  ;; necessary to check mp:lock-holder before and after the body.
   ;;
   ;; Next notice how we need to disable interrupts around the body and
   ;; the get-lock statement, to ensure that the unlocking is done with
   ;; interrupts disabled.
   #+threads
-  (ext:with-unique-names (lock count interrupts)
+  (ext:with-unique-names (lock my-lock count interrupts)
     `(let* ((,lock ,lock-form)
+            (,my-lock (eql (mp:lock-holder ,lock) mp:*current-process*))
             (,count (mp:lock-count ,lock)))
        (without-interrupts
            (unwind-protect
                 (with-restored-interrupts
                     (mp::get-lock ,lock)
                   (locally , at body))
-             (when (> (mp:lock-count ,lock) ,count)
+             (when (and (eql (mp:lock-holder ,lock) mp:*current-process*)
+                        (or (not ,my-lock) (> (mp:lock-count ,lock) ,count)))
                (mp::giveup-lock ,lock)))))))
+
-- 
1.6.3.rc3.12.gb7937