From ef5699424a8b2b40f286c8afc20d9ed9da825f1f Mon Sep 17 00:00:00 2001
From: bobijam <bobijam>
Date: Wed, 18 Jun 2008 01:41:40 +0000
Subject: [PATCH] Branch b1_8

b=14480
o=green
i=shadow, bobijam

Description: LBUG during stress test
Details    : Need to properly lock accesses to the flock deadlock
             detection list.
---
 lustre/ChangeLog         |  5 +++++
 lustre/ldlm/ldlm_flock.c | 37 +++++++++++++++++++++++++++++++++----
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 0c81657d03..5bce61ddf7 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -24,6 +24,11 @@ tbd Sun Microsystems, Inc.
         'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'.
         For more information, please refer to bugzilla 13904.
 
+Severity   : normal
+Bugzilla   : 14480
+Description: LBUG during stress test
+Details    : Need to properly lock accesses to the flock deadlock detection list.
+
 Severity   : minor
 Bugzilla   : 15837
 Description: oops in page fault handler
diff --git a/lustre/ldlm/ldlm_flock.c b/lustre/ldlm/ldlm_flock.c
index bd92a84d8b..0b9f09b817 100644
--- a/lustre/ldlm/ldlm_flock.c
+++ b/lustre/ldlm/ldlm_flock.c
@@ -42,6 +42,7 @@
 #define l_flock_waitq l_lru
 
 static struct list_head ldlm_flock_waitq = CFS_LIST_HEAD_INIT(ldlm_flock_waitq);
+spinlock_t ldlm_flock_waitq_lock = SPIN_LOCK_UNLOCKED;
 
 int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                             void *data, int flag);
@@ -82,6 +83,7 @@ ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, int flags)
         LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%x)",
                    mode, flags);
 
+        /* Safe to not lock here, since it should be empty anyway */
         LASSERT(list_empty(&lock->l_flock_waitq));
 
         list_del_init(&lock->l_res_link);
@@ -107,6 +109,7 @@ ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *blocking_lock)
         pid_t blocking_pid = blocking_lock->l_policy_data.l_flock.pid;
         struct ldlm_lock *lock;
 
+        spin_lock(&ldlm_flock_waitq_lock);
 restart:
         list_for_each_entry(lock, &ldlm_flock_waitq, l_flock_waitq) {
                 if ((lock->l_policy_data.l_flock.pid != blocking_pid) ||
@@ -116,11 +119,14 @@ restart:
                 blocking_pid = lock->l_policy_data.l_flock.blocking_pid;
                 blocking_export = (struct obd_export *)(long)
                         lock->l_policy_data.l_flock.blocking_export;
-                if (blocking_pid == req_pid && blocking_export == req_export)
+                if (blocking_pid == req_pid && blocking_export == req_export) {
+                        spin_unlock(&ldlm_flock_waitq_lock);
                         return 1;
+                }
 
                 goto restart;
         }
+        spin_unlock(&ldlm_flock_waitq_lock);
 
         return 0;
 }
@@ -225,7 +231,9 @@ reprocess:
                                 (long)(void *)lock->l_export;
 
                         LASSERT(list_empty(&req->l_flock_waitq));
+                        spin_lock(&ldlm_flock_waitq_lock);
                         list_add_tail(&req->l_flock_waitq, &ldlm_flock_waitq);
+                        spin_unlock(&ldlm_flock_waitq_lock);
 
                         ldlm_resource_add_lock(res, &res->lr_waiting, req);
                         *flags |= LDLM_FL_BLOCK_GRANTED;
@@ -242,7 +250,9 @@ reprocess:
 
         /* In case we had slept on this lock request take it off of the
          * deadlock detection waitq. */
+        spin_lock(&ldlm_flock_waitq_lock);
         list_del_init(&req->l_flock_waitq);
+        spin_unlock(&ldlm_flock_waitq_lock);
 
         /* Scan the locks owned by this process that overlap this request.
          * We may have to merge or split existing locks. */
@@ -341,7 +351,7 @@ reprocess:
                          * and restart processing this lock. */
                         if (!new2) {
                                 unlock_res_and_lock(req);
-                                new2 = ldlm_lock_create(ns, res->lr_name, LDLM_FLOCK, 
+                                new2 = ldlm_lock_create(ns, res->lr_name, LDLM_FLOCK,
                                                 lock->l_granted_mode, NULL, NULL, NULL,
                                                 NULL, 0);
                                 lock_res_and_lock(req);
@@ -454,7 +464,9 @@ ldlm_flock_interrupted_wait(void *data)
         lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock;
 
         /* take lock off the deadlock detection waitq. */
+        spin_lock(&ldlm_flock_waitq_lock);
         list_del_init(&lock->l_flock_waitq);
+        spin_unlock(&ldlm_flock_waitq_lock);
 
         /* client side - set flag to prevent lock from being put on lru list */
         lock->l_flags |= LDLM_FL_CBPENDING;
@@ -484,6 +496,21 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         CDEBUG(D_DLMTRACE, "flags: 0x%x data: %p getlk: %p\n",
                flags, data, getlk);
 
+        /* Import invalidation. We need to actually release the lock
+         * references being held, so that it can go away. No point in
+         * holding the lock even if the app still believes it has it,
+         * since the server already dropped it. Only for granted locks. */
+        lock_res_and_lock(lock);
+        if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) ==
+            (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) {
+                unlock_res_and_lock(lock);
+                if (lock->l_req_mode == lock->l_granted_mode &&
+                    lock->l_granted_mode != LCK_NL)
+                        ldlm_lock_decref_internal(lock, lock->l_req_mode);
+                RETURN(0);
+        }
+        unlock_res_and_lock(lock);
+
         LASSERT(flags != LDLM_FL_WAIT_NOREPROC);
 
         if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
@@ -523,7 +550,9 @@ granted:
         lock_res_and_lock(lock);
 
         /* take lock off the deadlock detection waitq. */
+        spin_lock(&ldlm_flock_waitq_lock);
         list_del_init(&lock->l_flock_waitq);
+        spin_unlock(&ldlm_flock_waitq_lock);
 
         /* ldlm_lock_enqueue() has already placed lock on the granted list. */
         list_del_init(&lock->l_res_link);
@@ -572,8 +601,8 @@ int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
         ns = lock->l_resource->lr_namespace;
 
         /* take lock off the deadlock detection waitq. */
-        lock_res_and_lock(lock);
+        spin_lock(&ldlm_flock_waitq_lock);
         list_del_init(&lock->l_flock_waitq);
-        unlock_res_and_lock(lock);
+        spin_unlock(&ldlm_flock_waitq_lock);
         RETURN(0);
 }
--
GitLab
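
Editor's note: the sketch below is a minimal userspace model of the locking
pattern this patch applies, not Lustre code. The names (waiter, flock_waitq,
flock_waitq_lock, would_deadlock) are invented stand-ins for ldlm_flock_waitq
and ldlm_flock_waitq_lock, and a pthread mutex stands in for the kernel
spinlock. The point it demonstrates: every access to the shared
deadlock-detection list -- insertion, removal, and the full blocked-on chain
walk -- happens under one lock, and every exit path from the walk drops it.

/* Minimal userspace model of the patch's locking scheme (illustrative
 * names only; compile with: cc -pthread model.c). */
#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>

struct waiter {
        pid_t pid;              /* owner of the blocked lock request */
        pid_t blocking_pid;     /* owner whose lock it is waiting for */
        struct waiter *next;
};

static struct waiter *flock_waitq;      /* shared deadlock-detection list */
static pthread_mutex_t flock_waitq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Insert under the lock, as the patch does around list_add_tail(). */
static void waitq_add(struct waiter *w)
{
        pthread_mutex_lock(&flock_waitq_lock);
        w->next = flock_waitq;
        flock_waitq = w;
        pthread_mutex_unlock(&flock_waitq_lock);
}

/* Remove under the lock, as the patch does around list_del_init(). */
static void waitq_del(struct waiter *w)
{
        struct waiter **p;

        pthread_mutex_lock(&flock_waitq_lock);
        for (p = &flock_waitq; *p != NULL; p = &(*p)->next) {
                if (*p == w) {
                        *p = w->next;
                        break;
                }
        }
        pthread_mutex_unlock(&flock_waitq_lock);
}

/* Return 1 if making req_pid wait on blocking_pid would close a cycle.
 * The whole chain walk runs under the lock, and both exit paths drop
 * it, mirroring the patched ldlm_flock_deadlock(). As in the original,
 * any cycle must involve req_pid, because every existing entry passed
 * this same check before it was added. */
static int would_deadlock(pid_t req_pid, pid_t blocking_pid)
{
        struct waiter *w;

        pthread_mutex_lock(&flock_waitq_lock);
restart:
        for (w = flock_waitq; w != NULL; w = w->next) {
                if (w->pid != blocking_pid)
                        continue;
                /* Found who blocking_pid waits on; follow the chain. */
                blocking_pid = w->blocking_pid;
                if (blocking_pid == req_pid) {
                        pthread_mutex_unlock(&flock_waitq_lock);
                        return 1;
                }
                goto restart;
        }
        pthread_mutex_unlock(&flock_waitq_lock);
        return 0;
}

int main(void)
{
        struct waiter a = { .pid = 100, .blocking_pid = 200, .next = NULL };
        struct waiter b = { .pid = 200, .blocking_pid = 300, .next = NULL };

        waitq_add(&a);
        waitq_add(&b);
        /* 300 waiting on 100 would close 100 -> 200 -> 300 -> 100. */
        printf("cycle: %d\n", would_deadlock(300, 100));        /* 1 */
        waitq_del(&b);
        printf("cycle: %d\n", would_deadlock(300, 100));        /* 0 */
        return 0;
}

The design choice is visible in the last hunk: ldlm_flock_waitq is one global
list spanning all resources, so the per-resource lock that
ldlm_flock_blocking_ast previously took (lock_res_and_lock) could not
serialize its list_del_init() against walkers in ldlm_flock_deadlock();
presumably that race is what the stress-test LBUG exposed, and the dedicated
global spinlock closes it.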