From 2cc418892732f5750584a908d91795b8d136a7f5 Mon Sep 17 00:00:00 2001
From: shadow <shadow>
Date: Thu, 25 Oct 2007 16:52:02 +0000
Subject: [PATCH] interrupting oig_wait produces a panic on resend.

b=13888
i=nikita
i=alex
---
 lustre/ChangeLog            | 10 ++++++++++
 lustre/include/lustre_net.h |  1 -
 lustre/osc/osc_request.c    | 10 +++++++---
 lustre/ptlrpc/ptlrpcd.c     | 28 ++++++++++------------------
 lustre/tests/sanity.sh      | 18 ++++++++++++++++++
 5 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 46bc23b301..0e9e25dac5 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -26,6 +26,16 @@ Bugzilla   : 13556
 Description: conf-sanity.sh test_33 failed with 1
 Details    : change mgsnode 
 
+Severity   : normal
+Bugzilla   : 13888
+Description: interrupting oig_wait produces a panic on resend.
+Details    : osc_brw_redo_request can be used to resend requests from both
+	     ptlrpcd and a private set. In that case rq_ptlrpcd_data was not
+	     copied to the newly allocated request, which triggered an LBUG on
+	     the assertion req->rq_ptlrpcd_data != NULL. This member was only
+	     used to wake up the ptlrpcd set when a request changed, so it can
+	     safely be replaced by using rq_set directly.
+
 --------------------------------------------------------------------------------
 
 2007-10-26         Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index e2ec068a21..d59777c0af 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -355,7 +355,6 @@ struct ptlrpc_request {
         struct ptlrpc_request_set *rq_set;
         void *rq_interpret_reply;               /* Async completion handler */
         union ptlrpc_async_args rq_async_args;  /* Async completion context */
-        void *rq_ptlrpcd_data;
         struct ptlrpc_request_pool *rq_pool;    /* Pool if request from
                                                    preallocated list */
 };
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 04df048063..25e265651c 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -1341,12 +1341,16 @@ int osc_brw_redo_request(struct ptlrpc_request *request,
                         oap->oap_request = ptlrpc_request_addref(new_req);
                 }
         }
-        client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock);
-
-        DEBUG_REQ(D_INFO, new_req, "new request");
 
+        /* Using ptlrpc_set_add_req is safe here because interpret functions
+         * run in check_set context. The only path on which another thread
+         * can access the request is when it gets -EINTR, and that path is
+         * protected by cl_loi_list_lock. */
         ptlrpc_set_add_req(set, new_req);
 
+        client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock);
+
+        DEBUG_REQ(D_INFO, new_req, "new request");
         RETURN(0);
 }
 
diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c
index 16c448c218..d0abb2eb85 100644
--- a/lustre/ptlrpc/ptlrpcd.c
+++ b/lustre/ptlrpc/ptlrpcd.c
@@ -48,8 +48,6 @@ struct ptlrpcd_ctl {
         spinlock_t                pc_lock;
         struct completion         pc_starting;
         struct completion         pc_finishing;
-        struct list_head          pc_req_list;
-        cfs_waitq_t               pc_waitq;
         struct ptlrpc_request_set *pc_set;
         char                      pc_name[16];
 #ifndef __KERNEL__
@@ -67,11 +65,11 @@ static int ptlrpcd_users = 0;
 
 void ptlrpcd_wake(struct ptlrpc_request *req)
 {
-        struct ptlrpcd_ctl *pc = req->rq_ptlrpcd_data;
+        struct ptlrpc_request_set *rq_set = req->rq_set;
 
-        LASSERT(pc != NULL);
+        LASSERT(rq_set != NULL);
 
-        cfs_waitq_signal(&pc->pc_waitq);
+        cfs_waitq_signal(&rq_set->set_waitq);
 }
 
 /* requests that are added to the ptlrpcd queue are sent via
@@ -85,9 +83,8 @@ void ptlrpcd_add_req(struct ptlrpc_request *req)
         else
                 pc = &ptlrpcd_recovery_pc;
 
-        req->rq_ptlrpcd_data = pc;
         ptlrpc_set_add_new_req(pc->pc_set, req);
-        wake_up(&pc->pc_waitq);
+        cfs_waitq_signal(&pc->pc_set->set_waitq);
 }
 
 static int ptlrpcd_check(struct ptlrpcd_ctl *pc)
@@ -161,19 +158,13 @@ static int ptlrpcd(void *arg)
          * on the set's new_req_list and ptlrpcd_check moves them into
          * the set. */
         while (1) {
-                cfs_waitlink_t set_wait;
                 struct l_wait_info lwi;
                 cfs_duration_t timeout;
 
                 timeout = cfs_time_seconds(ptlrpc_set_next_timeout(pc->pc_set));
                 lwi = LWI_TIMEOUT(timeout, ptlrpc_expired_set, pc->pc_set);
 
-                /* ala the pinger, wait on pc's waitqueue and the set's */
-                cfs_waitlink_init(&set_wait);
-                cfs_waitq_add(&pc->pc_set->set_waitq, &set_wait);
-                cfs_waitq_forward(&set_wait, &pc->pc_waitq);
-                l_wait_event(pc->pc_waitq, ptlrpcd_check(pc), &lwi);
-                cfs_waitq_del(&pc->pc_set->set_waitq, &set_wait);
+                l_wait_event(pc->pc_set->set_waitq, ptlrpcd_check(pc), &lwi);
 
                 if (test_bit(LIOD_STOP, &pc->pc_flags))
                         break;
@@ -187,8 +178,11 @@ static int ptlrpcd(void *arg)
 
 static void ptlrpcd_zombie_impexp_notify(void)
 {
-        cfs_waitq_signal(&ptlrpcd_pc.pc_waitq);
+        LASSERT(ptlrpcd_pc.pc_set != NULL); // call before ptlrpcd inited ?
+
+        cfs_waitq_signal(&ptlrpcd_pc.pc_set->set_waitq);
 }
+
 #else
 
 int ptlrpcd_check_async_rpcs(void *arg)
@@ -230,10 +224,8 @@ static int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
         memset(pc, 0, sizeof(*pc));
         init_completion(&pc->pc_starting);
         init_completion(&pc->pc_finishing);
-        cfs_waitq_init(&pc->pc_waitq);
         pc->pc_flags = 0;
         spin_lock_init(&pc->pc_lock);
-        CFS_INIT_LIST_HEAD(&pc->pc_req_list);
         snprintf (pc->pc_name, sizeof (pc->pc_name), name);
 
         pc->pc_set = ptlrpc_prep_set();
@@ -266,7 +258,7 @@ static int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
 static void ptlrpcd_stop(struct ptlrpcd_ctl *pc)
 {
         set_bit(LIOD_STOP, &pc->pc_flags);
-        cfs_waitq_signal(&pc->pc_waitq);
+        cfs_waitq_signal(&pc->pc_set->set_waitq);
 #ifdef __KERNEL__
         obd_zombie_impexp_notify = NULL;
         wait_for_completion(&pc->pc_finishing);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 9525d16994..8932d821a4 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -4282,6 +4282,24 @@ test_118j() {
 }
 run_test 118j "Simulate unrecoverable OST side error =========="
 
+test_118k()
+{
+	#define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
+	do_facet ost sysctl -w lustre.fail_loc=0x20e
+	mkdir -p $DIR/$tdir
+
+        for ((i=0;i<10;i++)); do
+                dd if=/dev/zero of=$DIR/$tdir/$tdir-$i bs=1M count=10 &
+	        SLEEPPID=$!
+                sleep 0.500s
+	        kill $SLEEPPID
+	        wait $SLEEPPID
+        done
+
+        do_facet ost sysctl -w lustre.fail_loc=0 # clear it on the OST, where it was set
+}
+run_test 118k "bio alloc -ENOMEM and IO TERM handling ========="
+
 test_119a() # bug 11737
 {
         BSIZE=$((512 * 1024))
-- 
GitLab