From 5223a84cb0fc68ba98b95f51791f350b3932b7f4 Mon Sep 17 00:00:00 2001
From: ericm <ericm>
Date: Mon, 26 May 2008 23:27:28 +0000
Subject: [PATCH] branch: b1_6 do not drop replay according to msg flags,
 instead we check the per-export recovery request queue for duplication of
 transno. b=15756 r=adilger r=rread

---
 lustre/ChangeLog               |  7 +++
 lustre/include/lustre_export.h |  1 +
 lustre/include/obd_support.h   |  1 +
 lustre/ldlm/ldlm_lib.c         | 82 +++++++++++++++++++++++++++++-----
 lustre/obdclass/genops.c       |  2 +
 lustre/tests/replay-single.sh  | 13 ++++++
 6 files changed, 95 insertions(+), 11 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index c83935f86c..865ce021b5 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -24,6 +24,13 @@ tbd Sun Microsystems, Inc.
 	  	'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'.
 	  For more information, please refer to bugzilla 13904.
 
+Severity   : normal
+Bugzilla   : 15756
+Frequency  : rare, when a replay gets lost on the server
+Description: server wrongly drops resent replays, causing recovery failure.
+Details    : do not drop replays based on msg flags; instead, check the
+             per-export recovery request queue for a duplicate transno.
+
 Severity   : normal
 Bugzilla   : 14835
 Frequency  : after recovery
diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h
index 7ece7a25bf..6890dbd867 100644
--- a/lustre/include/lustre_export.h
+++ b/lustre/include/lustre_export.h
@@ -88,6 +88,7 @@ struct obd_export {
         struct ldlm_export_data   exp_ldlm_data;
         struct list_head          exp_outstanding_replies;
         time_t                    exp_last_request_time;
+        struct list_head          exp_req_replay_queue;
         spinlock_t                exp_lock; /* protects flags int below */
         /* ^ protects exp_outstanding_replies too */
         __u64                     exp_connect_flags;
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 792a6d47ce..2ee8438d45 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -254,6 +254,7 @@ extern unsigned int obd_alloc_fail_rate;
 #define OBD_FAIL_TGT_DELAY_RECONNECT     0x704
 #define OBD_FAIL_TGT_DELAY_PRECREATE     0x705
 #define OBD_FAIL_TGT_TOOMANY_THREADS     0x706
+#define OBD_FAIL_TGT_REPLAY_DROP         0x707
 
 #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
 #define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index 50e769ffb4..cb5c4ef75a 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -973,6 +973,47 @@ void target_destroy_export(struct obd_export *exp)
  * Recovery functions
  */
 
+/*
+ * Add @req to its export's replay queue unless a request with the same
+ * transno is already queued there.  Returns 1 on a duplicate, else 0.
+ */
+static int target_exp_enqueue_req_replay(struct ptlrpc_request *req)
+{
+        struct obd_export     *exp = req->rq_export;
+        __u64                  transno = lustre_msg_get_transno(req->rq_reqmsg);
+        struct ptlrpc_request *queued;
+
+        LASSERT(exp);
+
+        spin_lock(&exp->exp_lock);
+        list_for_each_entry(queued, &exp->exp_req_replay_queue,
+                            rq_replay_list) {
+                if (lustre_msg_get_transno(queued->rq_reqmsg) != transno)
+                        continue;
+
+                /* a duplicate should carry both RESENT and REPLAY flags */
+                if ((lustre_msg_get_flags(req->rq_reqmsg) &
+                     (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY))
+                        CERROR("invalid flags %x of resent replay\n",
+                               lustre_msg_get_flags(req->rq_reqmsg));
+                spin_unlock(&exp->exp_lock);
+                return 1;
+        }
+
+        list_add_tail(&req->rq_replay_list, &exp->exp_req_replay_queue);
+        spin_unlock(&exp->exp_lock);
+        return 0;
+}
+
+/* Unlink @req from its export's replay queue; @req must be on that queue. */
+static void target_exp_dequeue_req_replay(struct ptlrpc_request *req)
+{
+        LASSERT(!list_empty(&req->rq_replay_list));
+        LASSERT(req->rq_export);
+        spin_lock(&req->rq_export->exp_lock);
+        list_del_init(&req->rq_replay_list);
+        spin_unlock(&req->rq_export->exp_lock);
+}
 
 static void target_release_saved_req(struct ptlrpc_request *req)
 {
@@ -1017,6 +1058,7 @@ static void abort_recovery_queue(struct obd_device *obd)
 
         list_for_each_safe(tmp, n, &obd->obd_recovery_queue) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                target_exp_dequeue_req_replay(req);
                 list_del(&req->rq_list);
                 DEBUG_REQ(D_ERROR, req, "aborted:");
                 req->rq_status = -ENOTCONN;
@@ -1066,6 +1108,7 @@ void target_cleanup_recovery(struct obd_device *obd)
 
         list_for_each_safe(tmp, n, &obd->obd_recovery_queue) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                target_exp_dequeue_req_replay(req);
                 list_del(&req->rq_list);
                 target_release_saved_req(req);
         }
@@ -1278,6 +1321,7 @@ static void process_recovery_queue(struct obd_device *obd)
                         }
                         continue;
                 }
+                target_exp_dequeue_req_replay(req);
                 list_del_init(&req->rq_list);
                 obd->obd_requests_queued_for_recovery--;
                 spin_unlock_bh(&obd->obd_processing_task_lock);
@@ -1314,6 +1358,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
         __u64 transno = lustre_msg_get_transno(req->rq_reqmsg);
         struct ptlrpc_request *saved_req;
         struct lustre_msg *reqmsg;
+        int rc = 0;
 
         /* CAVEAT EMPTOR: The incoming request message has been swabbed
          * (i.e. buflens etc are in my own byte order), but type-dependent
@@ -1351,20 +1396,12 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
                 /* Processing the queue right now, don't re-add. */
                 LASSERT(list_empty(&req->rq_list));
                 spin_unlock_bh(&obd->obd_processing_task_lock);
-                OBD_FREE(reqmsg, req->rq_reqlen);
-                OBD_FREE(saved_req, sizeof *saved_req);
-                return 1;
+                GOTO(err_free, rc = 1);
         }
 
-        /* A resent, replayed request that is still on the queue; just drop it.
-           The queued request will handle this. */
-        if ((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT|MSG_REPLAY)) ==
-            (MSG_RESENT | MSG_REPLAY)) {
-                DEBUG_REQ(D_ERROR, req, "dropping resent queued req");
+        if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_DROP))) {
                 spin_unlock_bh(&obd->obd_processing_task_lock);
-                OBD_FREE(reqmsg, req->rq_reqlen);
-                OBD_FREE(saved_req, sizeof *saved_req);
-                return 0;
+                GOTO(err_free, rc = 0);
         }
 
         memcpy(saved_req, req, sizeof *req);
@@ -1373,6 +1410,13 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
         req->rq_reqmsg = reqmsg;
         class_export_get(req->rq_export);
         CFS_INIT_LIST_HEAD(&req->rq_list);
+        CFS_INIT_LIST_HEAD(&req->rq_replay_list);
+
+        if (target_exp_enqueue_req_replay(req)) {
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                DEBUG_REQ(D_ERROR, req, "dropping resent queued req");
+                GOTO(err_exp, rc = 0);
+        }
 
         /* XXX O(n^2) */
         list_for_each(tmp, &obd->obd_recovery_queue) {
@@ -1384,6 +1428,15 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
                         inserted = 1;
                         break;
                 }
+
+                if (unlikely(lustre_msg_get_transno(reqiter->rq_reqmsg) ==
+                             transno)) {
+                        spin_unlock_bh(&obd->obd_processing_task_lock);
+                        DEBUG_REQ(D_ERROR, req, "dropping replay: transno "
+                                  "has been claimed by another client");
+                        target_exp_dequeue_req_replay(req);
+                        GOTO(err_exp, rc = 0);
+                }
         }
 
         if (!inserted) {
@@ -1409,6 +1462,13 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
 
         process_recovery_queue(obd);
         return 0;
+
+err_exp:
+        class_export_put(req->rq_export);
+err_free:
+        OBD_FREE(reqmsg, req->rq_reqlen);
+        OBD_FREE(saved_req, sizeof(*saved_req));
+        return rc;
 }
 
 struct obd_device * target_req2obd(struct ptlrpc_request *req)
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c
index 63d52c26b5..f6fbf45f7c 100644
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -640,6 +640,7 @@ void class_export_destroy(struct obd_export *exp)
                 ptlrpc_put_connection_superhack(exp->exp_connection);
 
         LASSERT(list_empty(&exp->exp_outstanding_replies));
+        LASSERT(list_empty(&exp->exp_req_replay_queue));
         obd_destroy_export(exp);
 
         OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
@@ -664,6 +665,7 @@ struct obd_export *class_new_export(struct obd_device *obd,
         atomic_set(&export->exp_rpc_count, 0);
         export->exp_obd = obd;
         CFS_INIT_LIST_HEAD(&export->exp_outstanding_replies);
+        CFS_INIT_LIST_HEAD(&export->exp_req_replay_queue);
         /* XXX this should be in LDLM init */
         CFS_INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks);
         spin_lock_init(&export->exp_ldlm_data.led_lock);
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 5142e45f96..b60f032d45 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -1367,6 +1367,19 @@ test_61c() {
 }
 run_test 61c "test race mds llog sync vs llog cleanup"
 
+test_62() { # Bug 15756 - don't mis-drop resent replay
+    mkdir -p $DIR/$tdir # don't rely on an earlier test having created it
+    replay_barrier mds
+    createmany -o $DIR/$tdir/$tfile- 25
+#define OBD_FAIL_TGT_REPLAY_DROP         0x707
+    do_facet mds "sysctl -w lustre.fail_loc=0x80000707"
+    facet_failover mds
+    df $MOUNT || return 1
+    do_facet mds "sysctl -w lustre.fail_loc=0"
+    unlinkmany $DIR/$tdir/$tfile- 25 || return 2 # status is 0 on success
+}
+run_test 62 "don't mis-drop resent replay"
+
 #Adaptive Timeouts (bug 3055)
 AT_MAX_SET=0
 
-- 
GitLab