From f43b9324340b6930d9ace6224cfdae4c2bcec84e Mon Sep 17 00:00:00 2001
From: alex <alex>
Date: Sat, 7 May 2005 19:01:31 +0000
Subject: [PATCH] b=6063

 - to avoid possible lock collision during replay, we should replay all
   requests before any locks
---
 lustre/include/linux/lustre_export.h |   3 +-
 lustre/include/linux/lustre_idl.h    |   2 +
 lustre/include/linux/obd.h           |  11 +-
 lustre/include/linux/obd_class.h     |   6 +-
 lustre/ldlm/ldlm_lib.c               | 360 ++++++++++++++++++++-------
 lustre/ldlm/ldlm_lockd.c             |   4 +-
 lustre/ldlm/ldlm_request.c           |   5 +
 lustre/mds/handler.c                 |   8 -
 lustre/mds/mds_fs.c                  |  10 +-
 lustre/obdclass/genops.c             |  10 +-
 lustre/obdclass/obd_config.c         |   5 +-
 lustre/obdfilter/filter.c            |   5 +-
 lustre/ost/ost_handler.c             |   9 -
 lustre/ptlrpc/import.c               |   2 +-
 14 files changed, 316 insertions(+), 124 deletions(-)

diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h
index e4a0bdbc68..ec36e7215c 100644
--- a/lustre/include/linux/lustre_export.h
+++ b/lustre/include/linux/lustre_export.h
@@ -84,7 +84,8 @@ struct obd_export {
         /* ^ protects exp_outstanding_replies too */
         unsigned long             exp_flags;
         int                       exp_failed:1,
-                                  exp_replay_needed:1,
+                                  exp_req_replay_needed:1,
+                                  exp_lock_replay_needed:1,
                                   exp_libclient:1, /* liblustre client? */
                                   exp_sync:1;
         union {
diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h
index 8a0926c75c..1125ccfa94 100644
--- a/lustre/include/linux/lustre_idl.h
+++ b/lustre/include/linux/lustre_idl.h
@@ -174,6 +174,8 @@ struct lustre_msg {
 #define MSG_LAST_REPLAY        1
 #define MSG_RESENT             2
 #define MSG_REPLAY             4
+#define MSG_REQ_REPLAY_DONE    8
+#define MSG_LOCK_REPLAY_DONE  16
 
 static inline int lustre_msg_get_flags(struct lustre_msg *msg)
 {
diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h
index ac5d138fea..0b581eb2c6 100644
--- a/lustre/include/linux/obd.h
+++ b/lustre/include/linux/obd.h
@@ -648,16 +648,23 @@ struct obd_device {
         spinlock_t                       obd_processing_task_lock;
         __u64                            obd_next_recovery_transno;
         int                              obd_replayed_requests;
+        int                              obd_replayed_locks;
         int                              obd_requests_queued_for_recovery;
         wait_queue_head_t                obd_next_transno_waitq;
         struct list_head                 obd_uncommitted_replies;
         spinlock_t                       obd_uncommitted_replies_lock;
         struct timer_list                obd_recovery_timer;
-        struct list_head                 obd_recovery_queue;
-        struct list_head                 obd_delayed_reply_queue;
         time_t                           obd_recovery_start;
         time_t                           obd_recovery_end;
 
+        atomic_t                         obd_req_replay_clients;
+        atomic_t                         obd_lock_replay_clients;
+
+        struct list_head                 obd_req_replay_queue;
+        struct list_head                 obd_lock_replay_queue;
+        struct list_head                 obd_final_req_queue;
+        int                              obd_recovery_stage;
+
         union {
                 struct filter_obd        filter;
                 struct mds_obd           mds;
diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h
index 9f41ed278d..2da67304ac 100644
--- a/lustre/include/linux/obd_class.h
+++ b/lustre/include/linux/obd_class.h
@@ -154,8 +154,10 @@ void class_put_type(struct obd_type *type);
 int class_connect(struct lustre_handle *conn, struct obd_device *obd,
                   struct obd_uuid *cluuid);
 int class_disconnect(struct obd_export *exp, unsigned long flags);
-void class_disconnect_exports(struct obd_device *obddev, unsigned long flags);
-void class_disconnect_stale_exports(struct obd_device *obddev, unsigned long flags);
+void class_disconnect_exports(struct obd_device *, unsigned long);
+int class_disconnect_stale_exports(struct obd_device *,
+                                   int (*test_export)(struct obd_export *), 
+                                   unsigned long);
 
 /* generic operations shared by various OBD types */
 int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data);
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index a4ea476301..9244c92033 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -898,12 +898,8 @@ static void target_release_saved_req(struct ptlrpc_request *req)
 
 static void target_finish_recovery(struct obd_device *obd)
 {
-        struct list_head *tmp, *n;
         int rc;
 
-        CWARN("%s: sending delayed replies to recovered clients\n",
-              obd->obd_name);
-
         ldlm_reprocess_all_ns(obd->obd_namespace);
 
         /* when recovery finished, cleanup orphans on mds and ost */
@@ -916,26 +912,40 @@ static void target_finish_recovery(struct obd_device *obd)
                         CERROR("postrecov failed %d\n", rc);
         }
 
+        obd->obd_recovery_end = LTIME_S(CURRENT_TIME);
+        return;
+}
 
-        list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
-                struct ptlrpc_request *req;
+static void abort_req_replay_queue(struct obd_device *obd)
+{
+        struct ptlrpc_request *req;
+        struct list_head *tmp, *n;
+        int rc;
+
+        list_for_each_safe(tmp, n, &obd->obd_req_replay_queue) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 list_del(&req->rq_list);
-                DEBUG_REQ(D_ERROR, req, "delayed:");
-                ptlrpc_reply(req);
+                DEBUG_REQ(D_ERROR, req, "aborted:");
+                req->rq_status = -ENOTCONN;
+                req->rq_type = PTL_RPC_MSG_ERR;
+                rc = lustre_pack_reply(req, 0, NULL, NULL);
+                if (rc == 0) {
+                        ptlrpc_reply(req);
+                } else {
+                        DEBUG_REQ(D_ERROR, req,
+                                  "packing failed for abort-reply; skipping");
+                }
                 target_release_saved_req(req);
         }
-        obd->obd_recovery_end = LTIME_S(CURRENT_TIME);
-        return;
 }
 
-static void abort_recovery_queue(struct obd_device *obd)
+static void abort_lock_replay_queue(struct obd_device *obd)
 {
         struct ptlrpc_request *req;
         struct list_head *tmp, *n;
         int rc;
 
-        list_for_each_safe(tmp, n, &obd->obd_recovery_queue) {
+        list_for_each_safe(tmp, n, &obd->obd_lock_replay_queue) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 list_del(&req->rq_list);
                 DEBUG_REQ(D_ERROR, req, "aborted:");
@@ -976,14 +986,19 @@ void target_cleanup_recovery(struct obd_device *obd)
         target_cancel_recovery_timer(obd);
         spin_unlock_bh(&obd->obd_processing_task_lock);
 
-        list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
+        list_for_each_safe(tmp, n, &obd->obd_req_replay_queue) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 list_del(&req->rq_list);
-                LASSERT (req->rq_reply_state);
-                lustre_free_reply_state(req->rq_reply_state);
+                LASSERT (req->rq_reply_state == 0);
                 target_release_saved_req(req);
         }
-        list_for_each_safe(tmp, n, &obd->obd_recovery_queue) {
+        list_for_each_safe(tmp, n, &obd->obd_lock_replay_queue) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                list_del(&req->rq_list);
+                LASSERT (req->rq_reply_state == 0);
+                target_release_saved_req(req);
+        }
+        list_for_each_safe(tmp, n, &obd->obd_final_req_queue) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 list_del(&req->rq_list);
                 LASSERT (req->rq_reply_state == 0);
@@ -991,6 +1006,7 @@ void target_cleanup_recovery(struct obd_device *obd)
         }
 }
 
+#if 0
 static void target_abort_recovery(void *data)
 {
         struct obd_device *obd = data;
@@ -1006,11 +1022,11 @@ static void target_abort_recovery(void *data)
         target_finish_recovery(obd);
         ptlrpc_run_recovery_over_upcall(obd);
 }
+#endif
 
 static void target_recovery_expired(unsigned long castmeharder)
 {
         struct obd_device *obd = (struct obd_device *)castmeharder;
-        CERROR("recovery timed out, aborting\n");
         spin_lock_bh(&obd->obd_processing_task_lock);
         if (obd->obd_recovering)
                 obd->obd_abort_recovery = 1;
@@ -1066,8 +1082,8 @@ static int check_for_next_transno(struct obd_device *obd)
         __u64 next_transno, req_transno;
 
         spin_lock_bh(&obd->obd_processing_task_lock);
-        if (!list_empty(&obd->obd_recovery_queue)) {
-                req = list_entry(obd->obd_recovery_queue.next,
+        if (!list_empty(&obd->obd_req_replay_queue)) {
+                req = list_entry(obd->obd_req_replay_queue.next,
                                  struct ptlrpc_request, rq_list);
                 req_transno = req->rq_reqmsg->transno;
         } else {
@@ -1076,7 +1092,7 @@ static int check_for_next_transno(struct obd_device *obd)
 
         max = obd->obd_max_recoverable_clients;
         connected = obd->obd_connected_clients;
-        completed = max - obd->obd_recoverable_clients;
+        completed = max - atomic_read(&obd->obd_req_replay_clients);
         queue_len = obd->obd_requests_queued_for_recovery;
         next_transno = obd->obd_next_recovery_transno;
 
@@ -1086,7 +1102,7 @@ static int check_for_next_transno(struct obd_device *obd)
         if (obd->obd_abort_recovery) {
                 CDEBUG(D_HA, "waking for aborted recovery\n");
                 wake_up = 1;
-        } else if (max == completed) {
+        } else if (atomic_read(&obd->obd_req_replay_clients) == 0) {
                 CDEBUG(D_HA, "waking for completed recovery\n");
                 wake_up = 1;
         } else if (req_transno == next_transno) {
@@ -1120,8 +1136,8 @@ target_next_replay_req(struct obd_device *obd)
         spin_lock_bh(&obd->obd_processing_task_lock);
         if (obd->obd_abort_recovery) {
                 req = NULL;
-        } else if (!list_empty(&obd->obd_recovery_queue)) {
-                req = list_entry(obd->obd_recovery_queue.next,
+        } else if (!list_empty(&obd->obd_req_replay_queue)) {
+                req = list_entry(obd->obd_req_replay_queue.next,
                                  struct ptlrpc_request, rq_list);
                 list_del_init(&req->rq_list);
                 obd->obd_requests_queued_for_recovery--;
@@ -1132,11 +1148,90 @@ target_next_replay_req(struct obd_device *obd)
         return req;
 }
 
+static int check_for_next_lock(struct obd_device *obd)
+{
+        struct ptlrpc_request *req = NULL;
+        int wake_up = 0;
+
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        if (!list_empty(&obd->obd_lock_replay_queue)) {
+                req = list_entry(obd->obd_lock_replay_queue.next,
+                                 struct ptlrpc_request, rq_list);
+                CDEBUG(D_HA, "waking for next lock\n");
+                wake_up = 1;
+        } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
+                CDEBUG(D_HA, "waking for completed lock replay\n");
+                wake_up = 1;
+        } else if (obd->obd_abort_recovery) {
+                CDEBUG(D_HA, "waking for aborted recovery\n");
+                wake_up = 1;
+        }
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+        
+        return wake_up;
+}
+
+static struct ptlrpc_request *
+target_next_replay_lock(struct obd_device *obd)
+{
+        struct l_wait_info lwi = { 0 };
+        struct ptlrpc_request *req;
+
+        CDEBUG(D_HA, "Waiting for lock\n");
+        l_wait_event(obd->obd_next_transno_waitq,
+                     check_for_next_lock(obd), &lwi);
+        
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        if (obd->obd_abort_recovery) {
+                req = NULL;
+        } else if (!list_empty(&obd->obd_lock_replay_queue)) {
+                req = list_entry(obd->obd_lock_replay_queue.next,
+                                 struct ptlrpc_request, rq_list);
+                list_del_init(&req->rq_list);
+        } else {
+                req = NULL;
+        }
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+        return req;
+}
+
+static struct ptlrpc_request *
+target_next_final_ping(struct obd_device *obd)
+{
+        struct ptlrpc_request *req;
+
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        if (!list_empty(&obd->obd_final_req_queue)) {
+                req = list_entry(obd->obd_final_req_queue.next,
+                                 struct ptlrpc_request, rq_list);
+                list_del_init(&req->rq_list);
+        } else {
+                req = NULL;
+        }
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+        return req;
+}
+
+static int req_replay_done(struct obd_export *exp)
+{
+        if (exp->exp_req_replay_needed)
+                return 0;
+        return 1;
+}
+
+static int lock_replay_done(struct obd_export *exp)
+{
+        if (exp->exp_lock_replay_needed)
+                return 0;
+        return 1;
+}
+
 static int target_recovery_thread(void *arg)
 {
         struct obd_device *obd = arg;
         struct ptlrpc_request *req;
         struct target_recovery_data *trd = &obd->obd_recovery_data;
+        char peer_str[PTL_NALFMT_SIZE];
         unsigned long flags;
         ENTRY;
 
@@ -1154,40 +1249,85 @@ static int target_recovery_thread(void *arg)
         obd->obd_recovering = 1;
         complete(&trd->trd_starting);
 
-        while (obd->obd_recovering) {
+        /* The first stage: replay requests */
+        CWARN("1: request replay stage - %d clients\n",
+              atomic_read(&obd->obd_req_replay_clients));
+        while ((req = target_next_replay_req(obd))) {
                 LASSERT(trd->trd_processing_task == current->pid);
-                req = target_next_replay_req(obd);
-                if (req != NULL) {
-                        char peer_str[PTL_NALFMT_SIZE];
-                        DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s: ", 
-                                  req->rq_reqmsg->transno, 
-                                  ptlrpc_peernid2str(&req->rq_peer, peer_str));
-                        (void)trd->trd_recovery_handler(req);
-                        obd->obd_replayed_requests++;
-                        reset_recovery_timer(obd);
-                        /* bug 1580: decide how to properly sync() in recovery*/
-                        //mds_fsync_super(mds->mds_sb);
-                        ptlrpc_free_clone(req);
-                        spin_lock_bh(&obd->obd_processing_task_lock);
-                        obd->obd_next_recovery_transno++;
-                        spin_unlock_bh(&obd->obd_processing_task_lock);
-                } else {
-                        /* recovery is over */
-                        spin_lock_bh(&obd->obd_processing_task_lock);
-                        obd->obd_recovering = 0;
-                        target_cancel_recovery_timer(obd);
-                        if (obd->obd_abort_recovery) {
-                                obd->obd_abort_recovery = 0;
-                                spin_unlock_bh(&obd->obd_processing_task_lock);
-                                target_abort_recovery(obd); 
-                        } else {
-                                LASSERT(obd->obd_recoverable_clients == 0);
-                                spin_unlock_bh(&obd->obd_processing_task_lock);
-                                target_finish_recovery(obd);
-                        }
-                }
+                DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s: ", 
+                          req->rq_reqmsg->transno, 
+                          ptlrpc_peernid2str(&req->rq_peer, peer_str));
+                (void)trd->trd_recovery_handler(req);
+                obd->obd_replayed_requests++;
+                reset_recovery_timer(obd);
+                /* bug 1580: decide how to properly sync() in recovery*/
+                //mds_fsync_super(mds->mds_sb);
+                ptlrpc_free_clone(req);
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                obd->obd_next_recovery_transno++;
+                spin_unlock_bh(&obd->obd_processing_task_lock);
         }
 
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        target_cancel_recovery_timer(obd);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        /* If some clients haven't replayed requests in time, evict them */
+        if (obd->obd_abort_recovery) {
+                int stale;
+                CERROR("req replay timed out, aborting ...\n");
+                obd->obd_abort_recovery = 0;
+                stale = class_disconnect_stale_exports(obd, req_replay_done, 0);
+                atomic_sub(stale, &obd->obd_lock_replay_clients);
+                abort_req_replay_queue(obd);
+        }
+
+        /* The second stage: replay locks */
+        CWARN("2: lock replay stage - %d clients\n",
+              atomic_read(&obd->obd_lock_replay_clients));
+        while ((req = target_next_replay_lock(obd))) {
+                LASSERT(trd->trd_processing_task == current->pid);
+                DEBUG_REQ(D_HA, req, "processing lock from %s: ", 
+                          ptlrpc_peernid2str(&req->rq_peer, peer_str));
+                (void)trd->trd_recovery_handler(req);
+                reset_recovery_timer(obd);
+                ptlrpc_free_clone(req);
+                obd->obd_replayed_locks++;
+        }
+        
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        target_cancel_recovery_timer(obd);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        /* If some clients haven't replayed locks in time, evict them */
+        if (obd->obd_abort_recovery) {
+                int stale;
+                CERROR("lock replay timed out, aborting ...\n");
+                obd->obd_abort_recovery = 0;
+                stale = class_disconnect_stale_exports(obd, lock_replay_done, 0);
+                abort_lock_replay_queue(obd);
+        }
+
+        /* Drop the recovering flag so that, from now on, all new requests
+         * are forwarded to the regular mds_handle() */
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        obd->obd_recovering = 0;
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        /* The third stage: reply to final pings */
+        CWARN("3: final stage - process recovery completion pings\n");
+        while ((req = target_next_final_ping(obd))) {
+                LASSERT(trd->trd_processing_task == current->pid);
+                DEBUG_REQ(D_HA, req, "processing final ping from %s: ", 
+                          ptlrpc_peernid2str(&req->rq_peer, peer_str));
+                (void)trd->trd_recovery_handler(req);
+                ptlrpc_free_clone(req);
+        }
+        
+        CWARN("4: recovery completed - %d/%d reqs/locks replayed\n",
+              obd->obd_replayed_requests, obd->obd_replayed_locks);
+        target_finish_recovery(obd);
+
         trd->trd_processing_task = 0;
         complete(&trd->trd_finishing);
         return 0;
@@ -1227,6 +1367,43 @@ void target_stop_recovery_thread(struct obd_device *obd)
 }
 #endif
 
+int target_process_req_flags(struct obd_device *obd, struct ptlrpc_request *req)
+{
+        struct obd_export *exp = req->rq_export;
+        LASSERT(exp != NULL);
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) {
+                /* client declares he's ready to replay locks */
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                if (exp->exp_req_replay_needed) {
+                        LASSERT(atomic_read(&obd->obd_req_replay_clients) > 0);
+                        exp->exp_req_replay_needed = 0;
+                        atomic_dec(&obd->obd_req_replay_clients);
+                        if (atomic_read(&obd->obd_req_replay_clients) == 0) {
+                                CDEBUG(D_HA, "all clients have replayed reqs\n");
+                                wake_up(&obd->obd_next_transno_waitq);
+                        }
+                }
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+        }
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) {
+                /* client declares he's ready to complete recovery,
+                 * so we stop waiting for lock replay from his export */
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                if (exp->exp_lock_replay_needed) {
+                        LASSERT(atomic_read(&obd->obd_lock_replay_clients) > 0);
+                        exp->exp_lock_replay_needed = 0;
+                        atomic_dec(&obd->obd_lock_replay_clients);
+                        if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
+                                CDEBUG(D_HA, "all clients have replayed locks\n");
+                                wake_up(&obd->obd_next_transno_waitq);
+                        }
+                }
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+        }
+
+        return 0;
+}
+
 int target_queue_recovery_request(struct ptlrpc_request *req,
                                   struct obd_device *obd)
 {
@@ -1234,6 +1411,39 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
         int inserted = 0;
         __u64 transno = req->rq_reqmsg->transno;
 
+        if (obd->obd_recovery_data.trd_processing_task == current->pid) {
+                /* Processing the queue right now, don't re-add. */
+                return 1;
+        }
+
+        target_process_req_flags(obd, req);
+
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) {
+                /* client declares he's ready to complete recovery,
+                 * so we put the request on the final queue */
+                req = ptlrpc_clone_req(req);
+                if (req == NULL)
+                        return -ENOMEM;
+                DEBUG_REQ(D_HA, req, "queue final req");
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                list_add_tail(&req->rq_list, &obd->obd_final_req_queue);
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                return 0;
+        }
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) {
+                /* client declares he's ready to replay locks */
+                req = ptlrpc_clone_req(req);
+                if (req == NULL)
+                        return -ENOMEM;
+                DEBUG_REQ(D_HA, req, "queue lock replay req");
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                list_add_tail(&req->rq_list, &obd->obd_lock_replay_queue);
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                wake_up(&obd->obd_next_transno_waitq);
+                return 0;
+        }
+
+
         /* CAVEAT EMPTOR: The incoming request message has been swabbed
          * (i.e. buflens etc are in my own byte order), but type-dependent
          * buffers (eg mds_body, ost_body etc) have NOT been swabbed. */
@@ -1256,8 +1466,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
          * handled will pass through here and be processed immediately.
          */
         spin_lock_bh(&obd->obd_processing_task_lock);
-        if (obd->obd_recovery_data.trd_processing_task == current->pid ||
-            transno < obd->obd_next_recovery_transno) {
+        if (transno < obd->obd_next_recovery_transno) {
                 /* Processing the queue right now, don't re-add. */
                 LASSERT(list_empty(&req->rq_list));
                 spin_unlock_bh(&obd->obd_processing_task_lock);
@@ -1280,7 +1489,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
         spin_lock_bh(&obd->obd_processing_task_lock);
 
         /* XXX O(n^2) */
-        list_for_each(tmp, &obd->obd_recovery_queue) {
+        list_for_each(tmp, &obd->obd_req_replay_queue) {
                 struct ptlrpc_request *reqiter =
                         list_entry(tmp, struct ptlrpc_request, rq_list);
 
@@ -1292,7 +1501,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
         }
 
         if (!inserted)
-                list_add_tail(&req->rq_list, &obd->obd_recovery_queue);
+                list_add_tail(&req->rq_list, &obd->obd_req_replay_queue);
 
         obd->obd_requests_queued_for_recovery++;
         wake_up(&obd->obd_next_transno_waitq);
@@ -1305,41 +1514,6 @@ struct obd_device * target_req2obd(struct ptlrpc_request *req)
         return req->rq_export->exp_obd;
 }
 
-int target_queue_final_reply(struct ptlrpc_request *req, int rc)
-{
-        struct obd_device *obd = target_req2obd(req);
-
-        LASSERT ((rc == 0) == (req->rq_reply_state != NULL));
-
-        if (rc) {
-                /* Just like ptlrpc_error, but without the sending. */
-                rc = lustre_pack_reply(req, 0, NULL, NULL);
-                LASSERT(rc == 0); /* XXX handle this */
-                req->rq_type = PTL_RPC_MSG_ERR;
-        }
-
-        LASSERT (!req->rq_reply_state->rs_difficult);
-        LASSERT(list_empty(&req->rq_list));
-        
-        req = ptlrpc_clone_req(req);
-
-        spin_lock_bh(&obd->obd_processing_task_lock);
-
-        list_add(&req->rq_list, &obd->obd_delayed_reply_queue);
-
-        /* only count the first "replay over" request from each
-           export */
-        if (req->rq_export->exp_replay_needed) {
-                --obd->obd_recoverable_clients;
-                req->rq_export->exp_replay_needed = 0;
-                CWARN("%s: %d recoverable clients remain\n",
-                      obd->obd_name, obd->obd_recoverable_clients);
-        }
-        wake_up(&obd->obd_next_transno_waitq);
-        spin_unlock_bh(&obd->obd_processing_task_lock);
-        return 1;
-}
-
 int
 target_send_reply_msg (struct ptlrpc_request *req, int rc, int fail_id)
 {
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c
index 0634cb7b04..952112beb6 100644
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -399,6 +399,9 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
                 RETURN(0);
         }
 
+        LASSERTF(lock->l_export->exp_obd->obd_recovering == 0,
+                 "BUG 6063: lock collide during recovery");
+
         LASSERT(lock);
 
         l_lock(&lock->l_resource->lr_namespace->ns_lock);
@@ -1676,4 +1679,3 @@ EXPORT_SYMBOL(target_send_reply);
 EXPORT_SYMBOL(target_queue_recovery_request);
 EXPORT_SYMBOL(target_handle_ping);
 EXPORT_SYMBOL(target_handle_disconnect);
-EXPORT_SYMBOL(target_queue_final_reply);
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c
index 90c988a73f..012b31eb7c 100644
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -1036,6 +1036,11 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
                 size[1] = lock->l_lvb_len;
         }
         req->rq_replen = lustre_msg_size(buffers, size);
+        /* notify the server we've replayed all requests.
+         * also, we mark the request to be put on a dedicated
+         * queue to be processed after all request replays.
+         * bug 6063 */
+        lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
 
         LDLM_DEBUG(lock, "replaying lock:");
 
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
index c6e48cf2fb..ecfc6a208e 100644
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -2823,14 +2823,6 @@ int mds_handle(struct ptlrpc_request *req)
         }
  out:
 
-        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
-                if (obd && obd->obd_recovering) {
-                        DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
-                        return target_queue_final_reply(req, rc);
-                }
-                /* Lost a race with recovery; let the error path DTRT. */
-                rc = req->rq_status = -ENOTCONN;
-        }
 
         target_send_reply(req, rc, fail);
         return 0;
diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c
index 391654a4a4..b8bb10007c 100644
--- a/lustre/mds/mds_fs.c
+++ b/lustre/mds/mds_fs.c
@@ -380,9 +380,17 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file)
                 spin_lock_init(&med->med_open_lock);
 
                 mcd = NULL;
-                exp->exp_replay_needed = 1;
+                exp->exp_req_replay_needed = 1;
                 obd->obd_recoverable_clients++;
                 obd->obd_max_recoverable_clients++;
+
+                /* track clients to separate req replay
+                 * from lock replay. bug 6063 */
+                atomic_inc(&obd->obd_req_replay_clients);
+                exp->exp_req_replay_needed = 1;
+                atomic_inc(&obd->obd_lock_replay_clients);
+                exp->exp_lock_replay_needed = 1;
+                
                 class_export_put(exp);
 
                 CDEBUG(D_OTHER, "client at idx %d has last_transno = "LPU64"\n",
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c
index 36ae1e8d3b..596bc0b6d4 100644
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -738,7 +738,9 @@ void class_disconnect_exports(struct obd_device *obd, unsigned long flags)
 
 /* Remove exports that have not completed recovery.
  */
-void class_disconnect_stale_exports(struct obd_device *obd, unsigned long flags)
+int class_disconnect_stale_exports(struct obd_device *obd,
+                                   int (*test_export)(struct obd_export *),
+                                   unsigned long flags)
 {
         struct list_head work_list;
         struct list_head *pos, *n;
@@ -750,10 +752,12 @@ void class_disconnect_stale_exports(struct obd_device *obd, unsigned long flags)
         spin_lock(&obd->obd_dev_lock);
         list_for_each_safe(pos, n, &obd->obd_exports) {
                 exp = list_entry(pos, struct obd_export, exp_obd_chain);
-                if (exp->exp_replay_needed) {
+                if (!test_export(exp)) {
                         list_del(&exp->exp_obd_chain);
                         list_add(&exp->exp_obd_chain, &work_list);
                         cnt++;
+                        CDEBUG(D_ERROR, "%s: disconnect stale client %s\n",
+                               obd->obd_name, exp->exp_client_uuid.uuid);
                 }
         }
         spin_unlock(&obd->obd_dev_lock);
@@ -761,7 +765,7 @@ void class_disconnect_stale_exports(struct obd_device *obd, unsigned long flags)
         CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
                obd->obd_name, cnt);
         class_disconnect_export_list(&work_list, flags);
-        EXIT;
+        RETURN(cnt);
 }
 
 
diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c
index 414fde4f5c..c27c6ae5d6 100644
--- a/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@ -116,8 +116,9 @@ static int class_attach(struct lustre_cfg *lcfg)
         init_timer(&obd->obd_recovery_timer);
         spin_lock_init(&obd->obd_processing_task_lock);
         init_waitqueue_head(&obd->obd_next_transno_waitq);
-        INIT_LIST_HEAD(&obd->obd_recovery_queue);
-        INIT_LIST_HEAD(&obd->obd_delayed_reply_queue);
+        INIT_LIST_HEAD(&obd->obd_req_replay_queue);
+        INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
+        INIT_LIST_HEAD(&obd->obd_final_req_queue);
 
         spin_lock_init(&obd->obd_uncommitted_replies_lock);
         INIT_LIST_HEAD(&obd->obd_uncommitted_replies);
diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c
index c5dbac89ec..43a778d50d 100644
--- a/lustre/obdfilter/filter.c
+++ b/lustre/obdfilter/filter.c
@@ -499,7 +499,10 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 spin_lock_init(&fed->fed_lock);
 
                 fcd = NULL;
-                exp->exp_replay_needed = 1;
+                exp->exp_req_replay_needed = 1;
+                exp->exp_lock_replay_needed = 1;
+                atomic_inc(&obd->obd_req_replay_clients);
+                atomic_inc(&obd->obd_lock_replay_clients);
                 obd->obd_recoverable_clients++;
                 obd->obd_max_recoverable_clients++;
                 class_export_put(exp);
diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c
index 1090944866..cf1c6de90d 100644
--- a/lustre/ost/ost_handler.c
+++ b/lustre/ost/ost_handler.c
@@ -1228,15 +1228,6 @@ int ost_handle(struct ptlrpc_request *req)
         }
 
 out_check_req:
-        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
-                if (obd && obd->obd_recovering) {
-                        DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
-                        rc = target_queue_final_reply(req, rc);
-                        GOTO(out_free_oti, rc);
-                }
-                /* Lost a race with recovery; let the error path DTRT. */
-                rc = req->rq_status = -ENOTCONN;
-        }
 
         if (!rc)
                 oti_to_request(oti, req);
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index 7cf82f11dd..52f3587f3f 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -589,7 +589,7 @@ static int signal_completed_replay(struct obd_import *imp)
 
         req->rq_replen = lustre_msg_size(0, NULL);
         req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
-        req->rq_reqmsg->flags |= MSG_LAST_REPLAY;
+        req->rq_reqmsg->flags |= MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE;
         req->rq_timeout *= 3;
         req->rq_interpret_reply = completed_replay_interpret;
 
-- 
GitLab