From 2b8a823c4ab0ad089aefd903abc6c702b52bd420 Mon Sep 17 00:00:00 2001
From: shadow <shadow>
Date: Thu, 21 Aug 2008 06:08:58 +0000
Subject: [PATCH] avoid I/O failures after umount during fail back Branch b1_6
 b=16611 i=green i=rread

---
 lustre/ChangeLog       |  8 ++++++++
 lustre/ptlrpc/import.c | 33 ++++++++++++++++++++++++---------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 32141756be..b29653bf2f 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -38,6 +38,14 @@ tbd Sun Microsystems, Inc.
 	* Output of lfs quota has been made less detailed by default,
 	  old (verbose) output can be obtained by using -v option.
 
+Severity   : normal
+Bugzilla   : 16611
+Frequency  : on recovery
+Description: I/O failures after umount during fail back
+Details    : if a client reconnects to a restarted server, it needs to join
+             recovery instead of noticing that the server handle has changed
+             and evicting itself, which cancels all of its locks.
+
 Severity   : enhancement
 Bugzilla   : 16633
 Description: Update to RHEL5 kernel-2.6.18-92.1.10.el5.
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index 0986cc11e7..5d7252d781 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -705,19 +705,34 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                 if (memcmp(&imp->imp_remote_handle,
                            lustre_msg_get_handle(request->rq_repmsg),
                            sizeof(imp->imp_remote_handle))) {
+                        int level = msg_flags & MSG_CONNECT_RECOVERING ? D_HA :
+                                                                         D_WARNING;
+
+                        /* Bug 16611/14775: if the server handle has changed,
+                         * some sort of disconnection has happened. If the
+                         * server is not in recovery, it has also already
+                         * erased all of our state because of a previous
+                         * eviction. If it is in recovery, it is safe for us
+                         * to participate, since we can re-establish all of
+                         * our state with the server again. */
+                        CDEBUG(level,"%s@%s changed server handle from "
+                                     LPX64" to "LPX64"%s \n" "but is still in recovery \n",
+                                     obd2cli_tgt(imp->imp_obd),
+                                     imp->imp_connection->c_remote_uuid.uuid,
+                                     imp->imp_remote_handle.cookie,
+                                     lustre_msg_get_handle(request->rq_repmsg)->
+                                                                        cookie,
+                                     (MSG_CONNECT_RECOVERING & msg_flags) ?
+                                         "but is still in recovery" : "");
 
-                        CWARN("%s@%s changed server handle from "
-                               LPX64" to "LPX64" - evicting.\n",
-                               obd2cli_tgt(imp->imp_obd),
-                               imp->imp_connection->c_remote_uuid.uuid,
-                               imp->imp_remote_handle.cookie,
-                               lustre_msg_get_handle(request->rq_repmsg)->
-                                         cookie);
                         imp->imp_remote_handle =
                                      *lustre_msg_get_handle(request->rq_repmsg);
 
-                        IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
-                        GOTO(finish, rc = 0);
+                        if (!(MSG_CONNECT_RECOVERING & msg_flags)) {
+                                IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+                                GOTO(finish, rc = 0);
+                        }
+
                 } else {
                         CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
                                obd2cli_tgt(imp->imp_obd),
-- 
GitLab