From 36a859cf7a4d11a475538ac288d9f053ae36400c Mon Sep 17 00:00:00 2001
From: shadow <shadow>
Date: Thu, 6 Mar 2008 04:19:28 +0000
Subject: [PATCH] Client not clear own cache if answer to reconnect is lost.
 b=14775 i=green i=nathan i=johann

---
 lustre/ChangeLog               | 10 ++++++++++
 lustre/include/obd_support.h   |  1 +
 lustre/ost/ost_handler.c       |  1 +
 lustre/ptlrpc/import.c         | 17 +++++++----------
 lustre/tests/recovery-small.sh | 32 ++++++++++++++++++++++++++++++++
 5 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 71deebe997..ebc20e2235 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -397,6 +397,16 @@ Details    : When the failover node is the primary node, it is possible
              compare not conn's pointers but NIDs, otherwise we
 	     can defeat connection throttling.
 
+Severity   : major
+Bugzilla   : 14775
+Description: Client does not clear its cache if the reconnect reply is lost.
+Details    : client gets evicted from server. Now client also thinks it is
+             disconnected (it gets ENOTCONN on its operation) and decides to
+             reconnect. Server receives reconnect message, but cannot find export.
+             New export is created that is fully valid (new cookie!), but client
+             gets a reply that the export is new, and so no recovery should be
+             performed.
+
 --------------------------------------------------------------------------------
 
 2007-12-07         Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 27d7dfffeb..ded394addb 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -187,6 +187,7 @@ extern unsigned int obd_alloc_fail_rate;
 #define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
 #define OBD_FAIL_OST_PAUSE_CREATE        0x223
 #define OBD_FAIL_OST_BRW_PAUSE_PACK      0x224
+#define OBD_FAIL_OST_CONNECT_NET2        0x225
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c
index e67da314d6..8a6ccfbc4c 100644
--- a/lustre/ost/ost_handler.c
+++ b/lustre/ost/ost_handler.c
@@ -1529,6 +1529,7 @@ static int ost_handle(struct ptlrpc_request *req)
                 CDEBUG(D_INODE, "connect\n");
                 OBD_FAIL_RETURN(OBD_FAIL_OST_CONNECT_NET, 0);
                 rc = target_handle_connect(req, ost_handle);
+                OBD_FAIL_RETURN(OBD_FAIL_OST_CONNECT_NET2, 0);
                 if (!rc)
                         obd = req->rq_export->exp_obd;
                 break;
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index 90da29b4aa..98a2c45b39 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -669,22 +669,19 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                 if (memcmp(&imp->imp_remote_handle,
                            lustre_msg_get_handle(request->rq_repmsg),
                            sizeof(imp->imp_remote_handle))) {
-                        int level = D_ERROR;
-                        /* Old MGC can reconnect to a restarted MGS */
-                        if (strcmp(imp->imp_obd->obd_type->typ_name,
-                                   LUSTRE_MGC_NAME) == 0) {
-                                level = D_CONFIG;
-                        }
-                        CDEBUG(level, 
-                               "%s@%s changed handle from "LPX64" to "LPX64
-                               "; copying, but this may foreshadow disaster\n",
+
+                        CWARN("%s@%s changed server handle from "
+                               LPX64" to "LPX64" - evicting.\n",
                                obd2cli_tgt(imp->imp_obd),
                                imp->imp_connection->c_remote_uuid.uuid,
                                imp->imp_remote_handle.cookie,
                                lustre_msg_get_handle(request->rq_repmsg)->
-                                        cookie);
+                                         cookie);
                         imp->imp_remote_handle =
                                      *lustre_msg_get_handle(request->rq_repmsg);
+
+                        IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+                        GOTO(finish, rc = 0);
                 } else {
                         CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
                                obd2cli_tgt(imp->imp_obd),
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index 65f440c6f7..7d1fa4cbb0 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -269,6 +269,7 @@ test_18b() {
     do_facet client cp $SAMPLE_FILE $f
     sync
     ost_evict_client
+
     # force reconnect
     df $MOUNT > /dev/null 2>&1
     sleep 2
@@ -281,6 +282,37 @@ test_18b() {
 }
 run_test 18b "eviction and reconnect clears page cache (2766)"
 
+test_18c() {
+    do_facet client mkdir -p $MOUNT/$tdir
+    f=$MOUNT/$tdir/$tfile
+    f2=$MOUNT/$tdir/${tfile}-2
+
+    cancel_lru_locks osc
+    pgcache_empty || return 1
+
+    # shouldn't have to set stripe size of count==1
+    lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1
+    lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1
+
+    do_facet client cp $SAMPLE_FILE $f
+    sync
+    ost_evict_client
+
+    # OBD_FAIL_OST_CONNECT_NET2 (0x225): the OST handles the connect
+    # request but returns before replying, so the connect reply is lost
+    do_facet ost1 sysctl -w lustre.fail_loc=0x80000225
+    # force reconnect
+    df $MOUNT > /dev/null 2>&1
+    sleep 2
+    # expected: nothing is left in the page cache once the evicted
+    # client has reconnected, even though a connect reply was dropped
+    rc=0
+    pgcache_empty || rc=2
+    rm -f $f $f2
+    return $rc
+}
+run_test 18c "Dropped connect reply after eviction handling (14775)"
+
 test_19a() {
     f=$MOUNT/$tfile
     do_facet client mcreate $f        || return 1
-- 
GitLab