diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 71deebe997bcfae847ff782f8f0c1e9c16371baa..ebc20e22359dbc55b9bef7cefee015287a9f268b 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -397,6 +397,16 @@ Details : When the failover node is the primary node, it is possible compare not conn's pointers but NIDs, otherwise we can defeat connection throttling. +Severity : major +Bugzilla : 14775 +Description: Client does not clear its own cache if answer to reconnect is lost. +Details : client gets evicted from server. Now client also thinks it is + disconnected (it gets ENOTCONN on its operation) and decides to + reconnect. Server receives reconnect message, but cannot find export. + New export is created that is fully valid (new cookie!), but client + gets a reply that the export is new, and so no recovery should be + performed. + -------------------------------------------------------------------------------- 2007-12-07 Cluster File Systems, Inc. <info@clusterfs.com> diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 27d7dfffeba391abb6875c558650607f987701cf..ded394addb3b850a97eb2331da30a01645713e94 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -187,6 +187,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 #define OBD_FAIL_OST_PAUSE_CREATE 0x223 #define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 +#define OBD_FAIL_OST_CONNECT_NET2 0x225 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index e67da314d6d8cd81462158cd242a647f76a143bd..8a6ccfbc4c4da42f1b3703fd28fa1480d54f0bc6 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1529,6 +1529,7 @@ static int ost_handle(struct ptlrpc_request *req) CDEBUG(D_INODE, "connect\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_CONNECT_NET, 0); rc = target_handle_connect(req, ost_handle); + OBD_FAIL_RETURN(OBD_FAIL_OST_CONNECT_NET2, 0); if (!rc) obd 
= req->rq_export->exp_obd; break; diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 90da29b4aaa1186ac1d53d76f75fa70a26a1a4f8..98a2c45b39f41706097de86b6a9841a3c4fba1f5 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -669,22 +669,19 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, if (memcmp(&imp->imp_remote_handle, lustre_msg_get_handle(request->rq_repmsg), sizeof(imp->imp_remote_handle))) { - int level = D_ERROR; - /* Old MGC can reconnect to a restarted MGS */ - if (strcmp(imp->imp_obd->obd_type->typ_name, - LUSTRE_MGC_NAME) == 0) { - level = D_CONFIG; - } - CDEBUG(level, - "%s@%s changed handle from "LPX64" to "LPX64 - "; copying, but this may foreshadow disaster\n", + + CWARN("%s@%s changed server handle from " + LPX64" to "LPX64" - evicting.\n", obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid, imp->imp_remote_handle.cookie, lustre_msg_get_handle(request->rq_repmsg)-> - cookie); + cookie); imp->imp_remote_handle = *lustre_msg_get_handle(request->rq_repmsg); + + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + GOTO(finish, rc = 0); } else { CDEBUG(D_HA, "reconnected to %s@%s after partition\n", obd2cli_tgt(imp->imp_obd), diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 65f440c6f74e8fc6b5a8491087f3cc7e0eba3a22..7d1fa4cbb009a00a1b3e85a7998c62906b60270f 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -269,6 +269,7 @@ test_18b() { do_facet client cp $SAMPLE_FILE $f sync ost_evict_client + # force reconnect df $MOUNT > /dev/null 2>&1 sleep 2 @@ -281,6 +282,37 @@ test_18b() { } run_test 18b "eviction and reconnect clears page cache (2766)" +test_18c() { + do_facet client mkdir -p $MOUNT/$tdir + f=$MOUNT/$tdir/$tfile + f2=$MOUNT/$tdir/${tfile}-2 + + cancel_lru_locks osc + pgcache_empty || return 1 + + # shouldn't have to set stripe size of count==1 + lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1 + lfs setstripe $f2 -s $((128 * 
1024)) -i 0 -c 1 + + do_facet client cp $SAMPLE_FILE $f + sync + ost_evict_client + + # OBD_FAIL_OST_CONNECT_NET2 + # lost reply to connect request + do_facet ost1 sysctl -w lustre.fail_loc=0x80000225 + # force reconnect + df $MOUNT > /dev/null 2>&1 + sleep 2 + # my understanding is that there should be nothing in the page + # cache after the client reconnects? + rc=0 + pgcache_empty || rc=2 + rm -f $f $f2 + return $rc +} +run_test 18c "Dropped connect reply after eviction handling (14775)" + test_19a() { f=$MOUNT/$tfile do_facet client mcreate $f || return 1