From 3ac3d2917697294e0a643c10bd74d61832302c14 Mon Sep 17 00:00:00 2001 From: johann <johann> Date: Thu, 30 Aug 2007 16:16:35 +0000 Subject: [PATCH] Branch b1_6 b=12459 i=adilger,tianzy i=scjody Severity : normal Bugzilla : 12459 Description: Client eviction due to failover config Details : after a connection loss, the lustre client should attempt to reconnect to the last active server first before trying the other potential connections. --- lustre/ChangeLog | 7 ++++++ lustre/ptlrpc/import.c | 2 +- lustre/tests/conf-sanity.sh | 46 +++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 9ac9f75f1a..d7eca9d9f9 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -128,6 +128,13 @@ Description: testing performance impact of enabling checksumming Details : enable checksum by default, allow --disable-checksum configure option and "-o nochecksum" mount option +Severity : normal +Bugzilla : 12459 +Description: Client eviction due to failover config +Details : after a connection loss, the lustre client should attempt to + reconnect to the last active server first before trying the + other potential connections. + -------------------------------------------------------------------------------- 2007-08-27 Cluster File Systems, Inc. 
<info@clusterfs.com> diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index dc2256444d..66adf403cc 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -279,7 +279,7 @@ static int import_select_connection(struct obd_import *imp) cfs_time_current_64())) { /* If we have never tried this connection since the the last successful attempt, go with this one */ - if (cfs_time_before_64(conn->oic_last_attempt, + if (cfs_time_beforeq_64(conn->oic_last_attempt, imp->imp_last_success_conn)) { imp_conn = conn; break; diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 02afc82f49..f0536637e3 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -1254,5 +1254,51 @@ test_34c() { } run_test 34c "force umount with failed mds should be normal" +test_35() { # bug 12459 + setup + + debugsave + sysctl -w lnet.debug="ha" + + log "Set up a fake failnode for the MDS" + FAKENID="127.0.0.2" + $LCTL conf_param ${FSNAME}-MDT0000.failover.node=$FAKENID || return 4 + + log "Wait for RECONNECT_INTERVAL seconds (10s)" + sleep 10 + + MSG="conf-sanity.sh test_35 `date +%F%kh%Mm%Ss`" + $LCTL clear + log "$MSG" + log "Stopping the MDT:" + stop_mds || return 5 + + df $MOUNT > /dev/null 2>&1 & + DFPID=$! + log "Restarting the MDT:" + start_mds || return 6 + log "Wait for df ($DFPID) ... 
" + wait $DFPID + log "done" + debugrestore + + # retrieve from the log the first server that the client tried to + # contact after the connection loss + $LCTL dk $TMP/lustre-log-$TESTNAME.log + NEXTCONN=`awk "/${MSG}/ {start = 1;} + /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ { + if (start) { + if (\\\$NF ~ /$FAKENID/) + print \\\$NF; + else + print 0; + exit; + } + }" $TMP/lustre-log-$TESTNAME.log` + [ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7 + cleanup +} +run_test 35 "Reconnect to the last active server first" + equals_msg "Done" echo "$0: completed" -- GitLab