diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 9ac9f75f1a313193ed7ce51568fdcccc841ecfc4..d7eca9d9f9e6b725941f017a031f2cd17626cb03 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -128,6 +128,13 @@ Description: testing performance impact of enabling checksumming
 Details    : enable checksum by default, allow --disable-checksum
 	     configure option and "-o nochecksum" mount option
 
+Severity   : normal
+Bugzilla   : 12459
+Description: Client eviction due to failover config
+Details    : after a connection loss, the lustre client should attempt to
+	     reconnect to the last active server first before trying the
+	     other potential connections.
+
 --------------------------------------------------------------------------------
 
 2007-08-27  Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index dc2256444d3979d38659e17b7101e42366041c32..66adf403cc0fe07def7f7dea7a1657ebbfaa167a 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -279,7 +279,7 @@ static int import_select_connection(struct obd_import *imp)
                                        cfs_time_current_64())) {
                         /* If we have never tried this connection since the
                            the last successful attempt, go with this one */
-                        if (cfs_time_before_64(conn->oic_last_attempt,
+                        if (cfs_time_beforeq_64(conn->oic_last_attempt,
                                                imp->imp_last_success_conn)) {
                                 imp_conn = conn;
                                 break;
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 02afc82f490839716dc116aa1c47c161ec905e5c..f0536637e3071fc5409232dc89f776c493a067e0 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -1254,5 +1254,55 @@ test_34c() {
 }
 run_test 34c "force umount with failed mds should be normal"
 
+test_35() { # bug 12459
+	setup
+
+	debugsave
+	sysctl -w lnet.debug="ha"
+
+	log "Set up a fake failnode for the MDS"
+	FAKENID="127.0.0.2"
+	$LCTL conf_param ${FSNAME}-MDT0000.failover.node=$FAKENID || return 4
+
+	log "Wait for RECONNECT_INTERVAL seconds (10s)"
+	sleep 10
+
+	# timestamped marker for this run; the awk script below ignores every
+	# connection attempt that appears in the dumped log before this marker
+	MSG="conf-sanity.sh test_35 `date +%F%kh%Mm%Ss`"
+	$LCTL clear
+	log "$MSG"
+	log "Stopping the MDT:"
+	stop_mds || return 5
+
+	df $MOUNT > /dev/null 2>&1 &
+	DFPID=$!
+	log "Restarting the MDT:"
+	start_mds || return 6
+	log "Wait for df ($DFPID) ... "
+	wait $DFPID
+	log "done"
+	debugrestore
+
+	# retrieve from the log the first server that the client tried to
+	# contact after the connection loss
+	# (NEXTCONN is 0 when that server is not the fake failnode, otherwise
+	# it is the last field of the matching log line, i.e. the fake NID)
+	$LCTL dk $TMP/lustre-log-$TESTNAME.log
+	NEXTCONN=`awk "/${MSG}/ {start = 1;}
+		       /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ {
+				if (start) {
+					if (\\\$NF ~ /$FAKENID/)
+						print \\\$NF;
+					else
+						print 0;
+					exit;
+				}
+		       }" $TMP/lustre-log-$TESTNAME.log`
+	[ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7
+	cleanup
+}
+run_test 35 "Reconnect to the last active server first"
+
 equals_msg "Done"
 echo "$0: completed"