diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 59e69a500381b9f87ca3d05b0f1caec162dcd0f5..bacd497099c9b49b368341dd7f40ab15e7ca9ded 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -201,6 +201,13 @@ Bugzilla   : 11802
 Description: lustre support for RHEL5
 Details    : Add support for RHEL5.
 
+Severity   : normal
+Bugzilla   : 12459
+Description: Client eviction due to failover config
+Details    : after a connection loss, the lustre client should attempt to
+	     reconnect to the last active server first before trying the
+	     other potential connections.
+
 --------------------------------------------------------------------------------
 
 2007-08-10         Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index 07785302d307270f556581cd159076905d598403..0c7eff2392b565f28d4d0ddba122a338cfea2578 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -290,7 +290,7 @@ static int import_select_connection(struct obd_import *imp)
                                               cfs_time_current_64())) {
                         /* If we have never tried this connection since the
                            the last successful attempt, go with this one */
-                        if (cfs_time_before_64(conn->oic_last_attempt,
+                        if (cfs_time_beforeq_64(conn->oic_last_attempt,
                                                imp->imp_last_success_conn)) {
                                 imp_conn = conn;
                                 break;
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 46f36ed66e9280a532a9ee4aecba03a95e454a5f..a61e502e4875926038487ca6c9048e2adad0c84e 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -1193,6 +1193,57 @@ test_33() { # bug 12333
 }
 run_test 33 "Mount ost with a large index number"
 
+# bug 12459: after a connection loss the client should first try to reconnect
+# to the last active server, not to the fake failover NID configured below.
+test_35() { # bug 12459
+	setup
+
+	DBG_SAVE="`sysctl -n lnet.debug`"
+	sysctl -w lnet.debug="ha"
+
+	log "Set up a fake failnode for the MDS"
+	FAKENID="127.0.0.2"
+	$LCTL conf_param ${FSNAME}-MDT0000.failover.node=$FAKENID || return 4
+
+	log "Wait for RECONNECT_INTERVAL seconds (10s)"
+	sleep 10
+
+	# time-stamped marker; the awk script below ignores every connection
+	# attempt that appears in the log dump before this message
+	MSG="conf-sanity.sh test_35 `date +%F%kh%Mm%Ss`"
+	$LCTL clear
+	log "$MSG"
+	log "Stopping the MDT:"
+	stop_mds || return 5
+
+	df $MOUNT > /dev/null 2>&1 &
+	DFPID=$!
+	log "Restarting the MDT:"
+	start_mds || return 6
+	log "Wait for df ($DFPID) ... "
+	wait $DFPID
+	log "done"
+	sysctl -w lnet.debug="$DBG_SAVE"
+
+	# retrieve from the log the first server that the client tried to
+	# contact after the connection loss
+	$LCTL dk $TMP/lustre-log-$TESTNAME.log
+	NEXTCONN=`awk "/${MSG}/ {start = 1;}
+		       /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ {
+				if (start) {
+					if (\\\$NF ~ /$FAKENID/)
+						print \\\$NF;
+					else
+						print 0;
+					exit;
+				}
+		       }" $TMP/lustre-log-$TESTNAME.log`
+	# awk prints 0 when the first attempt after the marker was not the fake NID
+	[ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7
+	cleanup
+}
+run_test 35 "Reconnect to the last active server first"
+
 umount_client $MOUNT
 cleanup_nocli
 cleanup_krb5_env