From a0279514893dbfa063b36791a5a0f82b3759be52 Mon Sep 17 00:00:00 2001
From: johann <johann>
Date: Fri, 31 Aug 2007 12:48:09 +0000
Subject: [PATCH] Branch HEAD b=12459 i=adilger,tianzy i=scjody

Severity : normal
Bugzilla : 12459
Description: Client eviction due to failover config
Details : after a connection loss, the lustre client should attempt to
          reconnect to the last active server first before trying the
          other potential connections.
---
NOTE(review): the line structure of this patch was restored from a
whitespace-collapsed copy. Leading indentation inside the hunks is a
best-effort reconstruction -- verify it against the tree, or apply with
`git apply --ignore-whitespace`. The debug-log marker in test_35 now names
test_35 (it previously said test_33, a copy-paste slip).

 lustre/ChangeLog            |  7 ++++++
 lustre/ptlrpc/import.c      |  2 +-
 lustre/tests/conf-sanity.sh | 46 +++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 59e69a5003..bacd497099 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -201,6 +201,13 @@ Bugzilla : 11802
 Description: lustre support for RHEL5
 Details : Add support for RHEL5.
 
+Severity : normal
+Bugzilla : 12459
+Description: Client eviction due to failover config
+Details : after a connection loss, the lustre client should attempt to
+          reconnect to the last active server first before trying the
+          other potential connections.
+
 --------------------------------------------------------------------------------
 
 2007-08-10 Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index 07785302d3..0c7eff2392 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -290,7 +290,7 @@ static int import_select_connection(struct obd_import *imp)
                                        cfs_time_current_64())) {
                         /* If we have never tried this connection since the
                            the last successful attempt, go with this one */
-                        if (cfs_time_before_64(conn->oic_last_attempt,
+                        if (cfs_time_beforeq_64(conn->oic_last_attempt,
                                                imp->imp_last_success_conn)) {
                                 imp_conn = conn;
                                 break;
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 46f36ed66e..a61e502e48 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -1193,6 +1193,52 @@ test_33() { # bug 12333
 }
 run_test 33 "Mount ost with a large index number"
 
+test_35() { # bug 12459
+        setup
+
+        DBG_SAVE="`sysctl -n lnet.debug`"
+        sysctl -w lnet.debug="ha"
+
+        log "Set up a fake failnode for the MDS"
+        FAKENID="127.0.0.2"
+        $LCTL conf_param ${FSNAME}-MDT0000.failover.node=$FAKENID || return 4
+
+        log "Wait for RECONNECT_INTERVAL seconds (10s)"
+        sleep 10
+
+        MSG="conf-sanity.sh test_35 `date +%F%kh%Mm%Ss`"
+        $LCTL clear
+        log "$MSG"
+        log "Stopping the MDT:"
+        stop_mds || return 5
+
+        df $MOUNT > /dev/null 2>&1 &
+        DFPID=$!
+        log "Restarting the MDT:"
+        start_mds || return 6
+        log "Wait for df ($DFPID) ... "
+        wait $DFPID
+        log "done"
+        sysctl -w lnet.debug="$DBG_SAVE"
+
+        # retrieve from the log the first server that the client tried to
+        # contact after the connection loss
+        $LCTL dk $TMP/lustre-log-$TESTNAME.log
+        NEXTCONN=`awk "/${MSG}/ {start = 1;}
+                       /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ {
+                                if (start) {
+                                        if (\\\$NF ~ /$FAKENID/)
+                                                print \\\$NF;
+                                        else
+                                                print 0;
+                                        exit;
+                                }
+                       }" $TMP/lustre-log-$TESTNAME.log`
+        [ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7
+        cleanup
+}
+run_test 35 "Reconnect to the last active server first"
+
 umount_client $MOUNT
 cleanup_nocli
 cleanup_krb5_env
-- 
GitLab