From a0279514893dbfa063b36791a5a0f82b3759be52 Mon Sep 17 00:00:00 2001
From: johann <johann>
Date: Fri, 31 Aug 2007 12:48:09 +0000
Subject: [PATCH] Branch HEAD b=12459 i=adilger,tianzy i=scjody

Severity   : normal
Bugzilla   : 12459
Description: Client eviction due to failover config
Details    : After a connection loss, the Lustre client should attempt to
	     reconnect to the last active server first, before trying the
	     other potential connections.
---
 lustre/ChangeLog            |  7 ++++++
 lustre/ptlrpc/import.c      |  2 +-
 lustre/tests/conf-sanity.sh | 46 +++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 59e69a5003..bacd497099 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -201,6 +201,13 @@ Bugzilla   : 11802
 Description: lustre support for RHEL5
 Details    : Add support for RHEL5.
 
+Severity   : normal
+Bugzilla   : 12459
+Description: Client eviction due to failover config
+Details    : After a connection loss, the Lustre client should attempt to
+	     reconnect to the last active server first, before trying the
+	     other potential connections.
+
 --------------------------------------------------------------------------------
 
 2007-08-10         Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index 07785302d3..0c7eff2392 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -290,7 +290,7 @@ static int import_select_connection(struct obd_import *imp)
                                        cfs_time_current_64())) {
                         /* If we have never tried this connection since the
                            the last successful attempt, go with this one */
-                        if (cfs_time_before_64(conn->oic_last_attempt,
+                        if (cfs_time_beforeq_64(conn->oic_last_attempt,
                                                imp->imp_last_success_conn)) {
                                 imp_conn = conn;
                                 break;
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 46f36ed66e..a61e502e48 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -1193,6 +1193,52 @@ test_33() { # bug 12333
 }
 run_test 33 "Mount ost with a large index number"
 
+test_35() { # bug 12459
+	setup
+	# Save the current lnet.debug value, then switch it to "ha" for this test.
+	DBG_SAVE="`sysctl -n lnet.debug`"
+	sysctl -w lnet.debug="ha"
+
+	log "Set up a fake failnode for the MDS"
+	FAKENID="127.0.0.2"
+	$LCTL conf_param ${FSNAME}-MDT0000.failover.node=$FAKENID || return 4
+
+	log "Wait for RECONNECT_INTERVAL seconds (10s)"
+	sleep 10
+	# Unique marker: the awk script below ignores every line seen before it.
+	MSG="conf-sanity.sh test_35 `date +%F%kh%Mm%Ss`"
+	$LCTL clear
+	log "$MSG"
+	log "Stopping the MDT:"
+	stop_mds || return 5
+	# Run df in the background; it is waited for after the MDT is restarted.
+	df $MOUNT > /dev/null 2>&1 &
+	DFPID=$!
+	log "Restarting the MDT:"
+	start_mds || return 6
+	log "Wait for df ($DFPID) ... "
+	wait $DFPID
+	log "done"
+	sysctl -w lnet.debug="$DBG_SAVE"
+
+	# Retrieve from the log the first server that the client tried to
+	# contact after the connection loss: "0" if not $FAKENID, else the NID.
+	$LCTL dk $TMP/lustre-log-$TESTNAME.log
+	NEXTCONN=`awk "/${MSG}/ {start = 1;}
+		       /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ {
+				if (start) {
+					if (\\\$NF ~ /$FAKENID/)
+						print \\\$NF;
+					else
+						print 0;
+					exit;
+				}
+		       }" $TMP/lustre-log-$TESTNAME.log`
+	[ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7
+	cleanup
+}
+run_test 35 "Reconnect to the last active server first"
+
 umount_client $MOUNT	
 cleanup_nocli
 cleanup_krb5_env
-- 
GitLab