From 3ac3d2917697294e0a643c10bd74d61832302c14 Mon Sep 17 00:00:00 2001
From: johann <johann>
Date: Thu, 30 Aug 2007 16:16:35 +0000
Subject: [PATCH] Branch b1_6 b=12459 i=adilger,tianzy i=scjody

Severity   : normal
Bugzilla   : 12459
Description: Client eviction due to failover config
Details    : after a connection loss, the lustre client should attempt to
	     reconnect to the last active server first before trying the
	     other potential connections.
---
 lustre/ChangeLog            |  7 ++++++
 lustre/ptlrpc/import.c      |  2 +-
 lustre/tests/conf-sanity.sh | 46 +++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 9ac9f75f1a..d7eca9d9f9 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -128,6 +128,13 @@ Description: testing performance impact of enabling checksumming
 Details    : enable checksum by default, allow --disable-checksum 
              configure option and "-o nochecksum" mount option 
 
+Severity   : normal
+Bugzilla   : 12459
+Description: Client eviction due to failover config
+Details    : after a connection loss, the lustre client should attempt to
+	     reconnect to the last active server first before trying the
+	     other potential connections.
+
 --------------------------------------------------------------------------------
 
 2007-08-27         Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index dc2256444d..66adf403cc 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -279,7 +279,7 @@ static int import_select_connection(struct obd_import *imp)
                                        cfs_time_current_64())) {
                         /* If we have never tried this connection since the
                            the last successful attempt, go with this one */
-                        if (cfs_time_before_64(conn->oic_last_attempt,
+                        if (cfs_time_beforeq_64(conn->oic_last_attempt,
                                                imp->imp_last_success_conn)) {
                                 imp_conn = conn;
                                 break;
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 02afc82f49..f0536637e3 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -1254,5 +1254,51 @@ test_34c() {
 }
 run_test 34c "force umount with failed mds should be normal"
 
+test_35() { # bug 12459: client must retry the last active server first
+	setup
+
+	debugsave
+	sysctl -w lnet.debug="ha"
+
+	log "Set up a fake failnode for the MDS"
+	FAKENID="127.0.0.2"
+	$LCTL conf_param ${FSNAME}-MDT0000.failover.node=$FAKENID || return 4
+
+	log "Wait for RECONNECT_INTERVAL seconds (10s)"
+	sleep 10
+
+	MSG="conf-sanity.sh test_35 `date +%F%kh%Mm%Ss`" # marker the awk script below searches for
+	$LCTL clear
+	log "$MSG"
+	log "Stopping the MDT:"
+	stop_mds || return 5
+
+	df $MOUNT > /dev/null 2>&1 & # issued in the background while the MDT is stopped
+	DFPID=$!
+	log "Restarting the MDT:"
+	start_mds || return 6
+	log "Wait for df ($DFPID) ... "
+	wait $DFPID
+	log "done"
+	debugrestore
+
+	# Dump the debug log and look at the first "using connection" line after
+	# $MSG: awk prints the NID if it is the fake failnode, and 0 otherwise.
+	$LCTL dk $TMP/lustre-log-$TESTNAME.log
+	NEXTCONN=`awk "/${MSG}/ {start = 1;}
+		       /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ {
+				if (start) {
+					if (\\\$NF ~ /$FAKENID/)
+						print \\\$NF;
+					else
+						print 0;
+					exit;
+				}
+		       }" $TMP/lustre-log-$TESTNAME.log`
+	[ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7
+	cleanup
+}
+run_test 35 "Reconnect to the last active server first"
+
 equals_msg "Done"
 echo "$0: completed"
-- 
GitLab