diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 9fbea8c63a4a318f4dfe911a00968f8209bec20e..73a1a2b76c520d98f75b9fe659fe535dae372bf5 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -47,41 +47,41 @@ Frequency  : Create a symlink file with a very long name
 Bugzilla   : 16578
 Description: ldlm_cancel_pack()) ASSERTION(max >= dlm->lock_count + count)
 Details    : If there is no extra space in the request for early cancels,
-             ldlm_req_handles_avail() returns 0 instead of a negative value.
+	     ldlm_req_handles_avail() returns 0 instead of a negative value.
 
 Severity   : major
 Frequency  : rare
 Bugzilla   : 16492
 Description: mds is deadlocked
 Details    : in rare cases, inode in catalog can have i_no less than have parent
-             i_no, this produce wrong order for locking during open, and parallel
-             unlink can be lock open. this need teach mds_open to grab locks in
-             resouce id order, not at parent -> child order.
+	     i_no, this produces wrong order for locking during open, and parallel
+	     unlink can deadlock with open. mds_open needs to grab locks in
+	     resource id order, not in parent -> child order.
 
 Severity   : enhancement
 Bugzilla   : 1819
 Description: Add /proc entry for import status
 Details    : The mdc, osc, and mgc import directories now have
-             an import directory that contains useful import data for debugging
-             connection problems.
+	     an import directory that contains useful import data for debugging
+	     connection problems.
 
 Severity   : enhancement
 Bugzilla   : 15966
 Description: Re-disable certain /proc logging
 Details    : Enable and disable client's offset_stats, extents_stats and
-             extents_stats_per_process stats logging on the fly.
+	     extents_stats_per_process stats logging on the fly.
 
 Severity   : enhancement
 Bugzilla   : 16643
 Description: Generic /proc file permissions
 Details    : Set /Proc file permissions in a more generic way to enable non-
-             root users operate on some /proc files.
+	     root users to operate on some /proc files.
 
 Severity   : major
 Bugzilla   : 16561
 Description: Hitting mdc_commit_close() ASSERTION
 Details    : Properly handle request reference release in
-             ll_release_openhandle().
+	     ll_release_openhandle().
 
 Severity   : normal
 Bugzilla   : 16907
@@ -91,21 +91,21 @@ Severity   : major
 Bugzilla   : 16750
 Description: service mount cannot take device name with ":"
 Details    : Only when device name contains ":/" will mount treat it as
-             client mount.
+	     client mount.
 
 Severity   : normal
 Bugzilla   : 15927
 Frequency  : rare
 Description: replace ptlrpcd with the statahead thread to interpret the async
-             statahead RPC callback
+	     statahead RPC callback
 
 Severity   : normal
 Bugzilla   : 16611
 Frequency  : on recovery
 Description: I/O failures after umount during fail back
 Details    : if client reconnected to restarted server we need join to recovery
-             instead of find server handler is changed and process self eviction
-             with cancel all locks.
+	     instead of find server handler is changed and process self eviction
+	     with cancel all locks.
 
 Severity   : enhancement
 Bugzilla   : 16633
@@ -119,7 +119,7 @@ Severity   : enhancement
 Bugzilla   : 16566
 Description: Upcall on Lustre log has been dumped
 Details    : Allow for a user mode script to be called once a Lustre log has
-             been dumped. It passes the filename of the dumped log to the
+	     been dumped. It passes the filename of the dumped log to the
 	     script, the location of the script can be specified via
 	     /proc/sys/lnet/debug_log_upcall.
 
@@ -128,22 +128,22 @@ Bugzilla   : 16583
 Frequency  : rare
 Description: avoid messages about idr_remove called for id  which is not allocated.
 Details    : Move assigment s_dev for clustered nfs to end of initialization, for
-             avoid problem with error handling.
+	     avoid problem with error handling.
 
 Severity   : minor
 Bugzilla   : 16583
 Frequency  : rare
 Description: avoid messages about idr_remove called for id  which is not allocated.
 Details    : Move assigment s_dev for clustered nfs to end of initialization, for avoid
-             problem with error handling.
+	     problem with error handling.
 
 Severity   : minor
 Bugzilla   : 16109
 Frequency  : rare
 Description: avoid Already found the key in hash [CONN_UNUSED_HASH] messages
 Details    : When connection is reused this not moved from CONN_UNUSED_HASH into
-             CONN_USED_HASH and this prodice warning when put connection again
-             in unused hash.
+	     CONN_USED_HASH and this produce warning when put connection again
+	     in unused hash.
 
 Severity   : enhancement
 Bugzilla   : 16573
@@ -153,21 +153,21 @@ Severity   : normal
 Bugzilla   : 16237
 Description: Early reply size mismatch, MGC loses connection
 Details    : Apply the MGS_CONNECT_SUPPORTED mask at reconnect time so
-             the connect flags are properly negotiated.
+	     the connect flags are properly negotiated.
 
 Severity   : major
 Bugzilla   : 14840
 Description: quota recovery deadlock during mds failover
 Details    : This patch includes att18982, att18236, att18237 in bz14840.
-             Slove the problems:
-             1. fix osts hang when mds does failover with quotaon
-             2. prevent watchdog storm when osts threads wait for the
+	     Solve the problems:
+	     1. fix osts hang when mds does failover with quotaon
+	     2. prevent watchdog storm when osts threads wait for the
 	        recovery of mds
 
 Severity   : enhancement
 Bugzilla   : 14095
 Description: Add lustre_start utility to start or stop multiple Lustre servers
-             from a CSV file.
+	     from a CSV file.
 
 Severity   : normal
 Bugzilla   : 17026
@@ -180,14 +180,14 @@ Severity   : enhancement
 Bugzilla   : 12800
 Description: More exported tunables for mballoc
 Details    : Add support for tunable preallocation window and new tunables for
-             large/small requests
+	     large/small requests
 
 Severity   : normal
 Bugzilla   : 16680
 Description: Detect corruption of block bitmap and checking for preallocations
 Details    : Checks validity of on-disk block bitmap. Also it does better
-             checking of number of applied preallocations. When corruption is
-             found, it turns filesystem readonly to prevent further corruptions.
+	     checking of number of applied preallocations. When corruption is
+	     found, it turns filesystem readonly to prevent further corruptions.
 
 Severity   : normal
 Bugzilla   : 17197
@@ -202,8 +202,18 @@ Bugzilla   : 16438
 Frequency  : only for big-endian servers
 Description: Check if system is big-endian while mounting fs with extents feature
 Details    : Mounting a filesystem with extents feature will fail on big-endian
-             systems since ext3-based ldiskfs is not supported on big-endian
-             systems. This can be over-riden with "bigendian_extents" mount option.
+	     systems since ext3-based ldiskfs is not supported on big-endian
+	     systems. This can be overridden with "bigendian_extents" mount option.
+
+Severity   : normal
+Bugzilla   : 16860
+Description: Excessive recovery window
+Details    : With AT enabled, the recovery window can be excessively long (6000+
+	     seconds). To address this problem, we no longer use
+	     OBD_RECOVERY_FACTOR when extending the recovery window (the connect
+	     timeout no longer depends on the service time, it is set to
+	     INITIAL_CONNECT_TIMEOUT now) and clients report the old service
+	     time via pb_service_time.
 
 --------------------------------------------------------------------------
 
diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h
index 30adbdf721c10fcf3008f7e09e424450a2678208..7613b11c785d2060ff1a568b1516eb6f62c50cb0 100644
--- a/lustre/include/lustre_import.h
+++ b/lustre/include/lustre_import.h
@@ -179,8 +179,8 @@ static inline unsigned int at_est2timeout(unsigned int val)
 
 static inline unsigned int at_timeout2est(unsigned int val)
 {
-        /* restore estimate value from timeout */
-        return ((val - 1) / 5 * 4);
+        /* restore estimate value from timeout: e=4/5(t-5) */
+        return (max((val << 2) / 5, 5U) - 4);
 }
 
 static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index 10bf8f669ccebc2fed40a4e3c266d253a1b34ea9..ae38782bfa3063ff970f9713eea1ae3d62324e7e 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -1254,18 +1254,27 @@ target_start_and_reset_recovery_timer(struct obd_device *obd,
                                       struct ptlrpc_request *req,
                                       int new_client)
 {
-        int req_timeout = lustre_msg_get_timeout(req->rq_reqmsg);
+        int service_time = lustre_msg_get_service_time(req->rq_reqmsg);
 
-        /* teach server about old server's estimates */
-        if (!new_client)
+        if (!new_client && service_time)
+                /* Teach server about old server's estimates, as first guess
+                 * at how long new requests will take. */
                 at_add(&req->rq_rqbd->rqbd_service->srv_at_estimate,
-                       at_timeout2est(req_timeout));
+                       service_time);
 
         check_and_start_recovery_timer(obd, handler);
 
-        req_timeout *= OBD_RECOVERY_FACTOR;
-        if (req_timeout > obd->obd_recovery_timeout && !new_client)
-                reset_recovery_timer(obd, req_timeout, 0);
+        /* convert the service time to rpc timeout,
+         * reuse service_time to limit stack usage */
+        service_time = at_est2timeout(service_time);
+
+        /* We expect other clients to time out within service_time, then try
+         * to reconnect, then try the failover server.  The max delay between
+         * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL */
+        service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC +
+                             INITIAL_CONNECT_TIMEOUT);
+        if (service_time > obd->obd_recovery_timeout && !new_client)
+                reset_recovery_timer(obd, service_time, 0);
 }
 
 static int check_for_next_transno(struct obd_device *obd)
@@ -1349,8 +1358,9 @@ static void process_recovery_queue(struct obd_device *obd)
                 DEBUG_REQ(D_HA, req, "processing: ");
                 (void)obd->obd_recovery_handler(req);
                 obd->obd_replayed_requests++;
-                reset_recovery_timer(obd, OBD_RECOVERY_FACTOR *
-                       AT_OFF ? obd_timeout :
+                /* Extend the recovery timer enough to complete the next
+                 * replayed rpc */
+                reset_recovery_timer(obd, AT_OFF ? obd_timeout :
                        at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1);
                 /* bug 1580: decide how to properly sync() in recovery */
                 //mds_fsync_super(obd->u.obt.obt_sb);
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index a869a811b64c555c06f12ffe539ec1a893dbdf29..9d2e13c7787f23c4cbe5dba889b476a855385da8 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -626,6 +626,19 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
         if (!request)
                 GOTO(out, rc = -ENOMEM);
 
+        /* Report the rpc service time to the server so that it knows how long
+         * to wait for clients to join recovery */
+        lustre_msg_set_service_time(request->rq_reqmsg,
+                                    at_timeout2est(request->rq_timeout));
+
+        /* The amount of time we give the server to process the connect req.
+         * import_select_connection will increase the net latency on
+         * repeated reconnect attempts to cover slow networks.
+         * We override/ignore the server rpc completion estimate here,
+         * which may be large if this is a reconnect attempt */
+        request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
+        lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
+
 #ifndef __KERNEL__
         lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT);
 #endif
@@ -651,10 +664,6 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
                 spin_lock(&imp->imp_lock);
                 imp->imp_replayable = 1;
                 spin_unlock(&imp->imp_lock);
-                if (AT_OFF)
-                        /* AT will use INITIAL_CONNECT_TIMEOUT the first
-                           time, adaptive after that. */
-                        request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
         }
 
         DEBUG_REQ(D_RPCTRACE, request, "%sconnect request %d",