diff --git a/lustre/ChangeLog b/lustre/ChangeLog index a4dd44c688b3fa5781426c0a452eb7ff9ae540fe..3fb9d7dca794b6c1fa99208340747d79ccf9ecd8 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1842,6 +1842,17 @@ Details : A security feature, which is to prevent users from being able configuration management server (MGS). The functionality also allows to specify sets of clients for which the remapping does not apply. + +Severity : normal +Bugzilla : 16860 +Description: Excessive recovery window +Details : With AT enabled, the recovery window can be excessively long (6000+ + seconds). To address this problem, we no longer use + OBD_RECOVERY_FACTOR when extending the recovery window (the connect + timeout no longer depends on the service time; it is now set to + INITIAL_CONNECT_TIMEOUT) and clients report the old service + time via pb_service_time. + -------------------------------------------------------------------------------- 2007-08-10 Cluster File Systems, Inc. <info@clusterfs.com> diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index b594b855be1e59c8c3bc0f42504c6dd706c13ab9..4f6e83f88f862211c0a6912f7d1203aa43e9870b 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -200,9 +200,9 @@ static inline unsigned int at_est2timeout(unsigned int val) static inline unsigned int at_timeout2est(unsigned int val) { - /* restore estimate value from timeout */ + /* restore estimate value from timeout: e=4/5(t-5) */ LASSERT(val); - return ((val - 1) / 5 * 4); + return (max((val << 2) / 5, 5U) - 4); } static inline void at_init(struct adaptive_timeout *at, int val, int flags) { diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index bc49e4b836e6d9801ef559e26a2ca73889b75ae8..7ac4c939c4a1efe401340e5a3be504640b49cee5 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1361,18 +1361,27 @@ target_start_and_reset_recovery_timer(struct obd_device *obd, struct ptlrpc_request *req, int new_client) 
{ - int req_timeout = lustre_msg_get_timeout(req->rq_reqmsg); + int service_time = lustre_msg_get_service_time(req->rq_reqmsg); - /* teach server about old server's estimates */ - if (!new_client) + if (!new_client && service_time) + /* Teach server about old server's estimates, as first guess + * at how long new requests will take. */ at_add(&req->rq_rqbd->rqbd_service->srv_at_estimate, - at_timeout2est(req_timeout)); + service_time); check_and_start_recovery_timer(obd); - req_timeout *= OBD_RECOVERY_FACTOR; - if (req_timeout > obd->obd_recovery_timeout && !new_client) - reset_recovery_timer(obd, req_timeout, 0); + /* convert the service time to rpc timeout, + * reuse service_time to limit stack usage */ + service_time = at_est2timeout(service_time); + + /* We expect other clients to time out within service_time, then try + * to reconnect, then try the failover server. The max delay between + * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL */ + service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + + INITIAL_CONNECT_TIMEOUT); + if (service_time > obd->obd_recovery_timeout && !new_client) + reset_recovery_timer(obd, service_time, 0); } #ifdef __KERNEL__ @@ -1595,7 +1604,7 @@ static int handle_recovery_req(struct ptlrpc_thread *thread, if (!req_replay_done(req->rq_export) || !lock_replay_done(req->rq_export)) reset_recovery_timer(class_exp2obd(req->rq_export), - OBD_RECOVERY_FACTOR * AT_OFF ? obd_timeout : + AT_OFF ? 
obd_timeout : at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1); ptlrpc_free_clone(req); RETURN(0); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 38cd423fa758872db6be5266d1ecab43ddde0866..5021179d8f2c922499f47bb83e5ced8c492afa7b 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -655,6 +655,19 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) GOTO(out, rc); } + /* Report the rpc service time to the server so that it knows how long + * to wait for clients to join recovery */ + lustre_msg_set_service_time(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); + + /* The amount of time we give the server to process the connect req. + * import_select_connection will increase the net latency on + * repeated reconnect attempts to cover slow networks. + * We override/ignore the server rpc completion estimate here, + * which may be large if this is a reconnect attempt */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; + lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); + #ifndef __KERNEL__ lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT); #endif @@ -681,10 +694,6 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) spin_unlock(&imp->imp_lock); lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_INITIAL); - if (AT_OFF) - /* AT will use INITIAL_CONNECT_TIMEOUT the first - time, adaptive after that. */ - request->rq_timeout = INITIAL_CONNECT_TIMEOUT; } if (set_transno)