diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 8eb885071b4b50bfac915809af910234a7d9434d..3fde7b2f223736948791b71856f6c5f285b4a45f 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -191,6 +191,19 @@ void class_notify_import_observers(struct obd_import *imp, int event, void *event_arg); /* import.c */ +static inline unsigned int at_est2timeout(unsigned int val) +{ + /* pad the estimate with an arbitrary margin: val plus val/4 (integer shift) plus 5 sec */ + return (val + (val >> 2) + 5); +} + +static inline unsigned int at_timeout2est(unsigned int val) +{ + /* restore estimate value from timeout; integer division makes the result a multiple of 4, so a timeout built by at_est2timeout() maps back to its estimate rounded up to a multiple of 4; val must be non-zero */ + LASSERT(val); + return ((val - 1) / 5 * 4); +} + static inline void at_init(struct adaptive_timeout *at, int val, int flags) { memset(at, 0, sizeof(*at)); at->at_current = val; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 44b315971800e224c309ddabd71ae365f86e872b..e7397210d2ebd92b05b99c11dd4a66cc8530906e 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1329,11 +1329,16 @@ target_start_and_reset_recovery_timer(struct obd_device *obd, struct ptlrpc_request *req, int new_client) { - int req_timeout = OBD_RECOVERY_FACTOR * - lustre_msg_get_timeout(req->rq_reqmsg); + int req_timeout = lustre_msg_get_timeout(req->rq_reqmsg); + + /* teach server about old server's estimates, converting the request timeout back to an estimate first; NOTE(review): at_timeout2est() asserts a non-zero argument, confirm req_timeout cannot be 0 here */ + if (!new_client) + at_add(&req->rq_rqbd->rqbd_service->srv_at_estimate, + at_timeout2est(req_timeout)); check_and_start_recovery_timer(obd); + req_timeout *= OBD_RECOVERY_FACTOR; /* scaled only now so that at_timeout2est() above sees the unscaled timeout */ if (req_timeout > obd->obd_recovery_timeout && !new_client) reset_recovery_timer(obd, req_timeout, 0); } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 39d704929b293062caff27cf36c8f86f2dc3b157..de5f94bc74a50a112cfd226a85a5d1353ec2a875 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -207,8 +207,7 @@ void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) idx = import_at_get_index(req->rq_import, req->rq_request_portal); 
serv_est = at_get(&at->iat_service_estimate[idx]); - /* add an arbitrary minimum: 125% +5 sec */ - req->rq_timeout = serv_est + (serv_est >> 2) + 5; + req->rq_timeout = at_est2timeout(serv_est); /* We could get even fancier here, using history to predict increased loading... */ @@ -225,6 +224,10 @@ static void ptlrpc_at_adj_service(struct ptlrpc_request *req, unsigned int oldse; struct imp_at *at; + /* adjust the estimate only when the send state is FULL or CONNECTING, i.e. not in recovery */ + if (!(req->rq_send_state & (LUSTRE_IMP_FULL | LUSTRE_IMP_CONNECTING))) + return; + LASSERT(req->rq_import); at = &req->rq_import->imp_at; diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index c83c8772a725536acaff3fd66c81321b0345a4f0..d79ad5b5fbbad1b6d6e024b993371ca4edf82454 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -321,9 +321,11 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) req->rq_arrival_time.tv_sec, 1); if (!(flags & PTLRPC_REPLY_EARLY) && - (req->rq_type != PTL_RPC_MSG_ERR)) { - /* early replies and errors don't count toward our service - time estimate */ + (req->rq_type != PTL_RPC_MSG_ERR) && + !(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY | MSG_LAST_REPLAY))) { + /* early replies, errors and recovery requests don't count + * toward our service time estimate */ int oldse = at_add(&svc->srv_at_estimate, service_time); if (oldse != 0) DEBUG_REQ(D_ADAPTTO, req, diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 974f5269ca6c7741b597f28d944f0dd7390629da..05ff23d5fc95f2346764af2c55d90f50e5d6ff07 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -737,15 +737,22 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, RETURN(-ENOSYS); } - if (extra_time) { - /* Fake our processing time into the future to ask the - clients for some extra amount of time */ - extra_time += cfs_time_current_sec() - - req->rq_arrival_time.tv_sec; - at_add(&svc->srv_at_estimate, extra_time); + if (req->rq_export 
&& req->rq_export->exp_in_recovery) { + /* don't increase server estimates during recovery, and give + clients the full recovery time. */ + newdl = cfs_time_current_sec() + + req->rq_export->exp_obd->obd_recovery_timeout; /* deadline counted from now, not from request arrival */ + } else { + if (extra_time) { + /* Fake our processing time into the future to ask the + clients for some extra amount of time */ + extra_time += cfs_time_current_sec() - + req->rq_arrival_time.tv_sec; + at_add(&svc->srv_at_estimate, extra_time); + } + newdl = req->rq_arrival_time.tv_sec + + at_get(&svc->srv_at_estimate); } - - newdl = req->rq_arrival_time.tv_sec + at_get(&svc->srv_at_estimate); if (req->rq_deadline >= newdl) { /* We're not adding any time, no need to send an early reply (e.g. maybe at adaptive_max) */ diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index c0a9fe45eb6ae67232c31e28905d5162455dfb58..dc31f909adaea93caef2b5c5019593a7b55fe7d4 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -376,6 +376,33 @@ test_19() { # Bug 10991 - resend of open request does not fail assertion. } run_test 19 "resend of open request" +test_20() { #16389 + BEFORE=`date +%s` + replay_barrier $SINGLEMDS + touch $MOUNT1/a + touch $MOUNT2/b + umount $MOUNT2 + facet_failover $SINGLEMDS + df $MOUNT1 || return 1 + rm $MOUNT1/a + zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" + TIER1=$((`date +%s` - BEFORE)) + BEFORE=`date +%s` + replay_barrier $SINGLEMDS + touch $MOUNT1/a + touch $MOUNT2/b + umount $MOUNT2 + facet_failover $SINGLEMDS + df $MOUNT1 || return 1 + rm $MOUNT1/a + zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" + TIER2=$((`date +%s` - BEFORE)) + [ $TIER2 -ge $((TIER1 * 2)) ] && \ + error "recovery time is growing $TIER2 > $TIER1" + return 0 +} +run_test 20 "recovery time is not increasing" + equals_msg `basename $0`: test complete, cleaning up SLEEP=$((`date +%s` - $NOW)) [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP