diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 42dd2a8cd454ac12f0fcf4e3d2aacd79ec9c5db2..cef9318a15596c546179c5747383e5a53f5a8e39 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -12,14 +12,11 @@ /* Adaptive Timeout stuff */ #define D_ADAPTTO D_OTHER #define AT_BINS 4 /* "bin" means "N seconds of history" */ -#define AT_TIMEBASE_DEFAULT 600 /* remembered history (sec) (should be - evenly divisible by AT_BINS) */ #define AT_FLG_NOHIST 0x1 /* use last reported value only */ #define AT_FLG_MIN 0x2 /* use a minimum limit */ struct adaptive_timeout { time_t at_binstart; /* bin start time */ - time_t at_binlimit; /* bin time limit */ unsigned int at_hist[AT_BINS]; /* timeout history bins */ unsigned int at_flags; unsigned int at_current; /* current timeout value */ @@ -143,10 +140,8 @@ struct obd_import { }; /* import.c */ -static inline void at_init(struct adaptive_timeout *at, int val, int timebase, - int flags) { +static inline void at_init(struct adaptive_timeout *at, int val, int flags) { memset(at, 0, sizeof(*at)); - at->at_binlimit = timebase / AT_BINS; at->at_current = val; at->at_worst_ever = val; at->at_worst_time = cfs_time_current_sec(); @@ -159,7 +154,7 @@ static inline int at_get(struct adaptive_timeout *at) { return min(at->at_current, adaptive_timeout_max); return at->at_current; } -void at_add(struct adaptive_timeout *at, unsigned int val); +int at_add(struct adaptive_timeout *at, unsigned int val); int import_at_get_index(struct obd_import *imp, int portal); int import_at_get_ldlm(struct obd_import *imp); #define AT_OFF (adaptive_timeout_max == 0) diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index d1be77c78139dff770d353eb32a6fa38b63a70f3..3636c8444b1b5d8c93e524015e788280fca45b73 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -80,7 +80,7 @@ void target_abort_recovery(void *data); void target_cleanup_recovery(struct obd_device *obd); int target_queue_recovery_request(struct ptlrpc_request *req, struct obd_device *obd); -int target_queue_final_reply(struct ptlrpc_request *req, int rc); +int target_queue_last_replay_reply(struct ptlrpc_request *req, int rc); void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); /* client.c */ diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 3111f418248337588e0cfec95d6c66035bb65973..94ee5d22f6412163da1e89e08494588eeafac2d5 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -260,7 +260,7 @@ struct ptlrpc_request { __u64 rq_history_seq; /* history sequence # */ int rq_status; spinlock_t rq_lock; - /* client-side flags. */ + /* client-side flags are serialized by rq_lock */ unsigned int rq_intr:1, rq_replied:1, rq_err:1, rq_timedout:1, rq_resend:1, rq_restart:1, /* @@ -275,8 +275,8 @@ struct ptlrpc_request { rq_replay:1, rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1, - /* server-side: */ - rq_final:1; /* packed final reply */ + /* server-side flags */ + rq_packed_final:1; /* packed final reply */ enum rq_phase rq_phase; /* one of RQ_PHASE_* */ atomic_t rq_refcount; /* client-side refcount for SENT race, server-side refcounf for multiple replies */ @@ -313,7 +313,6 @@ struct ptlrpc_request { /* server-side... 
*/ struct timeval rq_arrival_time; /* request arrival time */ struct ptlrpc_reply_state *rq_reply_state; /* separated reply state */ - struct semaphore rq_rs_sem; /* one reply at a time */ struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer*/ #ifdef CRAY_XT3 __u32 rq_uid; /* peer uid, used in MDS only */ @@ -337,7 +336,9 @@ struct ptlrpc_request { /* client outgoing req */ time_t rq_sent; /* when request/reply sent (secs) */ - time_t rq_deadline; /* when request must finish */ + volatile time_t rq_deadline; /* when request must finish. volatile + so that servers' early reply updates to the deadline aren't + kept in per-cpu cache */ int rq_timeout; /* service time estimate (secs) */ /* Multi-rpc bits */ @@ -660,34 +661,12 @@ void ptlrpc_cleanup_client(struct obd_import *imp); struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); static inline int -ptlrpc_client_receiving_reply (struct ptlrpc_request *req) +ptlrpc_client_recv_or_unlink (struct ptlrpc_request *req) { int rc; spin_lock(&req->rq_lock); - rc = req->rq_receiving_reply; - spin_unlock(&req->rq_lock); - return (rc); -} - -static inline int -ptlrpc_client_must_unlink (struct ptlrpc_request *req) -{ - int rc; - - spin_lock(&req->rq_lock); - rc = req->rq_must_unlink; - spin_unlock(&req->rq_lock); - return (rc); -} - -static inline int -ptlrpc_client_replied (struct ptlrpc_request *req) -{ - int rc; - - spin_lock(&req->rq_lock); - rc = req->rq_replied; + rc = req->rq_receiving_reply || req->rq_must_unlink; spin_unlock(&req->rq_lock); return (rc); } @@ -873,14 +852,6 @@ static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) ptlrpc_rs_decref(req->rq_reply_state); req->rq_reply_state = NULL; req->rq_repmsg = NULL; - up(&req->rq_rs_sem); /* held since lustre_pack_reply */ -} - -/* Check if we already packed a normal (non-early) reply. - Single thread only! */ -static inline int lustre_packed_reply(struct ptlrpc_request *req) -{ - return req->rq_final; } static inline __u32 lustre_request_magic(struct ptlrpc_request *req) @@ -907,6 +878,7 @@ static inline void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, int *lens) { int size = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + req->rq_replen = size + lustre_msg_early_size(); if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) req->rq_reqmsg->lm_repsize = size; diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 2dfd54a7bd0172032db59c4799ac77d893f701ec..9c05f17836094148c887ff42803da4d9006a69f8 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -614,7 +614,7 @@ static inline void oti_init(struct obd_trans_info *oti, oti->oti_xid = req->rq_xid; - if (req->rq_reqmsg && req->rq_repmsg && req->rq_reply_state) + if ((req->rq_reqmsg != NULL) && (req->rq_repmsg != NULL)) oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); oti->oti_thread_id = req->rq_svc_thread ? 
req->rq_svc_thread->t_id : -1; oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index f1c189624adf58d4944cef9ec6bb76d4751988f2..b4bbad6cc02e7957396d3a4d8ca23a3ddece9867 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -40,6 +40,7 @@ extern unsigned int obd_timeout; /* seconds */ extern unsigned int ldlm_timeout; /* seconds */ extern unsigned int adaptive_timeout_min; /* seconds */ extern unsigned int adaptive_timeout_max; /* seconds */ +extern unsigned int adaptive_timeout_history; /* seconds */ extern unsigned int obd_sync_filter; extern unsigned int obd_max_dirty_pages; extern atomic_t obd_dirty_pages; @@ -71,7 +72,7 @@ extern unsigned int obd_alloc_fail_rate; #define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/2) #endif #define LND_TIMEOUT 50 /* LNET LND-level RPC timeout */ -#define FOREVER 300 /* Something taking this long is broken */ +#define LONG_UNLINK 300 /* Unlink should happen before now */ #define OBD_FAIL_MDS 0x100 diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 838e6b7f7ca63e6372b3071fe088439b38e4e6f0..47fe12dd7e70e1c5ad974008a42b2b91b5b878bc 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1329,32 +1329,34 @@ struct obd_device * target_req2obd(struct ptlrpc_request *req) return req->rq_export->exp_obd; } -int target_queue_final_reply(struct ptlrpc_request *req, int rc) +int target_queue_last_replay_reply(struct ptlrpc_request *req, int rc) { struct obd_device *obd = target_req2obd(req); struct ptlrpc_request *saved_req; struct lustre_msg *reqmsg; int recovery_done = 0; - LASSERT ((rc == 0) == lustre_packed_reply(req)); + LASSERT ((rc == 0) == req->rq_packed_final); - if (!lustre_packed_reply(req)) { + if (!req->rq_packed_final) { /* Just like ptlrpc_error, but without the sending. 
*/ rc = lustre_pack_reply(req, 1, NULL, NULL); if (rc) - CERROR("pack error %d\n", rc); + return rc; req->rq_type = PTL_RPC_MSG_ERR; } - LASSERT (!req->rq_reply_state->rs_difficult); + LASSERT(!req->rq_reply_state->rs_difficult); LASSERT(list_empty(&req->rq_list)); /* XXX a bit like the request-dup code in queue_recovery_request */ OBD_ALLOC(saved_req, sizeof *saved_req); if (!saved_req) - LBUG(); + return -ENOMEM; OBD_ALLOC(reqmsg, req->rq_reqlen); - if (!reqmsg) - LBUG(); + if (!reqmsg) { + OBD_FREE(saved_req, sizeof *req); + return -ENOMEM; + } *saved_req = *req; memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index e79914d20278948f352ec941c54a83ed0e6944fe..acc5b3267523324545b6c9d06b784b0cf7a4645c 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1018,7 +1018,7 @@ existing_lock: EXIT; out: req->rq_status = rc ?: err; /* return either error - bug 11190 */ - if (!lustre_packed_reply(req)) { + if (!req->rq_packed_final) { err = lustre_pack_reply(req, 1, NULL, NULL); if (rc == 0) rc = err; @@ -1348,7 +1348,7 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) { req->rq_status = rc; - if (!lustre_packed_reply(req)) { + if (!req->rq_packed_final) { rc = lustre_pack_reply(req, 1, NULL, NULL); if (rc) return rc; @@ -2008,7 +2008,7 @@ EXPORT_SYMBOL(target_queue_recovery_request); EXPORT_SYMBOL(target_handle_ping); EXPORT_SYMBOL(target_pack_pool_reply); EXPORT_SYMBOL(target_handle_disconnect); -EXPORT_SYMBOL(target_queue_final_reply); +EXPORT_SYMBOL(target_queue_last_replay_reply); /* l_lock.c */ EXPORT_SYMBOL(lock_res_and_lock); diff --git a/lustre/mdc/lproc_mdc.c b/lustre/mdc/lproc_mdc.c index cfba05b2263373d684b4c6cb29da423fbf59fba0..2ef3332fc4cebd8b9a1bd0c621f1b7625b47bc60 100644 --- a/lustre/mdc/lproc_mdc.c +++ b/lustre/mdc/lproc_mdc.c @@ -79,7 +79,7 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "mds_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 }, { "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight, mdc_wr_max_rpcs_in_flight, 0 }, - { "timeouts", lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 }, + { "timeouts", lprocfs_rd_timeouts, 0, 0 }, { 0 } }; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 4b235027821ac26c02e306c95512cf545ff7f8b5..35eac951c9ba96cf86e80327c3174223cd6c7b4b 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -1027,7 +1027,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); default: mds_exit_ucred(&uc, mds); - if (!lustre_packed_reply(req)) { + if (!req->rq_packed_final) { req->rq_status = rc; lustre_pack_reply(req, 1, NULL, NULL); } @@ -1077,7 +1077,7 @@ static int mds_getattr(struct ptlrpc_request *req, int offset) out_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); out_ucred: - if (!lustre_packed_reply(req)) { + if (!req->rq_packed_final) { req->rq_status = rc; lustre_pack_reply(req, 1, NULL, NULL); } @@ -1798,7 +1798,7 @@ int mds_handle(struct ptlrpc_request *req) if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) { if (obd && obd->obd_recovering) { DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply"); - return target_queue_final_reply(req, rc); + return target_queue_last_replay_reply(req, rc); } /* Lost a race with recovery; let the error path DTRT. 
*/ rc = req->rq_status = -ENOTCONN; diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 17e174b219ca63702a5b31083f0c813e9c385ba2..632d7e3e013b521d5b978fe0e7319ae3d6ad0247 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -65,8 +65,11 @@ unsigned int obd_dump_on_timeout; unsigned int obd_dump_on_eviction; unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +/* Covers the maximum expected network latency */ unsigned int adaptive_timeout_min = 10; /* seconds */ unsigned int adaptive_timeout_max = 600; /* seconds */ +/* We remember the slowest event that took place within history */ +unsigned int adaptive_timeout_history = 600; /* seconds */ unsigned int obd_max_dirty_pages = 256; atomic_t obd_dirty_pages; @@ -387,6 +390,7 @@ EXPORT_SYMBOL(obd_timeout); EXPORT_SYMBOL(ldlm_timeout); EXPORT_SYMBOL(adaptive_timeout_min); EXPORT_SYMBOL(adaptive_timeout_max); +EXPORT_SYMBOL(adaptive_timeout_history); EXPORT_SYMBOL(obd_max_dirty_pages); EXPORT_SYMBOL(obd_dirty_pages); EXPORT_SYMBOL(ptlrpc_put_connection_superhack); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 86cafe346fc1a76260635e9ab52bcd4b5b0f4edd..470cae73d966f7ac4f4ba98b6a8cb1fc332b4354 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -797,14 +797,13 @@ static void init_imp_at(struct imp_at *at) { Since we can't say for sure how slow a network might be, we use a user-defined max expected network latency. We will adapt to slow increases, but a sudden jump can still kill us. */ - at_init(&at->iat_net_latency, adaptive_timeout_min, AT_TIMEBASE_DEFAULT, - AT_FLG_MIN); + at_init(&at->iat_net_latency, adaptive_timeout_min, AT_FLG_MIN); for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { /* max service estimates are tracked on the server side, so don't use the AT history here, just use the last reported val. 
(But keep hist for proc histogram, worst_ever) */ at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, - AT_TIMEBASE_DEFAULT, AT_FLG_NOHIST); + AT_FLG_NOHIST); } at->iat_drain = 0; } diff --git a/lustre/obdclass/linux/linux-sysctl.c b/lustre/obdclass/linux/linux-sysctl.c index 89c0cb9b642c341770c19dcb781dc00f8ab64bbd..85cba69e50ee12b4c714eb3333324fa56038b3c4 100644 --- a/lustre/obdclass/linux/linux-sysctl.c +++ b/lustre/obdclass/linux/linux-sysctl.c @@ -64,6 +64,7 @@ enum { OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */ ADAPTIVE_MIN, /* Adaptive timeout lower limit */ ADAPTIVE_MAX, /* Adaptive timeout upper limit */ + ADAPTIVE_HISTORY, /* Adaptive timeout timebase */ }; int LL_PROC_PROTO(proc_fail_loc) @@ -213,6 +214,14 @@ static cfs_sysctl_table_t obd_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = ADAPTIVE_HISTORY, + .procname = "adaptive_history", + .data = &adaptive_timeout_history, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { 0 } }; diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index c6883e4f8cda0280a7366c5c300df0d8da1fbb53..77d405a6aba5cd44834115be197b926be0de4585 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -598,10 +598,6 @@ int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count, "adaptive timeouts off, using obd_timeout %u\n", obd_timeout); - rc += snprintf(page + rc, count - rc, - "%-10s : %ld sec\n", "timebase", - imp->imp_at.iat_net_latency.at_binlimit * AT_BINS); - now = cfs_time_current_sec(); /* Some network health info for kicks */ @@ -640,38 +636,6 @@ int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count, return rc; } -int lprocfs_wr_timeouts(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - struct obd_import *imp; - time_t bval; - int val, i, rc; - - LASSERT(obd != NULL); - LPROCFS_CLIMP_CHECK(obd); - imp = obd->u.cli.cl_import; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc < 0) - return rc; - if (val <= 0) - return -ERANGE; - - bval = max(1, val / AT_BINS); - spin_lock(&imp->imp_at.iat_net_latency.at_lock); - imp->imp_at.iat_net_latency.at_binlimit = bval; - spin_unlock(&imp->imp_at.iat_net_latency.at_lock); - for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { - spin_lock(&imp->imp_at.iat_service_estimate[i].at_lock); - imp->imp_at.iat_service_estimate[i].at_binlimit = bval; - spin_unlock(&imp->imp_at.iat_service_estimate[i].at_lock); - } - - LPROCFS_CLIMP_EXIT(obd); - return count; -} - static const char *obd_connect_names[] = { "read_only", "lov_index", @@ -1586,7 +1550,6 @@ EXPORT_SYMBOL(lprocfs_rd_num_exports); EXPORT_SYMBOL(lprocfs_rd_numrefs); EXPORT_SYMBOL(lprocfs_at_hist_helper); EXPORT_SYMBOL(lprocfs_rd_timeouts); -EXPORT_SYMBOL(lprocfs_wr_timeouts); EXPORT_SYMBOL(lprocfs_rd_blksize); EXPORT_SYMBOL(lprocfs_rd_kbytestotal); EXPORT_SYMBOL(lprocfs_rd_kbytesfree); diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 7a50e66a1171a9b2ec1f49338cd059924fa7ad83..878fa8e5e5e423c001e2b66dd0b8e430415d8b63 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -329,7 +329,7 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "prealloc_next_id", osc_rd_prealloc_next_id, 0, 0 }, { "prealloc_last_id", osc_rd_prealloc_last_id, 0, 0 }, { "checksums", osc_rd_checksum, osc_wr_checksum, 0 }, - { "timeouts", lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 }, + { 
"timeouts", lprocfs_rd_timeouts, 0, 0 }, { 0 } }; diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index f3659e164073f56ebb311a6ef2e8cf3be5e2b3fe..be608a6c6479e39116b50f4ca64126a82876c91a 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1643,7 +1643,7 @@ out: if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) { if (obd && obd->obd_recovering) { DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply"); - return target_queue_final_reply(req, rc); + return target_queue_last_replay_reply(req, rc); } /* Lost a race with recovery; let the error path DTRT. */ rc = req->rq_status = -ENOTCONN; diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 741dcc685d0ac93b2ecc4979c30ccfa159d2bbc6..e69a5f1c6a9e66e1f8485806f11967451da60cff 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -190,22 +190,25 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) } /* Set server timelimit for this req */ -static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) { +static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +{ __u32 serv_est; int idx; struct imp_at *at; + LASSERT(req->rq_import); - if (AT_OFF || - ((idx = import_at_get_index(req->rq_import, - req->rq_request_portal)) < 0)) { + if (AT_OFF) { /* non-AT settings */ req->rq_timeout = req->rq_import->imp_server_timeout ? obd_timeout / 2 : obd_timeout; - goto out; + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); + return; } at = &req->rq_import->imp_at; + idx = import_at_get_index(req->rq_import, + req->rq_request_portal); serv_est = at_get(&at->iat_service_estimate[idx]); /* add an arbitrary minimum: 125% +5 sec */ req->rq_timeout = serv_est + (serv_est >> 2) + 5; @@ -220,17 +223,18 @@ static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) { req->rq_timeout); } -out: /* Let the server know what this RPC timeout is by putting it in the reqmsg*/ lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); } /* Adjust max service estimate based on server value */ -static void ptlrpc_at_adj_service(struct ptlrpc_request *req) { +static void ptlrpc_at_adj_service(struct ptlrpc_request *req) +{ int idx; unsigned int serv_est, oldse; struct imp_at *at = &req->rq_import->imp_at; + LASSERT(req->rq_import); /* service estimate is returned in the repmsg timeout field, @@ -238,31 +242,29 @@ static void ptlrpc_at_adj_service(struct ptlrpc_request *req) { serv_est = lustre_msg_get_timeout(req->rq_repmsg); idx = import_at_get_index(req->rq_import, req->rq_request_portal); - if (idx < 0) - return; - - oldse = at_get(&at->iat_service_estimate[idx]); /* max service estimates are tracked on the server side, so just keep minimal history here */ - at_add(&at->iat_service_estimate[idx], serv_est); - - if (at_get(&at->iat_service_estimate[idx]) != oldse) + oldse = at_add(&at->iat_service_estimate[idx], serv_est); + if (oldse != 0) CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d " - "has changed %ds to %u\n", - req->rq_import->imp_obd->obd_name, req->rq_request_portal, - at_get(&at->iat_service_estimate[idx]) - oldse, serv_est); + "has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name,req->rq_request_portal, + oldse, at_get(&at->iat_service_estimate[idx])); } /* Expected network latency per remote node (secs) */ -int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) { +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) +{ return AT_OFF ? 
0 : at_get(&req->rq_import->imp_at.iat_net_latency); } /* Adjust expected network latency */ -static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req) { +static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req) +{ unsigned int st, nl, oldnl; struct imp_at *at = &req->rq_import->imp_at; time_t now = cfs_time_current_sec(); + LASSERT(req->rq_import); st = lustre_msg_get_service_time(req->rq_repmsg); @@ -273,19 +275,18 @@ static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req) { CERROR("Reported service time %u > total measured time %ld\n", st, now - req->rq_sent); - oldnl = at_get(&at->iat_net_latency); - at_add(&at->iat_net_latency, nl); - - if (at_get(&at->iat_net_latency) != oldnl) - CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s)" - " has changed %ds to %u\n", + oldnl = at_add(&at->iat_net_latency, nl); + if (oldnl != 0) + CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) " + "has changed from %d to %d\n", req->rq_import->imp_obd->obd_name, obd_uuid2str( &req->rq_import->imp_connection->c_remote_uuid), - at_get(&at->iat_net_latency) - oldnl, nl); + oldnl, at_get(&at->iat_net_latency)); } -static int unpack_reply(struct ptlrpc_request *req) { +static int unpack_reply(struct ptlrpc_request *req) +{ int rc; /* Clear reply swab mask; we may have already swabbed an early reply */ @@ -308,7 +309,7 @@ static int unpack_reply(struct ptlrpc_request *req) { /* Handle an early reply message. We can't risk the real reply coming in and changing rq_repmsg, so this fn must be called under the rq_lock */ -static int ptlrpc_at_early_reply(struct ptlrpc_request *req) { +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) { struct lustre_msg *oldmsg, *msgcpy; time_t olddl; int oldlen, rc; @@ -741,7 +742,7 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) GOTO(out, rc = 1); if (req->rq_early) { - ptlrpc_at_early_reply(req); + ptlrpc_at_recv_early_reply(req); GOTO(out, rc = 0); /* keep waiting */ } @@ -1049,21 +1050,27 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) force_timer_recalc = 1; } + spin_lock(&req->rq_lock); + /* Still waiting for a reply? */ - if (ptlrpc_client_receiving_reply(req)) + if (req->rq_receiving_reply) { + spin_unlock(&req->rq_lock); continue; + } - spin_lock(&req->rq_lock); if (req->rq_early) { - ptlrpc_at_early_reply(req); + ptlrpc_at_recv_early_reply(req); spin_unlock(&req->rq_lock); continue; } - spin_unlock(&req->rq_lock); /* Did we actually receive a reply? */ - if (!ptlrpc_client_replied(req)) + if (!req->rq_replied) { + spin_unlock(&req->rq_lock); continue; + } + + spin_unlock(&req->rq_lock); spin_lock(&imp->imp_lock); list_del_init(&req->rq_list); @@ -1291,7 +1298,7 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); /* request in-flight? 
*/ - if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting) || + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || (req->rq_phase == RQ_PHASE_BULK))) continue; @@ -1301,7 +1308,9 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) if (req->rq_deadline <= now) { /* actually expired already */ timeout = 1; /* ASAP */ break; - } else if (timeout == 0 || timeout > req->rq_deadline - now) { + } + + if ((timeout == 0) || (timeout > (req->rq_deadline - now))) { timeout = req->rq_deadline - now; } } @@ -1495,9 +1504,7 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request) ENTRY; LASSERT(!in_interrupt ()); /* might sleep */ - - if (!ptlrpc_client_receiving_reply(request) && - !ptlrpc_client_must_unlink(request)) + if (!ptlrpc_client_recv_or_unlink(request)) /* Nothing left to do */ return; @@ -1515,19 +1522,16 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request) for (;;) { /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL); - rc = l_wait_event (*wq, - !ptlrpc_client_receiving_reply(request) && - !ptlrpc_client_must_unlink(request), + lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL); + rc = l_wait_event (*wq, !ptlrpc_client_recv_or_unlink(request), &lwi); if (rc == 0) return; LASSERT (rc == -ETIMEDOUT); DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout " - "rvcng=%d unlnk=%d", - ptlrpc_client_receiving_reply(request), - ptlrpc_client_must_unlink(request)); + "rvcng=%d unlnk=%d", request->rq_receiving_reply, + request->rq_must_unlink); } EXIT; } @@ -1831,6 +1835,7 @@ restart: lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, libcfs_nid2str(imp->imp_connection->c_peer.nid), lustre_msg_get_opc(req->rq_reqmsg)); + spin_lock(&imp->imp_lock); list_del_init(&req->rq_list); spin_unlock(&imp->imp_lock); @@ -1884,33 +1889,25 @@ restart: out: if (req->rq_bulk != NULL) { - while(rc >= 0) { + if (rc >= 0) { /* success so far. Note that anything going wrong * with bulk now, is EXTREMELY strange, since the * server must have believed that the bulk * tranferred OK before she replied with success to * me. */ - timeoutl = req->rq_deadline - cfs_time_current_sec(); - timeout = (timeoutl <= 0) ? CFS_TICK : - cfs_time_seconds(timeoutl); - lwi = LWI_TIMEOUT(timeout, NULL, NULL); brc = l_wait_event(req->rq_reply_waitq, !ptlrpc_bulk_active(req->rq_bulk), &lwi); LASSERT(brc == 0 || brc == -ETIMEDOUT); - if ((brc == -ETIMEDOUT) && - (req->rq_deadline > cfs_time_current_sec())) - /* Wait again if we changed deadline */ - continue; if (brc != 0) { + LASSERT(brc == -ETIMEDOUT); DEBUG_REQ(D_ERROR, req, "bulk timed out"); rc = brc; } else if (!req->rq_bulk->bd_success) { DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); rc = -EIO; } - break; } if (rc < 0) ptlrpc_unregister_bulk (req); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 9df83cb20ef13aca34c84d74349b404f6293a812..d836c5b7b254a4a5ba346d0bc5cb018f10793f0b 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -88,6 +88,9 @@ void reply_in_callback(lnet_event_t *ev) LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); LASSERT(ev->md.start == req->rq_repbuf); LASSERT(ev->mlength <= req->rq_replen); + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + for adaptive timeouts' early reply. 
*/ + LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0); spin_lock(&req->rq_lock); @@ -102,12 +105,8 @@ void reply_in_callback(lnet_event_t *ev) goto out_wake; } - /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests - for adaptive timeouts' early reply. */ - LASSERT(ev->md.options & LNET_MD_MANAGE_REMOTE); - if ((ev->offset == 0) && - lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) { + ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) != 0)) { /* Early reply */ DEBUG_REQ(D_ADAPTTO, req, "Early reply received: mlen=%u offset=%d replen=%d " @@ -254,7 +253,6 @@ void request_in_callback(lnet_event_t *ev) req->rq_uid = ev->uid; #endif spin_lock_init(&req->rq_lock); - sema_init(&req->rq_rs_sem, 1); CFS_INIT_LIST_HEAD(&req->rq_timed_list); atomic_set(&req->rq_refcount, 1); if (ev->type == LNET_EVENT_PUT) diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 62c92a4862ffc47d5421b0202c72904572c4f194..a058f6b6cd8c5148859aeca8f08f76caca99eb11 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -213,7 +213,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp) timeout = (int)(last - cfs_time_current_sec()); if (timeout > 0) { lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(timeout), - HZ, NULL, NULL); + cfs_time_seconds(1), NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, (atomic_read(&imp->imp_inflight) == 0), &lwi); @@ -1050,14 +1050,15 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) if (ptlrpc_import_in_recovery(imp)) { struct l_wait_info lwi; cfs_duration_t timeout; - int idx; - if (AT_OFF || (idx = import_at_get_index(imp, - imp->imp_client->cli_request_portal)) < 0) + if (AT_OFF) { timeout = cfs_time_seconds(obd_timeout); - else + } else { + int idx = import_at_get_index(imp, + imp->imp_client->cli_request_portal); timeout = cfs_time_seconds( at_get(&imp->imp_at.iat_service_estimate[idx])); + } lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); rc = l_wait_event(imp->imp_recovery_waitq, @@ -1124,20 +1125,22 @@ void ptlrpc_import_setasync(struct obd_import *imp, int count) This gives us a max of the last binlimit*AT_BINS secs without the storage, but still smoothing out a return to normalcy from a slow response. (E.g. remember the maximum latency in each minute of the last 4 minutes.) 
*/ -void at_add(struct adaptive_timeout *at, unsigned int val) { - /*unsigned int old = at->at_current;*/ +int at_add(struct adaptive_timeout *at, unsigned int val) +{ + unsigned int old = at->at_current; time_t now = cfs_time_current_sec(); + time_t binlimit = max_t(time_t, adaptive_timeout_history / AT_BINS, 1); LASSERT(at); #if 0 - CDEBUG(D_INFO, "add %u to %p time=%lu tb=%lu v=%u (%u %u %u %u)\n", - val, at, now - at->at_binstart, at->at_binlimit, at->at_current, + CDEBUG(D_INFO, "add %u to %p time=%lu v=%u (%u %u %u %u)\n", + val, at, now - at->at_binstart, at->at_current, at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); #endif if (val == 0) /* 0's don't count, because we never want our timeout to drop to 0, and because 0 could mean an error */ - return; + return 0; spin_lock(&at->at_lock); @@ -1148,7 +1151,7 @@ void at_add(struct adaptive_timeout *at, unsigned int val) { at->at_worst_time = now; at->at_hist[0] = val; at->at_binstart = now; - } else if (now - at->at_binstart < at->at_binlimit ) { + } else if (now - at->at_binstart < binlimit ) { /* in bin 0 */ at->at_hist[0] = max(val, at->at_hist[0]); at->at_current = max(val, at->at_current); @@ -1156,7 +1159,7 @@ void at_add(struct adaptive_timeout *at, unsigned int val) { int i, shift; unsigned int maxv = val; /* move bins over */ - shift = (now - at->at_binstart) / at->at_binlimit; + shift = (now - at->at_binstart) / binlimit; LASSERT(shift > 0); for(i = AT_BINS - 1; i >= 0; i--) { if (i >= shift) { @@ -1168,7 +1171,7 @@ void at_add(struct adaptive_timeout *at, unsigned int val) { } at->at_hist[0] = val; at->at_current = maxv; - at->at_binstart += shift * at->at_binlimit; + at->at_binstart += shift * binlimit; } if ((at->at_flags & AT_FLG_MIN) && @@ -1193,11 +1196,17 @@ void at_add(struct adaptive_timeout *at, unsigned int val) { at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); #endif + + /* if we changed, report the old value */ + old = (at->at_current != old) ? old : 0; + spin_unlock(&at->at_lock); + return old; } /* Find the imp_at index for a given portal; assign if space available */ -int import_at_get_index(struct obd_import *imp, int portal) { +int import_at_get_index(struct obd_import *imp, int portal) +{ struct imp_at *at = &imp->imp_at; int i; @@ -1220,15 +1229,11 @@ int import_at_get_index(struct obd_import *imp, int portal) { /* unused */ break; } + + /* Not enough portals? */ + LASSERT(i < IMP_AT_MAX_PORTALS); - if (i >= IMP_AT_MAX_PORTALS) { - CERROR("Tried to use more than %d portals, not enough room " - "in adaptive timeout stats.\n", IMP_AT_MAX_PORTALS); - i = -1; - goto out; - } at->iat_portal[i] = portal; - out: spin_unlock(&imp->imp_lock); return i; @@ -1238,18 +1243,16 @@ out: Since any early reply will only affect the RPC wait time, and not any local lock timer we set based on the return value here, we should be conservative. 
*/ -int import_at_get_ldlm(struct obd_import *imp) { +int import_at_get_ldlm(struct obd_import *imp) +{ int idx, tot; if (!imp || !imp->imp_client || AT_OFF) return obd_timeout; - tot = at_get(&imp->imp_at.iat_net_latency); idx = import_at_get_index(imp, imp->imp_client->cli_request_portal); - if (idx < 0) - tot += obd_timeout; - else - tot += at_get(&imp->imp_at.iat_service_estimate[idx]); + tot = at_get(&imp->imp_at.iat_net_latency) + + at_get(&imp->imp_at.iat_service_estimate[idx]); /* add an arbitrary minimum: 150% + 10 sec */ tot += (tot >> 1) + 10; diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 392c3c73d1eb489a71de20de9f2e62aa2854fc05..15a7da27ba4cdb2761340f597b39e05408ae3ec1 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -406,9 +406,6 @@ static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off, rc += snprintf(page + rc, count - rc, "adaptive timeouts off, using obd_timeout %u\n", obd_timeout); - rc += snprintf(page + rc, count - rc, - "%10s : %ld sec\n", "timebase", - svc->srv_at_estimate.at_binlimit * AT_BINS); rc += snprintf(page + rc, count - rc, "%10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", "service", cur, worst, worstt, @@ -418,25 +415,6 @@ static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off, return rc; } -static int ptlrpc_lprocfs_wr_timeouts(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct ptlrpc_service *svc = data; - int val, rc; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc < 0) - return rc; - if (val <= 0) - return -ERANGE; - - spin_lock(&svc->srv_at_estimate.at_lock); - svc->srv_at_estimate.at_binlimit = max(1, val / AT_BINS); - spin_unlock(&svc->srv_at_estimate.at_lock); - - return count; -} - void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, struct ptlrpc_service *svc) { @@ -450,7 +428,6 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, .read_fptr = ptlrpc_lprocfs_read_req_history_max, .data = svc}, {.name = "timeouts", - .write_fptr = ptlrpc_lprocfs_wr_timeouts, .read_fptr = ptlrpc_lprocfs_rd_timeouts, .data = svc}, {NULL} diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 49bbcdef3b588725739b7f942ff72d0e224e0e1a..0f4e00187ed8da83ee6949c8c32fc4ee1a58b64a 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -349,14 +349,14 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int flags) req->rq_arrival_time.tv_sec, 1); if (!(flags & PTLRPC_REPLY_EARLY) && (req->rq_type != PTL_RPC_MSG_ERR)) { - int oldse = at_get(&svc->srv_at_estimate); /* early replies and errors don't count toward our service time estimate */ - at_add(&svc->srv_at_estimate, service_time); - if (service_time > oldse) + int oldse = at_add(&svc->srv_at_estimate, service_time); + if (oldse != 0) DEBUG_REQ(D_ADAPTTO, req, - "svc %s increased estimate from %d to %d", - svc->srv_name, oldse, service_time); + "svc %s changed estimate from %d to %d", + svc->srv_name, oldse, + at_get(&svc->srv_at_estimate)); } /* Report actual service time for client latency calc */ lustre_msg_set_service_time(req->rq_repmsg, service_time); diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index eea15f074e9de6801b1a1d0e3011f1ea5bc41e8d..29f4f36e333645b901cabe7fe335ef9385c8ca96 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -346,24 +346,20 @@ static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count, int size; ENTRY; - /* Insure only 
1 reply at a time: hold sem until ptlrpc_req_drop_rs */ - down(&req->rq_rs_sem); LASSERT(req->rq_reply_state == NULL); - if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) { + if (req->rq_packed_final) /* Already packed final, no more early */ - up(&req->rq_rs_sem); RETURN(-EALREADY); - } + if ((flags & LPRFL_EARLY_REPLY) == 0) + req->rq_packed_final = 1; msg_len = lustre_msg_size_v1(count, lens); size = sizeof(struct ptlrpc_reply_state) + msg_len; OBD_ALLOC(rs, size); if (unlikely(rs == NULL)) { rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size); - if (!rs) { - up(&req->rq_rs_sem); + if (!rs) RETURN (-ENOMEM); - } } atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ rs->rs_cb_id.cbid_fn = reply_out_callback; @@ -377,8 +373,6 @@ static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count, req->rq_replen = msg_len; req->rq_reply_state = rs; req->rq_repmsg = rs->rs_msg; - if (!flags) - req->rq_final = 1; /* checked in lustre_packed_reply */ lustre_init_msg_v1(rs->rs_msg, count, lens, bufs); @@ -395,24 +389,21 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, int size; ENTRY; - /* Insure only 1 reply at a time: hold sem until ptlrpc_req_drop_rs */ - down(&req->rq_rs_sem); LASSERT(req->rq_reply_state == NULL); - if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) { + if (req->rq_packed_final) { /* Already packed final, no more early */ - up(&req->rq_rs_sem); RETURN(-EALREADY); } + if ((flags & LPRFL_EARLY_REPLY) == 0) + req->rq_packed_final = 1; msg_len = lustre_msg_size_v2(count, lens); size = sizeof(struct ptlrpc_reply_state) + msg_len; OBD_ALLOC(rs, size); if (unlikely(rs == NULL)) { rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size); - if (!rs) { - up(&req->rq_rs_sem); + if (!rs) RETURN (-ENOMEM); - } } atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ rs->rs_cb_id.cbid_fn = reply_out_callback; @@ -426,8 +417,6 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, req->rq_replen = msg_len; req->rq_reply_state = rs; req->rq_repmsg = rs->rs_msg; - if (!(flags & LPRFL_EARLY_REPLY)) - req->rq_final = 1; /* checked in lustre_packed_reply */ /* server side, no rq_repbuf */ lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); @@ -1579,7 +1568,7 @@ __u32 lustre_msg_get_timeout(struct lustre_msg *msg) case LUSTRE_MSG_MAGIC_V2_SWABBED: { struct ptlrpc_body *pb; - pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); if (!pb) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; @@ -1603,7 +1592,7 @@ __u32 lustre_msg_get_service_time(struct lustre_msg *msg) case LUSTRE_MSG_MAGIC_V2_SWABBED: { struct ptlrpc_body *pb; - pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); if (!pb) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 8470fc7ec993f28853abb28cddd9812c6d63a118..2cf34b081ea349d1f7be3adbd179ccb9dc6678cd 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -38,6 +38,14 @@ int test_req_buffer_pressure = 0; CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444, "set non-zero to put pressure on request buffer pools"); +static int at_early_margin = 3; +CFS_MODULE_PARM(at_early_margin, "i", int, 0644, + "How far before the deadline we send an early reply"); + +static int at_extra = 10; +CFS_MODULE_PARM(at_extra, "i", int, 0644, + "How much extra time we give with 
an early reply"); + /* forward ref */ static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc); @@ -321,7 +329,7 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, cfs_timer_init(&service->srv_at_timer, ptlrpc_at_timer, service); /* At SOW, service time should be quick; 10s seems generous. If client timeout is less than this, we'll be sending an early reply. */ - at_init(&service->srv_at_estimate, 10, AT_TIMEBASE_DEFAULT, 0); + at_init(&service->srv_at_estimate, 10, 0); spin_lock (&ptlrpc_all_services_lock); list_add (&service->srv_list, &ptlrpc_all_services); @@ -352,7 +360,7 @@ failed: return NULL; } -static void ptlrpc_server_decref(struct ptlrpc_request *req) +static void ptlrpc_server_req_decref(struct ptlrpc_request *req) { struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; @@ -379,7 +387,7 @@ static void __ptlrpc_server_free_request(struct ptlrpc_request *req) { list_del(&req->rq_list); ptlrpc_req_drop_rs(req); - ptlrpc_server_decref(req); + ptlrpc_server_req_decref(req); } static void ptlrpc_server_free_request(struct ptlrpc_request *req) @@ -390,7 +398,8 @@ static void ptlrpc_server_free_request(struct ptlrpc_request *req) struct list_head *tmp; struct list_head *nxt; - DEBUG_REQ(D_INFO, req, "free req"); + if (req->rq_phase != RQ_PHASE_NEW) /* incorrect message magic */ + DEBUG_REQ(D_INFO, req, "free req"); spin_lock(&svc->srv_at_lock); list_del_init(&req->rq_timed_list); spin_unlock(&svc->srv_at_lock); @@ -552,12 +561,6 @@ static int ptlrpc_check_req(struct ptlrpc_request *req) return 0; } -/* If closest exipiration is within EARLY_MIN, send early replies to everybody - expiring within EARLY_MAX, asking for AT_EXTRA time */ -#define AT_EARLY_MIN 2 /* Min time needed to send an early reply */ -#define AT_EARLY_MAX 5 /* Dont send early replies if deadline is beyond */ -#define AT_EXTRA 10 /* Early replies add this time to client timeout */ - static void ptlrpc_at_set_timer(struct ptlrpc_service *svc) { struct ptlrpc_request *rq; @@ -573,13 +576,13 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service *svc) /* Set timer for closest deadline */ rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, rq_timed_list); - next = rq->rq_deadline - cfs_time_current_sec() - AT_EARLY_MIN; + next = rq->rq_deadline - cfs_time_current_sec() - at_early_margin; if (next <= 0) ptlrpc_at_timer((unsigned long)svc); else cfs_timer_arm(&svc->srv_at_timer, cfs_time_shift(next)); spin_unlock(&svc->srv_at_lock); - CDEBUG(D_ADAPTTO, "armed %s at %+lds\n", svc->srv_name, next); + CDEBUG(D_INFO, "armed %s at %+lds\n", svc->srv_name, next); } /* Add rpc to early reply check list */ @@ -629,6 +632,8 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, int extra_time) { struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; + struct ptlrpc_request *reqcopy; + struct lustre_msg *reqmsg; long deadline = req->rq_deadline - cfs_time_current_sec(); int rc; ENTRY; @@ -644,46 +649,67 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, if (AT_OFF) RETURN(0); + if (deadline < 0) { + CERROR("Already past deadline (%+lds), not sending early " + "reply\n", deadline); + /* Return an error so we're not re-added to the timed list. 
*/ + RETURN(-ETIMEDOUT); + } + if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) == 0) { CERROR("Wanted to ask client for more time, but no AT " "support\n"); RETURN(-ENOSYS); } - rc = lustre_pack_reply_flags(req, 1, NULL, NULL, LPRFL_EARLY_REPLY); + OBD_ALLOC(reqcopy, sizeof *reqcopy); + if (reqcopy == NULL) + RETURN(-ENOMEM); + OBD_ALLOC(reqmsg, req->rq_reqlen); + if (!reqmsg) { + OBD_FREE(reqcopy, sizeof *reqcopy); + RETURN(-ENOMEM); + } + + *reqcopy = *req; + /* We need the reqmsg for the magic */ + reqcopy->rq_reqmsg = reqmsg; + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + + rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); if (rc) - /* EALREADY means final reply was already packed */ - RETURN(rc); + GOTO(out, rc); if (extra_time) { /* Fake our processing time into the future to ask the clients for some extra amount of time */ extra_time += cfs_time_current_sec() - - req->rq_arrival_time.tv_sec; + reqcopy->rq_arrival_time.tv_sec; at_add(&svc->srv_at_estimate, extra_time); } - req->rq_early_count++; /* number sent, server side */ - rc = ptlrpc_send_reply(req, PTLRPC_REPLY_EARLY); + rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); if (!rc) { /* Adjust our own deadline to what we told the client */ req->rq_deadline = req->rq_arrival_time.tv_sec + at_get(&svc->srv_at_estimate); + req->rq_early_count++; /* number sent, server side */ } else { DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); } - /* Reset reply */ - req->rq_rep_swab_mask = 0; /* Free the (early) reply state from lustre_pack_reply. (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */ - ptlrpc_req_drop_rs(req); - + ptlrpc_req_drop_rs(reqcopy); +out: + OBD_FREE(reqmsg, req->rq_reqlen); + OBD_FREE(reqcopy, sizeof *reqcopy); RETURN(rc); } -/* Check if we need to send any early replies, and send them */ +/* Send early replies to everybody expiring within at_early_margin + asking for at_extra time */ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) { struct ptlrpc_request *rq, *n; @@ -711,7 +737,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, rq_timed_list); first = (int)(rq->rq_deadline - now); - if (first > AT_EARLY_MIN) { + if (first > at_early_margin) { /* We've still got plenty of time. Reset the timer. */ spin_unlock(&svc->srv_at_lock); ptlrpc_at_set_timer(svc); @@ -722,7 +748,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) server will take. Send early replies to everyone expiring soon. 
*/ CFS_INIT_LIST_HEAD(&work_list); list_for_each_entry_safe(rq, n, &svc->srv_at_list, rq_timed_list) { - if (rq->rq_deadline <= now + AT_EARLY_MAX) { + if (rq->rq_deadline <= now + at_early_margin) { list_move(&rq->rq_timed_list, &work_list); counter++; } else { @@ -733,7 +759,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) spin_unlock(&svc->srv_at_lock); CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early " - "replies\n", first, AT_EXTRA, counter); + "replies\n", first, at_extra, counter); if (first < 0) /* We're already past request deadlines before we even get a chance to send early replies */ @@ -752,15 +778,12 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) deleted, and is safe to take a ref to keep the req around */ atomic_inc(&rq->rq_refcount); spin_unlock(&svc->srv_at_lock); - if (!rq->rq_final && - (ptlrpc_at_send_early_reply(rq, AT_EXTRA) == 0)) { + if (!rq->rq_packed_final && + (ptlrpc_at_send_early_reply(rq, at_extra) == 0)) { counter++; ptlrpc_at_add_timed(rq); - } else if (rq->rq_final) { - DEBUG_REQ(D_ADAPTTO, rq, "already packed final reply, " - "not sending early"); } - ptlrpc_server_decref(rq); + ptlrpc_server_req_decref(rq); spin_lock(&svc->srv_at_lock); } spin_unlock(&svc->srv_at_lock); @@ -773,7 +796,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) RETURN(0); } -/* Handle freshly incoming reqs, check timeout, send early reply if needed, +/* Handle freshly incoming reqs, add to timed early reply list, pass on to regular request queue */ static int ptlrpc_server_handle_req_in(struct ptlrpc_service *svc) @@ -932,6 +955,17 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, export = class_export_rpc_get(request->rq_export); } + /* Discard requests queued for longer than the deadline. + The deadline is increased if we send an early reply. 
*/ + if (cfs_time_current_sec() > request->rq_deadline) { + CERROR("Dropping timed-out opc %d request from %s" + ": deadline %lds ago\n", + lustre_msg_get_opc(request->rq_reqmsg), + libcfs_id2str(request->rq_peer), + cfs_time_current_sec() - request->rq_deadline); + goto put_rpc_export; + } + request->rq_phase = RQ_PHASE_INTERPRET; CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc " @@ -960,6 +994,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, libcfs_id2str(request->rq_peer), lustre_msg_get_opc(request->rq_reqmsg)); +put_rpc_export: if (export != NULL) class_export_rpc_put(export); @@ -1197,7 +1232,7 @@ static int ptlrpc_main(void *arg) #ifdef WITH_GROUP_INFO struct group_info *ginfo = NULL; #endif - int counter, rc = 0; + int counter = 0, rc = 0; ENTRY; ptlrpc_daemonize(data->name); @@ -1305,13 +1340,13 @@ static int ptlrpc_main(void *arg) if (!list_empty(&svc->srv_reply_queue)) ptlrpc_server_handle_reply (svc); - counter = 0; - while(!list_empty(&svc->srv_req_in_queue)) { + if (!list_empty(&svc->srv_req_in_queue)) { /* Process all incoming reqs before handling any */ ptlrpc_server_handle_req_in(svc); /* but limit ourselves in case of flood */ - if (counter++ > 1000) - break; + if (counter++ < 1000) + continue; + counter = 0; } if (svc->srv_at_check) @@ -1524,7 +1559,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL); + lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL); rc = l_wait_event(service->srv_waitq, service->srv_nrqbd_receiving == 0, &lwi); diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 5ddab2391718a6a48c5f01d806292d0a2a8edad7..f7e260492d2e84d30daeec37429d4f475221eb0e 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -1177,16 +1177,14 @@ test_61c() { } run_test 61c "test race mds llog sync vs llog cleanup" - +#Adaptive Timeouts at_start() #bug 3055 { if [ -z "$ATOLDBASE" ]; then - ATOLDBASE=$(do_facet mds "grep timebase $LPROC/mdt/MDS/mds/timeouts" | awk '{print $3}' ) + ATOLDBASE=$(do_facet mds "sysctl -n lustre.adaptive_history") # speed up the timebase so we can check decreasing AT - do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds/timeouts" - do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_readpage/timeouts" - do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_setattr/timeouts" - do_facet ost1 "echo 8 >> $LPROC/ost/OSS/ost/timeouts" + do_facet mds "sysctl -w lustre.adaptive_history=8" + do_facet ost1 "sysctl -w lustre.adaptive_history=8" fi } @@ -1195,9 +1193,9 @@ test_65() #bug 3055 at_start $LCTL dk > /dev/null # slow down a request - sysctl -w lustre.fail_val=30000 + do_facet mds sysctl -w lustre.fail_val=30000 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a - sysctl -w lustre.fail_loc=0x8000050a + do_facet mds sysctl -w lustre.fail_loc=0x8000050a createmany -o $DIR/$tfile 10 > /dev/null unlinkmany $DIR/$tfile 10 > /dev/null # check for log message @@ -1301,10 +1299,8 @@ test_67b() #bug 3055 run_test 67b "AT: verify instant slowdown doesn't induce reconnects" if [ -n "$ATOLDBASE" ]; then - do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds/timeouts" - do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_readpage/timeouts" - do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_setattr/timeouts" - do_facet ost1 "echo $ATOLDBASE >> $LPROC/ost/OSS/ost/timeouts" + do_facet mds "sysctl -w 
lustre.adaptive_history=$ATOLDBASE"
+    do_facet ost1 "sysctl -w lustre.adaptive_history=$ATOLDBASE"
 fi
 # end of AT tests includes above lines
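
For reference, a minimal standalone sketch of the binned sliding-maximum that the reworked at_add() above implements: history is kept in AT_BINS bins whose width is adaptive_timeout_history / AT_BINS seconds (the new global tunable replacing the removed per-structure at_binlimit), and the current estimate is the maximum over all bins; at_add() now returns the previous estimate when the value changes so callers can log the transition. The at_sketch names and the main() driver are illustrative only and not part of Lustre.

#include <stdio.h>
#include <time.h>

#define AT_BINS 4
static unsigned int adaptive_timeout_history = 600;    /* seconds, tunable */

struct at_sketch {
        time_t       binstart;          /* start time of bin 0 */
        unsigned int hist[AT_BINS];     /* per-bin maxima, newest first */
        unsigned int current;           /* max over all bins */
};

/* Returns the previous estimate if it changed, 0 otherwise (mirrors the
 * new int return of at_add()). */
static unsigned int at_sketch_add(struct at_sketch *at, unsigned int val,
                                  time_t now)
{
        time_t binlimit = adaptive_timeout_history / AT_BINS;
        unsigned int old = at->current;
        int i, shift;

        if (binlimit < 1)
                binlimit = 1;
        if (val == 0)                   /* zeros never count */
                return 0;

        if (at->binstart == 0) {
                /* first measurement ever */
                at->hist[0] = val;
                at->current = val;
                at->binstart = now;
        } else if (now - at->binstart < binlimit) {
                /* still inside bin 0: just track the max */
                at->hist[0] = val > at->hist[0] ? val : at->hist[0];
                at->current = val > at->current ? val : at->current;
        } else {
                /* time moved past bin 0: shift bins over, recompute the max */
                unsigned int maxv = val;

                shift = (now - at->binstart) / binlimit;
                for (i = AT_BINS - 1; i >= 0; i--) {
                        if (i >= shift) {
                                at->hist[i] = at->hist[i - shift];
                                if (at->hist[i] > maxv)
                                        maxv = at->hist[i];
                        } else {
                                at->hist[i] = 0;
                        }
                }
                at->hist[0] = val;
                at->current = maxv;
                at->binstart += shift * binlimit;
        }

        return at->current != old ? old : 0;
}

int main(void)
{
        struct at_sketch at = { 0 };
        time_t t = time(NULL);

        /* A slow RPC raises the estimate; it ages out once
         * adaptive_timeout_history seconds have passed. */
        at_sketch_add(&at, 30, t);
        at_sketch_add(&at, 5, t + 100);
        printf("estimate now: %u\n", at.current);                    /* 30 */
        at_sketch_add(&at, 5, t + adaptive_timeout_history + 1);
        printf("estimate after history expires: %u\n", at.current);  /*  5 */
        return 0;
}

On the client side the patch turns this estimate into rq_timeout as serv_est + (serv_est >> 2) + 5 (roughly 125% plus 5 seconds) in ptlrpc_at_set_req_timeout(), and tracks network latency separately (ptlrpc_at_get_net_latency) so that import_at_get_ldlm() can combine both with its own 150% + 10 second margin.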
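The server-side early-reply path added in service.c can be pictured with a second standalone sketch (again with illustrative names, assuming the at_early_margin/at_extra module-parameter defaults from the patch): when a queued request's deadline comes within at_early_margin seconds, the service sends an early reply asking for at_extra more seconds, folds the elapsed time plus at_extra into its service estimate via at_add(), and moves its own deadline to the arrival time plus the new estimate.

#include <stdio.h>
#include <time.h>

static int at_early_margin = 3;   /* module parameter in service.c */
static int at_extra = 10;         /* module parameter in service.c */

struct req_sketch {
        time_t arrival;
        time_t deadline;
        int early_count;
};

/* Returns 1 if an early reply should be sent now. */
static int needs_early_reply(const struct req_sketch *req, time_t now)
{
        return req->deadline - now <= at_early_margin;
}

/* Mimics ptlrpc_at_send_early_reply(): grow the service estimate by
 * (now - arrival) + at_extra and push the deadline out accordingly. */
static void send_early_reply(struct req_sketch *req, unsigned int *svc_est,
                             time_t now)
{
        unsigned int asked = (unsigned int)(now - req->arrival) + at_extra;

        if (asked > *svc_est)
                *svc_est = asked;               /* at_add() keeps the max */
        req->deadline = req->arrival + *svc_est;
        req->early_count++;
}

int main(void)
{
        unsigned int svc_est = 10;              /* initial estimate, secs */
        time_t now = time(NULL);
        struct req_sketch req = { now, now + svc_est, 0 };

        now += 8;                               /* 2s left: within margin */
        if (needs_early_reply(&req, now))
                send_early_reply(&req, &svc_est, now);

        printf("estimate %u, deadline is now arrival+%lds, early replies %d\n",
               svc_est, (long)(req.deadline - req.arrival), req.early_count);
        return 0;
}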