From dd6ee968ae726b50d4cfc1a8630ba0d0ccd4eaa3 Mon Sep 17 00:00:00 2001 From: nathan <nathan> Date: Mon, 5 Nov 2007 19:30:38 +0000 Subject: [PATCH] b=13813 i=adilger i=johann Need to set the AT timeout estimate after the req portal is set. --- lustre/include/lustre_net.h | 1 + lustre/ldlm/ldlm_request.c | 2 +- lustre/mdc/mdc_reint.c | 3 ++- lustre/mdc/mdc_request.c | 4 ++-- lustre/osc/osc_create.c | 3 ++- lustre/osc/osc_request.c | 9 +++++++-- lustre/ost/ost_handler.c | 14 ++++++++++++-- lustre/ptlrpc/client.c | 3 +-- lustre/ptlrpc/pack_generic.c | 9 ++++----- lustre/ptlrpc/ptlrpc_module.c | 1 + lustre/ptlrpc/recov_thread.c | 3 ++- lustre/ptlrpc/service.c | 23 +++++++++++++---------- 12 files changed, 48 insertions(+), 27 deletions(-) diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 5c8d8597a3..6fbde895ef 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -694,6 +694,7 @@ void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); struct ptlrpc_request_pool *ptlrpc_init_rq_pool(int, int, void (*populate_pool)(struct ptlrpc_request_pool *, int)); +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count, int *lengths, char **bufs); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index fa8a5cd893..8b5c280282 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -918,9 +918,9 @@ int ldlm_cli_cancel_req(struct obd_export *exp, req->rq_no_resend = 1; req->rq_no_delay = 1; - /* XXX FIXME bug 249 */ req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); body = lustre_msg_buf(req->rq_reqmsg, DLM_LOCKREQ_OFF, sizeof(*body)); diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c index e983e85730..37f3a29936 100644 --- a/lustre/mdc/mdc_reint.c +++ b/lustre/mdc/mdc_reint.c @@ -136,7 +136,8 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *op_data, RETURN(-ENOMEM); if (iattr->ia_valid & ATTR_FROM_OPEN) { - req->rq_request_portal = MDS_SETATTR_PORTAL; //XXX FIXME bug 249 + req->rq_request_portal = MDS_SETATTR_PORTAL; + ptlrpc_at_set_req_timeout(req); rpc_lock = obd->u.cli.cl_setattr_lock; } else { rpc_lock = obd->u.cli.cl_rpc_lock; diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index fc3358eae8..7b6079dd5e 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -678,8 +678,8 @@ int mdc_close(struct obd_export *exp, struct obdo *oa, /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a * portal whose threads are not taking any DLM locks and are therefore * always progressing */ - /* XXX FIXME bug 249 */ req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); /* Ensure that this close's handle is fixed up during replay. */ LASSERT(och != NULL); @@ -787,8 +787,8 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *fid, __u64 offset, if (req == NULL) GOTO(out, rc = -ENOMEM); - /* XXX FIXME bug 249 */ req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); desc = ptlrpc_prep_bulk_imp(req, 1, BULK_PUT_SINK, MDS_BULK_PORTAL); if (desc == NULL) diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index 90e0019cad..71c8986437 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -160,7 +160,8 @@ static int oscc_internal_create(struct osc_creator *oscc) RETURN(-ENOMEM); } - request->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249 + request->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(request); body = lustre_msg_buf(request->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); spin_lock(&oscc->oscc_lock); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 9e6fec39c6..42b877d7e2 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -478,6 +478,7 @@ static int osc_punch(struct obd_export *exp, struct obd_info *oinfo, RETURN(-ENOMEM); req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa)); @@ -610,6 +611,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa, RETURN(-ENOMEM); req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); @@ -942,6 +944,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, RETURN (-ENOMEM); req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); if (opc == OST_WRITE) desc = ptlrpc_prep_bulk_imp (req, page_count, @@ -3048,7 +3051,8 @@ static int osc_statfs_async(struct obd_device *obd, struct obd_info *oinfo, RETURN(-ENOMEM); ptlrpc_req_set_repsize(req, 2, size); - req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249 + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); req->rq_interpret_reply = osc_statfs_interpret; CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); @@ -3079,7 +3083,8 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs, RETURN(-ENOMEM); ptlrpc_req_set_repsize(req, 2, size); - req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249 + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); rc = ptlrpc_queue_wait(req); if (rc) diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 056ed0d15f..69e12f9be1 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -812,6 +812,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) else rc = ptlrpc_start_bulk_transfer(desc); if (rc == 0) { + time_t start = cfs_time_current_sec(); do { long timeoutl = req->rq_deadline - cfs_time_current_sec(); @@ -830,7 +831,11 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) (req->rq_deadline > cfs_time_current_sec())); if (rc == -ETIMEDOUT) { - DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT"); + DEBUG_REQ(D_ERROR, req, + "timeout on bulk PUT after %ld%+lds", + req->rq_deadline - start, + cfs_time_current_sec() - + req->rq_deadline); ptlrpc_abort_bulk(desc); } else if (exp->exp_failed) { DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT"); @@ -1063,6 +1068,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) else rc = ptlrpc_start_bulk_transfer (desc); if (rc == 0) { + time_t start = cfs_time_current_sec(); do { long timeoutl = req->rq_deadline - cfs_time_current_sec(); @@ -1079,7 +1085,11 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) (req->rq_deadline > cfs_time_current_sec())); if (rc == -ETIMEDOUT) { - DEBUG_REQ(D_ERROR, req, "timeout on bulk GET"); + DEBUG_REQ(D_ERROR, req, + "timeout on bulk GET after %ld%+lds", + req->rq_deadline - start, + cfs_time_current_sec() - + req->rq_deadline); ptlrpc_abort_bulk(desc); } else if (desc->bd_export->exp_failed) { DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET"); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 2d30542a09..6fdc64b64c 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -190,7 +190,7 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) } /* Set server timelimit for this req */ -static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) { __u32 serv_est; int idx; @@ -532,7 +532,6 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode, request->rq_phase = RQ_PHASE_NEW; - /* XXX FIXME bug 249 */ request->rq_request_portal = imp->imp_client->cli_request_portal; request->rq_reply_portal = imp->imp_client->cli_reply_portal; diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index b7ce82b9d6..dc31d08a3e 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -2397,9 +2397,9 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, va_start(args, fmt); libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file, data->msg_fn, data->msg_line, fmt, args, - " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d " - "e %d to %d dl %ld ref %d fl "REQ_FLAGS_FMT"/%x/%x " - "rc %d/%d\n", + " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d/%d " + "lens %d/%d e %d to %d dl %ld ref %d " + "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n", req, req->rq_xid, req->rq_transno, req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1, req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) : @@ -2409,8 +2409,7 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, (char *)req->rq_import->imp_connection->c_remote_uuid.uuid : req->rq_export ? (char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "<?>", - (req->rq_import && req->rq_import->imp_client) ? - req->rq_import->imp_client->cli_request_portal : -1, + req->rq_request_portal, req->rq_reply_portal, req->rq_reqlen, req->rq_replen, req->rq_early_count, req->rq_timeout, req->rq_deadline, atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req), diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 4a46cb7f61..56e7228670 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -145,6 +145,7 @@ EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); EXPORT_SYMBOL(ptlrpc_init_rq_pool); EXPORT_SYMBOL(ptlrpc_free_rq_pool); EXPORT_SYMBOL(ptlrpc_prep_req_pool); +EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); EXPORT_SYMBOL(ptlrpc_prep_req); EXPORT_SYMBOL(ptlrpc_free_req); EXPORT_SYMBOL(ptlrpc_unregister_reply); diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index ea915c6536..9ec7074114 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -370,9 +370,10 @@ static int log_commit_thread(void *arg) break; } - /* XXX FIXME bug 249, 5515 */ + /* bug 5515 */ request->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; request->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(request); ptlrpc_req_set_repsize(request, 1, NULL); mutex_down(&llcd->llcd_ctxt->loc_sem); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index e05bdf6860..287076e72a 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -49,7 +49,7 @@ unsigned int at_history = 600; CFS_MODULE_PARM(at_history, "i", int, 0644, "Adaptive timeouts remember the slowest event that took place " "within this period (sec)"); -static int at_early_margin = 3; +static int at_early_margin = 5; CFS_MODULE_PARM(at_early_margin, "i", int, 0644, "How soon before an RPC deadline to send an early reply"); static int at_extra = 30; @@ -662,8 +662,9 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, RETURN(0); if (olddl < 0) { - CDEBUG(D_ADAPTTO, "x"LPU64": Already past deadline (%+lds), not" - " sending early reply\n", req->rq_xid, olddl); + CDEBUG(D_WARNING, "x"LPU64": Already past deadline (%+lds), not" + " sending early reply. Increase at_early_margin (%d)?\n", + req->rq_xid, olddl, at_early_margin); /* Return an error so we're not re-added to the timed list. */ RETURN(-ETIMEDOUT); } @@ -996,11 +997,12 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, /* Discard requests queued for longer than the deadline. The deadline is increased if we send an early reply. */ if (cfs_time_current_sec() > request->rq_deadline) { - CERROR("Dropping timed-out opc %d request from %s" - ": deadline %lds ago\n", - lustre_msg_get_opc(request->rq_reqmsg), - libcfs_id2str(request->rq_peer), - cfs_time_current_sec() - request->rq_deadline); + DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s" + ": deadline %ld%+lds ago\n", + libcfs_id2str(request->rq_peer), + request->rq_deadline - + request->rq_arrival_time.tv_sec, + cfs_time_current_sec() - request->rq_deadline); goto put_rpc_export; } @@ -1042,8 +1044,9 @@ put_conn: if (cfs_time_current_sec() > request->rq_deadline) { DEBUG_REQ(D_WARNING, request, "Request x"LPU64" took longer " - "than estimated (%+lds); client may timeout.", - request->rq_xid, + "than estimated (%ld%+lds); client may timeout.", + request->rq_xid, request->rq_deadline - + request->rq_arrival_time.tv_sec, cfs_time_current_sec() - request->rq_deadline); } -- GitLab