From 0f28f3284fed757eb3c826262ddadb49d32e5f5b Mon Sep 17 00:00:00 2001 From: johann <johann> Date: Mon, 8 Sep 2008 12:05:33 +0000 Subject: [PATCH] Branch b1_6 b=16972 i=rread i=nathan ptlrpc_at_recv_early_reply() should not modify req->rq_repmsg because it can be accessed by reply_in_callback() without the rq_lock held. --- lustre/ChangeLog | 14 +++++++++--- lustre/ptlrpc/client.c | 49 +++++++++++++++++------------------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 9ea69c8e49..f7a7f63485 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -597,12 +597,20 @@ Details : FIEMAP ioctl will allow an application to efficiently fetch the extent information of a file. It can be used to map logical blocks in a file to physical blocks in the block device. -Severity : minor -Bugzilla : 16717 -Description: LBUG when llog conf file is full +Severity : minor +Bugzilla : 16717 +Description: LBUG when llog conf file is full Details : When llog bitmap is full, ENOSPC should be returned for plain log. +Severity : normal +Frequency : only with adaptive timeout enabled +Bugzilla : 16972 +Description: DEBUG_REQ() bad paging request +Details : ptlrpc_at_recv_early_reply() should not modify req->rq_repmsg + because it can be accessed by reply_in_callback() without the + rq_lock held. + ------------------------------------------------------------------------------- diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 55cad7f705..885fab8464 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -231,18 +231,16 @@ void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); } -/* Adjust max service estimate based on server value */ -static void ptlrpc_at_adj_service(struct ptlrpc_request *req) +/* Adjust max service estimate based on server value. */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req, + unsigned int serv_est) { int idx; - unsigned int serv_est, oldse; - struct imp_at *at = &req->rq_import->imp_at; + unsigned int oldse; + struct imp_at *at; LASSERT(req->rq_import); - - /* service estimate is returned in the repmsg timeout field, - may be 0 on err */ - serv_est = lustre_msg_get_timeout(req->rq_repmsg); + at = &req->rq_import->imp_at; idx = import_at_get_index(req->rq_import, req->rq_request_portal); /* max service estimates are tracked on the server side, @@ -262,21 +260,21 @@ int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) } /* Adjust expected network latency */ -static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req) +static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + unsigned int service_time) { - unsigned int st, nl, oldnl; - struct imp_at *at = &req->rq_import->imp_at; + unsigned int nl, oldnl; + struct imp_at *at; time_t now = cfs_time_current_sec(); LASSERT(req->rq_import); - - st = lustre_msg_get_service_time(req->rq_repmsg); + at = &req->rq_import->imp_at; /* Network latency is total time less server processing time */ - nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/; - if (st > now - req->rq_sent + 3 /* bz16408 */) + nl = max_t(int, now - req->rq_sent - service_time, 0) + 1/*st rounding*/; + if (service_time > now - req->rq_sent + 3 /* bz16408 */) CWARN("Reported service time %u > total measured time %ld\n", - st, now - req->rq_sent); + service_time, now - req->rq_sent); oldnl = at_add(&at->iat_net_latency, nl); if (oldnl != 0) @@ -314,7 +312,7 @@ static int unpack_reply(struct ptlrpc_request *req) We can't risk the real reply coming in and changing rq_repmsg, so this fn must be called under the rq_lock */ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) { - struct lustre_msg *oldmsg, *msgcpy; + struct lustre_msg *msgcpy; time_t olddl; int oldlen, rc; ENTRY; @@ -339,8 +337,7 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) { /* Another reply might have changed the repmsg and replen while we dropped the lock; doesn't really matter, just use the latest. If it doesn't fit in oldlen, checksum will be wrong. */ - oldmsg = req->rq_repmsg; - memcpy(msgcpy, oldmsg, oldlen); + memcpy(msgcpy, req->rq_repmsg, oldlen); if (lustre_msg_get_cksum(msgcpy) != lustre_msg_calc_cksum(msgcpy)) { CDEBUG(D_ADAPTTO, "Early reply checksum mismatch, " @@ -349,13 +346,9 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) { GOTO(out, rc = -EINVAL); } - /* Our copied msg is valid, now we can adjust the timeouts without - worrying that a new reply will land on the copy. */ - req->rq_repmsg = msgcpy; - /* Expecting to increase the service time estimate here */ - ptlrpc_at_adj_service(req); - ptlrpc_at_adj_net_latency(req); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(msgcpy)); + ptlrpc_at_adj_net_latency(req, lustre_msg_get_service_time(msgcpy)); /* Adjust the local timeout for this req */ ptlrpc_at_set_req_timeout(req); @@ -371,8 +364,6 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) { req->rq_early_count, req->rq_deadline - cfs_time_current_sec(), req->rq_deadline - olddl); - req->rq_repmsg = oldmsg; - out: OBD_FREE(msgcpy, oldlen); RETURN(rc); @@ -858,8 +849,8 @@ static int after_reply(struct ptlrpc_request *req) timediff); OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val); - ptlrpc_at_adj_service(req); - ptlrpc_at_adj_net_latency(req); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, lustre_msg_get_service_time(req->rq_repmsg)); if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { -- GitLab