From 0f28f3284fed757eb3c826262ddadb49d32e5f5b Mon Sep 17 00:00:00 2001
From: johann <johann>
Date: Mon, 8 Sep 2008 12:05:33 +0000
Subject: [PATCH] Branch b1_6 b=16972 i=rread i=nathan

ptlrpc_at_recv_early_reply() should not modify req->rq_repmsg
because it can be accessed by reply_in_callback() without the
rq_lock held.
---
 lustre/ChangeLog       | 14 +++++++++---
 lustre/ptlrpc/client.c | 49 +++++++++++++++++-------------------------
 2 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 9ea69c8e49..f7a7f63485 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -597,12 +597,20 @@ Details    : FIEMAP ioctl will allow an application to efficiently fetch the
              extent information of a file. It can be used to map logical blocks
              in a file to physical blocks in the block device.
 
-Severity   : minor 
-Bugzilla   : 16717 
-Description: LBUG when llog conf file is full 
+Severity   : minor
+Bugzilla   : 16717
+Description: LBUG when llog conf file is full
 Details    : When llog bitmap is full, ENOSPC should be returned for plain
 	     log.
 
+Severity   : normal
+Frequency  : only with adaptive timeout enabled
+Bugzilla   : 16972
+Description: DEBUG_REQ() bad paging request
+Details    : ptlrpc_at_recv_early_reply() should not modify req->rq_repmsg
+	     because it can be accessed by reply_in_callback() without the
+	     rq_lock held.
+
 -------------------------------------------------------------------------------
 
 
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 55cad7f705..885fab8464 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -231,18 +231,16 @@ void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
         lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
 }
 
-/* Adjust max service estimate based on server value */
-static void ptlrpc_at_adj_service(struct ptlrpc_request *req)
+/* Adjust max service estimate based on server value. */
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+                                  unsigned int serv_est)
 {
         int idx;
-        unsigned int serv_est, oldse;
-        struct imp_at *at = &req->rq_import->imp_at;
+        unsigned int oldse;
+        struct imp_at *at;
 
         LASSERT(req->rq_import);
-
-        /* service estimate is returned in the repmsg timeout field,
-           may be 0 on err */
-        serv_est = lustre_msg_get_timeout(req->rq_repmsg);
+        at = &req->rq_import->imp_at;
 
         idx = import_at_get_index(req->rq_import, req->rq_request_portal);
         /* max service estimates are tracked on the server side,
@@ -262,21 +260,21 @@ int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
 }
 
 /* Adjust expected network latency */
-static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req)
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+                                      unsigned int service_time)
 {
-        unsigned int st, nl, oldnl;
-        struct imp_at *at = &req->rq_import->imp_at;
+        unsigned int nl, oldnl;
+        struct imp_at *at;
         time_t now = cfs_time_current_sec();
 
         LASSERT(req->rq_import);
-
-        st = lustre_msg_get_service_time(req->rq_repmsg);
+        at = &req->rq_import->imp_at;
 
         /* Network latency is total time less server processing time */
-        nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/;
-        if (st > now - req->rq_sent + 3 /* bz16408 */)
+        nl = max_t(int, now - req->rq_sent - service_time, 0) + 1/*st rounding*/;
+        if (service_time > now - req->rq_sent + 3 /* bz16408 */)
                 CWARN("Reported service time %u > total measured time %ld\n",
-                      st, now - req->rq_sent);
+                      service_time, now - req->rq_sent);
 
         oldnl = at_add(&at->iat_net_latency, nl);
         if (oldnl != 0)
@@ -314,7 +312,7 @@ static int unpack_reply(struct ptlrpc_request *req)
    We can't risk the real reply coming in and changing rq_repmsg,
    so this fn must be called under the rq_lock */
 static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
-        struct lustre_msg *oldmsg, *msgcpy;
+        struct lustre_msg *msgcpy;
         time_t olddl;
         int oldlen, rc;
         ENTRY;
@@ -339,8 +337,7 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
         /* Another reply might have changed the repmsg and replen while
            we dropped the lock; doesn't really matter, just use the latest.
            If it doesn't fit in oldlen, checksum will be wrong. */
-        oldmsg = req->rq_repmsg;
-        memcpy(msgcpy, oldmsg, oldlen);
+        memcpy(msgcpy, req->rq_repmsg, oldlen);
         if (lustre_msg_get_cksum(msgcpy) !=
             lustre_msg_calc_cksum(msgcpy)) {
                 CDEBUG(D_ADAPTTO, "Early reply checksum mismatch, "
@@ -349,13 +346,9 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
                 GOTO(out, rc = -EINVAL);
         }
 
-        /* Our copied msg is valid, now we can adjust the timeouts without
-           worrying that a new reply will land on the copy. */
-        req->rq_repmsg = msgcpy;
-
         /* Expecting to increase the service time estimate here */
-        ptlrpc_at_adj_service(req);
-        ptlrpc_at_adj_net_latency(req);
+        ptlrpc_at_adj_service(req, lustre_msg_get_timeout(msgcpy));
+        ptlrpc_at_adj_net_latency(req, lustre_msg_get_service_time(msgcpy));
 
         /* Adjust the local timeout for this req */
         ptlrpc_at_set_req_timeout(req);
@@ -371,8 +364,6 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
                   req->rq_early_count, req->rq_deadline -
                   cfs_time_current_sec(), req->rq_deadline - olddl);
 
-        req->rq_repmsg = oldmsg;
-
 out:
         OBD_FREE(msgcpy, oldlen);
         RETURN(rc);
@@ -858,8 +849,8 @@ static int after_reply(struct ptlrpc_request *req)
                                     timediff);
 
         OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val);
-        ptlrpc_at_adj_service(req);
-        ptlrpc_at_adj_net_latency(req);
+        ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+        ptlrpc_at_adj_net_latency(req, lustre_msg_get_service_time(req->rq_repmsg));
 
         if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
             lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
-- 
GitLab