diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 0ab317f0c273be15aa8dcdfe853c4810cb4d53a7..662658380d967e9f309b7818ea82bd26441e71fd 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -271,7 +271,7 @@ static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, oldnl, at_get(&at->iat_net_latency)); } -static int unpack_reply(struct ptlrpc_request *req) +static int unpack_reply_common(struct ptlrpc_request *req) { int rc; @@ -285,6 +285,17 @@ static int unpack_reply(struct ptlrpc_request *req) if (rc > 0) lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + return rc; +} + +static int unpack_reply(struct ptlrpc_request *req) +{ + int rc; + + rc = unpack_reply_common(req); + if (rc < 0) + return rc; + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); if (rc) { DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); @@ -293,53 +304,103 @@ static int unpack_reply(struct ptlrpc_request *req) return 0; } -/* Handle an early reply message. - We can't risk the real reply coming in and changing rq_repmsg, - so this fn must be called under the rq_lock */ -static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) { +static inline void unpack_reply_free_msg(struct lustre_msg *msg, int len) +{ + OBD_FREE(msg, len); +} + +static int unpack_reply_copy_msg(struct ptlrpc_request *req, + struct lustre_msg **msg, int *len) +{ struct lustre_msg *msgcpy; - time_t olddl; - int oldlen, rc; + __u32 csum_calc, csum_get; + int lencpy, rc; ENTRY; - req->rq_early = 0; + LASSERT_SPIN_LOCKED(&req->rq_lock); + *msg = NULL; + *len = 0; - rc = unpack_reply(req); - if (rc) - /* Let's just ignore it - same as if it never got here */ + /* Swabbing required when rc == 1 */ + rc = unpack_reply_common(req); + if (rc < 0) RETURN(rc); - /* We've got to make sure another early reply doesn't land on - top of our current repbuf. Make a copy and verify checksum. */ - oldlen = req->rq_replen; + lencpy = req->rq_replen; spin_unlock(&req->rq_lock); - OBD_ALLOC(msgcpy, oldlen); + + OBD_ALLOC(msgcpy, lencpy); if (!msgcpy) { spin_lock(&req->rq_lock); RETURN(-ENOMEM); } spin_lock(&req->rq_lock); - /* Another reply might have changed the repmsg and replen while - we dropped the lock; doesn't really matter, just use the latest. - If it doesn't fit in oldlen, checksum will be wrong. */ - memcpy(msgcpy, req->rq_repmsg, oldlen); - if (lustre_msg_get_cksum(msgcpy) != - lustre_msg_calc_cksum(msgcpy)) { - CDEBUG(D_ADAPTTO, "Early reply checksum mismatch, " - "discarding %x != %x\n", lustre_msg_get_cksum(msgcpy), - lustre_msg_calc_cksum(msgcpy)); - GOTO(out, rc = -EINVAL); + + /* Checksum must be calculated before being unswabbed. If the magic + * in the copy is unswabbed discard like the checksum failure case */ + memcpy(msgcpy, req->rq_repmsg, lencpy); + if (lustre_msg_need_swab(msgcpy)) { + DEBUG_REQ(D_NET, req, "incorrect message magic: %08x\n", + msgcpy->lm_magic); + GOTO(err, rc = -EINVAL); + } + + csum_calc = lustre_msg_calc_cksum(msgcpy); + + /* Unpack the copy the original rq_repmsg is untouched */ + rc = lustre_unpack_msg_ptlrpc_body(msgcpy, MSG_PTLRPC_BODY_OFF, rc); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack msg copy failed: %d", rc); + GOTO(err, rc = -EPROTO); + } + + /* For early replies the LND may update repmsg outside req->rq_lock + * resulting in a checksum failure which is non-harmful */ + csum_get = lustre_msg_get_cksum(msgcpy); + if (csum_calc != csum_get) { + DEBUG_REQ(D_NET, req, "checksum mismatch: %x != %x\n", + csum_calc, csum_get); + GOTO(err, rc = -EINVAL); + } + + *msg = msgcpy; + *len = lencpy; + return 0; +err: + unpack_reply_free_msg(msgcpy, lencpy); + return rc; +} + +/* Handle an early reply message. To prevent a real reply from arriving + * and changing req->rq_repmsg this func is called under the rq_lock */ +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) { + struct lustre_msg *msg; + time_t olddl; + int len, rc; + ENTRY; + + LASSERT_SPIN_LOCKED(&req->rq_lock); + req->rq_early = 0; + + /* All early replys for this request use a single repbuf which can + * be updated outside the req->rq_lock. To prevent racing we create + * a copy of the repmsg and verify its checksum before it is used. */ + rc = unpack_reply_copy_msg(req, &msg, &len); + if (rc) { + /* Let's just ignore it - same as if it never got here */ + CDEBUG(D_ADAPTTO, "Discarding racing early reply: %d\n", rc); + RETURN(rc); } /* Expecting to increase the service time estimate here */ - ptlrpc_at_adj_service(req, lustre_msg_get_timeout(msgcpy)); - ptlrpc_at_adj_net_latency(req, lustre_msg_get_service_time(msgcpy)); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(msg)); + ptlrpc_at_adj_net_latency(req, lustre_msg_get_service_time(msg)); /* Adjust the local timeout for this req */ ptlrpc_at_set_req_timeout(req); olddl = req->rq_deadline; - /* server assumes it now has rq_timeout from when it sent the + /* Server assumes it now has rq_timeout from when it sent the early reply, so client should give it at least that long. */ req->rq_deadline = cfs_time_current_sec() + req->rq_timeout + ptlrpc_at_get_net_latency(req); @@ -349,8 +410,7 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) { req->rq_early_count, req->rq_deadline - cfs_time_current_sec(), req->rq_deadline - olddl); -out: - OBD_FREE(msgcpy, oldlen); + unpack_reply_free_msg(msg, len); RETURN(rc); } diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 14b0e3a2d10911d9077065b11d305e284757a8ee..0ce50b4d392c142b980cacbc30a2f9319d127943 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -70,7 +70,7 @@ static inline int lustre_msg_hdr_size_v2(int count) return size_round(offsetof(struct lustre_msg_v2, lm_buflens[count])); } -static int lustre_msg_need_swab(struct lustre_msg *msg) +int lustre_msg_need_swab(struct lustre_msg *msg) { return (msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED) || (msg->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); @@ -899,6 +899,20 @@ static inline int lustre_unpack_ptlrpc_body_v2(struct lustre_msg_v2 *m, return 0; } +int lustre_unpack_msg_ptlrpc_body(struct lustre_msg *msg, + int offset, int swab_needed) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return 0; + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(msg, offset, swab_needed); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return -EINVAL; + } +} + int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) { switch (req->rq_reqmsg->lm_magic) { diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 439a84a63b25eef0b5b25961f54fa6ef60da7881..a10f4b06f025bf1f215e911ac0663c4368f7d638 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -56,6 +56,8 @@ void ptlrpc_handle_failed_import(struct obd_import *imp); int ptlrpc_replay_next(struct obd_import *imp, int *inflight); void ptlrpc_initiate_recovery(struct obd_import *imp); +int lustre_msg_need_swab(struct lustre_msg *msg); +int lustre_unpack_msg_ptlrpc_body(struct lustre_msg *msg, int offset, int swab); int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset);