diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 7e1e6cd0106ea2864dfd2a1bf32aa6f52d560433..affe53fc6012d24b2c4d2678fadcbd5fea412636 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1063,7 +1063,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) if (ptlrpc_client_recv_or_unlink(req) || ptlrpc_client_bulk_active(req)) continue; - + /* Turn repl fail_loc off to prevent it from looping * forever. */ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { @@ -1091,7 +1091,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) /* Note that this also will start async reply unlink */ if (req->rq_net_err && !req->rq_timedout) { ptlrpc_expire_one_request(req, 1); - + /* Check if we still need to wait for unlink. */ if (ptlrpc_client_recv_or_unlink(req) || ptlrpc_client_bulk_active(req)) @@ -1124,8 +1124,11 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) continue; spin_lock(&imp->imp_lock); - if (ptlrpc_import_delay_req(imp, req, &status)){ + /* put on delay list - only if we wait + * recovery finished - before send */ + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); spin_unlock(&imp->imp_lock); continue; } @@ -1157,8 +1160,6 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) /* This is re-sending anyways, * let's mark req as resend. */ req->rq_resend = 1; - lustre_msg_add_flags(req->rq_reqmsg, - MSG_RESENT); if (req->rq_bulk) { __u64 old_xid; @@ -1209,17 +1210,8 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) spin_unlock(&req->rq_lock); req->rq_status = after_reply(req); - if (req->rq_resend) { - /* Add this req to the delayed list so - it can be errored if the import is - evicted after recovery. 
*/ - spin_lock(&imp->imp_lock); - list_del_init(&req->rq_list); - list_add_tail(&req->rq_list, - &imp->imp_delayed_list); - spin_unlock(&imp->imp_lock); + if (req->rq_resend) continue; - } /* If there is no bulk associated with this request, * then we're done and should let the interpreter @@ -1967,8 +1959,6 @@ restart: } if (req->rq_resend) { - lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); - if (req->rq_bulk != NULL) { ptlrpc_unregister_bulk(req, 0); @@ -2046,18 +2036,18 @@ restart: } /* Resend if we need to */ - if (req->rq_resend) { + if (req->rq_resend||req->rq_timedout) { /* ...unless we were specifically told otherwise. */ if (req->rq_no_resend) GOTO(out, rc = -ETIMEDOUT); spin_lock(&imp->imp_lock); + /* we can get rq_timedout on a dlm fake import, which does not support + * recovery - but we need to resend the request on this import instead + * of returning an error */ + req->rq_resend = 1; goto restart; } - if (req->rq_timedout) { /* non-recoverable timeout */ - GOTO(out, rc = -ETIMEDOUT); - } - if (!ptlrpc_client_replied(req)) { /* How can this be? 
-eeb */ DEBUG_REQ(D_ERROR, req, "!rq_replied: "); diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 613ef6f8060f945f80449c276dc80545073db254..7bb6103c42ddf7f0aa2120dc5dc7f2663612755d 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -523,6 +523,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) lustre_msghdr_set_flags(request->rq_reqmsg, request->rq_import->imp_msghdr_flags); + if (request->rq_resend) + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + if (!noreply) { LASSERT (request->rq_replen != 0); if (request->rq_repbuf == NULL) diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index 20ad67402ef0a613028632c3c5c03166bb9c30b7..5c12b63be81b3c3bd32af2fef0d361d669888249 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -223,7 +223,7 @@ static int llcd_send(struct llog_canceld_ctxt *llcd) GOTO(exit, rc = 0); lcm = llcd->llcd_lcm; - + /* * Check if we're in exit stage. Do not send llcd in * this case. 
@@ -271,6 +271,11 @@ static int llcd_send(struct llog_canceld_ctxt *llcd) ptlrpc_at_set_req_timeout(req); req->rq_interpret_reply = llcd_interpret; req->rq_async_args.pointer_arg[0] = llcd; + + /* llog cancels will be replayed after reconnect, so resending would do + * the cancel twice: first from the llog replay, second from the resent rpc */ + req->rq_no_delay = req->rq_no_resend = 1; + rc = ptlrpc_set_add_new_req(&lcm->lcm_pc, req); if (rc) { ptlrpc_req_finished(req); diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index d5b0273da69815bbc33d1b2893e7fd2f12bb54f1..15bcc98b87b69eca50a4299dd78997d3d6c88241 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -1332,33 +1332,6 @@ test_59() { } run_test 59 "test log_commit_thread vs filter_destroy race" -# bug 17323 -test_59b() { - do_facet mds "lctl set_param debug=+rpctrace" - mkdir -p $DIR/$tdir - createmany -o $DIR/$tdir/$tfile-%d 2000 - sync -#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 - do_facet mds "lctl set_param fail_loc=0x606" - unlinkmany $DIR/$tdir/$tfile-%d 2000 - - # make sure that all llcds left ost and nothing left cached - sync - sleep 10 - do_facet mds "lctl set_param fail_loc=0x0" - - # sleep 2 obd_timeouts from ost to make sure that we get resents. - local timeout=$(do_facet ost1 lctl get_param -n timeout) - timeout=$((timeout * 2)) - log "Sleep $timeout" - sleep $timeout - do_facet mds $LCTL dk | grep -q "RESENT cancel req" - local res=$? - rmdir $DIR/$tdir - return $res -} -run_test 59b "resent handle in llog_origin_handle_cancel" - # race between add unlink llog vs cat log init in post_recovery (only for b1_6) # bug 12086: should no oops and No ctxt error for this test test_60() {