diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index b4bbad6cc02e7957396d3a4d8ca23a3ddece9867..8bd5753b0d968cdcd5fcae0d0fd12d96f1a621b6 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -170,6 +170,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 #define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 #define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index be608a6c6479e39116b50f4ca64126a82876c91a..a268d7e48d2b3611cbd9297eeb2e5795e4f42a4a 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -986,6 +986,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) rc = lustre_pack_reply(req, 3, size, NULL); if (rc != 0) GOTO(out, rc); + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_PACK, obd_fail_val); rcs = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, niocount * sizeof(*rcs)); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index e69a5f1c6a9e66e1f8485806f11967451da60cff..85e23edb60ee5f5a984c11fbe7ecb2bbbdcef48f 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1052,14 +1052,14 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) spin_lock(&req->rq_lock); - /* Still waiting for a reply? */ - if (req->rq_receiving_reply) { + if (req->rq_early) { + ptlrpc_at_recv_early_reply(req); spin_unlock(&req->rq_lock); continue; } - - if (req->rq_early) { - ptlrpc_at_recv_early_reply(req); + + /* Still waiting for a reply? */ + if (req->rq_receiving_reply) { spin_unlock(&req->rq_lock); continue; } diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 29f4f36e333645b901cabe7fe335ef9385c8ca96..9d884511d68ef1916f5826bdcebe11152b88718b 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -347,9 +347,7 @@ static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count, ENTRY; LASSERT(req->rq_reply_state == NULL); - if (req->rq_packed_final) - /* Already packed final, no more early */ - RETURN(-EALREADY); + if ((flags & LPRFL_EARLY_REPLY) == 0) req->rq_packed_final = 1; @@ -390,10 +388,7 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, ENTRY; LASSERT(req->rq_reply_state == NULL); - if (req->rq_packed_final) { - /* Already packed final, no more early */ - RETURN(-EALREADY); - } + if ((flags & LPRFL_EARLY_REPLY) == 0) req->rq_packed_final = 1; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 2cf34b081ea349d1f7be3adbd179ccb9dc6678cd..09fd201f06b818c0c28ba6f643dbc5b3e2e2d533 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -262,7 +262,7 @@ ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc) static void ptlrpc_at_timer(unsigned long castmeharder) { struct ptlrpc_service *svc = (struct ptlrpc_service *)castmeharder; - CDEBUG(D_ADAPTTO, "at timer %s hit at %ld%s\n", + CDEBUG(D_INFO, "at timer %s hit at %ld%s\n", svc->srv_name, cfs_time_current_sec(), list_empty(&svc->srv_at_list) ? ", empty" : ""); svc->srv_at_check = 1; @@ -634,7 +634,8 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; struct ptlrpc_request *reqcopy; struct lustre_msg *reqmsg; - long deadline = req->rq_deadline - cfs_time_current_sec(); + long olddl = req->rq_deadline - cfs_time_current_sec(); + time_t newdl; int rc; ENTRY; @@ -643,25 +644,43 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, DEBUG_REQ(D_ADAPTTO, req, "%ssending early reply (deadline %+lds, margin %+lds) for " "%d+%d", AT_OFF ? "AT off - not " : "", - deadline, deadline - at_get(&svc->srv_at_estimate), + olddl, olddl - at_get(&svc->srv_at_estimate), at_get(&svc->srv_at_estimate), extra_time); if (AT_OFF) RETURN(0); - if (deadline < 0) { - CERROR("Already past deadline (%+lds), not sending early " - "reply\n", deadline); + if (olddl < 0) { + CDEBUG(D_ADAPTTO, "x"LPU64": Already past deadline (%+lds), not" + " sending early reply\n", req->rq_xid, olddl); /* Return an error so we're not re-added to the timed list. */ RETURN(-ETIMEDOUT); } if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) == 0) { - CERROR("Wanted to ask client for more time, but no AT " - "support\n"); + CDEBUG(D_INFO, "Wanted to ask client for more time, but no AT " + "support\n"); RETURN(-ENOSYS); } + if (extra_time) { + /* Fake our processing time into the future to ask the + clients for some extra amount of time */ + extra_time += cfs_time_current_sec() - + req->rq_arrival_time.tv_sec; + at_add(&svc->srv_at_estimate, extra_time); + } + + newdl = req->rq_arrival_time.tv_sec + at_get(&svc->srv_at_estimate); + if (req->rq_deadline >= newdl) { + /* We're not adding any time, no need to send an early reply + (e.g. maybe at adaptive_max) */ + CDEBUG(D_ADAPTTO, "x"LPU64": Couldn't add any time (%ld/%ld), " + "not sending early reply\n", req->rq_xid, olddl, + newdl - cfs_time_current_sec()); + RETURN(-ETIMEDOUT); + } + OBD_ALLOC(reqcopy, sizeof *reqcopy); if (reqcopy == NULL) RETURN(-ENOMEM); @@ -672,7 +691,9 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, } *reqcopy = *req; - /* We need the reqmsg for the magic */ + reqcopy->rq_reply_state = NULL; + reqcopy->rq_rep_swab_mask = 0; + /* We only need the reqmsg for the magic */ reqcopy->rq_reqmsg = reqmsg; memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); @@ -680,20 +701,11 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, if (rc) GOTO(out, rc); - if (extra_time) { - /* Fake our processing time into the future to ask the - clients for some extra amount of time */ - extra_time += cfs_time_current_sec() - - reqcopy->rq_arrival_time.tv_sec; - at_add(&svc->srv_at_estimate, extra_time); - } - rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); if (!rc) { /* Adjust our own deadline to what we told the client */ - req->rq_deadline = req->rq_arrival_time.tv_sec + - at_get(&svc->srv_at_estimate); + req->rq_deadline = newdl; req->rq_early_count++; /* number sent, server side */ } else { DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); @@ -778,8 +790,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) deleted, and is safe to take a ref to keep the req around */ atomic_inc(&rq->rq_refcount); spin_unlock(&svc->srv_at_lock); - if (!rq->rq_packed_final && - (ptlrpc_at_send_early_reply(rq, at_extra) == 0)) { + if (ptlrpc_at_send_early_reply(rq, at_extra) == 0) { counter++; ptlrpc_at_add_timed(rq); } diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index e26ce10c0f51acd762d3a60111c46f24d8d2032a..39e05f0cf0ae1d623dcd02fb98b48577e13db722 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -40,7 +40,8 @@ run_test 3 "stat: drop req, drop rep" SAMPLE_NAME=recovery-small.junk SAMPLE_FILE=$TMP/$SAMPLE_NAME -dd if=/dev/urandom of=$SAMPLE_FILE bs=1K count=4 +# make this big, else test 9 doesn't wait for bulk -- bz 5595 +dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4 test_4() { do_facet client "cp $SAMPLE_FILE $MOUNT/$SAMPLE_NAME" || return 1 @@ -195,6 +196,10 @@ test_16() { run_test 16 "timeout bulk put, don't evict client (2732)" test_17() { + # With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max + OST_AT_MAX=$(do_facet ost1 sysctl -n lustre.adaptive_max) + do_facet ost1 sysctl -w lustre.adaptive_max=$TIMEOUT + # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE # OST bulk will time out here, client retries do_facet ost1 sysctl -w lustre.fail_loc=0x80000503 @@ -202,12 +207,16 @@ test_17() { do_facet client cp $SAMPLE_FILE $DIR/$tfile sync - sleep $TIMEOUT + # with AT, client will wait adaptive_max*factor+net_latency before + # expiring the req, hopefully timeout*2 is enough + sleep $(($TIMEOUT*2)) + do_facet ost1 sysctl -w lustre.fail_loc=0 do_facet client "df $DIR" # expect cmp to succeed, client resent bulk do_facet client "cmp $SAMPLE_FILE $DIR/$tfile" || return 3 do_facet client "rm $DIR/$tfile" || return 4 + do_facet ost1 sysctl -w lustre.adaptive_max=$OST_AT_MAX return 0 } run_test 17 "timeout bulk get, don't evict client (2732)" diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 86f53e02365c4d32052fe2fa0e6eefcf6079b0b9..a1371a5a22bd45202d5bda1c4c1e2a427f986b5b 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -1188,7 +1188,7 @@ at_start() #bug 3055 fi } -test_65() #bug 3055 +test_65a() #bug 3055 { at_start $LCTL dk > /dev/null @@ -1200,12 +1200,42 @@ test_65() #bug 3055 unlinkmany $DIR/$tfile 10 > /dev/null # check for log message $LCTL dk | grep "Early reply #" || error "No early reply" - # client should show 30s timeouts + # client should show 30s estimates grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts sleep 9 grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts } -run_test 65 "AT: verify early replies" +run_test 65a "AT: verify early replies" + +test_65b() #bug 3055 +{ + at_start + # turn on D_ADAPTTO + debugsave + sysctl -w lnet.debug="+other" + $LCTL dk > /dev/null + # slow down bulk i/o + do_facet ost1 sysctl -w lustre.fail_val=30 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 + do_facet ost1 sysctl -w lustre.fail_loc=0x224 + + rm -f $DIR/$tfile + lfs setstripe $DIR/$tfile --index=0 --count=1 + # force some real bulk transfer + dd if=/dev/urandom of=$TMP/big bs=1M count=4 + cp $TMP/big $DIR/$tfile + echo "append" >> $DIR/$tfile + cat $DIR/$tfile >> /dev/null + rm $TMP/big + + do_facet ost1 sysctl -w lustre.fail_loc=0 + # check for log message + $LCTL dk | grep "Early reply #" || error "No early reply" + debugrestore + # client should show 30s estimates + grep portal $LPROC/osc/${FSNAME}-OST0000-osc-*/timeouts +} +run_test 65b "AT: verify early replies on packed reply / bulk" test_66a() #bug 3055 {