From b642b4688551722037cf9cf3fdad6d954b0be9e6 Mon Sep 17 00:00:00 2001 From: yury <yury> Date: Tue, 21 Oct 2008 16:52:41 +0000 Subject: [PATCH] b=17323 r=adilger,johann - handle log_cancel resent correctly; - some cleanups in llog. --- lustre/include/obd_support.h | 1 + lustre/ldlm/ldlm_lockd.c | 2 + lustre/obdclass/llog.c | 12 +-- lustre/obdclass/llog_cat.c | 2 +- lustre/ost/ost_handler.c | 1 + lustre/ptlrpc/client.c | 6 +- lustre/ptlrpc/llog_server.c | 133 ++++++++++++++++++++-------------- lustre/ptlrpc/recov_thread.c | 2 +- lustre/tests/replay-single.sh | 15 ++++ lustre/tests/sanity.sh | 2 +- 10 files changed, 112 insertions(+), 64 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 56c048173b..b7817608af 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -266,6 +266,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 #define OBD_FAIL_OBD_DQACQ 0x604 #define OBD_FAIL_OBD_LLOG_SETUP 0x605 +#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 #define OBD_FAIL_TGT_REPLY_NET 0x700 #define OBD_FAIL_TGT_CONN_RACE 0x701 diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index a6cc1a2309..ab4f31e155 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1578,6 +1578,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) case OBD_LOG_CANCEL: /* remove this eventually - for 1.4.0 compat */ OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0); rc = llog_origin_handle_cancel(req); + OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_REP, 0); ldlm_callback_reply(req, rc); RETURN(0); case OBD_QC_CALLBACK: @@ -1744,6 +1745,7 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req) case OBD_LOG_CANCEL: OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0); rc = llog_origin_handle_cancel(req); + OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_REP, 0); ldlm_callback_reply(req, rc); RETURN(0); default: diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c index 
62105b3c0a..2a5ba2d940 100644 --- a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -100,17 +100,17 @@ int llog_cancel_rec(struct llog_handle *loghandle, int index) int rc = 0; ENTRY; - CDEBUG(D_RPCTRACE, "canceling %d in log "LPX64"\n", + CDEBUG(D_RPCTRACE, "Canceling %d in log "LPX64"\n", index, loghandle->lgh_id.lgl_oid); if (index == 0) { - CERROR("cannot cancel index 0 (which is header)\n"); + CERROR("Can't cancel index 0 which is header\n"); RETURN(-EINVAL); } if (!ext2_clear_bit(index, llh->llh_bitmap)) { - CDEBUG(D_RPCTRACE, "catalog index %u already clear?\n", index); - RETURN(-EINVAL); + CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index); + RETURN(-ENOENT); } llh->llh_count--; @@ -120,7 +120,7 @@ int llog_cancel_rec(struct llog_handle *loghandle, int index) (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) { rc = llog_destroy(loghandle); if (rc) { - CERROR("failure destroying log after last cancel: %d\n", + CERROR("Failure destroying log after last cancel: %d\n", rc); ext2_set_bit(index, llh->llh_bitmap); llh->llh_count++; @@ -132,7 +132,7 @@ int llog_cancel_rec(struct llog_handle *loghandle, int index) rc = llog_write_rec(loghandle, &llh->llh_hdr, NULL, 0, NULL, 0); if (rc) { - CERROR("failure re-writing header %d\n", rc); + CERROR("Failure re-writing header %d\n", rc); ext2_set_bit(index, llh->llh_bitmap); llh->llh_count++; } diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c index 51a2d17ca6..b9a68337e9 100644 --- a/lustre/obdclass/llog_cat.c +++ b/lustre/obdclass/llog_cat.c @@ -308,7 +308,7 @@ EXPORT_SYMBOL(llog_cat_add_rec); * Assumes caller has already pushed us into the kernel context. 
*/ int llog_cat_cancel_records(struct llog_handle *cathandle, int count, - struct llog_cookie *cookies) + struct llog_cookie *cookies) { int i, index, rc = 0; ENTRY; diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 839180f5c2..fc699efa35 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1706,6 +1706,7 @@ static int ost_handle(struct ptlrpc_request *req) CDEBUG(D_INODE, "log cancel\n"); OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0); rc = llog_origin_handle_cancel(req); + OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_REP, 0); req->rq_status = rc; rc = lustre_pack_reply(req, 1, NULL, NULL); if (rc) diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 662658380d..ca0b4a5b43 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1120,7 +1120,11 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) spin_unlock(&imp->imp_lock); req->rq_waiting = 0; - if (req->rq_resend) { + + if (req->rq_timedout||req->rq_resend) { + /* This is re-sending anyways, + * let's mark req as resend. 
*/ + req->rq_resend = 1; lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); if (req->rq_bulk) { diff --git a/lustre/ptlrpc/llog_server.c b/lustre/ptlrpc/llog_server.c index bc436524c5..9888c26715 100644 --- a/lustre/ptlrpc/llog_server.c +++ b/lustre/ptlrpc/llog_server.c @@ -76,7 +76,7 @@ int llog_origin_handle_create(struct ptlrpc_request *req) body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body), lustre_swab_llogd_body); if (body == NULL) { - CERROR ("Can't unpack llogd_body\n"); + CERROR("Can't unpack llogd_body\n"); RETURN(-EFAULT); } @@ -89,12 +89,13 @@ int llog_origin_handle_create(struct ptlrpc_request *req) CERROR("Can't unpack name\n"); RETURN(-EFAULT); } - CDEBUG(D_INFO, "opening log %s\n", name); + CDEBUG(D_INFO, "Opening log %s\n", name); } ctxt = llog_get_context(obd, body->lgd_ctxt_idx); if (ctxt == NULL) - RETURN(-EINVAL); + RETURN(-ENODEV); + disk_obd = ctxt->loc_exp->exp_obd; push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); @@ -108,7 +109,7 @@ int llog_origin_handle_create(struct ptlrpc_request *req) body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body)); body->lgd_logid = loghandle->lgh_id; - + EXIT; out_close: rc2 = llog_close(loghandle); if (!rc) @@ -116,7 +117,7 @@ out_close: out_pop: pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); llog_ctxt_put(ctxt); - RETURN(rc); + return rc; } int llog_origin_handle_destroy(struct ptlrpc_request *req) @@ -146,14 +147,20 @@ int llog_origin_handle_destroy(struct ptlrpc_request *req) ctxt = llog_get_context(obd, body->lgd_ctxt_idx); if (ctxt == NULL) - RETURN(-EINVAL); + RETURN(-ENODEV); disk_obd = ctxt->loc_exp->exp_obd; push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); rc = llog_create(ctxt, &loghandle, logid, NULL); - if (rc) + if (rc) { + /* This might already be killed. Let's check if this is + * resent case. 
*/ + if (rc == -ENOENT && + (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) + rc = 0; GOTO(out_pop, rc); + } rc = lustre_pack_reply(req, 2, size, NULL); if (rc) @@ -167,16 +174,18 @@ int llog_origin_handle_destroy(struct ptlrpc_request *req) GOTO(out_close, rc); rc = llog_destroy(loghandle); if (rc) + /* Do not check for resent as this is already done above after + * llog_create(). */ GOTO(out_close, rc); llog_free_handle(loghandle); - + EXIT; out_close: if (rc) llog_close(loghandle); out_pop: pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); llog_ctxt_put(ctxt); - RETURN(rc); + return rc; } int llog_origin_handle_next_block(struct ptlrpc_request *req) @@ -210,7 +219,8 @@ int llog_origin_handle_next_block(struct ptlrpc_request *req) ctxt = llog_get_context(obd, body->lgd_ctxt_idx); if (ctxt == NULL) - GOTO(out_free, rc = -EINVAL); + GOTO(out_free, rc = -ENODEV); + disk_obd = ctxt->loc_exp->exp_obd; push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); @@ -230,7 +240,6 @@ int llog_origin_handle_next_block(struct ptlrpc_request *req) if (rc) GOTO(out_close, rc); - rc = lustre_pack_reply(req, 3, size, NULL); if (rc) GOTO(out_close, rc = -ENOMEM); @@ -240,18 +249,17 @@ int llog_origin_handle_next_block(struct ptlrpc_request *req) ptr = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF+1, LLOG_CHUNK_SIZE); memcpy(ptr, buf, LLOG_CHUNK_SIZE); - + EXIT; out_close: rc2 = llog_close(loghandle); if (!rc) rc = rc2; - out_pop: pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); llog_ctxt_put(ctxt); out_free: OBD_FREE(buf, LLOG_CHUNK_SIZE); - RETURN(rc); + return rc; } int llog_origin_handle_prev_block(struct ptlrpc_request *req) @@ -284,7 +292,9 @@ int llog_origin_handle_prev_block(struct ptlrpc_request *req) RETURN(-ENOMEM); ctxt = llog_get_context(obd, body->lgd_ctxt_idx); - LASSERT(ctxt != NULL); + if (ctxt == NULL) + GOTO(out_free, rc = -ENODEV); + disk_obd = ctxt->loc_exp->exp_obd; push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); @@ -312,17 +322,17 @@ int 
llog_origin_handle_prev_block(struct ptlrpc_request *req) ptr = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF+1, LLOG_CHUNK_SIZE); memcpy(ptr, buf, LLOG_CHUNK_SIZE); - + EXIT; out_close: rc2 = llog_close(loghandle); if (!rc) rc = rc2; - out_pop: pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); llog_ctxt_put(ctxt); +out_free: OBD_FREE(buf, LLOG_CHUNK_SIZE); - RETURN(rc); + return rc; } int llog_origin_handle_read_header(struct ptlrpc_request *req) @@ -349,15 +359,15 @@ int llog_origin_handle_read_header(struct ptlrpc_request *req) ctxt = llog_get_context(obd, body->lgd_ctxt_idx); if (ctxt == NULL) - RETURN(-EINVAL); + RETURN(-ENODEV); + disk_obd = ctxt->loc_exp->exp_obd; push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); - rc = llog_create(ctxt, &loghandle, &body->lgd_logid, NULL); if (rc) GOTO(out_pop, rc); - /* init_handle reads the header */ + /* llog_init_handle() reads the header */ flags = body->lgd_llh_flags; rc = llog_init_handle(loghandle, flags, NULL); if (rc) @@ -369,7 +379,7 @@ int llog_origin_handle_read_header(struct ptlrpc_request *req) hdr = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*hdr)); memcpy(hdr, loghandle->lgh_hdr, sizeof(*hdr)); - + EXIT; out_close: rc2 = llog_close(loghandle); if (!rc) @@ -377,25 +387,22 @@ out_close: out_pop: pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); llog_ctxt_put(ctxt); - RETURN(rc); + return rc; } int llog_origin_handle_close(struct ptlrpc_request *req) { - int rc; - - rc = 0; - - RETURN(rc); + ENTRY; + RETURN(0); } int llog_origin_handle_cancel(struct ptlrpc_request *req) { struct obd_device *obd = req->rq_export->exp_obd; + int num_cookies, rc = 0, err, i, failed = 0; struct obd_device *disk_obd; struct llog_cookie *logcookies; struct llog_ctxt *ctxt = NULL; - int num_cookies, rc = 0, err, i; struct lvfs_run_ctxt saved; struct llog_handle *cathandle; struct inode *inode; @@ -407,15 +414,13 @@ int llog_origin_handle_cancel(struct ptlrpc_request *req) num_cookies = lustre_msg_buflen(req->rq_reqmsg, 
REQ_REC_OFF) / sizeof(*logcookies); if (logcookies == NULL || num_cookies == 0) { - DEBUG_REQ(D_HA, req, "no cookies sent"); + DEBUG_REQ(D_HA, req, "No llog cookies sent"); RETURN(-EFAULT); } ctxt = llog_get_context(obd, logcookies->lgc_subsys); - if (ctxt == NULL) { - CWARN("llog subsys not setup or already cleanup\n"); - RETURN(-ENOENT); - } + if (ctxt == NULL) + RETURN(-ENODEV); disk_obd = ctxt->loc_exp->exp_obd; push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); @@ -427,29 +432,46 @@ int llog_origin_handle_cancel(struct ptlrpc_request *req) handle = fsfilt_start_log(disk_obd, inode, FSFILT_OP_CANCEL_UNLINK, NULL, 1); if (IS_ERR(handle)) { - CERROR("fsfilt_start failed: %ld\n", PTR_ERR(handle)); + CERROR("fsfilt_start_log() failed: %ld\n", + PTR_ERR(handle)); GOTO(pop_ctxt, rc = PTR_ERR(handle)); } rc = llog_cat_cancel_records(cathandle, 1, logcookies); + /* Do not raise -ENOENT errors for resent rpcs. This rec already + * might be killed. */ + if (rc == -ENOENT && + (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) { + /* Do not change this message, replay-single.sh test_59b + * expects to find this in dmesg. 
*/ + CDEBUG(D_RPCTRACE, "RESENT cancel req %p - ignored\n", + req); + rc = 0; + } else if (rc == 0) { + CDEBUG(D_RPCTRACE, "Canceled %d llog-records\n", + num_cookies); + } + err = fsfilt_commit(disk_obd, inode, handle, 0); if (err) { - CERROR("error committing transaction: %d\n", err); + CERROR("Error committing transaction: %d\n", err); if (!rc) rc = err; + failed++; GOTO(pop_ctxt, rc); - } + } else if (rc) + failed++; } + EXIT; pop_ctxt: pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); if (rc) - CERROR("cancel %d llog-records failed: %d\n", num_cookies, rc); - else - CDEBUG(D_RPCTRACE, "cancel %d llog-records\n", num_cookies); + CERROR("Cancel %d of %d llog-records failed: %d\n", + failed, num_cookies, rc); llog_ctxt_put(ctxt); - RETURN(rc); + return rc; } EXPORT_SYMBOL(llog_origin_handle_cancel); @@ -463,9 +485,10 @@ static int llog_catinfo_config(struct obd_device *obd, char *buf, int buf_len, char name[4][64]; int rc, i, l, remains = buf_len; char *out = buf; + ENTRY; if (ctxt == NULL || mds == NULL) - GOTO(release_ctxt, rc = -EOPNOTSUPP); + GOTO(release_ctxt, rc = -ENODEV); push_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL); @@ -502,11 +525,12 @@ static int llog_catinfo_config(struct obd_device *obd, char *buf, int buf_len, if (remains <= 0) break; } + GOTO(out_pop, rc); out_pop: pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL); release_ctxt: llog_ctxt_put(ctxt); - RETURN(rc); + return rc; } struct cb_data { @@ -527,6 +551,7 @@ static int llog_catinfo_cb(struct llog_handle *cat, struct llog_logid_rec *lir; int l, rc, index, count = 0; struct cb_data *cbd = (struct cb_data*)data; + ENTRY; if (cbd->init) { out = cbd->out; @@ -538,13 +563,14 @@ static int llog_catinfo_cb(struct llog_handle *cat, RETURN(-EINVAL); if (!cbd->ctxt) - RETURN(-EINVAL); + RETURN(-ENODEV); lir = (struct llog_logid_rec *)rec; logid = &lir->lid_id; rc = llog_create(ctxt, &handle, logid, NULL); if (rc) RETURN(-EINVAL); + rc = llog_init_handle(handle, 0, NULL); 
if (rc) GOTO(out_close, rc); @@ -568,10 +594,10 @@ static int llog_catinfo_cb(struct llog_handle *cat, CWARN("Not enough memory\n"); rc = -ENOMEM; } - + GOTO(out_close, rc); out_close: llog_close(handle); - RETURN(rc); + return rc; } static int llog_catinfo_deletions(struct obd_device *obd, char *buf, @@ -586,9 +612,10 @@ static int llog_catinfo_deletions(struct obd_device *obd, char *buf, int rc; struct cb_data data; struct llog_ctxt *ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + ENTRY; if (ctxt == NULL || mds == NULL) - GOTO(release_ctxt, rc = -EOPNOTSUPP); + GOTO(release_ctxt, rc = -ENODEV); count = mds->mds_lov_desc.ld_tgt_count; size = sizeof(*idarray) * count; @@ -639,17 +666,15 @@ static int llog_catinfo_deletions(struct obd_device *obd, char *buf, if (data.remains <= 0) break; } + GOTO(out_pop, rc); out_pop: pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL); out_free: - /* release semphore */ mutex_up(&obd->obd_llog_cat_process); - OBD_VFREE(idarray, size); release_ctxt: llog_ctxt_put(ctxt); - - RETURN(rc); + return rc; } int llog_catinfo(struct ptlrpc_request *req) @@ -660,11 +685,11 @@ int llog_catinfo(struct ptlrpc_request *req) char *buf, *reply; int rc, buf_len = LLOG_CHUNK_SIZE; int size[2] = { sizeof(struct ptlrpc_body), buf_len }; + ENTRY; OBD_ALLOC(buf, buf_len); if (buf == NULL) - return -ENOMEM; - memset(buf, 0, buf_len); + RETURN(-ENOMEM); keyword = lustre_msg_string(req->rq_reqmsg, REQ_REC_OFF, 0); @@ -686,7 +711,7 @@ int llog_catinfo(struct ptlrpc_request *req) if (strlen(buf) == 0) sprintf(buf, "%s", "No log informations\n"); memcpy(reply, buf, buf_len); - + GOTO(out_free, rc); out_free: OBD_FREE(buf, buf_len); return rc; diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index 5174336012..761c4727ad 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -212,7 +212,7 @@ static int llcd_send(struct llog_canceld_ctxt *llcd) * llog_receptor_accept(). 
*/ request = ptlrpc_prep_req(import, LUSTRE_LOG_VERSION, - OBD_LOG_CANCEL, 2, size,bufs); + OBD_LOG_CANCEL, 2, size, bufs); if (request == NULL) { CERROR("Can't allocate request for sending llcd %p\n", llcd); diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index b8b5c3485a..0433306200 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -1321,6 +1321,21 @@ test_59() { } run_test 59 "test log_commit_thread vs filter_destroy race" +# bug 17323 +test_59b() { + mkdir -p $DIR/$tdir + createmany -o $DIR/$tdir/$tfile-%d 2000 + sync +#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 + do_facet mds "lctl set_param fail_loc=0x606" + unlinkmany $DIR/$tdir/$tfile-%d 2000 + sleep 60 + do_facet mds "lctl set_param fail_loc=0x0" + $LCTL dk | grep -q "RESENT cancel req" || return 1 + rmdir $DIR/$tdir +} +run_test 59b "resent handle in llog_origin_handle_cancel" + # race between add unlink llog vs cat log init in post_recovery (only for b1_6) # bug 12086: should no oops and No ctxt error for this test test_60() { diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index f9e30b7e05..56d2747aa8 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -8,7 +8,7 @@ set -e ONLY=${ONLY:-"$*"} # bug number for skipped test: 13297 2108 9789 3637 9789 3561 12622 15528/2330 5188 10764 -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27u 42a 42b 42c 42d 45 51d 62 68 75 $SANITY_EXCEPT" } +ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"120a 27u 42a 42b 42c 42d 45 51d 62 68 75 $SANITY_EXCEPT" } # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! # Tests that fail on uml, maybe elsewhere, FIXME -- GitLab