# Patch summary (Bugzilla 11658): log_commit_thread vs filter_destroy race.
#
# lustre/ChangeLog
#   Adds the entry for bug 11658 (severity: minor, frequency: rare).
#
# lustre/include/obd_support.h
#   Defines a new fail-injection code, OBD_FAIL_PTLRPC_DELAY_RECOV (0x507).
#
# lustre/ptlrpc/recov_thread.c, log_commit_thread()
#   - `import` is moved from the do-loop body to function scope, so the
#     pointer (and the reference held on it) survives from one iteration
#     to the next.
#   - class_import_get() is called, when the pointer is non-NULL, right
#     after `import` is read from llcd->llcd_ctxt->loc_imp.
#   - class_import_put() is called, when `import` is non-NULL, in three
#     places: at the top of every loop iteration, at the `resend:` label,
#     and once after the loop.  The first two sites reset `import` to NULL
#     afterwards.
#     NOTE(review): the put after the loop does not reset `import`; confirm
#     that nothing later in the function dereferences it.
#   - OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_RECOV, 10) is inserted just
#     before ptlrpc_prep_req().  Presumably this widens the race window so
#     the test below can hit it -- confirm against the fail-loc docs.
#
# lustre/tests/replay-single.sh
#   Adds test_59: creates 200 files with createmany, syncs, unlinks them
#   with unlinkmany, sets lustre.fail_loc=0x507 on the ost facet, runs
#   `fail ost` then `fail mds`, clears fail_loc, sleeps 20 and removes the
#   test directory.
#
diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 62841646c0dd92c194eb5d43f96b8d597503139f..c698c0493ea539587104fddc056fa70ddeb0d31e 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -109,6 +109,11 @@ behaviour. Details : This will achieve local-only flock/fcntl locks coherentness. +Severity : minor +Frequency : rare +Bugzilla : 11658 +Description: log_commit_thread vs filter_destroy race leads to crash +Details : Take import reference before releasing llog record semaphore -------------------------------------------------------------------------------- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index ee2c6bafa6b07ce76f5b651abfda324ca56feb8e..c514d165d17a10163514332da1be6ca298187900 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -169,6 +169,7 @@ extern int obd_race_state; #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 #define OBD_FAIL_PTLRPC_DROP_RPC 0x505 #define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 #define OBD_FAIL_OBD_PING_NET 0x600 #define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index 2355264cce166b779dc0ba035423d16e716f9614..b74c72ec1a1bd49b0cc6150ead3c2e4ade5e3ad3 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -222,6 +222,7 @@ static int log_commit_thread(void *arg) struct llog_commit_master *lcm = arg; struct llog_commit_daemon *lcd; struct llog_canceld_ctxt *llcd, *n; + struct obd_import *import = NULL; ENTRY; OBD_ALLOC(lcd, sizeof(*lcd)); @@ -243,10 +244,13 @@ static int log_commit_thread(void *arg) CDEBUG(D_HA, "%s started\n", cfs_curproc_comm()); do { struct ptlrpc_request *request; - struct obd_import *import = NULL; struct list_head *sending_list; int rc = 0; + if (import) + class_import_put(import); + import = NULL; + /* If we do not have enough pages available, allocate some */ while (atomic_read(&lcm->lcm_llcd_numfree) < lcm->lcm_llcd_minfree) 
{ @@ -272,6 +276,8 @@ static int log_commit_thread(void *arg) sending_list = &lcm->lcm_llcd_pending; resend: + if (import) + class_import_put(import); import = NULL; if (lcm->lcm_flags & LLOG_LCM_FL_EXIT) { lcm->lcm_llcd_maxfree = 0; @@ -301,6 +307,8 @@ static int log_commit_thread(void *arg) typeof(*llcd), llcd_list); LASSERT(llcd->llcd_lcm == lcm); import = llcd->llcd_ctxt->loc_imp; + if (import) + class_import_get(import); } list_for_each_entry_safe(llcd, n, sending_list, llcd_list) { LASSERT(llcd->llcd_lcm == lcm); @@ -351,6 +359,8 @@ static int log_commit_thread(void *arg) continue; } + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_RECOV, 10); + request = ptlrpc_prep_req(import, LUSTRE_LOG_VERSION, OBD_LOG_CANCEL, 2, size,bufs); if (request == NULL) { @@ -404,6 +414,9 @@ static int log_commit_thread(void *arg) } } while(1); + if (import) + class_import_put(import); + /* If we are force exiting, just drop all of the cookies. */ if (lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE) { spin_lock(&lcm->lcm_llcd_lock); diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 1d385f85ca98b98dbf991b5cded75779d0631a09..2a806e47ba17eee27254841cd533fd1b7797b144 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -1122,5 +1122,23 @@ test_58() { } run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)" +# log_commit_thread vs filter_destroy race used to lead to import use after free +# bug 11658 +test_59() { + mkdir $DIR/$tdir + createmany -o $DIR/$tdir/$tfile-%d 200 + sync + unlinkmany $DIR/$tdir/$tfile-%d 200 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 + do_facet ost "sysctl -w lustre.fail_loc=0x507" + fail ost + fail mds + do_facet ost "sysctl -w lustre.fail_loc=0x0" + sleep 20 + rmdir $DIR/$tdir +} +run_test 59 "test log_commit_thread vs filter_destroy race" + + equals_msg `basename $0`: test complete, cleaning up $CLEANUP