diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 35fef70ac6463b7221e146916cb5bf14ae3e4ce6..55fb231d061f47b3b470417150822e43d94c636b 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -22,6 +22,13 @@ tbd Cluster File Systems, Inc. <info@clusterfs.com> * Note that reiserfs quotas are temporarily disabled on SLES 10 in this kernel. +Severity : normal +Frequency : rare +Bugzilla : 11315 +Description: OST "spontaneously" evicts client; client has imp_pingable == 0 +Details : Due to a race condition, liblustre clients were occasionally + evicted incorrectly. + Severity : enhancement Bugzilla : 10997 Description: lfs setstripe use optional parameters instead of postional diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 9add69a9c50cc52ae271163f642517ef250d68c5..29eb85119f7ea952e8c89aaa3ba5786eedc5f0a2 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -318,7 +318,10 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) CDEBUG(D_HA, "marking %s %s->%s as inactive\n", name, obddev->obd_name, cli->cl_target_uuid.uuid); + + spin_lock(&imp->imp_lock); imp->imp_invalid = 1; + spin_unlock(&imp->imp_lock); } } @@ -445,7 +448,9 @@ int client_disconnect_export(struct obd_export *exp) /* Mark import deactivated now, so we don't try to reconnect if any * of the cleanup RPCs fails (e.g. ldlm cancel, etc). We don't * fully deactivate the import, or that would drop all requests. */ + spin_lock(&imp->imp_lock); imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); /* Some non-replayable imports (MDS's OSCs) are pinged, so just * delete it regardless. (It's safe to delete an import that was @@ -527,7 +532,10 @@ void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data, CDEBUG(D_HA, "%s: committing for initial connect of %s\n", obd->obd_name, exp->exp_client_uuid.uuid); + + spin_lock(&exp->exp_lock); exp->exp_need_sync = 0; + spin_unlock(&exp->exp_lock); } EXPORT_SYMBOL(target_client_add_cb); @@ -693,7 +701,9 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) break; } + spin_lock(&export->exp_lock); export->exp_connecting = 1; + spin_unlock(&export->exp_lock); spin_unlock(&target->obd_dev_lock); LASSERT(export->exp_obd == target); @@ -814,14 +824,17 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) GOTO(out, rc = -EALREADY); } export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); - spin_unlock(&export->exp_lock); /* request from liblustre? Don't evict it for not pinging. */ if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { export->exp_libclient = 1; + spin_unlock(&export->exp_lock); + spin_lock(&target->obd_dev_lock); list_del_init(&export->exp_obd_chain_timed); spin_unlock(&target->obd_dev_lock); + } else { + spin_unlock(&export->exp_lock); } if (export->exp_connection != NULL) @@ -856,8 +869,11 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) class_import_put(revimp); out: - if (export) + if (export) { + spin_lock(&export->exp_lock); export->exp_connecting = 0; + spin_unlock(&export->exp_lock); + } if (targref) class_decref(targref); if (rc) @@ -1338,7 +1354,10 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) export */ if (req->rq_export->exp_replay_needed) { --obd->obd_recoverable_clients; + + spin_lock(&req->rq_export->exp_lock); req->rq_export->exp_replay_needed = 0; + spin_unlock(&req->rq_export->exp_lock); } recovery_done = (obd->obd_recoverable_clients == 0); spin_unlock_bh(&obd->obd_processing_task_lock); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 7ebe0ee370ca18e2c84f3e4c474f636d36f24966..47466c372dee81deb071bb21747abb317ebe0255 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -861,7 +861,10 @@ int mdc_set_info_async(struct obd_export *exp, obd_count keylen, if (KEY_IS(KEY_INIT_RECOV)) { if (vallen != sizeof(int)) RETURN(-EINVAL); + spin_lock(&imp->imp_lock); imp->imp_initial_recov = *(int *)val; + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "%s: set imp_initial_recov = %d\n", exp->exp_obd->obd_name, imp->imp_initial_recov); RETURN(0); @@ -870,9 +873,13 @@ int mdc_set_info_async(struct obd_export *exp, obd_count keylen, if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { if (vallen != sizeof(int)) RETURN(-EINVAL); + + spin_lock(&imp->imp_lock); imp->imp_initial_recov_bk = *(int *)val; if (imp->imp_initial_recov_bk) imp->imp_initial_recov = 1; + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "%s: set imp_initial_recov_bk = %d\n", exp->exp_obd->obd_name, imp->imp_initial_recov_bk); RETURN(0); diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index bf9646b2f976debd9ffbe90a539f0db07c49caf6..c53b906b3907313ba890264a2c854af0eb72ae49 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -380,7 +380,11 @@ int mds_init_export(struct obd_export *exp) INIT_LIST_HEAD(&med->med_open_head); spin_lock_init(&med->med_open_lock); + + spin_lock(&exp->exp_lock); exp->exp_connecting = 1; + spin_unlock(&exp->exp_lock); + RETURN(0); } diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 3f65d73b2040b6a2d477f398616174b3a7a99345..3c386d46f1063ca81481a30142585df4fc8f5c5b 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -142,8 +142,11 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp, } else { rc = fsfilt_add_journal_cb(obd, 0, handle, target_client_add_cb, exp); - if (rc == 0) + if (rc == 0) { + spin_lock(&exp->exp_lock); exp->exp_need_sync = 1; + spin_unlock(&exp->exp_lock); + } rc = fsfilt_write_record(obd, file, med->med_mcd, sizeof(*med->med_mcd), &off, rc /* sync if no cb */); @@ -209,7 +212,7 @@ int mds_client_free(struct obd_export *exp) push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); rc = fsfilt_write_record(obd, mds->mds_rcvd_filp, &zero_mcd, sizeof(zero_mcd), &off, - !exp->exp_libclient); + (!exp->exp_libclient || exp->exp_need_sync)); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); CDEBUG(rc == 0 ? D_INFO : D_ERROR, @@ -406,8 +409,12 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) mcd = NULL; + + spin_lock(&exp->exp_lock); exp->exp_replay_needed = 1; exp->exp_connecting = 0; + spin_unlock(&exp->exp_lock); + obd->obd_recoverable_clients++; obd->obd_max_recoverable_clients++; class_export_put(exp); diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index 36a578bd894f0cced07c1cc235c8cca8a5aae434..4b8fd76ea4bedcf574b192bc2b56d4f169875f32 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -744,7 +744,9 @@ int mgc_set_info_async(struct obd_export *exp, obd_count keylen, if (KEY_IS(KEY_INIT_RECOV)) { if (vallen != sizeof(int)) RETURN(-EINVAL); + spin_lock(&imp->imp_lock); imp->imp_initial_recov = *(int *)val; + spin_unlock(&imp->imp_lock); CDEBUG(D_HA, "%s: set imp_initial_recov = %d\n", exp->exp_obd->obd_name, imp->imp_initial_recov); RETURN(0); @@ -755,10 +757,12 @@ int mgc_set_info_async(struct obd_export *exp, obd_count keylen, if (vallen != sizeof(int)) RETURN(-EINVAL); value = *(int *)val; + spin_lock(&imp->imp_lock); imp->imp_initial_recov_bk = value > 0; /* Even after the initial connection, give up all comms if nobody answers the first time. */ imp->imp_recon_bk = 1; + spin_unlock(&imp->imp_lock); CDEBUG(D_MGC, "InitRecov %s %d/%d:d%d:i%d:r%d:or%d:%s\n", imp->imp_obd->obd_name, value, imp->imp_initial_recov, imp->imp_deactive, imp->imp_invalid, diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index cea757ed2ba749dbdd68feeca4b6c1fdb8ac7af2..34b137c1822e2e3a2ea1a3569093f61ad5efd57a 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -823,7 +823,10 @@ void class_destroy_import(struct obd_import *import) class_handle_unhash(&import->imp_handle); + spin_lock(&import->imp_lock); import->imp_generation++; + spin_unlock(&import->imp_lock); + class_import_put(import); } EXPORT_SYMBOL(class_destroy_import); @@ -900,7 +903,10 @@ static void class_disconnect_export_list(struct list_head *list, int flags) while (!list_empty(list)) { exp = list_entry(list->next, struct obd_export, exp_obd_chain); class_export_get(exp); + + spin_lock(&exp->exp_lock); exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); if (obd_uuid_equals(&exp->exp_client_uuid, &exp->exp_obd->obd_uuid)) { @@ -920,7 +926,11 @@ static void class_disconnect_export_list(struct list_head *list, int flags) class_export_put(exp); continue; } + + spin_lock(&fake_exp->exp_lock); fake_exp->exp_flags = flags; + spin_unlock(&fake_exp->exp_lock); + rc = obd_disconnect(fake_exp); class_export_put(exp); if (rc) { diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 86bf7917cea11572fa39eef04778c068741ac23a..ca283d11d773ef70c5f44a23923c39f823b80e88 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -475,9 +475,13 @@ void class_decref(struct obd_device *obd) if (err) CERROR("Precleanup %s returned %d\n", obd->obd_name, err); + + spin_lock(&obd->obd_self_export->exp_lock); obd->obd_self_export->exp_flags |= (obd->obd_fail ? OBD_OPT_FAILOVER : 0) | (obd->obd_force ? OBD_OPT_FORCE : 0); + spin_unlock(&obd->obd_self_export->exp_lock); + /* note that we'll recurse into class_decref again */ class_unlink_export(obd->obd_self_export); return; diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 779a2153dd14823357d19c9dc8956abe9fbfcc98..cbbafbc4f960c8b8ea31600996ae663b94f9e996 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -265,8 +265,11 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp, } else { rc = fsfilt_add_journal_cb(obd, 0, handle, target_client_add_cb, exp); - if (rc == 0) + if (rc == 0) { + spin_lock(&exp->exp_lock); exp->exp_need_sync = 1; + spin_unlock(&exp->exp_lock); + } rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, fed->fed_fcd, sizeof(*fed->fed_fcd), @@ -332,7 +335,7 @@ static int filter_client_free(struct obd_export *exp) push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, &zero_fcd, sizeof(zero_fcd), &off, - !exp->exp_libclient); + (!exp->exp_libclient || exp->exp_need_sync)); if (rc == 0) /* update server's transno */ @@ -536,7 +539,10 @@ static int filter_init_export(struct obd_export *exp) { spin_lock_init(&exp->exp_filter_data.fed_lock); INIT_LIST_HEAD(&exp->exp_filter_data.fed_mod_list); + + spin_lock(&exp->exp_lock); exp->exp_connecting = 1; + spin_unlock(&exp->exp_lock); return 0; } @@ -757,8 +763,12 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */ fcd = NULL; + + spin_lock(&exp->exp_lock); exp->exp_replay_needed = 1; exp->exp_connecting = 0; + spin_unlock(&exp->exp_lock); + obd->obd_recoverable_clients++; obd->obd_max_recoverable_clients++; class_export_put(exp); @@ -2403,12 +2413,15 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry, /* set cancel cookie callback function */ if (fsfilt_add_journal_cb(exp->exp_obd, 0, handle, filter_cancel_cookies_cb, - fcc)) + fcc)) { + spin_lock(&exp->exp_lock); exp->exp_need_sync = 1; - else + spin_unlock(&exp->exp_lock); + } else { fcc = NULL; } } + } if (OBD_FAIL_CHECK(OBD_FAIL_OST_SETATTR_CREDITS)) fsfilt_extend(exp->exp_obd, inode, 0, handle); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index d9db97d687105e31cac0620e1e7a68f589a58a6a..9a492fee00494a82ed80d215375a5c203b298bf7 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3212,9 +3212,11 @@ static int osc_setinfo_mds_conn_interpret(struct ptlrpc_request *req, "ctxt %p: %d\n", ctxt, rc); } + spin_lock(&imp->imp_lock); imp->imp_server_timeout = 1; - CDEBUG(D_HA, "pinging OST %s\n", obd2cli_tgt(imp->imp_obd)); imp->imp_pingable = 1; + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "pinging OST %s\n", obd2cli_tgt(imp->imp_obd)); RETURN(rc); } @@ -3254,7 +3256,9 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen, if (KEY_IS(KEY_INIT_RECOV)) { if (vallen != sizeof(int)) RETURN(-EINVAL); + spin_lock(&imp->imp_lock); imp->imp_initial_recov = *(int *)val; + spin_unlock(&imp->imp_lock); CDEBUG(D_HA, "%s: set imp_initial_recov = %d\n", exp->exp_obd->obd_name, imp->imp_initial_recov); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 2d96ad770a528b435e2831b1a484e0ad8a3e1eee..fe597070f80409a6649b3d5a4755b37b97956d4c 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -405,7 +405,9 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) if (imp->imp_recon_bk) { CDEBUG(D_HA, "Last reconnection attempt (%d) for %s\n", imp->imp_conn_cnt, obd2cli_tgt(imp->imp_obd)); + spin_lock(&imp->imp_lock); imp->imp_last_recon = 1; + spin_unlock(&imp->imp_lock); } } @@ -442,7 +444,9 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) aa->pcaa_initial_connect = initial_connect; if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); /* On an initial connect, we don't know which one of a failover server pair is up. Don't wait long. */ #ifdef CRAY_XT3 @@ -515,7 +519,6 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, spin_unlock(&imp->imp_lock); RETURN(0); } - spin_unlock(&imp->imp_lock); if (rc) GOTO(out, rc); @@ -529,11 +532,13 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, if (aa->pcaa_initial_connect) { if (msg_flags & MSG_CONNECT_REPLAYABLE) { + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); CDEBUG(D_HA, "connected to replayable target: %s\n", obd2cli_tgt(imp->imp_obd)); - imp->imp_replayable = 1; } else { imp->imp_replayable = 0; + spin_unlock(&imp->imp_lock); } if (msg_flags & MSG_CONNECT_NEXT_VER) { @@ -550,6 +555,8 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); GOTO(finish, rc = 0); + } else { + spin_unlock(&imp->imp_lock); } /* Determine what recovery state to move the import to. */ @@ -595,7 +602,11 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, CDEBUG(D_HA, "%s: reconnected to %s during replay\n", imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); imp->imp_resend_replay = 1; + spin_unlock(&imp->imp_lock); + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); } else { IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); @@ -770,7 +781,9 @@ finish: (char *)imp->imp_connection->c_remote_uuid.uuid, rc); } + spin_lock(&imp->imp_lock); imp->imp_last_recon = 0; + spin_unlock(&imp->imp_lock); cfs_waitq_signal(&imp->imp_recovery_waitq); RETURN(rc); diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 2d578084984097d5c8dabb6a40c317ad286fce4a..94c0d78a655c864d8210064cf2052cc65640d3c8 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -117,7 +117,9 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight) req = NULL; } + spin_lock(&imp->imp_lock); imp->imp_resend_replay = 0; + spin_unlock(&imp->imp_lock); if (req != NULL) { rc = ptlrpc_replay_req(req); @@ -231,12 +233,18 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active) LCONSOLE_WARN("setting import %s INACTIVE by administrator " "request\n", obd2cli_tgt(imp->imp_obd)); ptlrpc_invalidate_import(imp); + + spin_lock(&imp->imp_lock); imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); } /* When activating, mark import valid, and attempt recovery */ if (active) { + spin_lock(&imp->imp_lock); imp->imp_deactive = 0; + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "setting import %s VALID\n", obd2cli_tgt(imp->imp_obd)); rc = ptlrpc_recover_import(imp, NULL); @@ -254,7 +262,10 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid) /* force import to be disconnected. */ ptlrpc_set_import_discon(imp, 0); + spin_lock(&imp->imp_lock); imp->imp_deactive = 0; + spin_unlock(&imp->imp_lock); + rc = ptlrpc_recover_import_no_retry(imp, new_uuid); RETURN(rc);