From 7d44a355cd36afb7bf9768e686e293b80fcc6511 Mon Sep 17 00:00:00 2001 From: nathan <nathan> Date: Wed, 18 Jul 2007 19:37:28 +0000 Subject: [PATCH] b=12860 i=adilger i=green fix mds_lov_synchronize race --- lustre/ChangeLog | 7 +++ lustre/include/obd_support.h | 1 + lustre/lov/lov_obd.c | 18 ++++--- lustre/mds/handler.c | 3 +- lustre/mds/mds_lov.c | 97 +++++++++++++++++++++++------------- lustre/osc/osc_create.c | 63 ++++++++++++----------- lustre/osc/osc_request.c | 5 +- lustre/ptlrpc/client.c | 6 +-- lustre/ptlrpc/events.c | 2 +- 9 files changed, 121 insertions(+), 81 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 4df8ec7257..bc6c481e15 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -14,6 +14,13 @@ tbd Cluster File Systems, Inc. <info@clusterfs.com> * Note that reiserfs quotas are disabled on SLES 10 in this kernel. * bug fixes +Severity : minor +Frequency : at statup only +Bugzilla : 12860 +Description: mds_lov_synchronize race leads to various problems +Details : simultaneous MDT->OST connections at startup can cause the + sync to abort, leaving the OSC in a bad state. + Severity : enhancement Bugzilla : 12194 Description: add optional extra BUILD_VERSION info diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index ca7414be85..b368b56fc6 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -102,6 +102,7 @@ extern int obd_race_state; #define OBD_FAIL_MDS_FS_SETUP 0x135 #define OBD_FAIL_MDS_RESEND 0x136 #define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 6132e65690..7f5970c9e7 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -908,7 +908,7 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa, /* if called for a specific target, we don't care if it is not active. */ - if (!lov->lov_tgts[i]->ltd_active == 0 && ost_uuid == NULL) { + if (!lov->lov_tgts[i]->ltd_active && ost_uuid == NULL) { CDEBUG(D_HA, "lov idx %d inactive\n", i); continue; } @@ -2375,6 +2375,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, { struct obd_device *obddev = class_exp2obd(exp); struct lov_obd *lov = &obddev->u.lov; + obd_count count; int i, rc = 0, err, incr = 0, check_uuid = 0, do_inactive = 0; int no_set = !set; ENTRY; @@ -2385,15 +2386,20 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(-ENOMEM); } + lov_getref(obddev); + count = lov->desc.ld_tgt_count; + if (KEY_IS(KEY_NEXT_ID)) { - if (vallen != lov->desc.ld_tgt_count * sizeof(obd_id)) - RETURN(-EINVAL); + /* We must use mds's idea of # osts for indexing into + mds->mds_lov_objids */ + count = vallen / sizeof(obd_id); + LASSERT(count <= lov->desc.ld_tgt_count); vallen = sizeof(obd_id); incr = sizeof(obd_id); do_inactive = 1; } else if (KEY_IS("checksum")) { do_inactive = 1; - } else if (KEY_IS("mds_conn") || KEY_IS("unlinked")) { + } else if (KEY_IS(KEY_MDS_CONN) || KEY_IS("unlinked")) { check_uuid = val ? 1 : 0; } else if (KEY_IS("evict_by_nid")) { /* use defaults: @@ -2401,9 +2407,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, */ } - lov_getref(obddev); - - for (i = 0; i < lov->desc.ld_tgt_count; i++, val = (char *)val + incr) { + for (i = 0; i < count; i++, val = (char *)val + incr) { /* OST was disconnected */ if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) continue; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 1c4e05f1ee..14f095b3ca 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -2142,9 +2142,10 @@ int mds_postrecov(struct obd_device *obd) LASSERT(!obd->obd_recovering); LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL); - /* FIXME why not put this in the synchronize? */ /* set nextid first, so we are sure it happens */ + mutex_down(&obd->obd_dev_sem); rc = mds_lov_set_nextid(obd); + mutex_up(&obd->obd_dev_sem); if (rc) { CERROR("%s: mds_lov_set_nextid failed %d\n", obd->obd_name, rc); diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 869661155a..36f893e764 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -159,8 +159,10 @@ int mds_lov_set_nextid(struct obd_device *obd) ENTRY; LASSERT(!obd->obd_recovering); - LASSERT(mds->mds_lov_objids != NULL); + + /* obd->obd_dev_sem must be held so mds_lov_objids doesn't change */ + LASSERT_SEM_LOCKED(&obd->obd_dev_sem); rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_NEXT_ID), KEY_NEXT_ID, @@ -238,9 +240,7 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov) /* If we added a target we have to reconnect the llogs */ /* We only _need_ to do this at first add (idx), or the first time after recovery. However, it should now be safe to call anytime. */ - mutex_down(&obd->obd_dev_sem); llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count, NULL); - mutex_up(&obd->obd_dev_sem); out: OBD_FREE(ld, sizeof(*ld)); @@ -260,45 +260,54 @@ static int mds_lov_update_mds(struct obd_device *obd, int rc = 0; ENTRY; + /* Don't let anyone else mess with mds_lov_objids now */ + mutex_down(&obd->obd_dev_sem); + old_count = mds->mds_lov_desc.ld_tgt_count; rc = mds_lov_update_desc(obd, mds->mds_osc_exp); - if (rc) - RETURN(rc); + if (rc) + GOTO(out, rc); CDEBUG(D_CONFIG, "idx=%d, recov=%d/%d, cnt=%d/%d\n", idx, obd->obd_recovering, obd->obd_async_recov, old_count, mds->mds_lov_desc.ld_tgt_count); /* idx is set as data from lov_notify. */ - if (idx != MDSLOV_NO_INDEX && !obd->obd_recovering) { - if (idx >= mds->mds_lov_desc.ld_tgt_count) { - CERROR("index %d > count %d!\n", idx, - mds->mds_lov_desc.ld_tgt_count); - RETURN(-EINVAL); - } - - if (idx >= mds->mds_lov_objids_in_file) { - /* We never read this lastid; ask the osc */ - obd_id lastid; - __u32 size = sizeof(lastid); - rc = obd_get_info(watched->obd_self_export, - strlen("last_id"), - "last_id", &size, &lastid); - if (rc) - RETURN(rc); - mds->mds_lov_objids[idx] = lastid; - mds->mds_lov_objids_dirty = 1; - mds_lov_write_objids(obd); - } else { - /* We have read this lastid from disk; tell the osc. - Don't call this during recovery. */ - rc = mds_lov_set_nextid(obd); + if (idx == MDSLOV_NO_INDEX || obd->obd_recovering) + GOTO(out, rc); + + if (idx >= mds->mds_lov_desc.ld_tgt_count) { + CERROR("index %d > count %d!\n", idx, + mds->mds_lov_desc.ld_tgt_count); + GOTO(out, rc = -EINVAL); + } + + if (idx >= mds->mds_lov_objids_in_file) { + /* We never read this lastid; ask the osc */ + obd_id lastid; + __u32 size = sizeof(lastid); + rc = obd_get_info(watched->obd_self_export, strlen("last_id"), + "last_id", &size, &lastid); + if (rc) + GOTO(out, rc); + mds->mds_lov_objids[idx] = lastid; + mds->mds_lov_objids_dirty = 1; + mds_lov_write_objids(obd); + } else { + /* We have read this lastid from disk; tell the osc. + Don't call this during recovery. */ + rc = mds_lov_set_nextid(obd); + if (rc) { + CERROR("Failed to set next id, idx=%d rc=%d\n", idx,rc); + /* Don't abort the rest of the sync */ + rc = 0; } - - CDEBUG(D_CONFIG, "last object "LPU64" from OST %d\n", - mds->mds_lov_objids[idx], idx); } + CDEBUG(D_CONFIG, "last object "LPU64" from OST %d rc=%d\n", + mds->mds_lov_objids[idx], idx, rc); +out: + mutex_up(&obd->obd_dev_sem); RETURN(rc); } @@ -350,6 +359,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) /* Deny new client connections until we are sure we have some OSTs */ obd->obd_no_conn = 1; + mutex_down(&obd->obd_dev_sem); rc = mds_lov_read_objids(obd); if (rc) { CERROR("cannot read %s: rc = %d\n", "lov_objids", rc); @@ -384,6 +394,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) "writing objids file: %d\n", rc); } } + mutex_up(&obd->obd_dev_sem); /* I want to see a callback happen when the OBD moves to a * "For General Use" state, and that's when we'll call @@ -395,6 +406,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) RETURN(rc); err_reg: + mutex_up(&obd->obd_dev_sem); obd_register_observer(mds->mds_osc_obd, NULL); err_discon: obd_disconnect(mds->mds_osc_exp); @@ -649,9 +661,13 @@ static int __mds_lov_synchronize(void *data) uuid = &watched->u.cli.cl_target_uuid; LASSERT(uuid); + OBD_RACE(OBD_FAIL_MDS_LOV_SYNC_RACE); + rc = mds_lov_update_mds(obd, watched, idx, uuid); - if (rc != 0) + if (rc != 0) { + CERROR("%s failed at update_mds: %d\n", obd_uuid2str(uuid), rc); GOTO(out, rc); + } rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_MDS_CONN), KEY_MDS_CONN, 0, uuid, NULL); @@ -663,8 +679,8 @@ static int __mds_lov_synchronize(void *data) NULL, NULL, uuid); if (rc != 0) { - CERROR("%s: failed at llog_origin_connect: %d\n", - obd->obd_name, rc); + CERROR("%s failed at llog_origin_connect: %d\n", + obd_uuid2str(uuid), rc); GOTO(out, rc); } @@ -676,13 +692,20 @@ static int __mds_lov_synchronize(void *data) rc = mds_lov_clear_orphans(mds, uuid); if (rc != 0) { - CERROR("%s: failed at mds_lov_clear_orphans: %d\n", - obd->obd_name, rc); + CERROR("%s failed at mds_lov_clear_orphans: %d\n", + obd_uuid2str(uuid), rc); GOTO(out, rc); } EXIT; out: + if (rc) { + /* Deactivate it for safety */ + CERROR("%s sync failed %d, deactivating\n", obd_uuid2str(uuid), + rc); + obd_notify(mds->mds_osc_obd, watched, OBD_NOTIFY_INACTIVE, + NULL); + } class_decref(obd); return rc; } @@ -790,7 +813,9 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched, /* We still have to fix the lov descriptor for ost's added after the mdt in the config log. They didn't make it into mds_lov_connect. */ + mutex_down(&obd->obd_dev_sem); rc = mds_lov_update_desc(obd, obd->u.mds.mds_osc_exp); + mutex_up(&obd->obd_dev_sem); RETURN(rc); } diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index 251483e2a4..5d01ce9a6b 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -66,7 +66,23 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc) spin_lock(&oscc->oscc_lock); oscc->oscc_flags &= ~OSCC_FLAG_CREATING; - if (rc == -ENOSPC || rc == -EROFS) { + switch (rc) { + case 0: { + if (body) { + int diff = body->oa.o_id - oscc->oscc_last_id; + + if (diff < oscc->oscc_grow_count) + oscc->oscc_grow_count = + max(diff/3, OST_MIN_PRECREATE); + else + oscc->oscc_flags &= ~OSCC_FLAG_LOW; + oscc->oscc_last_id = body->oa.o_id; + } + spin_unlock(&oscc->oscc_lock); + break; + } + case -ENOSPC: + case -EROFS: { oscc->oscc_flags |= OSCC_FLAG_NOSPC; if (body && rc == -ENOSPC) { oscc->oscc_grow_count = OST_MIN_PRECREATE; @@ -74,7 +90,17 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc) } spin_unlock(&oscc->oscc_lock); DEBUG_REQ(D_INODE, req, "OST out of space, flagging"); - } else if (rc != 0 && rc != -EIO) { + break; + } + case -EIO: { + /* filter always set body->oa.o_id as the last_id + * of filter (see filter_handle_precreate for detail)*/ + if (body && body->oa.o_id > oscc->oscc_last_id) + oscc->oscc_last_id = body->oa.o_id; + spin_unlock(&oscc->oscc_lock); + break; + } + default: { oscc->oscc_flags |= OSCC_FLAG_RECOVERING; oscc->oscc_grow_count = OST_MIN_PRECREATE; spin_unlock(&oscc->oscc_lock); @@ -82,26 +108,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc) "unknown rc %d from async create: failing oscc", rc); ptlrpc_fail_import(req->rq_import, lustre_msg_get_conn_cnt(req->rq_reqmsg)); - } else { - if (rc == 0) { - if (body) { - int diff = body->oa.o_id - oscc->oscc_last_id; - - if (diff < oscc->oscc_grow_count) - oscc->oscc_grow_count = - max(diff/3, OST_MIN_PRECREATE); - else - oscc->oscc_flags &= ~OSCC_FLAG_LOW; - oscc->oscc_last_id = body->oa.o_id; - } - } else { - /* filter always set body->oa.o_id as the last_id - * of filter (see filter_handle_precreate for detail)*/ - if (body && body->oa.o_id > oscc->oscc_last_id) - oscc->oscc_last_id = body->oa.o_id; - } - spin_unlock(&oscc->oscc_lock); - + } } CDEBUG(D_HA, "preallocated through id "LPU64" (next to use "LPU64")\n", @@ -310,8 +317,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa, if (oscc_recovering(oscc)) { struct l_wait_info lwi; - CDEBUG(D_HA,"%p: oscc recovery in progress, waiting\n", - oscc); + CDEBUG(D_HA,"%s: oscc recovery in progress, waiting\n", + oscc->oscc_obd->obd_name); lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout/4)), NULL, NULL); @@ -319,12 +326,12 @@ int osc_create(struct obd_export *exp, struct obdo *oa, !oscc_recovering(oscc), &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); if (rc == -ETIMEDOUT) { - CDEBUG(D_HA,"%p: timeout waiting on recovery\n", - oscc); + CDEBUG(D_HA,"%s: timeout waiting on recovery\n", + oscc->oscc_obd->obd_name); RETURN(rc); } - CDEBUG(D_HA, "%p: oscc recovery over, waking up\n", - oscc); + CDEBUG(D_HA, "%s: oscc recovery over, waking up\n", + oscc->oscc_obd->obd_name); } spin_lock(&oscc->oscc_lock); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 8dcd394861..58f1a9e513 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -427,13 +427,12 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa, CDEBUG(D_HA, "transno: "LPD64"\n", lustre_msg_get_transno(req->rq_repmsg)); - EXIT; out_req: ptlrpc_req_finished(req); out: if (rc && !*ea) obd_free_memmd(exp, &lsm); - return rc; + RETURN(rc); } static int osc_punch_interpret(struct ptlrpc_request *req, @@ -3342,7 +3341,7 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen, if (req == NULL) RETURN(-ENOMEM); - if (KEY_IS("mds_conn")) + if (KEY_IS(KEY_MDS_CONN)) req->rq_interpret_reply = osc_setinfo_mds_conn_interpret; ptlrpc_req_set_repsize(req, 1, NULL); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 56723f1910..889de36609 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -511,13 +511,9 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, imp->imp_state == LUSTRE_IMP_CONNECTING) { /* allow CONNECT even if import is invalid */ ; } else if (imp->imp_invalid) { - /* If the import has been invalidated (such as by an OST - * failure) the request must fail with -EINVAL. This - * indicates the requests should be discarded, an -EIO - * may result in a resend of the request. */ if (!imp->imp_deactive) DEBUG_REQ(D_ERROR, req, "IMP_INVALID"); - *status = -EINVAL; + *status = -ESHUTDOWN; /* bz 12940 */ } else if (req->rq_import_generation != imp->imp_generation) { DEBUG_REQ(D_ERROR, req, "req wrong generation:"); *status = -EIO; diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 61419d32da..25f13301bf 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -220,7 +220,7 @@ void request_in_callback(lnet_event_t *ev) if (ev->unlinked) { service->srv_nrqbd_receiving--; - CDEBUG(D_RPCTRACE,"Buffer complete: %d buffers still posted\n", + CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n", service->srv_nrqbd_receiving); /* Normally, don't complain about 0 buffers posted; LNET won't -- GitLab