From 7d44a355cd36afb7bf9768e686e293b80fcc6511 Mon Sep 17 00:00:00 2001
From: nathan <nathan>
Date: Wed, 18 Jul 2007 19:37:28 +0000
Subject: [PATCH] b=12860 i=adilger i=green fix mds_lov_synchronize race

---
 lustre/ChangeLog             |  7 +++
 lustre/include/obd_support.h |  1 +
 lustre/lov/lov_obd.c         | 18 ++++---
 lustre/mds/handler.c         |  3 +-
 lustre/mds/mds_lov.c         | 97 +++++++++++++++++++++++-------------
 lustre/osc/osc_create.c      | 63 ++++++++++++-----------
 lustre/osc/osc_request.c     |  5 +-
 lustre/ptlrpc/client.c       |  6 +--
 lustre/ptlrpc/events.c       |  2 +-
 9 files changed, 121 insertions(+), 81 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 4df8ec7257..bc6c481e15 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -14,6 +14,13 @@ tbd         Cluster File Systems, Inc. <info@clusterfs.com>
        * Note that reiserfs quotas are disabled on SLES 10 in this kernel.
        * bug fixes
 
+Severity   : minor
+Frequency  : at statup only
+Bugzilla   : 12860
+Description: mds_lov_synchronize race leads to various problems
+Details    : simultaneous MDT->OST connections at startup can cause the 
+	     sync to abort, leaving the OSC in a bad state.
+
 Severity   : enhancement
 Bugzilla   : 12194
 Description: add optional extra BUILD_VERSION info
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index ca7414be85..b368b56fc6 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -102,6 +102,7 @@ extern int obd_race_state;
 #define OBD_FAIL_MDS_FS_SETUP            0x135
 #define OBD_FAIL_MDS_RESEND              0x136
 #define OBD_FAIL_MDS_LLOG_CREATE_FAILED  0x137
+#define OBD_FAIL_MDS_LOV_SYNC_RACE       0x138
 
 #define OBD_FAIL_OST                     0x200
 #define OBD_FAIL_OST_CONNECT_NET         0x201
diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c
index 6132e65690..7f5970c9e7 100644
--- a/lustre/lov/lov_obd.c
+++ b/lustre/lov/lov_obd.c
@@ -908,7 +908,7 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa,
 
                 /* if called for a specific target, we don't
                    care if it is not active. */
-                if (!lov->lov_tgts[i]->ltd_active == 0 && ost_uuid == NULL) {
+                if (!lov->lov_tgts[i]->ltd_active && ost_uuid == NULL) {
                         CDEBUG(D_HA, "lov idx %d inactive\n", i);
                         continue;
                 }
@@ -2375,6 +2375,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
 {
         struct obd_device *obddev = class_exp2obd(exp);
         struct lov_obd *lov = &obddev->u.lov;
+        obd_count count;
         int i, rc = 0, err, incr = 0, check_uuid = 0, do_inactive = 0;
         int no_set = !set;
         ENTRY;
@@ -2385,15 +2386,20 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                         RETURN(-ENOMEM);
         }
 
+        lov_getref(obddev);
+        count = lov->desc.ld_tgt_count;
+
         if (KEY_IS(KEY_NEXT_ID)) {
-                if (vallen != lov->desc.ld_tgt_count * sizeof(obd_id))
-                        RETURN(-EINVAL);
+                /* We must use mds's idea of # osts for indexing into
+                   mds->mds_lov_objids */
+                count = vallen / sizeof(obd_id);
+                LASSERT(count <= lov->desc.ld_tgt_count);
                 vallen = sizeof(obd_id);
                 incr = sizeof(obd_id);
                 do_inactive = 1;
         } else if (KEY_IS("checksum")) {
                 do_inactive = 1;
-        } else if (KEY_IS("mds_conn") || KEY_IS("unlinked")) {
+        } else if (KEY_IS(KEY_MDS_CONN) || KEY_IS("unlinked")) {
                 check_uuid = val ? 1 : 0;
         } else if (KEY_IS("evict_by_nid")) {
                 /* use defaults:
@@ -2401,9 +2407,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                  */
         }
 
-        lov_getref(obddev);
-
-        for (i = 0; i < lov->desc.ld_tgt_count; i++, val = (char *)val + incr) {
+        for (i = 0; i < count; i++, val = (char *)val + incr) {
                 /* OST was disconnected */
                 if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
                         continue;
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
index 1c4e05f1ee..14f095b3ca 100644
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -2142,9 +2142,10 @@ int mds_postrecov(struct obd_device *obd)
         LASSERT(!obd->obd_recovering);
         LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL);
 
-        /* FIXME why not put this in the synchronize? */
         /* set nextid first, so we are sure it happens */
+        mutex_down(&obd->obd_dev_sem);
         rc = mds_lov_set_nextid(obd);
+        mutex_up(&obd->obd_dev_sem);
         if (rc) {
                 CERROR("%s: mds_lov_set_nextid failed %d\n",
                        obd->obd_name, rc);
diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c
index 869661155a..36f893e764 100644
--- a/lustre/mds/mds_lov.c
+++ b/lustre/mds/mds_lov.c
@@ -159,8 +159,10 @@ int mds_lov_set_nextid(struct obd_device *obd)
         ENTRY;
 
         LASSERT(!obd->obd_recovering);
-
         LASSERT(mds->mds_lov_objids != NULL);
+     
+        /* obd->obd_dev_sem must be held so mds_lov_objids doesn't change */
+        LASSERT_SEM_LOCKED(&obd->obd_dev_sem);
 
         rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_NEXT_ID),
                                 KEY_NEXT_ID,
@@ -238,9 +240,7 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov)
         /* If we added a target we have to reconnect the llogs */
         /* We only _need_ to do this at first add (idx), or the first time
            after recovery.  However, it should now be safe to call anytime. */
-        mutex_down(&obd->obd_dev_sem);
         llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count, NULL);
-        mutex_up(&obd->obd_dev_sem);
 
 out:
         OBD_FREE(ld, sizeof(*ld));
@@ -260,45 +260,54 @@ static int mds_lov_update_mds(struct obd_device *obd,
         int rc = 0;
         ENTRY;
 
+        /* Don't let anyone else mess with mds_lov_objids now */
+        mutex_down(&obd->obd_dev_sem);
+
         old_count = mds->mds_lov_desc.ld_tgt_count;
         rc = mds_lov_update_desc(obd, mds->mds_osc_exp);
-        if (rc)
-                RETURN(rc);
+        if (rc) 
+                GOTO(out, rc);
 
         CDEBUG(D_CONFIG, "idx=%d, recov=%d/%d, cnt=%d/%d\n",
                idx, obd->obd_recovering, obd->obd_async_recov, old_count,
                mds->mds_lov_desc.ld_tgt_count);
 
         /* idx is set as data from lov_notify. */
-        if (idx != MDSLOV_NO_INDEX && !obd->obd_recovering) {
-                if (idx >= mds->mds_lov_desc.ld_tgt_count) {
-                        CERROR("index %d > count %d!\n", idx,
-                               mds->mds_lov_desc.ld_tgt_count);
-                        RETURN(-EINVAL);
-                }
-
-                if (idx >= mds->mds_lov_objids_in_file) {
-                        /* We never read this lastid; ask the osc */
-                        obd_id lastid;
-                        __u32 size = sizeof(lastid);
-                        rc = obd_get_info(watched->obd_self_export,
-                                          strlen("last_id"),
-                                          "last_id", &size, &lastid);
-                        if (rc)
-                                RETURN(rc);
-                        mds->mds_lov_objids[idx] = lastid;
-                        mds->mds_lov_objids_dirty = 1;
-                        mds_lov_write_objids(obd);
-                } else {
-                        /* We have read this lastid from disk; tell the osc.
-                           Don't call this during recovery. */
-                        rc = mds_lov_set_nextid(obd);
+        if (idx == MDSLOV_NO_INDEX || obd->obd_recovering) 
+                GOTO(out, rc);
+                
+        if (idx >= mds->mds_lov_desc.ld_tgt_count) {
+                CERROR("index %d > count %d!\n", idx, 
+                       mds->mds_lov_desc.ld_tgt_count);
+                GOTO(out, rc = -EINVAL);
+        }
+
+        if (idx >= mds->mds_lov_objids_in_file) {
+                /* We never read this lastid; ask the osc */
+                obd_id lastid;
+                __u32 size = sizeof(lastid);
+                rc = obd_get_info(watched->obd_self_export, strlen("last_id"),
+                                  "last_id", &size, &lastid);
+                if (rc)
+                        GOTO(out, rc);
+                mds->mds_lov_objids[idx] = lastid;
+                mds->mds_lov_objids_dirty = 1;
+                mds_lov_write_objids(obd);
+        } else {
+                /* We have read this lastid from disk; tell the osc.
+                   Don't call this during recovery. */
+                rc = mds_lov_set_nextid(obd);
+                if (rc) {
+                        CERROR("Failed to set next id, idx=%d rc=%d\n", idx,rc);
+                        /* Don't abort the rest of the sync */
+                        rc = 0;
                 }
-
-                CDEBUG(D_CONFIG, "last object "LPU64" from OST %d\n",
-                      mds->mds_lov_objids[idx], idx);
         }
 
+        CDEBUG(D_CONFIG, "last object "LPU64" from OST %d rc=%d\n",
+               mds->mds_lov_objids[idx], idx, rc);
+out:
+        mutex_up(&obd->obd_dev_sem);
         RETURN(rc);
 }
 
@@ -350,6 +359,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
         /* Deny new client connections until we are sure we have some OSTs */
         obd->obd_no_conn = 1;
 
+        mutex_down(&obd->obd_dev_sem);
         rc = mds_lov_read_objids(obd);
         if (rc) {
                 CERROR("cannot read %s: rc = %d\n", "lov_objids", rc);
@@ -384,6 +394,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
                                        "writing objids file: %d\n", rc);
                 }
         }
+        mutex_up(&obd->obd_dev_sem);
 
         /* I want to see a callback happen when the OBD moves to a
          * "For General Use" state, and that's when we'll call
@@ -395,6 +406,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
         RETURN(rc);
 
 err_reg:
+        mutex_up(&obd->obd_dev_sem);
         obd_register_observer(mds->mds_osc_obd, NULL);
 err_discon:
         obd_disconnect(mds->mds_osc_exp);
@@ -649,9 +661,13 @@ static int __mds_lov_synchronize(void *data)
         uuid = &watched->u.cli.cl_target_uuid;
         LASSERT(uuid);
 
+        OBD_RACE(OBD_FAIL_MDS_LOV_SYNC_RACE);
+
         rc = mds_lov_update_mds(obd, watched, idx, uuid);
-        if (rc != 0)
+        if (rc != 0) {
+                CERROR("%s failed at update_mds: %d\n", obd_uuid2str(uuid), rc);
                 GOTO(out, rc);
+        }
 
         rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_MDS_CONN),
                                 KEY_MDS_CONN, 0, uuid, NULL);
@@ -663,8 +679,8 @@ static int __mds_lov_synchronize(void *data)
                           NULL, NULL, uuid);
 
         if (rc != 0) {
-                CERROR("%s: failed at llog_origin_connect: %d\n",
-                       obd->obd_name, rc);
+                CERROR("%s failed at llog_origin_connect: %d\n",
+                       obd_uuid2str(uuid), rc);
                 GOTO(out, rc);
         }
 
@@ -676,13 +692,20 @@ static int __mds_lov_synchronize(void *data)
 
         rc = mds_lov_clear_orphans(mds, uuid);
         if (rc != 0) {
-                CERROR("%s: failed at mds_lov_clear_orphans: %d\n",
-                       obd->obd_name, rc);
+                CERROR("%s failed at mds_lov_clear_orphans: %d\n",
+                       obd_uuid2str(uuid), rc);
                 GOTO(out, rc);
         }
 
         EXIT;
 out:
+        if (rc) {
+                /* Deactivate it for safety */
+                CERROR("%s sync failed %d, deactivating\n", obd_uuid2str(uuid),
+                       rc);
+                obd_notify(mds->mds_osc_obd, watched, OBD_NOTIFY_INACTIVE,
+                           NULL);
+        }
         class_decref(obd);
         return rc;
 }
@@ -790,7 +813,9 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched,
                 /* We still have to fix the lov descriptor for ost's added
                    after the mdt in the config log.  They didn't make it into
                    mds_lov_connect. */
+                mutex_down(&obd->obd_dev_sem);
                 rc = mds_lov_update_desc(obd, obd->u.mds.mds_osc_exp);
+                mutex_up(&obd->obd_dev_sem);
                 RETURN(rc);
         }
 
diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c
index 251483e2a4..5d01ce9a6b 100644
--- a/lustre/osc/osc_create.c
+++ b/lustre/osc/osc_create.c
@@ -66,7 +66,23 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
         
         spin_lock(&oscc->oscc_lock);
         oscc->oscc_flags &= ~OSCC_FLAG_CREATING;
-        if (rc == -ENOSPC || rc == -EROFS) {
+        switch (rc) {
+        case 0: {
+                if (body) {
+                        int diff = body->oa.o_id - oscc->oscc_last_id;
+
+                        if (diff < oscc->oscc_grow_count)
+                                oscc->oscc_grow_count =
+                                        max(diff/3, OST_MIN_PRECREATE);
+                        else
+                                oscc->oscc_flags &= ~OSCC_FLAG_LOW;
+                        oscc->oscc_last_id = body->oa.o_id;
+                }
+                spin_unlock(&oscc->oscc_lock);
+                break;
+        }
+        case -ENOSPC:
+        case -EROFS: {
                 oscc->oscc_flags |= OSCC_FLAG_NOSPC;
                 if (body && rc == -ENOSPC) {
                         oscc->oscc_grow_count = OST_MIN_PRECREATE;
@@ -74,7 +90,17 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
                 }
                 spin_unlock(&oscc->oscc_lock);
                 DEBUG_REQ(D_INODE, req, "OST out of space, flagging");
-        } else if (rc != 0 && rc != -EIO) {
+                break;
+        }
+        case -EIO: {
+                /* filter always set body->oa.o_id as the last_id 
+                 * of filter (see filter_handle_precreate for detail)*/
+                if (body && body->oa.o_id > oscc->oscc_last_id)
+                        oscc->oscc_last_id = body->oa.o_id;
+                spin_unlock(&oscc->oscc_lock);
+                break;
+        }
+        default: {
                 oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
                 oscc->oscc_grow_count = OST_MIN_PRECREATE;
                 spin_unlock(&oscc->oscc_lock);
@@ -82,26 +108,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
                           "unknown rc %d from async create: failing oscc", rc);
                 ptlrpc_fail_import(req->rq_import,
                                    lustre_msg_get_conn_cnt(req->rq_reqmsg));
-        } else {
-                if (rc == 0) {
-                        if (body) {
-                                int diff = body->oa.o_id - oscc->oscc_last_id;
-
-                                if (diff < oscc->oscc_grow_count)
-                                        oscc->oscc_grow_count =
-                                                max(diff/3, OST_MIN_PRECREATE);
-                                else
-                                        oscc->oscc_flags &= ~OSCC_FLAG_LOW;
-                                oscc->oscc_last_id = body->oa.o_id;
-                        }
-                } else {
-                        /* filter always set body->oa.o_id as the last_id 
-                         * of filter (see filter_handle_precreate for detail)*/
-                        if (body && body->oa.o_id > oscc->oscc_last_id)
-                                oscc->oscc_last_id = body->oa.o_id;
-                }
-                spin_unlock(&oscc->oscc_lock);
-
+        }
         }
 
         CDEBUG(D_HA, "preallocated through id "LPU64" (next to use "LPU64")\n",
@@ -310,8 +317,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                 if (oscc_recovering(oscc)) {
                         struct l_wait_info lwi;
 
-                        CDEBUG(D_HA,"%p: oscc recovery in progress, waiting\n",
-                               oscc);
+                        CDEBUG(D_HA,"%s: oscc recovery in progress, waiting\n",
+                               oscc->oscc_obd->obd_name);
 
                         lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout/4)),
                                           NULL, NULL);
@@ -319,12 +326,12 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                                           !oscc_recovering(oscc), &lwi);
                         LASSERT(rc == 0 || rc == -ETIMEDOUT);
                         if (rc == -ETIMEDOUT) {
-                                CDEBUG(D_HA,"%p: timeout waiting on recovery\n",
-                                       oscc);
+                                CDEBUG(D_HA,"%s: timeout waiting on recovery\n",
+                                       oscc->oscc_obd->obd_name);
                                 RETURN(rc);
                         }
-                        CDEBUG(D_HA, "%p: oscc recovery over, waking up\n",
-                               oscc);
+                        CDEBUG(D_HA, "%s: oscc recovery over, waking up\n",
+                               oscc->oscc_obd->obd_name);
                 }
 
                 spin_lock(&oscc->oscc_lock);
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 8dcd394861..58f1a9e513 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -427,13 +427,12 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa,
 
         CDEBUG(D_HA, "transno: "LPD64"\n",
                lustre_msg_get_transno(req->rq_repmsg));
-        EXIT;
 out_req:
         ptlrpc_req_finished(req);
 out:
         if (rc && !*ea)
                 obd_free_memmd(exp, &lsm);
-        return rc;
+        RETURN(rc);
 }
 
 static int osc_punch_interpret(struct ptlrpc_request *req,
@@ -3342,7 +3341,7 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
         if (req == NULL)
                 RETURN(-ENOMEM);
 
-        if (KEY_IS("mds_conn"))
+        if (KEY_IS(KEY_MDS_CONN))
                 req->rq_interpret_reply = osc_setinfo_mds_conn_interpret;
 
         ptlrpc_req_set_repsize(req, 1, NULL);
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 56723f1910..889de36609 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -511,13 +511,9 @@ static int ptlrpc_import_delay_req(struct obd_import *imp,
                  imp->imp_state == LUSTRE_IMP_CONNECTING) {
                 /* allow CONNECT even if import is invalid */ ;
         } else if (imp->imp_invalid) {
-                /* If the import has been invalidated (such as by an OST
-                 * failure) the request must fail with -EINVAL.  This
-                 * indicates the requests should be discarded, an -EIO
-                 * may result in a resend of the request. */
                 if (!imp->imp_deactive)
                         DEBUG_REQ(D_ERROR, req, "IMP_INVALID");
-                *status = -EINVAL;
+                *status = -ESHUTDOWN; /* bz 12940 */
         } else if (req->rq_import_generation != imp->imp_generation) {
                 DEBUG_REQ(D_ERROR, req, "req wrong generation:");
                 *status = -EIO;
diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c
index 61419d32da..25f13301bf 100644
--- a/lustre/ptlrpc/events.c
+++ b/lustre/ptlrpc/events.c
@@ -220,7 +220,7 @@ void request_in_callback(lnet_event_t *ev)
 
         if (ev->unlinked) {
                 service->srv_nrqbd_receiving--;
-                CDEBUG(D_RPCTRACE,"Buffer complete: %d buffers still posted\n",
+                CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n",
                        service->srv_nrqbd_receiving);
 
                 /* Normally, don't complain about 0 buffers posted; LNET won't
-- 
GitLab