From d911b48194ae1913a740c18cd3040f9945ba2520 Mon Sep 17 00:00:00 2001
From: alex <alex>
Date: Fri, 25 Jun 2004 09:26:06 +0000
Subject: [PATCH] changes to get test #46 working

- bunch of fixes to handle dir splitting over N < MDS nodes
  - fixes to MDS code to handle dir splitting over N < MDS nodes properly
  - mds_try_to_split_dir() should ignore requests to split dir over 1 node
  - checks for NULL exports in LMV
  - checks for wrong -ERESTART handling: because of software errors
    it is possible that lmv will loop forever. we need to assert on this.
  - one more sanity test 3c to test lfs dirstripe and activity on such dirs
---
 lustre/lmv/lmv_intent.c    | 15 +++++---
 lustre/lmv/lmv_obd.c       | 74 +++++++++++++++++++++++++-------------
 lustre/mds/handler.c       | 14 ++++----
 lustre/mds/mds_lmv.c       |  5 ++-
 lustre/mds/mds_open.c      | 10 +++---
 lustre/mds/mds_reint.c     | 10 +++---
 lustre/tests/sanity-lmv.sh | 22 ++++++++++++
 7 files changed, 104 insertions(+), 46 deletions(-)

diff --git a/lustre/lmv/lmv_intent.c b/lustre/lmv/lmv_intent.c
index d860e86d4c..e012cbb8e2 100644
--- a/lustre/lmv/lmv_intent.c
+++ b/lustre/lmv/lmv_intent.c
@@ -130,25 +130,28 @@ int lmv_intent_open(struct obd_export *exp, struct ll_uctxt *uctxt,
         struct ll_fid rpfid = *pfid;
         struct lmv_obj *obj;
         struct mea *mea;
-        int rc, mds;
+        int rc, mds, loop = 0;
         ENTRY;
 
         /* IT_OPEN is intended to open (and create, possible) an object. Parent
          * (pfid) may be splitted dir */
 
 repeat:
+        LASSERT(++loop <= 2);
         mds = rpfid.mds;
         obj = lmv_grab_obj(obd, &rpfid);
         if (obj) {
                 /* directory is already splitted, so we have to forward
                  * request to the right MDS */
                 mds = raw_name2idx(obj->objcount, (char *)name, len);
-                CDEBUG(D_OTHER, "forward to MDS #%u\n", mds);
+                CDEBUG(D_OTHER, "forward to MDS #%u (%lu/%lu/%lu)\n", mds,
+                       (unsigned long) rpfid.mds, (unsigned long) rpfid.id,
+                       (unsigned long) rpfid.generation);
                 rpfid = obj->objs[mds].fid;
                 lmv_put_obj(obj);
         }
 
-        rc = md_intent_lock(lmv->tgts[mds].ltd_exp, uctxt, &rpfid, name,
+        rc = md_intent_lock(lmv->tgts[rpfid.mds].ltd_exp, uctxt, &rpfid, name,
                             len, lmm, lmmsize, cfid, it, flags, reqp,
                             cb_blocking);
         if (rc == -ERESTART) {
@@ -244,6 +247,7 @@ int lmv_intent_getattr(struct obd_export *exp, struct ll_uctxt *uctxt,
                         /* in fact, we need not this with current intent_lock(),
                          * but it may change some day */
                         rpfid = obj->objs[mds].fid;
+                        mds = rpfid.mds;
                         lmv_put_obj(obj);
                 }
                 rc = md_intent_lock(lmv->tgts[mds].ltd_exp, uctxt, &rpfid, name,
@@ -278,6 +282,7 @@ int lmv_intent_getattr(struct obd_export *exp, struct ll_uctxt *uctxt,
                 /* directory is already splitted. calculate mds */
                 mds = raw_name2idx(obj->objcount, (char *) name, len);
                 rpfid = obj->objs[mds].fid;
+                mds = rpfid.mds;
                 lmv_put_obj(obj);
                 
                 CDEBUG(D_OTHER, "forward to MDS #%u (slave %lu/%lu/%lu)\n",
@@ -473,7 +478,7 @@ int lmv_intent_lookup(struct obd_export *exp, struct ll_uctxt *uctxt,
         struct ll_fid rpfid = *pfid;
         struct lmv_obj *obj;
         struct mea *mea;
-        int rc, mds;
+        int rc, mds, loop = 0;
         ENTRY;
 
         /* IT_LOOKUP is intended to produce name -> fid resolving (let's call
@@ -505,6 +510,7 @@ int lmv_intent_lookup(struct obd_export *exp, struct ll_uctxt *uctxt,
 
         mds = pfid->mds;
 repeat:
+        LASSERT(++loop <= 2);
         /* this is lookup. during lookup we have to update all the attributes,
          * because returned values will be put in struct inode */
 
@@ -514,6 +520,7 @@ repeat:
                         /* directory is already splitted. calculate mds */
                         mds = raw_name2idx(obj->objcount, (char *)name, len);
                         rpfid = obj->objs[mds].fid;
+                        mds = rpfid.mds;
                 }
                 lmv_put_obj(obj);
         }
diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c
index 3819209511..b9e67b1925 100644
--- a/lustre/lmv/lmv_obd.c
+++ b/lustre/lmv/lmv_obd.c
@@ -395,8 +395,12 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
                 int err;
 
-                err = obd_iocontrol(cmd, lmv->tgts[i].ltd_exp,
-                                    len, karg, uarg);
+                if (lmv->tgts[i].ltd_exp == NULL) {
+                        CWARN("%s: NULL export for %d\n", obddev->obd_name, i);
+                        continue;
+                }
+
+                err = obd_iocontrol(cmd, lmv->tgts[i].ltd_exp, len, karg, uarg);
                 if (err) {
                         if (lmv->tgts[i].active) {
                                 CERROR("error: iocontrol MDC %s on MDT"
@@ -489,6 +493,11 @@ static int lmv_statfs(struct obd_device *obd, struct obd_statfs *osfs,
                 RETURN(rc);
                 
         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+                if (lmv->tgts[i].ltd_exp == NULL) {
+                        CWARN("%s: NULL export for %d\n", obd->obd_name, i);
+                        continue;
+                }
+
                 rc = obd_statfs(lmv->tgts[i].ltd_exp->exp_obd, &temp, max_age);
                 if (rc) {
                         CERROR("can't stat MDS #%d (%s)\n", i,
@@ -578,6 +587,12 @@ static int lmv_getattr(struct obd_export *exp, struct ll_fid *fid,
         
                 for (i = 0; i < obj->objcount; i++) {
 
+                        if (lmv->tgts[i].ltd_exp == NULL) {
+                                CWARN("%s: NULL export for %d\n",
+                                      obd->obd_name, i);
+                                continue;
+                        }
+
                         /* skip master obj. */
                         if (fid_equal(&obj->fid, &obj->objs[i].fid))
                                 continue;
@@ -644,6 +659,7 @@ static int lmv_change_cbdata_name(struct obd_export *exp, struct ll_fid *pfid,
         if (obj) {
                 /* directory is splitted. look for right mds for this name. */
                 mds = raw_name2idx(obj->objcount, name, len);
+                mds = obj->objs[mds].fid.mds;
                 lmv_put_obj(obj);
         }
         rc = md_change_cbdata(lmv->tgts[mds].ltd_exp, cfid, it, data);
@@ -737,7 +753,7 @@ int lmv_create(struct obd_export *exp, struct mdc_op_data *op_data,
         struct lmv_obd *lmv = &obd->u.lmv;
         struct mds_body *body;
         struct lmv_obj *obj;
-        int rc, mds;
+        int rc, mds, loop = 0;
         ENTRY;
 
         rc = lmv_check_connect(obd);
@@ -747,6 +763,7 @@ int lmv_create(struct obd_export *exp, struct mdc_op_data *op_data,
         if (!lmv->desc.ld_active_tgt_count)
                 RETURN(-EIO);
 repeat:
+        LASSERT(++loop <= 2);
         obj = lmv_grab_obj(obd, &op_data->fid1);
         if (obj) {
                 mds = raw_name2idx(obj->objcount, op_data->name,
@@ -819,13 +836,13 @@ int lmv_enqueue_slaves(struct obd_export *exp, int locktype,
 
         LASSERT(mea != NULL);
         for (i = 0; i < mea->mea_count; i++) {
-                if (lmv->tgts[i].ltd_exp == NULL)
-                        continue;
-
                 memset(&data2, 0, sizeof(data2));
                 data2.fid1 = mea->mea_fids[i];
                 mds = data2.fid1.mds;
                 
+                if (lmv->tgts[mds].ltd_exp == NULL)
+                        continue;
+
                 rc = md_enqueue(lmv->tgts[mds].ltd_exp, locktype, it, lockmode,
                                 &data2, lockh + i, lmm, lmmsize, cb_completion,
                                 cb_blocking, cb_data);
@@ -909,7 +926,7 @@ int lmv_getattr_name(struct obd_export *exp, struct ll_fid *fid,
         struct obd_device *obd = exp->exp_obd;
         struct lmv_obd *lmv = &obd->u.lmv;
         struct ll_fid rfid = *fid;
-        int rc, mds = fid->mds;
+        int rc, mds = fid->mds, loop = 0;
         struct mds_body *body;
         struct lmv_obj *obj;
         ENTRY;
@@ -917,6 +934,7 @@ int lmv_getattr_name(struct obd_export *exp, struct ll_fid *fid,
 	if (rc)
 		RETURN(rc);
 repeat:
+        LASSERT(++loop <= 2);
         obj = lmv_grab_obj(obd, fid);
         if (obj) {
                 /* directory is splitted. look for right mds for this name */
@@ -931,7 +949,7 @@ repeat:
                (unsigned long)rfid.mds, (unsigned long)rfid.id,
                (unsigned long)rfid.generation);
 
-        rc = md_getattr_name(lmv->tgts[mds].ltd_exp, &rfid, filename,
+        rc = md_getattr_name(lmv->tgts[rfid.mds].ltd_exp, &rfid, filename,
                              namelen, valid, ea_size, request);
         if (rc == 0) {
                 /* this could be cross-node reference. in this case all we have
@@ -1099,10 +1117,10 @@ int lmv_setattr(struct obd_export *exp, struct mdc_op_data *data,
 {
         struct obd_device *obd = exp->exp_obd;
         struct lmv_obd *lmv = &obd->u.lmv;
-        int rc = 0, i = data->fid1.mds;
         struct ptlrpc_request *req;
         struct mds_body *body;
         struct lmv_obj *obj;
+        int rc = 0, i;
         ENTRY;
 
         rc = lmv_check_connect(obd);
@@ -1120,8 +1138,8 @@ int lmv_setattr(struct obd_export *exp, struct mdc_op_data *data,
                 for (i = 0; i < obj->objcount; i++) {
                         data->fid1 = obj->objs[i].fid;
                         
-                        rc = md_setattr(lmv->tgts[i].ltd_exp, data, iattr,
-                                        ea, ealen, ea2, ea2len, &req);
+                        rc = md_setattr(lmv->tgts[data->fid1.mds].ltd_exp, data,
+                                        iattr, ea, ealen, ea2, ea2len, &req);
 
                         if (fid_equal(&obj->fid, &obj->objs[i].fid)) {
                                 /* this is master object and this request should
@@ -1136,14 +1154,14 @@ int lmv_setattr(struct obd_export *exp, struct mdc_op_data *data,
                 }
                 lmv_put_obj(obj);
         } else {
-                LASSERT(i < lmv->desc.ld_tgt_count);
-                rc = md_setattr(lmv->tgts[i].ltd_exp, data, iattr, ea,
-                                ealen, ea2, ea2len, request); 
+                LASSERT(data->fid1.mds < lmv->desc.ld_tgt_count);
+                rc = md_setattr(lmv->tgts[data->fid1.mds].ltd_exp, data,
+                                iattr, ea, ealen, ea2, ea2len, request); 
                 if (rc == 0) {
                         body = lustre_msg_buf((*request)->rq_repmsg, 0,
                                               sizeof(*body));
                         LASSERT(body != NULL);
-                        LASSERT(body->mds == i);
+                        LASSERT(body->mds == data->fid1.mds);
                 }
         }
         RETURN(rc);
@@ -1161,7 +1179,8 @@ int lmv_sync(struct obd_export *exp, struct ll_fid *fid,
 	if (rc)
 		RETURN(rc);
 
-        rc = md_sync(lmv->tgts[0].ltd_exp, fid, request); 
+        CWARN("%s: ->m_sync() isn't implemented yet\n", obd->obd_name);
+        rc = md_sync(lmv->tgts[fid->mds].ltd_exp, fid, request); 
         RETURN(rc);
 }
 
@@ -1282,13 +1301,14 @@ int lmv_unlink_slaves(struct obd_export *exp, struct mdc_op_data *data,
 
         LASSERT(mea != NULL);
         for (i = 0; i < mea->mea_count; i++) {
-                if (lmv->tgts[i].ltd_exp == NULL)
-                        continue;
-
                 memset(&data2, 0, sizeof(data2));
                 data2.fid1 = mea->mea_fids[i];
                 data2.create_mode = MDS_MODE_DONT_LOCK | S_IFDIR;
                 mds = data2.fid1.mds;
+
+                if (lmv->tgts[mds].ltd_exp == NULL)
+                        continue;
+
                 rc = md_unlink(lmv->tgts[mds].ltd_exp, &data2, req);
                 CDEBUG(D_OTHER, "unlink slave %lu/%lu/%lu -> %d\n",
                        (unsigned long) mea->mea_fids[i].mds,
@@ -1369,6 +1389,7 @@ struct obd_device *lmv_get_real_obd(struct obd_export *exp,
         rc = lmv_check_connect(obd);
 	if (rc)
 		RETURN(ERR_PTR(rc));
+#warning "we need well-desgined readdir() implementation to remove this mess"
         obd = lmv->tgts[0].ltd_exp->exp_obd;
         EXIT;
         return obd;
@@ -1396,6 +1417,11 @@ int lmv_init_ea_size(struct obd_export *exp, int easize, int cookiesize)
                 RETURN(0);
 
         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+                if (lmv->tgts[i].ltd_exp == NULL) {
+                        CWARN("%s: NULL export for %d\n", obd->obd_name, i);
+                        continue;
+                }
+
                 rc = obd_init_ea_size(lmv->tgts[i].ltd_exp, easize, cookiesize);
                 if (rc) {
                         CERROR("obd_init_ea_size() failed on MDT target %d, "
@@ -1436,9 +1462,10 @@ int lmv_obd_create(struct obd_export *exp, struct obdo *oa,
 {
         struct obd_device *obd = exp->exp_obd;
         struct lmv_obd *lmv = &obd->u.lmv;
-        struct mea *mea;
         int i, c, rc = 0;
+        struct mea *mea;
         struct ll_fid mfid;
+        int lcount;
         ENTRY;
 
         rc = lmv_check_connect(obd);
@@ -1473,9 +1500,8 @@ int lmv_obd_create(struct obd_export *exp, struct obdo *oa,
                 mea->mea_count = lmv->desc.ld_tgt_count;
 
         mea->mea_master = -1;
-        
-        for (i = 0, c = 0; c < mea->mea_count && 
-                i < lmv->desc.ld_tgt_count; i++) {
+        lcount = lmv->desc.ld_tgt_count;
+        for (i = 0, c = 0; c < mea->mea_count && i < lcount; i++) {
                 struct lov_stripe_md obj_md;
                 struct lov_stripe_md *obj_mdp = &obj_md;
                
@@ -1510,8 +1536,6 @@ int lmv_obd_create(struct obd_export *exp, struct obdo *oa,
                 c++;
                 CDEBUG(D_OTHER, "dirobj at mds %d: "LPU64"/%u\n",
                        i, oa->o_id, oa->o_generation);
-                CDEBUG(D_ERROR, "dirobj at mds %d: "LPU64"/%u\n",
-                       i, oa->o_id, oa->o_generation);
         }
         LASSERT(c == mea->mea_count);
         CDEBUG(D_OTHER, "%d dirobjects created\n", (int) mea->mea_count);
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
index dd203ae174..5f9f3266fe 100644
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -304,6 +304,7 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
         if (!inode)
                 RETURN(ERR_PTR(-ENOENT));
 
+#warning "I think we need something another here -bzzz"
 #if 0
         /* here we disabled generation check, as root inode i_generation
          * of cache mds and real mds are different. */
@@ -821,9 +822,9 @@ int mds_check_mds_num(struct obd_device *obd, struct inode* inode,
                  * should live at this MDS or at another one */
                 int i;
                 i = mea_name2idx(mea, name, namelen - 1);
-                if (mea->mea_master != i) {
-                        CERROR("inapropriate MDS(%d) for %s. should be %d\n",
-                                mea->mea_master, name, i);
+                if (mea->mea_master != mea->mea_fids[i].mds) {
+                        CERROR("inapropriate MDS(%d) for %s. should be %d(%d)\n",
+                               mea->mea_master, name, mea->mea_fids[i].mds, i);
                         rc = -ERESTART;
                 }
         }
@@ -1450,10 +1451,11 @@ repeat:
                 rc = fsfilt_set_md(obd, new->d_inode, handle, mea, mealen);
                 up(&new->d_inode->i_sem);
                 OBD_FREE(mea, mealen);
+                CDEBUG(D_OTHER, "%s: mark non-splittable %lu/%u - %d\n",
+                       obd->obd_name, new->d_inode->i_ino,
+                       new->d_inode->i_generation, flags);
         } else if (rc == 0 && body->oa.o_easize) {
-                flags = mds_try_to_split_dir(obd, new, NULL, body->oa.o_easize);
-                CERROR("%s: splitted %lu/%u - %d\n", obd->obd_name,
-                       new->d_inode->i_ino, new->d_inode->i_generation, flags);
+                mds_try_to_split_dir(obd, new, NULL, body->oa.o_easize);
         }
 
 cleanup:
diff --git a/lustre/mds/mds_lmv.c b/lustre/mds/mds_lmv.c
index 8b72d9c186..0b7eeac1be 100644
--- a/lustre/mds/mds_lmv.c
+++ b/lustre/mds/mds_lmv.c
@@ -537,15 +537,14 @@ int mds_try_to_split_dir(struct obd_device *obd, struct dentry *dentry,
                 return 0;
         if (rc == MDS_NO_SPLIT_EXPECTED && nstripes == 0)
                 return 0;
+        if (nstripes && nstripes == 1)
+                return 0;
         
         LASSERT(mea == NULL || *mea == NULL);
 
         CDEBUG(D_OTHER, "%s: split directory %u/%lu/%lu\n",
                obd->obd_name, mds->mds_num, dir->i_ino,
                (unsigned long) dir->i_generation);
-        CDEBUG(D_ERROR, "%s: split directory %u/%lu/%lu: %d/%d\n",
-               obd->obd_name, mds->mds_num, dir->i_ino,
-               (unsigned long) dir->i_generation, rc, nstripes);
 
         if (mea == NULL)
                 mea = &tmea;
diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c
index c0fe7ef2e5..9c86d33d78 100644
--- a/lustre/mds/mds_open.c
+++ b/lustre/mds/mds_open.c
@@ -905,10 +905,12 @@ int mds_open(struct mds_update_record *rec, int offset,
                  * should live at this MDS or at another one */
                 int i;
                 i = mea_name2idx(mea, rec->ur_name, rec->ur_namelen - 1);
-                if (mea->mea_master != i) {
-                        CERROR("inapropriate MDS(%d) for %lu/%u:%s. should be %d\n",
-                                mea->mea_master, dparent->d_inode->i_ino,
-                                dparent->d_inode->i_generation, rec->ur_name, i);
+                if (mea->mea_master != mea->mea_fids[i].mds) {
+                        CERROR("%s: inapropriate MDS(%d) for %lu/%u:%s."
+                               " should be %d(%d)\n", obd->obd_name,
+                               mea->mea_master, dparent->d_inode->i_ino,
+                               dparent->d_inode->i_generation, rec->ur_name,
+                               mea->mea_fids[i].mds, i);
                         GOTO(cleanup, rc = -ERESTART);
                 }
         }
diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c
index 5e7886e631..870ce10390 100644
--- a/lustre/mds/mds_reint.c
+++ b/lustre/mds/mds_reint.c
@@ -602,10 +602,12 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                  * should live at this MDS or at another one */
                 int i;
                 i = mea_name2idx(mea, rec->ur_name, rec->ur_namelen - 1);
-                if (mea->mea_master != i) {
-                        CERROR("inapropriate MDS(%d) for %lu/%u:%s. should be %d\n",
-                                mea->mea_master, dparent->d_inode->i_ino,
-                                dparent->d_inode->i_generation, rec->ur_name, i);
+                if (mea->mea_master != mea->mea_fids[i].mds) {
+                        CERROR("inapropriate MDS(%d) for %lu/%u:%s."
+                               " should be %d(%d)\n",
+                               mea->mea_master, dparent->d_inode->i_ino,
+                               dparent->d_inode->i_generation, rec->ur_name,
+                               mea->mea_fids[i].mds, i);
                         GOTO(cleanup, rc = -ERESTART);
                 }
         }
diff --git a/lustre/tests/sanity-lmv.sh b/lustre/tests/sanity-lmv.sh
index c9d6a538e6..29df4305a0 100644
--- a/lustre/tests/sanity-lmv.sh
+++ b/lustre/tests/sanity-lmv.sh
@@ -298,6 +298,28 @@ test_3a() {
 }
 run_test 3a " dir splitting with cross-ref ============================="
 
+test_3b() {
+	mkdir $DIR/3b1 || error
+	createmany -m $DIR/3b1/f 5000 || error
+	rm -rf $DIR/3b1 || error
+}
+run_test 3b " dir splitting via createmany -m ============================="
+
+test_3c() {
+	mkdir $DIR/3c1 || error
+	echo "MDS nodes: $MDSCOUNT"
+	for j in `seq 3`; do
+		for i in `seq 10`; do
+			$LFS dirstripe $DIR/3c1/d-${j}-${i} $j || error
+			createmany -m $DIR/3c1/d-${j}-${i}/m 200 || error
+			createmany -o $DIR/3c1/d-${j}-${i}/o 200 || error
+		done
+	done
+	rm -rf $DIR/3c1 || error
+}
+
+run_test 3c " dir splitting via lfs stripe ============================="
+
 TMPDIR=$OLDTMPDIR
 TMP=$OLDTMP
 HOME=$OLDHOME
-- 
GitLab