From 995c1c5e542b639f42ef2fcf58927b3941e0bd5a Mon Sep 17 00:00:00 2001
From: shadow <shadow>
Date: Tue, 20 May 2008 06:45:57 +0000
Subject: [PATCH] fix ASSERTION(client_stat->nid_exp_ref_count == 0) while
 running acc-small. b=15139 i=tappro i=komal

---
 lustre/ChangeLog                 |  7 ++++
 lustre/include/obd.h             |  2 +-
 lustre/obdclass/class_hash.c     |  2 +-
 lustre/obdclass/lprocfs_status.c | 58 ++++++++++++++++++++------------
 lustre/obdclass/obd_config.c     |  2 +-
 5 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 63a30f0d3c..8b1978375b 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -24,6 +24,13 @@ tbd Sun Microsystems, Inc.
 	  	'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'.
 	  For more information, please refer to bugzilla 13904.
 
+Severity   : normal
+Bugzilla   : 15139
+Frequency  : rare, when clearing nid stats
+Description: ASSERTION(client_stat->nid_exp_ref_count == 0)
+Details    : when clearing nid stats, a live entry was sometimes destroyed,
+             which caused a panic when it was freed.
+
 Severity   : major
 Bugzilla   : 15575
 Description: Stack overflow during MDS log replay
diff --git a/lustre/include/obd.h b/lustre/include/obd.h
index 94593cb04d..40e196de8a 100644
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -793,7 +793,7 @@ struct obd_device {
         cfs_waitq_t             obd_llog_waitq;
         struct list_head        obd_exports;
         int                     obd_num_exports;
-        spinlock_t              nid_lock;
+        spinlock_t              obd_nid_lock;
         struct ldlm_namespace  *obd_namespace;
         struct ptlrpc_client    obd_ldlm_client; /* XXX OST/MDS only */
         /* a spinlock is OK for what we do now, may need a semaphore later */
diff --git a/lustre/obdclass/class_hash.c b/lustre/obdclass/class_hash.c
index c578564251..880cd2bdec 100644
--- a/lustre/obdclass/class_hash.c
+++ b/lustre/obdclass/class_hash.c
@@ -669,7 +669,7 @@ void nidstats_refcount_put(struct hlist_node * actual_hnode)
 
         data = hlist_entry(actual_hnode, struct nid_stat, nid_hash);
         data->nid_exp_ref_count--;
-
+        EXIT;
 }
 
 /*******************************************************************************/
diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c
index 9615614c6d..ac9990fc74 100644
--- a/lustre/obdclass/lprocfs_status.c
+++ b/lustre/obdclass/lprocfs_status.c
@@ -753,10 +753,8 @@ int lprocfs_obd_cleanup(struct obd_device *obd)
         return 0;
 }
 
-void lprocfs_free_client_stats(void *obj, void *data)
+static void lprocfs_free_client_stats(struct nid_stat *client_stat)
 {
-        struct nid_stat *client_stat = obj;
-
         CDEBUG(D_CONFIG, "stat %p - data %p/%p/%p\n", client_stat,
                client_stat->nid_proc, client_stat->nid_stats,
                client_stat->nid_brw_stats);
@@ -765,7 +763,6 @@ void lprocfs_free_client_stats(void *obj, void *data)
                  client_stat->nid_exp_ref_count);
 
         hlist_del_init(&client_stat->nid_hash);
-        list_del(&client_stat->nid_list);
 
         if (client_stat->nid_proc)
                 lprocfs_remove(&client_stat->nid_proc);
@@ -787,10 +784,12 @@ void lprocfs_free_per_client_stats(struct obd_device *obd)
         ENTRY;
 
         /* we need extra list - because hash_exit called to early */
+        /* no locking is needed because all clients are gone */
         while(!list_empty(&obd->obd_nid_stats)) {
                 stat = list_entry(obd->obd_nid_stats.next,
                                   struct nid_stat, nid_list);
-                lprocfs_free_client_stats(stat, NULL);
+                list_del_init(&stat->nid_list);
+                lprocfs_free_client_stats(stat);
         }
 
         EXIT;
@@ -1255,24 +1254,29 @@ EXPORT_SYMBOL(lprocfs_nid_stats_clear_read);
 
 void lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
 {
-        struct nid_stat *client_stat = obj;
+        struct nid_stat *stat = obj;
         int i;
 
-        if(client_stat->nid_exp_ref_count == 1) {
-                hlist_del_init(&client_stat->nid_hash);
-                lprocfs_free_client_stats(client_stat, data);
-                OBD_FREE(client_stat, sizeof(struct nid_stat));
+        /* the object holds only the hash and iterate_all references;
+         * add/delete is blocked by the hash bucket lock */
+        CDEBUG(D_INFO,"refcnt %d\n", stat->nid_exp_ref_count);
+        if(stat->nid_exp_ref_count == 2) {
+                hlist_del_init(&stat->nid_hash);
+                stat->nid_exp_ref_count--;
+                spin_lock(&stat->nid_obd->obd_nid_lock);
+                list_del_init(&stat->nid_list);
+                spin_unlock(&stat->nid_obd->obd_nid_lock);
+                list_add(&stat->nid_list, data);
                 EXIT;
                 return;
         }
         /* we has reference to object - only clear data*/
-        if (client_stat->nid_stats) {
-                lprocfs_clear_stats(client_stat->nid_stats);
-        }
+        if (stat->nid_stats)
+                lprocfs_clear_stats(stat->nid_stats);
 
-        if (client_stat->nid_brw_stats) {
+        if (stat->nid_brw_stats) {
                 for (i = 0; i < BRW_LAST; i++)
-                        lprocfs_oh_clear(&client_stat->nid_brw_stats->hist[i]);
+                        lprocfs_oh_clear(&stat->nid_brw_stats->hist[i]);
         }
         EXIT;
         return;
@@ -1283,10 +1287,17 @@ int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
                                          unsigned long count, void *data)
 {
         struct obd_device *obd = (struct obd_device *)data;
-
+        struct nid_stat *client_stat;
+        CFS_LIST_HEAD(free_list);
 
         lustre_hash_iterate_all(obd->obd_nid_stats_hash_body,
-                                lprocfs_free_client_stats, NULL);
+                                lprocfs_nid_stats_clear_write_cb, &free_list);
+
+        while (!list_empty(&free_list)) {
+                client_stat = list_entry(free_list.next, struct nid_stat, nid_list);
+                list_del_init(&client_stat->nid_list);
+                lprocfs_free_client_stats(client_stat);
+        }
 
         return count;
 }
@@ -1323,6 +1334,11 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
         tmp->nid_obd = exp->exp_obd;
         tmp->nid_exp_ref_count = 1; /* need live in hash after destroy export */
 
+        /* protect concurrent adds to the list; destroy needs no locking */
+        spin_lock(&obd->obd_nid_lock);
+        list_add(&tmp->nid_list, &obd->obd_nid_stats);
+        spin_unlock(&obd->obd_nid_lock);
+
         tmp1= lustre_hash_findadd_unique(obd->obd_nid_stats_hash_body, nid,
                                          &tmp->nid_hash);
         CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
@@ -1348,16 +1364,14 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
         if (rc)
                 CWARN("Error adding the uuid file\n");
 
-        /* protect competitive add to list, not need locking on destroy */
-        spin_lock(&obd->nid_lock);
-        list_add(&tmp->nid_list, &obd->obd_nid_stats);
-        spin_unlock(&obd->nid_lock);
-
         exp->exp_nid_stats = tmp;
         *newnid = 1;
         RETURN(rc);
 
 destroy_new:
+        spin_lock(&obd->obd_nid_lock);
+        list_del(&tmp->nid_list);
+        spin_unlock(&obd->obd_nid_lock);
         OBD_FREE(tmp, sizeof(struct nid_stat));
         RETURN(rc);
 }
diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c
index eba45b5e22..0f98e59013 100644
--- a/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@ -191,7 +191,7 @@ int class_attach(struct lustre_cfg *lcfg)
         CFS_INIT_LIST_HEAD(&obd->obd_exports);
         CFS_INIT_LIST_HEAD(&obd->obd_exports_timed);
         CFS_INIT_LIST_HEAD(&obd->obd_nid_stats);
-        spin_lock_init(&obd->nid_lock);
+        spin_lock_init(&obd->obd_nid_lock);
         spin_lock_init(&obd->obd_dev_lock);
         sema_init(&obd->obd_dev_sem, 1);
         spin_lock_init(&obd->obd_osfs_lock);
-- 
GitLab