From 995c1c5e542b639f42ef2fcf58927b3941e0bd5a Mon Sep 17 00:00:00 2001 From: shadow <shadow> Date: Tue, 20 May 2008 06:45:57 +0000 Subject: [PATCH] fix ASSERTION(client_stat->nid_exp_ref_count == 0) during run acc-small. b=15139 i=tappro i=komal --- lustre/ChangeLog | 7 ++++ lustre/include/obd.h | 2 +- lustre/obdclass/class_hash.c | 2 +- lustre/obdclass/lprocfs_status.c | 58 ++++++++++++++++++++------------ lustre/obdclass/obd_config.c | 2 +- 5 files changed, 46 insertions(+), 25 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 63a30f0d3c..8b1978375b 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -24,6 +24,13 @@ tbd Sun Microsystems, Inc. 'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'. For more information, please refer to bugzilla 13904. +Severity : normal +Bugzilla : 15139 +Frequency : rare, when clearing nid stats +Description: ASSERTION(client_stat->nid_exp_ref_count == 0) +Details : when clearing nid stats, a live entry is sometimes destroyed, + which causes a panic on free. 
+ Severity : major Bugzilla : 15575 Description: Stack overflow during MDS log replay diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 94593cb04d..40e196de8a 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -793,7 +793,7 @@ struct obd_device { cfs_waitq_t obd_llog_waitq; struct list_head obd_exports; int obd_num_exports; - spinlock_t nid_lock; + spinlock_t obd_nid_lock; struct ldlm_namespace *obd_namespace; struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ /* a spinlock is OK for what we do now, may need a semaphore later */ diff --git a/lustre/obdclass/class_hash.c b/lustre/obdclass/class_hash.c index c578564251..880cd2bdec 100644 --- a/lustre/obdclass/class_hash.c +++ b/lustre/obdclass/class_hash.c @@ -669,7 +669,7 @@ void nidstats_refcount_put(struct hlist_node * actual_hnode) data = hlist_entry(actual_hnode, struct nid_stat, nid_hash); data->nid_exp_ref_count--; - + EXIT; } /*******************************************************************************/ diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 9615614c6d..ac9990fc74 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -753,10 +753,8 @@ int lprocfs_obd_cleanup(struct obd_device *obd) return 0; } -void lprocfs_free_client_stats(void *obj, void *data) +static void lprocfs_free_client_stats(struct nid_stat *client_stat) { - struct nid_stat *client_stat = obj; - CDEBUG(D_CONFIG, "stat %p - data %p/%p/%p\n", client_stat, client_stat->nid_proc, client_stat->nid_stats, client_stat->nid_brw_stats); @@ -765,7 +763,6 @@ void lprocfs_free_client_stats(void *obj, void *data) client_stat->nid_exp_ref_count); hlist_del_init(&client_stat->nid_hash); - list_del(&client_stat->nid_list); if (client_stat->nid_proc) lprocfs_remove(&client_stat->nid_proc); @@ -787,10 +784,12 @@ void lprocfs_free_per_client_stats(struct obd_device *obd) ENTRY; /* we need extra list - because hash_exit called to early */ + /* not 
need locking because all clients is died */ while(!list_empty(&obd->obd_nid_stats)) { stat = list_entry(obd->obd_nid_stats.next, struct nid_stat, nid_list); - lprocfs_free_client_stats(stat, NULL); + list_del_init(&stat->nid_list); + lprocfs_free_client_stats(stat); } EXIT; @@ -1255,24 +1254,29 @@ EXPORT_SYMBOL(lprocfs_nid_stats_clear_read); void lprocfs_nid_stats_clear_write_cb(void *obj, void *data) { - struct nid_stat *client_stat = obj; + struct nid_stat *stat = obj; int i; - if(client_stat->nid_exp_ref_count == 1) { - hlist_del_init(&client_stat->nid_hash); - lprocfs_free_client_stats(client_stat, data); - OBD_FREE(client_stat, sizeof(struct nid_stat)); + /* object has only hash + iterate_all references. + * add/delete blocked by hash bucket lock */ + CDEBUG(D_INFO,"refcnt %d\n", stat->nid_exp_ref_count); + if(stat->nid_exp_ref_count == 2) { + hlist_del_init(&stat->nid_hash); + stat->nid_exp_ref_count--; + spin_lock(&stat->nid_obd->obd_nid_lock); + list_del_init(&stat->nid_list); + spin_unlock(&stat->nid_obd->obd_nid_lock); + list_add(&stat->nid_list, data); EXIT; return; } /* we has reference to object - only clear data*/ - if (client_stat->nid_stats) { - lprocfs_clear_stats(client_stat->nid_stats); - } + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); - if (client_stat->nid_brw_stats) { + if (stat->nid_brw_stats) { for (i = 0; i < BRW_LAST; i++) - lprocfs_oh_clear(&client_stat->nid_brw_stats->hist[i]); + lprocfs_oh_clear(&stat->nid_brw_stats->hist[i]); } EXIT; return; @@ -1283,10 +1287,17 @@ int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, unsigned long count, void *data) { struct obd_device *obd = (struct obd_device *)data; - + struct nid_stat *client_stat; + CFS_LIST_HEAD(free_list); lustre_hash_iterate_all(obd->obd_nid_stats_hash_body, - lprocfs_free_client_stats, NULL); + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, 
nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } return count; } @@ -1323,6 +1334,11 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) tmp->nid_obd = exp->exp_obd; tmp->nid_exp_ref_count = 1; /* need live in hash after destroy export */ + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&tmp->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + tmp1= lustre_hash_findadd_unique(obd->obd_nid_stats_hash_body, nid, &tmp->nid_hash); CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", @@ -1348,16 +1364,14 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) if (rc) CWARN("Error adding the uuid file\n"); - /* protect competitive add to list, not need locking on destroy */ - spin_lock(&obd->nid_lock); - list_add(&tmp->nid_list, &obd->obd_nid_stats); - spin_unlock(&obd->nid_lock); - exp->exp_nid_stats = tmp; *newnid = 1; RETURN(rc); destroy_new: + spin_lock(&obd->obd_nid_lock); + list_del(&tmp->nid_list); + spin_unlock(&obd->obd_nid_lock); OBD_FREE(tmp, sizeof(struct nid_stat)); RETURN(rc); } diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index eba45b5e22..0f98e59013 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -191,7 +191,7 @@ int class_attach(struct lustre_cfg *lcfg) CFS_INIT_LIST_HEAD(&obd->obd_exports); CFS_INIT_LIST_HEAD(&obd->obd_exports_timed); CFS_INIT_LIST_HEAD(&obd->obd_nid_stats); - spin_lock_init(&obd->nid_lock); + spin_lock_init(&obd->obd_nid_lock); spin_lock_init(&obd->obd_dev_lock); sema_init(&obd->obd_dev_sem, 1); spin_lock_init(&obd->obd_osfs_lock); -- GitLab