diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 41fb301dfc9d6275190afd75c4bfad4ea3fb0c21..c39cb6f5793bc59bdb4bca04ba64fa574d0c0973 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -39,6 +39,7 @@ extern unsigned int obd_fail_loc; extern unsigned int obd_timeout; extern char obd_lustre_upcall[128]; extern unsigned int obd_sync_filter; +extern wait_queue_head_t obd_race_waitq; #define OBD_FAIL_MDS 0x100 #define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 @@ -129,6 +130,7 @@ extern unsigned int obd_sync_filter; #define OBD_FAIL_OBD_LOGD_NET 0x602 #define OBD_FAIL_TGT_REPLY_NET 0x700 +#define OBD_FAIL_TGT_CONN_RACE 0x701 /* preparation for a more advanced failure testbed (not functional yet) */ #define OBD_FAIL_MASK_SYS 0x0000FF00 @@ -174,6 +176,22 @@ do { \ } \ } while(0) +/* The idea here is to synchronise two threads to force a race. The + * first thread that calls this with a matching fail_loc is put to + * sleep. The next thread that calls with the same fail_loc wakes up + * the first and continues. 
*/ +#define OBD_RACE(id) \ +do { \ + if (OBD_FAIL_CHECK_ONCE(id)) { \ + CERROR("obd_race id %x sleeping\n", (id)); \ + sleep_on(&obd_race_waitq); \ + CERROR("obd_race id %x awake\n", (id)); \ + } else if ((obd_fail_loc & OBD_FAIL_MASK_LOC) == \ + ((id) & OBD_FAIL_MASK_LOC)) { \ + wake_up(&obd_race_waitq); \ + } \ +} while(0) + #define fixme() CDEBUG(D_OTHER, "FIXME\n"); #ifdef __KERNEL__ diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index bcaed00f4109f7431282dd97d8159b2ad8e08029..0d514db4a2e3c71806edfa418d95caddab6d1211 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -372,8 +372,11 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) struct list_head *p; char *str, *tmp; int rc = 0, abort_recovery; + unsigned long flags; ENTRY; + OBD_RACE(OBD_FAIL_TGT_CONN_RACE); + LASSERT_REQSWAB (req, 0); str = lustre_msg_string(req->rq_reqmsg, 0, sizeof(tgtuuid) - 1); if (str == NULL) { @@ -386,7 +389,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) if (!target) { target = class_name2obd(str); } - + if (!target || target->obd_stopping || !target->obd_set_up) { CERROR("UUID '%s' is not available for connect\n", str); GOTO(out, rc = -ENODEV); @@ -498,6 +501,17 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) export = req->rq_export = class_conn2export(&conn); LASSERT(export != NULL); + spin_lock_irqsave(&export->exp_lock, flags); + if (export->exp_conn_cnt >= req->rq_reqmsg->conn_cnt) { + CERROR("%s: already connected at conn_cnt %d >= requested %d\n", + cluuid.uuid, export->exp_conn_cnt, + req->rq_reqmsg->conn_cnt); + spin_unlock_irqrestore(&export->exp_lock, flags); + GOTO(out, rc = -EALREADY); + } + export->exp_conn_cnt = req->rq_reqmsg->conn_cnt; + spin_unlock_irqrestore(&export->exp_lock, flags); + /* request from liblustre? 
*/ if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) export->exp_libclient = 1; @@ -507,9 +521,6 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) export->exp_connection = ptlrpc_get_connection(&req->rq_peer, &remote_uuid); - LASSERT(export->exp_conn_cnt < req->rq_reqmsg->conn_cnt); - export->exp_conn_cnt = req->rq_reqmsg->conn_cnt; - if (rc == EALREADY) { /* We indicate the reconnection in a flag, not an error code. */ lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 5e2c305243d6bb8eb8aacba9d4ba297785cb8c31..e3351a6b956063183a72057787469340eb82af06 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -88,6 +88,8 @@ unsigned int obd_timeout = 100; char obd_lustre_upcall[128] = "DEFAULT"; /* or NONE or /full/path/to/upcall */ unsigned int obd_sync_filter; /* = 0, don't sync by default */ +DECLARE_WAIT_QUEUE_HEAD(obd_race_waitq); + #ifdef __KERNEL__ /* opening /dev/obd */ static int obd_class_open(struct inode * inode, struct file * file) @@ -375,6 +377,7 @@ void *obd_psdev = NULL; EXPORT_SYMBOL(obd_dev); EXPORT_SYMBOL(obdo_cachep); EXPORT_SYMBOL(obd_fail_loc); +EXPORT_SYMBOL(obd_race_waitq); EXPORT_SYMBOL(obd_timeout); EXPORT_SYMBOL(obd_lustre_upcall); EXPORT_SYMBOL(obd_sync_filter); diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 119ca9950a1b46fe3f7fafe855dc3f26b79eae71..a90a6e108b27407c822c5ff101c61e13c179eb7d 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -138,7 +138,7 @@ void lprocfs_remove(struct proc_dir_entry *root) LASSERT(root != NULL); parent = root->parent; LASSERT(parent != NULL); - + while (1) { while (temp->subdir != NULL) temp = temp->subdir; diff --git a/lustre/obdclass/sysctl.c b/lustre/obdclass/sysctl.c index f4749855cb06a408c75b0ef5060bf7e05f85376a..8c93a481863bf7df1b3824672728460922521c86 100644 --- 
a/lustre/obdclass/sysctl.c +++ b/lustre/obdclass/sysctl.c @@ -54,11 +54,14 @@ enum { OBD_SYNCFILTER, /* XXX temporary, as we play with sync osts.. */ }; +int proc_fail_loc(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp); + static ctl_table obd_table[] = { {OBD_FAIL_LOC, "fail_loc", &obd_fail_loc, sizeof(int), 0644, NULL, - &proc_dointvec}, + &proc_fail_loc}, {OBD_TIMEOUT, "timeout", &obd_timeout, sizeof(int), 0644, NULL, &proc_dointvec}, /* XXX need to lock so we avoid update races with recovery upcall! */ {OBD_UPCALL, "upcall", obd_lustre_upcall, 128, 0644, NULL, &proc_dostring, &sysctl_string }, @@ -88,3 +91,18 @@ void obd_sysctl_clean (void) obd_table_header = NULL; #endif } + +/* Handler for the fail_loc sysctl: apply the access via proc_dointvec() + * and, if obd_fail_loc changed, wake the threads sleeping on + * obd_race_waitq (i.e. a thread parked in OBD_RACE()). */ +int proc_fail_loc(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + int rc; + int old_fail_loc = obd_fail_loc; + + rc = proc_dointvec(table,write,filp,buffer,lenp); + if (old_fail_loc != obd_fail_loc) + wake_up(&obd_race_waitq); + return rc; +} diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index ef241b2fecc1758b664fb6a461e302912a5b38c1..8b1c6e308d5521cc0f5626aead2f56a3b57879a8 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -854,6 +854,16 @@ test_43() { } run_test 43 "mds osc import failure during recovery; don't LBUG" +test_44() { + mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices` + do_facet mds "sysctl -w lustre.fail_loc=0x80000701" + $LCTL --device $mdcdev recover + df $MOUNT + do_facet mds "sysctl -w lustre.fail_loc=0" + return 0 +} +run_test 44 "race in target handle connect" + equals_msg test complete, cleaning up $CLEANUP