diff --git a/lustre/ChangeLog b/lustre/ChangeLog index aa9a7e4d9c1e08dc7d62e6ad980fa974556d9b3a..a8163579f7d4fd043e4cf34394a5a8cedbf91847 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -115,6 +115,13 @@ Details : Assetion hit is result of rare race between disconnect and connet to same nid. target_handle_connect found old connect cockie and tried to reconnect, but can't find export for this cockie. +Severity : normal +Bugzilla : 11756 +Frequency : rare +Description: umount blocks forever on error +Details : Due to incorrect use of the obd_no_recov and obd_force flags, the + client can hang if a cancel or some other request is lost. + -------------------------------------------------------------------------------- 2007-07-30 Cluster File Systems, Inc. <info@clusterfs.com> diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 44feeba2fad61144800f3cd3434cbd37d99d1f5b..f6edb49d519cb86e3e48b9ce1f03932655f105b1 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -464,17 +464,17 @@ int client_disconnect_export(struct obd_export *exp) (void)ptlrpc_pinger_del_import(imp); if (obd->obd_namespace != NULL) { - /* obd_no_recov == local only */ + /* obd_force == local only */ ldlm_cli_cancel_unused(obd->obd_namespace, NULL, - obd->obd_no_recov ? LDLM_FL_LOCAL_ONLY:0, + obd->obd_force ? LDLM_FL_LOCAL_ONLY:0, NULL); ldlm_namespace_free_prior(obd->obd_namespace); to_be_freed = obd->obd_namespace; obd->obd_namespace = NULL; } - /* Yeah, obd_no_recov also (mainly) means "forced shutdown". */ - if (!obd->obd_no_recov) + /* Yeah, obd_force means "forced shutdown". 
*/ + if (!obd->obd_force) rc = ptlrpc_disconnect_import(imp, 0); ptlrpc_invalidate_import(imp); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 19b4091d04d8e2b31b392f39d1b2cfa0fdffb38b..e55a184cc4714d28b42526a044d941f84081b402 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -68,7 +68,7 @@ static int ll_close_inode_openhandle(struct inode *inode, * canceling "open lock" and we do not call mdc_close() in this case, as * it will not be successful, as import is already deactivated. */ - if (obd->obd_no_recov) + if (obd->obd_force) GOTO(out, rc = 0); OBDO_ALLOC(oa); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index ef072e50514c0ffec700bbaf5dc0508c92b1e100..47f4cdf733a09c73ccfd93ac93124937c2e5dcda 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -1016,7 +1016,7 @@ void ll_put_super(struct super_block *sb) if (sbi->ll_mdc_exp) { obd = class_exp2obd(sbi->ll_mdc_exp); if (obd) - force = obd->obd_no_recov; + force = obd->obd_force; } /* We need to set force before the lov_disconnect in @@ -1824,7 +1824,7 @@ void ll_umount_begin(struct super_block *sb) EXIT; return; } - obd->obd_no_recov = 1; + obd->obd_force = 1; obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_mdc_exp, sizeof ioc_data, &ioc_data, NULL); @@ -1836,7 +1836,7 @@ void ll_umount_begin(struct super_block *sb) return; } - obd->obd_no_recov = 1; + obd->obd_force = 1; obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_osc_exp, sizeof ioc_data, &ioc_data, NULL); diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index d75d1e90e1caff29271a4b05a740fc081f1dc182..52d95743e9a3bfd97b12d4018910fef480bae63a 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -269,6 +269,9 @@ int ptlrpc_pinger_add_import(struct obd_import *imp) mutex_down(&pinger_sem); CDEBUG(D_HA, "adding pingable import %s->%s\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we add to pinger we want recovery on this import */ + imp->imp_obd->obd_no_recov = 0; 
+ ptlrpc_update_next_ping(imp); /* XXX sort, blah blah */ list_add_tail(&imp->imp_pinger_chain, &pinger_imports); @@ -290,6 +293,8 @@ int ptlrpc_pinger_del_import(struct obd_import *imp) list_del_init(&imp->imp_pinger_chain); CDEBUG(D_HA, "removing pingable import %s->%s\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we remove from pinger we don't want recovery on this import */ + imp->imp_obd->obd_no_recov = 1; class_import_put(imp); mutex_up(&pinger_sem); RETURN(0); diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index f207d8182218697bd010f75c7bffe32e97b06006..b40c7c0a701da55041a1551a8050c2de67ebbe8d 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -87,6 +87,16 @@ stop_ost2() { stop ost2 -f || return 93 } +start_client() { + echo "start client on `facet_active_host client`" + start client || return 99 +} + +stop_client() { + echo "stop client on `facet_active_host client`" + stop client || return 100 +} + mount_client() { local MOUNTPATH=$1 echo "mount $FSNAME on ${MOUNTPATH}....." @@ -109,8 +119,12 @@ umount_client() { } manual_umount_client(){ + local rc + local FORCE=$1 echo "manual umount lustre on ${MOUNT}...." - do_facet client "umount -d $MOUNT" + do_facet client "umount -d ${FORCE} $MOUNT" + rc=$? 
+ return $rc } setup() { @@ -1079,6 +1093,7 @@ test_32a() { # mount a second time to make sure we didnt leave upgrade flag on $TUNEFS --dryrun $TMP/$tdir/mds || error "tunefs failed" + load_modules start mds $TMP/$tdir/mds "-o loop,exclude=lustre-OST0000" || return 12 cleanup_nocli @@ -1153,5 +1168,66 @@ run_test 33 "Mount ost with a large index number" umount_client $MOUNT cleanup_nocli +test_33b() { + setup + + do_facet client dd if=/dev/zero of=$MOUNT/24 bs=1024k count=1 + # Drop lock cancelation reply during umount + #define OBD_FAIL_LDLM_CANCEL 0x304 + do_facet client sysctl -w lustre.fail_loc=0x80000304 + #sysctl -w lnet.debug=-1 + umount_client $MOUNT + cleanup +} +run_test 33b "Drop cancel during umount" + +test_34a() { + setup + do_facet client multiop $DIR/file O_c & + + manual_umount_client + rc=$? + do_facet client killall -USR1 multiop + if [ $rc -eq 0 ]; then + error "umount not fail!" + fi + sleep 1 + cleanup +} +run_test 34a "umount with opened file should be fail" + + +test_34b() { + setup + touch $DIR/$tfile || return 1 + stop_mds --force || return 2 + + manual_umount_client --force + rc=$? + if [ $rc -ne 0 ]; then + error "mtab after failed umount - rc $rc" + fi + + cleanup + return 0 +} +run_test 34b "force umount with failed mds should be normal" + +test_34c() { + setup + touch $DIR/$tfile || return 1 + stop_ost --force || return 2 + + manual_umount_client --force + rc=$? + if [ $rc -ne 0 ]; then + error "mtab after failed umount - rc $rc" + fi + + cleanup + return 0 +} +run_test 34c "force umount with failed ost should be normal" + equals_msg "Done" echo "$0: completed"