From 50e3ab3c73cd70f6616a5baf4fa6a4c1a84a7e56 Mon Sep 17 00:00:00 2001 From: rread <rread> Date: Mon, 12 Jul 2004 22:19:10 +0000 Subject: [PATCH] b=3405 - Fix insanity.sh, still using the insanity-local.sh config by default. - More cleanups in replay-single.sh. We need to generalize failures a bit more, so we can choose a different mds each time, if more than one is available. --- lustre/tests/cfg/insanity-local.sh | 5 +- lustre/tests/insanity.sh | 68 +++++++++++-------- lustre/tests/replay-single.sh | 104 ++++++++++++++--------------- 3 files changed, 96 insertions(+), 81 deletions(-) diff --git a/lustre/tests/cfg/insanity-local.sh b/lustre/tests/cfg/insanity-local.sh index 1e0c6e974b..a4839c80a1 100644 --- a/lustre/tests/cfg/insanity-local.sh +++ b/lustre/tests/cfg/insanity-local.sh @@ -1,5 +1,6 @@ -mds_HOST=${mds_HOST:-`hostname`} -mdsfailover_HOST=${mdsfailover_HOST:-""} +MDSCOUNT=${MDSCOUNT:-1} +mds1_HOST=${mds1_HOST:-`hostname`} +mds1failover_HOST=${mds1failover_HOST:-""} ost1_HOST=${ost1_HOST:-"`hostname`"} ost2_HOST=${ost2_HOST:-"`hostname`"} EXTRA_OSTS=${EXTRA_OSTS:-"`hostname`"} diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 68d0ff9282..9e392c774f 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -14,7 +14,7 @@ ALWAYS_EXCEPT="10" build_test_filter -assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT +assert_env MDSCOUNT mds1_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT #### # Initialize all the ostN_HOST @@ -109,22 +109,32 @@ reintegrate_clients() { gen_config() { rm -f $XMLCONFIG - add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE - - if [ ! -z "$mdsfailover_HOST" ]; then - add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE + if [ "$MDSCOUNT" -gt 1 ]; then + add_lmv lmv1 + for mds in `mds_list`; do + MDSDEV=$TMP/${mds}-`hostname` + add_mds $mds --dev $MDSDEV --size $MDSSIZE --lmv lmv1 + done + MDS=lmv1 + add_lov_to_lmv lov1 lmv1 --stripe_sz $STRIPE_BYTES \ + --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 + else + add_mds mds1 --dev $MDSDEV --size $MDSSIZE + if [ ! -z "$mds1failover_HOST" ]; then + add_mdsfailover mds1 --dev $MDSDEV --size $MDSSIZE + fi + add_lov lov1 mds1 --stripe_sz $STRIPE_BYTES \ + --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 + MDS=mds1_svc fi - add_lov lov1 mds --stripe_sz $STRIPE_BYTES\ - --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 for i in `seq $NUMOST`; do dev=`printf $OSTDEV $i` add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \ --journal-size $OSTJOURNALSIZE done - - add_client client mds --lov lov1 --path $MOUNT + add_client client --mds $MDS --lov lov1 --path $MOUNT } setup() { @@ -134,8 +144,10 @@ setup() { start ost$i ${REFORMAT} $OSTLCONFARGS done [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE - wait_for mds - start mds $MDSLCONFARGS ${REFORMAT} + for mds in `mds_list`; do + wait_for $mds + start $mds $MDSLCONFARGS ${REFORMAT} + done while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT @@ -144,7 +156,9 @@ setup() { cleanup() { zconf_umount $CLIENTS $MOUNT - stop mds ${FORCE} $MDSLCONFARGS || : + for mds in `mds_list`; do + stop $mds ${FORCE} $MDSLCONFARGS || : + done for i in `seq $NUMOST`; do stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || : done @@ -228,7 +242,7 @@ echo "Starting Test 17 at `date`" test_0() { echo "Failover MDS" - facet_failover mds + facet_failover mds1 echo "Waiting for df pid: $DFPID" wait $DFPID || return 1 @@ -259,12 +273,12 @@ test_2() { client_df echo "Failing MDS" - shutdown_facet mds - reboot_facet mds + shutdown_facet mds1 + reboot_facet mds1 # prepare for MDS failover - change_active mds - reboot_facet mds + change_active mds1 + reboot_facet mds1 client_df & DFPID=$! @@ -279,8 +293,8 @@ test_2() { start ost1 echo "Failover MDS" - wait_for mds - start mds + wait_for mds1 + start mds1 #Check FS wait $DFPID @@ -299,7 +313,7 @@ test_3() { echo "Verify Lustre filesystem is up and running" #MDS Portion - facet_failover mds + facet_failover mds1 wait $DFPID || echo df failed: $? #Check FS @@ -337,12 +351,12 @@ test_4() { #MDS Portion echo "Failing MDS" - shutdown_facet mds - reboot_facet mds + shutdown_facet mds1 + reboot_facet mds1 # prepare for MDS failover - change_active mds - reboot_facet mds + change_active mds1 + reboot_facet mds1 client_df & DFPID=$! @@ -355,8 +369,8 @@ test_4() { start ost1 echo "Failover MDS" - wait_for mds - start mds + wait_for mds1 + start mds1 #Check FS wait $DFPID @@ -479,7 +493,7 @@ test_7() { #MDS Portion echo "Failing MDS" - facet_failover mds + facet_failover mds1 #Check FS echo "Test Lustre stability after MDS failover" diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index e3da13baec..2a6cddb303 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -34,7 +34,7 @@ gen_config() { --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 else add_mds mds1 --dev $MDSDEV --size $MDSSIZE - if [ ! -z "$mdsfailover_HOST" ]; then + if [ ! -z "$mds1failover_HOST" ]; then add_mdsfailover mds1 --dev $MDSDEV --size $MDSSIZE fi add_lov lov1 mds1 --stripe_sz $STRIPE_BYTES \ @@ -110,16 +110,16 @@ test_0b() { run_test 0b "ensure object created after recover exists. (3284)" test_1() { - replay_barrier mds2 + replay_barrier mds1 mcreate $DIR/$tfile - fail mds2 + fail mds1 $CHECKSTAT -t file $DIR/$tfile || return 1 rm $DIR/$tfile } run_test 1 "simple create" test_2a() { - replay_barrier mds + replay_barrier mds1 touch $DIR/$tfile fail mds1 $CHECKSTAT -t file $DIR/$tfile || return 1 @@ -129,7 +129,7 @@ run_test 2a "touch" test_2b() { ./mcreate $DIR/$tfile - replay_barrier mds + replay_barrier mds1 touch $DIR/$tfile fail mds1 $CHECKSTAT -t file $DIR/$tfile || return 1 @@ -138,7 +138,7 @@ test_2b() { run_test 2b "touch" test_3a() { - replay_barrier mds + replay_barrier mds1 mcreate $DIR/$tfile o_directory $DIR/$tfile fail mds1 @@ -148,7 +148,7 @@ test_3a() { run_test 3a "replay failed open(O_DIRECTORY)" test_3b() { - replay_barrier mds + replay_barrier mds1 #define OBD_FAIL_MDS_OPEN_PACK | OBD_FAIL_ONCE do_facet mds "sysctl -w lustre.fail_loc=0x80000114" touch $DIR/$tfile @@ -160,7 +160,7 @@ test_3b() { run_test 3b "replay failed open -ENOMEM" test_3c() { - replay_barrier mds + replay_barrier mds1 #define OBD_FAIL_MDS_ALLOC_OBDO | OBD_FAIL_ONCE do_facet mds "sysctl -w lustre.fail_loc=0x80000128" touch $DIR/$tfile @@ -173,7 +173,7 @@ test_3c() { run_test 3c "replay failed open -ENOMEM" test_4() { - replay_barrier mds + replay_barrier mds1 for i in `seq 10`; do echo "tag-$i" > $DIR/$tfile-$i done @@ -185,7 +185,7 @@ test_4() { run_test 4 "|x| 10 open(O_CREAT)s" test_4b() { - replay_barrier mds + replay_barrier mds1 rm -rf $DIR/$tfile-* fail mds1 $CHECKSTAT -t file $DIR/$tfile-* && return 1 || true @@ -195,7 +195,7 @@ run_test 4b "|x| rm 10 files" # The idea is to get past the first block of precreated files on both # osts, and then replay. test_5() { - replay_barrier mds + replay_barrier mds1 for i in `seq 220`; do echo "tag-$i" > $DIR/$tfile-$i done @@ -211,7 +211,7 @@ run_test 5 "|x| 220 open(O_CREAT)" test_6() { - replay_barrier mds + replay_barrier mds1 mkdir $DIR/$tdir mcreate $DIR/$tdir/$tfile fail mds1 @@ -223,7 +223,7 @@ test_6() { run_test 6 "mkdir + contained create" test_6b() { - replay_barrier mds + replay_barrier mds1 rm -rf $DIR/$tdir fail mds1 $CHECKSTAT -t dir $DIR/$tdir && return 1 || true @@ -232,7 +232,7 @@ run_test 6b "|X| rmdir" test_7() { mkdir $DIR/$tdir - replay_barrier mds + replay_barrier mds1 mcreate $DIR/$tdir/$tfile fail mds1 $CHECKSTAT -t dir $DIR/$tdir || return 1 @@ -242,7 +242,7 @@ test_7() { run_test 7 "mkdir |X| contained create" test_8() { - replay_barrier mds + replay_barrier mds1 multiop $DIR/$tfile mo_c & MULTIPID=$! sleep 1 @@ -256,7 +256,7 @@ test_8() { run_test 8 "creat open |X| close" test_9() { - replay_barrier mds + replay_barrier mds1 mcreate $DIR/$tfile local old_inum=`ls -i $DIR/$tfile | awk '{print $1}'` fail mds1 @@ -276,7 +276,7 @@ run_test 9 "|X| create (same inum/gen)" test_10() { mcreate $DIR/$tfile - replay_barrier mds + replay_barrier mds1 mv $DIR/$tfile $DIR/$tfile-2 rm -f $DIR/$tfile fail mds1 @@ -291,7 +291,7 @@ test_11() { mcreate $DIR/$tfile echo "old" > $DIR/$tfile mv $DIR/$tfile $DIR/$tfile-2 - replay_barrier mds + replay_barrier mds1 echo "new" > $DIR/$tfile grep new $DIR/$tfile grep old $DIR/$tfile-2 @@ -308,7 +308,7 @@ test_12() { # give multiop a chance to open sleep 1 rm -f $DIR/$tfile - replay_barrier mds + replay_barrier mds1 kill -USR1 $pid wait $pid || return 1 @@ -329,7 +329,7 @@ test_13() { sleep 1 chmod 0 $DIR/$tfile $CHECKSTAT -p 0 $DIR/$tfile - replay_barrier mds + replay_barrier mds1 fail mds1 kill -USR1 $pid wait $pid || return 1 @@ -345,7 +345,7 @@ test_14() { # give multiop a chance to open sleep 1 rm -f $DIR/$tfile - replay_barrier mds + replay_barrier mds1 kill -USR1 $pid || return 1 wait $pid || return 2 @@ -361,7 +361,7 @@ test_15() { # give multiop a chance to open sleep 1 rm -f $DIR/$tfile - replay_barrier mds + replay_barrier mds1 touch $DIR/g11 || return 1 kill -USR1 $pid wait $pid || return 2 @@ -375,7 +375,7 @@ run_test 15 "open(O_CREAT), unlink |X| touch new, close" test_16() { - replay_barrier mds + replay_barrier mds1 mcreate $DIR/$tfile munlink $DIR/$tfile mcreate $DIR/$tfile-2 @@ -387,7 +387,7 @@ test_16() { run_test 16 "|X| open(O_CREAT), unlink, touch new, unlink new" test_17() { - replay_barrier mds + replay_barrier mds1 multiop $DIR/$tfile O_c & pid=$! # give multiop a chance to open @@ -401,7 +401,7 @@ test_17() { run_test 17 "|X| open(O_CREAT), |replay| close" test_18() { - replay_barrier mds + replay_barrier mds1 multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open @@ -425,7 +425,7 @@ run_test 18 "|X| open(O_CREAT), unlink, touch new, close, touch, unlink" # bug 1855 (a simpler form of test_11 above) test_19() { - replay_barrier mds + replay_barrier mds1 mcreate $DIR/$tfile echo "old" > $DIR/$tfile mv $DIR/$tfile $DIR/$tfile-2 @@ -436,7 +436,7 @@ test_19() { run_test 19 "|X| mcreate, open, write, rename " test_20() { - replay_barrier mds + replay_barrier mds1 multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open @@ -452,7 +452,7 @@ test_20() { run_test 20 "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)" test_21() { - replay_barrier mds + replay_barrier mds1 multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open @@ -475,7 +475,7 @@ test_22() { # give multiop a chance to open sleep 1 - replay_barrier mds + replay_barrier mds1 rm -f $DIR/$tfile fail mds1 @@ -492,7 +492,7 @@ test_23() { # give multiop a chance to open sleep 1 - replay_barrier mds + replay_barrier mds1 rm -f $DIR/$tfile touch $DIR/g11 || return 1 @@ -511,7 +511,7 @@ test_24() { # give multiop a chance to open sleep 1 - replay_barrier mds + replay_barrier mds1 fail mds1 rm -f $DIR/$tfile kill -USR1 $pid @@ -528,7 +528,7 @@ test_25() { sleep 1 rm -f $DIR/$tfile - replay_barrier mds + replay_barrier mds1 fail mds1 kill -USR1 $pid wait $pid || return 1 @@ -538,7 +538,7 @@ test_25() { run_test 25 "open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)" test_26() { - replay_barrier mds + replay_barrier mds1 multiop $DIR/$tfile-1 O_tSc & pid1=$! multiop $DIR/$tfile-2 O_tSc & @@ -560,7 +560,7 @@ test_26() { run_test 26 "|X| open(O_CREAT), unlink two, close one, replay, close one (test mds_cleanup_orphans)" test_27() { - replay_barrier mds + replay_barrier mds1 multiop $DIR/$tfile-1 O_tSc & pid1=$! multiop $DIR/$tfile-2 O_tSc & @@ -588,7 +588,7 @@ test_28() { pid2=$! # give multiop a chance to open sleep 1 - replay_barrier mds + replay_barrier mds1 rm -f $DIR/$tfile-1 rm -f $DIR/$tfile-2 kill -USR1 $pid2 @@ -610,7 +610,7 @@ test_29() { pid2=$! # give multiop a chance to open sleep 1 - replay_barrier mds + replay_barrier mds1 rm -f $DIR/$tfile-1 rm -f $DIR/$tfile-2 @@ -635,7 +635,7 @@ test_30() { rm -f $DIR/$tfile-1 rm -f $DIR/$tfile-2 - replay_barrier mds + replay_barrier mds1 fail mds1 kill -USR1 $pid1 wait $pid1 || return 1 @@ -656,7 +656,7 @@ test_31() { sleep 1 rm -f $DIR/$tfile-1 - replay_barrier mds + replay_barrier mds1 rm -f $DIR/$tfile-2 fail mds1 kill -USR1 $pid1 @@ -689,9 +689,9 @@ run_test 32 "close() notices client eviction; close() after client eviction" # Abort recovery before client complete test_33() { - replay_barrier mds + replay_barrier mds1 touch $DIR/$tfile - fail_abort mds + fail_abort mds1 # this file should be gone, because the replay was aborted $CHECKSTAT -t file $DIR/$tfile && return 1 return 0 @@ -705,8 +705,8 @@ test_34() { sleep 1 rm -f $DIR/$tfile - replay_barrier mds - fail_abort mds + replay_barrier mds1 + fail_abort mds1 kill -USR1 $pid [ -e $DIR/$tfile ] && return 1 sync @@ -725,7 +725,7 @@ test_35() { sync sleep 1 # give a chance to remove from MDS - fail_abort mds + fail_abort mds1 $CHECKSTAT -t file $DIR/$tfile && return 1 || true } run_test 35 "test recovery from llog for unlink op" @@ -733,10 +733,10 @@ run_test 35 "test recovery from llog for unlink op" # b=2432 resent cancel after replay uses wrong cookie, # so don't resend cancels test_36() { - replay_barrier mds + replay_barrier mds1 touch $DIR/$tfile checkstat $DIR/$tfile - facet_failover mds + facet_failover mds1 cancel_lru_locks MDC if dmesg | grep "unknown lock cookie"; then echo "cancel after replay failed" @@ -755,10 +755,10 @@ test_37() { sleep 1 rmdir $DIR/$tfile - replay_barrier mds + replay_barrier mds1 # clear the dmesg buffer so we only see errors from this recovery dmesg -c >/dev/null - fail_abort mds + fail_abort mds1 kill -USR1 $pid dmesg | grep "mds_unlink_orphan.*error .* unlinking orphan" && return 1 sync @@ -769,7 +769,7 @@ run_test 37 "abort recovery before client does replay (test mds_cleanup_orphans test_38() { createmany -o $DIR/$tfile-%d 800 unlinkmany $DIR/$tfile-%d 0 400 - replay_barrier mds + replay_barrier mds1 fail mds1 unlinkmany $DIR/$tfile-%d 400 400 sleep 2 @@ -779,7 +779,7 @@ run_test 38 "test recovery from unlink llog (test llog_gen_rec) " test_39() { createmany -o $DIR/$tfile-%d 800 - replay_barrier mds + replay_barrier mds1 unlinkmany $DIR/$tfile-%d 0 400 fail mds1 unlinkmany $DIR/$tfile-%d 400 400 @@ -801,7 +801,7 @@ test_40(){ writeme -s $MOUNT/${tfile}-2 & WRITE_PID=$! sleep 1 - facet_failover mds + facet_failover mds1 #define OBD_FAIL_MDS_CONNECT_NET 0x117 do_facet mds "sysctl -w lustre.fail_loc=0x80000117" kill -USR1 $PID @@ -873,11 +873,11 @@ run_test 42 "recovery after ost failure" # b=2530 # timeout in MDS/OST recovery RPC will LBUG MDS test_43() { - replay_barrier mds + replay_barrier mds1 # OBD_FAIL_OST_CREATE_NET 0x204 do_facet ost "sysctl -w lustre.fail_loc=0x80000204" - facet_failover mds + facet_failover mds1 df $MOUNT || return 1 sleep 10 do_facet ost "sysctl -w lustre.fail_loc=0" -- GitLab