# (extraction residue removed: "Newer"/"Older" web-viewer navigation links)
[ -d /proc/$close_pid ] || return 2
replay_barrier_nodf mds
fail mds
wait $close_pid || return 3
$CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
$CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
rm -rf $DIR/${tdir}-*
}
run_test 53a "|X| close request while two MDC requests in flight"
# |X| open request while two MDC requests in flight: a close is forced while
# an open (whose request is dropped by fail_loc) is still in flight, then the
# MDS is failed and both operations must replay correctly.
test_53b() {
    mkdir -p $DIR/$tdir-1
    mkdir -p $DIR/$tdir-2
    # hold a file open; multiop closes it only on SIGUSR1
    multiop $DIR/$tdir-1/f O_c &
    close_pid=$!

    # NOTE(review): the original fault-injection section here was destroyed by
    # file corruption (stray line-number text). Restored from the canonical
    # Lustre replay-single.sh test_53b -- verify against upstream before use.
    #define OBD_FAIL_MDS_REINT_NET 0x107
    do_facet mds "lctl set_param fail_loc=0x80000107"
    # start an open whose request the fail_loc drops, leaving it in flight
    mcreate $DIR/${tdir}-2/f &
    open_pid=$!
    sleep 1
    do_facet mds "lctl set_param fail_loc=0"

    # let the close proceed while the open is still outstanding
    kill -USR1 $close_pid
    cancel_lru_locks MDC # force the close
    wait $close_pid || return 1
    # open should still be here
    [ -d /proc/$open_pid ] || return 2

    replay_barrier_nodf mds
    fail mds
    wait $open_pid || return 3

    $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
    $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
    rm -rf $DIR/${tdir}-*
}
run_test 53b "|X| open request while two MDC requests in flight"
# |X| open request and close request both in flight across an MDS failover.
# NOTE(review): $open_pid is never assigned in this visible body -- the
# fault-injection section that starts the in-flight open (fail_loc set,
# "mcreate ... &; open_pid=$!") appears to have been lost to file corruption.
# Restore it from upstream replay-single.sh, otherwise "wait $open_pid" waits
# on nothing and the test is vacuous.
test_53c() {
mkdir -p $DIR/${tdir}-1
mkdir -p $DIR/${tdir}-2
# hold a file open; multiop closes it only on SIGUSR1
multiop $DIR/${tdir}-1/f O_c &
close_pid=$!
kill -USR1 $close_pid
cancel_lru_locks MDC # force the close
replay_barrier_nodf mds
fail_nodf mds
wait $open_pid || return 1
sleep 2
# close should be gone
[ -d /proc/$close_pid ] && return 2
$CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
$CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
rm -rf $DIR/${tdir}-*
}
run_test 53c "|X| open request and close request while two MDC requests in flight"
# |X| close reply while two MDC requests in flight: the close's reply is
# dropped by fail_loc, an mcreate proceeds alongside, then the MDS is failed
# and both operations must complete after recovery.
test_53d() {
    mkdir -p $DIR/${tdir}-1
    mkdir -p $DIR/${tdir}-2
    # hold a file open; multiop closes it only on SIGUSR1
    multiop $DIR/${tdir}-1/f O_c &
    close_pid=$!
    # give multiop a chance to open
    sleep 1

    # NOTE(review): the original fault-injection section here was destroyed by
    # file corruption (stray line-number text). Restored from the canonical
    # Lustre replay-single.sh test_53d -- verify against upstream before use.
    # define OBD_FAIL_MDS_CLOSE_NET_REP 0X138
    do_facet mds "lctl set_param fail_loc=0x80000138"
    kill -USR1 $close_pid
    cancel_lru_locks MDC # force the close; its reply is dropped by fail_loc
    sleep 1
    do_facet mds "lctl set_param fail_loc=0"

    mcreate $DIR/${tdir}-2/f || return 1
    # close should still be here
    [ -d /proc/$close_pid ] || return 2

    replay_barrier_nodf mds
    fail mds
    wait $close_pid || return 3

    $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
    $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
    rm -rf $DIR/${tdir}-*
}
run_test 53d "|X| close reply while two MDC requests in flight"
# |X| open reply while two MDC requests in flight across an MDS failover.
# NOTE(review): $open_pid is never assigned in this visible body -- the code
# that should follow the OBD_FAIL_MDS_REINT_NET_REP #define (setting fail_loc
# and starting "mcreate ... &; open_pid=$!") appears to have been lost to
# file corruption. Restore it from upstream replay-single.sh before use.
test_53e() {
mkdir -p $DIR/$tdir-1
mkdir -p $DIR/$tdir-2
# hold a file open; multiop closes it only on SIGUSR1
multiop $DIR/$tdir-1/f O_c &
close_pid=$!
#define OBD_FAIL_MDS_REINT_NET_REP 0x119
kill -USR1 $close_pid
cancel_lru_locks MDC # force the close
wait $close_pid || return 1
# open should still be here
[ -d /proc/$open_pid ] || return 2
replay_barrier_nodf mds
fail mds
wait $open_pid || return 3
$CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
$CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
rm -rf $DIR/${tdir}-*
}
run_test 53e "|X| open reply while two MDC requests in flight"
# |X| open reply and close reply while two MDC requests in flight.
# NOTE(review): $open_pid is never assigned in this visible body -- the
# fault-injection setup that starts the in-flight open appears to have been
# lost to file corruption. Restore from upstream replay-single.sh before use.
test_53f() {
mkdir -p $DIR/${tdir}-1
mkdir -p $DIR/${tdir}-2
# hold a file open; multiop closes it only on SIGUSR1
multiop $DIR/${tdir}-1/f O_c &
close_pid=$!
kill -USR1 $close_pid
cancel_lru_locks MDC
replay_barrier_nodf mds
fail_nodf mds
wait $open_pid || return 1
sleep 2
#close should be gone
[ -d /proc/$close_pid ] && return 2
$CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
$CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
rm -rf $DIR/${tdir}-*
}
run_test 53f "|X| open reply and close reply while two MDC requests in flight"
# |X| drop open reply and close request while close and open are in flight.
# NOTE(review): $open_pid is never assigned in this visible body -- the
# fault-injection setup that starts the in-flight open appears to have been
# lost to file corruption. Restore from upstream replay-single.sh before use.
test_53g() {
mkdir -p $DIR/${tdir}-1
mkdir -p $DIR/${tdir}-2
# hold a file open; multiop closes it only on SIGUSR1
multiop $DIR/${tdir}-1/f O_c &
close_pid=$!
kill -USR1 $close_pid
cancel_lru_locks MDC # force the close
replay_barrier_nodf mds
fail_nodf mds
wait $open_pid || return 1
sleep 2
# close should be gone
[ -d /proc/$close_pid ] && return 2
$CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
$CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
rm -rf $DIR/${tdir}-*
}
run_test 53g "|X| drop open reply and close request while close and open are both in flight"
# |X| open request and close reply while two MDC requests in flight.
# NOTE(review): $open_pid is never assigned in this visible body -- the
# fault-injection setup that starts the in-flight open appears to have been
# lost to file corruption. Restore from upstream replay-single.sh before use.
test_53h() {
mkdir -p $DIR/${tdir}-1
mkdir -p $DIR/${tdir}-2
# hold a file open; multiop closes it only on SIGUSR1
multiop $DIR/${tdir}-1/f O_c &
close_pid=$!
kill -USR1 $close_pid
cancel_lru_locks MDC # force the close
sleep 1
replay_barrier_nodf mds
fail_nodf mds
wait $open_pid || return 1
sleep 2
# close should be gone
[ -d /proc/$close_pid ] && return 2
$CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
$CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
rm -rf $DIR/${tdir}-*
}
run_test 53h "|X| open request and close reply while two MDC requests in flight"
#b3761 ASSERTION(hash != 0) failed
# bug 3761: MDS_CHECK_RESENT must return the original return code, not 0.
test_55() {
    # NOTE(review): this function was truncated by file corruption -- it had
    # no closing brace, which made the run_test line part of the function body
    # (a syntax error for the rest of the file). Restored from the canonical
    # Lustre replay-single.sh test_55 -- verify against upstream before use.
    # OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE
    do_facet mds "lctl set_param fail_loc=0x8000012b"
    touch $DIR/$tfile &
    # give touch a chance to run
    sleep 5
    do_facet mds "lctl set_param fail_loc=0"
    rm -f $DIR/$tfile
    return 0
}
run_test 55 "let MDS_CHECK_RESENT return the original return code instead of 0"
#b3440 ASSERTION(rec->ur_fid2->id) failed
# NOTE(review): the body of test_56 is missing from this file -- only its
# run_test registration survives. Restore the function from upstream
# replay-single.sh, otherwise run_test 56 invokes an undefined function.
run_test 56 "don't replay a symlink open request (3440)"
#recovery one mds-ost setattr from llog
test_57() {
    # NOTE(review): this function was truncated by file corruption (missing
    # setup lines and closing brace, which swallowed the run_test line).
    # Restored from the canonical Lustre replay-single.sh test_57 -- verify
    # against upstream before use.
    #define OBD_FAIL_MDS_OST_SETATTR 0x12c
    do_facet mds "lctl set_param fail_loc=0x8000012c"
    touch $DIR/$tfile
    replay_barrier mds
    fail mds
    sleep 1
    $CHECKSTAT -t file $DIR/$tfile || return 1
    do_facet mds "lctl set_param fail_loc=0"
    rm $DIR/$tfile
}
run_test 57 "test recovery from llog for setattr op"
createmany -o $DIR/$tdir/$tfile-%d 2500
replay_barrier mds
fail mds
sleep 2
$CHECKSTAT -t file $DIR/$tdir/$tfile-* >/dev/null || return 1
unlinkmany $DIR/$tdir/$tfile-%d 2500
rmdir $DIR/$tdir
}
# (corrupted region: the original text here was replaced by stray line-number
# residue during extraction; nothing recoverable remains)
run_test 58a "test recovery from llog for setattr op (test llog_gen_rec)"
# Verify that a setxattr issued right after a replay barrier is replayed
# correctly across MDS failover; the value is checked through a second client
# mount ($MOUNT2) so the read is not satisfied from the writer's cache.
test_58b() {
    mount_client $MOUNT2
    mkdir -p $DIR/$tdir
    touch $DIR/$tdir/$tfile
    replay_barrier mds
    setfattr -n trusted.foo -v bar $DIR/$tdir/$tfile
    fail mds
    # read the xattr back via the second mount after recovery
    VAL=$(getfattr --absolute-names --only-value -n trusted.foo $MOUNT2/$tdir/$tfile)
    [ "$VAL" = "bar" ] || return 1
    rm -f $DIR/$tdir/$tfile
    rmdir $DIR/$tdir
    zconf_umount $(hostname) $MOUNT2
}
run_test 58b "test replay of setxattr op"
test_58c() { # bug 16570
    # Exercise resend (dropped request) and reconstruction (dropped reply) of
    # the setxattr operation; each value is verified through a second mount.
    mount_client $MOUNT2
    mkdir -p $DIR/$tdir
    touch $DIR/$tdir/$tfile
    # drop the setxattr request; the client must resend it
    drop_request "setfattr -n trusted.foo -v bar $DIR/$tdir/$tfile" || \
        return 1
    VAL=$(getfattr --absolute-names --only-value -n trusted.foo $MOUNT2/$tdir/$tfile)
    [ "$VAL" = "bar" ] || return 2
    # drop the setxattr reply; the server must reconstruct it on resend
    drop_reint_reply "setfattr -n trusted.foo1 -v bar1 $DIR/$tdir/$tfile" || \
        return 3
    VAL=$(getfattr --absolute-names --only-value -n trusted.foo1 $MOUNT2/$tdir/$tfile)
    [ "$VAL" = "bar1" ] || return 4
    rm -f $DIR/$tdir/$tfile
    rmdir $DIR/$tdir
    zconf_umount $(hostname) $MOUNT2
}
run_test 58c "resend/reconstruct setxattr op"
# log_commit_thread vs filter_destroy race used to lead to import use after free
# bug 11658
# bug 11658: log_commit_thread vs filter_destroy race led to import
# use-after-free; create files, destroy them, and let a delayed recovery race
# against the commits.
# NOTE(review): the code that should accompany the OBD_FAIL_PTLRPC_DELAY_RECOV
# #define (setting fail_loc on the OST and failing it) appears to have been
# lost to file corruption, as has the "mkdir -p $DIR/$tdir" that createmany
# needs. Restore from upstream replay-single.sh before use.
test_59() {
createmany -o $DIR/$tdir/$tfile-%d 200
sync
unlinkmany $DIR/$tdir/$tfile-%d 200
#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
sleep 20
rmdir $DIR/$tdir
}
run_test 59 "test log_commit_thread vs filter_destroy race"
# race between add unlink llog vs cat log init in post_recovery (only for b1_6)
# bug 12086: should no oops and No ctxt error for this test
# bug 12086: race between unlink-llog add and catalog init in post_recovery.
# Unlink half the files before MDS failover and half after; recovery must set
# up the llog context without emitting "No ctxt" kernel errors.
test_60() {
    mkdir -p $DIR/$tdir
    createmany -o $DIR/$tdir/$tfile-%d 200
    replay_barrier mds
    unlinkmany $DIR/$tdir/$tfile-%d 0 100
    fail mds
    unlinkmany $DIR/$tdir/$tfile-%d 100 100
    # the kernel logs "No ctxt" when the llog context was not initialized
    local no_ctxt=$(dmesg | grep "No ctxt")
    [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery"
}
run_test 60 "test llog post recovery init vs llog unlink"
createmany -o $DIR/$tdir/$tfile-%d 800
replay_barrier ost1
# OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
unlinkmany $DIR/$tdir/$tfile-%d 800
$CHECKSTAT -t file $DIR/$tdir/$tfile-* && return 1
rmdir $DIR/$tdir
}
#test race mds llog sync vs llog cleanup
# Race mds llog sync against llog cleanup during failover.
# NOTE(review): the code that should accompany the
# OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT comment (do_facet mds setting fail_loc)
# appears to have been lost to file corruption -- without it no timeout is
# injected. Restore from upstream replay-single.sh before use.
test_61b() {
# OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
facet_failover mds
sleep 10
fail mds
# a small write verifies the filesystem is usable after the double failover
do_facet client dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 || return 1
}
run_test 61b "test race mds llog sync vs llog cleanup"
#test race cancel cookie cb vs llog cleanup
# Race the cancel-cookie callback against llog cleanup on the OST.
# NOTE(review): the code that should accompany the
# OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT comment (do_facet ost1 setting fail_loc)
# appears to have been lost to file corruption -- without it no timeout is
# injected. Restore from upstream replay-single.sh before use.
test_61c() {
# OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
touch $DIR/$tfile
rm $DIR/$tfile
sleep 10
fail ost1
}
run_test 61c "test race mds llog sync vs llog cleanup"
# bug 16002: a failure injected into llog_setup during MDS start must clean up
# the llog context so that a subsequent normal start succeeds.
test_61d() { # bug 16002
#define OBD_FAIL_OBD_LLOG_SETUP 0x605
stop mds
# force llog_setup to fail on the next mount
do_facet mds "lctl set_param fail_loc=0x80000605"
start mds $MDSDEV $MDS_MOUNT_OPTS && error "mds start should have failed"
# clear the fault and verify a clean restart works
do_facet mds "lctl set_param fail_loc=0"
start mds $MDSDEV $MDS_MOUNT_OPTS || error "cannot restart mds"
}
run_test 61d "error in llog_setup should cleanup the llog context correctly"
replay_barrier mds
createmany -o $DIR/$tdir/$tfile- 25
#define OBD_FAIL_TGT_REPLAY_DROP 0x707
unlinkmany $DIR/$tdir/$tfile- 25 || return 2
return 0
}
run_test 62 "don't mis-drop resent replay"
# Suppose that all osts have the same at_max
for facet in mds client ost; do
eval AT_MAX_SAVE_${facet}=$(at_max_get $facet)
done
if ! at_is_valid; then
skip "AT env is invalid"
return 1
fi
local at_max
for facet in mds client ost; do
at_max=$(at_max_get $facet)
if [ $at_max -ne $at_max_new ]; then
echo "AT value on $facet is $at_max, set it by force temporarily to $at_max_new"
at_max_set $at_max_new $facet
AT_MAX_SET=1
fi
done
local at_history=$(do_facet mds "find /sys/ -name at_history")
[ -z "$at_history" ] && skip "missing /sys/.../at_history " && return 1
ATOLDBASE=$(do_facet mds "cat $at_history")
do_facet mds "echo 8 >> $at_history"
do_facet ost1 "echo 8 >> $at_history"
# sleep for a while to cool down, should be > 8s and also allow
# at least one ping to be sent. simply use TIMEOUT to be safe.
sleep $TIMEOUT
# Slow down a request to the current service time, this is critical
# because previous tests may have caused this value to increase.
REQ_DELAY=`lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts |
awk '/portal 12/ {print $5}'`
REQ_DELAY=$((${REQ_DELAY} + 5))
do_facet mds lctl set_param fail_val=$((${REQ_DELAY} * 1000))
createmany -o $DIR/$tfile 10 > /dev/null
unlinkmany $DIR/$tfile 10 > /dev/null
# check for log message
$LCTL dk | grep "Early reply #" || error "No early reply"
run_test 65a "AT: verify early replies"
# AT (adaptive timeouts): verify early replies on packed reply / bulk.
# NOTE(review): this body appears truncated by file corruption -- there is no
# at_start call, no fail_loc injection after the delay is computed, and the
# "force some real bulk transfer" comment has no accompanying dd/write.
# Restore the missing lines from upstream replay-single.sh before use.
test_65b() #bug 3055
{
# Slow down a request to the current service time, this is critical
# because previous tests may have caused this value to increase.
REQ_DELAY=`lctl get_param -n osc.${FSNAME}-OST0000-osc-*.timeouts |
awk '/portal 6/ {print $5}'`
REQ_DELAY=$((${REQ_DELAY} + 5))
do_facet ost1 lctl set_param fail_val=${REQ_DELAY}
rm -f $DIR/$tfile
lfs setstripe $DIR/$tfile --index=0 --count=1
# force some real bulk transfer
# check for log message
$LCTL dk | grep "Early reply #" || error "No early reply"
debugrestore
}
run_test 65b "AT: verify early replies on packed reply / bulk"
lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
# adjust 5s at a time so no early reply is sent (within deadline)
createmany -o $DIR/$tfile 20 > /dev/null
unlinkmany $DIR/$tfile 20 > /dev/null
lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
do_facet mds "lctl set_param fail_val=10000"
do_facet mds "lctl set_param fail_loc=0x8000050a"
createmany -o $DIR/$tfile 20 > /dev/null
unlinkmany $DIR/$tfile 20 > /dev/null
lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
sleep 9
createmany -o $DIR/$tfile 20 > /dev/null
unlinkmany $DIR/$tfile 20 > /dev/null
lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal | grep "portal 12"
CUR=$(lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | awk '/portal 12/ {print $5}')
WORST=$(lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | awk '/portal 12/ {print $7}')
echo "Current MDT timeout $CUR, worst $WORST"
[ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST"
}
run_test 66a "AT: verify MDT service time adjusts with no early replies"
# AT: verify the network-latency estimate adjusts upward under induced delay.
# NOTE(review): this body appears truncated by file corruption -- ORIG and CUR
# are sampled back-to-back with no delay-injection or workload in between, so
# the WORST > ORIG assertion cannot be driven by this code as written.
# Restore the missing middle section from upstream replay-single.sh.
test_66b() #bug 3055
{
ORIG=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $4}')
CUR=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $4}')
WORST=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $6}')
echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
[ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG"
}
run_test 66b "AT: verify net latency adjusts"
# AT: verify that gradually slowed request processing does not cause the osc
# to reconnect; compares the osc connect counters before and after a workload.
# NOTE(review): nothing in this visible body actually slows the requests --
# the fail_loc/fail_val injection implied by "gradual slow" appears lost to
# file corruption. Restore from upstream replay-single.sh before use.
test_67a() #bug 3055
{
CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
createmany -o $DIR/$tfile 20 > /dev/null
unlinkmany $DIR/$tfile 20 > /dev/null
CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
# any increase in connect count means a reconnect happened
ATTEMPTS=$(($CONN2 - $CONN1))
echo "$ATTEMPTS osc reconnect attemps on gradual slow"
[ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect"
return 0
}
run_test 67a "AT: verify slow request processing doesn't induce reconnects"
# AT: verify an instant server slowdown (fail_loc 0x223 with a 20s fail_val)
# does not cause osc reconnects; connect counters are compared around each
# slow phase.
# NOTE(review): the "do it again" phase has no second injected-slow operation
# (e.g. another cp) before the CONN3 sample -- those lines appear lost to file
# corruption. Restore from upstream replay-single.sh before use.
test_67b() #bug 3055
{
CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
# delay OST request processing by fail_val microsecond-scale units
do_facet ost1 "lctl set_param fail_val=20000"
do_facet ost1 "lctl set_param fail_loc=0x80000223"
cp /etc/profile $DIR/$tfile || error "cp failed"
client_reconnect
do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
ATTEMPTS=$(($CONN2 - $CONN1))
echo "$ATTEMPTS osc reconnect attemps on instant slow"
# do it again; should not timeout
do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
ATTEMPTS=$(($CONN3 - $CONN2))
echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
[ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
return 0
}
run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
at_start || return 0
local ldlm_enqueue_min=$(find /sys -name ldlm_enqueue_min)
[ -z "$ldlm_enqueue_min" ] && skip "missing /sys/.../ldlm_enqueue_min" && return 0
local ENQ_MIN=$(cat $ldlm_enqueue_min)
echo $TIMEOUT >> $ldlm_enqueue_min
rm -f $DIR/${tfile}_[1-2]
lfs setstripe $DIR/$tfile --index=0 --count=1
#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
lctl set_param fail_val=$(($TIMEOUT - 1))
lctl set_param fail_loc=0x80000312
lctl set_param fail_val=$((TIMEOUT * 3 / 2))
lctl set_param fail_loc=0x80000312
return 0
}
run_test 68 "AT: verify slowing locks"
at_history=$(do_facet mds "find /sys/ -name at_history")
do_facet mds "echo $ATOLDBASE >> $at_history" || true
do_facet ost1 "echo $ATOLDBASE >> $at_history" || true
for facet in mds client ost; do
var=AT_MAX_SAVE_${facet}
echo restore AT on $facet to saved value ${!var}
at_max_set ${!var} $facet
AT_NEW=$(at_max_get $facet)
echo Restored AT value on $facet $AT_NEW
[ $AT_NEW -ne ${!var} ] && \
error "$facet : AT value was not restored SAVED ${!var} NEW $AT_NEW"
done
# start multi-client tests
# Sanity check of the multi-client test machinery: every client writes a
# 10 MB file into $DIR, then a single client reads all files back.
test_70a () {
    [ -z "$CLIENTS" ] && \
        { skip "Need two or more clients." && return; }
    [ $CLIENTCOUNT -lt 2 ] && \
        { skip "Need two or more clients, have $CLIENTCOUNT" && return; }

    echo "mount clients $CLIENTS ..."
    zconf_mount_clients $CLIENTS $DIR

    local clients=${CLIENTS//,/ }
    echo "Write/read files on $DIR ; clients $CLIENTS ... "
    for CLIENT in $clients; do
        do_node $CLIENT dd bs=1M count=10 if=/dev/zero \
            of=$DIR/${tfile}_${CLIENT} 2>/dev/null || \
            error "dd failed on $CLIENT"
    done

    # start reading with the last client in the list
    local prev_client=$(echo $clients | sed 's/^.* \(.\+\)$/\1/')
    # NOTE(review): a corrupted region (stray line-number residue) was removed
    # between the line above and the loop below; the surviving code joins
    # cleanly, but verify against upstream replay-single.sh.
    for C in ${CLIENTS//,/ }; do
        # each client's file is read by a different client than wrote it
        do_node $prev_client dd if=$DIR/${tfile}_${C} of=/dev/null 2>/dev/null || \
            error "dd if=$DIR/${tfile}_${C} failed on $prev_client"
        prev_client=$C
    done
    ls $DIR
}
run_test 70a "check multi client t-f"
# MDS recovery under multi-client dbench load: start dbench on every client,
# fail the MDS mid-run, and report each client's exit status after it
# reconnects and its load finishes.
test_70b () {
    [ -z "$CLIENTS" ] && \
        { skip "Need two or more clients." && return; }
    [ $CLIENTCOUNT -lt 2 ] && \
        { skip "Need two or more clients, have $CLIENTCOUNT" && return; }

    zconf_mount_clients $CLIENTS $DIR

    local duration="-t 60"
    local cmd="rundbench 1 $duration "
    local PID=""
    # launch the load in the background on each client, recording the local
    # pdsh wrapper PID in a per-client pid.<client> file
    for CLIENT in ${CLIENTS//,/ }; do
        $PDSH $CLIENT "set -x; PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests/:${DBENCH_LIB} DBENCH_LIB=${DBENCH_LIB} $cmd" &
        PID=$!
        echo $PID >pid.$CLIENT
        echo "Started load PID=$(cat pid.$CLIENT)"
    done

    replay_barrier mds
    sleep 3 # give clients a time to do operations
    log "$TESTNAME fail mds 1"
    fail mds

    # wait for client to reconnect to MDS
    sleep $TIMEOUT

    # reap each client's load and report its exit status
    for CLIENT in ${CLIENTS//,/ }; do
        PID=$(cat pid.$CLIENT)
        wait $PID
        rc=$?
        echo "load on ${CLIENT} returned $rc"
    done
}
run_test 70b "mds recovery; $CLIENTCOUNT clients"
# end multi-client tests
equals_msg `basename $0`: test complete, cleaning up