Newer
Older
sleep $((3 * TIMEOUT))
# Without 2824, this createmany would hang
createmany -o $DIR/$tfile 20 || return 3
unlinkmany $DIR/$tfile 20 || return 4
return 0
}
run_test 47 "MDS->OSC failure during precreate cleanup (2824)"
replay_barrier mds
createmany -o $DIR/$tfile 20 || return 1
# OBD_FAIL_OST_EROFS 0x216
fail mds
do_facet ost1 "sysctl -w lustre.fail_loc=0x80000216"
createmany -o $DIR/$tfile 20 20 || return 2
unlinkmany $DIR/$tfile 40 || return 3
do_facet ost1 "sysctl -w lustre.fail_loc=0"
return 0
run_test 48 "MDS->OSC failure during precreate cleanup (2824)"
local oscdev=`grep ${ost1_svc}-osc- $LPROC/devices | awk '{print $1}'`
[ "$oscdev" ] || return 1
$LCTL --device $oscdev recover && $LCTL --device $oscdev recover
# give the mds_lov_sync threads a chance to run
sleep 5
}
run_test 50 "Double OSC recovery, don't LASSERT (3812)"
# b3764 timed out lock replay
#
# Creates $DIR/$tfile, runs multiop with the "s" command on it, sets a replay
# barrier on the mds facet and then fails the mds while fail_loc 0x8000030c
# (OBD_FAIL_LDLM_REPLY plus the 0x80000000 bit) is set there.
#
# Returns:
#   1 - multiop failed
#   2 - "fail mds" failed (fail_loc is not cleared on this path)
#   3 - $CHECKSTAT succeeded on $DIR/$tfile-*
#   0 - otherwise
test_52() {
    local victim=$DIR/$tfile

    touch $victim
    if ! multiop $victim s; then
        return 1
    fi

    replay_barrier mds
    #define OBD_FAIL_LDLM_REPLY 0x30c
    do_facet mds "sysctl -w lustre.fail_loc=0x8000030c"
    if ! fail mds; then
        return 2
    fi
    do_facet mds "sysctl -w lustre.fail_loc=0x0"

    # A successful checkstat on the "-*" glob is treated as the failure case.
    if $CHECKSTAT -t file ${victim}-*; then
        return 3
    fi
    return 0
}
run_test 52 "time out lock replay (3764)"
#b_cray 53 "|X| open request and close reply while two MDC requests in flight"
#b_cray 54 "|X| open request and close reply while two MDC requests in flight"
#b3761 ASSERTION(hash != 0) failed
test_55() {
# OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE
do_facet mds "sysctl -w lustre.fail_loc=0x8000012b"
touch $DIR/$tfile &
# give touch a chance to run
sleep 5
do_facet mds "sysctl -w lustre.fail_loc=0x0"
rm $DIR/$tfile
return 0
run_test 55 "let MDS_CHECK_RESENT return the original return code instead of 0"
#b3440 ASSERTION(rec->ur_fid2->id) failed
run_test 56 "don't replay a symlink open request (3440)"
# recover one mds-ost setattr from llog
test_57() {
#define OBD_FAIL_MDS_OST_SETATTR 0x12c
do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
replay_barrier mds
fail mds
sleep 1
$CHECKSTAT -t file $DIR/$tfile || return 1
do_facet mds "sysctl -w lustre.fail_loc=0x0"
run_test 57 "test recovery from llog for setattr op"
#define OBD_FAIL_MDS_OST_SETATTR 0x12c
do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
mkdir $DIR/$tdir
createmany -o $DIR/$tdir/$tfile-%d 2500
replay_barrier mds
fail mds
sleep 2
$CHECKSTAT -t file $DIR/$tdir/$tfile-* || return 1
do_facet mds "sysctl -w lustre.fail_loc=0x0"
unlinkmany $DIR/$tdir/$tfile-%d 2500
rmdir $DIR/$tdir
}
run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)"
# log_commit_thread vs filter_destroy race used to lead to import use after free
# bug 11658
#
# Creates 200 files under $DIR/$tdir, syncs, unlinks them again, waits 20
# seconds and removes the directory. The return status is that of the final
# rmdir.
test_59() {
    local dir=$DIR/$tdir
    local pattern=$dir/$tfile-%d
    local nfiles=200

    mkdir $dir
    createmany -o $pattern $nfiles
    sync
    unlinkmany $pattern $nfiles
    #define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
    # NOTE(review): nothing in this function sets that fail_loc -- confirm
    # whether an injection step is expected before the sleep.
    sleep 20
    rmdir $dir
}
run_test 59 "test log_commit_thread vs filter_destroy race"
# Race between adding an unlink llog and cat log init in post_recovery
# (only for b1_6).
# bug 12086: there should be no oops and no "No ctxt" error in this test.
#
# Creates 200 files, unlinks the first 100 after a replay barrier on the mds,
# fails the mds, unlinks the remaining 100 and then calls error() if dmesg
# contains the string "No ctxt".
test_60() {
    local dir=$DIR/$tdir
    local pattern=$dir/$tfile-%d

    mkdir $dir
    createmany -o $pattern 200

    replay_barrier mds
    unlinkmany $pattern 0 100
    fail mds
    unlinkmany $pattern 100 100

    # Any "No ctxt" line in the kernel log counts as a failure.
    local no_ctxt=$(dmesg | grep "No ctxt")
    if [ -n "$no_ctxt" ]; then
        error "ctxt is not initialized in recovery"
    fi
}
run_test 60 "test llog post recovery init vs llog unlink"
mkdir $DIR/$tdir
createmany -o $DIR/$tdir/$tfile-%d 800
replay_barrier ost1
# OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
unlinkmany $DIR/$tdir/$tfile-%d 800
do_facet ost "sysctl -w lustre.fail_loc=0x80000221"
facet_failover ost1
sleep 10
fail ost1
sleep 30
do_facet ost "sysctl -w lustre.fail_loc=0x0"
$CHECKSTAT -t file $DIR/$tdir/$tfile-* && return 1
rmdir $DIR/$tdir
}
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
# test race mds llog sync vs llog cleanup
#
# Sets fail_loc 0x8000013a on the mds, fails it over, waits 10 seconds, fails
# it again and finally writes one 4k block to $DIR/$tfile from the client.
# Returns 1 if that write fails, 0 otherwise.
test_61b() {
    local facet=mds
    local out=$DIR/$tfile

    # OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
    do_facet $facet "sysctl -w lustre.fail_loc=0x8000013a"
    facet_failover $facet
    sleep 10
    fail $facet

    if ! do_facet client dd if=/dev/zero of=$out bs=4k count=1; then
        return 1
    fi
    return 0
}
run_test 61b "test race mds llog sync vs llog cleanup"
# test race cancel cookie cb vs llog cleanup
#
# Creates $DIR/$tfile, sets fail_loc 0x80000222 via the "ost" facet, removes
# the file, waits 10 seconds and fails ost1. The return status is that of
# "fail ost1".
test_61c() {
    local victim=$DIR/$tfile

    # OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
    touch $victim
    # NOTE(review): fail_loc is set on facet "ost" while the failover below
    # targets "ost1" -- confirm both names refer to the same node.
    do_facet ost "sysctl -w lustre.fail_loc=0x80000222"
    rm $victim
    sleep 10
    fail ost1
}
# Description matches what test_61c exercises (cancel cookie callback race),
# rather than repeating the 61b description.
run_test 61c "test race cancel cookie cb vs llog cleanup"
do_facet mds "sysctl -w lustre.adaptive_history=8"
do_facet ost1 "sysctl -w lustre.adaptive_history=8"
fi
}
test_65() #bug 3055
{
at_start
$LCTL dk > /dev/null
# slow down a request
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
createmany -o $DIR/$tfile 10 > /dev/null
unlinkmany $DIR/$tfile 10 > /dev/null
# check for log message
$LCTL dk | grep "Early reply #" || error "No early reply"
# client should show 30s timeouts
grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
sleep 9
grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
}
run_test 65 "AT: verify early replies"
# AT: verify MDT service time adjusts with no early replies.
#
# Runs create/unlink batches while fail_loc 0x8000050a is set on the mds with
# fail_val 5000 and then 10000, printing the client's "portal 12" timeouts
# line after each step. After clearing fail_loc and waiting 9 seconds it runs
# one more batch and calls error() unless the current value (field 5 of the
# "portal 12" line) is below the worst value (field 7).
# Sets the globals CUR and WORST.
test_66a() #bug 3055
{
    # Left unquoted at every use so the "*" is globbed each time.
    local timeouts="$LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts"
    local fail_val

    at_start
    grep "portal 12" $timeouts

    # adjust 5s at a time so no early reply is sent (within deadline)
    #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
    for fail_val in 5000 10000; do
        do_facet mds "sysctl -w lustre.fail_val=$fail_val"
        do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
        createmany -o $DIR/$tfile 20 > /dev/null
        unlinkmany $DIR/$tfile 20 > /dev/null
        grep "portal 12" $timeouts
    done

    do_facet mds "sysctl -w lustre.fail_loc=0"
    sleep 9
    createmany -o $DIR/$tfile 20 > /dev/null
    unlinkmany $DIR/$tfile 20 > /dev/null
    grep portal $timeouts | grep "portal 12"

    CUR=$(awk '/portal 12/ {print $5}' $timeouts)
    WORST=$(awk '/portal 12/ {print $7}' $timeouts)
    echo "Current MDT timeout $CUR, worst $WORST"
    if ! [ $CUR -lt $WORST ]; then
        error "Current $CUR should be less than worst $WORST"
    fi
}
run_test 66a "AT: verify MDT service time adjusts with no early replies"
# AT: verify net latency adjusts.
#
# Reads the current network timeout (field 4 of the "network" line in the
# client's mdc timeouts file), sets fail_val to that value plus 5 and
# fail_loc to 0x50c locally, issues an ls, clears fail_loc and then calls
# error() unless the recorded worst value (field 6) exceeds the baseline.
# Sets the globals ORIG, CUR and WORST.
test_66b() #bug 3055
{
    # One path for the baseline and the post-check. The baseline used to be
    # read from a hard-coded "lustre-*" directory, which left ORIG empty
    # whenever $FSNAME was not "lustre".
    # Left unquoted at every use so the "*" is globbed each time.
    local timeouts="$LPROC/mdc/${FSNAME}-*/timeouts"

    at_start
    ORIG=$(awk '/network/ {print $4}' $timeouts)
    sysctl -w lustre.fail_val=$(($ORIG + 5))
    #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
    sysctl -w lustre.fail_loc=0x50c
    # Only the request matters here; the result of ls is deliberately ignored.
    ls "$DIR/$tfile" > /dev/null 2>&1
    sysctl -w lustre.fail_loc=0
    CUR=$(awk '/network/ {print $4}' $timeouts)
    WORST=$(awk '/network/ {print $6}' $timeouts)
    echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
    [ "$WORST" -gt "$ORIG" ] || error "Worst $WORST should be worse than orig $ORIG"
}
run_test 66b "AT: verify net latency adjusts"
# AT: verify slow request processing doesn't induce reconnects.
#
# Sums field 2 of every "_connect" line in the client's osc stats before and
# after a create/unlink batch that runs while fail_loc 0x50a (fail_val 400)
# is set on ost1. Calls error() if the sum grew. Always returns 0.
# Sets the globals CONN1, CONN2 and ATTEMPTS.
test_67a() #bug 3055
{
    # "total+0" makes awk print 0 instead of an empty string when no
    # "_connect" line exists; an empty value breaks the subtraction below.
    local sum_connects='/_connect/ {total+=$2} END {print total+0}'

    at_start
    CONN1=$(awk "$sum_connects" "$LPROC"/osc/*/stats)
    # sleeping threads may drive values above this
    do_facet ost1 "sysctl -w lustre.fail_val=400"
    #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
    do_facet ost1 "sysctl -w lustre.fail_loc=0x50a"
    createmany -o "$DIR/$tfile" 20 > /dev/null
    unlinkmany "$DIR/$tfile" 20 > /dev/null
    do_facet ost1 "sysctl -w lustre.fail_loc=0"
    CONN2=$(awk "$sum_connects" "$LPROC"/osc/*/stats)
    ATTEMPTS=$(($CONN2 - $CONN1))
    echo "$ATTEMPTS osc reconnect attemps on gradual slow"
    [ "$ATTEMPTS" -gt 0 ] && error "AT should have prevented reconnect"
    return 0
}
run_test 67a "AT: verify slow request processing doesn't induce reconnects"
# AT: verify instant slowdown doesn't induce reconnects.
#
# Copies /etc/profile to $DIR/$tfile twice while fail_loc 0x80000223
# (fail_val 20000) is set on ost1. The "_connect" totals from the client's
# osc stats are sampled before, between and after the two copies; error() is
# called if the total grew during the second copy. Always returns 0.
# Sets the globals CONN1, CONN2, CONN3 and ATTEMPTS.
test_67b() #bug 3055
{
    # "total+0" makes awk print 0 instead of an empty string when no
    # "_connect" line exists; an empty value breaks the subtractions below.
    local sum_connects='/_connect/ {total+=$2} END {print total+0}'

    at_start
    CONN1=$(awk "$sum_connects" "$LPROC"/osc/*/stats)
    #define OBD_FAIL_OST_PAUSE_CREATE 0x223
    do_facet ost1 "sysctl -w lustre.fail_val=20000"
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
    cp /etc/profile "$DIR/$tfile" || error "cp failed"
    client_reconnect
    cat "$LPROC/ost/OSS/ost_create/timeouts"
    log "phase 2"
    CONN2=$(awk "$sum_connects" "$LPROC"/osc/*/stats)
    ATTEMPTS=$(($CONN2 - $CONN1))
    # Reconnects during the first copy are only reported, not checked.
    echo "$ATTEMPTS osc reconnect attemps on instant slow"
    # do it again; should not timeout
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
    cp /etc/profile "$DIR/$tfile" || error "cp failed"
    do_facet ost1 "sysctl -w lustre.fail_loc=0"
    client_reconnect
    cat "$LPROC/ost/OSS/ost_create/timeouts"
    CONN3=$(awk "$sum_connects" "$LPROC"/osc/*/stats)
    ATTEMPTS=$(($CONN3 - $CONN2))
    echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
    [ "$ATTEMPTS" -gt 0 ] && error "AT should have prevented reconnect"
    return 0
}
run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
if [ -n "$ATOLDBASE" ]; then
do_facet mds "sysctl -w lustre.adaptive_history=$ATOLDBASE"
do_facet ost1 "sysctl -w lustre.adaptive_history=$ATOLDBASE"
equals_msg `basename $0`: test complete, cleaning up