Skip to content
Snippets Groups Projects
replay-single.sh 34.5 KiB
Newer Older
    sleep $((3 * TIMEOUT))

    # Without 2824, this createmany would hang 
    createmany -o $DIR/$tfile 20 || return 3
    unlinkmany $DIR/$tfile 20 || return 4

Nathan Rutman's avatar
Nathan Rutman committed
    do_facet ost1 "sysctl -w lustre.fail_loc=0"
    return 0
}
run_test 47 "MDS->OSC failure during precreate cleanup (2824)"

Robert Read's avatar
Robert Read committed
# Fails over the MDS with 20 uncommitted creates pending, arms
# OBD_FAIL_OST_EROFS on ost1 and checks that df, further creates and the
# unlink of all 40 files still succeed.
# Returns: 0 on success; 1 if the initial createmany fails; 2 if df or the
#          second createmany fails; 3 if unlinkmany fails.
test_48() {
    local rc=0

    replay_barrier mds
    createmany -o "$DIR/$tfile" 20 || return 1
    fail mds
    # OBD_FAIL_OST_EROFS 0x216
    # NOTE(review): the 0x80000000 bit is presumably OBD_FAIL_ONCE - confirm
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000216"

    if ! df $MOUNT; then
        rc=2
    elif ! createmany -o "$DIR/$tfile" 20 20; then
        rc=2
    elif ! unlinkmany "$DIR/$tfile" 40; then
        rc=3
    fi

    # always disarm the fail_loc, even when one of the steps above failed
    do_facet ost1 "sysctl -w lustre.fail_loc=0"
    return $rc
}
Nathan Rutman's avatar
Nathan Rutman committed
run_test 48 "MDS->OSC failure during precreate cleanup (2824)"
alex's avatar
alex committed

Nathan Rutman's avatar
Nathan Rutman committed
    local oscdev=`grep ${ost1_svc}-osc- $LPROC/devices | awk '{print $1}'`
    [ "$oscdev" ] || return 1
    $LCTL --device $oscdev recover &&  $LCTL --device $oscdev recover
    # give the mds_lov_sync threads a chance to run
    sleep 5
}
run_test 50 "Double OSC recovery, don't LASSERT (3812)"

# b3764 timed out lock replay
#
# Touches $DIR/$tfile, cancels the mdc LRU locks and runs multiop on the file,
# then fails over the MDS with OBD_FAIL_LDLM_REPLY armed.
# Returns: 0 on success; 1 if multiop fails; 2 if the MDS failover fails;
#          3 if $CHECKSTAT finds a regular file matching $DIR/$tfile-*.
test_52() {
    touch "$DIR/$tfile"
    cancel_lru_locks mdc
    multiop "$DIR/$tfile" s || return 1
    replay_barrier mds
#define OBD_FAIL_LDLM_REPLY              0x30c
    do_facet mds "sysctl -w lustre.fail_loc=0x8000030c"
    fail mds || return 2
    do_facet mds "sysctl -w lustre.fail_loc=0x0"

    # a successful checkstat on "$tfile-*" is treated as a failure
    if $CHECKSTAT -t file "$DIR/$tfile"-*; then
        return 3
    fi
    return 0
}
run_test 52 "time out lock replay (3764)"

Nathan Rutman's avatar
Nathan Rutman committed
#b_cray 53 "|X| open request and close reply while two MDC requests in flight"
#b_cray 54 "|X| open request and close reply while two MDC requests in flight"
Nathan Rutman's avatar
Nathan Rutman committed
#b3761 ASSERTION(hash != 0) failed
#
# Arms OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE on the MDS, starts a touch of
# $DIR/$tfile in the background, waits 5 seconds, clears the fail_loc and
# removes the file. Always returns 0.
test_55() {
# OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE
    do_facet mds "sysctl -w lustre.fail_loc=0x8000012b"
    touch "$DIR/$tfile" &
    # give touch a chance to run
    sleep 5
    do_facet mds "sysctl -w lustre.fail_loc=0x0"
    rm "$DIR/$tfile"
    return 0
}
Nathan Rutman's avatar
Nathan Rutman committed
run_test 55 "let MDS_CHECK_RESENT return the original return code instead of 0"
Yury Umanets's avatar
Yury Umanets committed

#b3440 ASSERTION(rec->ur_fid2->id) failed
#
# Creates the symlink $DIR/$tfile -> foo, sets a replay barrier on the MDS,
# fails the MDS over and then sleeps 10 seconds.
# The return status is that of the final sleep.
test_56() {
    ln -s foo "$DIR/$tfile"
    replay_barrier mds
    #drop_reply "cat $DIR/$tfile"
    fail mds
    sleep 10
}
Nathan Rutman's avatar
Nathan Rutman committed
run_test 56 "don't replay a symlink open request (3440)"
Nathan Rutman's avatar
Nathan Rutman committed
#recovery one mds-ost setattr from llog
#
# Arms OBD_FAIL_MDS_OST_SETATTR on the MDS, touches $DIR/$tfile, fails the MDS
# over and checks with $CHECKSTAT that the file is still a regular file.
# Returns: 1 if $CHECKSTAT fails; otherwise the status of the final rm.
test_57() {
    local rc=0

#define OBD_FAIL_MDS_OST_SETATTR       0x12c
    do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
    touch "$DIR/$tfile"
    replay_barrier mds
    fail mds
    sleep 1
    $CHECKSTAT -t file "$DIR/$tfile" || rc=1
    # clear the fail_loc before reporting a checkstat failure
    do_facet mds "sysctl -w lustre.fail_loc=0x0"
    [ $rc -eq 0 ] || return $rc
    rm "$DIR/$tfile"
}
Nathan Rutman's avatar
Nathan Rutman committed
run_test 57 "test recovery from llog for setattr op"
alex's avatar
alex committed

Nathan Rutman's avatar
Nathan Rutman committed
#recovery many mds-ost setattr from llog
#
# Same scenario as the single-file setattr test, but with 2500 files created
# in $DIR/$tdir before the MDS failover.
# Returns: 1 if $CHECKSTAT fails; otherwise the status of the final rmdir.
test_58() {
    local rc=0

#define OBD_FAIL_MDS_OST_SETATTR       0x12c
    do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
    mkdir "$DIR/$tdir"
    createmany -o "$DIR/$tdir/$tfile-%d" 2500
    replay_barrier mds
    fail mds
    sleep 2
    $CHECKSTAT -t file "$DIR/$tdir/$tfile"-* || rc=1
    # clear the fail_loc before reporting a checkstat failure
    do_facet mds "sysctl -w lustre.fail_loc=0x0"
    [ $rc -eq 0 ] || return $rc
    unlinkmany "$DIR/$tdir/$tfile-%d" 2500
    rmdir "$DIR/$tdir"
}
run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)"
Robert Read's avatar
Robert Read committed

Oleg Drokin's avatar
Oleg Drokin committed
# log_commit_thread vs filter_destroy race used to lead to import use after free
# bug 11658
#
# Creates and unlinks 200 files in $DIR/$tdir, then fails over ost1 and the
# MDS while OBD_FAIL_PTLRPC_DELAY_RECOV is set on ost1. After clearing the
# fail_loc it waits 20 seconds and removes the directory.
# The return status is that of the final rmdir.
test_59() {
    mkdir "$DIR/$tdir"
    createmany -o "$DIR/$tdir/$tfile-%d" 200
    sync
    unlinkmany "$DIR/$tdir/$tfile-%d" 200
#define OBD_FAIL_PTLRPC_DELAY_RECOV       0x507
    do_facet ost1 "sysctl -w lustre.fail_loc=0x507"
    fail ost1
    fail mds
    do_facet ost1 "sysctl -w lustre.fail_loc=0x0"
    sleep 20
    rmdir "$DIR/$tdir"
}
run_test 59 "test log_commit_thread vs filter_destroy race"

Wang Di's avatar
Wang Di committed
# Race between adding unlink llog records and cat log init in post_recovery
# (only for b1_6).
# bug 12086: there should be no oops and no "No ctxt" error for this test.
test_60() {
    local workdir=$DIR/$tdir
    local name_fmt=$workdir/$tfile-%d

    mkdir $workdir
    createmany -o $name_fmt 200
    replay_barrier mds
    # the first 100 files are unlinked before the MDS failover, the rest after
    unlinkmany $name_fmt 0 100
    fail mds
    unlinkmany $name_fmt 100 100

    # any "No ctxt" line in the kernel log counts as a failure
    local ctxt_msgs=$(dmesg | grep "No ctxt")
    if [ -n "$ctxt_msgs" ]; then
        error "ctxt is not initialized in recovery"
    fi
}
run_test 60 "test llog post recovery init vs llog unlink"
Oleg Drokin's avatar
Oleg Drokin committed

Wang Di's avatar
Wang Di committed
#test race  llog recovery thread vs llog cleanup
#
# Creates 800 files, sets a replay barrier on ost1, unlinks them, then fails
# ost1 over twice (10 s apart) with OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT armed.
# Returns: 1 if $CHECKSTAT still finds one of the unlinked files; otherwise
#          the status of the final rmdir.
test_61a() {
    mkdir "$DIR/$tdir"
    createmany -o "$DIR/$tdir/$tfile-%d" 800
    replay_barrier ost1
#   OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
    unlinkmany "$DIR/$tdir/$tfile-%d" 800
    # use the same facet (ost1) that is barriered and failed over below
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000221"
    facet_failover ost1
    sleep 10
    fail ost1
    sleep 30
    do_facet ost1 "sysctl -w lustre.fail_loc=0x0"
    $CHECKSTAT -t file "$DIR/$tdir/$tfile"-* && return 1
    rmdir "$DIR/$tdir"
}
Nathan Rutman's avatar
Nathan Rutman committed
run_test 61a "test race llog recovery vs llog cleanup"
Wang Di's avatar
Wang Di committed

#test race  mds llog sync vs llog cleanup
#
# Fails the MDS over twice (10 s apart) with the llog sync timeout fail_loc
# armed, then writes one 4k block from the client.
# Returns 1 if that write fails, 0 otherwise.
test_61b() {
    # OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
    local sync_timeout_loc=0x8000013a

    do_facet mds "sysctl -w lustre.fail_loc=$sync_timeout_loc"
    facet_failover mds
    sleep 10
    fail mds
    if ! do_facet client dd if=/dev/zero of=$DIR/$tfile bs=4k count=1; then
        return 1
    fi
}
run_test 61b "test race mds llog sync vs llog cleanup"

#test race  cancel cookie cb vs llog cleanup
#
# Creates $DIR/$tfile, arms OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT on ost1,
# removes the file, waits 10 seconds and fails ost1 over.
# The return status is that of the final "fail ost1".
test_61c() {
#   OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
    touch "$DIR/$tfile"
    # arm the fail_loc on the same facet (ost1) that is failed over below
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000222"
    rm "$DIR/$tfile"
    sleep 10
    fail ost1
}
run_test 61c "test race cancel cookie cb vs llog cleanup"

Nathan Rutman's avatar
Nathan Rutman committed
#Adaptive Timeouts
Nathan Rutman's avatar
Nathan Rutman committed
# Prepares the adaptive-timeout (AT) tests: when ATOLDBASE is empty, stores
# the MDS's current lustre.adaptive_history in it and sets adaptive_history
# to 8 on the mds and ost1 facets. Does nothing when ATOLDBASE is already set.
# Globals: ATOLDBASE (read, written)
at_start() #bug 3055
{
    if [ -z "$ATOLDBASE" ]; then
        ATOLDBASE=$(do_facet mds "sysctl -n lustre.adaptive_history")
        # speed up the timebase so we can check decreasing AT
        do_facet mds "sysctl -w lustre.adaptive_history=8"
        do_facet ost1 "sysctl -w lustre.adaptive_history=8"
    fi
}

# AT: verify early replies.
# Sets fail_val=30000 and OBD_FAIL_PTLRPC_PAUSE_REQ on the MDS, runs 10
# creates and unlinks, and calls error unless "$LCTL dk" contains an
# "Early reply #" line. The client's mdc timeouts are printed before and
# after a 9 second sleep.
# NOTE(review): fail_val/fail_loc are not reset here - confirm that is intended.
test_65() #bug 3055
{
    at_start
    # first dump is discarded
    $LCTL dk > /dev/null
    # slow down a request
    do_facet mds sysctl -w lustre.fail_val=30000
#define OBD_FAIL_PTLRPC_PAUSE_REQ        0x50a
    do_facet mds sysctl -w lustre.fail_loc=0x8000050a
    createmany -o "$DIR/$tfile" 10 > /dev/null
    unlinkmany "$DIR/$tfile" 10 > /dev/null
    # check for log message
    $LCTL dk | grep "Early reply #" || error "No early reply"
    # client should show 30s timeouts
    grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
    sleep 9
    grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
}
run_test 65 "AT: verify early replies"

# AT: verify MDT service time adjusts with no early replies.
# Pauses MDS requests with fail_val 5000 and then 10000, printing the client's
# "portal 12" timeout line after each round; after the fail_loc is cleared and
# a 9 s sleep, calls error unless the current value is below the worst one.
# Globals: CUR, WORST (written)
test_66a() #bug 3055
{
    # the glob stays unexpanded in the variable and is resolved at each use
    local mdc_timeouts="$LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts"
    local pause_val

    at_start
    grep "portal 12" $mdc_timeouts
    # adjust 5s at a time so no early reply is sent (within deadline)
    for pause_val in 5000 10000; do
        do_facet mds "sysctl -w lustre.fail_val=$pause_val"
#define OBD_FAIL_PTLRPC_PAUSE_REQ        0x50a
        do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
        createmany -o $DIR/$tfile 20 > /dev/null
        unlinkmany $DIR/$tfile 20 > /dev/null
        grep "portal 12" $mdc_timeouts
    done
    do_facet mds "sysctl -w lustre.fail_loc=0"
    sleep 9
    createmany -o $DIR/$tfile 20 > /dev/null
    unlinkmany $DIR/$tfile 20 > /dev/null
    grep portal $mdc_timeouts | grep "portal 12"
    CUR=$(awk '/portal 12/ {print $5}' $mdc_timeouts)
    WORST=$(awk '/portal 12/ {print $7}' $mdc_timeouts)
    echo "Current MDT timeout $CUR, worst $WORST"
    if ! [ $CUR -lt $WORST ]; then
        error "Current $CUR should be less than worst $WORST"
    fi
}
run_test 66a "AT: verify MDT service time adjusts with no early replies"

# AT: verify net latency adjusts.
# Reads the client's current "network" timeout, sets the local fail_val to
# that value plus 5, triggers OBD_FAIL_PTLRPC_PAUSE_REP around an ls, and
# calls error unless the worst network value is now above the original.
# Globals: ORIG, CUR, WORST (written)
test_66b() #bug 3055
{
    at_start
    # use ${FSNAME} like the CUR/WORST reads below, not a hard-coded fs name
    ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
    sysctl -w lustre.fail_val=$(($ORIG + 5))
#define OBD_FAIL_PTLRPC_PAUSE_REP      0x50c
    sysctl -w lustre.fail_loc=0x50c
    ls $DIR/$tfile > /dev/null 2>&1
    sysctl -w lustre.fail_loc=0
    CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
    WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts)
    echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
    [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG"
}
run_test 66b "AT: verify net latency adjusts"

# AT: verify slow request processing doesn't induce reconnects.
# Sums the "_connect" counters of all osc stats files before and after a
# create/unlink run with OBD_FAIL_PTLRPC_PAUSE_REQ set on ost1, and calls
# error when the sum grew. Always returns 0.
# Globals: CONN1, CONN2, ATTEMPTS (written)
test_67a() #bug 3055
{
    local count_connects='/_connect/ {total+=$2} END {print total}'

    at_start
    CONN1=$(awk "$count_connects" $LPROC/osc/*/stats)
    # sleeping threads may drive values above this
    do_facet ost1 "sysctl -w lustre.fail_val=400"
#define OBD_FAIL_PTLRPC_PAUSE_REQ    0x50a
    do_facet ost1 "sysctl -w lustre.fail_loc=0x50a"
    createmany -o $DIR/$tfile 20 > /dev/null
    unlinkmany $DIR/$tfile 20 > /dev/null
    do_facet ost1 "sysctl -w lustre.fail_loc=0"
    CONN2=$(awk "$count_connects" $LPROC/osc/*/stats)
    ATTEMPTS=$(($CONN2 - $CONN1))
    echo "$ATTEMPTS osc reconnect attemps on gradual slow"
    if [ $ATTEMPTS -gt 0 ]; then
        error "AT should have prevented reconnect"
    fi
    return 0
}
run_test 67a "AT: verify slow request processing doesn't induce reconnects"

# AT: verify instant slowdown doesn't induce reconnects.
# Phase 1 copies a file with OBD_FAIL_OST_PAUSE_CREATE armed on ost1
# (fail_val 20000) and reports how many osc connects that caused. Phase 2
# repeats the copy and calls error if the connect count grew again.
# Always returns 0.
# Globals: CONN1, CONN2, CONN3, ATTEMPTS (written)
test_67b() #bug 3055
{
    local count_connects='/_connect/ {total+=$2} END {print total}'
    local create_timeouts=$LPROC/ost/OSS/ost_create/timeouts

    at_start
    CONN1=$(awk "$count_connects" $LPROC/osc/*/stats)
#define OBD_FAIL_OST_PAUSE_CREATE        0x223
    do_facet ost1 "sysctl -w lustre.fail_val=20000"
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
    if ! cp /etc/profile $DIR/$tfile; then
        error "cp failed"
    fi
    client_reconnect
    cat $create_timeouts
    log "phase 2"
    CONN2=$(awk "$count_connects" $LPROC/osc/*/stats)
    ATTEMPTS=$(($CONN2 - $CONN1))
    echo "$ATTEMPTS osc reconnect attemps on instant slow"

    # do it again; should not timeout
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
    if ! cp /etc/profile $DIR/$tfile; then
        error "cp failed"
    fi
    do_facet ost1 "sysctl -w lustre.fail_loc=0"
    client_reconnect
    cat $create_timeouts
    CONN3=$(awk "$count_connects" $LPROC/osc/*/stats)
    ATTEMPTS=$(($CONN3 - $CONN2))
    echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
    if [ $ATTEMPTS -gt 0 ]; then
        error "AT should have prevented reconnect"
    fi
    return 0
}
run_test 67b "AT: verify instant slowdown doesn't induce reconnects"

# Restore lustre.adaptive_history on mds and ost1 to the value saved in
# ATOLDBASE; skipped when ATOLDBASE is empty.
if [ -n "$ATOLDBASE" ]; then
    do_facet mds "sysctl -w lustre.adaptive_history=$ATOLDBASE"
    do_facet ost1 "sysctl -w lustre.adaptive_history=$ATOLDBASE"
fi
# end of AT tests (the restore block above belongs to them)


equals_msg `basename $0`: test complete, cleaning up
check_and_cleanup_lustre
# print the suite log if there is one; never fail the script on this step
[ -f "$TESTSUITELOG" ] && cat "$TESTSUITELOG" || true