#######################################
# Wait for local $PDSH processes that are running a given program against
# $MOUNT to go away; SIGKILL whatever is still around once the timeout
# has elapsed.
#
# Matching is done by grepping `ps uax` output for the regular expression
# "$PDSH.*<prog>.*$MOUNT", so <prog> is treated as a regex fragment.
#
# Globals:   PDSH  (read) - remote shell command; "no_dsh" disables the wait
#            MOUNT (read) - mount point expected on the command line
# Arguments: $1 - program name to look for (e.g. df)
#            $2 - maximum number of seconds to wait (default/invalid: 0)
# Outputs:   progress and diagnostics on stdout
# Returns:   0 if no matching process is left,
#            1 if at least one process is still listed after SIGKILL
#######################################
wait_remote_prog () {
	local prog=$1
	local max_wait=${2:-0}
	local WAIT=0
	local INTERVAL=5
	local rc=0
	local pattern running pids pid

	[ "$PDSH" = "no_dsh" ] && return 0

	# A missing or non-numeric timeout used to make the loop test below
	# fail with a shell error; treat it explicitly as "do not wait".
	case $max_wait in
	''|*[!0-9]*)
		echo "wait_remote_prog: invalid timeout '$max_wait', using 0" >&2
		max_wait=0
		;;
	esac

	pattern="$PDSH.*$prog.*$MOUNT"

	while [ "$WAIT" -lt "$max_wait" ]; do
		# grep exits non-zero when nothing matches, which is the success
		# case here; "|| true" keeps callers running under "set -e" alive.
		running=$(ps uax | grep -e "$pattern" | grep -v grep) || true
		[ -z "$running" ] && return 0
		echo "waited $WAIT for: "
		echo "$running"
		# back off: double the polling interval until it reaches 60s
		[ "$INTERVAL" -lt 60 ] && INTERVAL=$((INTERVAL + INTERVAL))
		sleep "$INTERVAL"
		WAIT=$((WAIT + INTERVAL))
	done

	pids=$(ps uax | grep -e "$pattern" | grep -v grep |
		awk '{print $2}') || true
	[ -z "$pids" ] && return 0

	echo "$PDSH processes still exists after $WAIT seconds. Still running: $pids"
	# $pids is deliberately unquoted: one PID per word
	for pid in $pids; do
		# best-effort diagnostics; the process may already be gone
		cat "/proc/${pid}/status" || true
		cat "/proc/${pid}/wchan" || true
		echo "Killing $pid"
		kill -9 "$pid" || true
		sleep 1
		# -p selects by PID; success means the process is still listed
		ps -p "$pid" && rc=1
	done

	return $rc
}