* Added obdfilter-survey + README

d03c475e · Eric Barton · 1add4b9c · d03c475e · d03c475e · d03c475e
Commit d03c475e authored 20 years ago by Eric Barton
--- a/lustre-iokit/obdfilter-survey/README
+++ b/lustre-iokit/obdfilter-survey/README
+Requirements
+------------
+. lustre OSS up and running
+Overview
+--------
+This survey may be used to characterise the performance of a lustre OSS.
+It can exercise the OSS either locally or remotely via the network.
+The script uses lctl::test_brw to drive the echo_client doing sequential
+I/O with varying numbers of threads and objects.  One instance of lctl is
+spawned for each OST.
+Running
+-------
+The script must be customised according to the particular device under test
+and where it should keep its working files.   Customisation variables are
+described clearly at the start of the script.
+When the script runs, it creates a number of working files and a pair of
+result files.  All files start with the prefix given by ${rslt}.
+${rslt}_<date/time>.summary       same as stdout
+${rslt}_<date/time>.detail_tmp*   tmp files
+${rslt}_<date/time>.detail        collected tmp files for post-mortem
+The script iterates over the given numbers of threads and objects
+performing all the specified tests and checking that all test processes
+completed successfully.
+Local OSS
+---------
+To test a local OSS, setup 'ost_names' with the names of each OST.  If you
+are unsure, do 'lctl device_list' and look for obdfilter instances, e.g...
+[root@ns9 root]# lctl device_list
+  0 UP confobd conf_ost3 OSD_ost3_ns9_UUID 1
+  1 UP obdfilter ost3 ost3_UUID 1
+  2 UP ost OSS OSS_UUID 1
+  3 AT confobd conf_ost12 OSD_ost12_ns9_UUID 1
+[root@ns9 root]# 
+Here device number 1 is an obdfilter instance called 'ost3'.
+The script configures an instance of echo_client for each name in ost_names
+and tears it down on normal completion.  Note that it does NOT clean up
+properly (i.e. manual cleanup is required) if it is not allowed to run to
+completion.
+Remote OSS
+----------
+To test OSS performance over the network, you need to create a lustre
+configuration that creates echo_client instances for each OST.
+Script output
+-------------
+The summary file and stdout contain lines like...
+ost 8 sz 67108864K rsz 1024 obj    8 thr    8 write  613.54 [ 64.00, 82.00] 
+ost 8          is the total number of OSTs under test.
+sz 67108864K   is the total amount of data read or written (in K).
+rsz 1024       is the record size (size of each echo_client I/O).
+obj    8       is the total number of objects over all OSTs
+thr    8       is the total number of threads over all OSTs and objects
+write          is the test name.  If more tests have been specified they
+               all appear on the same line.
+613.54         is the aggregate bandwidth over all OSTs measured by
+               dividing the total number of MB by the elapsed time.
+[64.00, 82.00] are the minimum and maximum instantaneous bandwidths seen on
+               any individual OST.  
+Note that although the numbers of threads and objects are specified per-OST
+in the customisation section of the script, results are reported aggregated
+over all OSTs.
+Visualising Results
+-------------------
+I've found it most useful to import the summary data (it's fixed width)
+into Excel (or any graphing package) and graph bandwidth v. # threads for
+varying numbers of concurrent regions.  This shows how the device performs
+with varying queue depth.  If the series (varying numbers of concurrent
+regions) all seem to land on top of each other, it shows the device is
+phased by seeks at the given record size.
--- a/lustre-iokit/obdfilter-survey/obdfilter-survey
+++ b/lustre-iokit/obdfilter-survey/obdfilter-survey
+#!/bin/bash
+######################################################################
+# customize per survey
+# specify either the obdecho client names or the obdfilter names
+# (entry N of client_names takes precedence over entry N of ost_names; when
+# only the OST name is given, an echo_client called <ost>_echo_client is used)
+client_names=()
+ost_names=(ost{1,2,3,4,5,6,7,8})
+# result file prefix
+rslt=/tmp/obdfilter_survey
+# lustre root (leave blank unless running with own source)
+# when set, lctl and the obdecho module are taken from under this directory
+lustre_root=
+# what to do (we always do an initial write)
+# any test name containing "write" is run as a write, anything else as a read
+#tests="rewrite read reread rewrite_again"
+tests="rewrite read"
+# total size (MBytes) transferred per OST by each test
+# large enough to avoid cache effects
+size=8192
+# record size (KBytes); swept from rszlo to rszhi, doubling each step
+rszlo=1024
+rszhi=1024
+# number of objects per OST; swept from nobjlo to nobjhi, doubling each step
+nobjlo=1
+nobjhi=32
+# threads per OST (1024 max); swept from thrlo to thrhi, doubling each step
+# combinations with fewer threads than objects are skipped
+thrlo=1
+thrhi=128
+# restart from here iff all are defined
+restart_rsz=
+restart_thr=1
+restart_nobj=1
+# machine's page size (KBytes: the KByte record size is divided by it to get
+# the number of pages per I/O)
+PAGE_SIZE=64
+# max buffer_mem (total_threads * buffer size), in KBytes
+# (to avoid lctl ENOMEM problems)
+max_buffer_mem=$((256*1024))
+#####################################################################
+# passed to lctl as the option "-$snap" when running test_brw
+snap=1
+# non-zero: data is checked; zero: an 'x' is appended to the test_brw
+# read/write type to disable the data check
+verify=1
+# Succeed (exit status 0) if the output of lsmod contains "obdecho".
+# grep's own output and errors are discarded; only the exit status matters.
+check_obdecho() {
+    lsmod | grep obdecho > /dev/null 2>&1
+}
+# Remember whether obdecho had to be loaded by this script: load_obdecho is 1
+# when the module was not already present, and the end of the script only
+# unloads the module in that case.
+check_obdecho
+load_obdecho=$(($? != 0))
+if [ -z "$lustre_root" ]; then
+    # installed lustre: use lctl from PATH and let modprobe find the module
+    lctl=lctl
+    if ((load_obdecho)); then
+	modprobe obdecho
+    fi
+else
+    # private build tree: use its lctl and insert the module file directly,
+    # preferring the .ko file and falling back to the .o file
+    lctl=${lustre_root}/lctl
+    if ((load_obdecho)); then
+	if [ -f ${lustre_root}/obdecho/obdecho.ko ]; then
+	    insmod ${lustre_root}/obdecho/obdecho.ko
+	else
+	    insmod ${lustre_root}/obdecho/obdecho.o
+	fi
+    fi
+fi
+# Abort if the module is still missing.  The exit must run in the main shell:
+# inside a ( ... ) subshell it would only end the subshell and the survey
+# would carry on without obdecho.
+if ! check_obdecho; then
+    echo "Can't load obdecho" 1>&2
+    exit 1
+fi
+# Print the device number (first column of 'lctl device_list') of the first
+# listed device whose second column is "UP" and whose type and name match.
+# Prints nothing if no such device is listed.
+#   $1 - device type to match against the third column
+#   $2 - device name to match against the fourth column
+get_devno () {
+    local type=$1
+    local name=$2
+    $lctl device_list | awk "{if (\$2 == \"UP\" && \$3 == \"$type\" && \$4 == \"$name\") {\
+	                         print \$1; exit}}"
+}
+# Find, or set up, the echo_client device for entry $1 of the
+# client_names / ost_names arrays.
+# On success prints "<devno> <client name>" on stdout, followed by an extra
+# word "1" when the echo_client was attached by this call.
+# On failure prints a diagnostic on stderr and nothing on stdout.
+#   $1 - index into client_names / ost_names
+get_ec_devno () {
+    local idx=$1
+    local client_name=${client_names[idx]}
+    local ost_name=${ost_names[idx]}
+    # no explicit client name: derive one from the OST name
+    if [ -z "$client_name" ]; then
+	if [ -z "$ost_name" ]; then
+	    echo "client and ost name both null" 1>&2
+	    return
+	fi
+	client_name=${ost_name}_echo_client
+    fi
+    # reuse an echo_client of that name if one is already UP
+    ec=`get_devno echo_client $client_name`
+    if [ -n "$ec" ]; then
+	echo $ec $client_name 
+	return
+    fi
+    # otherwise one has to be created, which needs an UP obdfilter device
+    # with the given OST name
+    if [ -z "$ost_name" ]; then
+	echo "no echo client and ost_name not set" 1>&2
+	return
+    fi
+    ost=`get_devno obdfilter $ost_name`
+    if [ -z "$ost" ]; then
+	echo "OST $ost_name not setup" 1>&2
+	return
+    fi
+    # attach a new echo_client and set it up against the OST
+    $lctl <<EOF
+        attach echo_client $client_name ${client_name}_UUID
+	setup $ost_name
+EOF
+    # confirm the new device is now listed as UP
+    ec=`get_devno echo_client $client_name`
+    if [ -z "$ec" ]; then
+	echo "Can't setup echo client" 1>&2
+	return
+    fi
+    # the third word tells the caller this echo_client was created here
+    echo $ec $client_name 1
+}
+# Clean up and detach the echo_client named client_names[$1], but only when
+# do_teardown_ec[$1] is non-zero; otherwise do nothing.
+#   $1 - index into client_names / do_teardown_ec
+teardown_ec_devno () {
+    local idx=$1
+    local client_name=${client_names[$idx]}
+    if ((do_teardown_ec[$idx])); then
+	$lctl <<EOF
+	cfg $client_name
+	cleanup
+	detach
+EOF
+   fi
+}
+create_objects () {
+    # create a set of objects, check there are 'n' contiguous ones and
+    # return the first or 'ERROR'
+    #   $1 - device number passed to 'lctl --device'
+    #   $2 - number of objects to create
+    #   $3 - file that receives everything lctl prints; it is left in place
+    #        for the caller
+    # The result (first object id, or the word ERROR) is printed on stdout.
+    local devno=$1
+    local nobj=$2
+    local rfile=$3
+    $lctl --device $devno create $nobj > $rfile 2>&1
+    # The awk program reads the object id from field 6 of every line
+    # containing "is object id", remembers the first id and stops at the
+    # first id that is not exactly one more than the previous one.  It
+    # prints "<first id> <number of contiguous ids>", captured as array n.
+    n=(`awk < $rfile \
+	'/is object id/ {obj=strtonum($6);\
+	                 first=!not_first; not_first=1;\
+	                 if (first) first_obj=obj;
+		         else if (obj != prev + 1) exit;\
+	                 prev=obj; n++}\
+            END {printf "%d %d\n", first_obj, n}'`)
+    # anything other than exactly nobj contiguous ids counts as a failure
+    if ((n[1] != nobj)); then
+	echo "ERROR"
+    else
+	echo ${n[0]}
+    fi
+}
+# Run 'lctl destroy' on a device, passing a starting object id and an object
+# count, and capture everything lctl prints in a file.
+#   $1 - device number passed to 'lctl --device'
+#   $2 - first object id
+#   $3 - number of objects
+#   $4 - file that receives lctl's stdout and stderr
+destroy_objects () {
+    local dev=$1 first_id=$2 how_many=$3 logf=$4
+    $lctl --device $dev destroy $first_id $how_many > $logf 2>&1
+}
+# Summarise the lctl output of one test run stored in file $1.
+# Prints "<n> <min> <max>" where n is the number of
+# "<num>/<num> Total: <rate>/second" lines seen since the last
+# "Selected device <num>" line, and min/max are the smallest and largest
+# rates on those lines.
+# n is -1 if any line contains "error", or if a line of any other form
+# appears after at least one Total line has been counted.  Other lines seen
+# while n is still 0 are ignored.
+#   $1 - file holding the lctl output
+get_stats () {
+    local rfile=$1
+    awk < $rfile \
+	'/^Selected device [0-9]+$/ {n = 0; next}\
+	/error/ {n = -1; exit}\
+	/^[0-9]+\/[0-9]+ Total: [0-9]+\.[0-9]+\/second$/ {n++; v=strtonum($3); \
+	                                                  if (n == 1 || v < min) min = v;\
+	                                                  if (n == 1 || v > max) max = v;\
+	                                                  next}\
+        {if (n != 0) {n = -1; exit}}\
+	END {printf "%d %f %f\n", n, min, max}'
+}
+# Combine the "<n> <min> <max>" lines in file $1 (one per device, in the
+# format get_stats prints) into a single line of the same format: the
+# smallest first field, the smallest min and the largest max.  A negative
+# first field on any input line therefore makes the combined first field
+# negative.  With no input lines the first field is printed as 0.
+#   $1 - file holding the per-device lines
+get_global_stats () {
+    local rfile=$1
+    awk < $rfile 'BEGIN {n = 0;}\
+	          {n++; if (n == 1) {err = $1; min = $2; max = $3} else\
+	                            {if ($1 < err) err = $1;\
+                                     if ($2 < min) min = $2;\
+				     if ($3 > max) max = $3}}\
+	          END {if (n == 0) err = 0;\
+		       printf "%d %f %f\n", err, min, max}'
+}
+# Map a test name ($1) to the read/write type word handed to test_brw:
+# "w" for any name containing "write", "r" for everything else.  When the
+# 'verify' setting is zero an 'x' is appended, which disables the data check.
+testname2type () {
+    x="x"
+    if ((verify)); then
+	x=""
+    fi
+    case $1 in
+	*write*)
+	    echo "w$x"
+	    ;;
+	*)
+	    echo "r$x"
+	    ;;
+    esac
+}
+start=`date +%F@%R`
+rsltf="${rslt}_${start}.summary"
+echo -n > $rsltf
+workf="${rslt}_${start}.detail"
+echo -n > $workf
+print_summary () {
+    if [ "$1" = "-n" ]; then
+	minusn=$1; shift
+    else
+	minusn=""
+    fi
+    echo $minusn "$*" >> $rsltf
+    echo $minusn "$*"
+}
+# The number of devices under test is the length of the longer of the two
+# name arrays.
+ndevs=${#ost_names[@]}
+if ((${#client_names[@]} > ndevs)); then
+    ndevs=${#client_names[@]}
+fi
+# Resolve the echo_client for every index, recording its device number, its
+# name and whether it has to be torn down at the end.
+for ((idx = 0; idx < ndevs; idx++)); do
+    devno=($(get_ec_devno $idx))
+    # fewer than two words means get_ec_devno failed; it has already printed
+    # the reason on stderr
+    ((${#devno[@]} >= 2)) || exit 1
+    devnos[$idx]=${devno[0]}
+    client_names[$idx]=${devno[1]}
+    # a third word is only present when get_ec_devno attached the device
+    do_teardown_ec[$idx]=$((${#devno[@]} > 2))
+done
+# write 0 to the portals debug setting before measuring
+# (presumably so that debug logging does not skew the results - confirm)
+echo 0 > /proc/sys/portals/debug
+# Sweep record size, objects per OST and threads per OST, doubling each one
+# from its 'lo' to its 'hi' setting.
+for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do
+    for ((nobj=$nobjlo;nobj<=$nobjhi;nobj*=2)); do 
+	for ((thr=$thrlo;thr<=$thrhi;thr*=2)); do
+	    # skip combinations with fewer threads than objects
+	    if ((thr < nobj)); then
+		continue
+	    fi
+	    # restart?
+	    # (only when all three restart_* settings are non-empty: skip
+	    # every combination that comes before the restart point)
+	    if [ -n "$restart_rsz" -a\
+		 -n "$restart_nobj" -a\
+		 -n "$restart_thr" ]; then
+		if ((rsz < restart_rsz ||\
+		     (rsz == restart_rsz &&\
+		      (nobj < restart_nobj ||\
+		       (nobj == restart_nobj &&\
+			thr < restart_thr))))); then
+		    continue;
+		fi
+	    fi
+	    # compute parameters
+	    # totals are over all devices; the record size is rounded down to
+	    # a whole number of pages; count is the number of records each
+	    # thread transfers so that one device moves about $size MBytes;
+	    # actual_size and total_size are in KBytes
+	    total_thr=$((ndevs*thr))
+	    total_nobj=$((ndevs*nobj))
+	    pages=$((rsz/PAGE_SIZE))
+	    actual_rsz=$((pages*PAGE_SIZE))
+	    count=$((size*1024/(actual_rsz*thr)))
+	    actual_size=$((actual_rsz*count*thr))
+            total_size=$((actual_size*ndevs))
+	    # show computed parameters
+	    str=`printf 'ost %2d sz %8dK rsz %4d obj %4d thr %4d ' \
+		     $ndevs $total_size $actual_rsz $total_nobj $total_thr`
+	    echo "=======================> $str" >> $workf
+	    print_summary -n "$str"
+	    # skip combinations whose total buffer space (threads * record
+	    # size) exceeds the configured limit
+	    if ((total_thr * actual_rsz > max_buffer_mem)); then
+		print_summary "Too much buffer space"
+		continue
+	    fi
+	    # create the objects
+	    # (nobj per device; the raw lctl output goes to the detail file
+	    # and the first object id of each device is remembered)
+	    tmpf="${workf}_tmp"
+	    for ((idx=0; idx < ndevs; idx++)); do
+		devno=${devnos[$idx]}
+		first_obj=`create_objects $devno $nobj $tmpf`
+		echo "========> Create [$idx]" >> $workf
+		cat $tmpf >> $workf
+		rm $tmpf
+		if [ $first_obj = "ERROR" ]; then
+		    print_summary "created object #s [$idx] not contiguous"
+		    exit 1
+		fi
+		first_objs[$idx]=$first_obj
+	    done
+	    # always run an initial write, then the configured tests
+	    for test in write $tests; do
+		print_summary -n "$test "
+		t0=`date +%s.%N`
+		# start one lctl per device in the background, each writing
+		# its output to its own temporary file, then wait for all
+		for ((idx=0; idx < ndevs; idx++)); do
+		    devno=${devnos[$idx]}
+		    tmpfi="${tmpf}_$idx"
+		    first_obj=${first_objs[$idx]}
+		    $lctl > $tmpfi 2>&1 \
+			--threads $thr -$snap $devno \
+			test_brw $count `testname2type $test` q $pages ${thr}t${first_obj} &
+		done
+		wait
+		t1=`date +%s.%N`
+		# aggregate bandwidth in MB/s: total KBytes over elapsed
+		# seconds, divided by 1024
+		str=`awk "BEGIN {printf \"%7.2f \",\
+		         $total_size / (( $t1 - $t0 ) * 1024)}"`
+		print_summary -n "$str"
+		# collect each device's output into the detail file and its
+		# one-line statistics into $tmpf
+		echo -n > $tmpf
+		for ((idx=0; idx < ndevs; idx++)); do
+		    tmpfi="${tmpf}_$idx"
+		    echo "========> $test [$idx]" >> $workf
+		    cat $tmpfi >> $workf
+		    get_stats $tmpfi >> $tmpf
+		    rm $tmpfi
+		done
+		# note: idx equals ndevs here, as the loop above has finished
+		echo "========> $test [$idx] global" >> $workf
+		cat $tmpf >> $workf
+		stats=(`get_global_stats $tmpf`)
+		rm $tmpf
+		# stats[0] < 0: some device reported an error;
+		# stats[0] == 0: some device produced no Total lines;
+		# otherwise print the min and max rates, multiplied by the
+		# record size in KBytes and divided by 1024
+		if ((stats[0] <= 0)); then
+		    if ((stats[0] < 0)); then
+			str=`printf "%15s " ERROR`
+		    else
+			str=`printf "%15s " SHORT`
+		    fi
+		else
+		    str=`awk "BEGIN {printf \"[%6.2f,%6.2f] \",\
+			     (${stats[1]} * $actual_rsz)/1024,\
+			     (${stats[2]} * $actual_rsz)/1024; exit}"`
+		fi
+		print_summary -n "$str"
+	    done
+	    # end the summary line for this combination
+	    print_summary ""
+	    # destroy the objects created for this combination
+	    for ((idx=0; idx < ndevs; idx++)); do
+		devno=${devnos[$idx]}
+		first_obj=${first_objs[$idx]}
+		destroy_objects $devno $first_obj $nobj $tmpf
+		echo "========> Destroy [$idx]" >> $workf
+		cat $tmpf >> $workf
+		rm $tmpf
+	    done
+	done
+    done
+done
+# Normal completion: tear down the echo_clients (teardown_ec_devno only acts
+# on those flagged in do_teardown_ec).
+for ((idx=0; idx < ndevs; idx++)); do
+    teardown_ec_devno $idx
+done
+# unload obdecho only if it was not already loaded when the script started
+if ((load_obdecho)); then
+    rmmod obdecho
+fi
--- a/lustre-iokit/sgpdd-survey/README
+++ b/lustre-iokit/sgpdd-survey/README
@@ -41,9 +41,9 @@ The summary file and stdout contain lines like...
 total_size  8388608K rsz 1024 thr     1 crg   1  180.45 MB/s   1 x  180.50 =  180.50 MB/s
-The first number is the bandwidth computed by measuring total data and
+The number immediately before the first MB/s is the bandwidth computed by
-elapsed time.  The other numbers are a check on the bandwidths reported by
+measuring total data and elapsed time.  The other numbers are a check on
-the individual sgp_dd instances.
+the bandwidths reported by the individual sgp_dd instances.
 If there are so many threads that sgp_dd is unlikely to be able to allocate
 I/O buffers, "ENOMEM" is printed.

--- a/lustre-iokit/sgpdd-survey/sgpdd-survey
+++ b/lustre-iokit/sgpdd-survey/sgpdd-survey
@@ -74,8 +74,8 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do
 	    # show computed parameters
 	    actual_rsz=$((bpt*bs/1024))
 	    actual_size=$((bs*count*crg/1024))
-	    str=`printf 'total_size %8dK rsz %4d thr %5d crg %3d ' \
+	    str=`printf 'total_size %8dK rsz %4d crg %5d thr %3d ' \
-		         $actual_size $actual_rsz $thr $crg`
+		         $actual_size $actual_rsz $crg $thr`
 	    echo "==============> $str" >> $workf
 	    print_summary -n "$str"
 	    freemem=`awk < /proc/meminfo '/^MemTotal:/ {printf "%d\n", $2}'`