From 7eb3e9a6594ae3ad9377eb02e663f38a0059ef5a Mon Sep 17 00:00:00 2001
From: adilger <adilger>
Date: Sun, 16 Mar 2008 01:40:52 +0000
Subject: [PATCH] Branch b1_6

Add llbackup tool for parallel backups.  It isn't really Lustre-specific,
just using tar in parallel on multiple nodes.
b=14711
i=johann
i=bowen.zhou
---
 lustre/utils/llbackup | 537 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 537 insertions(+)
 create mode 100755 lustre/utils/llbackup

diff --git a/lustre/utils/llbackup b/lustre/utils/llbackup
new file mode 100755
index 0000000000..616c8094c1
--- /dev/null
+++ b/lustre/utils/llbackup
@@ -0,0 +1,537 @@
+#!/bin/bash
+# do a parallel backup and restore of specified files
+#
+# Copyright (C) 2008 Sun Microsystems, Inc.
+#
+# This file is part of Lustre, http://www.lustre.org.
+#
+# Lustre is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+#
+# Lustre is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Lustre; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+# Author: Andreas Dilger <adilger@sun.com>
+#
+
+VERSION=1.1.0
+
+export LC_ALL=C
+
+RSH=${RSH:-"ssh"}		# use "bash -c" for local testing
+SPLITMB=${SPLITMB:-8192}	# target chunk size (uncompressed)
+SPLITCOUNT=${SPLITCOUNT:-200}	# number of files to give each client
+TAR=${TAR:-"tar"}		# also htar in theory works (untested)
+
+#PROGPATH=$(which $0 2> /dev/null) || PROGPATH=$PWD/$0
+case $0 in
+	.*) PROGPATH=$PWD/$0 ;;
+	*) PROGPATH=$0 ;;
+esac
+
+PROGNAME="$(basename $PROGPATH)"
+LOGPREFIX="$PROGNAME"
+log() {
+	echo "$LOGPREFIX: $*" 1>&2
+}
+
+fatal() {
+	log "ERROR: $*"
+	exit 1
+}
+
+usage() {
+	log "$*"
+	echo "usage: $PROGNAME [-chjptvVxz] [-C directory] [-e rsh] [-i inputfile]"
+	echo -e "\t\t[-l logdir] [-n nodes] [-s splitmb] [-S splitcount]"
+	echo -e "\t\t[-T tar] -f ${FTYPE}base"
+	echo -e "\t-c: create archive"
+	echo -e "\t-C directory: relative directory for filenames (default PWD)"
+	echo -e "\t-e rsh: specify the passwordless remote shell (default $RSH)"
+	if [ "$OP" = "backup" ]; then
+		echo -e "\t-f outputbase: specify base output filename for backup"
+	else
+		echo -e "\t-f ${OP}filelist: specify list of files to $OP"
+	fi
+	echo -e "\t-h: print help message and exit (use -x -h for restore help)"
+	if [ "$OP" = "backup" ]; then
+		echo -e "\t-i inputfile: list of files to back up (default stdin)"
+	fi
+	echo -e "\t-j: use bzip2 compression on $FTYPE file(s)"
+	echo -e "\t-l logdir: directory for output logs"
+	echo -e "\t-n nodes: comma-separated list of client nodes to run ${OP}s"
+	if [ "$OP" = "backup" ]; then
+		echo -e "\t-s splitmb: target size for backup chunks" \
+			"(default ${SPLITMB}MiB)"
+		echo -e "\t-S splitcount: number of files sent to each client" \
+			"(default ${SPLITCOUNT})"
+	fi
+	echo -e "\t-t: list table of contents of tarfile"
+	echo -e "\t-T tar: specify the backup command (default $TAR)"
+	echo -e "\t-v: be verbose - list all files being processed"
+	echo -e "\t-V: print version number and exit"
+	echo -e "\t-x: extract files instead of backing them up"
+	echo -e "\t-z: use gzip compression on $FTYPE file(s)"
+	exit 1
+}
+
+usage_inactive() {
+	usage "inactive argument '$1 $2' in '$3' mode"
+}
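+
+# Illustrative invocations (node names and paths below are placeholders,
+# not part of the tool itself):
+#
+#   back up a tree on two client nodes, sizes unknown, so files are
+#   dished out round-robin:
+#	lfs find /mnt/lustre -type f | llbackup -c -n cli1,cli2 -f /backup/full
+#
+#   restore from a file listing the archive chunks written above:
+#	ls /backup/full.*.tar > /tmp/chunklist
+#	llbackup -x -n cli1,cli2 -f /tmp/chunklist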
mode" +} + +set_op_type() { + case $1 in + *backup*) OP=backup; FTYPE=output; TAROP="-c" ;; + *list*) OP=list; FTYPE=input; TAROP="-t"; SPLITCOUNT=1 ;; + *restore*) OP=restore; FTYPE=input; TAROP="-x"; SPLITCOUNT=1 ;; + *) FTYPE="output"; usage "unknown archive operation '$1'";; + esac +} + +#echo ARGV: "$@" + +# --fileonly, --remote are internal-use-only options +TEMPARGS=$(getopt -n $LOGPREFIX -o cC:e:f:hi:jl:n:ps:S:tT:vVxz --long create,extract,list,restore,directory:,rsh:,outputbase:,help,inputfile:,bzip2,logdir:,nodes:,permissions:splitmb,splitcount,tar:,verbose,version,gzip,fileonly,remote: -- "$@") + +eval set -- "$TEMPARGS" + +set_op_type $PROGNAME + +# parse input arguments, and accumulate the client-specific args +while true; do + case "$1" in + -c|--create) [ "$OP" != "backup" ] && + usage "can't use $1 $TAROP at the same time" + OP="backup"; ARGS="$ARGS $1"; shift ;; + -C|--directory) GOTODIR="$2"; cd "$2" || usage "error cd to -C $2"; + ARGS="$ARGS $1 \"$2\""; shift 2 ;; + -e|--rsh) RSH="$2"; shift 2;; + -f|--outputbase)OUTPUTBASE="$2";ARGS="$ARGS $1 \"$2\""; shift 2 ;; + -h|--help) ARGS=""; break;; + -i|--inputfile) INPUT="$2"; shift 2;; + -j|--bzip2) TARCOMP="-j"; ARGS="$ARGS $1"; shift ;; + -l|--logdir) LOGDIR="$2"; ARGS="$ARGS $1 \"$2\""; shift 2 ;; + -n|--nodes) NODELIST="$NODELIST,$2"; + ARGS="$ARGS $1 \"$2\""; shift 2 ;; + -p|--permissions) PERM="-p"; ARGS="$ARGS $1"; shift ;; + -s|--splitmb) [ "$OP" != "backup" ] && usage_inactive $1 $2 $OP + SPLITMB=$2; ARGS="$ARGS $1 \"$2\""; shift 2 ;; + -S|--splitcount)[ "$OP" != "backup" ] && usage_inactive $1 $2 $OP + SPLITCOUNT=$2; ARGS="$ARGS $1 \"$2\""; shift 2 ;; + -t|--list) [ "$OP" != "backup" -a "$OP" != "list" ] && + usage "can't use $1 $TAROP at the same time" + OP="list"; ARGS="$ARGS $1"; shift ;; + -T|--tar) TAR="$2"; ARGS="$ARGS $1 \"$2\""; shift 2 ;; + -v|--vebose) [ "$VERBOSE" = "-v" ] && set -vx # be extra verbose + VERBOSE="-v"; ARGS="$ARGS $1"; shift ;; + -V|--version) echo "$LOGPREFIX: version $VERSION"; exit 0;; + -x|--extract|--restore) + [ "$OP" != "backup" -a "$OP" != "restore" ] && + usage "can't use $1 $TAROP at the same time" + OP="restore"; ARGS="$ARGS $1"; shift ;; + -z|--gzip) TARCOMP="-z"; ARGS="$ARGS $1"; shift ;; + # these commands are for internal use only + --remote) NODENUM="$2"; LOGPREFIX="$(hostname).$2"; shift 2;; + --fileonly) FILEONLY="yes"; shift;; + --) shift; break;; + *) usage "unknown argument '$1'" 1>&2 ;; + esac +done + +set_op_type $OP + +#log "ARGS: $ARGS" + +[ -z "$ARGS" ] && usage "$OP a list of files, running on multiple nodes" + +# we should be able to use any backup tool that can accept filenames +# from an input file instead of just pathnames on the command-line. +# Unset TARCOMP for htar, as it doesn't support on-the-fly compression. +TAREXT= +case "$(basename $TAR)" in + htar*) TARARG="-L"; TAROUT="-f"; TARCOMP=""; MINKB=0 ;; + tar*|gnutar*|gtar*) TARARG="-T"; TAROUT="-b 2048 -f"; TAREXT=.tar ;; + *) fatal "unknown archiver '$TAR'" ;; +esac + +if [ "$OP" = "backup" ]; then + [ -z "$OUTPUTBASE" ] && usage "'-f ${FTYPE}base' must be given for $OP" + # Make sure we leave some margin free in the output filesystem for the + # chunks. If we are dumping to a network filesystem (denoted by having + # a ':' in the name, not sure how else to check) then we assume this + # filesystem is shared among all clients and expect the other nodes + # to also consume space there. 
+
+if [ "$OP" = "backup" ]; then
+	[ -z "$OUTPUTBASE" ] && usage "'-f ${FTYPE}base' must be given for $OP"
+	# Make sure we leave some margin free in the output filesystem for the
+	# chunks.  If we are dumping to a network filesystem (denoted by having
+	# a ':' in the name, not sure how else to check) then we assume this
+	# filesystem is shared among all clients and expect the other nodes
+	# to also consume space there.
+	OUTPUTFS=$(dirname $OUTPUTBASE)
+	NETFS=$(df -P $OUTPUTFS | awk '/^[[:alnum:]]*:/ { print $1 }')
+	MINKB=${MINKB:-$((SPLITMB * 2 * 1024))}
+	[ "$NETFS" ] && MINKB=$(($(echo $NODELIST | tr ',' ' ' | wc -w) * $MINKB))
+
+	# Compress the output files as we go.
+	case "$TARCOMP" in
+	-z) TAREXT="$TAREXT.gz" ;;
+	-j) TAREXT="$TAREXT.bz2" ;;
+	esac
+else
+	[ -z "$OUTPUTBASE" ] &&
+		usage "-f ${OP}filelist must be specified for $OP"
+	# We want to be able to use this for the list of files to restore,
+	# but it is convenient to use $INPUT for reading the pathnames of
+	# the tar files during restore/list operations so that stdin is
+	# handled consistently.
+	[ "$INPUT" ] && usage "-i inputfile is unsupported for $OP"
+	INPUT=$OUTPUTBASE
+	TARARG=""
+fi
+
+[ -z "$NODELIST" ] && NODELIST="localhost"
+
+# If we are writing to a char or block device (e.g. tape) don't add any
+# suffix.  We can't currently specify a different target device per client...
+if [ -b "$OUTPUTBASE" -o -c "$OUTPUTBASE" ]; then
+	MINKB=0
+	[ -z "$LOGDIR" ] && LOGDIR="/var/log"
+	LOGBASE="$LOGDIR/$PROGNAME"
+elif [ -d "$OUTPUTBASE" ]; then
+	usage "-f $OUTPUTBASE must be a pathname, not a directory"
+else
+	[ -z "$LOGDIR" ] && LOGBASE="$OUTPUTBASE" || LOGBASE="$LOGDIR/$PROGNAME"
+fi
+LOGBASE="$LOGBASE.$(date +%Y%m%d%H%M)"
+
+# Tar up a single list of files into a chunk.  We don't exit if an error
+# is returned, since that might happen frequently with e.g. files moving
+# and no longer being available for backup.
+# usage: run_one_backup {file_list} {chunk_nr} {chunkbytes}
+DONE_MSG="FINISH_THIS_PROGRAM_NOW_I_TELL_YOU"
+KILL_MSG="EXIT_THIS_PROGRAM_NOW_I_TELL_YOU"
+run_one_backup() {
+#set -vx
+	TMPLIST="$1"
+	CHUNK="$2"
+	CHUNKMB="$(($3 / 1048576))"
+	if [ -b "$OUTPUTBASE" -o -c "$OUTPUTBASE" ]; then
+		OUTFILE="$OUTPUTBASE"
+	else
+		OUTFILE="$OUTPUTBASE.$NODENUM.$CHUNK$TAREXT"
+	fi
+	CHUNKBASE="$LOGBASE.$NODENUM.$CHUNK"
+	LISTFILE="$CHUNKBASE.list"
+	LOG="$CHUNKBASE.log"
+
+	cp "$TMPLIST" "$LISTFILE"
+
+	SLEPT=0
+	FREEKB=$(df -P $OUTPUTFS 2> /dev/null | tail -n 1 | awk '{print $4}')
+	while [ $FREEKB -lt $MINKB ]; do
+		sleep 5
+		SLEPT=$((SLEPT + 5))
+		if [ $((SLEPT % 60)) -eq 10 ]; then
+			log "waiting ${SLEPT}s for ${MINKB}kB free in $OUTPUTFS"
+		fi
+		FREEKB=$(df -P $OUTPUTFS | tail -n 1 | awk '{print $4}')
+	done
+	[ $SLEPT -gt 0 ] && log "waited ${SLEPT}s for space in ${OUTPUTFS}"
+	log "$LISTFILE started - est. ${CHUNKMB}MB"
+	START=$(date +%s)
+	eval $TAR $TAROP $PERM $TARARG "$TMPLIST" -v $TARCOMP $TAROUT \
+		"$OUTFILE" 2>&1 >> "$LOG" | tee -a $LOG | grep -v "Removing leading"
+	RC=${PIPESTATUS[0]}
+	ELAPSE=$(($(date +%s) - START))
+	if [ $RC -eq 0 ]; then
+		if [ -f "$OUTFILE" ]; then
+			BYTES=$(stat -c '%s' "$OUTFILE")
+			CHUNKMB=$((BYTES / 1048576))
+			log "$LISTFILE finished - act. ${CHUNKMB}MB/${ELAPSE}s"
+		else
+			log "$LISTFILE finished OK - ${ELAPSE}s"
+		fi
+		echo "OK" > $CHUNKBASE.done
+	else
+		echo "ERROR=$RC" > $CHUNKBASE.done
+		log "ERROR: $LISTFILE exited with rc=$RC"
+	fi
+	rm $TMPLIST
+	return $RC
+}
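+
+# Naming sketch for run_one_backup() above, with illustrative values
+# (node 0, chunk 2, gzip, OUTPUTBASE=/backup/full, run at 2008-03-16 01:40):
+#	chunk:	/backup/full.0.2.tar.gz
+#	list:	/backup/full.200803160140.0.2.list
+#	log:	/backup/full.200803160140.0.2.log
+#	status:	/backup/full.200803160140.0.2.done ("OK" or "ERROR=<rc>")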
-r "$INPUTFILE" ]; do + SLEPT=$((SLEPT + 5)) + if [ $((SLEPT % 60)) -eq 10 ]; then + log "waiting ${SLEPT}s for $INPUTFILE staging" + fi + sleep 5 + done + [ $SLEPT -gt 0 ] && log "waited ${SLEPT}s for $INPUTFILE staging" + log "$OP of $INPUTFILE started" + START=$(date +%s) + eval $TAR $TAROP -v $TARCOMP $TAROUT "$INPUTFILE" 2>&1 >>"$LOG" | + tee -a "$LOG" | grep -v "Removing leading" + RC=${PIPESTATUS[0]} + ELAPSE=$(($(date +%s) - START)) + [ "$OP" = "list" ] && cat $LOG + if [ $RC -eq 0 ]; then + log "$INPUTFILE finished OK - ${ELAPSE}s" + echo "OK" > $INPUTFILE.restore.done + else + echo "ERROR=$RC" > $INPUTFILE.restore.done + log "ERROR: $OP of $INPUTFILE exited with rc=$RC" + fi + return $RC +} + +# Run as a remote command and read input filenames from stdin and create tar +# output files of the requested size. The input filenames can either be: +# "bytes filename" or "filename" depending on whether FILEONLY is set. +# +# Read filenames until we have either a large enough list of small files, +# or we get a very large single file that is backed up by itself. +run_remotely() { +#set -vx + log "started thread" + RCMAX=0 + + [ "$FILEONLY" ] && PARAMS="FILENAME" || PARAMS="BYTES FILENAME" + + if [ "$OP" = "backup" ]; then + TMPBASE=$PROGNAME.$LOGPREFIX.temp + TMPFILE="$(mktemp -t $TMPBASE.$(date +%s).XXXXXXX)" + OUTPUTFILENUM=0 + SUMBYTES=0 + fi + BYTES="" + + while read $PARAMS; do + [ "$FILENAME" = "$DONE_MSG" -o "$BYTES" = "$DONE_MSG" ] && break + if [ "$FILENAME" = "$KILL_MSG" -o "$BYTES" = "$KILL_MSG" ]; then + log "exiting $OP on request" + [ "$TARPID" ] && kill -9 $TARPID 2> /dev/null + exit 9 + fi + + case "$OP" in + list|restore) + run_one_restore_or_list $FILENAME; RC=$? + ;; + backup) STAT=($(stat -c '%s %F' "$FILENAME")) + [ "$FILEONLY" ] && BYTES=${STAT[0]} + # if this is a directory that has files in it, it will + # be backed up as part of this (or some other) backup. + # Only include it in the backup if empty, otherwise + # the files therein will be backed up multiple times + if [ "${STAT[1]}" = "directory" ]; then + NUM=`find "$FILENAME" -maxdepth 1|head -2|wc -l` + [ "$NUM" -gt 1 ] && continue + fi + [ "$VERBOSE" ] && log "$FILENAME" + + # if a file is > 3/4 of chunk size, archive by itself + # avoid shell math: 1024 * 1024 / (3/4) = 1398101 + if [ $((BYTES / 1398101)) -gt $SPLITMB ]; then + # create a very temp list for just this file + TARLIST=$(mktemp -t $TMPBASE.$(date +%s).XXXXXX) + echo "$FILENAME" > "$TARLIST" + TARBYTES=$BYTES + else + SUMBYTES=$((SUMBYTES + BYTES)) + echo "$FILENAME" >> $TMPFILE + + # not large enough input list, keep collecting + [ $((SUMBYTES >> 20)) -lt $SPLITMB ] && continue + + TARBYTES=$SUMBYTES + SUMBYTES=0 + TARLIST="$TMPFILE" + TMPFILE=$(mktemp -t $TMPBASE.$(date +%s).XXXXXXX) + fi + + wait $TARPID + RC=$? + run_one_backup "$TARLIST" "$OUTPUTFILENUM" $TARBYTES & + TARPID=$! + OUTPUTFILENUM=$((OUTPUTFILENUM + 1)) + ;; + esac + + [ $RC -gt $RCMAX ] && RCMAX=$RC + done + + if [ "$TARPID" ]; then + wait $TARPID + RC=$? + [ $RC -gt $RCMAX ] && RCMAX=$RC + fi + + if [ -s "$TMPFILE" ]; then + run_one_backup "$TMPFILE" "$OUTPUTFILENUM" $SUMBYTES + RC=$? + [ $RC -gt $RCMAX ] && RCMAX=$RC + fi + exit $RCMAX +} + +# If we are a client then just run that subroutine and exit +[ "$NODENUM" ] && run_remotely && exit 0 + +# Tell the clients to exit. Their input pipes might be busy so it may +# take a while for them to consume the files and finish. 
+
+# Tell the clients to exit.  Their input pipes might be busy, so it may
+# take a while for them to consume the files and finish.
+CLEANING=no
+cleanup() {
+	log "cleaning up remote processes"
+	for FD in $(seq $BASEFD $((BASEFD + NUMCLI - 1))); do
+		echo "$DONE_MSG" >&$FD
+	done
+	CLEANING=yes
+
+	SLEPT=0
+	RUN=$(ps auxww | egrep -v "grep|bash" | grep -c "$PROGNAME.*remote")
+	while [ $RUN -gt 0 ]; do
+		set +vx
+		#ps auxww | grep "$PROGNAME.*remote" | egrep -v "grep|bash"
+		sleep 1
+		SLEPT=$((SLEPT + 1))
+		[ $((SLEPT % 30)) -eq 0 ] &&
+			log "wait for $RUN processes to finish"
+		[ $((SLEPT % 300)) -eq 0 ] &&
+			ps auxww | grep "$PROGNAME.*remote" | egrep -v "grep|bash"
+		RUN=$(ps auxww | egrep -v "grep|bash" | grep -c "$PROGNAME.*remote")
+	done
+	trap 0
+}
+
+do_cleanup() {
+	if [ "$CLEANING" = "yes" ]; then
+		log "killing all remote processes - may not stop immediately"
+		for FD in $(seq $BASEFD $((BASEFD + NUMCLI - 1))); do
+			echo "$KILL_MSG" >&$FD
+		done
+		sleep 1
+		PROCS=$(ps auxww | grep "$PROGNAME.*remote" |
+			egrep -v "grep|bash" | awk '{ print $2 }')
+		[ "$PROCS" ] && kill -9 $PROCS
+		trap 0
+	fi
+
+	cleanup
+}
+
+# Values that only need to be determined on the master.  Always read from
+# stdin, even if the input is a file, to be more consistent.
+case "$INPUT" in
+-|"")	INPUT="standard input" ;;
+*)	if [ ! -r "$INPUT" ]; then
+		[ "$VERBOSE" ] && ls -l "$INPUT"
+		usage "can't read input file '$INPUT'"
+	fi
+	exec < $INPUT ;;
+esac
+
+# if unspecified, run remote clients in the current PWD to get correct paths
+[ -z "$GOTODIR" ] && ARGS="$ARGS -C \"$PWD\""
+
+# main()
+BASEFD=100
+NUMCLI=0
+
+# Check if the input list has the file size specified or not.  Input
+# lines should be of the form "{bytes} {filename}" or "{filename}".
+# If no size is given then the file sizes are determined by the clients
+# to do the chunking (useful for making a full backup, but not as good
+# at evenly distributing the data among clients).  In rare cases the first
+# file specified may have a blank (space) in its name and no size - check
+# for that as well.
+if [ "$OP" = "backup" ]; then
+	read BYTES FILENAME
+	if [ -z "$FILENAME" -a -e "$BYTES" ]; then
+		FILENAME="$BYTES"
+		BYTES=""
+		FILEONLY="yes" && ARGS="$ARGS --fileonly"
+	elif [ -e "$BYTES $FILENAME" ]; then
+		FILENAME="$BYTES $FILENAME"
+		BYTES=""
+		FILEONLY="yes" && ARGS="$ARGS --fileonly"
+	elif [ ! -e "$FILENAME" ]; then
+		log "input was '$BYTES $FILENAME'"
+		fatal "first line of '$INPUT' is not a file"
+	fi
+else
+	FILEONLY="yes" && ARGS="$ARGS --fileonly"
+fi
+
+# kill the $RSH processes if we get a signal
+trap do_cleanup INT EXIT
+
+# Start up the remote processes, each one with its stdin attached to a
+# different output file descriptor, so that we can communicate with them
+# individually when sending files to back up.  We generate a remote log
+# file and also return output to this process.
+for CLIENT in $(echo $NODELIST | tr ',' ' '); do
+	FD=$((BASEFD + NUMCLI))
+	LOG=$OUTPUTBASE.$CLIENT.$FD.log
+	eval "exec $FD> >($RSH $CLIENT '$PROGPATH --remote=$NUMCLI $ARGS')"
+	RC=$?
+	if [ $RC -eq 0 ]; then
+		log "starting $0.$NUMCLI on $CLIENT"
+		NUMCLI=$((NUMCLI + 1))
+	else
+		log "ERROR: failed '$RSH $CLIENT $PROGPATH': RC=$RC"
+	fi
+done
+
+if [ $NUMCLI -eq 0 ]; then
+	fatal "unable to start any threads"
+fi
+
+CURRCLI=0
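+
+# File-descriptor fan-out at this point, assuming "-n cli1,cli2" (values
+# are illustrative): fd 100 feeds "ssh cli1 llbackup --remote=0 ..." and
+# fd 101 feeds "ssh cli2 llbackup --remote=1 ...", so "echo name >&100"
+# queues a file on cli1.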
+
+# We don't want to use "BYTES FILENAME" if the input doesn't include the
+# size, as this might cause problems with files with whitespace in them.
+# Instead we just have two different loops depending on whether the size
+# is in the input file or not.  We dish out the files either by size
+# (to fill a chunk), or just round-robin and hope for the best.
+if [ "$FILEONLY" ]; then
+	if [ "$FILENAME" ]; then
+		[ "$VERBOSE" ] && log "$FILENAME"
+		echo "$FILENAME" 1>&$BASEFD	# rewrite initial line
+	fi
+	# if we don't know the size, just round-robin among the clients
+	while read FILENAME; do
+		FD=$((BASEFD + CURRCLI))
+		[ -n "$VERBOSE" -a "$OP" != "backup" ] && log "$OP $FILENAME"
+		echo "$FILENAME" 1>&$FD
+
+		COUNT=$((COUNT + 1))
+		if [ $COUNT -ge $SPLITCOUNT ]; then
+			CURRCLI=$(((CURRCLI + 1) % NUMCLI))
+			COUNT=0
+		fi
+	done
+else
+	[ "$VERBOSE" ] && log "$FILENAME"
+	echo $BYTES "$FILENAME" 1>&$BASEFD	# rewrite initial line
+	# if we know the size, then give each client enough to start a chunk
+	while read BYTES FILENAME; do
+		FD=$((BASEFD + CURRCLI))
+		[ "$VERBOSE" ] && log "$FILENAME"
+		echo $BYTES "$FILENAME" >&$FD
+
+		# take tar blocking factor into account
+		[ $BYTES -lt 10240 ] && BYTES=10240
+		SUMBYTES=$((SUMBYTES + BYTES))
+		if [ $((SUMBYTES / 1048576)) -ge $SPLITMB ]; then
+			CURRCLI=$(((CURRCLI + 1) % NUMCLI))
+			SUMBYTES=0
+		fi
+	done
+fi
+
+# Once all of the files have been given out, wait for the remote processes
+# to complete.  That might take a while depending on the size of the backup.
+cleanup
--
GitLab